vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"""Shared preprocessing utilities for regex-based source adapters.
|
|
2
|
+
|
|
3
|
+
L2 implementation: C-family comment & string stripping plus multi-line import
|
|
4
|
+
collapsing for TypeScript / JavaScript. Single-pass character-by-character
|
|
5
|
+
scanner in ``strip_comments_and_strings`` -- O(n), stdlib only, no external
|
|
6
|
+
parser dependency.
|
|
7
|
+
|
|
8
|
+
Python adapter does NOT use this module -- it calls ``ast.parse`` directly.
|
|
9
|
+
|
|
10
|
+
The ``language`` argument selects the rule set. Every member of ``_C_FAMILY``
|
|
11
|
+
(``"typescript"``, ``"javascript"``, ``"go"``, ``"java"``) is recognised and
|
|
12
|
+
shares the same C-family rules. Unknown languages fall back to a safe
|
|
13
|
+
passthrough (caller still works, just with comment/string false-positives).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import re
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"strip_comments_and_strings",
|
|
22
|
+
"strip_comments_only",
|
|
23
|
+
"join_multiline_imports",
|
|
24
|
+
"strip_go_raw_strings",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
_log = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# Languages whose comment and string syntax follows the C-family rules
# implemented by the scanners in this module.
_C_FAMILY: frozenset[str] = frozenset({"typescript", "javascript", "go", "java"})


def strip_comments_only(content: str, language: str) -> str:
    """Drop ``//`` and ``/* ... */`` comments but keep string literals verbatim.

    Import extraction needs the module specifier, which lives inside a string
    (``from 'react'``), so string bodies must survive.  Strings are still
    walked character by character so that a comment opener inside one
    (e.g. ``"http://x"``) is never mistaken for a real comment.

    Every newline that was inside a removed comment is re-emitted, keeping
    the line numbering of the output aligned with the input.  A line comment
    that runs to end-of-input is dropped without emitting anything.

    Languages outside ``_C_FAMILY`` are returned unchanged.
    """
    if language not in _C_FAMILY:
        return content

    pieces: list[str] = []
    pos = 0
    end = len(content)

    while pos < end:
        if content.startswith("//", pos):
            # Line comment: discard up to the newline, keep the newline.
            newline_at = content.find("\n", pos + 2)
            if newline_at < 0:
                break
            pieces.append("\n")
            pos = newline_at + 1
        elif content.startswith("/*", pos):
            # Block comment: discard the text, keep only its newlines.
            close_at = content.find("*/", pos + 2)
            if close_at < 0:
                # Unterminated comment swallows the rest of the input.
                pieces.append("\n" * content.count("\n", pos))
                break
            pieces.append("\n" * content.count("\n", pos, close_at + 2))
            pos = close_at + 2
        elif content[pos] in "'\"`":
            # String literal: locate its end, then copy the span untouched.
            delimiter = content[pos]
            cursor = pos + 1
            while cursor < end:
                current = content[cursor]
                if current == "\\" and cursor + 1 < end:
                    # An escaped character can never close the literal.
                    cursor += 2
                elif current == delimiter:
                    cursor += 1
                    break
                else:
                    cursor += 1
            pieces.append(content[pos:cursor])
            pos = cursor
        else:
            pieces.append(content[pos])
            pos += 1

    return "".join(pieces)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def strip_comments_and_strings(content: str, language: str) -> str:
    """Blank out comments and string bodies in *content*.

    What is emitted for each region:

    - ``//`` line comment: only the terminating newline (nothing at all if
      the comment runs to end-of-input).
    - ``/* ... */`` block comment: only the newlines it contained.
    - ``'...'`` / ``"..."`` / `` `...` `` literal: the two delimiters with an
      empty body, plus any newlines that were inside the literal.

    A backslash escapes the following character, so ``\\"`` and friends never
    close a literal.  Template literals are handled like plain strings --
    ``${...}`` expressions inside them are not re-parsed.

    Because every newline is kept, ``re.MULTILINE`` patterns run against the
    result still yield the original 1-based line numbers.

    Languages outside ``_C_FAMILY`` are returned unchanged.
    """
    if language not in _C_FAMILY:
        return content

    pieces: list[str] = []
    pos = 0
    end = len(content)

    while pos < end:
        if content.startswith("//", pos):
            newline_at = content.find("\n", pos + 2)
            if newline_at < 0:
                # Comment reaches end-of-input: nothing left to emit.
                break
            # Keep the newline so later lines do not shift.
            pieces.append("\n")
            pos = newline_at + 1
        elif content.startswith("/*", pos):
            close_at = content.find("*/", pos + 2)
            if close_at < 0:
                # Unterminated comment: swallow the rest, keep its newlines.
                pieces.append("\n" * content.count("\n", pos))
                break
            pieces.append("\n" * content.count("\n", pos, close_at + 2))
            pos = close_at + 2
        elif content[pos] in "'\"`":
            delimiter = content[pos]
            pieces.append(delimiter)
            pos += 1
            terminated = False
            while pos < end and not terminated:
                current = content[pos]
                if current == "\\" and pos + 1 < end:
                    # Drop the escape pair; only a backslash-newline
                    # continuation contributes a newline to the output.
                    if content[pos + 1] == "\n":
                        pieces.append("\n")
                    pos += 2
                elif current == delimiter:
                    pieces.append(delimiter)
                    pos += 1
                    terminated = True
                else:
                    # Body characters are discarded, except raw newlines
                    # (legal in template literals) which keep line counts.
                    if current == "\n":
                        pieces.append("\n")
                    pos += 1
            if not terminated:
                # Input ended before the closing delimiter was seen.
                _log.debug(
                    "strip_comments_and_strings: unterminated %s string literal",
                    delimiter,
                )
        else:
            # Ordinary source character.
            pieces.append(content[pos])
            pos += 1

    return "".join(pieces)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# Recognises brace-group imports and re-exports whose braces may span lines:
#   ``import { ... } from 'x'``  /  ``export { ... } from "x"``
# The trailing ``from '...'`` is REQUIRED; without it the pattern would also
# swallow unrelated brace groups such as ``export class Foo {}`` or
# ``export function bar() { ... }``.
# Groups:
#   1: everything from the keyword up to and including the opening brace
#   2: the brace body, lazily matched, containing no braces itself
#   3: the closing brace through ``from 'x'`` / ``from "x"`` and optional ``;``
_MULTILINE_BRACE_IMPORT = re.compile(
    r"(^[ \t]*(?:import|export)[^\n{]*?\{)"       # keyword ... {
    r"([^{}]*?)"                                    # body (no braces)
    r"(\}[ \t]*from[ \t]*['\"][^'\"\n]+['\"][ \t]*;?)",  # } from '...' ;
    re.MULTILINE | re.DOTALL,
)


def join_multiline_imports(content: str, language: str) -> str:
    """Rewrite brace-group imports so each one sits on a single line.

    Example input::

        import {
            ComponentA,
            ComponentB,
        } from './components';

    becomes::

        import { ComponentA, ComponentB, } from './components';

    The statement keeps its leading indentation and therefore its starting
    line.  Whitespace runs inside the braces shrink to one space, and the
    newlines that were removed are appended directly after the statement, so
    everything that follows keeps its original line number.  Statements that
    already fit on one line are matched too and only have the spacing inside
    their braces normalised.

    Languages outside ``_C_FAMILY`` are returned unchanged.
    """
    if language not in _C_FAMILY:
        return content

    def _flatten(hit: re.Match[str]) -> str:
        opener, names, closer = hit.groups()
        # Squeeze all whitespace (including newlines) in the brace body.
        one_line = " ".join([opener, " ".join(names.split()), closer])
        # Pay back the removed newlines after the statement so the line
        # numbers of all later source lines stay stable.
        padding = "\n" * hit.group(0).count("\n")
        return one_line + padding

    return _MULTILINE_BRACE_IMPORT.sub(_flatten, content)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def strip_go_raw_strings(content: str) -> str:
    """Replace Go backtick raw string bodies with empty bodies.

    Go raw string literals are delimited by backticks (`` ` ``) and have NO
    escape sequences, so the literal always ends at the next backtick.  Each
    raw string is replaced by its two backticks with only the embedded
    newlines kept between them, so downstream line-number counts stay stable.
    An unterminated raw string keeps its opening backtick and its newlines.

    A backtick only opens a raw string in code position.  ``//`` and
    ``/* ... */`` comments, ``"..."`` interpreted strings and ``'...'`` rune
    literals are copied through verbatim, so a backtick inside any of them
    (``"`"``, ``'`'``, a comment mentioning a backtick) can no longer start a
    bogus raw string that swallows real code up to the next backtick.

    This function must be applied BEFORE ``strip_comments_only`` when
    processing Go source for import extraction: ``strip_comments_only`` keeps
    string bodies verbatim (needed for ``"..."`` import paths) and would
    otherwise leave fake imports embedded in raw strings in place.

    Usage::

        cleaned = strip_go_raw_strings(source)
        cleaned = strip_comments_only(cleaned, "go")

    Args:
        content: Go source text.

    Returns:
        The source with every raw string body emptied; all other text,
        including comments and interpreted strings, is unchanged.
    """
    out: list[str] = []
    i = 0
    n = len(content)

    while i < n:
        ch = content[i]

        if ch == "`":
            # Raw string: no escapes, ends at the very next backtick.
            close = content.find("`", i + 1)
            body_end = n if close == -1 else close
            out.append("`")
            out.append("\n" * content.count("\n", i + 1, body_end))
            if close == -1:
                # Unterminated: the rest of the input belonged to the body.
                break
            out.append("`")
            i = close + 1
            continue

        if content.startswith("//", i):
            # Line comment: copy up to (not including) the newline.
            stop = content.find("\n", i)
            if stop == -1:
                stop = n
            out.append(content[i:stop])
            i = stop
            continue

        if content.startswith("/*", i):
            # Block comment: copy through the closing ``*/`` (or to EOF).
            close = content.find("*/", i + 2)
            stop = n if close == -1 else close + 2
            out.append(content[i:stop])
            i = stop
            continue

        if ch in ('"', "'"):
            # Interpreted string or rune literal.  Neither may span lines in
            # Go, so stop at a newline to avoid running away on bad input.
            j = i + 1
            while j < n:
                c = content[j]
                if c == "\n":
                    break
                if c == "\\" and j + 1 < n and content[j + 1] != "\n":
                    j += 2
                    continue
                j += 1
                if c == ch:
                    break
            out.append(content[i:j])
            i = j
            continue

        out.append(ch)
        i += 1

    return "".join(out)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Shared regex patterns for TypeScript / JavaScript adapters.
|
|
2
|
+
|
|
3
|
+
Factoring out keeps the adapter modules under the 400-line budget and
|
|
4
|
+
guarantees that TS and JS behave identically on overlapping syntax
|
|
5
|
+
(ES-module import/export, top-level class/function declarations).
|
|
6
|
+
|
|
7
|
+
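Symbol patterns operate on *cleaned* source -- i.e. output of
|
|
8
|
+
``_lexer.strip_comments_and_strings`` then ``_lexer.join_multiline_imports``.
|
|
9
|
+
String bodies are therefore empty (``''`` / ``""`` / `` `` ``) and comments
|
|
10
|
+
are gone, which prevents ``//`` or ``/* import fake */`` from matching. Import patterns capture a non-empty module specifier, so they need input whose string bodies are preserved (e.g. output of ``_lexer.strip_comments_only``).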
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import logging
|
|
16
|
+
_log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"RE_IMPORT_DEFAULT",
|
|
20
|
+
"RE_IMPORT_NAMED",
|
|
21
|
+
"RE_IMPORT_NAMESPACE",
|
|
22
|
+
"RE_IMPORT_SIDE_EFFECT",
|
|
23
|
+
"RE_IMPORT_TYPE_DEFAULT",
|
|
24
|
+
"RE_IMPORT_TYPE_NAMED",
|
|
25
|
+
"RE_EXPORT_FROM_NAMED",
|
|
26
|
+
"RE_EXPORT_FROM_STAR",
|
|
27
|
+
"RE_DYNAMIC_IMPORT",
|
|
28
|
+
"RE_REQUIRE_ASSIGN",
|
|
29
|
+
"RE_REQUIRE_BARE",
|
|
30
|
+
"RE_SYMBOL_CLASS",
|
|
31
|
+
"RE_SYMBOL_INTERFACE",
|
|
32
|
+
"RE_SYMBOL_TYPE",
|
|
33
|
+
"RE_SYMBOL_FUNCTION",
|
|
34
|
+
"RE_SYMBOL_CONST",
|
|
35
|
+
"RE_SYMBOL_ENUM",
|
|
36
|
+
"classify_import",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
# Import-form regex patterns (ES modules)
# ---------------------------------------------------------------------------
#
# Conventions:
#   - Statement-form patterns are anchored with ``^`` under re.MULTILINE, so
#     they only match at the start of a line (after optional indentation).
#     Dynamic ``import(...)`` can sit anywhere inside an expression and is
#     therefore handled by RE_DYNAMIC_IMPORT, which is NOT line-anchored.
#   - The target module is captured as the named group ``module`` and must be
#     non-empty, so these patterns need input whose string bodies are intact.
#   - Quotes may be single or double.  Template-literal imports are not
#     recognised here (rare; low confidence; explicit tech-debt for L6).
#   - Patterns compiled with re.VERBOSE ignore whitespace and ``#`` comments
#     outside character classes.

# ``import X from 'Y'``  (default only)
RE_IMPORT_DEFAULT = re.compile(
    r"""^[ \t]*import\s+
        (?!type\b)                     # skip 'import type ...'
        [A-Za-z_$][\w$]*               # default binding
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import { A, B } from 'Y'``  (named only, maybe with default prefix)
RE_IMPORT_NAMED = re.compile(
    r"""^[ \t]*import\s+
        (?!type\b)
        (?:[A-Za-z_$][\w$]*\s*,\s*)?   # optional default binding
        \{[^}]*\}                      # named group (may be empty)
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import * as X from 'Y'`` or ``import D, * as X from 'Y'``
# The optional default binding mirrors RE_IMPORT_NAMED; without it the
# combined default + namespace form was matched by no import pattern at all.
RE_IMPORT_NAMESPACE = re.compile(
    r"""^[ \t]*import\s+
        (?:[A-Za-z_$][\w$]*\s*,\s*)?   # optional default binding
        \*\s+as\s+[A-Za-z_$][\w$]*
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import 'Y'``  (side-effect only)
RE_IMPORT_SIDE_EFFECT = re.compile(
    r"""^[ \t]*import\s+['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import type X from 'Y'``  (TS-only)
RE_IMPORT_TYPE_DEFAULT = re.compile(
    r"""^[ \t]*import\s+type\s+
        [A-Za-z_$][\w$]*
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import type { X } from 'Y'``  (TS-only)
RE_IMPORT_TYPE_NAMED = re.compile(
    r"""^[ \t]*import\s+type\s+
        \{[^}]*\}
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``export { A, B } from 'Y'``  (re-export)
RE_EXPORT_FROM_NAMED = re.compile(
    r"""^[ \t]*export\s+
        \{[^}]*\}
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``export * from 'Y'``  or  ``export * as NS from 'Y'``
RE_EXPORT_FROM_STAR = re.compile(
    r"""^[ \t]*export\s+
        \*(?:\s+as\s+[A-Za-z_$][\w$]*)?
        \s+from\s+
        ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# Dynamic ``import('Y')`` — NOT line-anchored; may appear inside expressions.
RE_DYNAMIC_IMPORT = re.compile(
    r"""\bimport\s*\(\s*
        ['"](?P<module>[^'"\n]+)['"]
        \s*\)""",
    re.VERBOSE,
)

# CommonJS: ``const|let|var X = require('Y')``  (assignment form)
RE_REQUIRE_ASSIGN = re.compile(
    r"""^[ \t]*(?:const|let|var)\s+
        (?:[A-Za-z_$][\w$]*|\{[^}]*\}|\[[^\]]*\])   # binding (ident | destructure)
        \s*=\s*require\s*\(\s*
        ['"](?P<module>[^'"\n]+)['"]
        \s*\)\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# CommonJS bare: ``require('Y');``  (side-effect)
RE_REQUIRE_BARE = re.compile(
    r"""^[ \t]*require\s*\(\s*
        ['"](?P<module>[^'"\n]+)['"]
        \s*\)\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---------------------------------------------------------------------------
# Symbol-definition regex patterns — top-level only
# ---------------------------------------------------------------------------
#
# Top-level is approximated by requiring the declaration keyword to start at
# column zero, optionally preceded by ``export `` / ``export default ``.
# Indented declarations inside function bodies, classes, or blocks are not
# captured — this is intentional for L2 (avoids nested noise; deep parsing
# is a tree-sitter job for L6+).
#
# Every pattern exposes the declared identifier as group ``name`` and the
# optional export prefix as group ``export``.

_EXPORT_PREFIX = r"(?P<export>export\s+(?:default\s+)?)?"
_TOPLEVEL_START = r"^" + _EXPORT_PREFIX

RE_SYMBOL_CLASS = re.compile(
    _TOPLEVEL_START
    + r"(?:abstract\s+)?class\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)

RE_SYMBOL_INTERFACE = re.compile(
    _TOPLEVEL_START + r"interface\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)

RE_SYMBOL_TYPE = re.compile(
    _TOPLEVEL_START + r"type\s+(?P<name>[A-Za-z_$][\w$]*)\s*(?:<[^>]*>)?\s*=",
    re.MULTILINE,
)

# The keyword must be separated from the name by whitespace or by the
# generator ``*`` (``function foo``, ``function* foo``, ``function *foo``,
# ``function*foo``).  Allowing nothing in between would turn a column-zero
# call such as ``functionCall(x)`` into a bogus function named ``Call``.
RE_SYMBOL_FUNCTION = re.compile(
    _TOPLEVEL_START
    + r"(?:async\s+)?function(?:\s*\*\s*|\s+)(?P<name>[A-Za-z_$][\w$]*)\s*\(",
    re.MULTILINE,
)

RE_SYMBOL_CONST = re.compile(
    _TOPLEVEL_START
    + r"(?:const|let|var)\s+(?P<name>[A-Za-z_$][\w$]*)\s*[:=]",
    re.MULTILINE,
)

RE_SYMBOL_ENUM = re.compile(
    _TOPLEVEL_START
    + r"(?:const\s+)?enum\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
# Helpers
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
def classify_import(module: str) -> str:
    """Classify a module specifier as ``"relative"`` or ``"absolute"``.

    A specifier whose first character is ``.`` (covering both ``./`` and
    ``../``) or ``/`` is reported as ``"relative"``.  Everything else --
    bare package names, scoped packages such as ``@scope/pkg``, ``node:``
    builtins, and the empty string -- is reported as ``"absolute"``.
    """
    return "relative" if module.startswith((".", "/")) else "absolute"
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Shared tree-sitter helpers for source adapters.
|
|
2
|
+
|
|
3
|
+
Provides a cached parser factory and ergonomic utilities for walking
|
|
4
|
+
tree-sitter parse trees. Language-neutral — designed for reuse by Go,
|
|
5
|
+
Java, JavaScript, TypeScript adapters and any future tree-sitter adapter.
|
|
6
|
+
|
|
7
|
+
Public API
|
|
8
|
+
----------
|
|
9
|
+
get_ts_parser(language)
|
|
10
|
+
Return a cached ``tree_sitter.Parser`` initialised for *language*.
|
|
11
|
+
Supported language names match those accepted by
|
|
12
|
+
``tree_sitter_language_pack.get_language``.
|
|
13
|
+
|
|
14
|
+
parse_bytes(language, source_bytes)
|
|
15
|
+
Convenience: parse *source_bytes* and return the root ``Node``.
|
|
16
|
+
|
|
17
|
+
node_text(node, source_bytes)
|
|
18
|
+
Decode the byte slice that corresponds to *node* in *source_bytes*.
|
|
19
|
+
|
|
20
|
+
node_line(node)
|
|
21
|
+
Return the 1-based source line number for *node*.
|
|
22
|
+
|
|
23
|
+
iter_named_children(node, *types)
|
|
24
|
+
Yield direct named children of *node* whose ``type`` is in *types*.
|
|
25
|
+
If no types are given, yield all named children.
|
|
26
|
+
|
|
27
|
+
walk_named(node, *types)
|
|
28
|
+
Depth-first generator over ALL named descendant nodes (including
|
|
29
|
+
*node* itself) whose ``type`` is in *types*.
|
|
30
|
+
If no types are given, yield all named descendants.
|
|
31
|
+
|
|
32
|
+
Verified against tree-sitter==0.25.2 / tree-sitter-language-pack==1.10.8.
|
|
33
|
+
Parser is constructed as ``Parser(get_language(lang))`` — the ``get_parser``
|
|
34
|
+
wrapper is ABI-broken on 0.25 and must NOT be used.
|
|
35
|
+
"""
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import logging
|
|
39
|
+
from functools import lru_cache
|
|
40
|
+
from typing import Generator
|
|
41
|
+
|
|
42
|
+
from tree_sitter import Node, Parser
|
|
43
|
+
from tree_sitter_language_pack import get_language
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"get_ts_parser",
|
|
47
|
+
"parse_bytes",
|
|
48
|
+
"node_text",
|
|
49
|
+
"node_line",
|
|
50
|
+
"iter_named_children",
|
|
51
|
+
"walk_named",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
_log = logging.getLogger(__name__)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# Parser cache
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
@lru_cache(maxsize=32)
def get_ts_parser(language: str) -> Parser:
    """Build (once per language name) a ``Parser`` for *language*.

    The parser is created as ``Parser(get_language(language))`` and the
    result is memoised by ``lru_cache`` keyed on the language string, so
    repeated calls with the same name return the same ``Parser`` object.

    Parameters
    ----------
    language:
        Language name passed straight to
        ``tree_sitter_language_pack.get_language``,
        e.g. ``"go"``, ``"java"``, ``"javascript"``, ``"typescript"``.

    Raises
    ------
    Any exception raised by ``get_language`` or ``Parser`` propagates
    unchanged.  NOTE(review): earlier documentation names ``LookupError``
    for an unavailable language -- confirm against the installed pack.
    """
    ts_parser = Parser(get_language(language))
    _log.debug("tree-sitter parser created for language=%r", language)
    return ts_parser
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Parse convenience
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def parse_bytes(language: str, source_bytes: bytes) -> Node:
    """Parse *source_bytes* and hand back the root node of the tree.

    Parameters
    ----------
    language:
        Language name, forwarded to :func:`get_ts_parser`.
    source_bytes:
        Source code as bytes (the original documentation specifies UTF-8).

    Returns
    -------
    Node
        ``root_node`` of the tree produced by the cached parser.
    """
    return get_ts_parser(language).parse(source_bytes).root_node
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
# Node utilities
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
|
|
114
|
+
def node_text(node: Node, source_bytes: bytes) -> str:
    """Return the text of *node* as a ``str``.

    The bytes between ``node.start_byte`` and ``node.end_byte`` are cut out
    of *source_bytes* and decoded as UTF-8; undecodable bytes become the
    replacement character, so this never raises a decode error.
    """
    raw_slice = source_bytes[node.start_byte:node.end_byte]
    return raw_slice.decode("utf-8", errors="replace")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def node_line(node: Node) -> int:
    """Return the 1-based line on which *node* starts.

    ``node.start_point`` is indexed as ``(row, column)`` with a 0-based
    ``row``; one is added to get a 1-based line number.
    """
    zero_based_row = node.start_point[0]
    return zero_based_row + 1
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def iter_named_children(node: Node, *types: str) -> Generator[Node, None, None]:
    """Iterate over the direct named children of *node*.

    Parameters
    ----------
    node:
        Parent whose ``children`` are inspected (one level only).
    *types:
        Node ``type`` strings to keep.  With no types given, every named
        child is produced.

    Yields
    ------
    Node
        Children with ``is_named`` set that pass the type filter, in the
        order they appear in ``node.children``.
    """
    wanted = frozenset(types)
    # An empty filter means "accept every type".
    yield from (
        kid
        for kid in node.children
        if kid.is_named and (not wanted or kid.type in wanted)
    )
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def walk_named(node: Node, *types: str) -> Generator[Node, None, None]:
    """Walk the subtree rooted at *node* depth-first, yielding named nodes.

    The starting node takes part in the walk and is yielded when it passes
    the filter.  Unnamed nodes are never yielded, but their children are
    still visited.

    Parameters
    ----------
    node:
        Root of the subtree to traverse.
    *types:
        Node ``type`` strings to keep.  With no types given, every named
        node is produced.

    Yields
    ------
    Node
        Named nodes passing the type filter, parents before children and
        siblings left to right.
    """
    wanted = frozenset(types)
    accept_all = not types
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_named and (accept_all or current.type in wanted):
            yield current
        # The list is used as a LIFO stack, so children go on in reverse
        # to be popped in their original left-to-right order.
        pending.extend(reversed(current.children))
|