vigil-codeintel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
  2. vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
  3. vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
  4. vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
  5. vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
  7. vigil_forensic/__init__.py +224 -0
  8. vigil_forensic/_git_utils.py +178 -0
  9. vigil_forensic/_shared.py +510 -0
  10. vigil_forensic/_stubs.py +156 -0
  11. vigil_forensic/gate_checks/__init__.py +1 -0
  12. vigil_forensic/gate_checks/_ast_helpers.py +629 -0
  13. vigil_forensic/gate_checks/_deployment_detector.py +573 -0
  14. vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
  15. vigil_forensic/gate_checks/authority_checks.py +95 -0
  16. vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
  17. vigil_forensic/gate_checks/broad_except_checks.py +301 -0
  18. vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
  19. vigil_forensic/gate_checks/common.py +253 -0
  20. vigil_forensic/gate_checks/config_safety_checks.py +704 -0
  21. vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
  22. vigil_forensic/gate_checks/conflict_checks.py +193 -0
  23. vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
  24. vigil_forensic/gate_checks/context_health_checks.py +289 -0
  25. vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
  26. vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
  27. vigil_forensic/gate_checks/duplication_checks.py +387 -0
  28. vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
  29. vigil_forensic/gate_checks/empty_output_checks.py +87 -0
  30. vigil_forensic/gate_checks/encoding_checks.py +847 -0
  31. vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
  32. vigil_forensic/gate_checks/fallback_checks.py +41 -0
  33. vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
  34. vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
  35. vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
  36. vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
  37. vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
  38. vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
  39. vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
  40. vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
  41. vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
  42. vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
  43. vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
  44. vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
  45. vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
  46. vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
  47. vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
  48. vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
  49. vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
  50. vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
  51. vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
  52. vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
  53. vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
  54. vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
  55. vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
  56. vigil_forensic/gate_checks/hallucination_checks.py +566 -0
  57. vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
  58. vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
  59. vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
  60. vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
  61. vigil_forensic/gate_checks/ml_checks.py +318 -0
  62. vigil_forensic/gate_checks/performance_checks.py +106 -0
  63. vigil_forensic/gate_checks/project_specific_runner.py +691 -0
  64. vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
  65. vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
  66. vigil_forensic/gate_checks/reliability_checks.py +389 -0
  67. vigil_forensic/gate_checks/reporting_checks.py +55 -0
  68. vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
  69. vigil_forensic/gate_checks/security_injection_checks.py +332 -0
  70. vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
  71. vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
  72. vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
  73. vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
  74. vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
  75. vigil_forensic/gate_checks/test_quality_checks.py +946 -0
  76. vigil_forensic/gate_checks/testing_checks.py +149 -0
  77. vigil_forensic/gate_checks/toctou_checks.py +367 -0
  78. vigil_forensic/gate_checks/type_checking_checks.py +316 -0
  79. vigil_forensic/gate_models.py +392 -0
  80. vigil_forensic/gate_packs/__init__.py +1 -0
  81. vigil_forensic/gate_packs/universal.py +179 -0
  82. vigil_forensic/gate_profile.json +31 -0
  83. vigil_forensic/gate_registry.py +21 -0
  84. vigil_forensic/language_profiles.py +219 -0
  85. vigil_forensic/meta_findings.py +207 -0
  86. vigil_forensic/self_audit.py +725 -0
  87. vigil_forensic/source_analysis.py +175 -0
  88. vigil_mapper/__init__.py +103 -0
  89. vigil_mapper/_ast_helpers_minimal.py +229 -0
  90. vigil_mapper/_extract_imports_impl.py +123 -0
  91. vigil_mapper/_file_count_guard.py +129 -0
  92. vigil_mapper/_git_utils.py +178 -0
  93. vigil_mapper/_runtime_ast.py +438 -0
  94. vigil_mapper/_runtime_dispatch.py +137 -0
  95. vigil_mapper/_seed_helpers.py +82 -0
  96. vigil_mapper/authority_builder.py +1102 -0
  97. vigil_mapper/cli_entry.py +731 -0
  98. vigil_mapper/conflict_builder.py +818 -0
  99. vigil_mapper/data_contract_builder.py +446 -0
  100. vigil_mapper/findings_builder.py +716 -0
  101. vigil_mapper/fingerprint.py +53 -0
  102. vigil_mapper/hotspot_builder.py +539 -0
  103. vigil_mapper/map_common.py +449 -0
  104. vigil_mapper/map_errors.py +55 -0
  105. vigil_mapper/map_models.py +431 -0
  106. vigil_mapper/map_models_ext.py +206 -0
  107. vigil_mapper/map_models_findings.py +130 -0
  108. vigil_mapper/map_storage.py +455 -0
  109. vigil_mapper/parse_cache.py +795 -0
  110. vigil_mapper/refactor_boundary_builder.py +266 -0
  111. vigil_mapper/runtime_builder.py +527 -0
  112. vigil_mapper/runtime_tracer.py +243 -0
  113. vigil_mapper/runtime_tracer_entry.py +199 -0
  114. vigil_mapper/semantic_diff.py +71 -0
  115. vigil_mapper/source_adapters/__init__.py +109 -0
  116. vigil_mapper/source_adapters/_base.py +264 -0
  117. vigil_mapper/source_adapters/_ir.py +156 -0
  118. vigil_mapper/source_adapters/_lexer.py +309 -0
  119. vigil_mapper/source_adapters/_patterns.py +212 -0
  120. vigil_mapper/source_adapters/_treesitter.py +182 -0
  121. vigil_mapper/source_adapters/go.py +553 -0
  122. vigil_mapper/source_adapters/java.py +541 -0
  123. vigil_mapper/source_adapters/javascript.py +626 -0
  124. vigil_mapper/source_adapters/python.py +325 -0
  125. vigil_mapper/source_adapters/typescript.py +749 -0
  126. vigil_mapper/structural_builder.py +586 -0
  127. vigil_mcp/__init__.py +1 -0
  128. vigil_mcp/_jobs.py +587 -0
  129. vigil_mcp/_paths.py +93 -0
  130. vigil_mcp/forensic_server.py +419 -0
  131. vigil_mcp/map_server.py +452 -0
@@ -0,0 +1,309 @@
1
+ """Shared preprocessing utilities for regex-based source adapters.
2
+
3
+ L2 implementation: C-family comment & string stripping plus multi-line import
4
+ collapsing for TypeScript / JavaScript. Single-pass character-by-character
5
+ scanner in ``strip_comments_and_strings`` -- O(n), stdlib only, no external
6
+ parser dependency.
7
+
8
+ Python adapter does NOT use this module -- it calls ``ast.parse`` directly.
9
+
10
+ Language argument selects the lexical rules. ``"typescript"``,
11
+ ``"javascript"``, ``"go"`` and ``"java"`` are recognised (they share the
12
+ C-family rules, see ``_C_FAMILY``). Unknown languages fall back to a safe
13
+ passthrough (caller still works, just with comment/string false-positives).
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import re
19
+
20
# Public API of this module.
__all__ = [
    "strip_comments_and_strings",
    "strip_comments_only",
    "join_multiline_imports",
    "strip_go_raw_strings",
]

# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)

# Languages that share C-family comment + string syntax.  The helpers in this
# module that take a ``language`` argument return their input unchanged for
# any language name that is not listed here.
_C_FAMILY: frozenset[str] = frozenset({"typescript", "javascript", "go", "java"})
31
+
32
+
33
def strip_comments_only(content: str, language: str) -> str:
    """Remove line and block comments while PRESERVING string literals intact.

    Used by ``extract_imports`` where the module specifier IS a string body
    (``from 'react'``) -- we must not destroy it.  String literals are still
    scanned so that a comment opener inside a string (e.g. ``"http://x"``)
    does not accidentally trigger comment stripping.

    Newlines that end a line comment or fall inside a block comment are
    re-emitted, so the line numbers of the surviving code do not shift.

    For ``language="go"`` a backtick literal is a raw string: a backslash
    inside it is an ordinary character and cannot escape the closing
    backtick.  For every other language a backslash escapes the following
    character in all three quote styles.

    Parameters
    ----------
    content:
        Source text to clean.
    language:
        Language name; anything outside ``_C_FAMILY`` is returned unchanged.

    Returns
    -------
    str
        *content* with comments removed and string literals copied verbatim.
        A line comment that runs to EOF is dropped entirely; an unterminated
        block comment contributes only its embedded newlines.
    """
    if language not in _C_FAMILY:
        return content

    # Go raw strings have no escape sequences; treating ``\`` as an escape
    # there would step over the real closing backtick.
    raw_backtick = language == "go"

    out: list[str] = []
    i = 0
    n = len(content)

    while i < n:
        ch = content[i]

        if ch == "/":
            nxt = content[i + 1] if i + 1 < n else ""

            # Line comment //...
            if nxt == "/":
                j = content.find("\n", i + 2)
                if j == -1:
                    # Comment runs to EOF; nothing more to emit.
                    break
                out.append("\n")
                i = j + 1
                continue

            # Block comment /* ... */ (an unterminated one runs to EOF).
            if nxt == "*":
                j = content.find("*/", i + 2)
                end = n if j == -1 else j + 2
                # Keep only the newlines so line counts do not collapse.
                out.append("\n" * content.count("\n", i, end))
                i = end
                continue

        # String literal -- copy verbatim, but step over it as a unit so a
        # ``//`` / ``/*`` inside the body cannot trigger comment stripping.
        if ch in ("'", '"', "`"):
            honours_escapes = not (raw_backtick and ch == "`")
            j = i + 1
            while j < n:
                c = content[j]
                if honours_escapes and c == "\\" and j + 1 < n:
                    # Escape pair: the escaped char can never close the string.
                    j += 2
                    continue
                j += 1
                if c == ch:
                    break
            out.append(content[i:j])
            i = j
            continue

        # Regular source character.
        out.append(ch)
        i += 1

    return "".join(out)
101
+
102
+
103
def strip_comments_and_strings(content: str, language: str) -> str:
    """Remove comment and string literal regions from *content*.

    Replaces:
    - ``//`` line comments -> emit the trailing newline untouched.
    - ``/* ... */`` block comments -> emit newlines that fell inside.
    - ``'...'`` / ``"..."`` / `` `...` `` string spans -> emit empty body
      (quotes preserved) plus any internal newlines.

    Backslash escape sequences inside strings do NOT terminate the string.
    The one exception is ``language="go"``: a Go backtick literal is a raw
    string with no escapes, so it always ends at the next backtick.
    Template literals are treated as simple strings -- nested ``${...}``
    expressions are NOT re-parsed.

    Newlines are always preserved so that ``re.MULTILINE`` patterns matching
    against the cleaned output still recover correct 1-based line numbers.

    Parameters
    ----------
    content:
        Source text to clean.
    language:
        Language name.  Every member of ``_C_FAMILY`` is processed; any other
        name falls back to a passthrough (returns *content* unchanged).

    Returns
    -------
    str
        The cleaned text.  A line comment that runs to EOF is dropped; an
        unterminated block comment or string contributes only its newlines
        (an unterminated string is additionally reported at debug level).
    """
    if language not in _C_FAMILY:
        return content

    # Go raw strings have no escape sequences; treating ``\`` as an escape
    # there would step over the real closing backtick.
    raw_backtick = language == "go"

    out: list[str] = []
    i = 0
    n = len(content)

    while i < n:
        ch = content[i]

        if ch == "/":
            nxt = content[i + 1] if i + 1 < n else ""

            # Line comment //...\n
            if nxt == "/":
                j = content.find("\n", i + 2)
                if j == -1:
                    # Comment runs to EOF; emit nothing more.
                    break
                # Preserve the newline so line numbers stay aligned.
                out.append("\n")
                i = j + 1
                continue

            # Block comment /* ... */ (an unterminated one runs to EOF).
            if nxt == "*":
                j = content.find("*/", i + 2)
                end = n if j == -1 else j + 2
                # Swallow the comment but keep its embedded newlines.
                out.append("\n" * content.count("\n", i, end))
                i = end
                continue

        # String literals: keep the quotes, drop the body, keep newlines.
        if ch in ("'", '"', "`"):
            honours_escapes = not (raw_backtick and ch == "`")
            j = i + 1
            closed = False
            while j < n:
                c = content[j]
                if honours_escapes and c == "\\" and j + 1 < n:
                    # Escape pair: the escaped char can never close the string.
                    j += 2
                    continue
                j += 1
                if c == ch:
                    closed = True
                    break
            body_end = j - 1 if closed else j
            out.append(ch)
            # Every newline inside the literal (escaped or not) is preserved.
            # Unescaped newlines in '...'/"..." are a syntax error in TS/JS,
            # but template literals allow them; either way line counts hold.
            out.append("\n" * content.count("\n", i + 1, body_end))
            if closed:
                out.append(ch)
            else:
                # EOF reached without closing quote -- bail cleanly.
                _log.debug(
                    "strip_comments_and_strings: unterminated %s string literal",
                    ch,
                )
            i = j
            continue

        # Regular source character.
        out.append(ch)
        i += 1

    return "".join(out)
203
+
204
+
205
# Recognises ``import { ... } from 'x'`` / ``export { ... } from 'x'`` (either
# quote style) where the brace group is allowed to span several lines.
# The ``from '...'`` tail is MANDATORY; without it the pattern would swallow
# unrelated brace groups such as ``export class Foo {}`` or
# ``export function bar() { ... }``.
# Groups:
#   1: keyword phrase up to and including the opening brace
#   2: brace body (non-greedy, contains no braces)
#   3: closing brace plus ``from 'x'`` / ``from "x"`` and optional ``;``
_MULTILINE_BRACE_IMPORT = re.compile(
    r"(^[ \t]*(?:import|export)[^\n{]*?\{)"                # keyword ... {
    r"([^{}]*?)"                                           # body (no braces)
    r"(\}[ \t]*from[ \t]*['\"][^'\"\n]+['\"][ \t]*;?)",    # } from '...' ;
    re.MULTILINE | re.DOTALL,
)


def join_multiline_imports(content: str, language: str) -> str:
    """Flatten brace-group imports that span several lines into one line.

    Example -- this input::

        import {
            ComponentA,
            ComponentB,
        } from './components';

    becomes::

        import { ComponentA, ComponentB, } from './components';

    Whitespace runs inside the braces (newlines included) collapse to a
    single space.  The newlines that were removed are appended directly
    after the flattened statement, so the ``import`` keyword keeps its
    1-based line number and everything after the statement keeps its line
    number too.  Text that does not match the pattern is left untouched.

    Languages outside ``_C_FAMILY`` are returned unchanged.
    """
    if language not in _C_FAMILY:
        return content

    def _flatten(found: re.Match[str]) -> str:
        opener = found.group(1)
        inner = found.group(2)
        closer = found.group(3)
        # Number of line breaks the statement originally occupied; they are
        # re-emitted after it so downstream line numbers stay stable.
        padding = "\n" * found.group(0).count("\n")
        squeezed = re.sub(r"\s+", " ", inner).strip()
        return f"{opener} {squeezed} {closer}" + padding

    return _MULTILINE_BRACE_IMPORT.sub(_flatten, content)
262
+
263
+
264
def strip_go_raw_strings(content: str) -> str:
    """Replace Go backtick raw string bodies with empty bodies.

    Go raw string literals are delimited by backticks (`` ` ``).  They
    have NO escape sequences -- a backtick cannot appear inside the literal.
    The entire span from opening to closing backtick is therefore replaced by
    two consecutive backtick characters, preserving any embedded newlines as
    plain newlines so that downstream line-number counts remain stable.

    Only backticks that appear in code position open a raw string.  A
    backtick inside an interpreted string (``"a`b"``), a rune literal, a
    ``//`` comment or a ``/* */`` comment is ordinary text; those regions
    are copied through verbatim so an odd backtick in them cannot swallow
    the code that follows.

    This function must be applied BEFORE ``strip_comments_only`` when
    processing Go source for import extraction, because ``strip_comments_only``
    preserves string bodies verbatim (needed for ``"..."`` import paths) and
    would otherwise keep the backtick body intact -- allowing fake imports
    embedded in raw string literals to produce false-positive ImportEdges.

    Usage::

        cleaned = strip_go_raw_strings(source)
        cleaned = strip_comments_only(cleaned, "go")

    Parameters
    ----------
    content:
        Go source text.

    Returns
    -------
    str
        *content* with every raw string body reduced to its newlines.  An
        unterminated raw string is emptied up to EOF and gets no closing
        backtick.
    """
    out: list[str] = []
    i = 0
    n = len(content)

    while i < n:
        ch = content[i]

        # Raw string: drop the body, keep its newlines.
        if ch == "`":
            close = content.find("`", i + 1)
            end = n if close == -1 else close
            out.append("`")
            out.append("\n" * content.count("\n", i + 1, end))
            if close == -1:
                break
            out.append("`")
            i = close + 1
            continue

        # Comments are copied verbatim; backticks inside them are just text.
        if ch == "/" and i + 1 < n:
            nxt = content[i + 1]
            if nxt == "/":
                j = content.find("\n", i + 2)
                end = n if j == -1 else j  # leave the newline to the main loop
                out.append(content[i:end])
                i = end
                continue
            if nxt == "*":
                j = content.find("*/", i + 2)
                end = n if j == -1 else j + 2
                out.append(content[i:end])
                i = end
                continue

        # Interpreted string / rune literal: copied verbatim.  Neither may
        # span lines in Go, so an unterminated one stops at the newline
        # rather than running on into later code.
        if ch in ('"', "'"):
            j = i + 1
            while j < n:
                c = content[j]
                if c == "\n":
                    break
                if c == "\\" and j + 1 < n and content[j + 1] != "\n":
                    j += 2
                    continue
                j += 1
                if c == ch:
                    break
            out.append(content[i:j])
            i = j
            continue

        out.append(ch)
        i += 1

    return "".join(out)
@@ -0,0 +1,212 @@
1
+ """Shared regex patterns for TypeScript / JavaScript adapters.
2
+
3
+ Factoring out keeps the adapter modules under the 400-line budget and
4
+ guarantees that TS and JS behave identically on overlapping syntax
5
+ (ES-module import/export, top-level class/function declarations).
6
+
7
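+ All patterns operate on *cleaned* source -- i.e. output of the ``_lexer``
8
+ comment strippers followed by ``_lexer.join_multiline_imports``. Comments
9
+ are therefore gone, which prevents ``//`` or ``/* import fake */`` from matching.
10
+ NOTE: the import patterns capture a non-empty module specifier, so they need string bodies intact (``_lexer.strip_comments_only``); ``_lexer.strip_comments_and_strings`` empties string bodies (``''`` / ``""`` / `` `` ``).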
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ import logging
16
# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)

# Public API: the compiled import-form patterns, the compiled symbol
# patterns, and the ``classify_import`` helper.
__all__ = [
    "RE_IMPORT_DEFAULT",
    "RE_IMPORT_NAMED",
    "RE_IMPORT_NAMESPACE",
    "RE_IMPORT_SIDE_EFFECT",
    "RE_IMPORT_TYPE_DEFAULT",
    "RE_IMPORT_TYPE_NAMED",
    "RE_EXPORT_FROM_NAMED",
    "RE_EXPORT_FROM_STAR",
    "RE_DYNAMIC_IMPORT",
    "RE_REQUIRE_ASSIGN",
    "RE_REQUIRE_BARE",
    "RE_SYMBOL_CLASS",
    "RE_SYMBOL_INTERFACE",
    "RE_SYMBOL_TYPE",
    "RE_SYMBOL_FUNCTION",
    "RE_SYMBOL_CONST",
    "RE_SYMBOL_ENUM",
    "classify_import",
]
38
+
39
+
40
# ---------------------------------------------------------------------------
# Import-form regex patterns (ES modules)
# ---------------------------------------------------------------------------
#
# Conventions:
#   - Statement-form patterns are anchored with ``^`` under ``re.MULTILINE``,
#     so they only match an import that starts its line (after optional
#     indentation).  A nested ``import(...)`` inside a function body is
#     caught by the dynamic-import pattern below, which is NOT line-anchored.
#   - Target module is captured as group ``module``; it must be non-empty
#     and may not contain a quote or a newline.
#   - Quotes may be single or double.  Template-literal imports are not
#     recognised here (rare; low confidence; explicit tech-debt for L6).
#   - Patterns compiled with ``re.VERBOSE`` ignore unescaped whitespace and
#     ``#`` comments inside the pattern text (character classes excepted).

# ``import X from 'Y'``  (default only)
RE_IMPORT_DEFAULT = re.compile(
    r"""^[ \t]*import\s+
    (?!type\b) # skip 'import type ...'
    [A-Za-z_$][\w$]* # default binding
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import { A, B } from 'Y'``  (named only, maybe with default prefix)
RE_IMPORT_NAMED = re.compile(
    r"""^[ \t]*import\s+
    (?!type\b)
    (?:[A-Za-z_$][\w$]*\s*,\s*)? # optional default binding
    \{[^}]*\} # named group (may be empty)
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import * as X from 'Y'``
RE_IMPORT_NAMESPACE = re.compile(
    r"""^[ \t]*import\s+
    \*\s+as\s+[A-Za-z_$][\w$]*
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import 'Y'``  (side-effect only)
RE_IMPORT_SIDE_EFFECT = re.compile(
    r"""^[ \t]*import\s+['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import type X from 'Y'``  (TS-only)
RE_IMPORT_TYPE_DEFAULT = re.compile(
    r"""^[ \t]*import\s+type\s+
    [A-Za-z_$][\w$]*
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``import type { X } from 'Y'``  (TS-only)
RE_IMPORT_TYPE_NAMED = re.compile(
    r"""^[ \t]*import\s+type\s+
    \{[^}]*\}
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``export { A, B } from 'Y'``  (re-export)
RE_EXPORT_FROM_NAMED = re.compile(
    r"""^[ \t]*export\s+
    \{[^}]*\}
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# ``export * from 'Y'``  or  ``export * as NS from 'Y'``
RE_EXPORT_FROM_STAR = re.compile(
    r"""^[ \t]*export\s+
    \*(?:\s+as\s+[A-Za-z_$][\w$]*)?
    \s+from\s+
    ['"](?P<module>[^'"\n]+)['"]\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# Dynamic ``import('Y')`` — NOT line-anchored; may appear inside expressions.
# Only a plain quoted specifier is matched; a computed argument is not.
RE_DYNAMIC_IMPORT = re.compile(
    r"""\bimport\s*\(\s*
    ['"](?P<module>[^'"\n]+)['"]
    \s*\)""",
    re.VERBOSE,
)

# CommonJS: ``const|let|var X = require('Y')``  (assignment form)
# The binding may be an identifier, an object destructure or an array
# destructure; the call must be the whole right-hand side of the line.
RE_REQUIRE_ASSIGN = re.compile(
    r"""^[ \t]*(?:const|let|var)\s+
    (?:[A-Za-z_$][\w$]*|\{[^}]*\}|\[[^\]]*\]) # binding (ident | destructure)
    \s*=\s*require\s*\(\s*
    ['"](?P<module>[^'"\n]+)['"]
    \s*\)\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)

# CommonJS bare: ``require('Y');``  (side-effect)
RE_REQUIRE_BARE = re.compile(
    r"""^[ \t]*require\s*\(\s*
    ['"](?P<module>[^'"\n]+)['"]
    \s*\)\s*;?\s*$""",
    re.MULTILINE | re.VERBOSE,
)
149
+
150
+
151
# ---------------------------------------------------------------------------
# Symbol-definition regex patterns — top-level only
# ---------------------------------------------------------------------------
#
# Top-level is approximated by requiring the declaration keyword to start at
# column zero, optionally preceded by ``export `` / ``export default ``.
# Indented declarations inside function bodies, classes, or blocks are not
# captured — this is intentional for L2 (avoids nested noise; deep parsing
# is a tree-sitter job for L6+).
#
# Every pattern exposes the declared identifier as group ``name`` and the
# optional export prefix as group ``export`` (``None`` when absent).

_EXPORT_PREFIX = r"(?P<export>export\s+(?:default\s+)?)?"
_TOPLEVEL_START = r"^" + _EXPORT_PREFIX

# ``class X`` / ``abstract class X``
RE_SYMBOL_CLASS = re.compile(
    _TOPLEVEL_START
    + r"(?:abstract\s+)?class\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)

# ``interface X``
RE_SYMBOL_INTERFACE = re.compile(
    _TOPLEVEL_START + r"interface\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)

# ``type X = ...`` / ``type X<T> = ...``
# NOTE: the type-parameter list is matched with ``<[^>]*>``, which stops at
# the first ``>``; an alias whose parameter list itself contains ``>``
# (nested generics) is not matched.
RE_SYMBOL_TYPE = re.compile(
    _TOPLEVEL_START + r"type\s+(?P<name>[A-Za-z_$][\w$]*)\s*(?:<[^>]*>)?\s*=",
    re.MULTILINE,
)

# ``function f(`` / ``async function f(`` / ``function* f(``
RE_SYMBOL_FUNCTION = re.compile(
    _TOPLEVEL_START
    + r"(?:async\s+)?function\s*\*?\s*(?P<name>[A-Za-z_$][\w$]*)\s*\(",
    re.MULTILINE,
)

# ``const|let|var x =`` or ``const|let|var x:`` (typed declaration)
RE_SYMBOL_CONST = re.compile(
    _TOPLEVEL_START
    + r"(?:const|let|var)\s+(?P<name>[A-Za-z_$][\w$]*)\s*[:=]",
    re.MULTILINE,
)

# ``enum X`` / ``const enum X``
RE_SYMBOL_ENUM = re.compile(
    _TOPLEVEL_START
    + r"(?:const\s+)?enum\s+(?P<name>[A-Za-z_$][\w$]*)",
    re.MULTILINE,
)
197
+
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # Helpers
201
+ # ---------------------------------------------------------------------------
202
+
203
def classify_import(module: str) -> str:
    """Classify an import specifier as ``"relative"`` or ``"absolute"``.

    A specifier whose first character is ``.`` (which covers both ``./x``
    and ``../x``) or ``/`` is reported as ``"relative"``.  Everything else
    -- bare package names, scoped packages such as ``@scope/pkg``,
    ``node:`` builtins, and the empty string -- is ``"absolute"``.
    """
    is_relative = module.startswith((".", "/"))
    return "relative" if is_relative else "absolute"
@@ -0,0 +1,182 @@
1
+ """Shared tree-sitter helpers for source adapters.
2
+
3
+ Provides a cached parser factory and ergonomic utilities for walking
4
+ tree-sitter parse trees. Language-neutral — designed for reuse by Go,
5
+ Java, JavaScript, TypeScript adapters and any future tree-sitter adapter.
6
+
7
+ Public API
8
+ ----------
9
+ get_ts_parser(language)
10
+ Return a cached ``tree_sitter.Parser`` initialised for *language*.
11
+ Supported language names match those accepted by
12
+ ``tree_sitter_language_pack.get_language``.
13
+
14
+ parse_bytes(language, source_bytes)
15
+ Convenience: parse *source_bytes* and return the root ``Node``.
16
+
17
+ node_text(node, source_bytes)
18
+ Decode the byte slice that corresponds to *node* in *source_bytes*.
19
+
20
+ node_line(node)
21
+ Return the 1-based source line number for *node*.
22
+
23
+ iter_named_children(node, *types)
24
+ Yield direct named children of *node* whose ``type`` is in *types*.
25
+ If no types are given, yield all named children.
26
+
27
+ walk_named(node, *types)
28
+ Depth-first generator over ALL named descendant nodes (including
29
+ *node* itself) whose ``type`` is in *types*.
30
+ If no types are given, yield all named descendants.
31
+
32
+ Verified against tree-sitter==0.25.2 / tree-sitter-language-pack==1.10.8.
33
+ Parser is constructed as ``Parser(get_language(lang))`` — the ``get_parser``
34
+ wrapper is ABI-broken on 0.25 and must NOT be used.
35
+ """
36
+ from __future__ import annotations
37
+
38
+ import logging
39
+ from functools import lru_cache
40
+ from typing import Generator
41
+
42
+ from tree_sitter import Node, Parser
43
+ from tree_sitter_language_pack import get_language
44
+
45
# Public API of this module.
__all__ = [
    "get_ts_parser",
    "parse_bytes",
    "node_text",
    "node_line",
    "iter_named_children",
    "walk_named",
]

# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Parser cache
59
+ # ---------------------------------------------------------------------------
60
+
61
@lru_cache(maxsize=32)
def get_ts_parser(language: str) -> Parser:
    """Build, or fetch from the cache, the ``Parser`` for *language*.

    The parser is constructed as ``Parser(get_language(language))``.
    ``lru_cache`` keys the result on the language string (up to 32 entries),
    so repeated calls with the same name return the same ``Parser`` object
    and the debug message below is logged only when a parser is built.

    Parameters
    ----------
    language:
        Language name understood by ``tree_sitter_language_pack``,
        e.g. ``"go"``, ``"java"``, ``"javascript"``, ``"typescript"``.

    Raises
    ------
    Exception
        Whatever ``get_language`` or ``Parser`` raises propagates unchanged.
        NOTE(review): earlier documentation names ``LookupError`` for a
        language missing from the installed pack -- confirm against the
        installed ``tree_sitter_language_pack`` version.
    """
    ts_parser = Parser(get_language(language))
    _log.debug("tree-sitter parser created for language=%r", language)
    return ts_parser
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Parse convenience
88
+ # ---------------------------------------------------------------------------
89
+
90
def parse_bytes(language: str, source_bytes: bytes) -> Node:
    """Parse *source_bytes* and hand back the root node of the tree.

    Parameters
    ----------
    language:
        Language name, passed straight to :func:`get_ts_parser`.
    source_bytes:
        Source code as bytes (the sibling helper ``node_text`` decodes
        slices of it as UTF-8).

    Returns
    -------
    Node
        ``root_node`` of the tree produced by the cached parser.
    """
    return get_ts_parser(language).parse(source_bytes).root_node
108
+
109
+
110
+ # ---------------------------------------------------------------------------
111
+ # Node utilities
112
+ # ---------------------------------------------------------------------------
113
+
114
def node_text(node: Node, source_bytes: bytes) -> str:
    """Return the text of *source_bytes* spanned by *node*.

    The slice ``[node.start_byte:node.end_byte]`` is decoded as UTF-8 with
    undecodable bytes replaced, so the result is always a ``str``.
    """
    span = source_bytes[node.start_byte:node.end_byte]
    return span.decode("utf-8", errors="replace")
120
+
121
+
122
def node_line(node: Node) -> int:
    """Return the 1-based source line on which *node* starts.

    The first element of ``node.start_point`` is used as the 0-based row;
    one is added to obtain the 1-based line number.
    """
    zero_based_row = node.start_point[0]
    return zero_based_row + 1
130
+
131
+
132
def iter_named_children(node: Node, *types: str) -> Generator[Node, None, None]:
    """Yield the direct named children of *node*, optionally filtered.

    Parameters
    ----------
    node:
        Parent node; only ``node.children`` is inspected (no recursion).
    *types:
        Node ``type`` strings to keep.  With no types given, every named
        child is yielded.

    Yields
    ------
    Node
        Children with ``is_named`` true whose ``type`` passes the filter,
        in the order they appear in ``node.children``.
    """
    wanted = frozenset(types)
    for candidate in node.children:
        # An empty filter means "accept every named child".
        if candidate.is_named and (not wanted or candidate.type in wanted):
            yield candidate
154
+
155
+
156
def walk_named(node: Node, *types: str) -> Generator[Node, None, None]:
    """Walk the subtree rooted at *node* depth-first, yielding named nodes.

    The traversal is iterative (explicit stack) and pre-order: a node is
    visited before its children, and children are visited left to right.
    *node* itself takes part and is yielded when it passes the filter.
    Unnamed nodes are never yielded, but their children are still visited.

    Parameters
    ----------
    node:
        Starting node (included in traversal).
    *types:
        Node ``type`` strings to keep.  With no types given, every named
        node is yielded.

    Yields
    ------
    Node
        Named nodes whose ``type`` passes the filter.
    """
    wanted = frozenset(types)
    pending = [node]
    while pending:
        here = pending.pop()
        if here.is_named and (not wanted or here.type in wanted):
            yield here
        # Reversed push: the leftmost child ends up on top of the stack and
        # is therefore popped (visited) first.
        pending.extend(reversed(here.children))