@optave/codegraph 3.9.5 → 3.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/dist/ast-analysis/engine.d.ts.map +1 -1
  2. package/dist/ast-analysis/engine.js +3 -2
  3. package/dist/ast-analysis/engine.js.map +1 -1
  4. package/dist/ast-analysis/rules/csharp.d.ts.map +1 -1
  5. package/dist/ast-analysis/rules/csharp.js +8 -1
  6. package/dist/ast-analysis/rules/csharp.js.map +1 -1
  7. package/dist/ast-analysis/rules/go.d.ts.map +1 -1
  8. package/dist/ast-analysis/rules/go.js +4 -1
  9. package/dist/ast-analysis/rules/go.js.map +1 -1
  10. package/dist/ast-analysis/rules/index.d.ts +6 -0
  11. package/dist/ast-analysis/rules/index.d.ts.map +1 -1
  12. package/dist/ast-analysis/rules/index.js +151 -4
  13. package/dist/ast-analysis/rules/index.js.map +1 -1
  14. package/dist/ast-analysis/rules/java.d.ts.map +1 -1
  15. package/dist/ast-analysis/rules/java.js +5 -1
  16. package/dist/ast-analysis/rules/java.js.map +1 -1
  17. package/dist/ast-analysis/rules/php.d.ts.map +1 -1
  18. package/dist/ast-analysis/rules/php.js +6 -1
  19. package/dist/ast-analysis/rules/php.js.map +1 -1
  20. package/dist/ast-analysis/rules/python.d.ts.map +1 -1
  21. package/dist/ast-analysis/rules/python.js +5 -1
  22. package/dist/ast-analysis/rules/python.js.map +1 -1
  23. package/dist/ast-analysis/rules/ruby.d.ts.map +1 -1
  24. package/dist/ast-analysis/rules/ruby.js +4 -1
  25. package/dist/ast-analysis/rules/ruby.js.map +1 -1
  26. package/dist/ast-analysis/rules/rust.d.ts.map +1 -1
  27. package/dist/ast-analysis/rules/rust.js +5 -1
  28. package/dist/ast-analysis/rules/rust.js.map +1 -1
  29. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts +2 -1
  30. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts.map +1 -1
  31. package/dist/ast-analysis/visitors/ast-store-visitor.js +129 -37
  32. package/dist/ast-analysis/visitors/ast-store-visitor.js.map +1 -1
  33. package/dist/domain/graph/builder/pipeline.d.ts.map +1 -1
  34. package/dist/domain/graph/builder/pipeline.js +14 -2
  35. package/dist/domain/graph/builder/pipeline.js.map +1 -1
  36. package/dist/domain/parser.d.ts +40 -0
  37. package/dist/domain/parser.d.ts.map +1 -1
  38. package/dist/domain/parser.js +104 -0
  39. package/dist/domain/parser.js.map +1 -1
  40. package/dist/domain/search/models.js +1 -1
  41. package/dist/domain/wasm-worker-entry.js +3 -2
  42. package/dist/domain/wasm-worker-entry.js.map +1 -1
  43. package/dist/features/ast.d.ts.map +1 -1
  44. package/dist/features/ast.js +11 -9
  45. package/dist/features/ast.js.map +1 -1
  46. package/grammars/tree-sitter-erlang.wasm +0 -0
  47. package/package.json +7 -7
  48. package/src/ast-analysis/engine.ts +11 -1
  49. package/src/ast-analysis/rules/csharp.ts +8 -1
  50. package/src/ast-analysis/rules/go.ts +4 -1
  51. package/src/ast-analysis/rules/index.ts +181 -4
  52. package/src/ast-analysis/rules/java.ts +5 -1
  53. package/src/ast-analysis/rules/php.ts +6 -1
  54. package/src/ast-analysis/rules/python.ts +5 -1
  55. package/src/ast-analysis/rules/ruby.ts +4 -1
  56. package/src/ast-analysis/rules/rust.ts +5 -1
  57. package/src/ast-analysis/visitors/ast-store-visitor.ts +129 -34
  58. package/src/domain/graph/builder/pipeline.ts +24 -4
  59. package/src/domain/parser.ts +122 -0
  60. package/src/domain/search/models.ts +1 -1
  61. package/src/domain/wasm-worker-entry.ts +11 -1
  62. package/src/features/ast.ts +22 -9
@@ -73,10 +73,187 @@ export const DATAFLOW_RULES: Map<string, DataflowRulesConfig> = new Map([
73
73
  ['ruby', ruby.dataflow],
74
74
  ]);
75
75
 
76
- // ─── AST Type Maps ───────────────────────────────────────────────────────
76
+ // ─── AST Node Type Maps ──────────────────────────────────────────────────
77
+ //
78
+ // These mirror the per-language `LangAstConfig` constants in the native Rust
79
+ // engine (`crates/codegraph-core/src/extractors/helpers.rs`). WASM and native
80
+ // must agree on which tree-sitter node types to emit as `ast_nodes` rows.
81
+ // Languages without a dedicated rules/*.ts file have their maps inlined here.
82
+
83
+ const JS_AST_TYPES = javascript.astTypes as Record<string, string>;
84
+ const PY_AST_TYPES = python.astTypes as Record<string, string>;
85
+ const GO_AST_TYPES = go.astTypes as Record<string, string>;
86
+ const RS_AST_TYPES = rust.astTypes as Record<string, string>;
87
+ const JAVA_AST_TYPES = java.astTypes as Record<string, string>;
88
+ const CS_AST_TYPES = csharp.astTypes as Record<string, string>;
89
+ const RB_AST_TYPES = ruby.astTypes as Record<string, string>;
90
+ const PHP_AST_TYPES = php.astTypes as Record<string, string>;
91
+
92
+ const C_AST_TYPES: Record<string, string> = {
93
+ string_literal: 'string',
94
+ };
95
+
96
+ const CPP_AST_TYPES: Record<string, string> = {
97
+ new_expression: 'new',
98
+ throw_statement: 'throw',
99
+ co_await_expression: 'await',
100
+ string_literal: 'string',
101
+ raw_string_literal: 'string',
102
+ };
103
+
104
+ const KOTLIN_AST_TYPES: Record<string, string> = {
105
+ throw_expression: 'throw',
106
+ string_literal: 'string',
107
+ };
108
+
109
+ const SWIFT_AST_TYPES: Record<string, string> = {
110
+ throw_statement: 'throw',
111
+ await_expression: 'await',
112
+ string_literal: 'string',
113
+ };
114
+
115
+ const SCALA_AST_TYPES: Record<string, string> = {
116
+ object_creation_expression: 'new',
117
+ throw_expression: 'throw',
118
+ string_literal: 'string',
119
+ };
120
+
121
+ const BASH_AST_TYPES: Record<string, string> = {
122
+ string: 'string',
123
+ expansion: 'string',
124
+ };
125
+
126
+ const ELIXIR_AST_TYPES: Record<string, string> = {
127
+ string: 'string',
128
+ sigil: 'regex',
129
+ };
130
+
131
+ const LUA_AST_TYPES: Record<string, string> = {
132
+ string: 'string',
133
+ };
134
+
135
+ const DART_AST_TYPES: Record<string, string> = {
136
+ new_expression: 'new',
137
+ constructor_invocation: 'new',
138
+ throw_expression: 'throw',
139
+ await_expression: 'await',
140
+ string_literal: 'string',
141
+ };
142
+
143
+ const ZIG_AST_TYPES: Record<string, string> = {
144
+ string_literal: 'string',
145
+ };
146
+
147
+ const HASKELL_AST_TYPES: Record<string, string> = {
148
+ string: 'string',
149
+ char: 'string',
150
+ };
151
+
152
+ const OCAML_AST_TYPES: Record<string, string> = {
153
+ string: 'string',
154
+ };
77
155
 
78
156
  export const AST_TYPE_MAPS: Map<string, Record<string, string>> = new Map([
79
- ['javascript', javascript.astTypes as Record<string, string>],
80
- ['typescript', javascript.astTypes as Record<string, string>],
81
- ['tsx', javascript.astTypes as Record<string, string>],
157
+ ['javascript', JS_AST_TYPES],
158
+ ['typescript', JS_AST_TYPES],
159
+ ['tsx', JS_AST_TYPES],
160
+ ['python', PY_AST_TYPES],
161
+ ['go', GO_AST_TYPES],
162
+ ['rust', RS_AST_TYPES],
163
+ ['java', JAVA_AST_TYPES],
164
+ ['csharp', CS_AST_TYPES],
165
+ ['ruby', RB_AST_TYPES],
166
+ ['php', PHP_AST_TYPES],
167
+ ['c', C_AST_TYPES],
168
+ ['cpp', CPP_AST_TYPES],
169
+ ['kotlin', KOTLIN_AST_TYPES],
170
+ ['swift', SWIFT_AST_TYPES],
171
+ ['scala', SCALA_AST_TYPES],
172
+ ['bash', BASH_AST_TYPES],
173
+ ['elixir', ELIXIR_AST_TYPES],
174
+ ['lua', LUA_AST_TYPES],
175
+ ['dart', DART_AST_TYPES],
176
+ ['zig', ZIG_AST_TYPES],
177
+ ['haskell', HASKELL_AST_TYPES],
178
+ ['ocaml', OCAML_AST_TYPES],
179
+ ['ocaml-interface', OCAML_AST_TYPES],
180
+ ]);
181
+
182
+ // ─── Per-language string-extraction config ───────────────────────────────
183
+ //
184
+ // Mirrors `quote_chars` + `string_prefixes` in the native `LangAstConfig`.
185
+ // Used by the AST-store visitor to strip quote characters and language-
186
+ // specific prefix sigils (Python `r"..."`, C# verbatim `@"..."`, Rust raw
187
+ // `r#"..."#`, etc.) when computing string content for the `name` column.
188
+
189
+ export interface AstStringConfig {
190
+ quoteChars: string;
191
+ stringPrefixes: string;
192
+ }
193
+
194
+ const JS_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"`', stringPrefixes: '' };
195
+ const PY_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: 'rbfuRBFU' };
196
+ const GO_STRING_CONFIG: AstStringConfig = { quoteChars: '"`', stringPrefixes: '' };
197
+ const RS_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
198
+ const JAVA_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
199
+ const CS_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
200
+ const RB_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' };
201
+ const PHP_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' };
202
+ const C_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
203
+ const CPP_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: 'LuUR' };
204
+ const KOTLIN_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
205
+ const SWIFT_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
206
+ const SCALA_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
207
+ const BASH_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' };
208
+ const ELIXIR_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
209
+ const LUA_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' };
210
+ const DART_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' };
211
+ const ZIG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
212
+ const HASKELL_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' };
213
+ const OCAML_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
214
+
215
+ export const AST_STRING_CONFIGS: Map<string, AstStringConfig> = new Map([
216
+ ['javascript', JS_STRING_CONFIG],
217
+ ['typescript', JS_STRING_CONFIG],
218
+ ['tsx', JS_STRING_CONFIG],
219
+ ['python', PY_STRING_CONFIG],
220
+ ['go', GO_STRING_CONFIG],
221
+ ['rust', RS_STRING_CONFIG],
222
+ ['java', JAVA_STRING_CONFIG],
223
+ ['csharp', CS_STRING_CONFIG],
224
+ ['ruby', RB_STRING_CONFIG],
225
+ ['php', PHP_STRING_CONFIG],
226
+ ['c', C_STRING_CONFIG],
227
+ ['cpp', CPP_STRING_CONFIG],
228
+ ['kotlin', KOTLIN_STRING_CONFIG],
229
+ ['swift', SWIFT_STRING_CONFIG],
230
+ ['scala', SCALA_STRING_CONFIG],
231
+ ['bash', BASH_STRING_CONFIG],
232
+ ['elixir', ELIXIR_STRING_CONFIG],
233
+ ['lua', LUA_STRING_CONFIG],
234
+ ['dart', DART_STRING_CONFIG],
235
+ ['zig', ZIG_STRING_CONFIG],
236
+ ['haskell', HASKELL_STRING_CONFIG],
237
+ ['ocaml', OCAML_STRING_CONFIG],
238
+ ['ocaml-interface', OCAML_STRING_CONFIG],
82
239
  ]);
240
+
241
+ // ─── Per-language "stop-after-collect" kinds ─────────────────────────────
242
+ //
243
+ // Mirrors the subtle difference between the native JS walker
244
+ // (`extractors/javascript.rs::walk_ast_nodes_depth`) — which *returns* after
245
+ // collecting `new_expression` and `throw_statement` to avoid double-counting
246
+ // the wrapped expression — and the generic walker
247
+ // (`helpers.rs::walk_ast_nodes_with_config_depth`), which always recurses. For WASM/native parity
248
+ // the JS family must skip recursion on `new` and `throw`; every other
249
+ // language recurses normally.
250
+
251
/** Kinds after which the JS-family walker does not descend into children. */
const JS_STOP_RECURSE: ReadonlySet<string> = new Set(['new', 'throw']);
/** Shared empty set returned for every language that always recurses. */
const EMPTY_STOP_RECURSE: ReadonlySet<string> = new Set();

/**
 * Look up the AST kinds whose children must not be visited for a language.
 *
 * @param langId - Language identifier; matched case-sensitively.
 * @returns The shared `new`/`throw` set for `javascript`, `typescript` and
 *   `tsx`; the shared empty set for any other identifier. The same set
 *   instances are returned on every call, so callers must not mutate them.
 */
export function astStopRecurseKinds(langId: string): ReadonlySet<string> {
  switch (langId) {
    case 'javascript':
    case 'typescript':
    case 'tsx':
      return JS_STOP_RECURSE;
    default:
      return EMPTY_STOP_RECURSE;
  }
}
@@ -174,4 +174,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({
174
174
 
175
175
  // ─── AST Node Types ───────────────────────────────────────────────────────
176
176
 
177
- export const astTypes: Record<string, string> | null = null;
177
+ export const astTypes: Record<string, string> | null = {
178
+ object_creation_expression: 'new',
179
+ throw_statement: 'throw',
180
+ string_literal: 'string',
181
+ };
@@ -218,4 +218,9 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({
218
218
 
219
219
  // ─── AST Node Types ───────────────────────────────────────────────────────
220
220
 
221
- export const astTypes: Record<string, string> | null = null;
221
+ export const astTypes: Record<string, string> | null = {
222
+ object_creation_expression: 'new',
223
+ throw_expression: 'throw',
224
+ string: 'string',
225
+ encapsed_string: 'string',
226
+ };
@@ -195,4 +195,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({
195
195
 
196
196
  // ─── AST Node Types ───────────────────────────────────────────────────────
197
197
 
198
- export const astTypes: Record<string, string> | null = null;
198
+ export const astTypes: Record<string, string> | null = {
199
+ raise_statement: 'throw',
200
+ await: 'await',
201
+ string: 'string',
202
+ };
@@ -203,4 +203,7 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({
203
203
 
204
204
  // ─── AST Node Types ───────────────────────────────────────────────────────
205
205
 
206
- export const astTypes: Record<string, string> | null = null;
206
+ export const astTypes: Record<string, string> | null = {
207
+ string: 'string',
208
+ regex: 'regex',
209
+ };
@@ -172,4 +172,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({
172
172
 
173
173
  // ─── AST Node Types ───────────────────────────────────────────────────────
174
174
 
175
- export const astTypes: Record<string, string> | null = null;
175
+ export const astTypes: Record<string, string> | null = {
176
+ await_expression: 'await',
177
+ string_literal: 'string',
178
+ raw_string_literal: 'string',
179
+ };
@@ -5,9 +5,42 @@ import type {
5
5
  Visitor,
6
6
  VisitorContext,
7
7
  } from '../../types.js';
8
+ import type { AstStringConfig } from '../rules/index.js';
8
9
 
9
10
  const TEXT_MAX = 200;
10
11
 
12
+ // ── Cross-language node-type constants (mirror Rust `helpers.rs`) ────────
13
+ const IDENT_TYPES = new Set<string>([
14
+ 'identifier',
15
+ 'type_identifier',
16
+ 'name',
17
+ 'qualified_name',
18
+ 'scoped_identifier',
19
+ 'qualified_identifier',
20
+ 'member_expression',
21
+ 'member_access_expression',
22
+ 'field_expression',
23
+ 'attribute',
24
+ 'scoped_type_identifier',
25
+ ]);
26
+
27
+ const CALL_TYPES = new Set<string>([
28
+ 'call_expression',
29
+ 'call',
30
+ 'invocation_expression',
31
+ 'method_invocation',
32
+ 'function_call_expression',
33
+ 'member_call_expression',
34
+ 'scoped_call_expression',
35
+ ]);
36
+
37
+ const DEFAULT_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"`', stringPrefixes: '' };
38
+
39
+ // Keyword tokens skipped when extracting the inner expression text of a
40
+ // throw/raise/await/new node. Module-level constant avoids reallocating on
41
+ // every call (can be hot in large files).
42
+ const CHILD_EXPR_SKIP_KEYWORDS = new Set<string>(['throw', 'raise', 'await', 'new']);
43
+
11
44
  interface AstStoreRow {
12
45
  file: string;
13
46
  line: number;
@@ -20,69 +53,122 @@ interface AstStoreRow {
20
53
 
21
54
  function truncate(s: string | null | undefined, max: number = TEXT_MAX): string | null {
22
55
  if (!s) return null;
23
- return s.length <= max ? s : `${s.slice(0, max - 1)}\u2026`;
56
+ return s.length <= max ? s : `${s.slice(0, max - 1)}…`;
57
+ }
58
+
59
/**
 * Strip every leading character of `s` that appears in `chars`.
 *
 * `chars` is treated as a set of single UTF-16 code units, not as a prefix
 * string: `trimLeadingChars('r#"x"#', 'r#')` removes both `r` and `#`.
 *
 * @param s - Input text.
 * @param chars - Characters to strip; an empty string disables trimming.
 * @returns `s` itself when nothing was stripped, otherwise the remaining suffix.
 */
function trimLeadingChars(s: string, chars: string): string {
  if (chars.length === 0) return s;
  let start = 0;
  for (; start < s.length; start++) {
    if (!chars.includes(s.charAt(start))) break;
  }
  return start > 0 ? s.slice(start) : s;
}
25
65
 
26
- function extractNewName(node: TreeSitterNode): string {
66
/**
 * Strip every trailing character of `s` that appears in `chars`.
 *
 * `chars` is treated as a set of single UTF-16 code units, not as a suffix
 * string.
 *
 * @param s - Input text.
 * @param chars - Characters to strip; an empty string disables trimming.
 * @returns `s` itself when nothing was stripped, otherwise the remaining prefix.
 */
function trimTrailingChars(s: string, chars: string): string {
  if (chars.length === 0) return s;
  let end = s.length;
  for (; end > 0; end--) {
    if (!chars.includes(s.charAt(end - 1))) break;
  }
  return end < s.length ? s.slice(0, end) : s;
}
72
+
73
+ /** Extract constructor name from a `new_expression` / `object_creation_expression`. */
74
+ function extractConstructorName(node: TreeSitterNode): string {
75
+ for (const field of ['type', 'class', 'constructor']) {
76
+ const f = node.childForFieldName(field);
77
+ if (f?.text) return f.text;
78
+ }
27
79
  for (let i = 0; i < node.childCount; i++) {
28
80
  const child = node.child(i);
29
81
  if (!child) continue;
30
- if (child.type === 'identifier') return child.text;
31
- if (child.type === 'member_expression') return child.text;
82
+ if (IDENT_TYPES.has(child.type)) return child.text;
83
+ }
84
+ const raw = node.text || '';
85
+ const beforeParen = raw.split('(')[0] || raw;
86
+ return beforeParen.replace(/^new\s+/, '').trim() || '?';
87
+ }
88
+
89
+ /** Extract function name from a call node. */
90
+ function extractCallName(node: TreeSitterNode): string {
91
+ for (const field of ['function', 'method', 'name']) {
92
+ const f = node.childForFieldName(field);
93
+ if (f?.text) return f.text;
32
94
  }
33
- return node.text?.split('(')[0]?.replace('new ', '').trim() || '?';
95
+ const text = node.text || '';
96
+ return text.split('(')[0] || '?';
34
97
  }
35
98
 
36
- function extractExpressionText(node: TreeSitterNode): string | null {
99
+ /** Extract name from a throw/raise statement — matches native `extract_throw_target`. */
100
+ function extractThrowName(node: TreeSitterNode, newTypes: Set<string>): string {
37
101
  for (let i = 0; i < node.childCount; i++) {
38
102
  const child = node.child(i);
39
103
  if (!child) continue;
40
- if (child.type !== 'throw' && child.type !== 'await') {
41
- return truncate(child.text);
42
- }
104
+ const ck = child.type;
105
+ if (newTypes.has(ck)) return extractConstructorName(child);
106
+ if (CALL_TYPES.has(ck)) return extractCallName(child);
107
+ if (IDENT_TYPES.has(ck)) return child.text;
43
108
  }
44
- return truncate(node.text);
109
+ return truncate(node.text) ?? node.text ?? '';
45
110
  }
46
111
 
47
- /** Extract the name from a throw statement's child nodes. */
48
- function extractThrowName(node: TreeSitterNode): string | null {
112
+ /** Extract name from an await expression — matches native `extract_awaited_name`. */
113
+ function extractAwaitName(node: TreeSitterNode): string {
49
114
  for (let i = 0; i < node.childCount; i++) {
50
115
  const child = node.child(i);
51
116
  if (!child) continue;
52
- if (child.type === 'new_expression') return extractNewName(child);
53
- if (child.type === 'call_expression') {
54
- const fn = child.childForFieldName('function');
55
- return fn ? fn.text : child.text?.split('(')[0] || '?';
56
- }
57
- if (child.type === 'identifier') return child.text;
117
+ const ck = child.type;
118
+ if (CALL_TYPES.has(ck)) return extractCallName(child);
119
+ if (IDENT_TYPES.has(ck)) return child.text;
58
120
  }
59
- return truncate(node.text);
121
+ return truncate(node.text) ?? node.text ?? '';
60
122
  }
61
123
 
62
- /** Extract the name from an await expression's child nodes. */
63
- function extractAwaitName(node: TreeSitterNode): string | null {
124
+ /** Extract text of the expression inside a throw/raise/await/new node, skipping the leading keyword token. */
125
+ function extractChildExpressionText(node: TreeSitterNode): string | null {
64
126
  for (let i = 0; i < node.childCount; i++) {
65
127
  const child = node.child(i);
66
128
  if (!child) continue;
67
- if (child.type === 'call_expression') {
68
- const fn = child.childForFieldName('function');
69
- return fn ? fn.text : child.text?.split('(')[0] || '?';
70
- }
71
- if (child.type === 'identifier' || child.type === 'member_expression') {
72
- return child.text;
73
- }
129
+ if (!CHILD_EXPR_SKIP_KEYWORDS.has(child.type)) return truncate(child.text);
74
130
  }
75
131
  return truncate(node.text);
76
132
  }
77
133
 
134
/**
 * Extract string content from a string-literal node, mirroring the native
 * engine's `build_string_node` (`helpers.rs`).
 *
 * Stripping order (each step removes a *set* of characters, not a literal
 * prefix/suffix):
 *   1. leading `@` (C#-style verbatim marker),
 *   2. leading language prefixes from `cfg.stringPrefixes`,
 *   3. for node types containing `raw_string`: leading `r` / `#`,
 *   4. leading quote characters from `cfg.quoteChars`,
 *   5. for raw strings: trailing `#`,
 *   6. trailing quote characters.
 *
 * @param node - String-literal node; only `text` and `type` are read.
 * @param cfg - Per-language quote characters and prefix sigils.
 * @returns The stripped content, or `null` when it is shorter than 2 Unicode
 *   code points.
 */
function extractStringContent(node: TreeSitterNode, cfg: AstStringConfig): string | null {
  const raw = node.text ?? '';
  const isRawString = node.type.includes('raw_string');

  let s = raw;
  s = trimLeadingChars(s, '@');
  s = trimLeadingChars(s, cfg.stringPrefixes);
  if (isRawString) s = trimLeadingChars(s, 'r#');
  s = trimLeadingChars(s, cfg.quoteChars);
  if (isRawString) s = trimTrailingChars(s, '#');
  s = trimTrailingChars(s, cfg.quoteChars);

  // The threshold is in code points, not UTF-16 code units (matches Rust
  // `chars().count()`). A code point spans at most two code units, so:
  //   length < 2   → always fewer than 2 code points,
  //   length >= 3  → always at least 2 code points,
  //   length === 2 → ambiguous (one surrogate pair vs. two BMP chars).
  // Only the ambiguous case needs an actual code-point count, which avoids
  // spreading every (possibly long) string literal into an array.
  if (s.length < 2) return null;
  if (s.length === 2 && [...s].length < 2) return null;
  return s;
}
156
+
78
157
  export function createAstStoreVisitor(
79
158
  astTypeMap: Record<string, string>,
80
159
  defs: Definition[],
81
160
  relPath: string,
82
161
  nodeIdMap: Map<string, number>,
162
+ stringConfig: AstStringConfig = DEFAULT_STRING_CONFIG,
163
+ stopRecurseKinds: ReadonlySet<string> = new Set(),
83
164
  ): Visitor {
84
165
  const rows: AstStoreRow[] = [];
85
166
  const matched = new Set<number>();
167
+ const newTypes = new Set<string>(
168
+ Object.entries(astTypeMap)
169
+ .filter(([, kind]) => kind === 'new')
170
+ .map(([type]) => type),
171
+ );
86
172
 
87
173
  function findParentDef(line: number): Definition | null {
88
174
  let best: Definition | null = null;
@@ -106,12 +192,15 @@ export function createAstStoreVisitor(
106
192
  type KindHandler = (node: TreeSitterNode) => NameTextResult;
107
193
 
108
194
  const kindHandlers: Record<string, KindHandler> = {
109
- new: (node) => ({ name: extractNewName(node), text: truncate(node.text) }),
110
- throw: (node) => ({ name: extractThrowName(node), text: extractExpressionText(node) }),
111
- await: (node) => ({ name: extractAwaitName(node), text: extractExpressionText(node) }),
195
+ new: (node) => ({ name: extractConstructorName(node), text: truncate(node.text) }),
196
+ throw: (node) => ({
197
+ name: extractThrowName(node, newTypes),
198
+ text: extractChildExpressionText(node),
199
+ }),
200
+ await: (node) => ({ name: extractAwaitName(node), text: extractChildExpressionText(node) }),
112
201
  string: (node) => {
113
- const content = node.text?.replace(/^['"`]|['"`]$/g, '') || '';
114
- if (content.length < 2) return { name: null, text: null, skip: true };
202
+ const content = extractStringContent(node, stringConfig);
203
+ if (content == null) return { name: null, text: null, skip: true };
115
204
  return { name: truncate(content, 100), text: truncate(node.text) };
116
205
  },
117
206
  regex: (node) => ({ name: node.text || '?', text: truncate(node.text) }),
@@ -156,7 +245,13 @@ export function createAstStoreVisitor(
156
245
 
157
246
  collectNode(node, kind);
158
247
 
159
- if (kind !== 'string' && kind !== 'regex') {
248
+ // Mirror the native walker's recursion policy. In JS/TS, the native
249
+ // javascript.rs walker returns after collecting `new` or `throw` to
250
+ // avoid double-counting the wrapped expression (e.g. `throw new
251
+ // Error('x')` emits one `throw` row, not throw+new+string). Other
252
+ // languages go through helpers.rs::walk_ast_nodes_with_config_depth
253
+ // which always recurses — so `stopRecurseKinds` is empty for them.
254
+ if (stopRecurseKinds.has(kind)) {
160
255
  return { skipChildren: true };
161
256
  }
162
257
  },
@@ -32,7 +32,13 @@ import type {
32
32
  ExtractorOutput,
33
33
  SqliteStatement,
34
34
  } from '../../../types.js';
35
- import { getActiveEngine, getInstalledWasmExtensions, parseFilesAuto } from '../../parser.js';
35
+ import {
36
+ classifyNativeDrops,
37
+ formatDropExtensionSummary,
38
+ getActiveEngine,
39
+ getInstalledWasmExtensions,
40
+ parseFilesAuto,
41
+ } from '../../parser.js';
36
42
  import { setWorkspaces } from '../resolve.js';
37
43
  import { PipelineContext } from './context.js';
38
44
  import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
@@ -761,18 +767,32 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
761
767
  // minimal installs) can't be parsed by either engine, so they're not a
762
768
  // native regression — excluding them keeps the warn count meaningful.
763
769
  const installedExts = getInstalledWasmExtensions();
770
+ const missingRel: string[] = [];
764
771
  const missingAbs: string[] = [];
765
772
  for (const rel of expected) {
766
773
  if (existing.has(rel)) continue;
767
774
  const ext = path.extname(rel).toLowerCase();
768
775
  if (!installedExts.has(ext)) continue;
776
+ missingRel.push(rel);
769
777
  missingAbs.push(path.join(ctx.rootDir, rel));
770
778
  }
771
779
  if (missingAbs.length === 0) return;
772
780
 
773
- warn(
774
- `Native orchestrator dropped ${missingAbs.length} file(s); backfilling via WASM for engine parity`,
775
- );
781
+ // Classify drops so users see per-extension reasons instead of just a count
782
+ // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
783
+ // extractor); `native-extractor-failure` indicates a real native bug since
784
+ // the language IS supported by the addon yet the file was dropped anyway.
785
+ const { byReason, totals } = classifyNativeDrops(missingRel);
786
+ if (totals['unsupported-by-native'] > 0) {
787
+ info(
788
+ `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`,
789
+ );
790
+ }
791
+ if (totals['native-extractor-failure'] > 0) {
792
+ warn(
793
+ `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`,
794
+ );
795
+ }
776
796
  const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
777
797
 
778
798
  const rows: unknown[][] = [];
@@ -412,6 +412,128 @@ export function getInstalledWasmExtensions(): Set<string> {
412
412
  return exts;
413
413
  }
414
414
 
415
+ /**
416
+ * Lowercase file extensions covered by the native Rust addon.
417
+ *
418
+ * Mirrors `LanguageKind::from_extension` in
419
+ * `crates/codegraph-core/src/parser_registry.rs`. Used to classify why the
420
+ * native orchestrator dropped a file: extensions outside this set are a
421
+ * legitimate parser limit (no Rust extractor exists), while extensions inside
422
+ * it indicate a real native bug (parse/read/extract failure).
423
+ *
424
+ * Keep this list in sync with the Rust enum — the native addon is a separate
425
+ * npm package, so JS has no runtime way to discover its language coverage.
426
+ */
427
+ export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set([
428
+ '.js',
429
+ '.jsx',
430
+ '.mjs',
431
+ '.cjs',
432
+ '.ts',
433
+ '.tsx',
434
+ '.py',
435
+ '.pyi',
436
+ '.tf',
437
+ '.hcl',
438
+ '.go',
439
+ '.rs',
440
+ '.java',
441
+ '.cs',
442
+ '.rb',
443
+ '.rake',
444
+ '.gemspec',
445
+ '.php',
446
+ '.phtml',
447
+ '.c',
448
+ '.h',
449
+ '.cpp',
450
+ '.cc',
451
+ '.cxx',
452
+ '.hpp',
453
+ '.kt',
454
+ '.kts',
455
+ '.swift',
456
+ '.scala',
457
+ '.sh',
458
+ '.bash',
459
+ '.ex',
460
+ '.exs',
461
+ '.lua',
462
+ '.dart',
463
+ '.zig',
464
+ '.hs',
465
+ '.ml',
466
+ '.mli',
467
+ ]);
468
+
469
+ /**
470
+ * Classification for a file the native orchestrator dropped.
471
+ * - `unsupported-by-native`: extension has no Rust extractor (legitimate parser limit).
472
+ * - `native-extractor-failure`: extension is supported by native but the file was
473
+ * still dropped — points at a real bug (read error, parse failure, extractor crash).
474
+ */
475
+ export type NativeDropReason = 'unsupported-by-native' | 'native-extractor-failure';
476
+
477
+ export interface NativeDropClassification {
478
+ /** Per-reason → per-extension → list of relative paths that hit that bucket. */
479
+ byReason: Record<NativeDropReason, Map<string, string[]>>;
480
+ /** Total file count per reason. */
481
+ totals: Record<NativeDropReason, number>;
482
+ }
483
+
484
/**
 * Bucket dropped files by drop reason and then by lowercase file extension.
 *
 * A path whose extension is in `NATIVE_SUPPORTED_EXTENSIONS` is counted as
 * `native-extractor-failure`; any other path is `unsupported-by-native`.
 * Performs no I/O — only the path strings are inspected.
 *
 * @param relPaths - Relative paths of the files that were dropped.
 * @returns Per-reason extension → paths maps (paths kept in input order)
 *   together with the per-reason file totals.
 */
export function classifyNativeDrops(relPaths: Iterable<string>): NativeDropClassification {
  const result: NativeDropClassification = {
    byReason: {
      'unsupported-by-native': new Map<string, string[]>(),
      'native-extractor-failure': new Map<string, string[]>(),
    },
    totals: {
      'unsupported-by-native': 0,
      'native-extractor-failure': 0,
    },
  };

  for (const relPath of relPaths) {
    const extension = path.extname(relPath).toLowerCase();

    let reason: NativeDropReason = 'unsupported-by-native';
    if (NATIVE_SUPPORTED_EXTENSIONS.has(extension)) {
      reason = 'native-extractor-failure';
    }

    // Append to the extension's list, creating it on first sight.
    const perExtension = result.byReason[reason];
    const seen = perExtension.get(extension);
    if (seen) {
      seen.push(relPath);
    } else {
      perExtension.set(extension, [relPath]);
    }
    result.totals[reason] += 1;
  }

  return result;
}
514
+
515
/**
 * Render `{ ext → paths[] }` buckets as a single log-friendly string of
 * `ext (n: sample, sample, ...)` slices joined by `'; '`.
 *
 * - At most 6 extensions are listed; any remainder is summarised as
 *   `+N more extension(s)`.
 * - At most 3 sample paths are shown per extension; any remainder is
 *   summarised as `+N more`.
 * - Extensions are ordered by descending file count; ties keep the map's
 *   insertion order (`Array.prototype.sort` is stable).
 * - An empty extension key (files such as `Makefile`, for which
 *   `path.extname` yields `''`) is labelled `(no extension)` instead of
 *   producing an unlabeled entry.
 *
 * Pure function — the input map and its arrays are not modified.
 *
 * @param buckets - Map from extension (including the leading dot) to the
 *   paths that fell into that bucket.
 * @returns The formatted summary, or `''` when `buckets` is empty.
 */
export function formatDropExtensionSummary(buckets: Map<string, string[]>): string {
  const MAX_EXTS = 6;
  const MAX_SAMPLES = 3;
  // Sort a copy so the caller's map order is left untouched.
  const entries = Array.from(buckets.entries()).sort((a, b) => b[1].length - a[1].length);
  const shown = entries.slice(0, MAX_EXTS).map(([ext, paths]) => {
    const label = ext === '' ? '(no extension)' : ext;
    const sample = paths.slice(0, MAX_SAMPLES).join(', ');
    const hidden = paths.length - MAX_SAMPLES;
    const more = hidden > 0 ? `, +${hidden} more` : '';
    return `${label} (${paths.length}: ${sample}${more})`;
  });
  if (entries.length > MAX_EXTS) {
    shown.push(`+${entries.length - MAX_EXTS} more extension(s)`);
  }
  return shown.join('; ');
}
536
+
415
537
  // ── Unified API ──────────────────────────────────────────────────────────────
416
538
 
417
539
  function resolveEngine(opts: ParseEngineOpts = {}): ResolvedEngine {
@@ -253,7 +253,7 @@ export async function embed(
253
253
  }
254
254
 
255
255
  if (texts.length > batchSize) {
256
- process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`);
256
+ process.stderr.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`);
257
257
  }
258
258
  }
259
259