@optave/codegraph 3.9.4 → 3.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/README.md +10 -10
  2. package/dist/ast-analysis/engine.d.ts.map +1 -1
  3. package/dist/ast-analysis/engine.js +3 -2
  4. package/dist/ast-analysis/engine.js.map +1 -1
  5. package/dist/ast-analysis/rules/csharp.d.ts.map +1 -1
  6. package/dist/ast-analysis/rules/csharp.js +8 -1
  7. package/dist/ast-analysis/rules/csharp.js.map +1 -1
  8. package/dist/ast-analysis/rules/go.d.ts.map +1 -1
  9. package/dist/ast-analysis/rules/go.js +4 -1
  10. package/dist/ast-analysis/rules/go.js.map +1 -1
  11. package/dist/ast-analysis/rules/index.d.ts +6 -0
  12. package/dist/ast-analysis/rules/index.d.ts.map +1 -1
  13. package/dist/ast-analysis/rules/index.js +151 -4
  14. package/dist/ast-analysis/rules/index.js.map +1 -1
  15. package/dist/ast-analysis/rules/java.d.ts.map +1 -1
  16. package/dist/ast-analysis/rules/java.js +5 -1
  17. package/dist/ast-analysis/rules/java.js.map +1 -1
  18. package/dist/ast-analysis/rules/php.d.ts.map +1 -1
  19. package/dist/ast-analysis/rules/php.js +6 -1
  20. package/dist/ast-analysis/rules/php.js.map +1 -1
  21. package/dist/ast-analysis/rules/python.d.ts.map +1 -1
  22. package/dist/ast-analysis/rules/python.js +5 -1
  23. package/dist/ast-analysis/rules/python.js.map +1 -1
  24. package/dist/ast-analysis/rules/ruby.d.ts.map +1 -1
  25. package/dist/ast-analysis/rules/ruby.js +4 -1
  26. package/dist/ast-analysis/rules/ruby.js.map +1 -1
  27. package/dist/ast-analysis/rules/rust.d.ts.map +1 -1
  28. package/dist/ast-analysis/rules/rust.js +5 -1
  29. package/dist/ast-analysis/rules/rust.js.map +1 -1
  30. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts +2 -1
  31. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts.map +1 -1
  32. package/dist/ast-analysis/visitors/ast-store-visitor.js +129 -37
  33. package/dist/ast-analysis/visitors/ast-store-visitor.js.map +1 -1
  34. package/dist/cli/commands/watch.d.ts.map +1 -1
  35. package/dist/cli/commands/watch.js +2 -0
  36. package/dist/cli/commands/watch.js.map +1 -1
  37. package/dist/cli.js +24 -1
  38. package/dist/cli.js.map +1 -1
  39. package/dist/domain/graph/builder/context.d.ts +2 -0
  40. package/dist/domain/graph/builder/context.d.ts.map +1 -1
  41. package/dist/domain/graph/builder/context.js.map +1 -1
  42. package/dist/domain/graph/builder/helpers.d.ts +13 -2
  43. package/dist/domain/graph/builder/helpers.d.ts.map +1 -1
  44. package/dist/domain/graph/builder/helpers.js +30 -4
  45. package/dist/domain/graph/builder/helpers.js.map +1 -1
  46. package/dist/domain/graph/builder/pipeline.d.ts.map +1 -1
  47. package/dist/domain/graph/builder/pipeline.js +141 -3
  48. package/dist/domain/graph/builder/pipeline.js.map +1 -1
  49. package/dist/domain/graph/builder/stages/collect-files.d.ts.map +1 -1
  50. package/dist/domain/graph/builder/stages/collect-files.js +58 -26
  51. package/dist/domain/graph/builder/stages/collect-files.js.map +1 -1
  52. package/dist/domain/graph/builder/stages/detect-changes.d.ts.map +1 -1
  53. package/dist/domain/graph/builder/stages/detect-changes.js +54 -45
  54. package/dist/domain/graph/builder/stages/detect-changes.js.map +1 -1
  55. package/dist/domain/graph/builder/stages/finalize.d.ts.map +1 -1
  56. package/dist/domain/graph/builder/stages/finalize.js +17 -0
  57. package/dist/domain/graph/builder/stages/finalize.js.map +1 -1
  58. package/dist/domain/graph/journal.d.ts +15 -0
  59. package/dist/domain/graph/journal.d.ts.map +1 -1
  60. package/dist/domain/graph/journal.js +283 -28
  61. package/dist/domain/graph/journal.js.map +1 -1
  62. package/dist/domain/graph/watcher.d.ts +17 -0
  63. package/dist/domain/graph/watcher.d.ts.map +1 -1
  64. package/dist/domain/graph/watcher.js +23 -7
  65. package/dist/domain/graph/watcher.js.map +1 -1
  66. package/dist/domain/parser.d.ts +53 -4
  67. package/dist/domain/parser.d.ts.map +1 -1
  68. package/dist/domain/parser.js +278 -80
  69. package/dist/domain/parser.js.map +1 -1
  70. package/dist/domain/search/generator.d.ts.map +1 -1
  71. package/dist/domain/search/generator.js +28 -2
  72. package/dist/domain/search/generator.js.map +1 -1
  73. package/dist/domain/search/models.js +1 -1
  74. package/dist/domain/wasm-worker-entry.d.ts +24 -0
  75. package/dist/domain/wasm-worker-entry.d.ts.map +1 -0
  76. package/dist/domain/wasm-worker-entry.js +644 -0
  77. package/dist/domain/wasm-worker-entry.js.map +1 -0
  78. package/dist/domain/wasm-worker-pool.d.ts +59 -0
  79. package/dist/domain/wasm-worker-pool.d.ts.map +1 -0
  80. package/dist/domain/wasm-worker-pool.js +312 -0
  81. package/dist/domain/wasm-worker-pool.js.map +1 -0
  82. package/dist/domain/wasm-worker-protocol.d.ts +65 -0
  83. package/dist/domain/wasm-worker-protocol.d.ts.map +1 -0
  84. package/dist/domain/wasm-worker-protocol.js +13 -0
  85. package/dist/domain/wasm-worker-protocol.js.map +1 -0
  86. package/dist/extractors/javascript.js +146 -2
  87. package/dist/extractors/javascript.js.map +1 -1
  88. package/dist/features/ast.d.ts.map +1 -1
  89. package/dist/features/ast.js +11 -9
  90. package/dist/features/ast.js.map +1 -1
  91. package/dist/features/boundaries.d.ts +2 -2
  92. package/dist/features/boundaries.d.ts.map +1 -1
  93. package/dist/features/boundaries.js +2 -31
  94. package/dist/features/boundaries.js.map +1 -1
  95. package/dist/features/snapshot.d.ts.map +1 -1
  96. package/dist/features/snapshot.js +99 -13
  97. package/dist/features/snapshot.js.map +1 -1
  98. package/dist/graph/algorithms/louvain.d.ts.map +1 -1
  99. package/dist/graph/algorithms/louvain.js +2 -4
  100. package/dist/graph/algorithms/louvain.js.map +1 -1
  101. package/dist/infrastructure/config.d.ts.map +1 -1
  102. package/dist/infrastructure/config.js +12 -2
  103. package/dist/infrastructure/config.js.map +1 -1
  104. package/dist/shared/globs.d.ts +40 -0
  105. package/dist/shared/globs.d.ts.map +1 -0
  106. package/dist/shared/globs.js +126 -0
  107. package/dist/shared/globs.js.map +1 -0
  108. package/dist/types.d.ts +26 -1
  109. package/dist/types.d.ts.map +1 -1
  110. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  111. package/grammars/tree-sitter-erlang.wasm +0 -0
  112. package/package.json +7 -7
  113. package/src/ast-analysis/engine.ts +11 -1
  114. package/src/ast-analysis/rules/csharp.ts +8 -1
  115. package/src/ast-analysis/rules/go.ts +4 -1
  116. package/src/ast-analysis/rules/index.ts +181 -4
  117. package/src/ast-analysis/rules/java.ts +5 -1
  118. package/src/ast-analysis/rules/php.ts +6 -1
  119. package/src/ast-analysis/rules/python.ts +5 -1
  120. package/src/ast-analysis/rules/ruby.ts +4 -1
  121. package/src/ast-analysis/rules/rust.ts +5 -1
  122. package/src/ast-analysis/visitors/ast-store-visitor.ts +129 -34
  123. package/src/cli/commands/watch.ts +2 -0
  124. package/src/cli.ts +31 -8
  125. package/src/domain/graph/builder/context.ts +2 -0
  126. package/src/domain/graph/builder/helpers.ts +53 -3
  127. package/src/domain/graph/builder/pipeline.ts +162 -3
  128. package/src/domain/graph/builder/stages/collect-files.ts +56 -26
  129. package/src/domain/graph/builder/stages/detect-changes.ts +57 -49
  130. package/src/domain/graph/builder/stages/finalize.ts +16 -0
  131. package/src/domain/graph/journal.ts +284 -27
  132. package/src/domain/graph/watcher.ts +29 -9
  133. package/src/domain/parser.ts +288 -73
  134. package/src/domain/search/generator.ts +34 -2
  135. package/src/domain/search/models.ts +1 -1
  136. package/src/domain/wasm-worker-entry.ts +798 -0
  137. package/src/domain/wasm-worker-pool.ts +330 -0
  138. package/src/domain/wasm-worker-protocol.ts +81 -0
  139. package/src/extractors/javascript.ts +149 -2
  140. package/src/features/ast.ts +22 -9
  141. package/src/features/boundaries.ts +2 -27
  142. package/src/features/snapshot.ts +93 -14
  143. package/src/graph/algorithms/louvain.ts +2 -4
  144. package/src/infrastructure/config.ts +12 -2
  145. package/src/shared/globs.ts +121 -0
  146. package/src/types.ts +26 -1
@@ -5,9 +5,42 @@ import type {
5
5
  Visitor,
6
6
  VisitorContext,
7
7
  } from '../../types.js';
8
+ import type { AstStringConfig } from '../rules/index.js';
8
9
 
9
10
  const TEXT_MAX = 200;
10
11
 
12
+ // ── Cross-language node-type constants (mirror Rust `helpers.rs`) ────────
13
+ const IDENT_TYPES = new Set<string>([
14
+ 'identifier',
15
+ 'type_identifier',
16
+ 'name',
17
+ 'qualified_name',
18
+ 'scoped_identifier',
19
+ 'qualified_identifier',
20
+ 'member_expression',
21
+ 'member_access_expression',
22
+ 'field_expression',
23
+ 'attribute',
24
+ 'scoped_type_identifier',
25
+ ]);
26
+
27
+ const CALL_TYPES = new Set<string>([
28
+ 'call_expression',
29
+ 'call',
30
+ 'invocation_expression',
31
+ 'method_invocation',
32
+ 'function_call_expression',
33
+ 'member_call_expression',
34
+ 'scoped_call_expression',
35
+ ]);
36
+
37
+ const DEFAULT_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"`', stringPrefixes: '' };
38
+
39
+ // Keyword tokens skipped when extracting the inner expression text of a
40
+ // throw/raise/await/new node. Module-level constant avoids reallocating on
41
+ // every call (can be hot in large files).
42
+ const CHILD_EXPR_SKIP_KEYWORDS = new Set<string>(['throw', 'raise', 'await', 'new']);
43
+
11
44
  interface AstStoreRow {
12
45
  file: string;
13
46
  line: number;
@@ -20,69 +53,122 @@ interface AstStoreRow {
20
53
 
21
54
  function truncate(s: string | null | undefined, max: number = TEXT_MAX): string | null {
22
55
  if (!s) return null;
23
- return s.length <= max ? s : `${s.slice(0, max - 1)}\u2026`;
56
+ return s.length <= max ? s : `${s.slice(0, max - 1)}…`;
57
+ }
58
+
59
+ function trimLeadingChars(s: string, chars: string): string {
60
+ if (!chars) return s;
61
+ let i = 0;
62
+ while (i < s.length && chars.includes(s[i]!)) i++;
63
+ return i === 0 ? s : s.slice(i);
24
64
  }
25
65
 
26
- function extractNewName(node: TreeSitterNode): string {
66
+ function trimTrailingChars(s: string, chars: string): string {
67
+ if (!chars) return s;
68
+ let i = s.length;
69
+ while (i > 0 && chars.includes(s[i - 1]!)) i--;
70
+ return i === s.length ? s : s.slice(0, i);
71
+ }
72
+
73
+ /** Extract constructor name from a `new_expression` / `object_creation_expression`. */
74
+ function extractConstructorName(node: TreeSitterNode): string {
75
+ for (const field of ['type', 'class', 'constructor']) {
76
+ const f = node.childForFieldName(field);
77
+ if (f?.text) return f.text;
78
+ }
27
79
  for (let i = 0; i < node.childCount; i++) {
28
80
  const child = node.child(i);
29
81
  if (!child) continue;
30
- if (child.type === 'identifier') return child.text;
31
- if (child.type === 'member_expression') return child.text;
82
+ if (IDENT_TYPES.has(child.type)) return child.text;
83
+ }
84
+ const raw = node.text || '';
85
+ const beforeParen = raw.split('(')[0] || raw;
86
+ return beforeParen.replace(/^new\s+/, '').trim() || '?';
87
+ }
88
+
89
+ /** Extract function name from a call node. */
90
+ function extractCallName(node: TreeSitterNode): string {
91
+ for (const field of ['function', 'method', 'name']) {
92
+ const f = node.childForFieldName(field);
93
+ if (f?.text) return f.text;
32
94
  }
33
- return node.text?.split('(')[0]?.replace('new ', '').trim() || '?';
95
+ const text = node.text || '';
96
+ return text.split('(')[0] || '?';
34
97
  }
35
98
 
36
- function extractExpressionText(node: TreeSitterNode): string | null {
99
+ /** Extract name from a throw/raise statement — matches native `extract_throw_target`. */
100
+ function extractThrowName(node: TreeSitterNode, newTypes: Set<string>): string {
37
101
  for (let i = 0; i < node.childCount; i++) {
38
102
  const child = node.child(i);
39
103
  if (!child) continue;
40
- if (child.type !== 'throw' && child.type !== 'await') {
41
- return truncate(child.text);
42
- }
104
+ const ck = child.type;
105
+ if (newTypes.has(ck)) return extractConstructorName(child);
106
+ if (CALL_TYPES.has(ck)) return extractCallName(child);
107
+ if (IDENT_TYPES.has(ck)) return child.text;
43
108
  }
44
- return truncate(node.text);
109
+ return truncate(node.text) ?? node.text ?? '';
45
110
  }
46
111
 
47
- /** Extract the name from a throw statement's child nodes. */
48
- function extractThrowName(node: TreeSitterNode): string | null {
112
+ /** Extract name from an await expression — matches native `extract_awaited_name`. */
113
+ function extractAwaitName(node: TreeSitterNode): string {
49
114
  for (let i = 0; i < node.childCount; i++) {
50
115
  const child = node.child(i);
51
116
  if (!child) continue;
52
- if (child.type === 'new_expression') return extractNewName(child);
53
- if (child.type === 'call_expression') {
54
- const fn = child.childForFieldName('function');
55
- return fn ? fn.text : child.text?.split('(')[0] || '?';
56
- }
57
- if (child.type === 'identifier') return child.text;
117
+ const ck = child.type;
118
+ if (CALL_TYPES.has(ck)) return extractCallName(child);
119
+ if (IDENT_TYPES.has(ck)) return child.text;
58
120
  }
59
- return truncate(node.text);
121
+ return truncate(node.text) ?? node.text ?? '';
60
122
  }
61
123
 
62
- /** Extract the name from an await expression's child nodes. */
63
- function extractAwaitName(node: TreeSitterNode): string | null {
124
+ /** Extract text of the expression inside a throw/await, skipping the keyword. */
125
+ function extractChildExpressionText(node: TreeSitterNode): string | null {
64
126
  for (let i = 0; i < node.childCount; i++) {
65
127
  const child = node.child(i);
66
128
  if (!child) continue;
67
- if (child.type === 'call_expression') {
68
- const fn = child.childForFieldName('function');
69
- return fn ? fn.text : child.text?.split('(')[0] || '?';
70
- }
71
- if (child.type === 'identifier' || child.type === 'member_expression') {
72
- return child.text;
73
- }
129
+ if (!CHILD_EXPR_SKIP_KEYWORDS.has(child.type)) return truncate(child.text);
74
130
  }
75
131
  return truncate(node.text);
76
132
  }
77
133
 
134
+ /**
135
+ * Extract string content from a string-literal node, mirroring the native
136
+ * engine's `build_string_node` (`helpers.rs`). Returns `null` when the
137
+ * content is shorter than 2 Unicode code points.
138
+ */
139
+ function extractStringContent(node: TreeSitterNode, cfg: AstStringConfig): string | null {
140
+ const raw = node.text ?? '';
141
+ const isRawString = node.type.includes('raw_string');
142
+
143
+ let s = raw;
144
+ s = trimLeadingChars(s, '@');
145
+ s = trimLeadingChars(s, cfg.stringPrefixes);
146
+ if (isRawString) s = trimLeadingChars(s, 'r#');
147
+ s = trimLeadingChars(s, cfg.quoteChars);
148
+ if (isRawString) s = trimTrailingChars(s, '#');
149
+ s = trimTrailingChars(s, cfg.quoteChars);
150
+
151
+ // Count code points, not UTF-16 code units — matches Rust `chars().count()`.
152
+ const codePointCount = [...s].length;
153
+ if (codePointCount < 2) return null;
154
+ return s;
155
+ }
156
+
78
157
  export function createAstStoreVisitor(
79
158
  astTypeMap: Record<string, string>,
80
159
  defs: Definition[],
81
160
  relPath: string,
82
161
  nodeIdMap: Map<string, number>,
162
+ stringConfig: AstStringConfig = DEFAULT_STRING_CONFIG,
163
+ stopRecurseKinds: ReadonlySet<string> = new Set(),
83
164
  ): Visitor {
84
165
  const rows: AstStoreRow[] = [];
85
166
  const matched = new Set<number>();
167
+ const newTypes = new Set<string>(
168
+ Object.entries(astTypeMap)
169
+ .filter(([, kind]) => kind === 'new')
170
+ .map(([type]) => type),
171
+ );
86
172
 
87
173
  function findParentDef(line: number): Definition | null {
88
174
  let best: Definition | null = null;
@@ -106,12 +192,15 @@ export function createAstStoreVisitor(
106
192
  type KindHandler = (node: TreeSitterNode) => NameTextResult;
107
193
 
108
194
  const kindHandlers: Record<string, KindHandler> = {
109
- new: (node) => ({ name: extractNewName(node), text: truncate(node.text) }),
110
- throw: (node) => ({ name: extractThrowName(node), text: extractExpressionText(node) }),
111
- await: (node) => ({ name: extractAwaitName(node), text: extractExpressionText(node) }),
195
+ new: (node) => ({ name: extractConstructorName(node), text: truncate(node.text) }),
196
+ throw: (node) => ({
197
+ name: extractThrowName(node, newTypes),
198
+ text: extractChildExpressionText(node),
199
+ }),
200
+ await: (node) => ({ name: extractAwaitName(node), text: extractChildExpressionText(node) }),
112
201
  string: (node) => {
113
- const content = node.text?.replace(/^['"`]|['"`]$/g, '') || '';
114
- if (content.length < 2) return { name: null, text: null, skip: true };
202
+ const content = extractStringContent(node, stringConfig);
203
+ if (content == null) return { name: null, text: null, skip: true };
115
204
  return { name: truncate(content, 100), text: truncate(node.text) };
116
205
  },
117
206
  regex: (node) => ({ name: node.text || '?', text: truncate(node.text) }),
@@ -156,7 +245,13 @@ export function createAstStoreVisitor(
156
245
 
157
246
  collectNode(node, kind);
158
247
 
159
- if (kind !== 'string' && kind !== 'regex') {
248
+ // Mirror the native walker's recursion policy. In JS/TS, the native
249
+ // javascript.rs walker returns after collecting `new` or `throw` to
250
+ // avoid double-counting the wrapped expression (e.g. `throw new
251
+ // Error('x')` emits one `throw` row, not throw+new+string). Other
252
+ // languages go through helpers.rs::walk_ast_nodes_with_config_depth
253
+ // which always recurses — so `stopRecurseKinds` is empty for them.
254
+ if (stopRecurseKinds.has(kind)) {
160
255
  return { skipChildren: true };
161
256
  }
162
257
  },
@@ -6,6 +6,7 @@ export const command: CommandDefinition = {
6
6
  name: 'watch [dir]',
7
7
  description: 'Watch project for file changes and incrementally update the graph',
8
8
  options: [
9
+ ['-d, --db <path>', 'Path to graph.db'],
9
10
  ['--poll', 'Use stat-based polling (default on Windows to avoid ReFS/Dev Drive crashes)'],
10
11
  ['--native', 'Force native OS file watchers instead of polling'],
11
12
  ['--poll-interval <ms>', 'Polling interval in milliseconds (default: 2000)'],
@@ -22,6 +23,7 @@ export const command: CommandDefinition = {
22
23
  engine,
23
24
  poll,
24
25
  pollInterval: opts.pollInterval ? Number(opts.pollInterval) : undefined,
26
+ dbPath: opts.db ? path.resolve(opts.db) : undefined,
25
27
  });
26
28
  },
27
29
  };
package/src/cli.ts CHANGED
@@ -1,14 +1,37 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  import { run } from './cli/index.js';
4
+ import { disposeParsers } from './domain/parser.js';
4
5
  import { CodegraphError, toErrorMessage } from './shared/errors.js';
5
6
 
6
- run().catch((err: unknown) => {
7
- if (err instanceof CodegraphError) {
8
- console.error(`codegraph [${err.code}]: ${err.message}`);
9
- if (err.file) console.error(` file: ${err.file}`);
10
- } else {
11
- console.error(`codegraph: fatal error${toErrorMessage(err)}`);
7
+ /**
8
+ * After the CLI command finishes, tear down any cached WASM parsers and the
9
+ * worker thread pool. The WASM parse worker (see `domain/wasm-worker-pool.ts`)
10
+ * keeps the event loop alive until `worker.terminate()` is called, so without
11
+ * this teardown short-lived commands like `codegraph build` would hang for
12
+ * minutes before Node gives up — surfacing in CI as `spawnSync ETIMEDOUT`
13
+ * even though the command's work is already complete.
14
+ *
15
+ * `disposeParsers` is safe to call when the pool was never instantiated
16
+ * (e.g. native engine, or commands that never parse): it no-ops cleanly.
17
+ */
18
+ async function shutdown(): Promise<void> {
19
+ try {
20
+ await disposeParsers();
21
+ } catch {
22
+ /* don't mask the real exit status over a teardown failure */
12
23
  }
13
- process.exit(1);
14
- });
24
+ }
25
+
26
+ run()
27
+ .then(shutdown)
28
+ .catch(async (err: unknown) => {
29
+ if (err instanceof CodegraphError) {
30
+ console.error(`codegraph [${err.code}]: ${err.message}`);
31
+ if (err.file) console.error(` file: ${err.file}`);
32
+ } else {
33
+ console.error(`codegraph: fatal error — ${toErrorMessage(err)}`);
34
+ }
35
+ await shutdown();
36
+ process.exit(1);
37
+ });
@@ -87,6 +87,8 @@ export class PipelineContext {
87
87
  // ── Phase timing ───────────────────────────────────────────────────
88
88
  timing: {
89
89
  setupMs?: number;
90
+ collectMs?: number;
91
+ detectMs?: number;
90
92
  parseMs?: number;
91
93
  insertMs?: number;
92
94
  resolveMs?: number;
@@ -8,7 +8,8 @@ import fs from 'node:fs';
8
8
  import path from 'node:path';
9
9
  import { purgeFilesData } from '../../../db/index.js';
10
10
  import { warn } from '../../../infrastructure/logger.js';
11
- import { EXTENSIONS, IGNORE_DIRS } from '../../../shared/constants.js';
11
+ import { EXTENSIONS, IGNORE_DIRS, normalizePath } from '../../../shared/constants.js';
12
+ import { compileGlobs, matchesAny } from '../../../shared/globs.js';
12
13
  import type {
13
14
  BetterSqlite3Database,
14
15
  CodegraphConfig,
@@ -58,9 +59,29 @@ function shouldSkipEntry(entry: fs.Dirent, extraIgnore: Set<string> | null): boo
58
59
  return false;
59
60
  }
60
61
 
62
+ /**
63
+ * Check whether a source file passes the configured include/exclude globs.
64
+ *
65
+ * Patterns are matched against the path relative to the project root,
66
+ * normalized to forward slashes (e.g. `src/foo/bar.ts`). When both lists
67
+ * are set, a file must match at least one include and no exclude.
68
+ */
69
+ export function passesIncludeExclude(
70
+ relPath: string,
71
+ includeRegexes: readonly RegExp[],
72
+ excludeRegexes: readonly RegExp[],
73
+ ): boolean {
74
+ if (includeRegexes.length > 0 && !matchesAny(includeRegexes, relPath)) return false;
75
+ if (excludeRegexes.length > 0 && matchesAny(excludeRegexes, relPath)) return false;
76
+ return true;
77
+ }
78
+
61
79
  /**
62
80
  * Recursively collect all source files under `dir`.
63
81
  * When `directories` is a Set, also tracks which directories contain files.
82
+ *
83
+ * The first invocation establishes `dir` as the project root against which
84
+ * `config.include` / `config.exclude` globs are matched.
64
85
  */
65
86
  export function collectFiles(
66
87
  dir: string,
@@ -68,6 +89,9 @@ export function collectFiles(
68
89
  config: Partial<CodegraphConfig>,
69
90
  directories: Set<string>,
70
91
  _visited?: Set<string>,
92
+ _rootDir?: string,
93
+ _includeRegexes?: readonly RegExp[],
94
+ _excludeRegexes?: readonly RegExp[],
71
95
  ): { files: string[]; directories: Set<string> };
72
96
  export function collectFiles(
73
97
  dir: string,
@@ -75,6 +99,9 @@ export function collectFiles(
75
99
  config?: Partial<CodegraphConfig>,
76
100
  directories?: null,
77
101
  _visited?: Set<string>,
102
+ _rootDir?: string,
103
+ _includeRegexes?: readonly RegExp[],
104
+ _excludeRegexes?: readonly RegExp[],
78
105
  ): string[];
79
106
  export function collectFiles(
80
107
  dir: string,
@@ -82,10 +109,20 @@ export function collectFiles(
82
109
  config: Partial<CodegraphConfig> = {},
83
110
  directories: Set<string> | null = null,
84
111
  _visited: Set<string> = new Set(),
112
+ _rootDir?: string,
113
+ _includeRegexes?: readonly RegExp[],
114
+ _excludeRegexes?: readonly RegExp[],
85
115
  ): string[] | { files: string[]; directories: Set<string> } {
86
116
  const trackDirs = directories instanceof Set;
87
117
  let hasFiles = false;
88
118
 
119
+ // First call: compute root and compile include/exclude patterns once,
120
+ // then pass them down recursive calls so we don't recompile per directory.
121
+ const rootDir = _rootDir ?? dir;
122
+ const includeRegexes = _includeRegexes ?? compileGlobs(config.include);
123
+ const excludeRegexes = _excludeRegexes ?? compileGlobs(config.exclude);
124
+ const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0;
125
+
89
126
  // Merge config ignoreDirs with defaults
90
127
  const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null;
91
128
 
@@ -116,11 +153,24 @@ export function collectFiles(
116
153
  const full = path.join(dir, entry.name);
117
154
  if (entry.isDirectory()) {
118
155
  if (trackDirs) {
119
- collectFiles(full, files, config, directories as Set<string>, _visited);
156
+ collectFiles(
157
+ full,
158
+ files,
159
+ config,
160
+ directories as Set<string>,
161
+ _visited,
162
+ rootDir,
163
+ includeRegexes,
164
+ excludeRegexes,
165
+ );
120
166
  } else {
121
- collectFiles(full, files, config, null, _visited);
167
+ collectFiles(full, files, config, null, _visited, rootDir, includeRegexes, excludeRegexes);
122
168
  }
123
169
  } else if (EXTENSIONS.has(path.extname(entry.name))) {
170
+ if (hasGlobFilters) {
171
+ const rel = normalizePath(path.relative(rootDir, full));
172
+ if (!passesIncludeExclude(rel, includeRegexes, excludeRegexes)) continue;
173
+ }
124
174
  files.push(full);
125
175
  hasFiles = true;
126
176
  }
@@ -21,6 +21,7 @@ import { detectWorkspaces, loadConfig } from '../../../infrastructure/config.js'
21
21
  import { debug, info, warn } from '../../../infrastructure/logger.js';
22
22
  import { loadNative } from '../../../infrastructure/native.js';
23
23
  import { semverCompare } from '../../../infrastructure/update-check.js';
24
+ import { normalizePath } from '../../../shared/constants.js';
24
25
  import { toErrorMessage } from '../../../shared/errors.js';
25
26
  import { CODEGRAPH_VERSION } from '../../../shared/version.js';
26
27
  import type {
@@ -29,11 +30,18 @@ import type {
29
30
  BuildResult,
30
31
  Definition,
31
32
  ExtractorOutput,
33
+ SqliteStatement,
32
34
  } from '../../../types.js';
33
- import { getActiveEngine } from '../../parser.js';
35
+ import {
36
+ classifyNativeDrops,
37
+ formatDropExtensionSummary,
38
+ getActiveEngine,
39
+ getInstalledWasmExtensions,
40
+ parseFilesAuto,
41
+ } from '../../parser.js';
34
42
  import { setWorkspaces } from '../resolve.js';
35
43
  import { PipelineContext } from './context.js';
36
- import { loadPathAliases } from './helpers.js';
44
+ import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
37
45
  import { NativeDbProxy } from './native-db-proxy.js';
38
46
  import { buildEdges } from './stages/build-edges.js';
39
47
  import { buildStructure } from './stages/build-structure.js';
@@ -104,6 +112,21 @@ function checkEngineSchemaMismatch(ctx: PipelineContext): void {
104
112
  }
105
113
  }
106
114
 
115
+ function warnOnEmbeddingsWipe(ctx: PipelineContext): void {
116
+ const willBeFullBuild = !ctx.incremental || ctx.forceFullRebuild;
117
+ if (!willBeFullBuild) return;
118
+ let count = 0;
119
+ try {
120
+ count = (ctx.db.prepare('SELECT COUNT(*) AS c FROM embeddings').get() as { c: number }).c;
121
+ } catch {
122
+ return; // embeddings table missing — nothing to warn about
123
+ }
124
+ if (count === 0) return;
125
+ warn(
126
+ `Full rebuild will discard ${count} embedding${count === 1 ? '' : 's'}; re-run \`codegraph embed\` after the build.`,
127
+ );
128
+ }
129
+
107
130
  function loadAliases(ctx: PipelineContext): void {
108
131
  ctx.aliases = loadPathAliases(ctx.rootDir);
109
132
  if (ctx.config.aliases) {
@@ -149,6 +172,7 @@ function setupPipeline(ctx: PipelineContext): void {
149
172
 
150
173
  initializeEngine(ctx);
151
174
  checkEngineSchemaMismatch(ctx);
175
+ warnOnEmbeddingsWipe(ctx);
152
176
  loadAliases(ctx);
153
177
 
154
178
  // Workspace packages (monorepo)
@@ -166,6 +190,8 @@ function formatTimingResult(ctx: PipelineContext): BuildResult {
166
190
  return {
167
191
  phases: {
168
192
  setupMs: +(t.setupMs ?? 0).toFixed(1),
193
+ collectMs: +(t.collectMs ?? 0).toFixed(1),
194
+ detectMs: +(t.detectMs ?? 0).toFixed(1),
169
195
  parseMs: +(t.parseMs ?? 0).toFixed(1),
170
196
  insertMs: +(t.insertMs ?? 0).toFixed(1),
171
197
  resolveMs: +(t.resolveMs ?? 0).toFixed(1),
@@ -540,7 +566,9 @@ function formatNativeTimingResult(
540
566
  ): BuildResult {
541
567
  return {
542
568
  phases: {
543
- setupMs: +((p.setupMs ?? 0) + (p.collectMs ?? 0) + (p.detectMs ?? 0)).toFixed(1),
569
+ setupMs: +(p.setupMs ?? 0).toFixed(1),
570
+ collectMs: +(p.collectMs ?? 0).toFixed(1),
571
+ detectMs: +(p.detectMs ?? 0).toFixed(1),
544
572
  parseMs: +(p.parseMs ?? 0).toFixed(1),
545
573
  insertMs: +(p.insertMs ?? 0).toFixed(1),
546
574
  resolveMs: +(p.resolveMs ?? 0).toFixed(1),
@@ -696,10 +724,137 @@ async function tryNativeOrchestrator(
696
724
  }
697
725
  }
698
726
 
727
+ // Engine parity: the native orchestrator silently drops files whose
728
+ // Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on
729
+ // stale native binaries). WASM handles those — backfill via WASM so both
730
+ // engines process the same file set (#967).
731
+ //
732
+ // Only runs on full builds: incremental builds only touch changed files,
733
+ // which are parsed through parseFilesAuto (which has its own per-file
734
+ // backfill), so a full filesystem scan here would be wasted work.
735
+ if (result.isFullBuild) {
736
+ await backfillNativeDroppedFiles(ctx);
737
+ }
738
+
699
739
  closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
700
740
  return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
701
741
  }
702
742
 
743
+ /**
744
+ * Backfill files that the native orchestrator silently dropped during parse.
745
+ * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
746
+ */
747
+ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
748
+ // Needs a real better-sqlite3 connection for INSERT.
749
+ if (ctx.nativeFirstProxy) {
750
+ closeNativeDb(ctx, 'pre-parity-backfill');
751
+ ctx.db = openDb(ctx.dbPath);
752
+ ctx.nativeFirstProxy = false;
753
+ }
754
+
755
+ const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
756
+ const expected = new Set(
757
+ collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
758
+ );
759
+
760
+ const existingRows = ctx.db
761
+ .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
762
+ .all() as Array<{ file: string }>;
763
+ const existing = new Set(existingRows.map((r) => r.file));
764
+
765
+ // Restrict backfill to files with an installed WASM grammar. Extensions in
766
+ // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
767
+ // minimal installs) can't be parsed by either engine, so they're not a
768
+ // native regression — excluding them keeps the warn count meaningful.
769
+ const installedExts = getInstalledWasmExtensions();
770
+ const missingRel: string[] = [];
771
+ const missingAbs: string[] = [];
772
+ for (const rel of expected) {
773
+ if (existing.has(rel)) continue;
774
+ const ext = path.extname(rel).toLowerCase();
775
+ if (!installedExts.has(ext)) continue;
776
+ missingRel.push(rel);
777
+ missingAbs.push(path.join(ctx.rootDir, rel));
778
+ }
779
+ if (missingAbs.length === 0) return;
780
+
781
+ // Classify drops so users see per-extension reasons instead of just a count
782
+ // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
783
+ // extractor); `native-extractor-failure` indicates a real native bug since
784
+ // the language IS supported by the addon yet the file was dropped anyway.
785
+ const { byReason, totals } = classifyNativeDrops(missingRel);
786
+ if (totals['unsupported-by-native'] > 0) {
787
+ info(
788
+ `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`,
789
+ );
790
+ }
791
+ if (totals['native-extractor-failure'] > 0) {
792
+ warn(
793
+ `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`,
794
+ );
795
+ }
796
+ const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
797
+
798
+ const rows: unknown[][] = [];
799
+ const exportKeys: unknown[][] = [];
800
+ for (const [relPath, symbols] of wasmResults) {
801
+ // File row — mirrors insertDefinitionsAndExports: qualified_name is null.
802
+ rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]);
803
+ for (const def of symbols.definitions ?? []) {
804
+ // Populate qualified_name/scope the same way the JS fallback does so
805
+ // downstream queries (cross-file references, "go to definition") find
806
+ // these symbols.
807
+ const dotIdx = def.name.lastIndexOf('.');
808
+ const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null;
809
+ rows.push([
810
+ def.name,
811
+ def.kind,
812
+ relPath,
813
+ def.line,
814
+ def.endLine ?? null,
815
+ null,
816
+ def.name,
817
+ scope,
818
+ def.visibility ?? null,
819
+ ]);
820
+ }
821
+ // Exports: insert the row (INSERT OR IGNORE — a matching definition row
822
+ // is a no-op) and queue a key for the second-pass exported=1 update, so
823
+ // queries filtering on exported=1 find backfilled symbols (#970).
824
+ for (const exp of symbols.exports ?? []) {
825
+ rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]);
826
+ exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
827
+ }
828
+ }
829
+ const db = ctx.db as unknown as BetterSqlite3Database;
830
+ batchInsertNodes(db, rows);
831
+
832
+ // Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
833
+ if (exportKeys.length > 0) {
834
+ const EXPORT_CHUNK = 500;
835
+ const exportStmtCache = new Map<number, SqliteStatement>();
836
+ for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) {
837
+ const end = Math.min(i + EXPORT_CHUNK, exportKeys.length);
838
+ const chunkSize = end - i;
839
+ let updateStmt = exportStmtCache.get(chunkSize);
840
+ if (!updateStmt) {
841
+ const conditions = Array.from(
842
+ { length: chunkSize },
843
+ () => '(name = ? AND kind = ? AND file = ? AND line = ?)',
844
+ ).join(' OR ');
845
+ updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`);
846
+ exportStmtCache.set(chunkSize, updateStmt);
847
+ }
848
+ const vals: unknown[] = [];
849
+ for (let j = i; j < end; j++) {
850
+ const k = exportKeys[j] as unknown[];
851
+ vals.push(k[0], k[1], k[2], k[3]);
852
+ }
853
+ updateStmt.run(...vals);
854
+ }
855
+ }
856
+ }
857
+
703
858
  // ── Pipeline stages execution ───────────────────────────────────────────
704
859
 
705
860
  async function runPipelineStages(ctx: PipelineContext): Promise<void> {
@@ -845,6 +1000,10 @@ export async function buildGraph(
845
1000
  `Codegraph version changed (${prevVersion} → ${CODEGRAPH_VERSION}), promoting to full rebuild.`,
846
1001
  );
847
1002
  ctx.forceFullRebuild = true;
1003
+ // Re-check embeddings: the initial warnOnEmbeddingsWipe ran before
1004
+ // forceFullRebuild was set here, so the silent-data-loss guard
1005
+ // would otherwise miss this late-promotion path (#986 follow-up).
1006
+ warnOnEmbeddingsWipe(ctx);
848
1007
  }
849
1008
  }
850
1009
  }