@optave/codegraph 3.9.6 → 3.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. package/README.md +26 -12
  2. package/dist/ast-analysis/engine.d.ts.map +1 -1
  3. package/dist/ast-analysis/engine.js +1 -1
  4. package/dist/ast-analysis/engine.js.map +1 -1
  5. package/dist/ast-analysis/rules/index.d.ts.map +1 -1
  6. package/dist/ast-analysis/rules/index.js +77 -0
  7. package/dist/ast-analysis/rules/index.js.map +1 -1
  8. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts.map +1 -1
  9. package/dist/ast-analysis/visitors/ast-store-visitor.js +50 -8
  10. package/dist/ast-analysis/visitors/ast-store-visitor.js.map +1 -1
  11. package/dist/cli/commands/audit.js +1 -1
  12. package/dist/cli/commands/audit.js.map +1 -1
  13. package/dist/cli/commands/build.d.ts.map +1 -1
  14. package/dist/cli/commands/build.js +2 -0
  15. package/dist/cli/commands/build.js.map +1 -1
  16. package/dist/cli/commands/check.js +1 -1
  17. package/dist/cli/commands/check.js.map +1 -1
  18. package/dist/cli/commands/children.js +1 -1
  19. package/dist/cli/commands/children.js.map +1 -1
  20. package/dist/cli/commands/diff-impact.js +1 -1
  21. package/dist/cli/commands/diff-impact.js.map +1 -1
  22. package/dist/cli/commands/roles.js +1 -1
  23. package/dist/cli/commands/roles.js.map +1 -1
  24. package/dist/cli/commands/structure.js +1 -1
  25. package/dist/cli/commands/structure.js.map +1 -1
  26. package/dist/cli/shared/options.js +1 -1
  27. package/dist/cli/shared/options.js.map +1 -1
  28. package/dist/db/connection.d.ts.map +1 -1
  29. package/dist/db/connection.js +8 -0
  30. package/dist/db/connection.js.map +1 -1
  31. package/dist/domain/graph/builder/context.d.ts +10 -0
  32. package/dist/domain/graph/builder/context.d.ts.map +1 -1
  33. package/dist/domain/graph/builder/context.js +10 -0
  34. package/dist/domain/graph/builder/context.js.map +1 -1
  35. package/dist/domain/graph/builder/helpers.d.ts +7 -2
  36. package/dist/domain/graph/builder/helpers.d.ts.map +1 -1
  37. package/dist/domain/graph/builder/helpers.js +7 -2
  38. package/dist/domain/graph/builder/helpers.js.map +1 -1
  39. package/dist/domain/graph/builder/incremental.d.ts +0 -6
  40. package/dist/domain/graph/builder/incremental.d.ts.map +1 -1
  41. package/dist/domain/graph/builder/incremental.js +6 -23
  42. package/dist/domain/graph/builder/incremental.js.map +1 -1
  43. package/dist/domain/graph/builder/pipeline.d.ts +44 -0
  44. package/dist/domain/graph/builder/pipeline.d.ts.map +1 -1
  45. package/dist/domain/graph/builder/pipeline.js +348 -42
  46. package/dist/domain/graph/builder/pipeline.js.map +1 -1
  47. package/dist/domain/graph/builder/stages/build-edges.d.ts.map +1 -1
  48. package/dist/domain/graph/builder/stages/build-edges.js +8 -2
  49. package/dist/domain/graph/builder/stages/build-edges.js.map +1 -1
  50. package/dist/domain/graph/builder/stages/collect-files.d.ts.map +1 -1
  51. package/dist/domain/graph/builder/stages/collect-files.js +8 -0
  52. package/dist/domain/graph/builder/stages/collect-files.js.map +1 -1
  53. package/dist/domain/graph/builder/stages/detect-changes.d.ts +24 -0
  54. package/dist/domain/graph/builder/stages/detect-changes.d.ts.map +1 -1
  55. package/dist/domain/graph/builder/stages/detect-changes.js +117 -3
  56. package/dist/domain/graph/builder/stages/detect-changes.js.map +1 -1
  57. package/dist/domain/graph/builder/stages/finalize.d.ts.map +1 -1
  58. package/dist/domain/graph/builder/stages/finalize.js +9 -6
  59. package/dist/domain/graph/builder/stages/finalize.js.map +1 -1
  60. package/dist/domain/graph/builder/stages/insert-nodes.d.ts +30 -0
  61. package/dist/domain/graph/builder/stages/insert-nodes.d.ts.map +1 -1
  62. package/dist/domain/graph/builder/stages/insert-nodes.js +36 -13
  63. package/dist/domain/graph/builder/stages/insert-nodes.js.map +1 -1
  64. package/dist/domain/graph/builder/stages/resolve-imports.d.ts.map +1 -1
  65. package/dist/domain/graph/builder/stages/resolve-imports.js +73 -22
  66. package/dist/domain/graph/builder/stages/resolve-imports.js.map +1 -1
  67. package/dist/domain/graph/watcher.d.ts.map +1 -1
  68. package/dist/domain/graph/watcher.js +23 -18
  69. package/dist/domain/graph/watcher.js.map +1 -1
  70. package/dist/domain/parser.d.ts +14 -1
  71. package/dist/domain/parser.d.ts.map +1 -1
  72. package/dist/domain/parser.js +104 -11
  73. package/dist/domain/parser.js.map +1 -1
  74. package/dist/domain/search/models.d.ts +16 -0
  75. package/dist/domain/search/models.d.ts.map +1 -1
  76. package/dist/domain/search/models.js +36 -2
  77. package/dist/domain/search/models.js.map +1 -1
  78. package/dist/domain/wasm-worker-entry.js +20 -13
  79. package/dist/domain/wasm-worker-entry.js.map +1 -1
  80. package/dist/extractors/c.js +25 -6
  81. package/dist/extractors/c.js.map +1 -1
  82. package/dist/extractors/cpp.js +47 -6
  83. package/dist/extractors/cpp.js.map +1 -1
  84. package/dist/extractors/cuda.js +90 -14
  85. package/dist/extractors/cuda.js.map +1 -1
  86. package/dist/extractors/elixir.js +83 -3
  87. package/dist/extractors/elixir.js.map +1 -1
  88. package/dist/extractors/erlang.js +56 -20
  89. package/dist/extractors/erlang.js.map +1 -1
  90. package/dist/extractors/fsharp.d.ts +7 -0
  91. package/dist/extractors/fsharp.d.ts.map +1 -1
  92. package/dist/extractors/fsharp.js +94 -0
  93. package/dist/extractors/fsharp.js.map +1 -1
  94. package/dist/extractors/gleam.js +6 -2
  95. package/dist/extractors/gleam.js.map +1 -1
  96. package/dist/extractors/groovy.js +41 -1
  97. package/dist/extractors/groovy.js.map +1 -1
  98. package/dist/extractors/haskell.js +48 -4
  99. package/dist/extractors/haskell.js.map +1 -1
  100. package/dist/extractors/julia.js +172 -41
  101. package/dist/extractors/julia.js.map +1 -1
  102. package/dist/extractors/kotlin.js +4 -0
  103. package/dist/extractors/kotlin.js.map +1 -1
  104. package/dist/extractors/objc.js +184 -47
  105. package/dist/extractors/objc.js.map +1 -1
  106. package/dist/extractors/python.js +7 -4
  107. package/dist/extractors/python.js.map +1 -1
  108. package/dist/extractors/r.js +93 -52
  109. package/dist/extractors/r.js.map +1 -1
  110. package/dist/extractors/scala.d.ts.map +1 -1
  111. package/dist/extractors/scala.js +18 -32
  112. package/dist/extractors/scala.js.map +1 -1
  113. package/dist/extractors/solidity.js +18 -9
  114. package/dist/extractors/solidity.js.map +1 -1
  115. package/dist/extractors/verilog.js +80 -15
  116. package/dist/extractors/verilog.js.map +1 -1
  117. package/dist/infrastructure/config.d.ts +1 -0
  118. package/dist/infrastructure/config.d.ts.map +1 -1
  119. package/dist/infrastructure/config.js +1 -0
  120. package/dist/infrastructure/config.js.map +1 -1
  121. package/dist/mcp/server.d.ts.map +1 -1
  122. package/dist/mcp/server.js +14 -8
  123. package/dist/mcp/server.js.map +1 -1
  124. package/dist/mcp/tool-registry.d.ts +1 -1
  125. package/dist/mcp/tool-registry.d.ts.map +1 -1
  126. package/dist/mcp/tool-registry.js +23 -5
  127. package/dist/mcp/tool-registry.js.map +1 -1
  128. package/dist/mcp/tools/semantic-search.d.ts +1 -0
  129. package/dist/mcp/tools/semantic-search.d.ts.map +1 -1
  130. package/dist/mcp/tools/semantic-search.js +1 -0
  131. package/dist/mcp/tools/semantic-search.js.map +1 -1
  132. package/dist/types.d.ts +16 -1
  133. package/dist/types.d.ts.map +1 -1
  134. package/grammars/tree-sitter-erlang.wasm +0 -0
  135. package/grammars/tree-sitter-fsharp.wasm +0 -0
  136. package/grammars/tree-sitter-fsharp_signature.wasm +0 -0
  137. package/grammars/tree-sitter-gleam.wasm +0 -0
  138. package/package.json +11 -10
  139. package/src/ast-analysis/engine.ts +3 -1
  140. package/src/ast-analysis/rules/index.ts +87 -0
  141. package/src/ast-analysis/visitors/ast-store-visitor.ts +45 -9
  142. package/src/cli/commands/audit.ts +1 -1
  143. package/src/cli/commands/build.ts +2 -0
  144. package/src/cli/commands/check.ts +1 -1
  145. package/src/cli/commands/children.ts +1 -1
  146. package/src/cli/commands/diff-impact.ts +1 -1
  147. package/src/cli/commands/roles.ts +1 -1
  148. package/src/cli/commands/structure.ts +1 -1
  149. package/src/cli/shared/options.ts +1 -1
  150. package/src/db/connection.ts +8 -0
  151. package/src/domain/graph/builder/context.ts +10 -0
  152. package/src/domain/graph/builder/helpers.ts +8 -3
  153. package/src/domain/graph/builder/incremental.ts +6 -41
  154. package/src/domain/graph/builder/pipeline.ts +404 -41
  155. package/src/domain/graph/builder/stages/build-edges.ts +9 -2
  156. package/src/domain/graph/builder/stages/collect-files.ts +9 -0
  157. package/src/domain/graph/builder/stages/detect-changes.ts +130 -4
  158. package/src/domain/graph/builder/stages/finalize.ts +9 -6
  159. package/src/domain/graph/builder/stages/insert-nodes.ts +38 -14
  160. package/src/domain/graph/builder/stages/resolve-imports.ts +79 -25
  161. package/src/domain/graph/watcher.ts +21 -23
  162. package/src/domain/parser.ts +110 -10
  163. package/src/domain/search/models.ts +37 -2
  164. package/src/domain/wasm-worker-entry.ts +20 -13
  165. package/src/extractors/c.ts +27 -8
  166. package/src/extractors/cpp.ts +50 -8
  167. package/src/extractors/cuda.ts +90 -16
  168. package/src/extractors/elixir.ts +75 -3
  169. package/src/extractors/erlang.ts +63 -20
  170. package/src/extractors/fsharp.ts +104 -0
  171. package/src/extractors/gleam.ts +7 -2
  172. package/src/extractors/groovy.ts +45 -1
  173. package/src/extractors/haskell.ts +45 -4
  174. package/src/extractors/julia.ts +164 -43
  175. package/src/extractors/kotlin.ts +4 -0
  176. package/src/extractors/objc.ts +171 -47
  177. package/src/extractors/python.ts +5 -3
  178. package/src/extractors/r.ts +88 -48
  179. package/src/extractors/scala.ts +24 -36
  180. package/src/extractors/solidity.ts +17 -8
  181. package/src/extractors/verilog.ts +83 -15
  182. package/src/infrastructure/config.ts +1 -0
  183. package/src/mcp/server.ts +16 -9
  184. package/src/mcp/tool-registry.ts +28 -5
  185. package/src/mcp/tools/semantic-search.ts +2 -0
  186. package/src/types.ts +16 -0
@@ -7,7 +7,7 @@
7
7
  import fs from 'node:fs';
8
8
  import path from 'node:path';
9
9
  import { performance } from 'node:perf_hooks';
10
- import { acquireAdvisoryLock, closeDbPair, getBuildMeta, initSchema, MIGRATIONS, openDb, releaseAdvisoryLock, setBuildMeta, } from '../../../db/index.js';
10
+ import { acquireAdvisoryLock, closeDb, closeDbPair, getBuildMeta, initSchema, MIGRATIONS, openDb, purgeFilesData, releaseAdvisoryLock, setBuildMeta, } from '../../../db/index.js';
11
11
  import { detectWorkspaces, loadConfig } from '../../../infrastructure/config.js';
12
12
  import { debug, info, warn } from '../../../infrastructure/logger.js';
13
13
  import { loadNative } from '../../../infrastructure/native.js';
@@ -15,16 +15,17 @@ import { semverCompare } from '../../../infrastructure/update-check.js';
15
15
  import { normalizePath } from '../../../shared/constants.js';
16
16
  import { toErrorMessage } from '../../../shared/errors.js';
17
17
  import { CODEGRAPH_VERSION } from '../../../shared/version.js';
18
- import { classifyNativeDrops, formatDropExtensionSummary, getActiveEngine, getInstalledWasmExtensions, parseFilesAuto, } from '../../parser.js';
18
+ import { classifyNativeDrops, formatDropExtensionSummary, getActiveEngine, getInstalledWasmExtensions, NATIVE_SUPPORTED_EXTENSIONS, parseFilesWasmForBackfill, } from '../../parser.js';
19
+ import { writeJournalHeader } from '../journal.js';
19
20
  import { setWorkspaces } from '../resolve.js';
20
21
  import { PipelineContext } from './context.js';
21
- import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
22
+ import { batchInsertNodes, collectFiles as collectFilesUtil, fileHash, fileStat, loadPathAliases, readFileSafe, } from './helpers.js';
22
23
  import { NativeDbProxy } from './native-db-proxy.js';
23
24
  import { buildEdges } from './stages/build-edges.js';
24
25
  import { buildStructure } from './stages/build-structure.js';
25
26
  // Pipeline stages
26
27
  import { collectFiles } from './stages/collect-files.js';
27
- import { detectChanges } from './stages/detect-changes.js';
28
+ import { detectChanges, detectNoChanges } from './stages/detect-changes.js';
28
29
  import { finalize } from './stages/finalize.js';
29
30
  import { insertNodes } from './stages/insert-nodes.js';
30
31
  import { parseFiles } from './stages/parse-files.js';
@@ -42,9 +43,10 @@ function initializeEngine(ctx) {
42
43
  suspendJsDb: undefined,
43
44
  resumeJsDb: undefined,
44
45
  };
45
- const { name: engineName, version: engineVersion } = getActiveEngine(ctx.engineOpts);
46
+ const { name: engineName, version: engineVersion, binaryVersion: nativeBinaryVersion, } = getActiveEngine(ctx.engineOpts);
46
47
  ctx.engineName = engineName;
47
48
  ctx.engineVersion = engineVersion;
49
+ ctx.nativeBinaryVersion = nativeBinaryVersion;
48
50
  info(`Using ${engineName} engine${engineVersion ? ` (v${engineVersion})` : ''}`);
49
51
  }
50
52
  function checkEngineSchemaMismatch(ctx) {
@@ -67,12 +69,14 @@ function checkEngineSchemaMismatch(ctx) {
67
69
  info(`Schema version changed (${prevSchema} → ${ctx.schemaVersion}), promoting to full rebuild.`);
68
70
  ctx.forceFullRebuild = true;
69
71
  }
70
- // When the native engine is active, the Rust addon's version (ctx.engineVersion)
71
- // is written into codegraph_version by setBuildMeta after a native orchestrator
72
- // build. The check must compare against the same version, otherwise JS and Rust
73
- // fight over which version to record — causing every incremental build to be
74
- // promoted to a full rebuild when npm and crate versions diverge.
75
- const effectiveVersion = ctx.engineName === 'native' && ctx.engineVersion ? ctx.engineVersion : CODEGRAPH_VERSION;
72
+ // When the native engine is active, the Rust orchestrator writes
73
+ // build_meta.codegraph_version = CARGO_PKG_VERSION (the binary's own value).
74
+ // Compare against the same value here so a CI hot-swap that leaves the
75
+ // platform package.json behind doesn't trigger a perpetual full-rebuild
76
+ // loop on every incremental (#1066).
77
+ const effectiveVersion = ctx.engineName === 'native' && ctx.nativeBinaryVersion
78
+ ? ctx.nativeBinaryVersion
79
+ : CODEGRAPH_VERSION;
76
80
  const prevVersion = meta('codegraph_version');
77
81
  if (prevVersion && prevVersion !== effectiveVersion) {
78
82
  info(`Codegraph version changed (${prevVersion} → ${effectiveVersion}), promoting to full rebuild.`);
@@ -113,7 +117,9 @@ function loadAliases(ctx) {
113
117
  }
114
118
  function setupPipeline(ctx) {
115
119
  ctx.rootDir = path.resolve(ctx.rootDir);
116
- ctx.dbPath = path.join(ctx.rootDir, '.codegraph', 'graph.db');
120
+ ctx.dbPath = ctx.opts.dbPath
121
+ ? path.resolve(ctx.opts.dbPath)
122
+ : path.join(ctx.rootDir, '.codegraph', 'graph.db');
117
123
  // Detect whether native engine is available.
118
124
  const enginePref = ctx.opts.engine || 'auto';
119
125
  const native = enginePref !== 'wasm' ? loadNative() : null;
@@ -129,6 +135,16 @@ function setupPipeline(ctx) {
129
135
  ctx.db = openDb(ctx.dbPath);
130
136
  initSchema(ctx.db);
131
137
  ctx.config = loadConfig(ctx.rootDir);
138
+ // Merge caller-supplied excludes on top of the file-config excludes so
139
+ // programmatic callers (e.g. benchmark scripts) can extend exclusion
140
+ // without mutating .codegraphrc.json. Native orchestrator picks this up
141
+ // automatically — it reads exclude off the serialized ctx.config below.
142
+ if (ctx.opts.exclude?.length) {
143
+ ctx.config = {
144
+ ...ctx.config,
145
+ exclude: [...(ctx.config.exclude ?? []), ...ctx.opts.exclude],
146
+ };
147
+ }
132
148
  ctx.incremental =
133
149
  ctx.opts.incremental !== false && ctx.config.build && ctx.config.build.incremental !== false;
134
150
  initializeEngine(ctx);
@@ -508,6 +524,15 @@ async function tryNativeOrchestrator(ctx) {
508
524
  const result = JSON.parse(resultJson);
509
525
  if (result.earlyExit) {
510
526
  info('No changes detected');
527
+ // Even on no-op rebuilds, dropped-language files added since the last
528
+ // full build are still missing from `nodes`/`file_hashes` (#1083), and
529
+ // WASM-only files deleted from disk leave stale rows behind (#1073).
530
+ // The orchestrator's file_collector skipped them, so its earlyExit
531
+ // doesn't imply DB consistency. Run the gap repair before returning.
532
+ const gap = detectDroppedLanguageGap(ctx);
533
+ if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) {
534
+ await backfillNativeDroppedFiles(ctx, gap);
535
+ }
511
536
  closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
512
537
  return 'early-exit';
513
538
  }
@@ -519,16 +544,24 @@ async function tryNativeOrchestrator(ctx) {
519
544
  }
520
545
  const p = result.phases;
521
546
  // Sync build_meta so JS-side version/engine checks work on next build.
522
- // Use the Rust addon version as codegraph_version when the native
523
- // orchestrator performed the build — the Rust side's check_version_mismatch
524
- // compares this value against CARGO_PKG_VERSION. Writing the JS
525
- // CODEGRAPH_VERSION here would create a permanent mismatch whenever the
526
- // npm package version diverges from the Rust crate version, forcing every
527
- // subsequent native build to be a full rebuild (no incremental).
547
+ // Use the binary's CARGO_PKG_VERSION (ctx.nativeBinaryVersion), not the
548
+ // platform package.json version (ctx.engineVersion). The Rust side's
549
+ // check_version_mismatch compares against CARGO_PKG_VERSION; writing
550
+ // the package.json value would create a permanent mismatch whenever
551
+ // the binary and platform package.json diverge — e.g., CI hot-swap
552
+ // via ci-install-native.mjs (#1066) — forcing every subsequent build
553
+ // to be a full rebuild.
554
+ //
555
+ // When the native addon doesn't expose engineVersion() (older addon),
556
+ // fall back to CODEGRAPH_VERSION — same fallback used by both
557
+ // checkEngineSchemaMismatch (read path) and persistBuildMetadata
558
+ // (the JS-pipeline write path in finalize.ts). Using ctx.engineVersion
559
+ // here would re-introduce the asymmetry this PR fixes for that case.
560
+ const nativeVersionForMeta = ctx.nativeBinaryVersion || CODEGRAPH_VERSION;
528
561
  setBuildMeta(ctx.db, {
529
562
  engine: ctx.engineName,
530
- engine_version: ctx.engineVersion || '',
531
- codegraph_version: ctx.engineVersion || CODEGRAPH_VERSION,
563
+ engine_version: nativeVersionForMeta,
564
+ codegraph_version: nativeVersionForMeta,
532
565
  schema_version: String(ctx.schemaVersion),
533
566
  built_at: new Date().toISOString(),
534
567
  });
@@ -578,41 +611,153 @@ async function tryNativeOrchestrator(ctx) {
578
611
  // stale native binaries). WASM handles those — backfill via WASM so both
579
612
  // engines process the same file set (#967).
580
613
  //
581
- // Only runs on full builds: incremental builds only touch changed files,
582
- // which are parsed through parseFilesAuto (which has its own per-file
583
- // backfill), so a full filesystem scan here would be wasted work.
584
- if (result.isFullBuild) {
585
- await backfillNativeDroppedFiles(ctx);
614
+ // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for
615
+ // both gating and the backfill itself. On dirty incrementals/full builds
616
+ // the orchestrator signals trigger backfill, so the walk happens once
617
+ // (instead of redundantly inside backfill). On quiet incrementals we
618
+ // still pay the walk so we can detect brand-new files in dropped-language
619
+ // extensions — a gap that the orchestrator's `detect_removed_files`
620
+ // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap
621
+ // because the expensive part (WASM re-parse of the missing set) is
622
+ // gated below.
623
+ const removedCount = result.removedCount ?? 0;
624
+ const changedCount = result.changedCount ?? 0;
625
+ const gap = detectDroppedLanguageGap(ctx);
626
+ if (result.isFullBuild ||
627
+ removedCount > 0 ||
628
+ changedCount > 0 ||
629
+ gap.missingAbs.length > 0 ||
630
+ gap.staleRel.length > 0) {
631
+ await backfillNativeDroppedFiles(ctx, gap);
586
632
  }
587
633
  closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
588
634
  return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
589
635
  }
590
636
  /**
591
- * Backfill files that the native orchestrator silently dropped during parse.
592
- * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
637
+ * Compute the WASM-only files present in the DB but missing from disk (#1073).
638
+ *
639
+ * Returns relative paths that:
640
+ * - appear in `existingNodes` or `existingHashes` (in DB),
641
+ * - are absent from `expected` (not on disk),
642
+ * - have an extension installed for WASM, AND
643
+ * - have an extension NOT covered by `nativeSupported` — Rust's
644
+ * `purge_changed_files` handles deletion for natively-supported extensions
645
+ * via its own `detect_removed_files`, so the caller must not double-purge.
646
+ *
647
+ * Extensions are lowercased before lookup to match the registry and Rust's
648
+ * `LanguageKind::from_extension` (which normalises case for the languages
649
+ * where both cases are conventional, e.g. R's `.r` / `.R`).
650
+ *
651
+ * DB paths are forced to forward slashes before comparison with `expected`
652
+ * (which is always normalised). The on-disk invariant is that DB rows are
653
+ * written with forward slashes, but a stale row written by older code on
654
+ * Windows could carry back-slashes — normalising here makes the comparison
655
+ * platform-safe and prevents false-positive purges of live rows. We replace
656
+ * `\\` explicitly (rather than calling `normalizePath`, which only touches
657
+ * `path.sep`) so the defence works when running on POSIX against a DB that
658
+ * was migrated from Windows.
659
+ *
660
+ * Exported for unit testing.
593
661
  */
594
- async function backfillNativeDroppedFiles(ctx) {
595
- // Needs a real better-sqlite3 connection for INSERT.
596
- if (ctx.nativeFirstProxy) {
597
- closeNativeDb(ctx, 'pre-parity-backfill');
598
- ctx.db = openDb(ctx.dbPath);
599
- ctx.nativeFirstProxy = false;
662
+ export function computeWasmOnlyStaleFiles(input) {
663
+ const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input;
664
+ const stale = [];
665
+ const seen = new Set();
666
+ const consider = (rawRel) => {
667
+ const rel = rawRel.replace(/\\/g, '/');
668
+ if (expected.has(rel) || seen.has(rel))
669
+ return;
670
+ const ext = path.extname(rel).toLowerCase();
671
+ if (nativeSupported.has(ext))
672
+ return;
673
+ if (!installedExts.has(ext))
674
+ return;
675
+ seen.add(rel);
676
+ // Push the ORIGINAL raw path (not the normalised form) so the eventual
677
+ // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData`
678
+ // matches the actual stored row. The dedup `seen` set keeps the
679
+ // normalised form so a file written once with `\` and once with `/`
680
+ // is still treated as one entry — but the value the SQL sees has to
681
+ // be byte-identical to what's on disk in the DB.
682
+ stale.push(rawRel);
683
+ };
684
+ for (const rel of existingNodes)
685
+ consider(rel);
686
+ for (const rel of existingHashes)
687
+ consider(rel);
688
+ return stale;
689
+ }
690
+ /**
691
+ * Group relative paths by their lowercased extension. Shape matches the bucket
692
+ * type that `formatDropExtensionSummary` consumes, so callers can render a
693
+ * log-friendly per-extension summary without going through `classifyNativeDrops`
694
+ * when the reason is already known (e.g. the stale-purge path where every path
695
+ * is guaranteed `unsupported-by-native`).
696
+ */
697
+ function groupByExtension(relPaths) {
698
+ const buckets = new Map();
699
+ for (const rel of relPaths) {
700
+ const ext = path.extname(rel).toLowerCase();
701
+ let list = buckets.get(ext);
702
+ if (!list) {
703
+ list = [];
704
+ buckets.set(ext, list);
705
+ }
706
+ list.push(rel);
600
707
  }
708
+ return buckets;
709
+ }
710
+ /**
711
+ * Detect files the native orchestrator silently dropped.
712
+ *
713
+ * Walks the filesystem and compares against `nodes` + `file_hashes`. A file
714
+ * is "missing" if it's absent from EITHER table — both must be present for
715
+ * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g.
716
+ * legacy DBs where `nodes` was populated but `file_hashes` was not).
717
+ *
718
+ * Restricted to files with an installed WASM grammar; extensions in
719
+ * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal
720
+ * installs) can't be parsed by either engine, so they're not a native
721
+ * regression — excluding them keeps the warn count in
722
+ * `backfillNativeDroppedFiles` meaningful.
723
+ *
724
+ * Also detects WASM-only files deleted from disk (#1073). Rust's
725
+ * `detect_removed_files` filter (#1070) skips files outside its supported
726
+ * extensions, so deletions of WASM-only languages don't reach the native
727
+ * purge path; the rest of the backfill only inserts rows, so without this
728
+ * step stale `nodes`/`file_hashes` rows would linger across incremental
729
+ * rebuilds until the next full rebuild.
730
+ *
731
+ * Cheap (no DB handoff, no parsing): used both to gate the backfill call
732
+ * and as its working set. NativeDbProxy supports `.prepare().all()`, so
733
+ * this works whether `ctx.db` is a proxy or a real better-sqlite3
734
+ * connection — letting us skip the close-native / reopen-better-sqlite3
735
+ * cost when there's nothing to backfill.
736
+ */
737
+ function detectDroppedLanguageGap(ctx) {
601
738
  const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set());
602
739
  const expected = new Set(collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))));
603
- const existingRows = ctx.db
740
+ const existingNodeRows = ctx.db
604
741
  .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
605
742
  .all();
606
- const existing = new Set(existingRows.map((r) => r.file));
607
- // Restrict backfill to files with an installed WASM grammar. Extensions in
608
- // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
609
- // minimal installs) can't be parsed by either engine, so they're not a
610
- // native regression — excluding them keeps the warn count meaningful.
743
+ const existingNodes = new Set(existingNodeRows.map((r) => r.file));
744
+ let existingHashes = new Set();
745
+ try {
746
+ const existingHashRows = ctx.db
747
+ .prepare('SELECT DISTINCT file FROM file_hashes')
748
+ .all();
749
+ existingHashes = new Set(existingHashRows.map((r) => r.file));
750
+ }
751
+ catch (e) {
752
+ // file_hashes table may not exist on legacy DBs; treat as fully missing
753
+ // so the backfill writes rows on the upsert path below.
754
+ debug(`detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`);
755
+ }
611
756
  const installedExts = getInstalledWasmExtensions();
612
757
  const missingRel = [];
613
758
  const missingAbs = [];
614
759
  for (const rel of expected) {
615
- if (existing.has(rel))
760
+ if (existingNodes.has(rel) && existingHashes.has(rel))
616
761
  continue;
617
762
  const ext = path.extname(rel).toLowerCase();
618
763
  if (!installedExts.has(ext))
@@ -620,6 +765,51 @@ async function backfillNativeDroppedFiles(ctx) {
620
765
  missingRel.push(rel);
621
766
  missingAbs.push(path.join(ctx.rootDir, rel));
622
767
  }
768
+ const staleRel = computeWasmOnlyStaleFiles({
769
+ existingNodes,
770
+ existingHashes,
771
+ expected,
772
+ installedExts,
773
+ nativeSupported: NATIVE_SUPPORTED_EXTENSIONS,
774
+ });
775
+ return { missingRel, missingAbs, staleRel };
776
+ }
777
+ /**
778
+ * Backfill files that the native orchestrator silently dropped during parse.
779
+ * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
780
+ *
781
+ * Also purges stale rows for WASM-only files deleted from disk (#1073), which
782
+ * Rust's `detect_removed_files` filter (#1070) skips.
783
+ *
784
+ * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller
785
+ * can use the same scan for both gating and the actual backfill — avoiding
786
+ * a redundant fs walk when the orchestrator's signals already triggered.
787
+ */
788
+ async function backfillNativeDroppedFiles(ctx, gap) {
789
+ const { missingRel, missingAbs, staleRel } = gap;
790
+ if (missingAbs.length === 0 && staleRel.length === 0)
791
+ return;
792
+ // Now that we know there's work to do, hand off to better-sqlite3 (needed
793
+ // for the INSERT path below).
794
+ if (ctx.nativeFirstProxy) {
795
+ closeNativeDb(ctx, 'pre-parity-backfill');
796
+ ctx.db = openDb(ctx.dbPath);
797
+ ctx.nativeFirstProxy = false;
798
+ }
799
+ const dbConn = ctx.db;
800
+ // Purge WASM-only files that were deleted from disk (#1073). Rust's
801
+ // detect_removed_files skips them and the insert path below never visits
802
+ // them, so without this their rows would persist across rebuilds until the
803
+ // next full rebuild reset the DB.
804
+ if (staleRel.length > 0) {
805
+ // `computeWasmOnlyStaleFiles` guarantees every path here has an extension
806
+ // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would
807
+ // always bucket 100% into `unsupported-by-native`. Build the extension
808
+ // summary directly to avoid a redundant classification pass.
809
+ const staleByExt = groupByExtension(staleRel);
810
+ info(`Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`);
811
+ purgeFilesData(dbConn, staleRel);
812
+ }
623
813
  if (missingAbs.length === 0)
624
814
  return;
625
815
  // Classify drops so users see per-extension reasons instead of just a count
@@ -633,7 +823,7 @@ async function backfillNativeDroppedFiles(ctx) {
633
823
  if (totals['native-extractor-failure'] > 0) {
634
824
  warn(`Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`);
635
825
  }
636
- const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
826
+ const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir);
637
827
  const rows = [];
638
828
  const exportKeys = [];
639
829
  for (const [relPath, symbols] of wasmResults) {
@@ -665,7 +855,7 @@ async function backfillNativeDroppedFiles(ctx) {
665
855
  exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
666
856
  }
667
857
  }
668
- const db = ctx.db;
858
+ const db = dbConn;
669
859
  batchInsertNodes(db, rows);
670
860
  // Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
671
861
  if (exportKeys.length > 0) {
@@ -688,6 +878,67 @@ async function backfillNativeDroppedFiles(ctx) {
688
878
  updateStmt.run(...vals);
689
879
  }
690
880
  }
881
+ // Persist file_hashes rows for every backfilled file. The Rust orchestrator
882
+ // only hashes files it parsed itself, so without this step files in
883
+ // optional-language extensions (e.g. .clj when no Rust extractor exists)
884
+ // would be missing from `file_hashes` — permanently breaking the JS-side
885
+ // fast-skip pre-flight (#1054), which rejects on `collected file missing
886
+ // from file_hashes` and forces every no-op rebuild back through the full
887
+ // ~2s native pipeline (#1068).
888
+ //
889
+ // Iterates `missingRel` (every collected file the Rust orchestrator
890
+ // dropped), not `wasmResults`, so files that produced zero symbols still
891
+ // get a row.
892
+ try {
893
+ const upsertHash = db.prepare('INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)');
894
+ const writeHashes = db.transaction(() => {
895
+ for (let i = 0; i < missingRel.length; i++) {
896
+ const relPath = missingRel[i];
897
+ const absPath = missingAbs[i];
898
+ if (!relPath || !absPath)
899
+ continue;
900
+ let code;
901
+ try {
902
+ code = readFileSafe(absPath);
903
+ }
904
+ catch (e) {
905
+ debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`);
906
+ continue;
907
+ }
908
+ if (code === null)
909
+ continue;
910
+ const stat = fileStat(absPath);
911
+ const mtime = stat ? stat.mtime : 0;
912
+ const size = stat ? stat.size : 0;
913
+ upsertHash.run(relPath, fileHash(code), mtime, size);
914
+ }
915
+ });
916
+ writeHashes();
917
+ }
918
+ catch (e) {
919
+ debug(`backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`);
920
+ }
921
+ // Free WASM parse trees from the inline backfill path (#1058).
922
+ // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree
923
+ // backed by WASM linear memory) on every result, but these symbols are
924
+ // consumed locally for DB row construction and never added to
925
+ // `ctx.allSymbols`, so the finalize-stage `releaseWasmTrees` sweep never
926
+ // sees them. Without this, trees leak WASM memory until process exit —
927
+ // bounded per run but cumulative across in-process integration tests.
928
+ // Mirrors the cleanup discipline established for #931.
929
+ for (const [, symbols] of wasmResults) {
930
+ const tree = symbols._tree;
931
+ if (tree && typeof tree.delete === 'function') {
932
+ try {
933
+ tree.delete();
934
+ }
935
+ catch {
936
+ /* ignore cleanup errors */
937
+ }
938
+ }
939
+ symbols._tree = undefined;
940
+ symbols._langId = undefined;
941
+ }
691
942
  }
692
943
  // ── Pipeline stages execution ───────────────────────────────────────────
693
944
  async function runPipelineStages(ctx) {
@@ -795,6 +1046,61 @@ export async function buildGraph(rootDir, opts = {}) {
795
1046
  ctx.rootDir = rootDir;
796
1047
  try {
797
1048
  setupPipeline(ctx);
1049
+ // ── JS-side fast-skip for native incremental (#1054) ──────────────
1050
+ // The Rust orchestrator's internal early-exit fires reliably locally
1051
+ // but not in CI, where every no-op rebuild was paying the full ~2s
1052
+ // pipeline cost. A read-only mtime+size check here matches WASM's
1053
+ // ~20ms early-exit and skips the orchestrator entirely when no
1054
+ // source files have changed. Tier-2 hashing is left to the native
1055
+ // side: any mismatch falls through and lets Rust's detect_changes
1056
+ // remain the source of truth.
1057
+ //
1058
+ // Diagnostic logging gated by CODEGRAPH_FAST_SKIP_DIAG (#1066) — when
1059
+ // any of the call-site guards short-circuit (forceFullRebuild,
1060
+ // engineName, scope, etc.) we log the reason so the bench gate run
1061
+ // produces observable output even if `detectNoChanges` is never
1062
+ // entered.
1063
+ const fastSkipDiag = process.env.CODEGRAPH_FAST_SKIP_DIAG === '1';
1064
+ if (fastSkipDiag) {
1065
+ const reasons = [];
1066
+ if (!ctx.nativeAvailable)
1067
+ reasons.push('nativeAvailable=false');
1068
+ if (ctx.engineName !== 'native')
1069
+ reasons.push(`engineName=${ctx.engineName}`);
1070
+ if (!ctx.incremental)
1071
+ reasons.push('incremental=false');
1072
+ if (ctx.forceFullRebuild)
1073
+ reasons.push('forceFullRebuild=true');
1074
+ if (ctx.opts.scope)
1075
+ reasons.push('scope=set');
1076
+ if (reasons.length > 0) {
1077
+ info(`[fast-skip] false: pre-flight gate skipped — ${reasons.join(', ')}`);
1078
+ }
1079
+ }
1080
+ if (ctx.nativeAvailable &&
1081
+ ctx.engineName === 'native' &&
1082
+ ctx.incremental &&
1083
+ !ctx.forceFullRebuild &&
1084
+ !ctx.opts.scope) {
1085
+ try {
1086
+ await collectFiles(ctx);
1087
+ if (detectNoChanges(ctx.db, ctx.allFiles, ctx.rootDir, ctx.opts)) {
1088
+ info('No changes detected. Graph is up to date.');
1089
+ writeJournalHeader(ctx.rootDir, Date.now());
1090
+ closeDb(ctx.db);
1091
+ return;
1092
+ }
1093
+ }
1094
+ catch (err) {
1095
+ // Pre-flight is best-effort — any failure falls through to the
1096
+ // orchestrator, which performs its own complete detection.
1097
+ // Reset ctx.allFiles so runPipelineStages re-collects under its own
1098
+ // engine state if we ended up partially populated before throwing.
1099
+ ctx.allFiles = undefined;
1100
+ ctx.discoveredDirs = undefined;
1101
+ debug(`native fast-skip pre-flight failed: ${toErrorMessage(err)}`);
1102
+ }
1103
+ }
798
1104
  // ── Rust orchestrator fast path (#695) ────────────────────────────
799
1105
  // When available, run the entire build pipeline in Rust with zero
800
1106
  // napi crossings (eliminates WAL dual-connection dance). Falls back