@colbymchenry/codegraph-darwin-x64 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/lib/dist/bin/codegraph.js +99 -59
  2. package/lib/dist/bin/codegraph.js.map +1 -1
  3. package/lib/dist/bin/command-supervision.d.ts +12 -0
  4. package/lib/dist/bin/command-supervision.d.ts.map +1 -0
  5. package/lib/dist/bin/command-supervision.js +76 -0
  6. package/lib/dist/bin/command-supervision.js.map +1 -0
  7. package/lib/dist/db/migrations.d.ts +1 -1
  8. package/lib/dist/db/migrations.d.ts.map +1 -1
  9. package/lib/dist/db/migrations.js +25 -1
  10. package/lib/dist/db/migrations.js.map +1 -1
  11. package/lib/dist/db/queries.d.ts.map +1 -1
  12. package/lib/dist/db/queries.js +10 -2
  13. package/lib/dist/db/queries.js.map +1 -1
  14. package/lib/dist/db/schema.sql +11 -0
  15. package/lib/dist/directory.d.ts +32 -0
  16. package/lib/dist/directory.d.ts.map +1 -1
  17. package/lib/dist/directory.js +83 -0
  18. package/lib/dist/directory.js.map +1 -1
  19. package/lib/dist/extraction/index.d.ts +13 -1
  20. package/lib/dist/extraction/index.d.ts.map +1 -1
  21. package/lib/dist/extraction/index.js +310 -218
  22. package/lib/dist/extraction/index.js.map +1 -1
  23. package/lib/dist/extraction/languages/c-cpp.d.ts +16 -0
  24. package/lib/dist/extraction/languages/c-cpp.d.ts.map +1 -1
  25. package/lib/dist/extraction/languages/c-cpp.js +33 -0
  26. package/lib/dist/extraction/languages/c-cpp.js.map +1 -1
  27. package/lib/dist/extraction/parse-pool.d.ts +126 -0
  28. package/lib/dist/extraction/parse-pool.d.ts.map +1 -0
  29. package/lib/dist/extraction/parse-pool.js +319 -0
  30. package/lib/dist/extraction/parse-pool.js.map +1 -0
  31. package/lib/dist/extraction/tree-sitter.d.ts +21 -0
  32. package/lib/dist/extraction/tree-sitter.d.ts.map +1 -1
  33. package/lib/dist/extraction/tree-sitter.js +106 -21
  34. package/lib/dist/extraction/tree-sitter.js.map +1 -1
  35. package/lib/dist/mcp/daemon-paths.d.ts +30 -3
  36. package/lib/dist/mcp/daemon-paths.d.ts.map +1 -1
  37. package/lib/dist/mcp/daemon-paths.js +50 -10
  38. package/lib/dist/mcp/daemon-paths.js.map +1 -1
  39. package/lib/dist/mcp/daemon-registry.d.ts.map +1 -1
  40. package/lib/dist/mcp/daemon-registry.js +7 -3
  41. package/lib/dist/mcp/daemon-registry.js.map +1 -1
  42. package/lib/dist/mcp/daemon.d.ts +48 -0
  43. package/lib/dist/mcp/daemon.d.ts.map +1 -1
  44. package/lib/dist/mcp/daemon.js +203 -32
  45. package/lib/dist/mcp/daemon.js.map +1 -1
  46. package/lib/dist/mcp/engine.d.ts +17 -0
  47. package/lib/dist/mcp/engine.d.ts.map +1 -1
  48. package/lib/dist/mcp/engine.js +73 -1
  49. package/lib/dist/mcp/engine.js.map +1 -1
  50. package/lib/dist/mcp/index.d.ts.map +1 -1
  51. package/lib/dist/mcp/index.js +25 -43
  52. package/lib/dist/mcp/index.js.map +1 -1
  53. package/lib/dist/mcp/ppid-watchdog.d.ts +18 -0
  54. package/lib/dist/mcp/ppid-watchdog.d.ts.map +1 -1
  55. package/lib/dist/mcp/ppid-watchdog.js +37 -0
  56. package/lib/dist/mcp/ppid-watchdog.js.map +1 -1
  57. package/lib/dist/mcp/query-pool.d.ts +94 -0
  58. package/lib/dist/mcp/query-pool.d.ts.map +1 -0
  59. package/lib/dist/mcp/query-pool.js +297 -0
  60. package/lib/dist/mcp/query-pool.js.map +1 -0
  61. package/lib/dist/mcp/query-worker.d.ts +24 -0
  62. package/lib/dist/mcp/query-worker.d.ts.map +1 -0
  63. package/lib/dist/mcp/query-worker.js +87 -0
  64. package/lib/dist/mcp/query-worker.js.map +1 -0
  65. package/lib/dist/mcp/tools.d.ts +57 -0
  66. package/lib/dist/mcp/tools.d.ts.map +1 -1
  67. package/lib/dist/mcp/tools.js +196 -40
  68. package/lib/dist/mcp/tools.js.map +1 -1
  69. package/lib/dist/project-config.d.ts +20 -0
  70. package/lib/dist/project-config.d.ts.map +1 -1
  71. package/lib/dist/project-config.js +42 -2
  72. package/lib/dist/project-config.js.map +1 -1
  73. package/lib/dist/resolution/c-fnptr-synthesizer.d.ts +0 -28
  74. package/lib/dist/resolution/c-fnptr-synthesizer.d.ts.map +1 -1
  75. package/lib/dist/resolution/c-fnptr-synthesizer.js +765 -79
  76. package/lib/dist/resolution/c-fnptr-synthesizer.js.map +1 -1
  77. package/lib/dist/resolution/name-matcher.d.ts.map +1 -1
  78. package/lib/dist/resolution/name-matcher.js +44 -0
  79. package/lib/dist/resolution/name-matcher.js.map +1 -1
  80. package/lib/dist/sync/worktree.d.ts +9 -0
  81. package/lib/dist/sync/worktree.d.ts.map +1 -1
  82. package/lib/dist/sync/worktree.js +40 -0
  83. package/lib/dist/sync/worktree.js.map +1 -1
  84. package/lib/dist/types.d.ts +6 -1
  85. package/lib/dist/types.d.ts.map +1 -1
  86. package/lib/node_modules/.package-lock.json +1 -1
  87. package/lib/package.json +1 -1
  88. package/package.json +1 -1
@@ -51,9 +51,11 @@ exports.scanDirectoryAsync = scanDirectoryAsync;
51
51
  const fs = __importStar(require("fs"));
52
52
  const fsp = __importStar(require("fs/promises"));
53
53
  const path = __importStar(require("path"));
54
+ const os = __importStar(require("os"));
54
55
  const crypto = __importStar(require("crypto"));
55
56
  const child_process_1 = require("child_process");
56
57
  const tree_sitter_1 = require("./tree-sitter");
58
+ const parse_pool_1 = require("./parse-pool");
57
59
  const grammars_1 = require("./grammars");
58
60
  const project_config_1 = require("../project-config");
59
61
  const directory_1 = require("../directory");
@@ -155,12 +157,34 @@ const DEFAULT_IGNORE_DIRS = new Set([
155
157
  // Generic cache
156
158
  '.cache',
157
159
  ]);
160
+ /**
161
+ * Android resource directory types. A `res/` tree holds ONLY non-code resources —
162
+ * layouts, drawables, value bags (strings/colors/styles), menus, navigation
163
+ * graphs — split into one typed subdirectory per kind, optionally density/locale/
164
+ * version-qualified (`values-es`, `drawable-hdpi`, `layout-v21`, …). None of it
165
+ * yields an extractable code symbol, yet on an Android app it DOMINATES the tree
166
+ * (one report: 26k XML files = 97% of the project, 0 symbols), bloating the DB,
167
+ * slowing indexing, and skewing both the file count and `codegraph_explore`
168
+ * results (#1047). So these are excluded by default. The structure is
169
+ * self-identifying — a non-Android project has no `res/layout/` etc., so it's
170
+ * untouched — and the only XML that DOES produce symbols (MyBatis mappers) lives
171
+ * under `src/main/resources/`, never `res/`, so nothing useful is dropped.
172
+ * `res/raw/` is deliberately NOT here: it holds arbitrary bundled assets that can
173
+ * be code-ish (a `.sql` schema, a `.js`), so we leave it indexed. Override any of
174
+ * these with a `.gitignore` negation (e.g. `!res/values/`).
175
+ */
176
+ const ANDROID_RES_TYPES = [
177
+ 'anim', 'animator', 'color', 'drawable', 'font', 'layout',
178
+ 'menu', 'mipmap', 'navigation', 'transition', 'values', 'xml',
179
+ ];
158
180
  /** Gitignore-style patterns for the `ignore` matcher: the dirs above plus a few globs. */
159
181
  const DEFAULT_IGNORE_PATTERNS = [
160
182
  ...Array.from(DEFAULT_IGNORE_DIRS, (d) => `${d}/`),
161
183
  '*.egg-info/', // Python packaging metadata
162
184
  'cmake-build-*/', // CLion / CMake build trees
163
185
  'bazel-*/', // Bazel output symlink trees
186
+ // Android resource dirs at any depth, with their qualifier variants (#1047).
187
+ ...ANDROID_RES_TYPES.map((t) => `**/res/${t}*/`),
164
188
  ];
165
189
  /** True if `buf` decodes as strict UTF-8 (no invalid byte sequences). */
166
190
  function isValidUtf8(buf) {
@@ -265,6 +289,19 @@ function loadIncludeIgnoredMatcher(rootDir) {
265
289
  const patterns = (0, project_config_1.loadIncludeIgnoredPatterns)(rootDir);
266
290
  return patterns.length > 0 ? (0, ignore_1.default)().add(patterns) : null;
267
291
  }
292
+ /**
293
+ * Matcher for the project's `codegraph.json` `exclude` patterns — paths to keep
294
+ * OUT of the index even when git-tracked, which `.gitignore` cannot do (#999).
295
+ * The escape hatch for a committed vendor/theme/SDK directory. Returns `null`
296
+ * when nothing is excluded (the zero-config default → no overhead). Matched
297
+ * against project-root-relative paths, so it applies uniformly across the whole
298
+ * workspace, including inside embedded repos (excluding `static/` means gone
299
+ * everywhere). Built once per scan/sync/scope operation from the scan root.
300
+ */
301
+ function loadExcludeMatcher(rootDir) {
302
+ const patterns = (0, project_config_1.loadExcludePatterns)(rootDir);
303
+ return patterns.length > 0 ? (0, ignore_1.default)().add(patterns) : null;
304
+ }
268
305
  /**
269
306
  * `git ls-files --directory` collapses a wholly-untracked/ignored directory into
270
307
  * one entry — and when the command's own cwd is such a directory (the indexed
@@ -404,14 +441,27 @@ function findNestedGitRepos(absDir, relPrefix) {
404
441
  */
405
442
  class ScopeIgnore {
406
443
  rootMatcher;
444
+ exclude;
407
445
  embedded;
408
446
  defaults = defaultsOnlyIgnore();
409
- constructor(rootMatcher, embedded) {
447
+ constructor(rootMatcher, embedded,
448
+ /**
449
+ * Project `codegraph.json` `exclude` patterns (#999), matched against the
450
+ * full root-relative path. Wins over everything else — an explicit user
451
+ * exclude applies even to tracked files and even inside embedded repos.
452
+ */
453
+ exclude = null) {
410
454
  this.rootMatcher = rootMatcher;
455
+ this.exclude = exclude;
411
456
  // Longest root first so paths in nested embedded repos hit the innermost matcher.
412
457
  this.embedded = [...embedded].sort((a, b) => b.root.length - a.root.length);
413
458
  }
414
459
  ignores(rel) {
460
+ // User `exclude` (#999) is checked first and against the full root-relative
461
+ // path: it must drop git-TRACKED paths (which `.gitignore` can't) and apply
462
+ // everywhere, including ancestors of embedded repos.
463
+ if (this.exclude && this.exclude.ignores(rel))
464
+ return true;
415
465
  for (const { root, matcher } of this.embedded) {
416
466
  if (rel.startsWith(root)) {
417
467
  const inner = rel.slice(root.length);
@@ -438,7 +488,7 @@ exports.ScopeIgnore = ScopeIgnore;
438
488
  */
439
489
  function buildScopeIgnore(rootDir, embeddedRoots) {
440
490
  const roots = embeddedRoots ? [...embeddedRoots] : discoverEmbeddedRepoRoots(rootDir);
441
- return new ScopeIgnore(buildDefaultIgnore(rootDir), roots.map((root) => ({ root, matcher: buildDefaultIgnore(path.join(rootDir, root)) })));
491
+ return new ScopeIgnore(buildDefaultIgnore(rootDir), roots.map((root) => ({ root, matcher: buildDefaultIgnore(path.join(rootDir, root)) })), loadExcludeMatcher(rootDir));
442
492
  }
443
493
  /**
444
494
  * Standalone discovery of every embedded repo root under `rootDir` (relative,
@@ -470,6 +520,28 @@ function discoverEmbeddedRepoRoots(rootDir) {
470
520
  }
471
521
  }
472
522
  catch { /* untracked listing failed — ignored-side discovery still runs */ }
523
+ // Unexpanded gitlinks (mode 160000) with a real checkout on disk — embedded
524
+ // repos `git add`ed without `.gitmodules`, or submodules not active here. The
525
+ // untracked listing above can't see them (they're tracked), so find them the
526
+ // same way collectGitFiles does, keeping watcher scope == indexer scope.
527
+ // (#1031, #1033)
528
+ try {
529
+ const staged = (0, child_process_1.execFileSync)('git', ['ls-files', '-z', '-s', '--recurse-submodules'], { cwd: repoAbs, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true });
530
+ for (const entry of staged.split('\0')) {
531
+ if (!entry || entry.slice(0, 6) !== '160000')
532
+ continue;
533
+ const tab = entry.indexOf('\t');
534
+ if (tab === -1)
535
+ continue;
536
+ const rel = entry.slice(tab + 1);
537
+ const relDir = rel.endsWith('/') ? rel : rel + '/';
538
+ if (defaults.ignores(relDir))
539
+ continue;
540
+ if (classifyGitDir(path.join(repoAbs, rel)) === 'embedded')
541
+ candidates.push(relDir);
542
+ }
543
+ }
544
+ catch { /* staged listing failed — other discovery still runs */ }
473
545
  candidates.push(...findIgnoredEmbeddedRepos(repoAbs, includeIgnored, prefix));
474
546
  for (const rel of candidates) {
475
547
  const full = (0, utils_1.normalizePath)(prefix + rel);
@@ -535,14 +607,37 @@ function collectGitFiles(repoDir, prefix, files, embeddedRoots, includeIgnored =
535
607
  // Without this, monorepos using submodules index 0 files. (See issue #147.)
536
608
  // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
537
609
  // can't be combined with -o, so untracked files are gathered separately below.
610
+ //
611
+ // We use --stage (-s) rather than -c so each entry carries its file mode. That
612
+ // lets us spot gitlink entries (mode 160000) that --recurse-submodules did NOT
613
+ // expand: a nested repo `git add`ed without a `.gitmodules` entry, or a
614
+ // submodule that isn't active/initialized in this checkout. Such a gitlink
615
+ // falls through every pass — it's tracked, so the untracked `-o` listing below
616
+ // never reports it, and --recurse-submodules only expands ACTIVE submodules —
617
+ // so its source would be silently skipped, leaving only the super-repo's own
618
+ // files indexed. We collect those gitlinks here and recurse into them below.
619
+ // (An active submodule is expanded inline by --recurse-submodules and so never
620
+ // surfaces as a 160000 entry — only the unhandled gitlinks do.) (#1031, #1033)
621
+ //
538
622
  // -z gives NUL-separated, unquoted output so non-ASCII (e.g. CJK) paths
539
623
  // survive verbatim. Without it git octal-escapes and double-quotes such paths
540
624
  // (the core.quotepath default), and the quoted form never matches a real file
541
- // on disk → those files are silently dropped from the index. (#541)
542
- const tracked = (0, child_process_1.execFileSync)('git', ['ls-files', '-z', '-c', '--recurse-submodules'], gitOpts);
543
- for (const rel of tracked.split('\0')) {
544
- if (rel)
545
- files.add((0, utils_1.normalizePath)(prefix + rel));
625
+ // on disk → those files are silently dropped from the index. (#541) With -s the
626
+ // path follows a TAB after the `<mode> <object> <stage>` prefix.
627
+ const gitlinkRels = [];
628
+ const tracked = (0, child_process_1.execFileSync)('git', ['ls-files', '-z', '-s', '--recurse-submodules'], gitOpts);
629
+ for (const entry of tracked.split('\0')) {
630
+ if (!entry)
631
+ continue;
632
+ const tab = entry.indexOf('\t');
633
+ if (tab === -1)
634
+ continue; // --stage always emits "<mode> <object> <stage>\t<path>"
635
+ const rel = entry.slice(tab + 1);
636
+ if (entry.slice(0, 6) === '160000') {
637
+ gitlinkRels.push(rel); // an unexpanded gitlink — recursed into below, not a source file itself
638
+ continue;
639
+ }
640
+ files.add((0, utils_1.normalizePath)(prefix + rel));
546
641
  }
547
642
  // Untracked files (submodules manage their own untracked state). Embedded git
548
643
  // repos surface here as a single "subdir/" entry that git refuses to descend
@@ -568,6 +663,25 @@ function collectGitFiles(repoDir, prefix, files, embeddedRoots, includeIgnored =
568
663
  }
569
664
  files.add((0, utils_1.normalizePath)(prefix + rel));
570
665
  }
666
+ // Gitlink entries (mode 160000) that --recurse-submodules left unexpanded —
667
+ // an embedded repo `git add`ed without `.gitmodules`, or a submodule not
668
+ // active/initialized in this checkout. When such a gitlink has a real working
669
+ // tree on disk it is distinct first-party code we must index as its own
670
+ // embedded repo: the tracked pass skipped its contents and the untracked pass
671
+ // never sees it (it's tracked, not "other"). A gitlink with no checkout on disk
672
+ // (an uninitialized submodule — empty dir, no `.git`) has nothing to index and
673
+ // is left alone, as is a submodule worktree (a duplicate view, #945). (#1031, #1033)
674
+ for (const rel of gitlinkRels) {
675
+ const relDir = rel.endsWith('/') ? rel : rel + '/';
676
+ if (defaultsOnlyIgnore().ignores(relDir))
677
+ continue;
678
+ const childDir = path.join(repoDir, rel);
679
+ // 'embedded' = a real .git checkout on disk; 'worktree' and 'none' are skipped.
680
+ if (classifyGitDir(childDir) !== 'embedded')
681
+ continue;
682
+ embeddedRoots?.add((0, utils_1.normalizePath)(prefix + relDir));
683
+ collectGitFiles(childDir, prefix + relDir, files, embeddedRoots, includeIgnored);
684
+ }
571
685
  // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a
572
686
  // super-repo .gitignore) never appear in any listing above. By default they
573
687
  // stay hidden — `.gitignore` is respected (#970, #976). They are recursed into
@@ -637,14 +751,14 @@ function getGitChangedFiles(rootDir) {
637
751
  // Custom extension → language overrides from the project's codegraph.json,
638
752
  // so change detection sees the same custom-extension files the full index does.
639
753
  const overrides = (0, project_config_1.loadExtensionOverrides)(rootDir);
640
- collectGitStatus(rootDir, '', changes, overrides, loadIncludeIgnoredMatcher(rootDir));
754
+ collectGitStatus(rootDir, '', changes, overrides, loadIncludeIgnoredMatcher(rootDir), loadExcludeMatcher(rootDir));
641
755
  return changes;
642
756
  }
643
757
  catch {
644
758
  return null;
645
759
  }
646
760
  }
647
- function collectGitStatus(repoDir, prefix, out, overrides, includeIgnored = null) {
761
+ function collectGitStatus(repoDir, prefix, out, overrides, includeIgnored = null, exclude = null) {
648
762
  const output = (0, child_process_1.execFileSync)('git', ['status', '--porcelain', '--no-renames'], { cwd: repoDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true });
649
763
  // This repo's own ignore rules — built-in defaults (#407) plus its .gitignore.
650
764
  // Change detection must exclude the SAME files the full index does, but git
@@ -683,6 +797,12 @@ function collectGitStatus(repoDir, prefix, out, overrides, includeIgnored = null
683
797
  // index — match against the repo-relative path, same as the full scan. (#766)
684
798
  if (ig.ignores(rel))
685
799
  continue;
800
+ // User `codegraph.json` `exclude` (#999) is project-root-relative, so it's
801
+ // matched against the full path — sync must not re-add a tracked file the
802
+ // full index now keeps out. Deletions above stay unfiltered so a file that
803
+ // WAS indexed before an exclude was added still cleans itself out.
804
+ if (exclude && exclude.ignores(filePath))
805
+ continue;
686
806
  if (statusCode === '??') {
687
807
  out.added.push(filePath);
688
808
  }
@@ -697,11 +817,11 @@ function collectGitStatus(repoDir, prefix, out, overrides, includeIgnored = null
697
817
  // and they are left alone (#970, #976), mirroring the full-index scan.
698
818
  for (const rel of untrackedDirs) {
699
819
  for (const repoRel of findNestedGitRepos(path.join(repoDir, rel), rel)) {
700
- collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides, includeIgnored);
820
+ collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides, includeIgnored, exclude);
701
821
  }
702
822
  }
703
823
  for (const rel of findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix)) {
704
- collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides, includeIgnored);
824
+ collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides, includeIgnored, exclude);
705
825
  }
706
826
  }
707
827
  /**
@@ -860,7 +980,14 @@ function scanDirectoryWalk(rootDir, onProgress) {
860
980
  }
861
981
  // Seed a base matcher with the built-in default ignores (merged with the root
862
982
  // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
863
- walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
983
+ const baseMatchers = [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }];
984
+ // Project `codegraph.json` `exclude` patterns (#999), rooted at the project so
985
+ // `isIgnored` matches them against root-relative paths — same coverage the
986
+ // git path gets via ScopeIgnore, for non-git projects.
987
+ const exclude = loadExcludeMatcher(rootDir);
988
+ if (exclude)
989
+ baseMatchers.push({ dir: rootDir, ig: exclude });
990
+ walk(rootDir, baseMatchers);
864
991
  return files;
865
992
  }
866
993
  /**
@@ -1022,148 +1149,153 @@ class ExtractionOrchestrator {
1022
1149
  if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
1023
1150
  neededLanguages.push('cpp');
1024
1151
  }
1025
- // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
1026
- // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
1152
+ // Parse files on a pool of worker threads (keeps the main thread free for UI
1153
+ // and uses every core). Falls back to in-process parsing when the compiled
1154
+ // worker is unavailable (e.g. running from source in tests).
1027
1155
  const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
1028
1156
  const useWorker = fs.existsSync(parseWorkerPath);
1029
- let WorkerClass = null;
1157
+ let pool = null;
1030
1158
  if (useWorker) {
1031
- const { Worker } = await Promise.resolve().then(() => __importStar(require('worker_threads')));
1032
- WorkerClass = Worker;
1159
+ // CODEGRAPH_PARSE_WORKERS: explicit worker count; 1 = the old single-worker
1160
+ // behaviour (the conservative rollback). Unset → clamp(cores-1, 1, 8).
1161
+ const poolSize = (0, parse_pool_1.resolveParsePoolSize)(process.env.CODEGRAPH_PARSE_WORKERS, os.cpus().length);
1162
+ pool = new parse_pool_1.ParseWorkerPool({
1163
+ languages: neededLanguages,
1164
+ size: poolSize,
1165
+ workerScriptPath: parseWorkerPath,
1166
+ recycleInterval: WORKER_RECYCLE_INTERVAL,
1167
+ parseTimeoutMs: PARSE_TIMEOUT_MS,
1168
+ log,
1169
+ });
1170
+ log(`Parse worker pool: ${poolSize} worker(s)`);
1033
1171
  }
1034
1172
  else {
1035
- // In-process fallback: load grammars locally
1173
+ // In-process fallback: load grammars locally and parse on the main thread.
1036
1174
  await (0, grammars_1.loadGrammarsForLanguages)(neededLanguages);
1037
1175
  }
1038
- // --- Worker lifecycle management ---
1039
- // The worker can crash (OOM in WASM) or hang on pathological files.
1040
- // We track pending parse promises and handle both cases:
1041
- // - Timeout: terminate + restart the worker, reject the timed-out request
1042
- // - Crash: reject all pending promises, restart for remaining files
1043
- let parseWorker = null;
1044
- let nextId = 0;
1045
- let workerParseCount = 0;
1046
- const pendingParses = new Map();
1047
- function rejectAllPending(reason) {
1048
- for (const [id, pending] of pendingParses) {
1049
- clearTimeout(pending.timer);
1050
- pendingParses.delete(id);
1051
- pending.reject(new Error(reason));
1176
+ /**
1177
+ * Parse one file: on the pool when available (the promise REJECTS on a worker
1178
+ * crash/timeout — the caller records it and the retry pass re-attempts), or
1179
+ * in-process synchronously as the no-worker fallback. The language is resolved
1180
+ * here on the main thread, where the codegraph.json overrides are loaded.
1181
+ */
1182
+ const parseFile = (filePath, content) => {
1183
+ const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1184
+ if (!pool)
1185
+ return Promise.resolve((0, tree_sitter_1.extractFromSource)(filePath, content, language, frameworkNames));
1186
+ return pool.requestParse({ filePath, content, language, frameworkNames });
1187
+ };
1188
+ // --- Bounded rolling-window dispatch, ordered commit ---
1189
+ // Reads stay batched/parallel; parses run concurrently across the pool; the
1190
+ // SQLite store stays on the main thread (it isn't thread-safe). Crucially we
1191
+ // COMMIT results in original file order, not parse-completion order: the
1192
+ // resolution phase (run after indexing) resolves an ambiguous reference to one
1193
+ // of several same-named candidates by the nodes' DB insertion order, so a
1194
+ // stable commit order keeps the resulting graph deterministic — byte-identical
1195
+ // to the single-worker path — instead of drifting with parse timing. The
1196
+ // `completed` buffer holds at most ~windowSize out-of-order results, so memory
1197
+ // stays bounded.
1198
+ const windowSize = pool ? Math.max(4, pool.size * 2) : 1;
1199
+ const inFlight = new Set();
1200
+ const completed = new Map();
1201
+ let nextSeq = 0; // file-order sequence assigned at dispatch
1202
+ let nextToStore = 0; // cursor: next sequence to commit
1203
+ let aborted = false;
1204
+ const storeResult = (filePath, content, stats, result) => {
1205
+ processed++;
1206
+ // Store in database on main thread (SQLite is not thread-safe)
1207
+ if (result.nodes.length > 0 || result.errors.length === 0) {
1208
+ const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1209
+ this.storeExtractionResult(filePath, content, language, stats, result);
1052
1210
  }
1053
- }
1054
- function attachWorkerHandlers(w) {
1055
- w.on('message', (msg) => {
1056
- if (msg.type === 'parse-result' && msg.id !== undefined) {
1057
- const pending = pendingParses.get(msg.id);
1058
- if (pending) {
1059
- clearTimeout(pending.timer);
1060
- pendingParses.delete(msg.id);
1061
- pending.resolve(msg.result);
1062
- }
1211
+ if (result.errors.length > 0) {
1212
+ for (const err of result.errors) {
1213
+ if (!err.filePath)
1214
+ err.filePath = filePath;
1063
1215
  }
1064
- });
1065
- w.on('error', (err) => {
1066
- (0, errors_1.logWarn)('Parse worker error', { error: err.message });
1067
- rejectAllPending(`Worker error: ${err.message}`);
1068
- });
1069
- w.on('exit', (code) => {
1070
- if (code !== 0 && pendingParses.size > 0) {
1071
- (0, errors_1.logWarn)('Parse worker exited unexpectedly', { code });
1072
- rejectAllPending(`Worker exited with code ${code}`);
1216
+ errors.push(...result.errors);
1217
+ }
1218
+ if (result.nodes.length > 0) {
1219
+ filesIndexed++;
1220
+ totalNodes += result.nodes.length;
1221
+ totalEdges += result.edges.length;
1222
+ }
1223
+ else if (result.errors.some((e) => e.severity === 'error')) {
1224
+ filesErrored++;
1225
+ }
1226
+ else {
1227
+ // Files with no symbols but no errors (yaml, twig, properties) are
1228
+ // tracked at the file level — count them as indexed so the CLI doesn't
1229
+ // misleadingly report "No files found to index".
1230
+ const lang = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1231
+ if ((0, grammars_1.isFileLevelOnlyLanguage)(lang)) {
1232
+ filesIndexed++;
1073
1233
  }
1074
- // Clear reference so we know to respawn, reset count so
1075
- // the fresh worker gets a full cycle before recycling.
1076
- if (parseWorker === w) {
1077
- parseWorker = null;
1078
- workerParseCount = 0;
1234
+ else {
1235
+ filesSkipped++;
1079
1236
  }
1237
+ }
1238
+ onProgress?.({ phase: 'parsing', current: processed, total, currentFile: filePath });
1239
+ };
1240
+ const recordParseFailure = (filePath, err) => {
1241
+ processed++;
1242
+ filesErrored++;
1243
+ errors.push({
1244
+ message: err instanceof Error ? err.message : String(err),
1245
+ filePath,
1246
+ severity: 'error',
1247
+ code: 'parse_error',
1080
1248
  });
1081
- }
1082
- async function ensureWorker() {
1083
- if (parseWorker)
1084
- return parseWorker;
1085
- log('Spawning new parse worker...');
1086
- parseWorker = new WorkerClass(parseWorkerPath);
1087
- attachWorkerHandlers(parseWorker);
1088
- // Load grammars in the new worker
1089
- await new Promise((resolve, reject) => {
1090
- parseWorker.once('message', (msg) => {
1091
- if (msg.type === 'grammars-loaded')
1092
- resolve();
1093
- else
1094
- reject(new Error(`Unexpected message: ${msg.type}`));
1095
- });
1096
- parseWorker.postMessage({ type: 'load-grammars', languages: neededLanguages });
1097
- });
1098
- return parseWorker;
1099
- }
1100
- if (WorkerClass) {
1101
- await ensureWorker();
1102
- }
1103
- /**
1104
- * Recycle the worker thread to reclaim WASM memory.
1105
- * Terminates the current worker and clears the reference so
1106
- * ensureWorker() will spawn a fresh one on the next call.
1107
- */
1108
- function recycleWorker() {
1109
- if (!parseWorker)
1249
+ onProgress?.({ phase: 'parsing', current: processed, total });
1250
+ };
1251
+ // Commit buffered parses to the DB in file order, advancing the cursor over
1252
+ // contiguous completed results. Runs after each parse settles (and once more
1253
+ // after the drain). storeResult / recordParseFailure run here single-threaded,
1254
+ // so shared counters and SQLite writes never race despite parallel parsing.
1255
+ const flushOrdered = () => {
1256
+ if (aborted)
1110
1257
  return;
1111
- log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
1112
- const w = parseWorker;
1113
- parseWorker = null;
1114
- workerParseCount = 0;
1115
- // Fire-and-forget: worker.terminate() can hang if WASM is stuck
1116
- w.terminate().catch(() => { });
1117
- }
1118
- async function requestParse(filePath, content) {
1119
- // Resolve the language on the main thread (where the project's
1120
- // codegraph.json overrides are loaded) and hand it to the worker, so the
1121
- // worker never needs the override map itself.
1122
- const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1123
- if (!WorkerClass) {
1124
- // In-process fallback
1125
- return (0, tree_sitter_1.extractFromSource)(filePath, content, language, frameworkNames);
1258
+ while (completed.has(nextToStore)) {
1259
+ const item = completed.get(nextToStore);
1260
+ completed.delete(nextToStore);
1261
+ nextToStore++;
1262
+ if (item.ok)
1263
+ storeResult(item.filePath, item.content, item.stats, item.result);
1264
+ else
1265
+ recordParseFailure(item.filePath, item.err);
1126
1266
  }
1127
- // Recycle the worker before the next parse if we've hit the threshold.
1128
- // This destroys the WASM linear memory (which can grow but never shrink)
1129
- // and starts a fresh worker with a clean heap.
1130
- if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
1131
- await recycleWorker();
1267
+ };
1268
+ // Dispatch one file's parse (parses run concurrently across the pool), tagged
1269
+ // with its file-order sequence so flushOrdered commits results in order. The
1270
+ // backpressure below bounds how far parsing runs ahead of the in-order commit.
1271
+ const feed = async (filePath, content, stats) => {
1272
+ const seq = nextSeq++;
1273
+ const p = (async () => {
1274
+ try {
1275
+ const result = await parseFile(filePath, content);
1276
+ completed.set(seq, { ok: true, filePath, content, stats, result });
1277
+ }
1278
+ catch (parseErr) {
1279
+ completed.set(seq, { ok: false, filePath, err: parseErr });
1280
+ }
1281
+ flushOrdered();
1282
+ })();
1283
+ const tracked = p.finally(() => { inFlight.delete(tracked); });
1284
+ inFlight.add(tracked);
1285
+ // Backpressure on the dispatched-but-not-yet-committed count (in-flight +
1286
+ // buffered), not just in-flight: a slow file sitting at the commit cursor
1287
+ // lets later parses finish and buffer, which would otherwise grow without
1288
+ // bound. Wait for parses to settle (each may advance the cursor) until the
1289
+ // window has room. `inFlight.size > 0` guards against an empty race — the
1290
+ // cursor file is always still in flight when the window is full.
1291
+ while (nextSeq - nextToStore >= windowSize && inFlight.size > 0) {
1292
+ await Promise.race(inFlight);
1132
1293
  }
1133
- const worker = await ensureWorker();
1134
- const id = nextId++;
1135
- workerParseCount++;
1136
- // Scale timeout for large files: base 10s + 10s per 100KB
1137
- const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;
1138
- return new Promise((resolve, reject) => {
1139
- const timer = setTimeout(() => {
1140
- pendingParses.delete(id);
1141
- log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
1142
- // Reject FIRST — worker.terminate() can hang if WASM is stuck
1143
- parseWorker = null;
1144
- workerParseCount = 0;
1145
- reject(new Error(`Parse timed out after ${timeoutMs}ms`));
1146
- // Fire-and-forget: kill the stuck worker in the background
1147
- worker.terminate().catch(() => { });
1148
- }, timeoutMs);
1149
- pendingParses.set(id, { resolve, reject, timer });
1150
- worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames, language });
1151
- });
1152
- }
1294
+ };
1153
1295
  for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
1154
1296
  if (signal?.aborted) {
1155
- if (parseWorker)
1156
- parseWorker.terminate().catch(() => { });
1157
- return {
1158
- success: false,
1159
- filesIndexed,
1160
- filesSkipped,
1161
- filesErrored,
1162
- nodesCreated: totalNodes,
1163
- edgesCreated: totalEdges,
1164
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1165
- durationMs: Date.now() - startTime,
1166
- };
1297
+ aborted = true;
1298
+ break;
1167
1299
  }
1168
1300
  const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
1169
1301
  // Read files in parallel (with path validation before any I/O)
@@ -1185,29 +1317,13 @@ class ExtractionOrchestrator {
1185
1317
  return { filePath: fp, content: null, stats: null, error: err };
1186
1318
  }
1187
1319
  }));
1188
- // Send to worker for parsing, store results on main thread
1320
+ // Dispatch each readable file into the bounded parse window; the window
1321
+ // stores results on the main thread as they arrive.
1189
1322
  for (const { filePath, content, stats, error } of fileContents) {
1190
1323
  if (signal?.aborted) {
1191
- if (parseWorker)
1192
- parseWorker.terminate().catch(() => { });
1193
- return {
1194
- success: false,
1195
- filesIndexed,
1196
- filesSkipped,
1197
- filesErrored,
1198
- nodesCreated: totalNodes,
1199
- edgesCreated: totalEdges,
1200
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1201
- durationMs: Date.now() - startTime,
1202
- };
1324
+ aborted = true;
1325
+ break;
1203
1326
  }
1204
- // Report progress before parsing (show current file being worked on)
1205
- onProgress?.({
1206
- phase: 'parsing',
1207
- current: processed,
1208
- total,
1209
- currentFile: filePath,
1210
- });
1211
1327
  if (error || content === null || stats === null) {
1212
1328
  processed++;
1213
1329
  filesErrored++;
@@ -1217,6 +1333,7 @@ class ExtractionOrchestrator {
1217
1333
  severity: 'error',
1218
1334
  code: 'read_error',
1219
1335
  });
1336
+ onProgress?.({ phase: 'parsing', current: processed, total });
1220
1337
  continue;
1221
1338
  }
1222
1339
  // Honour MAX_FILE_SIZE. Without this check, vendored generated
@@ -1236,57 +1353,32 @@ class ExtractionOrchestrator {
1236
1353
  onProgress?.({ phase: 'parsing', current: processed, total });
1237
1354
  continue;
1238
1355
  }
1239
- // Parse in worker thread (main thread stays unblocked).
1240
- // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
1241
- let result;
1242
- try {
1243
- result = await requestParse(filePath, content);
1244
- }
1245
- catch (parseErr) {
1246
- processed++;
1247
- filesErrored++;
1248
- errors.push({
1249
- message: parseErr instanceof Error ? parseErr.message : String(parseErr),
1250
- filePath,
1251
- severity: 'error',
1252
- code: 'parse_error',
1253
- });
1254
- continue;
1255
- }
1256
- processed++;
1257
- // Store in database on main thread (SQLite is not thread-safe)
1258
- if (result.nodes.length > 0 || result.errors.length === 0) {
1259
- const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1260
- this.storeExtractionResult(filePath, content, language, stats, result);
1261
- }
1262
- if (result.errors.length > 0) {
1263
- for (const err of result.errors) {
1264
- if (!err.filePath)
1265
- err.filePath = filePath;
1266
- }
1267
- errors.push(...result.errors);
1268
- }
1269
- if (result.nodes.length > 0) {
1270
- filesIndexed++;
1271
- totalNodes += result.nodes.length;
1272
- totalEdges += result.edges.length;
1273
- }
1274
- else if (result.errors.some((e) => e.severity === 'error')) {
1275
- filesErrored++;
1276
- }
1277
- else {
1278
- // Files with no symbols but no errors (yaml, twig, properties) are
1279
- // tracked at the file level — count them as indexed so the CLI
1280
- // doesn't misleadingly report "No files found to index".
1281
- const lang = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1282
- if ((0, grammars_1.isFileLevelOnlyLanguage)(lang)) {
1283
- filesIndexed++;
1284
- }
1285
- else {
1286
- filesSkipped++;
1287
- }
1288
- }
1356
+ // Parse on the pool (main thread stays unblocked). Errors/timeouts are
1357
+ // handled inside feed() / recordParseFailure, feeding the retry pass.
1358
+ await feed(filePath, content, stats);
1289
1359
  }
1360
+ if (aborted)
1361
+ break;
1362
+ }
1363
+ // Drain parses still in flight (skip on abort — we tear down below instead),
1364
+ // then commit any results the cursor hasn't reached yet.
1365
+ if (!aborted) {
1366
+ await Promise.all(inFlight);
1367
+ flushOrdered();
1368
+ }
1369
+ if (signal?.aborted || aborted) {
1370
+ if (pool)
1371
+ await pool.destroy();
1372
+ return {
1373
+ success: false,
1374
+ filesIndexed,
1375
+ filesSkipped,
1376
+ filesErrored,
1377
+ nodesCreated: totalNodes,
1378
+ edgesCreated: totalEdges,
1379
+ errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1380
+ durationMs: Date.now() - startTime,
1381
+ };
1290
1382
  }
1291
1383
  // Report 100% so the progress bar doesn't hang at 99%
1292
1384
  onProgress?.({
@@ -1303,15 +1395,17 @@ class ExtractionOrchestrator {
1303
1395
  // every file gets the absolute cleanest WASM state possible.
1304
1396
  const retryableErrors = errors.filter((e) => e.code === 'parse_error' && e.filePath &&
1305
1397
  (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds')));
1306
- if (retryableErrors.length > 0 && WorkerClass) {
1398
+ if (retryableErrors.length > 0 && pool) {
1307
1399
  log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
1400
+ // Fresh WASM heaps for the retry phase. A retry that still crashes its
1401
+ // worker makes the pool respawn it, so later retries keep landing on clean
1402
+ // workers too.
1403
+ pool.recycleAll();
1308
1404
  const stillFailing = [];
1309
1405
  for (const errEntry of retryableErrors) {
1310
1406
  const filePath = errEntry.filePath;
1311
1407
  if (signal?.aborted)
1312
1408
  break;
1313
- // Fresh worker for every retry — maximum WASM headroom
1314
- recycleWorker();
1315
1409
  let content;
1316
1410
  try {
1317
1411
  const fullPath = (0, utils_1.validatePathWithinRoot)(this.rootDir, filePath);
@@ -1324,7 +1418,7 @@ class ExtractionOrchestrator {
1324
1418
  }
1325
1419
  let result;
1326
1420
  try {
1327
- result = await requestParse(filePath, content);
1421
+ result = await parseFile(filePath, content);
1328
1422
  }
1329
1423
  catch {
1330
1424
  stillFailing.push(errEntry);
@@ -1350,11 +1444,11 @@ class ExtractionOrchestrator {
1350
1444
  // code nodes but consume parser memory.
1351
1445
  if (stillFailing.length > 0) {
1352
1446
  log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
1447
+ pool.recycleAll();
1353
1448
  for (const errEntry of stillFailing) {
1354
1449
  const filePath = errEntry.filePath;
1355
1450
  if (signal?.aborted)
1356
1451
  break;
1357
- recycleWorker();
1358
1452
  let fullContent;
1359
1453
  try {
1360
1454
  const fullPath = (0, utils_1.validatePathWithinRoot)(this.rootDir, filePath);
@@ -1373,7 +1467,7 @@ class ExtractionOrchestrator {
1373
1467
  .join('\n');
1374
1468
  let result;
1375
1469
  try {
1376
- result = await requestParse(filePath, stripped);
1470
+ result = await parseFile(filePath, stripped);
1377
1471
  }
1378
1472
  catch {
1379
1473
  continue;
@@ -1394,11 +1488,9 @@ class ExtractionOrchestrator {
1394
1488
  }
1395
1489
  }
1396
1490
  }
1397
- // Shut down parse worker and clear any pending timers
1398
- rejectAllPending('Indexing complete');
1399
- if (parseWorker) {
1400
- parseWorker.terminate().catch(() => { });
1401
- }
1491
+ // Shut down the parse worker pool.
1492
+ if (pool)
1493
+ await pool.destroy();
1402
1494
  return {
1403
1495
  success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
1404
1496
  filesIndexed,