@colbymchenry/codegraph-darwin-x64 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/lib/dist/bin/codegraph.js +79 -52
  2. package/lib/dist/bin/codegraph.js.map +1 -1
  3. package/lib/dist/bin/command-supervision.d.ts +12 -0
  4. package/lib/dist/bin/command-supervision.d.ts.map +1 -0
  5. package/lib/dist/bin/command-supervision.js +76 -0
  6. package/lib/dist/bin/command-supervision.js.map +1 -0
  7. package/lib/dist/db/queries.d.ts.map +1 -1
  8. package/lib/dist/db/queries.js +10 -2
  9. package/lib/dist/db/queries.js.map +1 -1
  10. package/lib/dist/directory.d.ts +32 -0
  11. package/lib/dist/directory.d.ts.map +1 -1
  12. package/lib/dist/directory.js +83 -0
  13. package/lib/dist/directory.js.map +1 -1
  14. package/lib/dist/extraction/index.d.ts +19 -4
  15. package/lib/dist/extraction/index.d.ts.map +1 -1
  16. package/lib/dist/extraction/index.js +287 -241
  17. package/lib/dist/extraction/index.js.map +1 -1
  18. package/lib/dist/extraction/parse-pool.d.ts +126 -0
  19. package/lib/dist/extraction/parse-pool.d.ts.map +1 -0
  20. package/lib/dist/extraction/parse-pool.js +319 -0
  21. package/lib/dist/extraction/parse-pool.js.map +1 -0
  22. package/lib/dist/extraction/tree-sitter.d.ts.map +1 -1
  23. package/lib/dist/extraction/tree-sitter.js +48 -19
  24. package/lib/dist/extraction/tree-sitter.js.map +1 -1
  25. package/lib/dist/mcp/daemon-paths.d.ts +30 -3
  26. package/lib/dist/mcp/daemon-paths.d.ts.map +1 -1
  27. package/lib/dist/mcp/daemon-paths.js +50 -10
  28. package/lib/dist/mcp/daemon-paths.js.map +1 -1
  29. package/lib/dist/mcp/daemon-registry.d.ts.map +1 -1
  30. package/lib/dist/mcp/daemon-registry.js +7 -3
  31. package/lib/dist/mcp/daemon-registry.js.map +1 -1
  32. package/lib/dist/mcp/daemon.d.ts +38 -0
  33. package/lib/dist/mcp/daemon.d.ts.map +1 -1
  34. package/lib/dist/mcp/daemon.js +168 -19
  35. package/lib/dist/mcp/daemon.js.map +1 -1
  36. package/lib/dist/mcp/engine.d.ts +17 -0
  37. package/lib/dist/mcp/engine.d.ts.map +1 -1
  38. package/lib/dist/mcp/engine.js +73 -1
  39. package/lib/dist/mcp/engine.js.map +1 -1
  40. package/lib/dist/mcp/index.d.ts.map +1 -1
  41. package/lib/dist/mcp/index.js +25 -43
  42. package/lib/dist/mcp/index.js.map +1 -1
  43. package/lib/dist/mcp/ppid-watchdog.d.ts +18 -0
  44. package/lib/dist/mcp/ppid-watchdog.d.ts.map +1 -1
  45. package/lib/dist/mcp/ppid-watchdog.js +37 -0
  46. package/lib/dist/mcp/ppid-watchdog.js.map +1 -1
  47. package/lib/dist/mcp/proxy.d.ts.map +1 -1
  48. package/lib/dist/mcp/proxy.js +14 -1
  49. package/lib/dist/mcp/proxy.js.map +1 -1
  50. package/lib/dist/mcp/query-pool.d.ts +94 -0
  51. package/lib/dist/mcp/query-pool.d.ts.map +1 -0
  52. package/lib/dist/mcp/query-pool.js +297 -0
  53. package/lib/dist/mcp/query-pool.js.map +1 -0
  54. package/lib/dist/mcp/query-worker.d.ts +24 -0
  55. package/lib/dist/mcp/query-worker.d.ts.map +1 -0
  56. package/lib/dist/mcp/query-worker.js +87 -0
  57. package/lib/dist/mcp/query-worker.js.map +1 -0
  58. package/lib/dist/mcp/tools.d.ts +57 -0
  59. package/lib/dist/mcp/tools.d.ts.map +1 -1
  60. package/lib/dist/mcp/tools.js +147 -37
  61. package/lib/dist/mcp/tools.js.map +1 -1
  62. package/lib/dist/project-config.d.ts +37 -0
  63. package/lib/dist/project-config.d.ts.map +1 -1
  64. package/lib/dist/project-config.js +127 -32
  65. package/lib/dist/project-config.js.map +1 -1
  66. package/lib/dist/resolution/c-fnptr-synthesizer.d.ts +0 -28
  67. package/lib/dist/resolution/c-fnptr-synthesizer.d.ts.map +1 -1
  68. package/lib/dist/resolution/c-fnptr-synthesizer.js +765 -79
  69. package/lib/dist/resolution/c-fnptr-synthesizer.js.map +1 -1
  70. package/lib/dist/resolution/name-matcher.d.ts.map +1 -1
  71. package/lib/dist/resolution/name-matcher.js +44 -0
  72. package/lib/dist/resolution/name-matcher.js.map +1 -1
  73. package/lib/node_modules/.package-lock.json +1 -1
  74. package/lib/package.json +1 -1
  75. package/package.json +1 -1
@@ -51,9 +51,11 @@ exports.scanDirectoryAsync = scanDirectoryAsync;
51
51
  const fs = __importStar(require("fs"));
52
52
  const fsp = __importStar(require("fs/promises"));
53
53
  const path = __importStar(require("path"));
54
+ const os = __importStar(require("os"));
54
55
  const crypto = __importStar(require("crypto"));
55
56
  const child_process_1 = require("child_process");
56
57
  const tree_sitter_1 = require("./tree-sitter");
58
+ const parse_pool_1 = require("./parse-pool");
57
59
  const grammars_1 = require("./grammars");
58
60
  const project_config_1 = require("../project-config");
59
61
  const directory_1 = require("../directory");
@@ -252,6 +254,32 @@ function buildDefaultIgnore(rootDir) {
252
254
  function defaultsOnlyIgnore() {
253
255
  return (0, ignore_1.default)().add(DEFAULT_IGNORE_PATTERNS);
254
256
  }
257
+ /**
258
+ * Matcher for the project's `codegraph.json` `includeIgnored` patterns — the
259
+ * explicit opt-in to index embedded git repos living inside gitignored
260
+ * directories (#622, #699). Returns `null` when the project opted in nothing,
261
+ * which is the zero-config DEFAULT: `.gitignore` is then fully respected and a
262
+ * gitignored directory (even one holding nested repos) is never walked or
263
+ * indexed (#970, #976). Built once per scan/sync/scope operation from the scan
264
+ * root and threaded down — never global, so multi-project daemons stay isolated.
265
+ */
266
+ function loadIncludeIgnoredMatcher(rootDir) {
267
+ const patterns = (0, project_config_1.loadIncludeIgnoredPatterns)(rootDir);
268
+ return patterns.length > 0 ? (0, ignore_1.default)().add(patterns) : null;
269
+ }
270
+ /**
271
+ * Matcher for the project's `codegraph.json` `exclude` patterns — paths to keep
272
+ * OUT of the index even when git-tracked, which `.gitignore` cannot do (#999).
273
+ * The escape hatch for a committed vendor/theme/SDK directory. Returns `null`
274
+ * when nothing is excluded (the zero-config default → no overhead). Matched
275
+ * against project-root-relative paths, so it applies uniformly across the whole
276
+ * workspace, including inside embedded repos (excluding `static/` means gone
277
+ * everywhere). Built once per scan/sync/scope operation from the scan root.
278
+ */
279
+ function loadExcludeMatcher(rootDir) {
280
+ const patterns = (0, project_config_1.loadExcludePatterns)(rootDir);
281
+ return patterns.length > 0 ? (0, ignore_1.default)().add(patterns) : null;
282
+ }
255
283
  /**
256
284
  * `git ls-files --directory` collapses a wholly-untracked/ignored directory into
257
285
  * one entry — and when the command's own cwd is such a directory (the indexed
@@ -391,14 +419,27 @@ function findNestedGitRepos(absDir, relPrefix) {
391
419
  */
392
420
  class ScopeIgnore {
393
421
  rootMatcher;
422
+ exclude;
394
423
  embedded;
395
424
  defaults = defaultsOnlyIgnore();
396
- constructor(rootMatcher, embedded) {
425
+ constructor(rootMatcher, embedded,
426
+ /**
427
+ * Project `codegraph.json` `exclude` patterns (#999), matched against the
428
+ * full root-relative path. Wins over everything else — an explicit user
429
+ * exclude applies even to tracked files and even inside embedded repos.
430
+ */
431
+ exclude = null) {
397
432
  this.rootMatcher = rootMatcher;
433
+ this.exclude = exclude;
398
434
  // Longest root first so paths in nested embedded repos hit the innermost matcher.
399
435
  this.embedded = [...embedded].sort((a, b) => b.root.length - a.root.length);
400
436
  }
401
437
  ignores(rel) {
438
+ // User `exclude` (#999) is checked first and against the full root-relative
439
+ // path: it must drop git-TRACKED paths (which `.gitignore` can't) and apply
440
+ // everywhere, including ancestors of embedded repos.
441
+ if (this.exclude && this.exclude.ignores(rel))
442
+ return true;
402
443
  for (const { root, matcher } of this.embedded) {
403
444
  if (rel.startsWith(root)) {
404
445
  const inner = rel.slice(root.length);
@@ -425,13 +466,16 @@ exports.ScopeIgnore = ScopeIgnore;
425
466
  */
426
467
  function buildScopeIgnore(rootDir, embeddedRoots) {
427
468
  const roots = embeddedRoots ? [...embeddedRoots] : discoverEmbeddedRepoRoots(rootDir);
428
- return new ScopeIgnore(buildDefaultIgnore(rootDir), roots.map((root) => ({ root, matcher: buildDefaultIgnore(path.join(rootDir, root)) })));
469
+ return new ScopeIgnore(buildDefaultIgnore(rootDir), roots.map((root) => ({ root, matcher: buildDefaultIgnore(path.join(rootDir, root)) })), loadExcludeMatcher(rootDir));
429
470
  }
430
471
  /**
431
472
  * Standalone discovery of every embedded repo root under `rootDir` (relative,
432
- * trailing-slashed) — both the untracked kind (#193) and the gitignored kind
433
- * (#514), recursively (an embedded repo can embed further repos). Returns []
434
- * for non-git roots: the filesystem walk handles nested repos there already.
473
+ * trailing-slashed) — the untracked kind (#193) always, and the gitignored kind
474
+ * (#514) only for directories the project opted in via `codegraph.json`
475
+ * `includeIgnored` (#622, #699); otherwise `.gitignore` is respected and they
476
+ * are not discovered (#970, #976). Recursive (an embedded repo can embed further
477
+ * repos). Returns [] for non-git roots: the filesystem walk handles nested repos
478
+ * there already.
435
479
  */
436
480
  function discoverEmbeddedRepoRoots(rootDir) {
437
481
  try {
@@ -442,6 +486,7 @@ function discoverEmbeddedRepoRoots(rootDir) {
442
486
  }
443
487
  const out = [];
444
488
  const defaults = defaultsOnlyIgnore();
489
+ const includeIgnored = loadIncludeIgnoredMatcher(rootDir);
445
490
  const visit = (repoAbs, prefix) => {
446
491
  const candidates = [];
447
492
  try {
@@ -453,7 +498,7 @@ function discoverEmbeddedRepoRoots(rootDir) {
453
498
  }
454
499
  }
455
500
  catch { /* untracked listing failed — ignored-side discovery still runs */ }
456
- candidates.push(...findIgnoredEmbeddedRepos(repoAbs));
501
+ candidates.push(...findIgnoredEmbeddedRepos(repoAbs, includeIgnored, prefix));
457
502
  for (const rel of candidates) {
458
503
  const full = (0, utils_1.normalizePath)(prefix + rel);
459
504
  out.push(full);
@@ -464,16 +509,30 @@ function discoverEmbeddedRepoRoots(rootDir) {
464
509
  return out;
465
510
  }
466
511
  /**
467
- * Discover embedded repos hidden by `repoDir`'s OWN ignore rules: for each
468
- * gitignored directory (skipping built-in default excludes), search for nested
469
- * `.git` roots. Returns repo paths relative to `repoDir`, trailing-slashed.
512
+ * Discover embedded repos hidden by `repoDir`'s OWN gitignore rules: for each
513
+ * gitignored directory, search for nested `.git` roots. Returns repo paths
514
+ * relative to `repoDir`, trailing-slashed.
515
+ *
516
+ * OPT-IN ONLY. Walking into a gitignored directory contradicts what every other
517
+ * tool (and CodeGraph's own `git ls-files` foundation) does — `.gitignore`
518
+ * excludes. So this returns `[]` unless the project opted the directory in via
519
+ * `codegraph.json` `includeIgnored`; without that, a gitignored dir — including
520
+ * a huge reference/data dir full of nested clones — is left untouched (#970,
521
+ * #976). When opted in, it restores the super-repo-of-clones behavior (#622,
522
+ * #699). `prefix` is the scan-root-relative path of `repoDir`, so a pattern like
523
+ * `services/` opts that whole subtree in at any recursion depth. Built-in
524
+ * default excludes (`node_modules`, …) are always skipped.
470
525
  */
471
- function findIgnoredEmbeddedRepos(repoDir) {
526
+ function findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix) {
527
+ if (!includeIgnored)
528
+ return [];
472
529
  const defaults = defaultsOnlyIgnore();
473
530
  const repos = [];
474
531
  for (const dir of listIgnoredDirs(repoDir)) {
475
532
  if (defaults.ignores(dir))
476
533
  continue;
534
+ if (!includeIgnored.ignores((0, utils_1.normalizePath)(prefix + dir)))
535
+ continue;
477
536
  repos.push(...findNestedGitRepos(path.join(repoDir, dir), dir));
478
537
  }
479
538
  return repos;
@@ -489,12 +548,15 @@ function findIgnoredEmbeddedRepos(repoDir) {
489
548
  * skips them entirely, and untracked output reports them only as an opaque
490
549
  * "subdir/" entry (trailing slash) rather than expanding their files. Each
491
550
  * embedded repo is its own git boundary, so we re-run `git ls-files` inside it.
492
- * (See issue #193.) GITIGNORED embedded repos are invisible even to that —
493
- * they're discovered separately via `findIgnoredEmbeddedRepos` (#514); every
494
- * embedded repo root (however found) is recorded in `embeddedRoots` so callers
495
- * can exempt its files from the parent's own gitignore rules.
551
+ * (See issue #193.) GITIGNORED embedded repos are invisible even to that; they
552
+ * are discovered separately via `findIgnoredEmbeddedRepos` (#514) but ONLY for
553
+ * directories the project opted in through `codegraph.json` `includeIgnored`
554
+ * (`includeIgnored` here, threaded from the scan root) — by default `.gitignore`
555
+ * is respected and they stay out (#970, #976). Every embedded repo root (however
556
+ * found) is recorded in `embeddedRoots` so callers can exempt its files from the
557
+ * parent's own gitignore rules.
496
558
  */
497
- function collectGitFiles(repoDir, prefix, files, embeddedRoots) {
559
+ function collectGitFiles(repoDir, prefix, files, embeddedRoots, includeIgnored = null) {
498
560
  const gitOpts = { cwd: repoDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true };
499
561
  // Tracked files. --recurse-submodules pulls in files from active submodules,
500
562
  // which the index would otherwise represent only as a commit pointer.
@@ -528,18 +590,20 @@ function collectGitFiles(repoDir, prefix, files, embeddedRoots) {
528
590
  // it's a duplicate working view of an already-indexed repo (#848).
529
591
  if (classifyGitDir(childDir) === 'embedded' && !defaultsOnlyIgnore().ignores(rel)) {
530
592
  embeddedRoots?.add((0, utils_1.normalizePath)(prefix + rel));
531
- collectGitFiles(childDir, prefix + rel, files, embeddedRoots);
593
+ collectGitFiles(childDir, prefix + rel, files, embeddedRoots, includeIgnored);
532
594
  }
533
595
  continue;
534
596
  }
535
597
  files.add((0, utils_1.normalizePath)(prefix + rel));
536
598
  }
537
599
  // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a
538
- // super-repo .gitignore) never appear in any listing above — discover and
539
- // recurse into them too. (#514)
540
- for (const rel of findIgnoredEmbeddedRepos(repoDir)) {
600
+ // super-repo .gitignore) never appear in any listing above. By default they
601
+ // stay hidden — `.gitignore` is respected (#970, #976). They are recursed into
602
+ // only when the project opted the directory in via `codegraph.json`
603
+ // `includeIgnored` (#622, #699), which `findIgnoredEmbeddedRepos` enforces.
604
+ for (const rel of findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix)) {
541
605
  embeddedRoots?.add((0, utils_1.normalizePath)(prefix + rel));
542
- collectGitFiles(path.join(repoDir, rel), prefix + rel, files, embeddedRoots);
606
+ collectGitFiles(path.join(repoDir, rel), prefix + rel, files, embeddedRoots, includeIgnored);
543
607
  }
544
608
  }
545
609
  /**
@@ -567,7 +631,7 @@ function getGitVisibleFiles(rootDir) {
567
631
  }
568
632
  const files = new Set();
569
633
  const embeddedRoots = new Set();
570
- collectGitFiles(rootDir, '', files, embeddedRoots);
634
+ collectGitFiles(rootDir, '', files, embeddedRoots, loadIncludeIgnoredMatcher(rootDir));
571
635
  // Apply built-in default ignores uniformly — to tracked files too, since
572
636
  // committing a dependency/build dir doesn't make it project code. A
573
637
  // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
@@ -585,13 +649,15 @@ function getGitVisibleFiles(rootDir) {
585
649
  * Use `git status` to detect changed files instead of scanning every file.
586
650
  * Returns null on failure so callers fall back to full scan.
587
651
  *
588
- * Recurses into embedded repos — both the untracked kind (#193: the parent's
589
- * status collapses them to an opaque `?? subdir/` entry) and the gitignored
590
- * kind (#514: they never appear in the parent's status at all) — running
591
- * `git status` inside each, so changes in a multi-repo workspace sync without
592
- * a full rescan. Deleting an ENTIRE embedded repo dir is the one case this
593
- * cannot see (the child status that would report the deletions is gone with
594
- * it); a full `codegraph index` reconciles that.
652
+ * Recurses into embedded repos — the untracked kind (#193: the parent's status
653
+ * collapses them to an opaque `?? subdir/` entry) always, and the gitignored
654
+ * kind (#514: they never appear in the parent's status at all) only for
655
+ * directories opted in via `codegraph.json` `includeIgnored` (#622, #699) —
656
+ * running `git status` inside each, so changes in a multi-repo workspace sync
657
+ * without a full rescan. By default a gitignored dir is left alone, matching the
658
+ * full-index scan (#970, #976). Deleting an ENTIRE embedded repo dir is the one
659
+ * case this cannot see (the child status that would report the deletions is gone
660
+ * with it); a full `codegraph index` reconciles that.
595
661
  */
596
662
  function getGitChangedFiles(rootDir) {
597
663
  try {
@@ -599,14 +665,14 @@ function getGitChangedFiles(rootDir) {
599
665
  // Custom extension → language overrides from the project's codegraph.json,
600
666
  // so change detection sees the same custom-extension files the full index does.
601
667
  const overrides = (0, project_config_1.loadExtensionOverrides)(rootDir);
602
- collectGitStatus(rootDir, '', changes, overrides);
668
+ collectGitStatus(rootDir, '', changes, overrides, loadIncludeIgnoredMatcher(rootDir), loadExcludeMatcher(rootDir));
603
669
  return changes;
604
670
  }
605
671
  catch {
606
672
  return null;
607
673
  }
608
674
  }
609
- function collectGitStatus(repoDir, prefix, out, overrides) {
675
+ function collectGitStatus(repoDir, prefix, out, overrides, includeIgnored = null, exclude = null) {
610
676
  const output = (0, child_process_1.execFileSync)('git', ['status', '--porcelain', '--no-renames'], { cwd: repoDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true });
611
677
  // This repo's own ignore rules — built-in defaults (#407) plus its .gitignore.
612
678
  // Change detection must exclude the SAME files the full index does, but git
@@ -645,6 +711,12 @@ function collectGitStatus(repoDir, prefix, out, overrides) {
645
711
  // index — match against the repo-relative path, same as the full scan. (#766)
646
712
  if (ig.ignores(rel))
647
713
  continue;
714
+ // User `codegraph.json` `exclude` (#999) is project-root-relative, so it's
715
+ // matched against the full path — sync must not re-add a tracked file the
716
+ // full index now keeps out. Deletions above stay unfiltered so a file that
717
+ // WAS indexed before an exclude was added still cleans itself out.
718
+ if (exclude && exclude.ignores(filePath))
719
+ continue;
648
720
  if (statusCode === '??') {
649
721
  out.added.push(filePath);
650
722
  }
@@ -654,14 +726,16 @@ function collectGitStatus(repoDir, prefix, out, overrides) {
654
726
  }
655
727
  }
656
728
  // Recurse embedded repos found under untracked dirs (at the dir itself or
657
- // nested deeper) and under this repo's gitignored dirs.
729
+ // nested deeper). Gitignored dirs are walked only for the directories the
730
+ // project opted in via `includeIgnored`; by default `.gitignore` is respected
731
+ // and they are left alone (#970, #976), mirroring the full-index scan.
658
732
  for (const rel of untrackedDirs) {
659
733
  for (const repoRel of findNestedGitRepos(path.join(repoDir, rel), rel)) {
660
- collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides);
734
+ collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides, includeIgnored, exclude);
661
735
  }
662
736
  }
663
- for (const rel of findIgnoredEmbeddedRepos(repoDir)) {
664
- collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides);
737
+ for (const rel of findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix)) {
738
+ collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides, includeIgnored, exclude);
665
739
  }
666
740
  }
667
741
  /**
@@ -820,7 +894,14 @@ function scanDirectoryWalk(rootDir, onProgress) {
820
894
  }
821
895
  // Seed a base matcher with the built-in default ignores (merged with the root
822
896
  // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
823
- walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
897
+ const baseMatchers = [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }];
898
+ // Project `codegraph.json` `exclude` patterns (#999), rooted at the project so
899
+ // `isIgnored` matches them against root-relative paths — same coverage the
900
+ // git path gets via ScopeIgnore, for non-git projects.
901
+ const exclude = loadExcludeMatcher(rootDir);
902
+ if (exclude)
903
+ baseMatchers.push({ dir: rootDir, ig: exclude });
904
+ walk(rootDir, baseMatchers);
824
905
  return files;
825
906
  }
826
907
  /**
@@ -982,148 +1063,153 @@ class ExtractionOrchestrator {
982
1063
  if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
983
1064
  neededLanguages.push('cpp');
984
1065
  }
985
- // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
986
- // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
1066
+ // Parse files on a pool of worker threads (keeps the main thread free for UI
1067
+ // and uses every core). Falls back to in-process parsing when the compiled
1068
+ // worker is unavailable (e.g. running from source in tests).
987
1069
  const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
988
1070
  const useWorker = fs.existsSync(parseWorkerPath);
989
- let WorkerClass = null;
1071
+ let pool = null;
990
1072
  if (useWorker) {
991
- const { Worker } = await Promise.resolve().then(() => __importStar(require('worker_threads')));
992
- WorkerClass = Worker;
1073
+ // CODEGRAPH_PARSE_WORKERS: explicit worker count; 1 = the old single-worker
1074
+ // behaviour (the conservative rollback). Unset → clamp(cores-1, 1, 8).
1075
+ const poolSize = (0, parse_pool_1.resolveParsePoolSize)(process.env.CODEGRAPH_PARSE_WORKERS, os.cpus().length);
1076
+ pool = new parse_pool_1.ParseWorkerPool({
1077
+ languages: neededLanguages,
1078
+ size: poolSize,
1079
+ workerScriptPath: parseWorkerPath,
1080
+ recycleInterval: WORKER_RECYCLE_INTERVAL,
1081
+ parseTimeoutMs: PARSE_TIMEOUT_MS,
1082
+ log,
1083
+ });
1084
+ log(`Parse worker pool: ${poolSize} worker(s)`);
993
1085
  }
994
1086
  else {
995
- // In-process fallback: load grammars locally
1087
+ // In-process fallback: load grammars locally and parse on the main thread.
996
1088
  await (0, grammars_1.loadGrammarsForLanguages)(neededLanguages);
997
1089
  }
998
- // --- Worker lifecycle management ---
999
- // The worker can crash (OOM in WASM) or hang on pathological files.
1000
- // We track pending parse promises and handle both cases:
1001
- // - Timeout: terminate + restart the worker, reject the timed-out request
1002
- // - Crash: reject all pending promises, restart for remaining files
1003
- let parseWorker = null;
1004
- let nextId = 0;
1005
- let workerParseCount = 0;
1006
- const pendingParses = new Map();
1007
- function rejectAllPending(reason) {
1008
- for (const [id, pending] of pendingParses) {
1009
- clearTimeout(pending.timer);
1010
- pendingParses.delete(id);
1011
- pending.reject(new Error(reason));
1090
+ /**
1091
+ * Parse one file: on the pool when available (the promise REJECTS on a worker
1092
+ * crash/timeout — the caller records it and the retry pass re-attempts), or
1093
+ * in-process synchronously as the no-worker fallback. The language is resolved
1094
+ * here on the main thread, where the codegraph.json overrides are loaded.
1095
+ */
1096
+ const parseFile = (filePath, content) => {
1097
+ const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1098
+ if (!pool)
1099
+ return Promise.resolve((0, tree_sitter_1.extractFromSource)(filePath, content, language, frameworkNames));
1100
+ return pool.requestParse({ filePath, content, language, frameworkNames });
1101
+ };
1102
+ // --- Bounded rolling-window dispatch, ordered commit ---
1103
+ // Reads stay batched/parallel; parses run concurrently across the pool; the
1104
+ // SQLite store stays on the main thread (it isn't thread-safe). Crucially we
1105
+ // COMMIT results in original file order, not parse-completion order: the
1106
+ // resolution phase (run after indexing) resolves an ambiguous reference to one
1107
+ // of several same-named candidates by the nodes' DB insertion order, so a
1108
+ // stable commit order keeps the resulting graph deterministic — byte-identical
1109
+ // to the single-worker path — instead of drifting with parse timing. The
1110
+ // `completed` buffer holds at most ~windowSize out-of-order results, so memory
1111
+ // stays bounded.
1112
+ const windowSize = pool ? Math.max(4, pool.size * 2) : 1;
1113
+ const inFlight = new Set();
1114
+ const completed = new Map();
1115
+ let nextSeq = 0; // file-order sequence assigned at dispatch
1116
+ let nextToStore = 0; // cursor: next sequence to commit
1117
+ let aborted = false;
1118
+ const storeResult = (filePath, content, stats, result) => {
1119
+ processed++;
1120
+ // Store in database on main thread (SQLite is not thread-safe)
1121
+ if (result.nodes.length > 0 || result.errors.length === 0) {
1122
+ const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1123
+ this.storeExtractionResult(filePath, content, language, stats, result);
1012
1124
  }
1013
- }
1014
- function attachWorkerHandlers(w) {
1015
- w.on('message', (msg) => {
1016
- if (msg.type === 'parse-result' && msg.id !== undefined) {
1017
- const pending = pendingParses.get(msg.id);
1018
- if (pending) {
1019
- clearTimeout(pending.timer);
1020
- pendingParses.delete(msg.id);
1021
- pending.resolve(msg.result);
1022
- }
1125
+ if (result.errors.length > 0) {
1126
+ for (const err of result.errors) {
1127
+ if (!err.filePath)
1128
+ err.filePath = filePath;
1023
1129
  }
1024
- });
1025
- w.on('error', (err) => {
1026
- (0, errors_1.logWarn)('Parse worker error', { error: err.message });
1027
- rejectAllPending(`Worker error: ${err.message}`);
1028
- });
1029
- w.on('exit', (code) => {
1030
- if (code !== 0 && pendingParses.size > 0) {
1031
- (0, errors_1.logWarn)('Parse worker exited unexpectedly', { code });
1032
- rejectAllPending(`Worker exited with code ${code}`);
1130
+ errors.push(...result.errors);
1131
+ }
1132
+ if (result.nodes.length > 0) {
1133
+ filesIndexed++;
1134
+ totalNodes += result.nodes.length;
1135
+ totalEdges += result.edges.length;
1136
+ }
1137
+ else if (result.errors.some((e) => e.severity === 'error')) {
1138
+ filesErrored++;
1139
+ }
1140
+ else {
1141
+ // Files with no symbols but no errors (yaml, twig, properties) are
1142
+ // tracked at the file level — count them as indexed so the CLI doesn't
1143
+ // misleadingly report "No files found to index".
1144
+ const lang = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1145
+ if ((0, grammars_1.isFileLevelOnlyLanguage)(lang)) {
1146
+ filesIndexed++;
1033
1147
  }
1034
- // Clear reference so we know to respawn, reset count so
1035
- // the fresh worker gets a full cycle before recycling.
1036
- if (parseWorker === w) {
1037
- parseWorker = null;
1038
- workerParseCount = 0;
1148
+ else {
1149
+ filesSkipped++;
1039
1150
  }
1151
+ }
1152
+ onProgress?.({ phase: 'parsing', current: processed, total, currentFile: filePath });
1153
+ };
1154
+ const recordParseFailure = (filePath, err) => {
1155
+ processed++;
1156
+ filesErrored++;
1157
+ errors.push({
1158
+ message: err instanceof Error ? err.message : String(err),
1159
+ filePath,
1160
+ severity: 'error',
1161
+ code: 'parse_error',
1040
1162
  });
1041
- }
1042
- async function ensureWorker() {
1043
- if (parseWorker)
1044
- return parseWorker;
1045
- log('Spawning new parse worker...');
1046
- parseWorker = new WorkerClass(parseWorkerPath);
1047
- attachWorkerHandlers(parseWorker);
1048
- // Load grammars in the new worker
1049
- await new Promise((resolve, reject) => {
1050
- parseWorker.once('message', (msg) => {
1051
- if (msg.type === 'grammars-loaded')
1052
- resolve();
1053
- else
1054
- reject(new Error(`Unexpected message: ${msg.type}`));
1055
- });
1056
- parseWorker.postMessage({ type: 'load-grammars', languages: neededLanguages });
1057
- });
1058
- return parseWorker;
1059
- }
1060
- if (WorkerClass) {
1061
- await ensureWorker();
1062
- }
1063
- /**
1064
- * Recycle the worker thread to reclaim WASM memory.
1065
- * Terminates the current worker and clears the reference so
1066
- * ensureWorker() will spawn a fresh one on the next call.
1067
- */
1068
- function recycleWorker() {
1069
- if (!parseWorker)
1163
+ onProgress?.({ phase: 'parsing', current: processed, total });
1164
+ };
1165
+ // Commit buffered parses to the DB in file order, advancing the cursor over
1166
+ // contiguous completed results. Runs after each parse settles (and once more
1167
+ // after the drain). storeResult / recordParseFailure run here single-threaded,
1168
+ // so shared counters and SQLite writes never race despite parallel parsing.
1169
+ const flushOrdered = () => {
1170
+ if (aborted)
1070
1171
  return;
1071
- log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
1072
- const w = parseWorker;
1073
- parseWorker = null;
1074
- workerParseCount = 0;
1075
- // Fire-and-forget: worker.terminate() can hang if WASM is stuck
1076
- w.terminate().catch(() => { });
1077
- }
1078
- async function requestParse(filePath, content) {
1079
- // Resolve the language on the main thread (where the project's
1080
- // codegraph.json overrides are loaded) and hand it to the worker, so the
1081
- // worker never needs the override map itself.
1082
- const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1083
- if (!WorkerClass) {
1084
- // In-process fallback
1085
- return (0, tree_sitter_1.extractFromSource)(filePath, content, language, frameworkNames);
1172
+ while (completed.has(nextToStore)) {
1173
+ const item = completed.get(nextToStore);
1174
+ completed.delete(nextToStore);
1175
+ nextToStore++;
1176
+ if (item.ok)
1177
+ storeResult(item.filePath, item.content, item.stats, item.result);
1178
+ else
1179
+ recordParseFailure(item.filePath, item.err);
1086
1180
  }
1087
- // Recycle the worker before the next parse if we've hit the threshold.
1088
- // This destroys the WASM linear memory (which can grow but never shrink)
1089
- // and starts a fresh worker with a clean heap.
1090
- if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
1091
- await recycleWorker();
1181
+ };
1182
+ // Dispatch one file's parse (parses run concurrently across the pool), tagged
1183
+ // with its file-order sequence so flushOrdered commits results in order. The
1184
+ // backpressure below bounds how far parsing runs ahead of the in-order commit.
1185
+ const feed = async (filePath, content, stats) => {
1186
+ const seq = nextSeq++;
1187
+ const p = (async () => {
1188
+ try {
1189
+ const result = await parseFile(filePath, content);
1190
+ completed.set(seq, { ok: true, filePath, content, stats, result });
1191
+ }
1192
+ catch (parseErr) {
1193
+ completed.set(seq, { ok: false, filePath, err: parseErr });
1194
+ }
1195
+ flushOrdered();
1196
+ })();
1197
+ const tracked = p.finally(() => { inFlight.delete(tracked); });
1198
+ inFlight.add(tracked);
1199
+ // Backpressure on the dispatched-but-not-yet-committed count (in-flight +
1200
+ // buffered), not just in-flight: a slow file sitting at the commit cursor
1201
+ // lets later parses finish and buffer, which would otherwise grow without
1202
+ // bound. Wait for parses to settle (each may advance the cursor) until the
1203
+ // window has room. `inFlight.size > 0` guards against an empty race — the
1204
+ // cursor file is always still in flight when the window is full.
1205
+ while (nextSeq - nextToStore >= windowSize && inFlight.size > 0) {
1206
+ await Promise.race(inFlight);
1092
1207
  }
1093
- const worker = await ensureWorker();
1094
- const id = nextId++;
1095
- workerParseCount++;
1096
- // Scale timeout for large files: base 10s + 10s per 100KB
1097
- const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;
1098
- return new Promise((resolve, reject) => {
1099
- const timer = setTimeout(() => {
1100
- pendingParses.delete(id);
1101
- log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
1102
- // Reject FIRST — worker.terminate() can hang if WASM is stuck
1103
- parseWorker = null;
1104
- workerParseCount = 0;
1105
- reject(new Error(`Parse timed out after ${timeoutMs}ms`));
1106
- // Fire-and-forget: kill the stuck worker in the background
1107
- worker.terminate().catch(() => { });
1108
- }, timeoutMs);
1109
- pendingParses.set(id, { resolve, reject, timer });
1110
- worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames, language });
1111
- });
1112
- }
1208
+ };
1113
1209
  for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
1114
1210
  if (signal?.aborted) {
1115
- if (parseWorker)
1116
- parseWorker.terminate().catch(() => { });
1117
- return {
1118
- success: false,
1119
- filesIndexed,
1120
- filesSkipped,
1121
- filesErrored,
1122
- nodesCreated: totalNodes,
1123
- edgesCreated: totalEdges,
1124
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1125
- durationMs: Date.now() - startTime,
1126
- };
1211
+ aborted = true;
1212
+ break;
1127
1213
  }
1128
1214
  const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
1129
1215
  // Read files in parallel (with path validation before any I/O)
@@ -1145,29 +1231,13 @@ class ExtractionOrchestrator {
1145
1231
  return { filePath: fp, content: null, stats: null, error: err };
1146
1232
  }
1147
1233
  }));
1148
- // Send to worker for parsing, store results on main thread
1234
+ // Dispatch each readable file into the bounded parse window; the window
1235
+ // stores results on the main thread as they arrive.
1149
1236
  for (const { filePath, content, stats, error } of fileContents) {
1150
1237
  if (signal?.aborted) {
1151
- if (parseWorker)
1152
- parseWorker.terminate().catch(() => { });
1153
- return {
1154
- success: false,
1155
- filesIndexed,
1156
- filesSkipped,
1157
- filesErrored,
1158
- nodesCreated: totalNodes,
1159
- edgesCreated: totalEdges,
1160
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1161
- durationMs: Date.now() - startTime,
1162
- };
1238
+ aborted = true;
1239
+ break;
1163
1240
  }
1164
- // Report progress before parsing (show current file being worked on)
1165
- onProgress?.({
1166
- phase: 'parsing',
1167
- current: processed,
1168
- total,
1169
- currentFile: filePath,
1170
- });
1171
1241
  if (error || content === null || stats === null) {
1172
1242
  processed++;
1173
1243
  filesErrored++;
@@ -1177,6 +1247,7 @@ class ExtractionOrchestrator {
1177
1247
  severity: 'error',
1178
1248
  code: 'read_error',
1179
1249
  });
1250
+ onProgress?.({ phase: 'parsing', current: processed, total });
1180
1251
  continue;
1181
1252
  }
1182
1253
  // Honour MAX_FILE_SIZE. Without this check, vendored generated
@@ -1196,57 +1267,32 @@ class ExtractionOrchestrator {
1196
1267
  onProgress?.({ phase: 'parsing', current: processed, total });
1197
1268
  continue;
1198
1269
  }
1199
- // Parse in worker thread (main thread stays unblocked).
1200
- // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
1201
- let result;
1202
- try {
1203
- result = await requestParse(filePath, content);
1204
- }
1205
- catch (parseErr) {
1206
- processed++;
1207
- filesErrored++;
1208
- errors.push({
1209
- message: parseErr instanceof Error ? parseErr.message : String(parseErr),
1210
- filePath,
1211
- severity: 'error',
1212
- code: 'parse_error',
1213
- });
1214
- continue;
1215
- }
1216
- processed++;
1217
- // Store in database on main thread (SQLite is not thread-safe)
1218
- if (result.nodes.length > 0 || result.errors.length === 0) {
1219
- const language = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1220
- this.storeExtractionResult(filePath, content, language, stats, result);
1221
- }
1222
- if (result.errors.length > 0) {
1223
- for (const err of result.errors) {
1224
- if (!err.filePath)
1225
- err.filePath = filePath;
1226
- }
1227
- errors.push(...result.errors);
1228
- }
1229
- if (result.nodes.length > 0) {
1230
- filesIndexed++;
1231
- totalNodes += result.nodes.length;
1232
- totalEdges += result.edges.length;
1233
- }
1234
- else if (result.errors.some((e) => e.severity === 'error')) {
1235
- filesErrored++;
1236
- }
1237
- else {
1238
- // Files with no symbols but no errors (yaml, twig, properties) are
1239
- // tracked at the file level — count them as indexed so the CLI
1240
- // doesn't misleadingly report "No files found to index".
1241
- const lang = (0, grammars_1.detectLanguage)(filePath, content, overrides);
1242
- if ((0, grammars_1.isFileLevelOnlyLanguage)(lang)) {
1243
- filesIndexed++;
1244
- }
1245
- else {
1246
- filesSkipped++;
1247
- }
1248
- }
1270
+ // Parse on the pool (main thread stays unblocked). Errors/timeouts are
1271
+ // handled inside feed() / recordParseFailure, feeding the retry pass.
1272
+ await feed(filePath, content, stats);
1249
1273
  }
1274
+ if (aborted)
1275
+ break;
1276
+ }
1277
+ // Drain parses still in flight (skip on abort — we tear down below instead),
1278
+ // then commit any results the cursor hasn't reached yet.
1279
+ if (!aborted) {
1280
+ await Promise.all(inFlight);
1281
+ flushOrdered();
1282
+ }
1283
+ if (signal?.aborted || aborted) {
1284
+ if (pool)
1285
+ await pool.destroy();
1286
+ return {
1287
+ success: false,
1288
+ filesIndexed,
1289
+ filesSkipped,
1290
+ filesErrored,
1291
+ nodesCreated: totalNodes,
1292
+ edgesCreated: totalEdges,
1293
+ errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
1294
+ durationMs: Date.now() - startTime,
1295
+ };
1250
1296
  }
1251
1297
  // Report 100% so the progress bar doesn't hang at 99%
1252
1298
  onProgress?.({
@@ -1263,15 +1309,17 @@ class ExtractionOrchestrator {
1263
1309
  // every file gets the absolute cleanest WASM state possible.
1264
1310
  const retryableErrors = errors.filter((e) => e.code === 'parse_error' && e.filePath &&
1265
1311
  (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds')));
1266
- if (retryableErrors.length > 0 && WorkerClass) {
1312
+ if (retryableErrors.length > 0 && pool) {
1267
1313
  log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
1314
+ // Fresh WASM heaps for the retry phase. A retry that still crashes its
1315
+ // worker makes the pool respawn it, so later retries keep landing on clean
1316
+ // workers too.
1317
+ pool.recycleAll();
1268
1318
  const stillFailing = [];
1269
1319
  for (const errEntry of retryableErrors) {
1270
1320
  const filePath = errEntry.filePath;
1271
1321
  if (signal?.aborted)
1272
1322
  break;
1273
- // Fresh worker for every retry — maximum WASM headroom
1274
- recycleWorker();
1275
1323
  let content;
1276
1324
  try {
1277
1325
  const fullPath = (0, utils_1.validatePathWithinRoot)(this.rootDir, filePath);
@@ -1284,7 +1332,7 @@ class ExtractionOrchestrator {
1284
1332
  }
1285
1333
  let result;
1286
1334
  try {
1287
- result = await requestParse(filePath, content);
1335
+ result = await parseFile(filePath, content);
1288
1336
  }
1289
1337
  catch {
1290
1338
  stillFailing.push(errEntry);
@@ -1310,11 +1358,11 @@ class ExtractionOrchestrator {
1310
1358
  // code nodes but consume parser memory.
1311
1359
  if (stillFailing.length > 0) {
1312
1360
  log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
1361
+ pool.recycleAll();
1313
1362
  for (const errEntry of stillFailing) {
1314
1363
  const filePath = errEntry.filePath;
1315
1364
  if (signal?.aborted)
1316
1365
  break;
1317
- recycleWorker();
1318
1366
  let fullContent;
1319
1367
  try {
1320
1368
  const fullPath = (0, utils_1.validatePathWithinRoot)(this.rootDir, filePath);
@@ -1333,7 +1381,7 @@ class ExtractionOrchestrator {
1333
1381
  .join('\n');
1334
1382
  let result;
1335
1383
  try {
1336
- result = await requestParse(filePath, stripped);
1384
+ result = await parseFile(filePath, stripped);
1337
1385
  }
1338
1386
  catch {
1339
1387
  continue;
@@ -1354,11 +1402,9 @@ class ExtractionOrchestrator {
1354
1402
  }
1355
1403
  }
1356
1404
  }
1357
- // Shut down parse worker and clear any pending timers
1358
- rejectAllPending('Indexing complete');
1359
- if (parseWorker) {
1360
- parseWorker.terminate().catch(() => { });
1361
- }
1405
+ // Shut down the parse worker pool.
1406
+ if (pool)
1407
+ await pool.destroy();
1362
1408
  return {
1363
1409
  success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
1364
1410
  filesIndexed,