codesift-mcp 0.8.7 → 0.8.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/README.md +8 -0
  2. package/dist/cli/help.d.ts.map +1 -1
  3. package/dist/cli/help.js +6 -0
  4. package/dist/cli/help.js.map +1 -1
  5. package/dist/cli/hooks.d.ts.map +1 -1
  6. package/dist/cli/hooks.js +4 -3
  7. package/dist/cli/hooks.js.map +1 -1
  8. package/dist/cli/setup.d.ts +4 -0
  9. package/dist/cli/setup.d.ts.map +1 -1
  10. package/dist/cli/setup.js +97 -7
  11. package/dist/cli/setup.js.map +1 -1
  12. package/dist/config.d.ts +1 -0
  13. package/dist/config.d.ts.map +1 -1
  14. package/dist/config.js +1 -0
  15. package/dist/config.js.map +1 -1
  16. package/dist/formatters-shortening.d.ts +7 -0
  17. package/dist/formatters-shortening.d.ts.map +1 -1
  18. package/dist/formatters-shortening.js +48 -0
  19. package/dist/formatters-shortening.js.map +1 -1
  20. package/dist/formatters.d.ts.map +1 -1
  21. package/dist/formatters.js +14 -2
  22. package/dist/formatters.js.map +1 -1
  23. package/dist/instructions.d.ts +1 -1
  24. package/dist/instructions.d.ts.map +1 -1
  25. package/dist/instructions.js +2 -2
  26. package/dist/register-tool-loaders.d.ts +1 -0
  27. package/dist/register-tool-loaders.d.ts.map +1 -1
  28. package/dist/register-tool-loaders.js +1 -0
  29. package/dist/register-tool-loaders.js.map +1 -1
  30. package/dist/register-tools.d.ts +2 -0
  31. package/dist/register-tools.d.ts.map +1 -1
  32. package/dist/register-tools.js +80 -18
  33. package/dist/register-tools.js.map +1 -1
  34. package/dist/search/model2vec-tokenize.d.ts +22 -0
  35. package/dist/search/model2vec-tokenize.d.ts.map +1 -0
  36. package/dist/search/model2vec-tokenize.js +140 -0
  37. package/dist/search/model2vec-tokenize.js.map +1 -0
  38. package/dist/search/semantic.d.ts.map +1 -1
  39. package/dist/search/semantic.js +7 -0
  40. package/dist/search/semantic.js.map +1 -1
  41. package/dist/search/static-embedding-provider.d.ts +24 -0
  42. package/dist/search/static-embedding-provider.d.ts.map +1 -0
  43. package/dist/search/static-embedding-provider.js +149 -0
  44. package/dist/search/static-embedding-provider.js.map +1 -0
  45. package/dist/server-helpers.d.ts.map +1 -1
  46. package/dist/server-helpers.js +7 -3
  47. package/dist/server-helpers.js.map +1 -1
  48. package/dist/storage/_shared.d.ts.map +1 -1
  49. package/dist/storage/_shared.js +4 -1
  50. package/dist/storage/_shared.js.map +1 -1
  51. package/dist/storage/hash-snapshot.d.ts +36 -0
  52. package/dist/storage/hash-snapshot.d.ts.map +1 -0
  53. package/dist/storage/hash-snapshot.js +101 -0
  54. package/dist/storage/hash-snapshot.js.map +1 -0
  55. package/dist/storage/registry.d.ts.map +1 -1
  56. package/dist/storage/registry.js +35 -1
  57. package/dist/storage/registry.js.map +1 -1
  58. package/dist/storage/usage-stats.d.ts +8 -0
  59. package/dist/storage/usage-stats.d.ts.map +1 -1
  60. package/dist/storage/usage-stats.js +74 -24
  61. package/dist/storage/usage-stats.js.map +1 -1
  62. package/dist/storage/usage-tracker.d.ts +29 -5
  63. package/dist/storage/usage-tracker.d.ts.map +1 -1
  64. package/dist/storage/usage-tracker.js +41 -5
  65. package/dist/storage/usage-tracker.js.map +1 -1
  66. package/dist/tools/conversation-tools.d.ts +8 -1
  67. package/dist/tools/conversation-tools.d.ts.map +1 -1
  68. package/dist/tools/conversation-tools.js +61 -15
  69. package/dist/tools/conversation-tools.js.map +1 -1
  70. package/dist/tools/index-tools.d.ts +33 -2
  71. package/dist/tools/index-tools.d.ts.map +1 -1
  72. package/dist/tools/index-tools.js +524 -40
  73. package/dist/tools/index-tools.js.map +1 -1
  74. package/dist/tools/pg-introspect-tools.d.ts +147 -0
  75. package/dist/tools/pg-introspect-tools.d.ts.map +1 -0
  76. package/dist/tools/pg-introspect-tools.js +396 -0
  77. package/dist/tools/pg-introspect-tools.js.map +1 -0
  78. package/dist/tools/plan-turn-tools.d.ts.map +1 -1
  79. package/dist/tools/plan-turn-tools.js +88 -1
  80. package/dist/tools/plan-turn-tools.js.map +1 -1
  81. package/dist/tools/search-tools.d.ts +12 -0
  82. package/dist/tools/search-tools.d.ts.map +1 -1
  83. package/dist/tools/search-tools.js +120 -6
  84. package/dist/tools/search-tools.js.map +1 -1
  85. package/dist/types.d.ts +27 -0
  86. package/dist/types.d.ts.map +1 -1
  87. package/dist/utils/hf-download-stream.d.ts +21 -0
  88. package/dist/utils/hf-download-stream.d.ts.map +1 -0
  89. package/dist/utils/hf-download-stream.js +101 -0
  90. package/dist/utils/hf-download-stream.js.map +1 -0
  91. package/dist/utils/hf-hub-download.d.ts +8 -0
  92. package/dist/utils/hf-hub-download.d.ts.map +1 -0
  93. package/dist/utils/hf-hub-download.js +149 -0
  94. package/dist/utils/hf-hub-download.js.map +1 -0
  95. package/dist/utils/safetensors-loader.d.ts +9 -0
  96. package/dist/utils/safetensors-loader.d.ts.map +1 -0
  97. package/dist/utils/safetensors-loader.js +95 -0
  98. package/dist/utils/safetensors-loader.js.map +1 -0
  99. package/dist/utils/safetensors-meta-guard.d.ts +7 -0
  100. package/dist/utils/safetensors-meta-guard.d.ts.map +1 -0
  101. package/dist/utils/safetensors-meta-guard.js +50 -0
  102. package/dist/utils/safetensors-meta-guard.js.map +1 -0
  103. package/package.json +3 -1
  104. package/rules/codesift.md +1 -1
  105. package/rules/codesift.mdc +1 -1
  106. package/rules/codex.md +1 -1
  107. package/rules/gemini.md +1 -1
@@ -22,6 +22,7 @@ import { validateGitUrl, validateGitRef } from "../utils/git-validation.js";
22
22
  import { walkDirectory } from "../utils/walk.js";
23
23
  import { onFileChanged as scanOnChanged, onFileDeleted as scanOnDeleted, scanFileForSecrets } from "./secret-tools.js";
24
24
  import { getGraphPath } from "../storage/graph-store.js";
25
+ import { getSnapshotPath, loadHashSnapshot, saveHashSnapshot, HASH_SNAPSHOT_VERSION } from "../storage/hash-snapshot.js";
25
26
  const PARSE_CONCURRENCY = 8;
26
27
  const CHUNK_EMBEDDING_BATCH_SIZE = 96;
27
28
  const GIT_CLONE_TIMEOUT_MS = 120_000;
@@ -76,6 +77,13 @@ async function parseOneFile(filePath, repoRoot, repoName) {
76
77
  try {
77
78
  const stat = await import("node:fs/promises").then((fs) => fs.stat(filePath));
78
79
  const source = await readFile(filePath, "utf-8");
80
+ // CRITICAL-1 (TOCTOU parse↔hash): hash the EXACT source string we parse,
81
+ // here — never via a post-parse re-read. A re-read can observe a different
82
+ // on-disk version if the file is modified between parse and hash, pairing
83
+ // OLD symbols with a NEW sha so future runs permanently reuse mismatched
84
+ // symbols. The sha is NOT persisted inside FileEntry; callers thread it
85
+ // into the hash snapshot (and it saves one extra full read per parsed file).
86
+ const fileSha1 = createHash("sha1").update(source).digest("hex");
79
87
  const relPath = relative(repoRoot, filePath);
80
88
  const baseName = filePath.split("/").pop() ?? "";
81
89
  // Use full-path resolver so multi-dot suffixes like `.gradle.kts` beat
@@ -135,7 +143,7 @@ async function parseOneFile(filePath, repoRoot, repoName) {
135
143
  last_modified: Date.now(),
136
144
  mtime_ms: Math.round(stat.mtimeMs),
137
145
  };
138
- return { symbols, entry };
146
+ return { symbols, entry, sha1: fileSha1 };
139
147
  }
140
148
  catch (err) {
141
149
  const message = err instanceof Error ? err.message : String(err);
@@ -149,6 +157,9 @@ async function parseOneFile(filePath, repoRoot, repoName) {
149
157
  async function parseFiles(files, repoRoot, repoName) {
150
158
  const allSymbols = [];
151
159
  const fileEntries = [];
160
+ // CRITICAL-1: sha1 of the exact parsed source, keyed by relPath. Carried out
161
+ // of parseOneFile so the snapshot never re-reads (and never races) the file.
162
+ const shas = {};
152
163
  for (let i = 0; i < files.length; i += PARSE_CONCURRENCY) {
153
164
  const batch = files.slice(i, i + PARSE_CONCURRENCY);
154
165
  const results = await Promise.all(batch.map((filePath) => parseOneFile(filePath, repoRoot, repoName)));
@@ -156,10 +167,11 @@ async function parseFiles(files, repoRoot, repoName) {
156
167
  if (result) {
157
168
  allSymbols.push(...result.symbols);
158
169
  fileEntries.push(result.entry);
170
+ shas[result.entry.path] = result.sha1;
159
171
  }
160
172
  }
161
173
  }
162
- return { symbols: allSymbols, fileEntries };
174
+ return { symbols: allSymbols, fileEntries, shas };
163
175
  }
164
176
  // ---------------------------------------------------------------------------
165
177
  // Dirty propagation — mark caller files stale when a callee signature changes
@@ -307,6 +319,85 @@ async function embedChunks(fileEntries, rootPath, repoName, indexPath, config, s
307
319
  console.error(`[codesift] Chunk embedding failed for ${repoName}: ${message}`);
308
320
  }
309
321
  }
/** Upper bound on how many stored paths are stat'ed per staleness probe. */
const STALE_SAMPLE_LIMIT = 256;
/** Fraction of the sample that must be missing for the index to count as stale. */
const STALE_MISSING_FRACTION = 0.5;
/**
 * Decide whether a previously stored index no longer reflects the working
 * tree. Samples at most STALE_SAMPLE_LIMIT of its file paths at an even
 * stride across the WHOLE path list and stats each one; when at least
 * STALE_MISSING_FRACTION of the sampled paths cannot be stat'ed, the old
 * index is treated as stale. Used by the indexFolder sanity check to break
 * the poisoned-baseline deadlock: an old index bloated with since-deleted
 * trees (.worktrees/, vendored dirs) would otherwise reject every honest
 * reindex as "truncated" forever.
 *
 * @param existing Previously stored index; only `existing.files[].path` is read.
 * @param rootPath Directory the stored relative paths are joined onto.
 * @returns {Promise<boolean>} true when the index has no files, or when at
 *   least STALE_MISSING_FRACTION of the sampled paths fail to stat.
 */
async function isExistingIndexStale(existing, rootPath) {
    const paths = existing.files.map((f) => f.path);
    if (paths.length === 0) {
        return true;
    }
    // Round the stride UP. With a floored stride, any list of 257..511 paths
    // gets stride 1 and the sample degenerates to "the first 256 entries",
    // so a deleted tree clustered at the end of the list is never probed.
    // Rounding up keeps the sample within the limit while spanning the list.
    const stride = Math.max(1, Math.ceil(paths.length / STALE_SAMPLE_LIMIT));
    const sampled = [];
    for (let i = 0; i < paths.length && sampled.length < STALE_SAMPLE_LIMIT; i += stride) {
        const candidate = paths[i];
        if (candidate) {
            sampled.push(candidate);
        }
    }
    // Any stat failure counts as "missing" (best-effort probe, same as before).
    const goneFlags = await Promise.all(sampled.map(async (relPath) => {
        try {
            await stat(join(rootPath, relPath));
            return false;
        }
        catch {
            return true;
        }
    }));
    const missing = goneFlags.filter(Boolean).length;
    return missing >= sampled.length * STALE_MISSING_FRACTION;
}
/**
 * Compute the sha1 hex digest of a file's content decoded as UTF-8.
 *
 * The digest is taken over the decoded string (not the raw bytes), the same
 * way parseOneFile hashes the source it parses. Never throws: any failure
 * while reading or hashing (file deleted mid-walk, permission error) yields
 * null, which callers treat as "could not hash → fall through to re-parse".
 * Intended for code-sized files only — the whole file is read into memory.
 *
 * @param absPath Path of the file to hash.
 * @returns {Promise<string | null>} Hex digest, or null when the file could not be hashed.
 */
async function sha1OfFile(absPath) {
    try {
        const hasher = createHash("sha1");
        hasher.update(await readFile(absPath, "utf-8"));
        return hasher.digest("hex");
    }
    catch {
        return null;
    }
}
/**
 * Exported for unit testing only — not part of the public API.
 *
 * Drains a legacy-hash queue in batches of PARSE_CONCURRENCY: hashes each
 * file, then re-stats it to confirm the mtime has not drifted since the
 * decision-time stat. An entry is omitted from the returned map when
 *   - its hash could not be computed (hashFn resolved to null/undefined),
 *   - its stat fails, or
 *   - its rounded mtime differs from the decision-time mtime,
 * so the next run re-parses the file rather than reusing symbols against a
 * missing or mismatched sha.
 *
 * @param queue  Items from the legacyHashQueue (relPath + filePath + decision-time mtimeMs).
 * @param hashFn Injectable hash function (default: sha1OfFile). Tests inject a
 *               function that also modifies the file so they can trigger the
 *               TOCTOU drift-detection path without real concurrency.
 * @param statFn Injectable stat function (default: fs.stat). Tests can stub this
 *               to return a post-modification mtime.
 * @returns {Promise<Record<string, string>>} relPath → sha1 for every entry that
 *               hashed successfully and whose mtime is unchanged.
 */
export async function drainLegacyHashQueue(queue, hashFn = sha1OfFile, statFn = (p) => import("node:fs/promises").then((m) => m.stat(p))) {
    const result = {};
    for (let i = 0; i < queue.length; i += PARSE_CONCURRENCY) {
        const batch = queue.slice(i, i + PARSE_CONCURRENCY);
        // Order matters: hash first, stat second, so a write that lands while
        // hashing is visible to the mtime comparison below.
        const shas = await Promise.all(batch.map((q) => hashFn(q.filePath)));
        const mtimes = await Promise.all(batch.map((q) => statFn(q.filePath).then((st) => Math.round(st.mtimeMs), () => null)));
        batch.forEach((q, j) => {
            const sha = shas[j];
            if (sha == null) {
                // Could not hash — never record a placeholder sha; a fake
                // value would be carried forward by the mtime fast path.
                return;
            }
            const currentMtime = mtimes[j];
            if (currentMtime === null || currentMtime !== q.mtimeMs) {
                // Mtime drifted or file gone — omit so next run re-parses.
                return;
            }
            result[q.relPath] = sha;
        });
    }
    return result;
}
310
401
  export async function indexFolder(folderPath, options) {
311
402
  if (!folderPath || typeof folderPath !== "string") {
312
403
  throw new Error("folderPath is required and must be a non-empty string");
@@ -379,16 +470,87 @@ export async function indexFolder(folderPath, options) {
379
470
  mtimeMap.set(f.path, f.mtime_ms);
380
471
  }
381
472
  }
473
+ // Persistent hash snapshot (Task 6): relPath → sha1 from the previous index.
474
+ // mtime stays the cheap pre-filter (unchanged mtime → reuse without hashing,
475
+ // the fastest path). When mtime *changed*, the snapshot sha1 lets us still
476
+ // reuse symbols for touch/checkout no-op rewrites that bumped mtime without
477
+ // changing content — something mtime-only logic could never catch.
478
+ // null when absent/corrupt/version-or-repo-mismatch → degrade to full parse.
479
+ const snapshotPath = getSnapshotPath(indexPath);
480
+ let oldSnapshot = existing
481
+ ? await loadHashSnapshot(snapshotPath, repoName)
482
+ : null;
483
+ // Staleness guard (Task 6, CRITICAL-2): an incremental saveIncremental /
484
+ // removeFileFromIndex advances index.updated_at WITHOUT touching the
485
+ // snapshot. If saveIndex landed but the subsequent snapshot save failed (or
486
+ // an incremental edit ran after the last full index), the on-disk snapshot
487
+ // is OLDER than the index and its SHAs may no longer match the indexed
488
+ // symbols — carrying them forward (fast path) or sha-matching against them
489
+ // (changed path) would produce wrong reuse on revert+touch sequences. When
490
+ // the snapshot predates the index, discard it: the legacy hash-now
491
+ // convergence path below repopulates a fresh, correct snapshot this run.
492
+ // Guard uses strict inequality (!==), not <. The fresh-write contract is
493
+ // snapshot.created_at === index.updated_at exactly (created_at is anchored to
494
+ // codeIndex.updated_at, not a fresh Date.now()). So ANY mismatch — older OR
495
+ // newer — means the snapshot is not the one paired with this index and must
496
+ // be discarded. A FUTURE created_at (e.g. a snapshot written against a later,
497
+ // since-rolled-back index, or clock skew) is just as untrustworthy as a stale
498
+ // one: its SHAs may not match the indexed symbols.
499
+ if (oldSnapshot && existing && oldSnapshot.created_at !== existing.updated_at) {
500
+ console.warn(`[codesift] hash-snapshot older than index — rebuilding (${repoName})`);
501
+ oldSnapshot = null;
502
+ }
382
503
  const filesToParse = [];
383
504
  const keptSymbols = [];
384
505
  const keptEntries = [];
506
+ // sha1 of every file in the NEW index, by relPath. Populated for reused files
507
+ // here (from the old snapshot when present, else hashed-now for convergence)
508
+ // and for parsed files after parseFiles resolves.
509
+ const newSnapshotFiles = {};
510
+ // CRITICAL-1: reused files whose sha1 must be (re)computed because the old
511
+ // snapshot lacks it (legacy snapshot-less index, or stale snapshot discarded
512
+ // above). Collected here and hashed AFTER the loop in PARSE_CONCURRENCY
513
+ // batches instead of one serial await per file inside the loop — on a first
514
+ // run after upgrade against a many-thousand-file repo the serial version cost
515
+ // thousands of sequential awaits. Behavior is identical, wall-clock is
516
+ // parallelized.
517
+ //
518
+ // mtimeMs: the mtime observed at decision time (the moment we confirmed
519
+ // mtime === prevMtime and placed the file in the queue). We re-stat after
520
+ // hashing to detect any concurrent modification that landed between the two
521
+ // operations. If the mtime drifted, we omit the file from newSnapshotFiles
522
+ // entirely — the missing sha causes the next cold run to re-parse, avoiding
523
+ // a snapshot that pairs new-content sha against old (reused) symbols.
524
+ const legacyHashQueue = [];
525
+ // PERF: pre-build per-file lookups ONCE before the reuse loop. Both reuse
526
+ // branches need (a) the existing index's symbols for a given relPath and (b)
527
+ // its FileEntry. Doing `existing.symbols.filter(s => s.file === relPath)` /
528
+ // `existing.files.find(f => f.path === relPath)` per file is O(files ×
529
+ // symbols) and O(files²) respectively — quadratic, and on a many-thousand
530
+ // file/symbol repo that dominated the reuse-heavy fast path. A single pass
531
+ // builds Map lookups each branch hits in O(1). Built only when there's an
532
+ // existing index to reuse from.
533
+ const symbolsByFile = new Map();
534
+ const fileEntryByPath = new Map();
535
+ if (existing) {
536
+ for (const sym of existing.symbols) {
537
+ const list = symbolsByFile.get(sym.file);
538
+ if (list)
539
+ list.push(sym);
540
+ else
541
+ symbolsByFile.set(sym.file, [sym]);
542
+ }
543
+ for (const fe of existing.files) {
544
+ fileEntryByPath.set(fe.path, fe);
545
+ }
546
+ }
385
547
  if (mtimeMap.size > 0) {
386
548
  const { stat } = await import("node:fs/promises");
387
549
  for (const filePath of files) {
388
550
  const relPath = relative(rootPath, filePath);
389
551
  const prevMtime = mtimeMap.get(relPath);
390
552
  if (prevMtime !== undefined) {
391
- const fileEntry = existing.files.find((f) => f.path === relPath);
553
+ const fileEntry = fileEntryByPath.get(relPath);
392
554
  // Force re-parse if file is marked stale (callee signature changed)
393
555
  if (fileEntry?.stale) {
394
556
  filesToParse.push(filePath);
@@ -397,14 +559,47 @@ export async function indexFolder(folderPath, options) {
397
559
  try {
398
560
  const st = await stat(filePath);
399
561
  if (Math.round(st.mtimeMs) === prevMtime) {
400
- // File unchanged keep existing symbols
401
- const fileSymbols = existing.symbols.filter((s) => s.file === relPath);
562
+ // Fast path: mtime unchanged reuse symbols without hashing.
563
+ const fileSymbols = symbolsByFile.get(relPath) ?? [];
402
564
  if (fileEntry) {
403
565
  keptSymbols.push(...fileSymbols);
404
566
  keptEntries.push(fileEntry);
567
+ // Carry the sha1 forward: reuse from old snapshot if present,
568
+ // else DEFER hashing so legacy (snapshot-less) indexes converge
569
+ // to a complete snapshot after one run — without paying a serial
570
+ // hash per file inside this loop.
571
+ const carried = oldSnapshot?.files[relPath];
572
+ if (carried !== undefined) {
573
+ newSnapshotFiles[relPath] = carried;
574
+ }
575
+ else {
576
+ legacyHashQueue.push({ relPath, filePath, mtimeMs: Math.round(st.mtimeMs) });
577
+ }
405
578
  continue;
406
579
  }
407
580
  }
581
+ else {
582
+ // mtime changed — hash decides reuse vs re-parse. This catches
583
+ // touch/checkout that bumped mtime without changing content.
584
+ const snapSha = oldSnapshot?.files[relPath];
585
+ if (snapSha !== undefined && fileEntry && !fileEntry.stale) {
586
+ const currentSha = await sha1OfFile(filePath);
587
+ if (currentSha !== null && currentSha === snapSha) {
588
+ const fileSymbols = symbolsByFile.get(relPath) ?? [];
589
+ keptSymbols.push(...fileSymbols);
590
+ // FIX: the file's mtime changed but content is identical (touch /
591
+ // checkout no-op rewrite). Reuse the symbols, but DON'T carry the
592
+ // stale FileEntry verbatim — its mtime_ms still holds the OLD
593
+ // mtime, so every future run would see mtime !== prevMtime and
594
+ // re-hash this file forever, permanently degrading it off the
595
+ // mtime fast path. Clone the entry with mtime_ms bumped to the
596
+ // CURRENT stat's mtime so the next run takes the cheap fast path.
597
+ keptEntries.push({ ...fileEntry, mtime_ms: Math.round(st.mtimeMs) });
598
+ newSnapshotFiles[relPath] = currentSha;
599
+ continue;
600
+ }
601
+ }
602
+ }
408
603
  }
409
604
  catch { /* file may have been deleted — reparse */ }
410
605
  }
@@ -414,10 +609,32 @@ export async function indexFolder(folderPath, options) {
414
609
  else {
415
610
  filesToParse.push(...files);
416
611
  }
612
+ // Drain the deferred legacy-hash queue (CRITICAL-1): files reused via the
613
+ // mtime fast path that had no carried sha1 (legacy snapshot-less index, or a
614
+ // stale snapshot discarded by the guard above). See drainLegacyHashQueue for
615
+ // the TOCTOU guard details — entries whose mtime drifted between decision
616
+ // time and hash time are omitted so the next run re-parses rather than
617
+ // reusing symbols against a mismatched sha.
618
+ if (legacyHashQueue.length > 0) {
619
+ const drained = await drainLegacyHashQueue(legacyHashQueue);
620
+ Object.assign(newSnapshotFiles, drained);
621
+ }
417
622
  // Parse only changed/new files
418
- const { symbols: parsedSymbols, fileEntries: parsedEntries } = await parseFiles(filesToParse, rootPath, repoName);
623
+ const { symbols: parsedSymbols, fileEntries: parsedEntries, shas: parsedShas } = await parseFiles(filesToParse, rootPath, repoName);
419
624
  const symbols = [...keptSymbols, ...parsedSymbols];
420
625
  const fileEntries = [...keptEntries, ...parsedEntries];
626
+ // Record sha1s for the files that were actually parsed (changed/new).
627
+ // CRITICAL-1 (TOCTOU): these hashes come straight from parseOneFile — they
628
+ // are the sha1 of the EXACT source string that produced the symbols, so the
629
+ // snapshot can never pair old symbols with a newer file's sha. Only entries
630
+ // that survived parseFiles (parseOneFile returned non-null) have a sha here,
631
+ // keeping the snapshot in lockstep with fileEntries. The previous post-parse
632
+ // double-read loop is gone — one fewer full read per parsed file.
633
+ for (const entry of parsedEntries) {
634
+ const sha = parsedShas[entry.path];
635
+ if (sha !== undefined)
636
+ newSnapshotFiles[entry.path] = sha;
637
+ }
421
638
  // Dirty propagation: detect signature changes and mark caller files stale
422
639
  if (existing && filesToParse.length > 0 && filesToParse.length < files.length) {
423
640
  const staleFiles = propagateDirtySignatures(existing.symbols, symbols, fileEntries);
@@ -425,25 +642,209 @@ export async function indexFolder(folderPath, options) {
425
642
  console.error(`[codesift] Dirty propagation: ${staleFiles.size} caller files marked stale`);
426
643
  }
427
644
  }
428
- // Build and cache BM25 index; invalidate code index cache
429
- const bm25 = buildBM25Index(symbols);
430
- bm25Indexes.set(repoName, bm25);
645
+ // Invalidate code index cache (BM25 is rebuilt below from the FINAL symbol
646
+ // set possibly merged with out-of-scope existing symbols, see merge block).
431
647
  codeIndexes.delete(repoName);
432
648
  // Sanity check: don't overwrite a complete index with a partial one
433
- // (WASM crash or walk failure can produce truncated results)
649
+ // (WASM crash or walk failure can produce truncated results).
650
+ //
651
+ // IMPORTANT: skip the guard when the walk was explicitly narrowed — either
652
+ // max_files was hit (truncated at cap) or include_paths scoped the walk to a
653
+ // subdirectory. In both cases the small result count is EXPECTED and rejecting
654
+ // it would be a false positive (the "1139 vs 9512" bug class). For unrestricted
655
+ // walks the guard stays as-is, protecting against genuine silent truncations.
656
+ //
657
+ // CRITICAL (T7 correctness fix): skipping the guard is necessary but NOT
658
+ // sufficient. A scoped/capped walk only SEES a narrow slice of the repo; if we
659
+ // persisted that slice as the WHOLE index we would wipe every out-of-scope
660
+ // file's symbols from index+snapshot (worse than the guard's old reject,
661
+ // which at least preserved the prior index). So for scoped/capped walks with
662
+ // an existing index we MERGE: keep out-of-scope existing entries verbatim and
663
+ // overlay the walk's results. See the merge block below.
664
+ //
665
+ // "max_files hit" detection: files.length === effective maxFiles. This is the
666
+ // only signal walkDirectory exposes (it sets limitReached internally but does
667
+ // not surface it on the return value). A 1-in-a-million exact-count false
668
+ // positive (repo has exactly maxFiles parseable files) is accepted — the
669
+ // guard skip is conservative (allows write), not destructive.
434
670
  const DROP_THRESHOLD = 0.5; // Reject if new index has <50% of old file count
435
- if (existing && fileEntries.length < existing.file_count * DROP_THRESHOLD && existing.file_count > 50) {
436
- console.error(`[codesift] SANITY CHECK FAILED for ${repoName}: ` +
437
- `new index has ${fileEntries.length} files vs ${existing.file_count} previously. ` +
438
- `Keeping old index. Use invalidate_cache + index_folder to force reindex.`);
439
- return {
440
- repo: repoName,
441
- root: rootPath,
442
- file_count: existing.file_count,
443
- symbol_count: existing.symbol_count,
444
- duration_ms: Date.now() - startTime,
671
+ const walkExplicitlyCapped = hitFileLimit;
672
+ const walkExplicitlyScoped = options?.include_paths !== undefined && options.include_paths.length > 0;
673
+ // MIN_GUARD_FILES: the unrestricted guard only arms above this existing
674
+ // file_count (`existing.file_count > 50` below). The scoped-granularity guard
675
+ // mirrors that shape against the in-scope subset so a tiny scope can't be
676
+ // rejected on noise. Single source of truth so both guards stay in lockstep.
677
+ const MIN_GUARD_FILES = 50;
678
+ if (walkExplicitlyCapped || walkExplicitlyScoped) {
679
+ // ROUND-2 FIX (scoped-granularity guard): the unrestricted guard is skipped
680
+ // for scoped/capped walks because a small *overall* result is expected. But
681
+ // that skip was total — a scoped walk that aborts mid-enumeration (WASM
682
+ // crash, transient FS error, an over-broad exclude) silently truncates the
683
+ // IN-SCOPE slice, and the merge below treats every unwalked in-scope file as
684
+ // a deletion → wipes it from index+snapshot. So for a purely SCOPED (uncapped)
685
+ // walk we re-arm a guard against the IN-SCOPE subset: if the walk enumerated
686
+ // far fewer in-scope files than the existing index held in that same scope,
687
+ // AND those files are still on disk, the enumeration was truncated → reject
688
+ // before any merge/save, leaving the old index+snapshot intact.
689
+ //
690
+ // Capped walks are intentionally EXEMPT: a cap means unseen ≠ deleted (the
691
+ // merge preserves all unwalked files), so there is no truncation to detect —
692
+ // nothing in-scope is dropped. A walk that is BOTH scoped and capped also
693
+ // takes capped semantics (preserve everything unwalked), so the same
694
+ // exemption applies — no in-scope file can be lost.
695
+ if (walkExplicitlyScoped && !walkExplicitlyCapped && existing) {
696
+ const includePaths = options.include_paths;
697
+ const inScopeRel = (relPath) => includePaths.some((p) => relPath.startsWith(p)); // mirror walkDirectory
698
+ const existingInScope = existing.files.filter((fe) => inScopeRel(fe.path));
699
+ // All walked files are in scope by construction (walkDirectory honored
700
+ // includePaths), so walkedInScope is simply the walk's file count.
701
+ const walkedInScope = fileEntries.length;
702
+ if (existingInScope.length > MIN_GUARD_FILES &&
703
+ walkedInScope < existingInScope.length * DROP_THRESHOLD) {
704
+ // Auto-heal analog (in-scope): the shrink may be a genuine mass deletion
705
+ // within the scope, not a truncated walk. Sample the existing in-scope
706
+ // paths on disk (mirrors isExistingIndexStale, but restricted to the
707
+ // scope) — if most are gone, accept the merge.
708
+ const inScopePaths = existingInScope.map((fe) => fe.path);
709
+ const stride = Math.max(1, Math.floor(inScopePaths.length / STALE_SAMPLE_LIMIT));
710
+ const sampled = [];
711
+ for (let i = 0; i < inScopePaths.length && sampled.length < STALE_SAMPLE_LIMIT; i += stride) {
712
+ const p = inScopePaths[i];
713
+ if (p)
714
+ sampled.push(p);
715
+ }
716
+ let missing = 0;
717
+ await Promise.all(sampled.map(async (relPath) => {
718
+ try {
719
+ await stat(join(rootPath, relPath));
720
+ }
721
+ catch {
722
+ missing++;
723
+ }
724
+ }));
725
+ const mostGone = missing >= sampled.length * STALE_MISSING_FRACTION;
726
+ if (mostGone) {
727
+ console.error(`[codesift] Scoped sanity auto-heal for ${repoName}: walked ` +
728
+ `${walkedInScope} of ${existingInScope.length} in-scope files but ` +
729
+ `most sampled in-scope paths no longer exist on disk. Accepting ` +
730
+ `scoped merge (legit in-scope mass deletion).`);
731
+ }
732
+ else {
733
+ console.error(`[codesift] SCOPED SANITY CHECK FAILED for ${repoName}: scoped walk ` +
734
+ `under-enumerated — walked ${walkedInScope} of ${existingInScope.length} ` +
735
+ `in-scope files, which still exist on disk. Keeping old index.`);
736
+ return {
737
+ repo: repoName,
738
+ root: rootPath,
739
+ file_count: existing.file_count,
740
+ symbol_count: existing.symbol_count,
741
+ duration_ms: Date.now() - startTime,
742
+ status: "rejected_partial",
743
+ reason: `scoped walk under-enumerated: walked ${walkedInScope} of ${existingInScope.length} in-scope files (still on disk) — kept old index, nothing was re-registered`,
744
+ hint: "If the in-scope shrink is expected (deleted files, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
745
+ };
746
+ }
747
+ }
748
+ }
749
+ const detail = walkExplicitlyCapped
750
+ ? `max_files=${maxFiles} hit (${files.length} files returned)`
751
+ : `include_paths=[${options.include_paths.join(", ")}]`;
752
+ console.error(`[codesift] sanity guard skipped: walk explicitly capped/scoped (${detail})`);
753
+ }
754
+ else if (existing && fileEntries.length < existing.file_count * DROP_THRESHOLD && existing.file_count > MIN_GUARD_FILES) {
755
+ // The shrink can also mean the OLD index is the bogus one: an earlier
756
+ // walker may have swept since-deleted trees (.worktrees/, vendored dirs),
757
+ // permanently inflating the baseline so every honest reindex looks
758
+ // truncated and gets rejected forever. Disambiguate by sampling the old
759
+ // index's paths: if most of them no longer exist on disk, the old index
760
+ // is stale dead weight — accept the new result instead of keeping it.
761
+ if (await isExistingIndexStale(existing, rootPath)) {
762
+ console.error(`[codesift] Sanity check auto-heal for ${repoName}: old index has ` +
763
+ `${existing.file_count} files but most sampled paths no longer exist ` +
764
+ `on disk. Accepting new index (${fileEntries.length} files).`);
765
+ }
766
+ else {
767
+ console.error(`[codesift] SANITY CHECK FAILED for ${repoName}: ` +
768
+ `new index has ${fileEntries.length} files vs ${existing.file_count} previously. ` +
769
+ `Keeping old index. Use invalidate_cache + index_folder to force reindex.`);
770
+ return {
771
+ repo: repoName,
772
+ root: rootPath,
773
+ file_count: existing.file_count,
774
+ symbol_count: existing.symbol_count,
775
+ duration_ms: Date.now() - startTime,
776
+ status: "rejected_partial",
777
+ reason: `new walk found ${fileEntries.length} files, <50% of the ${existing.file_count} previously indexed — kept old index, nothing was re-registered`,
778
+ hint: "If the shrink is expected (deleted trees, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
779
+ };
780
+ }
781
+ }
782
+ // ── MERGE-persist for scoped/capped walks (T7 correctness fix) ────────────
783
+ // A scoped (include_paths) or capped (max_files-hit) walk only enumerated a
784
+ // slice of the repo. Persisting that slice verbatim would delete every
785
+ // out-of-scope file's symbols from index+snapshot. When an existing index is
786
+ // present we instead MERGE: preserve out-of-scope existing entries/symbols/
787
+ // shas and overlay the walk's results.
788
+ //
789
+ // - include_paths scoped (and NOT capped): "scope" = files whose relPath is
790
+ // under any include root (mirror walkDirectory's relPath.startsWith(p)
791
+ // test EXACTLY). Out-of-scope existing files are preserved verbatim;
792
+ // in-scope existing files NOT in the walk set W are dropped (genuine
793
+ // in-scope deletions — the walk fully enumerated the scope).
794
+ // - capped (max_files hit): scope is UNDEFINED — the cap means an unseen
795
+ // file is not necessarily deleted. Preserve ALL existing entries not in W,
796
+ // overlay W. (If a capped walk also passed include_paths, the cap makes the
797
+ // in-scope enumeration incomplete too, so we still only trust W and
798
+ // preserve everything else — capped semantics win.)
799
+ //
800
+ // First run (no existing index) with a scoped/capped walk → save what we have
801
+ // (current behavior, documented): there is nothing to preserve.
802
+ let mergedSymbols = symbols;
803
+ let mergedEntries = fileEntries;
804
+ let mergedSnapshotFiles = newSnapshotFiles;
805
+ if ((walkExplicitlyCapped || walkExplicitlyScoped) && existing) {
806
+ const walkedPaths = new Set(fileEntries.map((fe) => fe.path));
807
+ // A capped walk has undefined scope (unseen ≠ deleted), so it preserves
808
+ // everything not walked. A purely scoped (uncapped) walk additionally drops
809
+ // in-scope-but-unwalked files, since the walk fully enumerated the scope.
810
+ const includePaths = options?.include_paths;
811
+ const inScope = (relPath) => {
812
+ if (walkExplicitlyCapped)
813
+ return false; // cap → never treat as deletable
814
+ if (!includePaths || includePaths.length === 0)
815
+ return false;
816
+ // Mirror walkDirectory's include-path filter exactly.
817
+ return includePaths.some((p) => relPath.startsWith(p));
445
818
  };
819
+ const preservedEntries = [];
820
+ const preservedFilePaths = new Set();
821
+ for (const fe of existing.files) {
822
+ if (walkedPaths.has(fe.path))
823
+ continue; // walk result wins for these
824
+ if (inScope(fe.path))
825
+ continue; // in-scope + not walked = deleted-in-scope
826
+ preservedEntries.push(fe);
827
+ preservedFilePaths.add(fe.path);
828
+ }
829
+ const preservedSymbols = existing.symbols.filter((s) => preservedFilePaths.has(s.file));
830
+ mergedEntries = [...preservedEntries, ...fileEntries];
831
+ mergedSymbols = [...preservedSymbols, ...symbols];
832
+ // Snapshot: preserve out-of-scope shas, overlay walked ones.
833
+ mergedSnapshotFiles = {};
834
+ if (oldSnapshot) {
835
+ for (const relPath of preservedFilePaths) {
836
+ const sha = oldSnapshot.files[relPath];
837
+ if (sha !== undefined)
838
+ mergedSnapshotFiles[relPath] = sha;
839
+ }
840
+ }
841
+ Object.assign(mergedSnapshotFiles, newSnapshotFiles);
446
842
  }
843
+ // Build and cache BM25 index from the FINAL (possibly merged) symbol set.
844
+ // Built here (not before the guard) so a rejected_partial early-return leaves
845
+ // the previous in-memory BM25 index intact rather than swapping in a partial.
846
+ const bm25 = buildBM25Index(mergedSymbols);
847
+ bm25Indexes.set(repoName, bm25);
447
848
  // Resolve workspaces (Task 7) — runs before persistence so collectImportEdges
448
849
  // and other downstream consumers see the populated `workspaces` field.
449
850
  // Gated behind CODESIFT_DISABLE_MONOREPO=1 kill switch (spec D-FB).
@@ -460,24 +861,49 @@ export async function indexFolder(folderPath, options) {
460
861
  // mode is the safe fallback.
461
862
  }
462
863
  }
463
- // Build and save code index
864
+ // Build and save code index from the FINAL (possibly merged) sets.
464
865
  const codeIndex = {
465
866
  repo: repoName,
466
867
  root: rootPath,
467
- symbols,
468
- files: fileEntries,
868
+ symbols: mergedSymbols,
869
+ files: mergedEntries,
469
870
  created_at: Date.now(),
470
871
  updated_at: Date.now(),
471
- symbol_count: symbols.length,
472
- file_count: fileEntries.length,
872
+ symbol_count: mergedSymbols.length,
873
+ file_count: mergedEntries.length,
473
874
  extractor_version: { ...EXTRACTOR_VERSIONS },
474
875
  ...(workspaces ? { workspaces } : {}),
475
876
  };
476
877
  await saveIndex(indexPath, codeIndex);
878
+ // Persist the hash snapshot AFTER the index lands (mirrors registerRepo
879
+ // ordering) and only on the success path — the rejected_partial branch
880
+ // returned earlier, leaving the previous snapshot intact. Non-fatal: the
881
+ // snapshot is a reuse-optimization cache; a write failure just costs a full
882
+ // re-parse next run, so we warn and continue.
883
+ try {
884
+ const newSnapshot = {
885
+ version: HASH_SNAPSHOT_VERSION,
886
+ repo: repoName,
887
+ // CRITICAL-2 (created_at race): use the EXACT timestamp serialized into
888
+ // the index, not a fresh Date.now(). A watcher's saveIncremental that
889
+ // lands between saveIndex and this write would otherwise leave the
890
+ // snapshot OLDER than created_at, blinding the staleness guard above. By
891
+ // anchoring to codeIndex.updated_at, snapshot.created_at === the index's
892
+ // updated_at on a fresh write, so any later incremental strictly advances
893
+ // index.updated_at past it and the guard fires correctly.
894
+ created_at: codeIndex.updated_at,
895
+ files: mergedSnapshotFiles,
896
+ };
897
+ await saveHashSnapshot(snapshotPath, newSnapshot);
898
+ }
899
+ catch (err) {
900
+ const msg = err instanceof Error ? err.message : String(err);
901
+ console.warn(`[codesift] hash-snapshot save failed for ${repoName} (non-fatal): ${msg}`);
902
+ }
477
903
  // Embed symbols and chunks in background (non-fatal, don't block MCP response)
478
904
  // Large repos (71K symbols) can take minutes — fire-and-forget to prevent timeout
479
- embedSymbols(symbols, indexPath, repoName, config)
480
- .then(() => embedChunks(fileEntries, rootPath, repoName, indexPath, config, symbols))
905
+ embedSymbols(mergedSymbols, indexPath, repoName, config)
906
+ .then(() => embedChunks(mergedEntries, rootPath, repoName, indexPath, config, mergedSymbols))
481
907
  .catch((err) => {
482
908
  const msg = err instanceof Error ? err.message : String(err);
483
909
  console.error(`[codesift] Background embedding failed for ${repoName}: ${msg}`);
@@ -496,8 +922,8 @@ export async function indexFolder(folderPath, options) {
496
922
  name: repoName,
497
923
  root: rootPath,
498
924
  index_path: indexPath,
499
- symbol_count: symbols.length,
500
- file_count: fileEntries.length,
925
+ symbol_count: mergedSymbols.length,
926
+ file_count: mergedEntries.length,
501
927
  updated_at: Date.now(),
502
928
  };
503
929
  await registerRepo(config.registryPath, meta);
@@ -520,7 +946,7 @@ export async function indexFolder(folderPath, options) {
520
946
  try {
521
947
  const { detectFrameworks } = await import("../utils/framework-detect.js");
522
948
  const { enableFrameworkToolBundle } = await import("../register-tools.js");
523
- const tempIndex = { root: rootPath, files: fileEntries, symbols };
949
+ const tempIndex = { root: rootPath, files: mergedEntries, symbols: mergedSymbols };
524
950
  const frameworks = detectFrameworks(tempIndex);
525
951
  for (const fw of frameworks) {
526
952
  const enabled = enableFrameworkToolBundle(fw);
@@ -538,8 +964,8 @@ export async function indexFolder(folderPath, options) {
538
964
  return {
539
965
  repo: repoName,
540
966
  root: rootPath,
541
- file_count: fileEntries.length,
542
- symbol_count: symbols.length,
967
+ file_count: mergedEntries.length,
968
+ symbol_count: mergedSymbols.length,
543
969
  duration_ms: Date.now() - startTime,
544
970
  };
545
971
  }
@@ -778,7 +1204,8 @@ export async function invalidateCache(repoName) {
778
1204
  const chunkPath = getChunkPath(meta.index_path);
779
1205
  const chunkEmbeddingPath = getChunkEmbeddingPath(meta.index_path);
780
1206
  const graphStorePath = getGraphPath(meta.index_path);
781
- for (const fp of [meta.index_path, embeddingPath, embeddingMetaPath, chunkPath, chunkEmbeddingPath, graphStorePath]) {
1207
+ const snapshotPath = getSnapshotPath(meta.index_path);
1208
+ for (const fp of [meta.index_path, embeddingPath, embeddingMetaPath, chunkPath, chunkEmbeddingPath, graphStorePath, snapshotPath]) {
782
1209
  try {
783
1210
  await unlink(fp);
784
1211
  }
@@ -788,6 +1215,22 @@ export async function invalidateCache(repoName) {
788
1215
  await removeRepo(config.registryPath, repoName);
789
1216
  return true;
790
1217
  }
1218
+ /**
1219
+ * In-process record of the last indexed state per absolute file path.
1220
+ *
1221
+ * Telemetry (30d, 2026-06): 750 consecutive duplicate index_file calls at
1222
+ * avg 3.7s each (~47 min of agent wall-clock). Two causes: (1) duplicate
1223
+ * hook registrations firing index_file twice per edit, and (2) a race where
1224
+ * call N+1's on-disk mtime pre-check read the index before call N's
1225
+ * serialized saveIncremental landed, forcing a full re-parse + full-index
1226
+ * save. This map short-circuits both in-process in ~1ms (mtime first, then
1227
+ * content hash for touch/no-op rewrites) without loading the on-disk index.
1228
+ */
1229
+ const lastIndexedState = new Map();
1230
+ /** Test hook — empties the `lastIndexedState` Map via `clear()`, dropping every recorded per-file entry; takes no arguments and returns nothing. */
1231
+ export function clearLastIndexedStateForTesting() {
1232
+ lastIndexedState.clear();
1233
+ }
791
1234
  /**
792
1235
  * Re-index a single file instantly. Finds the repo by matching the file
793
1236
  * path against indexed repo roots. Updates symbols, BM25 index, and
@@ -815,13 +1258,47 @@ export async function indexFile(filePath) {
815
1258
  clearTsconfigCache();
816
1259
  }
817
1260
  }
818
- // mtime check skip if unchanged
819
- const existing = await loadIndex(matchingRepo.index_path);
820
- if (existing) {
821
- const prevEntry = existing.files.find((f) => f.path === relPath);
822
- if (prevEntry?.mtime_ms) {
823
- const st = await stat(absPath);
824
- if (Math.round(st.mtimeMs) === prevEntry.mtime_ms) {
1261
+ // In-process short-circuit: mtime, then content hash. Both avoid loading
1262
+ // the on-disk index entirely (the expensive part on large repos).
1263
+ const st = await stat(absPath);
1264
+ const mem = lastIndexedState.get(absPath);
1265
+ if (mem && Math.round(st.mtimeMs) === mem.mtimeMs) {
1266
+ return {
1267
+ repo: matchingRepo.name,
1268
+ file: relPath,
1269
+ symbol_count: mem.symbolCount,
1270
+ duration_ms: Date.now() - startTime,
1271
+ skipped: true,
1272
+ };
1273
+ }
1274
+ const content = await readFile(absPath, "utf-8").catch(() => null);
1275
+ const contentHash = content !== null ? createHash("sha1").update(content).digest("hex") : null;
1276
+ if (mem && contentHash !== null && contentHash === mem.contentHash) {
1277
+ // Touched / rewritten with identical content — refresh mtime, skip work.
1278
+ mem.mtimeMs = Math.round(st.mtimeMs);
1279
+ return {
1280
+ repo: matchingRepo.name,
1281
+ file: relPath,
1282
+ symbol_count: mem.symbolCount,
1283
+ duration_ms: Date.now() - startTime,
1284
+ skipped: true,
1285
+ };
1286
+ }
1287
+ // On-disk mtime check — first touch of this file in this process (CLI
1288
+ // hook invocations, fresh server). Skips files unchanged since the last
1289
+ // full index, and seeds the in-process state for subsequent calls.
1290
+ if (!mem) {
1291
+ const existing = await loadIndex(matchingRepo.index_path);
1292
+ if (existing) {
1293
+ const prevEntry = existing.files.find((f) => f.path === relPath);
1294
+ if (prevEntry?.mtime_ms && Math.round(st.mtimeMs) === prevEntry.mtime_ms) {
1295
+ if (contentHash !== null) {
1296
+ lastIndexedState.set(absPath, {
1297
+ mtimeMs: Math.round(st.mtimeMs),
1298
+ contentHash,
1299
+ symbolCount: prevEntry.symbol_count,
1300
+ });
1301
+ }
825
1302
  return {
826
1303
  repo: matchingRepo.name,
827
1304
  file: relPath,
@@ -837,6 +1314,13 @@ export async function indexFile(filePath) {
837
1314
  throw new Error(`Failed to parse "${relPath}"`);
838
1315
  }
839
1316
  await saveIncremental(matchingRepo.index_path, relPath, result.symbols, result.entry);
1317
+ if (contentHash !== null) {
1318
+ lastIndexedState.set(absPath, {
1319
+ mtimeMs: Math.round(st.mtimeMs),
1320
+ contentHash,
1321
+ symbolCount: result.symbols.length,
1322
+ });
1323
+ }
840
1324
  let secretFindingsCount = 0;
841
1325
  if (config.secretScanEnabled) {
842
1326
  try {