npm - codesift-mcp - Versions diffs - 0.8.7 → 0.8.11 - Mend

codesift-mcp 0.8.7 → 0.8.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

package/README.md +8 -0
package/dist/cli/help.d.ts.map +1 -1
package/dist/cli/help.js +6 -0
package/dist/cli/help.js.map +1 -1
package/dist/cli/hooks.d.ts.map +1 -1
package/dist/cli/hooks.js +4 -3
package/dist/cli/hooks.js.map +1 -1
package/dist/cli/setup.d.ts +4 -0
package/dist/cli/setup.d.ts.map +1 -1
package/dist/cli/setup.js +97 -7
package/dist/cli/setup.js.map +1 -1
package/dist/config.d.ts +1 -0
package/dist/config.d.ts.map +1 -1
package/dist/config.js +1 -0
package/dist/config.js.map +1 -1
package/dist/formatters-shortening.d.ts +7 -0
package/dist/formatters-shortening.d.ts.map +1 -1
package/dist/formatters-shortening.js +48 -0
package/dist/formatters-shortening.js.map +1 -1
package/dist/formatters.d.ts.map +1 -1
package/dist/formatters.js +14 -2
package/dist/formatters.js.map +1 -1
package/dist/instructions.d.ts +1 -1
package/dist/instructions.d.ts.map +1 -1
package/dist/instructions.js +2 -2
package/dist/register-tool-loaders.d.ts +1 -0
package/dist/register-tool-loaders.d.ts.map +1 -1
package/dist/register-tool-loaders.js +1 -0
package/dist/register-tool-loaders.js.map +1 -1
package/dist/register-tools.d.ts +2 -0
package/dist/register-tools.d.ts.map +1 -1
package/dist/register-tools.js +80 -18
package/dist/register-tools.js.map +1 -1
package/dist/search/model2vec-tokenize.d.ts +22 -0
package/dist/search/model2vec-tokenize.d.ts.map +1 -0
package/dist/search/model2vec-tokenize.js +140 -0
package/dist/search/model2vec-tokenize.js.map +1 -0
package/dist/search/semantic.d.ts.map +1 -1
package/dist/search/semantic.js +7 -0
package/dist/search/semantic.js.map +1 -1
package/dist/search/static-embedding-provider.d.ts +24 -0
package/dist/search/static-embedding-provider.d.ts.map +1 -0
package/dist/search/static-embedding-provider.js +149 -0
package/dist/search/static-embedding-provider.js.map +1 -0
package/dist/server-helpers.d.ts.map +1 -1
package/dist/server-helpers.js +7 -3
package/dist/server-helpers.js.map +1 -1
package/dist/storage/_shared.d.ts.map +1 -1
package/dist/storage/_shared.js +4 -1
package/dist/storage/_shared.js.map +1 -1
package/dist/storage/hash-snapshot.d.ts +36 -0
package/dist/storage/hash-snapshot.d.ts.map +1 -0
package/dist/storage/hash-snapshot.js +101 -0
package/dist/storage/hash-snapshot.js.map +1 -0
package/dist/storage/registry.d.ts.map +1 -1
package/dist/storage/registry.js +35 -1
package/dist/storage/registry.js.map +1 -1
package/dist/storage/usage-stats.d.ts +8 -0
package/dist/storage/usage-stats.d.ts.map +1 -1
package/dist/storage/usage-stats.js +74 -24
package/dist/storage/usage-stats.js.map +1 -1
package/dist/storage/usage-tracker.d.ts +29 -5
package/dist/storage/usage-tracker.d.ts.map +1 -1
package/dist/storage/usage-tracker.js +41 -5
package/dist/storage/usage-tracker.js.map +1 -1
package/dist/tools/conversation-tools.d.ts +8 -1
package/dist/tools/conversation-tools.d.ts.map +1 -1
package/dist/tools/conversation-tools.js +61 -15
package/dist/tools/conversation-tools.js.map +1 -1
package/dist/tools/index-tools.d.ts +33 -2
package/dist/tools/index-tools.d.ts.map +1 -1
package/dist/tools/index-tools.js +524 -40
package/dist/tools/index-tools.js.map +1 -1
package/dist/tools/pg-introspect-tools.d.ts +147 -0
package/dist/tools/pg-introspect-tools.d.ts.map +1 -0
package/dist/tools/pg-introspect-tools.js +396 -0
package/dist/tools/pg-introspect-tools.js.map +1 -0
package/dist/tools/plan-turn-tools.d.ts.map +1 -1
package/dist/tools/plan-turn-tools.js +88 -1
package/dist/tools/plan-turn-tools.js.map +1 -1
package/dist/tools/search-tools.d.ts +12 -0
package/dist/tools/search-tools.d.ts.map +1 -1
package/dist/tools/search-tools.js +120 -6
package/dist/tools/search-tools.js.map +1 -1
package/dist/types.d.ts +27 -0
package/dist/types.d.ts.map +1 -1
package/dist/utils/hf-download-stream.d.ts +21 -0
package/dist/utils/hf-download-stream.d.ts.map +1 -0
package/dist/utils/hf-download-stream.js +101 -0
package/dist/utils/hf-download-stream.js.map +1 -0
package/dist/utils/hf-hub-download.d.ts +8 -0
package/dist/utils/hf-hub-download.d.ts.map +1 -0
package/dist/utils/hf-hub-download.js +149 -0
package/dist/utils/hf-hub-download.js.map +1 -0
package/dist/utils/safetensors-loader.d.ts +9 -0
package/dist/utils/safetensors-loader.d.ts.map +1 -0
package/dist/utils/safetensors-loader.js +95 -0
package/dist/utils/safetensors-loader.js.map +1 -0
package/dist/utils/safetensors-meta-guard.d.ts +7 -0
package/dist/utils/safetensors-meta-guard.d.ts.map +1 -0
package/dist/utils/safetensors-meta-guard.js +50 -0
package/dist/utils/safetensors-meta-guard.js.map +1 -0
package/package.json +3 -1
package/rules/codesift.md +1 -1
package/rules/codesift.mdc +1 -1
package/rules/codex.md +1 -1
package/rules/gemini.md +1 -1

package/dist/tools/index-tools.js CHANGED Viewed

@@ -22,6 +22,7 @@ import { validateGitUrl, validateGitRef } from "../utils/git-validation.js";
 import { walkDirectory } from "../utils/walk.js";
 import { onFileChanged as scanOnChanged, onFileDeleted as scanOnDeleted, scanFileForSecrets } from "./secret-tools.js";
 import { getGraphPath } from "../storage/graph-store.js";
+import { getSnapshotPath, loadHashSnapshot, saveHashSnapshot, HASH_SNAPSHOT_VERSION } from "../storage/hash-snapshot.js";
 const PARSE_CONCURRENCY = 8;
 const CHUNK_EMBEDDING_BATCH_SIZE = 96;
 const GIT_CLONE_TIMEOUT_MS = 120_000;
@@ -76,6 +77,13 @@ async function parseOneFile(filePath, repoRoot, repoName) {
     try {
         const stat = await import("node:fs/promises").then((fs) => fs.stat(filePath));
         const source = await readFile(filePath, "utf-8");
+        // CRITICAL-1 (TOCTOU parse↔hash): hash the EXACT source string we parse,
+        // here — never via a post-parse re-read. A re-read can observe a different
+        // on-disk version if the file is modified between parse and hash, pairing
+        // OLD symbols with a NEW sha so future runs permanently reuse mismatched
+        // symbols. The sha is NOT persisted inside FileEntry; callers thread it
+        // into the hash snapshot (and it saves one extra full read per parsed file).
+        const fileSha1 = createHash("sha1").update(source).digest("hex");
         const relPath = relative(repoRoot, filePath);
         const baseName = filePath.split("/").pop() ?? "";
         // Use full-path resolver so multi-dot suffixes like `.gradle.kts` beat
@@ -135,7 +143,7 @@ async function parseOneFile(filePath, repoRoot, repoName) {
             last_modified: Date.now(),
             mtime_ms: Math.round(stat.mtimeMs),
         };
-        return { symbols, entry };
+        return { symbols, entry, sha1: fileSha1 };
     }
     catch (err) {
         const message = err instanceof Error ? err.message : String(err);
@@ -149,6 +157,9 @@ async function parseOneFile(filePath, repoRoot, repoName) {
 async function parseFiles(files, repoRoot, repoName) {
     const allSymbols = [];
     const fileEntries = [];
+    // CRITICAL-1: sha1 of the exact parsed source, keyed by relPath. Carried out
+    // of parseOneFile so the snapshot never re-reads (and never races) the file.
+    const shas = {};
     for (let i = 0; i < files.length; i += PARSE_CONCURRENCY) {
         const batch = files.slice(i, i + PARSE_CONCURRENCY);
         const results = await Promise.all(batch.map((filePath) => parseOneFile(filePath, repoRoot, repoName)));
@@ -156,10 +167,11 @@ async function parseFiles(files, repoRoot, repoName) {
             if (result) {
                 allSymbols.push(...result.symbols);
                 fileEntries.push(result.entry);
+                shas[result.entry.path] = result.sha1;
             }
         }
     }
-    return { symbols: allSymbols, fileEntries };
+    return { symbols: allSymbols, fileEntries, shas };
 }
 // ---------------------------------------------------------------------------
 // Dirty propagation — mark caller files stale when a callee signature changes
@@ -307,6 +319,85 @@ async function embedChunks(fileEntries, rootPath, repoName, indexPath, config, s
         console.error(`[codesift] Chunk embedding failed for ${repoName}: ${message}`);
     }
 }
+/**
+ * Decide whether a previously stored index no longer reflects the working
+ * tree. Stats up to 256 of its file paths, taken at a fixed stride from the first
+ * entry (the tail may go unsampled); when at least half are gone the old index is treated as stale. Used by the
+ * indexFolder sanity check to break the poisoned-baseline deadlock: an old
+ * index bloated with since-deleted trees (.worktrees/, vendored dirs) would
+ * otherwise reject every honest reindex as "truncated" forever.
+ */
+const STALE_SAMPLE_LIMIT = 256;
+const STALE_MISSING_FRACTION = 0.5;
+async function isExistingIndexStale(existing, rootPath) {
+    const paths = existing.files.map((f) => f.path);
+    if (paths.length === 0)
+        return true;
+    const stride = Math.max(1, Math.floor(paths.length / STALE_SAMPLE_LIMIT));
+    const sampled = [];
+    for (let i = 0; i < paths.length && sampled.length < STALE_SAMPLE_LIMIT; i += stride) {
+        const p = paths[i];
+        if (p)
+            sampled.push(p);
+    }
+    let missing = 0;
+    await Promise.all(sampled.map(async (relPath) => {
+        try {
+            await stat(join(rootPath, relPath));
+        }
+        catch {
+            missing++;
+        }
+    }));
+    return missing >= sampled.length * STALE_MISSING_FRACTION;
+}
+/**
+ * Read a file and return the sha1 hex of its UTF-8 content, or null on read
+ * failure (deleted mid-walk, permission error). Code-sized files only — same
+ * assumption parseOneFile already makes. Non-throwing: callers treat null as
+ * "could not hash → fall through to re-parse".
+ */
+async function sha1OfFile(absPath) {
+    try {
+        const content = await readFile(absPath, "utf-8");
+        return createHash("sha1").update(content).digest("hex");
+    }
+    catch {
+        return null;
+    }
+}
+/**
+ * Exported for unit testing only — not part of the public API.
+ *
+ * Drains a legacy-hash queue: hashes each file, then re-stats to confirm the
+ * mtime has not drifted since the decision-time stat. Entries whose mtime
+ * drifted (or whose stat fails) are omitted from the returned map so the next
+ * run re-parses them rather than reusing symbols against a mismatched sha. A null hash from hashFn is not omitted: with an unchanged mtime it is recorded as "".
+ *
+ * @param queue  Items from the legacyHashQueue (relPath + filePath + decision-time mtimeMs).
+ * @param hashFn Injectable hash function (default: sha1OfFile). Tests inject a
+ *               function that also modifies the file so they can trigger the
+ *               TOCTOU drift-detection path without real concurrency.
+ * @param statFn Injectable stat function (default: fs.stat). Tests can stub this
+ *               to return a post-modification mtime.
+ */
+export async function drainLegacyHashQueue(queue, hashFn = sha1OfFile, statFn = (p) => import("node:fs/promises").then((m) => m.stat(p))) {
+    const result = {};
+    for (let i = 0; i < queue.length; i += PARSE_CONCURRENCY) {
+        const batch = queue.slice(i, i + PARSE_CONCURRENCY);
+        const shas = await Promise.all(batch.map((q) => hashFn(q.filePath)));
+        const stats = await Promise.all(batch.map((q) => statFn(q.filePath).then((st) => Math.round(st.mtimeMs), () => null)));
+        batch.forEach((q, j) => {
+            const currentMtime = stats[j];
+            if (currentMtime === null || currentMtime !== q.mtimeMs) {
+                // Mtime drifted or file gone — omit so next run re-parses.
+                return;
+            }
+            result[q.relPath] = shas[j] ?? "";
+        });
+    }
+    return result;
+}
 export async function indexFolder(folderPath, options) {
     if (!folderPath || typeof folderPath !== "string") {
         throw new Error("folderPath is required and must be a non-empty string");
@@ -379,16 +470,87 @@ export async function indexFolder(folderPath, options) {
                 mtimeMap.set(f.path, f.mtime_ms);
         }
     }
+    // Persistent hash snapshot (Task 6): relPath → sha1 from the previous index.
+    // mtime stays the cheap pre-filter (unchanged mtime → reuse without hashing,
+    // the fastest path). When mtime *changed*, the snapshot sha1 lets us still
+    // reuse symbols for touch/checkout no-op rewrites that bumped mtime without
+    // changing content — something mtime-only logic could never catch.
+    // null when absent/corrupt/version-or-repo-mismatch → degrade to full parse.
+    const snapshotPath = getSnapshotPath(indexPath);
+    let oldSnapshot = existing
+        ? await loadHashSnapshot(snapshotPath, repoName)
+        : null;
+    // Staleness guard (Task 6, CRITICAL-2): an incremental saveIncremental /
+    // removeFileFromIndex advances index.updated_at WITHOUT touching the
+    // snapshot. If saveIndex landed but the subsequent snapshot save failed (or
+    // an incremental edit ran after the last full index), the on-disk snapshot
+    // is OLDER than the index and its SHAs may no longer match the indexed
+    // symbols — carrying them forward (fast path) or sha-matching against them
+    // (changed path) would produce wrong reuse on revert+touch sequences. When
+    // the snapshot predates the index, discard it: the legacy hash-now
+    // convergence path below repopulates a fresh, correct snapshot this run.
+    // Guard uses strict inequality (!==), not <. The fresh-write contract is
+    // snapshot.created_at === index.updated_at exactly (created_at is anchored to
+    // codeIndex.updated_at, not a fresh Date.now()). So ANY mismatch — older OR
+    // newer — means the snapshot is not the one paired with this index and must
+    // be discarded. A FUTURE created_at (e.g. a snapshot written against a later,
+    // since-rolled-back index, or clock skew) is just as untrustworthy as a stale
+    // one: its SHAs may not match the indexed symbols.
+    if (oldSnapshot && existing && oldSnapshot.created_at !== existing.updated_at) {
+        console.warn(`[codesift] hash-snapshot older than index — rebuilding (${repoName})`);
+        oldSnapshot = null;
+    }
     const filesToParse = [];
     const keptSymbols = [];
     const keptEntries = [];
+    // sha1 of every file in the NEW index, by relPath. Populated for reused files
+    // here (from the old snapshot when present, else hashed-now for convergence)
+    // and for parsed files after parseFiles resolves.
+    const newSnapshotFiles = {};
+    // CRITICAL-1: reused files whose sha1 must be (re)computed because the old
+    // snapshot lacks it (legacy snapshot-less index, or stale snapshot discarded
+    // above). Collected here and hashed AFTER the loop in PARSE_CONCURRENCY
+    // batches instead of one serial await per file inside the loop — on a first
+    // run after upgrade against a many-thousand-file repo the serial version cost
+    // thousands of sequential awaits. Behavior is identical, wall-clock is
+    // parallelized.
+    //
+    // mtimeMs: the mtime observed at decision time (the moment we confirmed
+    // mtime === prevMtime and placed the file in the queue). We re-stat after
+    // hashing to detect any concurrent modification that landed between the two
+    // operations. If the mtime drifted, we omit the file from newSnapshotFiles
+    // entirely — the missing sha causes the next cold run to re-parse, avoiding
+    // a snapshot that pairs new-content sha against old (reused) symbols.
+    const legacyHashQueue = [];
+    // PERF: pre-build per-file lookups ONCE before the reuse loop. Both reuse
+    // branches need (a) the existing index's symbols for a given relPath and (b)
+    // its FileEntry. Doing `existing.symbols.filter(s => s.file === relPath)` /
+    // `existing.files.find(f => f.path === relPath)` per file is O(files ×
+    // symbols) and O(files²) respectively — quadratic, and on a many-thousand
+    // file/symbol repo that dominated the reuse-heavy fast path. A single pass
+    // builds Map lookups each branch hits in O(1). Built only when there's an
+    // existing index to reuse from.
+    const symbolsByFile = new Map();
+    const fileEntryByPath = new Map();
+    if (existing) {
+        for (const sym of existing.symbols) {
+            const list = symbolsByFile.get(sym.file);
+            if (list)
+                list.push(sym);
+            else
+                symbolsByFile.set(sym.file, [sym]);
+        }
+        for (const fe of existing.files) {
+            fileEntryByPath.set(fe.path, fe);
+        }
+    }
     if (mtimeMap.size > 0) {
         const { stat } = await import("node:fs/promises");
         for (const filePath of files) {
             const relPath = relative(rootPath, filePath);
             const prevMtime = mtimeMap.get(relPath);
             if (prevMtime !== undefined) {
-                const fileEntry = existing.files.find((f) => f.path === relPath);
+                const fileEntry = fileEntryByPath.get(relPath);
                 // Force re-parse if file is marked stale (callee signature changed)
                 if (fileEntry?.stale) {
                     filesToParse.push(filePath);
@@ -397,14 +559,47 @@ export async function indexFolder(folderPath, options) {
                 try {
                     const st = await stat(filePath);
                     if (Math.round(st.mtimeMs) === prevMtime) {
-                        // File unchanged — keep existing symbols
-                        const fileSymbols = existing.symbols.filter((s) => s.file === relPath);
+                        // Fast path: mtime unchanged → reuse symbols without hashing.
+                        const fileSymbols = symbolsByFile.get(relPath) ?? [];
                         if (fileEntry) {
                             keptSymbols.push(...fileSymbols);
                             keptEntries.push(fileEntry);
+                            // Carry the sha1 forward: reuse from old snapshot if present,
+                            // else DEFER hashing so legacy (snapshot-less) indexes converge
+                            // to a complete snapshot after one run — without paying a serial
+                            // hash per file inside this loop.
+                            const carried = oldSnapshot?.files[relPath];
+                            if (carried !== undefined) {
+                                newSnapshotFiles[relPath] = carried;
+                            }
+                            else {
+                                legacyHashQueue.push({ relPath, filePath, mtimeMs: Math.round(st.mtimeMs) });
+                            }
                             continue;
                         }
                     }
+                    else {
+                        // mtime changed — hash decides reuse vs re-parse. This catches
+                        // touch/checkout that bumped mtime without changing content.
+                        const snapSha = oldSnapshot?.files[relPath];
+                        if (snapSha !== undefined && fileEntry && !fileEntry.stale) {
+                            const currentSha = await sha1OfFile(filePath);
+                            if (currentSha !== null && currentSha === snapSha) {
+                                const fileSymbols = symbolsByFile.get(relPath) ?? [];
+                                keptSymbols.push(...fileSymbols);
+                                // FIX: the file's mtime changed but content is identical (touch /
+                                // checkout no-op rewrite). Reuse the symbols, but DON'T carry the
+                                // stale FileEntry verbatim — its mtime_ms still holds the OLD
+                                // mtime, so every future run would see mtime !== prevMtime and
+                                // re-hash this file forever, permanently degrading it off the
+                                // mtime fast path. Clone the entry with mtime_ms bumped to the
+                                // CURRENT stat's mtime so the next run takes the cheap fast path.
+                                keptEntries.push({ ...fileEntry, mtime_ms: Math.round(st.mtimeMs) });
+                                newSnapshotFiles[relPath] = currentSha;
+                                continue;
+                            }
+                        }
+                    }
                 }
                 catch { /* file may have been deleted — reparse */ }
             }
@@ -414,10 +609,32 @@ export async function indexFolder(folderPath, options) {
     else {
         filesToParse.push(...files);
     }
+    // Drain the deferred legacy-hash queue (CRITICAL-1): files reused via the
+    // mtime fast path that had no carried sha1 (legacy snapshot-less index, or a
+    // stale snapshot discarded by the guard above). See drainLegacyHashQueue for
+    // the TOCTOU guard details — entries whose mtime drifted between decision
+    // time and hash time are omitted so the next run re-parses rather than
+    // reusing symbols against a mismatched sha.
+    if (legacyHashQueue.length > 0) {
+        const drained = await drainLegacyHashQueue(legacyHashQueue);
+        Object.assign(newSnapshotFiles, drained);
+    }
     // Parse only changed/new files
-    const { symbols: parsedSymbols, fileEntries: parsedEntries } = await parseFiles(filesToParse, rootPath, repoName);
+    const { symbols: parsedSymbols, fileEntries: parsedEntries, shas: parsedShas } = await parseFiles(filesToParse, rootPath, repoName);
     const symbols = [...keptSymbols, ...parsedSymbols];
     const fileEntries = [...keptEntries, ...parsedEntries];
+    // Record sha1s for the files that were actually parsed (changed/new).
+    // CRITICAL-1 (TOCTOU): these hashes come straight from parseOneFile — they
+    // are the sha1 of the EXACT source string that produced the symbols, so the
+    // snapshot can never pair old symbols with a newer file's sha. Only entries
+    // that survived parseFiles (parseOneFile returned non-null) have a sha here,
+    // keeping the snapshot in lockstep with fileEntries. The previous post-parse
+    // double-read loop is gone — one fewer full read per parsed file.
+    for (const entry of parsedEntries) {
+        const sha = parsedShas[entry.path];
+        if (sha !== undefined)
+            newSnapshotFiles[entry.path] = sha;
+    }
     // Dirty propagation: detect signature changes and mark caller files stale
     if (existing && filesToParse.length > 0 && filesToParse.length < files.length) {
         const staleFiles = propagateDirtySignatures(existing.symbols, symbols, fileEntries);
@@ -425,25 +642,209 @@ export async function indexFolder(folderPath, options) {
             console.error(`[codesift] Dirty propagation: ${staleFiles.size} caller files marked stale`);
         }
     }
-    // Build and cache BM25 index; invalidate code index cache
-    const bm25 = buildBM25Index(symbols);
-    bm25Indexes.set(repoName, bm25);
+    // Invalidate code index cache (BM25 is rebuilt below from the FINAL symbol
+    // set — possibly merged with out-of-scope existing symbols, see merge block).
     codeIndexes.delete(repoName);
     // Sanity check: don't overwrite a complete index with a partial one
-    // (WASM crash or walk failure can produce truncated results)
+    // (WASM crash or walk failure can produce truncated results).
+    //
+    // IMPORTANT: skip the guard when the walk was explicitly narrowed — either
+    // max_files was hit (truncated at cap) or include_paths scoped the walk to a
+    // subdirectory. In both cases the small result count is EXPECTED and rejecting
+    // it would be a false positive (the "1139 vs 9512" bug class). For unrestricted
+    // walks the guard stays as-is, protecting against genuine silent truncations.
+    //
+    // CRITICAL (T7 correctness fix): skipping the guard is necessary but NOT
+    // sufficient. A scoped/capped walk only SEES a narrow slice of the repo; if we
+    // persisted that slice as the WHOLE index we would wipe every out-of-scope
+    // file's symbols from index+snapshot (worse than the guard's old reject,
+    // which at least preserved the prior index). So for scoped/capped walks with
+    // an existing index we MERGE: keep out-of-scope existing entries verbatim and
+    // overlay the walk's results. See the merge block below.
+    //
+    // "max_files hit" detection: files.length === effective maxFiles. This is the
+    // only signal walkDirectory exposes (it sets limitReached internally but does
+    // not surface it on the return value). A 1-in-a-million exact-count false
+    // positive (repo has exactly maxFiles parseable files) is accepted — the
+    // guard skip is conservative (allows write), not destructive.
     const DROP_THRESHOLD = 0.5; // Reject if new index has <50% of old file count
-    if (existing && fileEntries.length < existing.file_count * DROP_THRESHOLD && existing.file_count > 50) {
-        console.error(`[codesift] SANITY CHECK FAILED for ${repoName}: ` +
-            `new index has ${fileEntries.length} files vs ${existing.file_count} previously. ` +
-            `Keeping old index. Use invalidate_cache + index_folder to force reindex.`);
-        return {
-            repo: repoName,
-            root: rootPath,
-            file_count: existing.file_count,
-            symbol_count: existing.symbol_count,
-            duration_ms: Date.now() - startTime,
+    const walkExplicitlyCapped = hitFileLimit;
+    const walkExplicitlyScoped = options?.include_paths !== undefined && options.include_paths.length > 0;
+    // MIN_GUARD_FILES: the unrestricted guard only arms above this existing
+    // file_count (`existing.file_count > 50` below). The scoped-granularity guard
+    // mirrors that shape against the in-scope subset so a tiny scope can't be
+    // rejected on noise. Single source of truth so both guards stay in lockstep.
+    const MIN_GUARD_FILES = 50;
+    if (walkExplicitlyCapped || walkExplicitlyScoped) {
+        // ROUND-2 FIX (scoped-granularity guard): the unrestricted guard is skipped
+        // for scoped/capped walks because a small *overall* result is expected. But
+        // that skip was total — a scoped walk that aborts mid-enumeration (WASM
+        // crash, transient FS error, an over-broad exclude) silently truncates the
+        // IN-SCOPE slice, and the merge below treats every unwalked in-scope file as
+        // a deletion → wipes it from index+snapshot. So for a purely SCOPED (uncapped)
+        // walk we re-arm a guard against the IN-SCOPE subset: if the walk enumerated
+        // far fewer in-scope files than the existing index held in that same scope,
+        // AND those files are still on disk, the enumeration was truncated → reject
+        // before any merge/save, leaving the old index+snapshot intact.
+        //
+        // Capped walks are intentionally EXEMPT: a cap means unseen ≠ deleted (the
+        // merge preserves all unwalked files), so there is no truncation to detect —
+        // nothing in-scope is dropped. A walk that is BOTH scoped and capped also
+        // takes capped semantics (preserve everything unwalked), so the same
+        // exemption applies — no in-scope file can be lost.
+        if (walkExplicitlyScoped && !walkExplicitlyCapped && existing) {
+            const includePaths = options.include_paths;
+            const inScopeRel = (relPath) => includePaths.some((p) => relPath.startsWith(p)); // mirror walkDirectory
+            const existingInScope = existing.files.filter((fe) => inScopeRel(fe.path));
+            // All walked files are in scope by construction (walkDirectory honored
+            // includePaths), so walkedInScope is simply the walk's file count.
+            const walkedInScope = fileEntries.length;
+            if (existingInScope.length > MIN_GUARD_FILES &&
+                walkedInScope < existingInScope.length * DROP_THRESHOLD) {
+                // Auto-heal analog (in-scope): the shrink may be a genuine mass deletion
+                // within the scope, not a truncated walk. Sample the existing in-scope
+                // paths on disk (mirrors isExistingIndexStale, but restricted to the
+                // scope) — if most are gone, accept the merge.
+                const inScopePaths = existingInScope.map((fe) => fe.path);
+                const stride = Math.max(1, Math.floor(inScopePaths.length / STALE_SAMPLE_LIMIT));
+                const sampled = [];
+                for (let i = 0; i < inScopePaths.length && sampled.length < STALE_SAMPLE_LIMIT; i += stride) {
+                    const p = inScopePaths[i];
+                    if (p)
+                        sampled.push(p);
+                }
+                let missing = 0;
+                await Promise.all(sampled.map(async (relPath) => {
+                    try {
+                        await stat(join(rootPath, relPath));
+                    }
+                    catch {
+                        missing++;
+                    }
+                }));
+                const mostGone = missing >= sampled.length * STALE_MISSING_FRACTION;
+                if (mostGone) {
+                    console.error(`[codesift] Scoped sanity auto-heal for ${repoName}: walked ` +
+                        `${walkedInScope} of ${existingInScope.length} in-scope files but ` +
+                        `most sampled in-scope paths no longer exist on disk. Accepting ` +
+                        `scoped merge (legit in-scope mass deletion).`);
+                }
+                else {
+                    console.error(`[codesift] SCOPED SANITY CHECK FAILED for ${repoName}: scoped walk ` +
+                        `under-enumerated — walked ${walkedInScope} of ${existingInScope.length} ` +
+                        `in-scope files, which still exist on disk. Keeping old index.`);
+                    return {
+                        repo: repoName,
+                        root: rootPath,
+                        file_count: existing.file_count,
+                        symbol_count: existing.symbol_count,
+                        duration_ms: Date.now() - startTime,
+                        status: "rejected_partial",
+                        reason: `scoped walk under-enumerated: walked ${walkedInScope} of ${existingInScope.length} in-scope files (still on disk) — kept old index, nothing was re-registered`,
+                        hint: "If the in-scope shrink is expected (deleted files, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
+                    };
+                }
+            }
+        }
+        const detail = walkExplicitlyCapped
+            ? `max_files=${maxFiles} hit (${files.length} files returned)`
+            : `include_paths=[${options.include_paths.join(", ")}]`;
+        console.error(`[codesift] sanity guard skipped: walk explicitly capped/scoped (${detail})`);
+    }
+    else if (existing && fileEntries.length < existing.file_count * DROP_THRESHOLD && existing.file_count > MIN_GUARD_FILES) {
+        // The shrink can also mean the OLD index is the bogus one: an earlier
+        // walker may have swept since-deleted trees (.worktrees/, vendored dirs),
+        // permanently inflating the baseline so every honest reindex looks
+        // truncated and gets rejected forever. Disambiguate by sampling the old
+        // index's paths: if most of them no longer exist on disk, the old index
+        // is stale dead weight — accept the new result instead of keeping it.
+        if (await isExistingIndexStale(existing, rootPath)) {
+            console.error(`[codesift] Sanity check auto-heal for ${repoName}: old index has ` +
+                `${existing.file_count} files but most sampled paths no longer exist ` +
+                `on disk. Accepting new index (${fileEntries.length} files).`);
+        }
+        else {
+            console.error(`[codesift] SANITY CHECK FAILED for ${repoName}: ` +
+                `new index has ${fileEntries.length} files vs ${existing.file_count} previously. ` +
+                `Keeping old index. Use invalidate_cache + index_folder to force reindex.`);
+            return {
+                repo: repoName,
+                root: rootPath,
+                file_count: existing.file_count,
+                symbol_count: existing.symbol_count,
+                duration_ms: Date.now() - startTime,
+                status: "rejected_partial",
+                reason: `new walk found ${fileEntries.length} files, <50% of the ${existing.file_count} previously indexed — kept old index, nothing was re-registered`,
+                hint: "If the shrink is expected (deleted trees, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
+            };
+        }
+    }
+    // ── MERGE-persist for scoped/capped walks (T7 correctness fix) ────────────
+    // A scoped (include_paths) or capped (max_files-hit) walk only enumerated a
+    // slice of the repo. Persisting that slice verbatim would delete every
+    // out-of-scope file's symbols from index+snapshot. When an existing index is
+    // present we instead MERGE: preserve out-of-scope existing entries/symbols/
+    // shas and overlay the walk's results.
+    //
+    //  - include_paths scoped (and NOT capped): "scope" = files whose relPath is
+    //    under any include root (mirror walkDirectory's relPath.startsWith(p)
+    //    test EXACTLY). Out-of-scope existing files are preserved verbatim;
+    //    in-scope existing files NOT in the walk set W are dropped (genuine
+    //    in-scope deletions — the walk fully enumerated the scope).
+    //  - capped (max_files hit): scope is UNDEFINED — the cap means an unseen
+    //    file is not necessarily deleted. Preserve ALL existing entries not in W,
+    //    overlay W. (If a capped walk also passed include_paths, the cap makes the
+    //    in-scope enumeration incomplete too, so we still only trust W and
+    //    preserve everything else — capped semantics win.)
+    //
+    // First run (no existing index) with a scoped/capped walk → save what we have
+    // (current behavior, documented): there is nothing to preserve.
+    let mergedSymbols = symbols;
+    let mergedEntries = fileEntries;
+    let mergedSnapshotFiles = newSnapshotFiles;
+    if ((walkExplicitlyCapped || walkExplicitlyScoped) && existing) {
+        const walkedPaths = new Set(fileEntries.map((fe) => fe.path));
+        // A capped walk has undefined scope (unseen ≠ deleted), so it preserves
+        // everything not walked. A purely scoped (uncapped) walk additionally drops
+        // in-scope-but-unwalked files, since the walk fully enumerated the scope.
+        const includePaths = options?.include_paths;
+        const inScope = (relPath) => {
+            if (walkExplicitlyCapped)
+                return false; // cap → never treat as deletable
+            if (!includePaths || includePaths.length === 0)
+                return false;
+            // Mirror walkDirectory's include-path filter exactly.
+            return includePaths.some((p) => relPath.startsWith(p));
         };
+        const preservedEntries = [];
+        const preservedFilePaths = new Set();
+        for (const fe of existing.files) {
+            if (walkedPaths.has(fe.path))
+                continue; // walk result wins for these
+            if (inScope(fe.path))
+                continue; // in-scope + not walked = deleted-in-scope
+            preservedEntries.push(fe);
+            preservedFilePaths.add(fe.path);
+        }
+        const preservedSymbols = existing.symbols.filter((s) => preservedFilePaths.has(s.file));
+        mergedEntries = [...preservedEntries, ...fileEntries];
+        mergedSymbols = [...preservedSymbols, ...symbols];
+        // Snapshot: preserve out-of-scope shas, overlay walked ones.
+        mergedSnapshotFiles = {};
+        if (oldSnapshot) {
+            for (const relPath of preservedFilePaths) {
+                const sha = oldSnapshot.files[relPath];
+                if (sha !== undefined)
+                    mergedSnapshotFiles[relPath] = sha;
+            }
+        }
+        Object.assign(mergedSnapshotFiles, newSnapshotFiles);
     }
+    // Build and cache BM25 index from the FINAL (possibly merged) symbol set.
+    // Built here (not before the guard) so a rejected_partial early-return leaves
+    // the previous in-memory BM25 index intact rather than swapping in a partial.
+    const bm25 = buildBM25Index(mergedSymbols);
+    bm25Indexes.set(repoName, bm25);
     // Resolve workspaces (Task 7) — runs before persistence so collectImportEdges
     // and other downstream consumers see the populated `workspaces` field.
     // Gated behind CODESIFT_DISABLE_MONOREPO=1 kill switch (spec D-FB).
@@ -460,24 +861,49 @@ export async function indexFolder(folderPath, options) {
             // mode is the safe fallback.
         }
     }
-    // Build and save code index
+    // Build and save code index from the FINAL (possibly merged) sets.
     const codeIndex = {
         repo: repoName,
         root: rootPath,
-        symbols,
-        files: fileEntries,
+        symbols: mergedSymbols,
+        files: mergedEntries,
         created_at: Date.now(),
         updated_at: Date.now(),
-        symbol_count: symbols.length,
-        file_count: fileEntries.length,
+        symbol_count: mergedSymbols.length,
+        file_count: mergedEntries.length,
         extractor_version: { ...EXTRACTOR_VERSIONS },
         ...(workspaces ? { workspaces } : {}),
     };
     await saveIndex(indexPath, codeIndex);
+    // Persist the hash snapshot AFTER the index lands (mirrors registerRepo
+    // ordering) and only on the success path — the rejected_partial branch
+    // returned earlier, leaving the previous snapshot intact. Non-fatal: the
+    // snapshot is a reuse-optimization cache; a write failure just costs a full
+    // re-parse next run, so we warn and continue.
+    try {
+        const newSnapshot = {
+            version: HASH_SNAPSHOT_VERSION,
+            repo: repoName,
+            // CRITICAL-2 (created_at race): use the EXACT timestamp serialized into
+            // the index, not a fresh Date.now(). A watcher's saveIncremental that
+            // lands between saveIndex and this write would otherwise leave the
+            // snapshot OLDER than created_at, blinding the staleness guard above. By
+            // anchoring to codeIndex.updated_at, snapshot.created_at === the index's
+            // updated_at on a fresh write, so any later incremental strictly advances
+            // index.updated_at past it and the guard fires correctly.
+            created_at: codeIndex.updated_at,
+            files: mergedSnapshotFiles,
+        };
+        await saveHashSnapshot(snapshotPath, newSnapshot);
+    }
+    catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        console.warn(`[codesift] hash-snapshot save failed for ${repoName} (non-fatal): ${msg}`);
+    }
     // Embed symbols and chunks in background (non-fatal, don't block MCP response)
     // Large repos (71K symbols) can take minutes — fire-and-forget to prevent timeout
-    embedSymbols(symbols, indexPath, repoName, config)
-        .then(() => embedChunks(fileEntries, rootPath, repoName, indexPath, config, symbols))
+    embedSymbols(mergedSymbols, indexPath, repoName, config)
+        .then(() => embedChunks(mergedEntries, rootPath, repoName, indexPath, config, mergedSymbols))
         .catch((err) => {
         const msg = err instanceof Error ? err.message : String(err);
         console.error(`[codesift] Background embedding failed for ${repoName}: ${msg}`);
@@ -496,8 +922,8 @@ export async function indexFolder(folderPath, options) {
         name: repoName,
         root: rootPath,
         index_path: indexPath,
-        symbol_count: symbols.length,
-        file_count: fileEntries.length,
+        symbol_count: mergedSymbols.length,
+        file_count: mergedEntries.length,
         updated_at: Date.now(),
     };
     await registerRepo(config.registryPath, meta);
@@ -520,7 +946,7 @@ export async function indexFolder(folderPath, options) {
     try {
         const { detectFrameworks } = await import("../utils/framework-detect.js");
         const { enableFrameworkToolBundle } = await import("../register-tools.js");
-        const tempIndex = { root: rootPath, files: fileEntries, symbols };
+        const tempIndex = { root: rootPath, files: mergedEntries, symbols: mergedSymbols };
         const frameworks = detectFrameworks(tempIndex);
         for (const fw of frameworks) {
             const enabled = enableFrameworkToolBundle(fw);
@@ -538,8 +964,8 @@ export async function indexFolder(folderPath, options) {
     return {
         repo: repoName,
         root: rootPath,
-        file_count: fileEntries.length,
-        symbol_count: symbols.length,
+        file_count: mergedEntries.length,
+        symbol_count: mergedSymbols.length,
         duration_ms: Date.now() - startTime,
     };
 }
@@ -778,7 +1204,8 @@ export async function invalidateCache(repoName) {
     const chunkPath = getChunkPath(meta.index_path);
     const chunkEmbeddingPath = getChunkEmbeddingPath(meta.index_path);
     const graphStorePath = getGraphPath(meta.index_path);
-    for (const fp of [meta.index_path, embeddingPath, embeddingMetaPath, chunkPath, chunkEmbeddingPath, graphStorePath]) {
+    const snapshotPath = getSnapshotPath(meta.index_path);
+    for (const fp of [meta.index_path, embeddingPath, embeddingMetaPath, chunkPath, chunkEmbeddingPath, graphStorePath, snapshotPath]) {
         try {
             await unlink(fp);
         }
@@ -788,6 +1215,22 @@ export async function invalidateCache(repoName) {
     await removeRepo(config.registryPath, repoName);
     return true;
 }
+/**
+ * In-process record of the last indexed state per absolute file path.
+ *
+ * Keys are the absolute file path (`absPath` in indexFile). Values are
+ * `{ mtimeMs, contentHash, symbolCount }`:
+ *  - `mtimeMs`: the file's stat mtime, rounded with Math.round.
+ *  - `contentHash`: hex SHA-1 of the file content read as UTF-8.
+ *  - `symbolCount`: the symbol count reported for the file when the entry
+ *    was written (from the on-disk index entry or from the fresh parse).
+ * Entries are only written when the file content could be read; a failed
+ * read leaves no entry behind.
+ *
+ * Telemetry (30d, 2026-06): 750 consecutive duplicate index_file calls at
+ * avg 3.7s each (~47 min of agent wall-clock). Two causes: (1) duplicate
+ * hook registrations firing index_file twice per edit, and (2) a race where
+ * call N+1's on-disk mtime pre-check read the index before call N's
+ * serialized saveIncremental landed, forcing a full re-parse + full-index
+ * save. This map short-circuits both in-process in ~1ms (mtime first, then
+ * content hash for touch/no-op rewrites) without loading the on-disk index.
+ */
+const lastIndexedState = new Map();
+/**
+ * Test hook — clear the in-process last-indexed state.
+ *
+ * Empties the module-level lastIndexedState map. After this, the next
+ * indexFile call for any path finds no in-memory entry, so neither the
+ * in-process mtime check nor the content-hash check can skip it and the
+ * call falls through to the on-disk index mtime check.
+ */
+export function clearLastIndexedStateForTesting() {
+    lastIndexedState.clear();
+}
 /**
  * Re-index a single file instantly. Finds the repo by matching the file
  * path against indexed repo roots. Updates symbols, BM25 index, and
@@ -815,13 +1258,47 @@ export async function indexFile(filePath) {
             clearTsconfigCache();
         }
     }
-    // mtime check — skip if unchanged
-    const existing = await loadIndex(matchingRepo.index_path);
-    if (existing) {
-        const prevEntry = existing.files.find((f) => f.path === relPath);
-        if (prevEntry?.mtime_ms) {
-            const st = await stat(absPath);
-            if (Math.round(st.mtimeMs) === prevEntry.mtime_ms) {
+    // In-process short-circuit: mtime, then content hash. Both avoid loading
+    // the on-disk index entirely (the expensive part on large repos).
+    const st = await stat(absPath);
+    const mem = lastIndexedState.get(absPath);
+    if (mem && Math.round(st.mtimeMs) === mem.mtimeMs) {
+        return {
+            repo: matchingRepo.name,
+            file: relPath,
+            symbol_count: mem.symbolCount,
+            duration_ms: Date.now() - startTime,
+            skipped: true,
+        };
+    }
+    const content = await readFile(absPath, "utf-8").catch(() => null);
+    const contentHash = content !== null ? createHash("sha1").update(content).digest("hex") : null;
+    if (mem && contentHash !== null && contentHash === mem.contentHash) {
+        // Touched / rewritten with identical content — refresh mtime, skip work.
+        mem.mtimeMs = Math.round(st.mtimeMs);
+        return {
+            repo: matchingRepo.name,
+            file: relPath,
+            symbol_count: mem.symbolCount,
+            duration_ms: Date.now() - startTime,
+            skipped: true,
+        };
+    }
+    // On-disk mtime check — first touch of this file in this process (CLI
+    // hook invocations, fresh server). Skips files unchanged since the last
+    // full index, and seeds the in-process state for subsequent calls.
+    if (!mem) {
+        const existing = await loadIndex(matchingRepo.index_path);
+        if (existing) {
+            const prevEntry = existing.files.find((f) => f.path === relPath);
+            if (prevEntry?.mtime_ms && Math.round(st.mtimeMs) === prevEntry.mtime_ms) {
+                if (contentHash !== null) {
+                    lastIndexedState.set(absPath, {
+                        mtimeMs: Math.round(st.mtimeMs),
+                        contentHash,
+                        symbolCount: prevEntry.symbol_count,
+                    });
+                }
                 return {
                     repo: matchingRepo.name,
                     file: relPath,
@@ -837,6 +1314,13 @@ export async function indexFile(filePath) {
         throw new Error(`Failed to parse "${relPath}"`);
     }
     await saveIncremental(matchingRepo.index_path, relPath, result.symbols, result.entry);
+    if (contentHash !== null) {
+        lastIndexedState.set(absPath, {
+            mtimeMs: Math.round(st.mtimeMs),
+            contentHash,
+            symbolCount: result.symbols.length,
+        });
+    }
     let secretFindingsCount = 0;
     if (config.secretScanEnabled) {
         try {