codesift-mcp 0.8.10 → 0.8.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/hooks.d.ts.map +1 -1
- package/dist/cli/hooks.js +2 -1
- package/dist/cli/hooks.js.map +1 -1
- package/dist/config.d.ts +1 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +1 -0
- package/dist/config.js.map +1 -1
- package/dist/register-tools.d.ts.map +1 -1
- package/dist/register-tools.js +39 -1
- package/dist/register-tools.js.map +1 -1
- package/dist/search/model2vec-tokenize.d.ts +22 -0
- package/dist/search/model2vec-tokenize.d.ts.map +1 -0
- package/dist/search/model2vec-tokenize.js +140 -0
- package/dist/search/model2vec-tokenize.js.map +1 -0
- package/dist/search/semantic.d.ts.map +1 -1
- package/dist/search/semantic.js +7 -0
- package/dist/search/semantic.js.map +1 -1
- package/dist/search/static-embedding-provider.d.ts +24 -0
- package/dist/search/static-embedding-provider.d.ts.map +1 -0
- package/dist/search/static-embedding-provider.js +149 -0
- package/dist/search/static-embedding-provider.js.map +1 -0
- package/dist/storage/_shared.d.ts.map +1 -1
- package/dist/storage/_shared.js +4 -1
- package/dist/storage/_shared.js.map +1 -1
- package/dist/storage/hash-snapshot.d.ts +36 -0
- package/dist/storage/hash-snapshot.d.ts.map +1 -0
- package/dist/storage/hash-snapshot.js +101 -0
- package/dist/storage/hash-snapshot.js.map +1 -0
- package/dist/storage/usage-stats.d.ts +8 -0
- package/dist/storage/usage-stats.d.ts.map +1 -1
- package/dist/storage/usage-stats.js +74 -24
- package/dist/storage/usage-stats.js.map +1 -1
- package/dist/storage/usage-tracker.d.ts +16 -0
- package/dist/storage/usage-tracker.d.ts.map +1 -1
- package/dist/storage/usage-tracker.js +23 -3
- package/dist/storage/usage-tracker.js.map +1 -1
- package/dist/tools/index-tools.d.ts +31 -2
- package/dist/tools/index-tools.d.ts.map +1 -1
- package/dist/tools/index-tools.js +460 -33
- package/dist/tools/index-tools.js.map +1 -1
- package/dist/tools/pg-introspect-tools.d.ts +147 -0
- package/dist/tools/pg-introspect-tools.d.ts.map +1 -0
- package/dist/tools/pg-introspect-tools.js +396 -0
- package/dist/tools/pg-introspect-tools.js.map +1 -0
- package/dist/types.d.ts +27 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/utils/hf-download-stream.d.ts +21 -0
- package/dist/utils/hf-download-stream.d.ts.map +1 -0
- package/dist/utils/hf-download-stream.js +101 -0
- package/dist/utils/hf-download-stream.js.map +1 -0
- package/dist/utils/hf-hub-download.d.ts +8 -0
- package/dist/utils/hf-hub-download.d.ts.map +1 -0
- package/dist/utils/hf-hub-download.js +149 -0
- package/dist/utils/hf-hub-download.js.map +1 -0
- package/dist/utils/safetensors-loader.d.ts +9 -0
- package/dist/utils/safetensors-loader.d.ts.map +1 -0
- package/dist/utils/safetensors-loader.js +95 -0
- package/dist/utils/safetensors-loader.js.map +1 -0
- package/dist/utils/safetensors-meta-guard.d.ts +7 -0
- package/dist/utils/safetensors-meta-guard.d.ts.map +1 -0
- package/dist/utils/safetensors-meta-guard.js +50 -0
- package/dist/utils/safetensors-meta-guard.js.map +1 -0
- package/package.json +3 -1
|
@@ -22,6 +22,7 @@ import { validateGitUrl, validateGitRef } from "../utils/git-validation.js";
|
|
|
22
22
|
import { walkDirectory } from "../utils/walk.js";
|
|
23
23
|
import { onFileChanged as scanOnChanged, onFileDeleted as scanOnDeleted, scanFileForSecrets } from "./secret-tools.js";
|
|
24
24
|
import { getGraphPath } from "../storage/graph-store.js";
|
|
25
|
+
import { getSnapshotPath, loadHashSnapshot, saveHashSnapshot, HASH_SNAPSHOT_VERSION } from "../storage/hash-snapshot.js";
|
|
25
26
|
const PARSE_CONCURRENCY = 8;
|
|
26
27
|
const CHUNK_EMBEDDING_BATCH_SIZE = 96;
|
|
27
28
|
const GIT_CLONE_TIMEOUT_MS = 120_000;
|
|
@@ -76,6 +77,13 @@ async function parseOneFile(filePath, repoRoot, repoName) {
|
|
|
76
77
|
try {
|
|
77
78
|
const stat = await import("node:fs/promises").then((fs) => fs.stat(filePath));
|
|
78
79
|
const source = await readFile(filePath, "utf-8");
|
|
80
|
+
// CRITICAL-1 (TOCTOU parse↔hash): hash the EXACT source string we parse,
|
|
81
|
+
// here — never via a post-parse re-read. A re-read can observe a different
|
|
82
|
+
// on-disk version if the file is modified between parse and hash, pairing
|
|
83
|
+
// OLD symbols with a NEW sha so future runs permanently reuse mismatched
|
|
84
|
+
// symbols. The sha is NOT persisted inside FileEntry; callers thread it
|
|
85
|
+
// into the hash snapshot (and it saves one extra full read per parsed file).
|
|
86
|
+
const fileSha1 = createHash("sha1").update(source).digest("hex");
|
|
79
87
|
const relPath = relative(repoRoot, filePath);
|
|
80
88
|
const baseName = filePath.split("/").pop() ?? "";
|
|
81
89
|
// Use full-path resolver so multi-dot suffixes like `.gradle.kts` beat
|
|
@@ -135,7 +143,7 @@ async function parseOneFile(filePath, repoRoot, repoName) {
|
|
|
135
143
|
last_modified: Date.now(),
|
|
136
144
|
mtime_ms: Math.round(stat.mtimeMs),
|
|
137
145
|
};
|
|
138
|
-
return { symbols, entry };
|
|
146
|
+
return { symbols, entry, sha1: fileSha1 };
|
|
139
147
|
}
|
|
140
148
|
catch (err) {
|
|
141
149
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -149,6 +157,9 @@ async function parseOneFile(filePath, repoRoot, repoName) {
|
|
|
149
157
|
async function parseFiles(files, repoRoot, repoName) {
|
|
150
158
|
const allSymbols = [];
|
|
151
159
|
const fileEntries = [];
|
|
160
|
+
// CRITICAL-1: sha1 of the exact parsed source, keyed by relPath. Carried out
|
|
161
|
+
// of parseOneFile so the snapshot never re-reads (and never races) the file.
|
|
162
|
+
const shas = {};
|
|
152
163
|
for (let i = 0; i < files.length; i += PARSE_CONCURRENCY) {
|
|
153
164
|
const batch = files.slice(i, i + PARSE_CONCURRENCY);
|
|
154
165
|
const results = await Promise.all(batch.map((filePath) => parseOneFile(filePath, repoRoot, repoName)));
|
|
@@ -156,10 +167,11 @@ async function parseFiles(files, repoRoot, repoName) {
|
|
|
156
167
|
if (result) {
|
|
157
168
|
allSymbols.push(...result.symbols);
|
|
158
169
|
fileEntries.push(result.entry);
|
|
170
|
+
shas[result.entry.path] = result.sha1;
|
|
159
171
|
}
|
|
160
172
|
}
|
|
161
173
|
}
|
|
162
|
-
return { symbols: allSymbols, fileEntries };
|
|
174
|
+
return { symbols: allSymbols, fileEntries, shas };
|
|
163
175
|
}
|
|
164
176
|
// ---------------------------------------------------------------------------
|
|
165
177
|
// Dirty propagation — mark caller files stale when a callee signature changes
|
|
@@ -307,6 +319,85 @@ async function embedChunks(fileEntries, rootPath, repoName, indexPath, config, s
|
|
|
307
319
|
console.error(`[codesift] Chunk embedding failed for ${repoName}: ${message}`);
|
|
308
320
|
}
|
|
309
321
|
}
|
|
322
|
+
/** Upper bound on how many stored paths are probed on disk per staleness check. */
const STALE_SAMPLE_LIMIT = 256;
/** Share of probed paths that must fail to stat before the stored index counts as stale. */
const STALE_MISSING_FRACTION = 0.5;
/**
 * Decide whether a previously stored index no longer reflects the working
 * tree. Probes up to STALE_SAMPLE_LIMIT of its file paths on disk; when at
 * least STALE_MISSING_FRACTION of the probed paths cannot be stat'ed the old
 * index is treated as stale. Used by the indexFolder sanity check to break the
 * poisoned-baseline deadlock: an old index bloated with since-deleted trees
 * (.worktrees/, vendored dirs) would otherwise reject every honest reindex as
 * "truncated" forever.
 *
 * Sampling is proportional over the WHOLE path list: sample k reads index
 * floor(k * total / sampleCount). A fixed floor(total / limit) stride combined
 * with the sample cap only ever looked at the first STALE_SAMPLE_LIMIT paths
 * whenever total was between limit + 1 and 2 * limit - 1, so a deleted tree
 * sitting at the tail of the list was invisible to the check.
 *
 * NOTE(review): the scoped guard inside indexFolder describes itself as
 * mirroring this sampling with its own inline loop — confirm whether it should
 * be aligned with this function.
 *
 * @param existing Stored index; only `existing.files[].path` is read.
 * @param rootPath Directory the stored (relative) paths are joined onto.
 * @returns {Promise<boolean>} true when the index lists no files, or when the
 *   missing share of the probed paths reaches STALE_MISSING_FRACTION.
 */
async function isExistingIndexStale(existing, rootPath) {
    const paths = existing.files.map((f) => f.path);
    if (paths.length === 0)
        return true;
    const sampleCount = Math.min(paths.length, STALE_SAMPLE_LIMIT);
    const sampled = [];
    for (let k = 0; k < sampleCount; k++) {
        // Proportional index: covers head, middle and tail of the list evenly.
        const p = paths[Math.floor((k * paths.length) / sampleCount)];
        // Entries without a usable path are skipped rather than counted as missing.
        if (p)
            sampled.push(p);
    }
    const gone = await Promise.all(sampled.map(async (relPath) => {
        try {
            await stat(join(rootPath, relPath));
            return false;
        }
        catch {
            // Any stat failure (not only "no such file") counts as missing.
            return true;
        }
    }));
    const missing = gone.filter(Boolean).length;
    return missing >= sampled.length * STALE_MISSING_FRACTION;
}
|
|
354
|
+
/**
 * Compute the sha1 hex digest of a file's content, decoded as UTF-8 text.
 *
 * The file is read fully into memory, so this is meant for code-sized files.
 * The function never throws: any failure while reading or hashing (file
 * deleted mid-walk, permission error, path is a directory, ...) yields null,
 * which callers interpret as "could not hash → fall through to re-parse".
 *
 * @param absPath Path of the file to hash.
 * @returns {Promise<string | null>} 40-character lowercase hex digest, or null on failure.
 */
async function sha1OfFile(absPath) {
    try {
        const hasher = createHash("sha1");
        // Hash the decoded string (not the raw bytes) read with "utf-8".
        hasher.update(await readFile(absPath, "utf-8"));
        return hasher.digest("hex");
    }
    catch {
        return null;
    }
}
|
|
369
|
+
/**
 * Exported for unit testing only — not part of the public API.
 *
 * Drains a legacy-hash queue: hashes each file, then re-stats it to confirm the
 * mtime has not drifted since the decision-time stat. An entry is left OUT of
 * the returned map when any of the following holds, so that a later run
 * retries it instead of trusting a sha that may not match the reused symbols:
 *   - the re-stat fails (file gone / unreadable),
 *   - the rounded mtime differs from the queued decision-time mtime,
 *   - the hash function produced no usable digest (null / empty).
 *
 * The last case used to be recorded as an empty-string sha; a non-undefined
 * value is carried forward by the mtime fast path, so the placeholder would
 * stick in the snapshot instead of being recomputed.
 *
 * Work is done in batches of PARSE_CONCURRENCY: all hashes of a batch run in
 * parallel, then all re-stats of that batch run in parallel.
 *
 * @param queue  Items from the legacyHashQueue (relPath + filePath + decision-time mtimeMs).
 * @param hashFn Injectable hash function (default: sha1OfFile). Tests inject a
 *               function that also modifies the file so they can trigger the
 *               TOCTOU drift-detection path without real concurrency.
 * @param statFn Injectable stat function (default: fs.stat). Tests can stub this
 *               to return a post-modification mtime.
 * @returns {Promise<Record<string, string>>} relPath → sha1 for every entry that
 *               hashed successfully and whose mtime is unchanged.
 */
export async function drainLegacyHashQueue(queue, hashFn = sha1OfFile, statFn = (p) => import("node:fs/promises").then((m) => m.stat(p))) {
    const result = {};
    for (let i = 0; i < queue.length; i += PARSE_CONCURRENCY) {
        const batch = queue.slice(i, i + PARSE_CONCURRENCY);
        // Hash first, stat second: the re-stat must observe any write that
        // landed while (or before) the content was being read for hashing.
        const shas = await Promise.all(batch.map((q) => hashFn(q.filePath)));
        const mtimes = await Promise.all(batch.map((q) => statFn(q.filePath).then((st) => Math.round(st.mtimeMs), () => null)));
        batch.forEach((q, j) => {
            const sha = shas[j];
            if (typeof sha !== "string" || sha.length === 0) {
                // Could not hash — omit so a later run recomputes it.
                return;
            }
            const currentMtime = mtimes[j];
            if (currentMtime === null || currentMtime !== q.mtimeMs) {
                // Mtime drifted or file gone — omit so next run re-parses.
                return;
            }
            result[q.relPath] = sha;
        });
    }
    return result;
}
|
|
310
401
|
export async function indexFolder(folderPath, options) {
|
|
311
402
|
if (!folderPath || typeof folderPath !== "string") {
|
|
312
403
|
throw new Error("folderPath is required and must be a non-empty string");
|
|
@@ -379,16 +470,87 @@ export async function indexFolder(folderPath, options) {
|
|
|
379
470
|
mtimeMap.set(f.path, f.mtime_ms);
|
|
380
471
|
}
|
|
381
472
|
}
|
|
473
|
+
// Persistent hash snapshot (Task 6): relPath → sha1 from the previous index.
|
|
474
|
+
// mtime stays the cheap pre-filter (unchanged mtime → reuse without hashing,
|
|
475
|
+
// the fastest path). When mtime *changed*, the snapshot sha1 lets us still
|
|
476
|
+
// reuse symbols for touch/checkout no-op rewrites that bumped mtime without
|
|
477
|
+
// changing content — something mtime-only logic could never catch.
|
|
478
|
+
// null when absent/corrupt/version-or-repo-mismatch → degrade to full parse.
|
|
479
|
+
const snapshotPath = getSnapshotPath(indexPath);
|
|
480
|
+
let oldSnapshot = existing
|
|
481
|
+
? await loadHashSnapshot(snapshotPath, repoName)
|
|
482
|
+
: null;
|
|
483
|
+
// Staleness guard (Task 6, CRITICAL-2): an incremental saveIncremental /
|
|
484
|
+
// removeFileFromIndex advances index.updated_at WITHOUT touching the
|
|
485
|
+
// snapshot. If saveIndex landed but the subsequent snapshot save failed (or
|
|
486
|
+
// an incremental edit ran after the last full index), the on-disk snapshot
|
|
487
|
+
// is OLDER than the index and its SHAs may no longer match the indexed
|
|
488
|
+
// symbols — carrying them forward (fast path) or sha-matching against them
|
|
489
|
+
// (changed path) would produce wrong reuse on revert+touch sequences. When
|
|
490
|
+
// the snapshot predates the index, discard it: the legacy hash-now
|
|
491
|
+
// convergence path below repopulates a fresh, correct snapshot this run.
|
|
492
|
+
// Guard uses strict inequality (!==), not <. The fresh-write contract is
|
|
493
|
+
// snapshot.created_at === index.updated_at exactly (created_at is anchored to
|
|
494
|
+
// codeIndex.updated_at, not a fresh Date.now()). So ANY mismatch — older OR
|
|
495
|
+
// newer — means the snapshot is not the one paired with this index and must
|
|
496
|
+
// be discarded. A FUTURE created_at (e.g. a snapshot written against a later,
|
|
497
|
+
// since-rolled-back index, or clock skew) is just as untrustworthy as a stale
|
|
498
|
+
// one: its SHAs may not match the indexed symbols.
|
|
499
|
+
if (oldSnapshot && existing && oldSnapshot.created_at !== existing.updated_at) {
|
|
500
|
+
console.warn(`[codesift] hash-snapshot older than index — rebuilding (${repoName})`);
|
|
501
|
+
oldSnapshot = null;
|
|
502
|
+
}
|
|
382
503
|
const filesToParse = [];
|
|
383
504
|
const keptSymbols = [];
|
|
384
505
|
const keptEntries = [];
|
|
506
|
+
// sha1 of every file in the NEW index, by relPath. Populated for reused files
|
|
507
|
+
// here (from the old snapshot when present, else hashed-now for convergence)
|
|
508
|
+
// and for parsed files after parseFiles resolves.
|
|
509
|
+
const newSnapshotFiles = {};
|
|
510
|
+
// CRITICAL-1: reused files whose sha1 must be (re)computed because the old
|
|
511
|
+
// snapshot lacks it (legacy snapshot-less index, or stale snapshot discarded
|
|
512
|
+
// above). Collected here and hashed AFTER the loop in PARSE_CONCURRENCY
|
|
513
|
+
// batches instead of one serial await per file inside the loop — on a first
|
|
514
|
+
// run after upgrade against a many-thousand-file repo the serial version cost
|
|
515
|
+
// thousands of sequential awaits. Behavior is identical, wall-clock is
|
|
516
|
+
// parallelized.
|
|
517
|
+
//
|
|
518
|
+
// mtimeMs: the mtime observed at decision time (the moment we confirmed
|
|
519
|
+
// mtime === prevMtime and placed the file in the queue). We re-stat after
|
|
520
|
+
// hashing to detect any concurrent modification that landed between the two
|
|
521
|
+
// operations. If the mtime drifted, we omit the file from newSnapshotFiles
|
|
522
|
+
// entirely — the missing sha causes the next cold run to re-parse, avoiding
|
|
523
|
+
// a snapshot that pairs new-content sha against old (reused) symbols.
|
|
524
|
+
const legacyHashQueue = [];
|
|
525
|
+
// PERF: pre-build per-file lookups ONCE before the reuse loop. Both reuse
|
|
526
|
+
// branches need (a) the existing index's symbols for a given relPath and (b)
|
|
527
|
+
// its FileEntry. Doing `existing.symbols.filter(s => s.file === relPath)` /
|
|
528
|
+
// `existing.files.find(f => f.path === relPath)` per file is O(files ×
|
|
529
|
+
// symbols) and O(files²) respectively — quadratic, and on a many-thousand
|
|
530
|
+
// file/symbol repo that dominated the reuse-heavy fast path. A single pass
|
|
531
|
+
// builds Map lookups each branch hits in O(1). Built only when there's an
|
|
532
|
+
// existing index to reuse from.
|
|
533
|
+
const symbolsByFile = new Map();
|
|
534
|
+
const fileEntryByPath = new Map();
|
|
535
|
+
if (existing) {
|
|
536
|
+
for (const sym of existing.symbols) {
|
|
537
|
+
const list = symbolsByFile.get(sym.file);
|
|
538
|
+
if (list)
|
|
539
|
+
list.push(sym);
|
|
540
|
+
else
|
|
541
|
+
symbolsByFile.set(sym.file, [sym]);
|
|
542
|
+
}
|
|
543
|
+
for (const fe of existing.files) {
|
|
544
|
+
fileEntryByPath.set(fe.path, fe);
|
|
545
|
+
}
|
|
546
|
+
}
|
|
385
547
|
if (mtimeMap.size > 0) {
|
|
386
548
|
const { stat } = await import("node:fs/promises");
|
|
387
549
|
for (const filePath of files) {
|
|
388
550
|
const relPath = relative(rootPath, filePath);
|
|
389
551
|
const prevMtime = mtimeMap.get(relPath);
|
|
390
552
|
if (prevMtime !== undefined) {
|
|
391
|
-
const fileEntry =
|
|
553
|
+
const fileEntry = fileEntryByPath.get(relPath);
|
|
392
554
|
// Force re-parse if file is marked stale (callee signature changed)
|
|
393
555
|
if (fileEntry?.stale) {
|
|
394
556
|
filesToParse.push(filePath);
|
|
@@ -397,14 +559,47 @@ export async function indexFolder(folderPath, options) {
|
|
|
397
559
|
try {
|
|
398
560
|
const st = await stat(filePath);
|
|
399
561
|
if (Math.round(st.mtimeMs) === prevMtime) {
|
|
400
|
-
//
|
|
401
|
-
const fileSymbols =
|
|
562
|
+
// Fast path: mtime unchanged → reuse symbols without hashing.
|
|
563
|
+
const fileSymbols = symbolsByFile.get(relPath) ?? [];
|
|
402
564
|
if (fileEntry) {
|
|
403
565
|
keptSymbols.push(...fileSymbols);
|
|
404
566
|
keptEntries.push(fileEntry);
|
|
567
|
+
// Carry the sha1 forward: reuse from old snapshot if present,
|
|
568
|
+
// else DEFER hashing so legacy (snapshot-less) indexes converge
|
|
569
|
+
// to a complete snapshot after one run — without paying a serial
|
|
570
|
+
// hash per file inside this loop.
|
|
571
|
+
const carried = oldSnapshot?.files[relPath];
|
|
572
|
+
if (carried !== undefined) {
|
|
573
|
+
newSnapshotFiles[relPath] = carried;
|
|
574
|
+
}
|
|
575
|
+
else {
|
|
576
|
+
legacyHashQueue.push({ relPath, filePath, mtimeMs: Math.round(st.mtimeMs) });
|
|
577
|
+
}
|
|
405
578
|
continue;
|
|
406
579
|
}
|
|
407
580
|
}
|
|
581
|
+
else {
|
|
582
|
+
// mtime changed — hash decides reuse vs re-parse. This catches
|
|
583
|
+
// touch/checkout that bumped mtime without changing content.
|
|
584
|
+
const snapSha = oldSnapshot?.files[relPath];
|
|
585
|
+
if (snapSha !== undefined && fileEntry && !fileEntry.stale) {
|
|
586
|
+
const currentSha = await sha1OfFile(filePath);
|
|
587
|
+
if (currentSha !== null && currentSha === snapSha) {
|
|
588
|
+
const fileSymbols = symbolsByFile.get(relPath) ?? [];
|
|
589
|
+
keptSymbols.push(...fileSymbols);
|
|
590
|
+
// FIX: the file's mtime changed but content is identical (touch /
|
|
591
|
+
// checkout no-op rewrite). Reuse the symbols, but DON'T carry the
|
|
592
|
+
// stale FileEntry verbatim — its mtime_ms still holds the OLD
|
|
593
|
+
// mtime, so every future run would see mtime !== prevMtime and
|
|
594
|
+
// re-hash this file forever, permanently degrading it off the
|
|
595
|
+
// mtime fast path. Clone the entry with mtime_ms bumped to the
|
|
596
|
+
// CURRENT stat's mtime so the next run takes the cheap fast path.
|
|
597
|
+
keptEntries.push({ ...fileEntry, mtime_ms: Math.round(st.mtimeMs) });
|
|
598
|
+
newSnapshotFiles[relPath] = currentSha;
|
|
599
|
+
continue;
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
}
|
|
408
603
|
}
|
|
409
604
|
catch { /* file may have been deleted — reparse */ }
|
|
410
605
|
}
|
|
@@ -414,10 +609,32 @@ export async function indexFolder(folderPath, options) {
|
|
|
414
609
|
else {
|
|
415
610
|
filesToParse.push(...files);
|
|
416
611
|
}
|
|
612
|
+
// Drain the deferred legacy-hash queue (CRITICAL-1): files reused via the
|
|
613
|
+
// mtime fast path that had no carried sha1 (legacy snapshot-less index, or a
|
|
614
|
+
// stale snapshot discarded by the guard above). See drainLegacyHashQueue for
|
|
615
|
+
// the TOCTOU guard details — entries whose mtime drifted between decision
|
|
616
|
+
// time and hash time are omitted so the next run re-parses rather than
|
|
617
|
+
// reusing symbols against a mismatched sha.
|
|
618
|
+
if (legacyHashQueue.length > 0) {
|
|
619
|
+
const drained = await drainLegacyHashQueue(legacyHashQueue);
|
|
620
|
+
Object.assign(newSnapshotFiles, drained);
|
|
621
|
+
}
|
|
417
622
|
// Parse only changed/new files
|
|
418
|
-
const { symbols: parsedSymbols, fileEntries: parsedEntries } = await parseFiles(filesToParse, rootPath, repoName);
|
|
623
|
+
const { symbols: parsedSymbols, fileEntries: parsedEntries, shas: parsedShas } = await parseFiles(filesToParse, rootPath, repoName);
|
|
419
624
|
const symbols = [...keptSymbols, ...parsedSymbols];
|
|
420
625
|
const fileEntries = [...keptEntries, ...parsedEntries];
|
|
626
|
+
// Record sha1s for the files that were actually parsed (changed/new).
|
|
627
|
+
// CRITICAL-1 (TOCTOU): these hashes come straight from parseOneFile — they
|
|
628
|
+
// are the sha1 of the EXACT source string that produced the symbols, so the
|
|
629
|
+
// snapshot can never pair old symbols with a newer file's sha. Only entries
|
|
630
|
+
// that survived parseFiles (parseOneFile returned non-null) have a sha here,
|
|
631
|
+
// keeping the snapshot in lockstep with fileEntries. The previous post-parse
|
|
632
|
+
// double-read loop is gone — one fewer full read per parsed file.
|
|
633
|
+
for (const entry of parsedEntries) {
|
|
634
|
+
const sha = parsedShas[entry.path];
|
|
635
|
+
if (sha !== undefined)
|
|
636
|
+
newSnapshotFiles[entry.path] = sha;
|
|
637
|
+
}
|
|
421
638
|
// Dirty propagation: detect signature changes and mark caller files stale
|
|
422
639
|
if (existing && filesToParse.length > 0 && filesToParse.length < files.length) {
|
|
423
640
|
const staleFiles = propagateDirtySignatures(existing.symbols, symbols, fileEntries);
|
|
@@ -425,25 +642,209 @@ export async function indexFolder(folderPath, options) {
|
|
|
425
642
|
console.error(`[codesift] Dirty propagation: ${staleFiles.size} caller files marked stale`);
|
|
426
643
|
}
|
|
427
644
|
}
|
|
428
|
-
//
|
|
429
|
-
|
|
430
|
-
bm25Indexes.set(repoName, bm25);
|
|
645
|
+
// Invalidate code index cache (BM25 is rebuilt below from the FINAL symbol
|
|
646
|
+
// set — possibly merged with out-of-scope existing symbols, see merge block).
|
|
431
647
|
codeIndexes.delete(repoName);
|
|
432
648
|
// Sanity check: don't overwrite a complete index with a partial one
|
|
433
|
-
// (WASM crash or walk failure can produce truncated results)
|
|
649
|
+
// (WASM crash or walk failure can produce truncated results).
|
|
650
|
+
//
|
|
651
|
+
// IMPORTANT: skip the guard when the walk was explicitly narrowed — either
|
|
652
|
+
// max_files was hit (truncated at cap) or include_paths scoped the walk to a
|
|
653
|
+
// subdirectory. In both cases the small result count is EXPECTED and rejecting
|
|
654
|
+
// it would be a false positive (the "1139 vs 9512" bug class). For unrestricted
|
|
655
|
+
// walks the guard stays as-is, protecting against genuine silent truncations.
|
|
656
|
+
//
|
|
657
|
+
// CRITICAL (T7 correctness fix): skipping the guard is necessary but NOT
|
|
658
|
+
// sufficient. A scoped/capped walk only SEES a narrow slice of the repo; if we
|
|
659
|
+
// persisted that slice as the WHOLE index we would wipe every out-of-scope
|
|
660
|
+
// file's symbols from index+snapshot (worse than the guard's old reject,
|
|
661
|
+
// which at least preserved the prior index). So for scoped/capped walks with
|
|
662
|
+
// an existing index we MERGE: keep out-of-scope existing entries verbatim and
|
|
663
|
+
// overlay the walk's results. See the merge block below.
|
|
664
|
+
//
|
|
665
|
+
// "max_files hit" detection: files.length === effective maxFiles. This is the
|
|
666
|
+
// only signal walkDirectory exposes (it sets limitReached internally but does
|
|
667
|
+
// not surface it on the return value). A 1-in-a-million exact-count false
|
|
668
|
+
// positive (repo has exactly maxFiles parseable files) is accepted — the
|
|
669
|
+
// guard skip is conservative (allows write), not destructive.
|
|
434
670
|
const DROP_THRESHOLD = 0.5; // Reject if new index has <50% of old file count
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
671
|
+
const walkExplicitlyCapped = hitFileLimit;
|
|
672
|
+
const walkExplicitlyScoped = options?.include_paths !== undefined && options.include_paths.length > 0;
|
|
673
|
+
// MIN_GUARD_FILES: the unrestricted guard only arms above this existing
|
|
674
|
+
// file_count (`existing.file_count > 50` below). The scoped-granularity guard
|
|
675
|
+
// mirrors that shape against the in-scope subset so a tiny scope can't be
|
|
676
|
+
// rejected on noise. Single source of truth so both guards stay in lockstep.
|
|
677
|
+
const MIN_GUARD_FILES = 50;
|
|
678
|
+
if (walkExplicitlyCapped || walkExplicitlyScoped) {
|
|
679
|
+
// ROUND-2 FIX (scoped-granularity guard): the unrestricted guard is skipped
|
|
680
|
+
// for scoped/capped walks because a small *overall* result is expected. But
|
|
681
|
+
// that skip was total — a scoped walk that aborts mid-enumeration (WASM
|
|
682
|
+
// crash, transient FS error, an over-broad exclude) silently truncates the
|
|
683
|
+
// IN-SCOPE slice, and the merge below treats every unwalked in-scope file as
|
|
684
|
+
// a deletion → wipes it from index+snapshot. So for a purely SCOPED (uncapped)
|
|
685
|
+
// walk we re-arm a guard against the IN-SCOPE subset: if the walk enumerated
|
|
686
|
+
// far fewer in-scope files than the existing index held in that same scope,
|
|
687
|
+
// AND those files are still on disk, the enumeration was truncated → reject
|
|
688
|
+
// before any merge/save, leaving the old index+snapshot intact.
|
|
689
|
+
//
|
|
690
|
+
// Capped walks are intentionally EXEMPT: a cap means unseen ≠ deleted (the
|
|
691
|
+
// merge preserves all unwalked files), so there is no truncation to detect —
|
|
692
|
+
// nothing in-scope is dropped. A walk that is BOTH scoped and capped also
|
|
693
|
+
// takes capped semantics (preserve everything unwalked), so the same
|
|
694
|
+
// exemption applies — no in-scope file can be lost.
|
|
695
|
+
if (walkExplicitlyScoped && !walkExplicitlyCapped && existing) {
|
|
696
|
+
const includePaths = options.include_paths;
|
|
697
|
+
const inScopeRel = (relPath) => includePaths.some((p) => relPath.startsWith(p)); // mirror walkDirectory
|
|
698
|
+
const existingInScope = existing.files.filter((fe) => inScopeRel(fe.path));
|
|
699
|
+
// All walked files are in scope by construction (walkDirectory honored
|
|
700
|
+
// includePaths), so walkedInScope is simply the walk's file count.
|
|
701
|
+
const walkedInScope = fileEntries.length;
|
|
702
|
+
if (existingInScope.length > MIN_GUARD_FILES &&
|
|
703
|
+
walkedInScope < existingInScope.length * DROP_THRESHOLD) {
|
|
704
|
+
// Auto-heal analog (in-scope): the shrink may be a genuine mass deletion
|
|
705
|
+
// within the scope, not a truncated walk. Sample the existing in-scope
|
|
706
|
+
// paths on disk (mirrors isExistingIndexStale, but restricted to the
|
|
707
|
+
// scope) — if most are gone, accept the merge.
|
|
708
|
+
const inScopePaths = existingInScope.map((fe) => fe.path);
|
|
709
|
+
const stride = Math.max(1, Math.floor(inScopePaths.length / STALE_SAMPLE_LIMIT));
|
|
710
|
+
const sampled = [];
|
|
711
|
+
for (let i = 0; i < inScopePaths.length && sampled.length < STALE_SAMPLE_LIMIT; i += stride) {
|
|
712
|
+
const p = inScopePaths[i];
|
|
713
|
+
if (p)
|
|
714
|
+
sampled.push(p);
|
|
715
|
+
}
|
|
716
|
+
let missing = 0;
|
|
717
|
+
await Promise.all(sampled.map(async (relPath) => {
|
|
718
|
+
try {
|
|
719
|
+
await stat(join(rootPath, relPath));
|
|
720
|
+
}
|
|
721
|
+
catch {
|
|
722
|
+
missing++;
|
|
723
|
+
}
|
|
724
|
+
}));
|
|
725
|
+
const mostGone = missing >= sampled.length * STALE_MISSING_FRACTION;
|
|
726
|
+
if (mostGone) {
|
|
727
|
+
console.error(`[codesift] Scoped sanity auto-heal for ${repoName}: walked ` +
|
|
728
|
+
`${walkedInScope} of ${existingInScope.length} in-scope files but ` +
|
|
729
|
+
`most sampled in-scope paths no longer exist on disk. Accepting ` +
|
|
730
|
+
`scoped merge (legit in-scope mass deletion).`);
|
|
731
|
+
}
|
|
732
|
+
else {
|
|
733
|
+
console.error(`[codesift] SCOPED SANITY CHECK FAILED for ${repoName}: scoped walk ` +
|
|
734
|
+
`under-enumerated — walked ${walkedInScope} of ${existingInScope.length} ` +
|
|
735
|
+
`in-scope files, which still exist on disk. Keeping old index.`);
|
|
736
|
+
return {
|
|
737
|
+
repo: repoName,
|
|
738
|
+
root: rootPath,
|
|
739
|
+
file_count: existing.file_count,
|
|
740
|
+
symbol_count: existing.symbol_count,
|
|
741
|
+
duration_ms: Date.now() - startTime,
|
|
742
|
+
status: "rejected_partial",
|
|
743
|
+
reason: `scoped walk under-enumerated: walked ${walkedInScope} of ${existingInScope.length} in-scope files (still on disk) — kept old index, nothing was re-registered`,
|
|
744
|
+
hint: "If the in-scope shrink is expected (deleted files, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
|
|
745
|
+
};
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
const detail = walkExplicitlyCapped
|
|
750
|
+
? `max_files=${maxFiles} hit (${files.length} files returned)`
|
|
751
|
+
: `include_paths=[${options.include_paths.join(", ")}]`;
|
|
752
|
+
console.error(`[codesift] sanity guard skipped: walk explicitly capped/scoped (${detail})`);
|
|
753
|
+
}
|
|
754
|
+
else if (existing && fileEntries.length < existing.file_count * DROP_THRESHOLD && existing.file_count > MIN_GUARD_FILES) {
|
|
755
|
+
// The shrink can also mean the OLD index is the bogus one: an earlier
|
|
756
|
+
// walker may have swept since-deleted trees (.worktrees/, vendored dirs),
|
|
757
|
+
// permanently inflating the baseline so every honest reindex looks
|
|
758
|
+
// truncated and gets rejected forever. Disambiguate by sampling the old
|
|
759
|
+
// index's paths: if most of them no longer exist on disk, the old index
|
|
760
|
+
// is stale dead weight — accept the new result instead of keeping it.
|
|
761
|
+
if (await isExistingIndexStale(existing, rootPath)) {
|
|
762
|
+
console.error(`[codesift] Sanity check auto-heal for ${repoName}: old index has ` +
|
|
763
|
+
`${existing.file_count} files but most sampled paths no longer exist ` +
|
|
764
|
+
`on disk. Accepting new index (${fileEntries.length} files).`);
|
|
765
|
+
}
|
|
766
|
+
else {
|
|
767
|
+
console.error(`[codesift] SANITY CHECK FAILED for ${repoName}: ` +
|
|
768
|
+
`new index has ${fileEntries.length} files vs ${existing.file_count} previously. ` +
|
|
769
|
+
`Keeping old index. Use invalidate_cache + index_folder to force reindex.`);
|
|
770
|
+
return {
|
|
771
|
+
repo: repoName,
|
|
772
|
+
root: rootPath,
|
|
773
|
+
file_count: existing.file_count,
|
|
774
|
+
symbol_count: existing.symbol_count,
|
|
775
|
+
duration_ms: Date.now() - startTime,
|
|
776
|
+
status: "rejected_partial",
|
|
777
|
+
reason: `new walk found ${fileEntries.length} files, <50% of the ${existing.file_count} previously indexed — kept old index, nothing was re-registered`,
|
|
778
|
+
hint: "If the shrink is expected (deleted trees, new excludes), run invalidate_cache then index_folder to rebuild from scratch.",
|
|
779
|
+
};
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
// ── MERGE-persist for scoped/capped walks (T7 correctness fix) ────────────
|
|
783
|
+
// A scoped (include_paths) or capped (max_files-hit) walk only enumerated a
|
|
784
|
+
// slice of the repo. Persisting that slice verbatim would delete every
|
|
785
|
+
// out-of-scope file's symbols from index+snapshot. When an existing index is
|
|
786
|
+
// present we instead MERGE: preserve out-of-scope existing entries/symbols/
|
|
787
|
+
// shas and overlay the walk's results.
|
|
788
|
+
//
|
|
789
|
+
// - include_paths scoped (and NOT capped): "scope" = files whose relPath is
|
|
790
|
+
// under any include root (mirror walkDirectory's relPath.startsWith(p)
|
|
791
|
+
// test EXACTLY). Out-of-scope existing files are preserved verbatim;
|
|
792
|
+
// in-scope existing files NOT in the walk set W are dropped (genuine
|
|
793
|
+
// in-scope deletions — the walk fully enumerated the scope).
|
|
794
|
+
// - capped (max_files hit): scope is UNDEFINED — the cap means an unseen
|
|
795
|
+
// file is not necessarily deleted. Preserve ALL existing entries not in W,
|
|
796
|
+
// overlay W. (If a capped walk also passed include_paths, the cap makes the
|
|
797
|
+
// in-scope enumeration incomplete too, so we still only trust W and
|
|
798
|
+
// preserve everything else — capped semantics win.)
|
|
799
|
+
//
|
|
800
|
+
// First run (no existing index) with a scoped/capped walk → save what we have
|
|
801
|
+
// (current behavior, documented): there is nothing to preserve.
|
|
802
|
+
let mergedSymbols = symbols;
|
|
803
|
+
let mergedEntries = fileEntries;
|
|
804
|
+
let mergedSnapshotFiles = newSnapshotFiles;
|
|
805
|
+
if ((walkExplicitlyCapped || walkExplicitlyScoped) && existing) {
|
|
806
|
+
const walkedPaths = new Set(fileEntries.map((fe) => fe.path));
|
|
807
|
+
// A capped walk has undefined scope (unseen ≠ deleted), so it preserves
|
|
808
|
+
// everything not walked. A purely scoped (uncapped) walk additionally drops
|
|
809
|
+
// in-scope-but-unwalked files, since the walk fully enumerated the scope.
|
|
810
|
+
const includePaths = options?.include_paths;
|
|
811
|
+
const inScope = (relPath) => {
|
|
812
|
+
if (walkExplicitlyCapped)
|
|
813
|
+
return false; // cap → never treat as deletable
|
|
814
|
+
if (!includePaths || includePaths.length === 0)
|
|
815
|
+
return false;
|
|
816
|
+
// Mirror walkDirectory's include-path filter exactly.
|
|
817
|
+
return includePaths.some((p) => relPath.startsWith(p));
|
|
445
818
|
};
|
|
819
|
+
const preservedEntries = [];
|
|
820
|
+
const preservedFilePaths = new Set();
|
|
821
|
+
for (const fe of existing.files) {
|
|
822
|
+
if (walkedPaths.has(fe.path))
|
|
823
|
+
continue; // walk result wins for these
|
|
824
|
+
if (inScope(fe.path))
|
|
825
|
+
continue; // in-scope + not walked = deleted-in-scope
|
|
826
|
+
preservedEntries.push(fe);
|
|
827
|
+
preservedFilePaths.add(fe.path);
|
|
828
|
+
}
|
|
829
|
+
const preservedSymbols = existing.symbols.filter((s) => preservedFilePaths.has(s.file));
|
|
830
|
+
mergedEntries = [...preservedEntries, ...fileEntries];
|
|
831
|
+
mergedSymbols = [...preservedSymbols, ...symbols];
|
|
832
|
+
// Snapshot: preserve out-of-scope shas, overlay walked ones.
|
|
833
|
+
mergedSnapshotFiles = {};
|
|
834
|
+
if (oldSnapshot) {
|
|
835
|
+
for (const relPath of preservedFilePaths) {
|
|
836
|
+
const sha = oldSnapshot.files[relPath];
|
|
837
|
+
if (sha !== undefined)
|
|
838
|
+
mergedSnapshotFiles[relPath] = sha;
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
Object.assign(mergedSnapshotFiles, newSnapshotFiles);
|
|
446
842
|
}
|
|
843
|
+
// Build and cache BM25 index from the FINAL (possibly merged) symbol set.
|
|
844
|
+
// Built here (not before the guard) so a rejected_partial early-return leaves
|
|
845
|
+
// the previous in-memory BM25 index intact rather than swapping in a partial.
|
|
846
|
+
const bm25 = buildBM25Index(mergedSymbols);
|
|
847
|
+
bm25Indexes.set(repoName, bm25);
|
|
447
848
|
// Resolve workspaces (Task 7) — runs before persistence so collectImportEdges
|
|
448
849
|
// and other downstream consumers see the populated `workspaces` field.
|
|
449
850
|
// Gated behind CODESIFT_DISABLE_MONOREPO=1 kill switch (spec D-FB).
|
|
@@ -460,24 +861,49 @@ export async function indexFolder(folderPath, options) {
|
|
|
460
861
|
// mode is the safe fallback.
|
|
461
862
|
}
|
|
462
863
|
}
|
|
463
|
-
// Build and save code index
|
|
864
|
+
// Build and save code index from the FINAL (possibly merged) sets.
|
|
464
865
|
const codeIndex = {
|
|
465
866
|
repo: repoName,
|
|
466
867
|
root: rootPath,
|
|
467
|
-
symbols,
|
|
468
|
-
files:
|
|
868
|
+
symbols: mergedSymbols,
|
|
869
|
+
files: mergedEntries,
|
|
469
870
|
created_at: Date.now(),
|
|
470
871
|
updated_at: Date.now(),
|
|
471
|
-
symbol_count:
|
|
472
|
-
file_count:
|
|
872
|
+
symbol_count: mergedSymbols.length,
|
|
873
|
+
file_count: mergedEntries.length,
|
|
473
874
|
extractor_version: { ...EXTRACTOR_VERSIONS },
|
|
474
875
|
...(workspaces ? { workspaces } : {}),
|
|
475
876
|
};
|
|
476
877
|
await saveIndex(indexPath, codeIndex);
|
|
878
|
+
// Persist the hash snapshot AFTER the index lands (mirrors registerRepo
|
|
879
|
+
// ordering) and only on the success path — the rejected_partial branch
|
|
880
|
+
// returned earlier, leaving the previous snapshot intact. Non-fatal: the
|
|
881
|
+
// snapshot is a reuse-optimization cache; a write failure just costs a full
|
|
882
|
+
// re-parse next run, so we warn and continue.
|
|
883
|
+
try {
|
|
884
|
+
const newSnapshot = {
|
|
885
|
+
version: HASH_SNAPSHOT_VERSION,
|
|
886
|
+
repo: repoName,
|
|
887
|
+
// CRITICAL-2 (created_at race): use the EXACT timestamp serialized into
|
|
888
|
+
// the index, not a fresh Date.now(). A watcher's saveIncremental that
|
|
889
|
+
// lands between saveIndex and this write would otherwise leave the
|
|
890
|
+
// snapshot OLDER than created_at, blinding the staleness guard above. By
|
|
891
|
+
// anchoring to codeIndex.updated_at, snapshot.created_at === the index's
|
|
892
|
+
// updated_at on a fresh write, so any later incremental strictly advances
|
|
893
|
+
// index.updated_at past it and the guard fires correctly.
|
|
894
|
+
created_at: codeIndex.updated_at,
|
|
895
|
+
files: mergedSnapshotFiles,
|
|
896
|
+
};
|
|
897
|
+
await saveHashSnapshot(snapshotPath, newSnapshot);
|
|
898
|
+
}
|
|
899
|
+
catch (err) {
|
|
900
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
901
|
+
console.warn(`[codesift] hash-snapshot save failed for ${repoName} (non-fatal): ${msg}`);
|
|
902
|
+
}
|
|
477
903
|
// Embed symbols and chunks in background (non-fatal, don't block MCP response)
|
|
478
904
|
// Large repos (71K symbols) can take minutes — fire-and-forget to prevent timeout
|
|
479
|
-
embedSymbols(
|
|
480
|
-
.then(() => embedChunks(
|
|
905
|
+
embedSymbols(mergedSymbols, indexPath, repoName, config)
|
|
906
|
+
.then(() => embedChunks(mergedEntries, rootPath, repoName, indexPath, config, mergedSymbols))
|
|
481
907
|
.catch((err) => {
|
|
482
908
|
const msg = err instanceof Error ? err.message : String(err);
|
|
483
909
|
console.error(`[codesift] Background embedding failed for ${repoName}: ${msg}`);
|
|
@@ -496,8 +922,8 @@ export async function indexFolder(folderPath, options) {
|
|
|
496
922
|
name: repoName,
|
|
497
923
|
root: rootPath,
|
|
498
924
|
index_path: indexPath,
|
|
499
|
-
symbol_count:
|
|
500
|
-
file_count:
|
|
925
|
+
symbol_count: mergedSymbols.length,
|
|
926
|
+
file_count: mergedEntries.length,
|
|
501
927
|
updated_at: Date.now(),
|
|
502
928
|
};
|
|
503
929
|
await registerRepo(config.registryPath, meta);
|
|
@@ -520,7 +946,7 @@ export async function indexFolder(folderPath, options) {
|
|
|
520
946
|
try {
|
|
521
947
|
const { detectFrameworks } = await import("../utils/framework-detect.js");
|
|
522
948
|
const { enableFrameworkToolBundle } = await import("../register-tools.js");
|
|
523
|
-
const tempIndex = { root: rootPath, files:
|
|
949
|
+
const tempIndex = { root: rootPath, files: mergedEntries, symbols: mergedSymbols };
|
|
524
950
|
const frameworks = detectFrameworks(tempIndex);
|
|
525
951
|
for (const fw of frameworks) {
|
|
526
952
|
const enabled = enableFrameworkToolBundle(fw);
|
|
@@ -538,8 +964,8 @@ export async function indexFolder(folderPath, options) {
|
|
|
538
964
|
return {
|
|
539
965
|
repo: repoName,
|
|
540
966
|
root: rootPath,
|
|
541
|
-
file_count:
|
|
542
|
-
symbol_count:
|
|
967
|
+
file_count: mergedEntries.length,
|
|
968
|
+
symbol_count: mergedSymbols.length,
|
|
543
969
|
duration_ms: Date.now() - startTime,
|
|
544
970
|
};
|
|
545
971
|
}
|
|
@@ -778,7 +1204,8 @@ export async function invalidateCache(repoName) {
|
|
|
778
1204
|
const chunkPath = getChunkPath(meta.index_path);
|
|
779
1205
|
const chunkEmbeddingPath = getChunkEmbeddingPath(meta.index_path);
|
|
780
1206
|
const graphStorePath = getGraphPath(meta.index_path);
|
|
781
|
-
|
|
1207
|
+
const snapshotPath = getSnapshotPath(meta.index_path);
|
|
1208
|
+
for (const fp of [meta.index_path, embeddingPath, embeddingMetaPath, chunkPath, chunkEmbeddingPath, graphStorePath, snapshotPath]) {
|
|
782
1209
|
try {
|
|
783
1210
|
await unlink(fp);
|
|
784
1211
|
}
|