sweet-search 2.5.13 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +36 -9
  2. package/core/cli.js +41 -3
  3. package/core/embedding/embedding-local-model.js +106 -10
  4. package/core/embedding/embedding-service.js +59 -1
  5. package/core/embedding/model-client.mjs +257 -0
  6. package/core/embedding/model-server.mjs +217 -0
  7. package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
  8. package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
  9. package/core/incremental-indexing/application/operator-cli.mjs +14 -5
  10. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
  11. package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
  12. package/core/incremental-indexing/application/reconciler.mjs +87 -15
  13. package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
  14. package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
  15. package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
  16. package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
  17. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
  18. package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
  19. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
  20. package/core/indexing/artifact-builder.js +1 -1
  21. package/core/indexing/dedup/dedup-phase.js +36 -17
  22. package/core/indexing/dedup/exemplar-selector.js +5 -0
  23. package/core/indexing/index-codebase-v21.js +37 -14
  24. package/core/indexing/index-maintainer.mjs +337 -6
  25. package/core/indexing/indexer-ann.js +27 -434
  26. package/core/indexing/indexer-build.js +30 -14
  27. package/core/indexing/indexer-manifest.js +0 -3
  28. package/core/indexing/indexer-phases.js +101 -25
  29. package/core/indexing/maintainer-launcher.mjs +22 -0
  30. package/core/indexing/maintainer-watcher.mjs +397 -0
  31. package/core/indexing/os-priority.mjs +160 -0
  32. package/core/indexing/rss-budget.mjs +425 -0
  33. package/core/indexing/streaming-vectors.js +450 -0
  34. package/core/infrastructure/config/platform.js +14 -10
  35. package/core/infrastructure/onnx-session-utils.js +37 -0
  36. package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
  37. package/core/ranking/late-interaction-index.js +58 -7
  38. package/core/search/daemon-registry.js +199 -0
  39. package/core/search/search-read-semantic.js +9 -3
  40. package/core/search/search-semantic.js +6 -29
  41. package/core/search/search-server.js +527 -27
  42. package/core/search/session-daemon-prewarm.mjs +110 -1
  43. package/core/search/sweet-search.js +0 -38
  44. package/core/vector-store/binary-hnsw-index.js +692 -78
  45. package/core/vector-store/index.js +1 -4
  46. package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
  47. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
  48. package/eval/agent-read-workflows/bin/ss-read +2 -0
  49. package/mcp/tool-handlers.js +1 -2
  50. package/package.json +11 -8
  51. package/scripts/uninstall.js +2 -0
  52. package/core/vector-store/hnsw-index.js +0 -751
@@ -0,0 +1,450 @@
1
+ /**
2
+ * Streaming vectors + late-interaction builder (bounded-memory full rebuild).
3
+ *
4
+ * WHY THIS EXISTS
5
+ * ---------------
6
+ * The default in-memory vectors path (`buildVectorIndex` ‖ `buildLateInteractionIndex`
7
+ * driven from `buildVectorsAndArtifactsPhase`) materialises the ENTIRE chunk
8
+ * corpus at once: `chunkFiles()` returns every chunk + every embed-text, dedup
9
+ * annotates them in place, the embed pass holds all exemplar embeddings + insert
10
+ * rows, and the LI pass keeps every doc's per-token slab in `this.documents`.
11
+ * Peak heap is O(repo). On large repositories (e.g. tursodatabase/libsql ≈ 431k
12
+ * chunks, swc-project/swc ≈ 217k chunks / 180k exemplars) that blows the default
13
+ * ~4 GB Node heap / a RAM-limited box and the indexer crashes — regardless of
14
+ * the encoder backend (CUDA, Metal, CoreML, or ORT-CPU), because the hogs live
15
+ * in the JS layer, not the model.
16
+ *
17
+ * WHAT THIS DOES
18
+ * --------------
19
+ * Streams the same pipeline in bounded windows so peak heap is O(window):
20
+ *
21
+ * 1. PARSE+SPILL — parse files in file-windows (reusing `chunkFiles`), compute
22
+ * dedup fingerprints, apply the LI skip policy (content in hand), and spill
23
+ * each chunk to a temp SQLite store. Only lightweight per-chunk records
24
+ * (id, text length, path/hash, fingerprint, li-keep flag) stay resident.
25
+ * 2. DEDUP — cluster the resident fingerprints GLOBALLY (identical to the
26
+ * in-memory path — needed so dup-heavy repos keep their 94%-alias short-cut
27
+ * instead of re-embedding everything) and annotate the lightweight records.
28
+ * 3. EMBED — stream exemplars in chunk-range windows, hydrate from the
29
+ * store, and insert via the UNCHANGED `pipelinedEmbedAndInsert`
30
+ * (→ `callLocalModelBucketed`: the cache-aware compute-batching the README
31
+ * documents is untouched).
32
+ * 4. ALIAS — stream aliases in windows, copy exemplar vectors via the
33
+ * UNCHANGED `insertAliasVectors`.
34
+ * 5. LI — hand LI-lite records (exemplar token-text only) to the
35
+ * UNCHANGED `buildLateInteractionIndex` in bounded build mode, so per-token
36
+ * slabs are flushed to segments and evicted (peak O(one segment)).
37
+ *
38
+ * On-disk output is byte-for-byte the same format the in-memory path produces
39
+ * (codebase.db vectors + atomic swap; SSLX-v3 LI segments). Small repos and
40
+ * incremental runs keep the original in-memory path untouched (see the gate in
41
+ * buildVectorsAndArtifactsPhase), so benchmark indexes are unaffected.
42
+ */
43
+
44
+ import { createHash } from 'crypto';
45
+ import fs from 'fs/promises';
46
+ import { existsSync } from 'fs';
47
+ import path from 'path';
48
+
49
+ import {
50
+ DB_PATHS,
51
+ EMBEDDING_CONFIG,
52
+ PROJECT_ROOT,
53
+ DEDUP_CONFIG,
54
+ LATE_INTERACTION_CONFIG,
55
+ } from '../infrastructure/config/index.js';
56
+ import {
57
+ isDedupAvailable,
58
+ computeFingerprints,
59
+ clusterFingerprints,
60
+ } from '../infrastructure/index.js';
61
+ import { annotateDedupClusters } from './dedup/dedup-phase.js';
62
+ import {
63
+ chunkFiles,
64
+ createVectorSchema,
65
+ pipelinedEmbedAndInsert,
66
+ insertAliasVectors,
67
+ } from './indexer-build.js';
68
+ import { buildLateInteractionIndex } from './indexer-ann.js';
69
+ import {
70
+ configureJournalMode,
71
+ checkpointWal,
72
+ atomicSwapDatabase,
73
+ log,
74
+ logProgress,
75
+ } from './indexer-utils.js';
76
+
77
/**
 * Read a window-size override from the environment.
 *
 * Window sizes drive `windows()` / `fileWindows()` loop strides, so a zero,
 * negative, fractional, or non-numeric override must never reach them: a
 * non-positive stride never advances, and a fractional one produces
 * non-integer array indices. Anything that is not a positive safe integer
 * falls back to the default.
 *
 * @param {string} name      environment variable name
 * @param {number} fallback  default used when the override is absent/invalid
 * @returns {number} a positive integer window size
 */
function streamWindowFromEnv(name, fallback) {
  const parsed = Number(process.env[name]);
  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Files parsed per chunkFiles() call. Bounds the transient parse working set.
const PARSE_FILE_WINDOW = streamWindowFromEnv('SWEET_SEARCH_STREAM_PARSE_FILES', 2000);
// Aliases hydrated per insert window. Bounds the transient alias working set.
// Alias rows just copy the exemplar's vector + deterministic metadata, so the
// window size does NOT affect the resulting index — only peak memory.
const HYDRATE_CHUNK_WINDOW = streamWindowFromEnv('SWEET_SEARCH_STREAM_HYDRATE_CHUNKS', 50_000);
// Exemplars embedded per call. The embedding written for a chunk is determined
// by callLocalModelBucketed's bucketing over the set it's handed, so to keep the
// index BYTE-IDENTICAL to the in-memory path (which embeds all exemplars in one
// call) the streaming path must also embed all exemplars in ONE call whenever
// they fit. This window is sized well above any repo that could have indexed
// in-memory before (~4 GB heap OOMs in-memory well under this exemplar count),
// so every repo with a valid "before" gets the identical single embed call.
// Only a repo too huge to ever have indexed in-memory splits into multiple
// embed windows — and that repo has no prior index to differ from. On CPU the
// per-chunk embedding is batch-independent (identical even when windowed); the
// single call only matters for GPU FP-reassociation across batch shapes.
const EMBED_WINDOW = streamWindowFromEnv('SWEET_SEARCH_STREAM_EMBED_WINDOW', 200_000);
95
+
96
+ // ── small replicas of indexer-build internals (kept private here) ──
97
+
98
/**
 * Mirror of chunkFiles()'s embed-text cap (ast-chunker getEmbedTextCap).
 *
 * Reads SWEET_SEARCH_EMBED_TEXT_CAP as a base-10 integer and returns it when
 * it is at least 500; otherwise returns the default of 2000.
 *
 * @returns {number} maximum number of characters of embed text to keep
 */
function embedTextCap() {
  const raw = process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '';
  const parsed = parseInt(raw, 10);
  if (!Number.isFinite(parsed) || parsed < 500) return 2000;
  return parsed;
}
103
+
104
/**
 * Mirror of indexer-build.js chunkFilePath (not exported).
 *
 * Walks the chunk's path-like fields in priority order (metadata.relative_path,
 * metadata.path, metadata.file_path, chunk.file, metadata.file) and returns the
 * first one that normalises to a usable relative path. Normalisation converts
 * backslashes to '/', strips one leading './', and collapses repeated slashes.
 * Candidates that are empty, '.', absolute ('/...' or 'X:/...'), or that
 * contain a parent-directory step ('..', '../...', '.../../...') are rejected.
 *
 * @param {object} chunk  chunk record (may be null/undefined)
 * @returns {string} normalised relative path, or '' when no candidate qualifies
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  const candidates = [
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk?.file,
    meta?.file,
  ];

  for (const raw of candidates) {
    if (typeof raw !== 'string') continue;

    const normalized = raw
      .replace(/\\/g, '/')
      .replace(/^\.\//, '')
      .replace(/\/+/g, '/');

    const isBlank = !normalized || normalized === '.';
    const isAbsolute = normalized.startsWith('/') || /^[A-Za-z]:\//.test(normalized);
    const hasParentStep =
      normalized === '..' || normalized.startsWith('../') || normalized.includes('/../');

    if (isBlank || isAbsolute || hasParentStep) continue;
    return normalized;
  }

  return '';
}
122
+
123
/**
 * Mirror of chunkFiles()'s per-chunk embed-text derivation.
 *
 * When the chunk already carries `embedding_text`, that text is used, truncated
 * to `cap` characters. Otherwise the text is assembled as
 * "<relative path> <symbol>\n<first 1500 chars of text/content>".
 *
 * @param {object} chunk  chunk record
 * @param {number} cap    max characters kept from a pre-computed embedding_text
 * @returns {string} the text to embed for this chunk
 */
function embedTextOf(chunk, cap) {
  const prepared = chunk.embedding_text;
  if (prepared) {
    return prepared.slice(0, cap);
  }

  const symbol = chunk.metadata?.symbol || '';
  const body = (chunk.text || chunk.content || '').slice(0, 1500);
  return `${chunkFilePath(chunk)} ${symbol}\n${body}`;
}
128
+
129
// Dedup-annotation fields written by annotateDedupClusters onto a record's
// metadata; merged back into the hydrated chunk so downstream sees the same
// annotations the in-memory path would have set in place.
const DEDUP_FIELDS = ['simhash', 'isExemplar', 'exemplarId', 'clusterId', 'aliasJaccard', 'liReuseEligible'];

/**
 * Copy every DEDUP_FIELDS entry from a lightweight record's metadata onto a
 * hydrated chunk's metadata (creating `chunk.metadata` when missing). Each
 * field is assigned unconditionally, so a field absent from `recMeta` is set
 * to `undefined` on the chunk.
 *
 * @param {object} chunk    hydrated chunk; mutated in place
 * @param {object} recMeta  metadata object of the matching resident record
 * @returns {object} the same `chunk` instance
 */
function mergeDedupMeta(chunk, recMeta) {
  const target = chunk.metadata || {};
  chunk.metadata = target;
  DEDUP_FIELDS.forEach((field) => {
    target[field] = recMeta[field];
  });
  return chunk;
}
139
+
140
/**
 * Yield consecutive half-open index ranges `[start, end)` that cover `0..n`
 * in strides of `size`; the last range is clipped to `n`.
 *
 * @param {number} n     total number of items
 * @param {number} size  stride; must be a positive number
 * @yields {[number, number]} `[start, end]` pair for each window
 * @throws {RangeError} when `size` is not greater than zero — a zero,
 *   negative, or NaN stride would otherwise never advance (infinite loop) or
 *   emit a meaningless window
 */
function* windows(n, size) {
  if (!(size > 0)) {
    throw new RangeError(`windows(): size must be a positive number, got ${size}`);
  }
  for (let start = 0; start < n; start += size) {
    yield [start, Math.min(start + size, n)];
  }
}
143
+
144
+ // ── temp spill store ──
145
+
146
/**
 * Create a fresh temporary SQLite spill store next to codebase.db
 * (`<codebase>.staging-chunks.db`) holding one JSON blob per chunk, keyed by
 * a sequence number.
 *
 * Any leftover store files from a previous run are removed first. Only a
 * missing file is tolerated during that cleanup: if a stale file cannot be
 * removed, the error surfaces here instead of the stale store being reopened
 * (which would then fail on CREATE TABLE with a less useful message).
 *
 * @returns {Promise<{db: object, storePath: string, insertMany: Function, readRange: object}>}
 *   the open database handle, its path, a transactional bulk-insert function
 *   taking `[{ seq, j }]` rows, and a prepared range-select statement.
 *   The caller owns the handle and is responsible for closing/deleting it.
 */
async function openSpillStore() {
  const Database = (await import('better-sqlite3')).default;
  const storePath = DB_PATHS.codebase + '.staging-chunks.db';
  for (const stale of [storePath, storePath + '-wal', storePath + '-shm']) {
    // force: true ignores only "path does not exist"; other failures propagate.
    await fs.rm(stale, { force: true });
  }
  await fs.mkdir(path.dirname(storePath), { recursive: true });

  const db = new Database(storePath);
  try {
    // Fast, non-durable: this is a throwaway scratch file deleted at the end.
    db.pragma('journal_mode = OFF');
    db.pragma('synchronous = OFF');
    db.exec('CREATE TABLE c (seq INTEGER PRIMARY KEY, j TEXT NOT NULL)');
    const insert = db.prepare('INSERT INTO c (seq, j) VALUES (?, ?)');
    const insertMany = db.transaction((rows) => { for (const r of rows) insert.run(r.seq, r.j); });
    const readRange = db.prepare('SELECT seq, j FROM c WHERE seq >= ? AND seq < ? ORDER BY seq');
    return { db, storePath, insertMany, readRange };
  } catch (err) {
    // Setup failed after the file was opened: release the handle before
    // rethrowing so the scratch file is not left locked.
    try { db.close(); } catch { /* ignore: the setup error is the one to report */ }
    throw err;
  }
}
163
+
164
+ // =============================================================================
165
+ // MAIN
166
+ // =============================================================================
167
+
168
/**
 * Build vectors (codebase.db) + the staged LI index for a full rebuild with
 * bounded memory. Returns `{ vectorStats, lateInteractionResult, liBuilt }`.
 * The caller (buildVectorsAndArtifactsPhase) promotes the staged LI index and
 * builds quantized artifacts from the swapped codebase.db, exactly as it does
 * for the in-memory path.
 *
 * Resource handling: the temp spill store is always closed and deleted, and
 * the codebase.db.tmp handle is closed on every exit path (including a thrown
 * embed/alias error) so a failed run does not leave the staging database open.
 * A late-interaction build failure is reported in the result, not thrown.
 *
 * @param {object} opts
 * @param {string[]} opts.filesToIndex
 * @param {object} opts.modelInfo            getModelInfo()
 * @param {boolean} opts.sqliteFastMode
 * @param {boolean} opts.noLateInteraction
 * @param {object} opts.li                   LI resource-plan knobs + staged paths
 * @returns {Promise<{vectorStats: {chunks: number, embeddings: number}, lateInteractionResult: (object|null), liBuilt: boolean}>}
 */
export async function buildVectorsAndLiStreaming(opts) {
  const {
    filesToIndex,
    modelInfo,
    sqliteFastMode = false,
    noLateInteraction = false,
    li = {},
  } = opts;

  const cap = embedTextCap();
  const dedupOn = DEDUP_CONFIG.enabled && isDedupAvailable();
  const wantLi = !noLateInteraction && LATE_INTERACTION_CONFIG.enabled;

  log('\n━━━ Phase 2: Vectors + LI (streaming, bounded memory) ━━━', 'bright');
  log(`Streaming ${filesToIndex.length} files (parse window=${PARSE_FILE_WINDOW} files, hydrate window=${HYDRATE_CHUNK_WINDOW} chunks)`, 'dim');

  const store = await openSpillStore();
  // Handle to codebase.db.tmp. Non-null only while the database is open, so the
  // `finally` below can release it when an embed/alias step throws.
  let vdb = null;
  try {
    // ── 1. PARSE + SPILL + FINGERPRINT + LI-SKIP ──
    const records = [];        // [{ id, _textLen, file, metadata, liKeep }]
    const fingerprints = [];   // parallel to records (freed after clustering)
    let seq = 0;
    let parsed = 0;

    const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');

    for (const [fi, fj] of windows(filesToIndex.length, PARSE_FILE_WINDOW)) {
      const fileWindow = filesToIndex.slice(fi, fj);
      const { allChunks } = await chunkFiles(fileWindow);
      if (allChunks.length === 0) { parsed += fileWindow.length; continue; }

      // Fingerprints on raw text (matches runDedupPhase: c.text || c.content).
      // A failure here leaves null fingerprints for this window, which turns
      // global dedup off in step 2 (every chunk is then embedded).
      let fps = null;
      if (dedupOn) {
        try {
          fps = computeFingerprints(allChunks.map((c) => c.text || c.content || ''), DEDUP_CONFIG);
        } catch { fps = null; }
      }

      // LI skip policy needs chunk content — apply it here, while we have it,
      // and remember the keep decision so the LI stage can run on LI-lite records.
      // If the policy throws, every chunk in the window falls back to `wantLi`.
      let liKeptIds = null;
      if (wantLi) {
        try {
          const { kept } = applyIndexingChunkPolicy(allChunks, { projectRoot: PROJECT_ROOT });
          liKeptIds = new Set(kept.map((c) => c.id));
        } catch { liKeptIds = null; }
      }

      const rows = new Array(allChunks.length);
      for (let k = 0; k < allChunks.length; k++) {
        const chunk = allChunks[k];
        rows[k] = { seq, j: JSON.stringify(chunk) };
        records.push({
          id: chunk.id,
          _textLen: (chunk.text || chunk.content || '').length,
          file: chunk.file,
          // Carry only the fields selectExemplar reads; annotateDedupClusters
          // writes the dedup annotation fields onto this same object.
          metadata: {
            relative_path: chunk.metadata?.relative_path,
            path: chunk.metadata?.path,
            file_path: chunk.metadata?.file_path,
            file: chunk.metadata?.file,
            hash: chunk.metadata?.hash,
          },
          liKeep: liKeptIds ? liKeptIds.has(chunk.id) : wantLi,
        });
        fingerprints.push(fps ? fps[k] : null);
        seq++;
      }
      store.insertMany(rows);

      parsed += fileWindow.length;
      logProgress(parsed, filesToIndex.length, 'Parsing+spill');
    }

    const totalChunks = records.length;
    log(`\n✓ Spilled ${totalChunks} chunks to disk (lightweight records resident only)`, 'green');
    if (totalChunks === 0) {
      return { vectorStats: { chunks: 0, embeddings: 0 }, lateInteractionResult: null, liBuilt: false };
    }

    // ── 2. GLOBAL DEDUP (on lightweight signatures) ──
    // Runs only when every chunk has a fingerprint; otherwise dedupStats stays
    // null and all chunks are treated as exemplars.
    let dedupStats = null;
    if (dedupOn && fingerprints.every((f) => f)) {
      try {
        const clusters = clusterFingerprints(fingerprints, DEDUP_CONFIG);
        dedupStats = annotateDedupClusters(records, fingerprints, clusters, DEDUP_CONFIG);
        const pct = ((dedupStats.totalAliases / totalChunks) * 100).toFixed(1);
        log(`Dedup: ${dedupStats.clustersWithSiblings} clusters, ${dedupStats.totalAliases} aliases (${pct}% of ${totalChunks})`, 'cyan');
      } catch (e) {
        log(`Dedup skipped (${e.message}); embedding every chunk`, 'yellow');
      }
    }
    // Free the heaviest resident structure (≈0.5 KB/chunk) before embedding.
    fingerprints.length = 0;

    // A record annotated with an exemplarId is an alias; everything else is an exemplar.
    const exemplarSeqs = [];
    const aliasSeqs = [];
    for (let s = 0; s < records.length; s++) {
      if (records[s].metadata.exemplarId) aliasSeqs.push(s); else exemplarSeqs.push(s);
    }
    log(`Embedding ${exemplarSeqs.length} exemplars, copying vectors for ${aliasSeqs.length} aliases`, 'dim');

    // ── open codebase.db.tmp once; stream inserts; atomic-swap at the end ──
    const Database = (await import('better-sqlite3')).default;
    await fs.mkdir(path.dirname(DB_PATHS.codebase), { recursive: true });
    const tmpPath = DB_PATHS.codebase + '.tmp';
    for (const p of [tmpPath, tmpPath + '-wal', tmpPath + '-shm']) {
      // force: true tolerates only a missing file. Any other failure must
      // surface: silently reopening a stale .tmp would carry its old rows into
      // the database that gets swapped in below.
      await fs.rm(p, { force: true });
    }
    vdb = new Database(tmpPath);
    configureJournalMode(vdb, tmpPath, sqliteFastMode);
    createVectorSchema(vdb);

    const isLocal = modelInfo.provider === 'local';
    const writeFlushRows = EMBEDDING_CONFIG.indexerWriteFlushRows;
    const embeddingOptions = { useCache: false };
    let effectiveDim = modelInfo.dimension;
    if (modelInfo.isRemote) {
      const configuredOutputDim = parseInt(
        process.env.SWEET_SEARCH_INDEXING_OUTPUT_DIMENSION || `${modelInfo.hnswDimension}`, 10);
      if (Number.isFinite(configuredOutputDim) && configuredOutputDim > 0 && configuredOutputDim <= modelInfo.dimension) {
        embeddingOptions.providerOptions = {
          outputDimension: configuredOutputDim,
          inputType: 'document',
          concurrency: parseInt(process.env.SWEET_SEARCH_EMBEDDING_CONCURRENCY || '4', 10),
        };
        effectiveDim = configuredOutputDim;
      }
    }

    // Hydrate a specific list of seqs (batched IN query), merge dedup
    // annotations, and tag each chunk with its LI-keep flag. Returns chunks in
    // `seqs` order (= seq order = file order), so annotateChunksForVectorInsert's
    // per-file structural-id grouping stays correct (windows are cut at file
    // boundaries — see fileWindows). Hydrating ONLY the seqs a pass needs avoids
    // re-parsing the whole corpus (e.g. the embed pass parses 25k exemplar JSONs
    // for libsql, not all 431k) — the key "don't slow down" optimization.
    const HYDRATE_SUB = 4000;
    const hydrateSeqs = (seqs) => {
      const out = [];
      for (let i = 0; i < seqs.length; i += HYDRATE_SUB) {
        const batch = seqs.slice(i, i + HYDRATE_SUB);
        const rows = store.db
          .prepare(`SELECT seq, j FROM c WHERE seq IN (${batch.map(() => '?').join(',')})`)
          .all(...batch);
        // The IN query does not guarantee row order; index by seq and emit in batch order.
        const bySeq = new Map();
        for (const r of rows) bySeq.set(r.seq, r.j);
        for (const s of batch) {
          const j = bySeq.get(s);
          if (j === undefined) continue;
          const chunk = JSON.parse(j);
          if (dedupStats) mergeDedupMeta(chunk, records[s].metadata);
          chunk.__liKeep = records[s].liKeep; // transient; ignored by downstream
          out.push(chunk);
        }
      }
      return out;
    };

    // File-aligned windows over a seq list: cut at `size` but never mid-file,
    // so a file's chunks always land in one window.
    function* fileWindows(seqs, size) {
      let start = 0;
      while (start < seqs.length) {
        let end = Math.min(start + size, seqs.length);
        while (end < seqs.length && records[seqs[end]].file === records[seqs[end - 1]].file) end++;
        yield seqs.slice(start, end);
        start = end;
      }
    }

    // LI input is assembled DURING the embed/alias passes (no third parse pass).
    // Exemplars carry token text; LI-reuse-eligible aliases need only a pointer.
    const liExemplars = [];
    const liAliases = [];
    const liLite = (chunk, withText) => (withText
      ? { id: chunk.id, file: chunk.file, metadata: chunk.metadata || {}, li_greedy_text: chunk.li_greedy_text, embedding_text: chunk.embedding_text, li_text: chunk.li_text, text: chunk.text }
      : { id: chunk.id, file: chunk.file, metadata: chunk.metadata || {} });

    // ── 3. EMBED exemplars (only exemplar seqs hydrated; bucketing UNCHANGED) ──
    let embeddingCount = 0;
    let embeddedSoFar = 0;
    for (const win of fileWindows(exemplarSeqs, EMBED_WINDOW)) {
      const exemplars = hydrateSeqs(win);
      if (exemplars.length === 0) continue;
      const exemplarTexts = exemplars.map((c) => embedTextOf(c, cap));
      // Local provider: hand the whole window over as one batch.
      const batchSize = isLocal ? exemplarTexts.length : EMBEDDING_CONFIG.indexerBatchSize;
      embeddingCount += await pipelinedEmbedAndInsert(
        vdb, exemplars, exemplarTexts, batchSize, modelInfo,
        (done) => logProgress(embeddedSoFar + done, exemplarSeqs.length, 'Embedding'),
        embeddingOptions, log, writeFlushRows,
      );
      embeddedSoFar += exemplars.length;
      if (wantLi) for (const c of exemplars) if (c.__liKeep) liExemplars.push(liLite(c, true));
    }
    checkpointWal(vdb);
    log(`\n✓ Generated ${embeddingCount} embeddings (${effectiveDim}d)`, 'green');

    // ── 4. ALIAS inserts (only alias seqs hydrated; exemplar vectors present) ──
    if (aliasSeqs.length > 0) {
      let aliasInserted = 0;
      for (const win of fileWindows(aliasSeqs, HYDRATE_CHUNK_WINDOW)) {
        const aliases = hydrateSeqs(win);
        if (aliases.length === 0) continue;
        aliasInserted += insertAliasVectors(vdb, aliases, modelInfo, { skipOrphanPurge: true });
        if (wantLi) for (const c of aliases) if (c.__liKeep) liAliases.push(liLite(c, !c.metadata?.liReuseEligible));
      }
      log(`  ✓ Inserted ${aliasInserted} alias vector(s) (embeddings copied from exemplars)`, 'dim');
    }

    checkpointWal(vdb);
    try { vdb.pragma('optimize'); } catch { /* best effort */ }
    vdb.close();
    vdb = null; // closed cleanly; nothing left for `finally` to release
    await atomicSwapDatabase(tmpPath, DB_PATHS.codebase);
    const vstat = await fs.stat(DB_PATHS.codebase);
    log(`✓ Saved codebase.db (${(vstat.size / 1024 / 1024).toFixed(2)} MB, ${totalChunks} vectors)`, 'green');

    const vectorStats = { chunks: totalChunks, embeddings: embeddingCount };

    // ── 5. LATE INTERACTION (LI-lite input + bounded build mode) ──
    // liExemplars/liAliases were assembled during the embed/alias passes above,
    // so no third parse pass. buildLateInteractionIndex partitions by
    // metadata.exemplarId — order in the input array doesn't matter.
    let lateInteractionResult = null;
    let liBuilt = false;
    if (wantLi) {
      const liChunks = liExemplars.concat(liAliases);

      try {
        lateInteractionResult = await buildLateInteractionIndex(liChunks, false, [], {
          poolFactor: li.poolFactor ?? 1,
          extendedSkiplist: li.extendedSkiplist ?? false,
          loadFromPath: li.loadFromPath ?? DB_PATHS.lateInteraction,
          saveToPath: li.saveToPath,
          finalIndexPath: li.finalIndexPath ?? DB_PATHS.lateInteraction,
          stagingSegmentDir: li.stagingSegmentDir,
          fullRebuild: true,
          workerCount: li.workerCount ?? 1,
          threadsPerWorker: li.threadsPerWorker ?? 0,
          batchSize: li.batchSize ?? 8,
          batchSizeUpperCap: li.batchSizeUpperCap,
          tokenBudget: li.tokenBudget ?? 8192,
          attentionBudget: li.attentionBudget ?? null,
          projectRoot: PROJECT_ROOT,
          buildEvict: true,
          skipPolicyAlreadyApplied: true,
        });
        liBuilt = true;
      } catch (err) {
        // Non-fatal: vectors (codebase.db) are already committed above, so a
        // failed LI build must not lose them. The caller invalidates/cleans the
        // staged LI index and continues — same contract as the in-memory path.
        log(`Late interaction build failed (non-fatal): ${err.message}`, 'yellow');
        lateInteractionResult = { error: err.message, invalidated: true };
        liBuilt = false;
      }
    }

    return { vectorStats, lateInteractionResult, liBuilt };
  } finally {
    // Still non-null only if we bailed out before the clean close above.
    if (vdb) {
      try { vdb.close(); } catch { /* ignore: do not mask the original error */ }
    }
    try { store.db.close(); } catch { /* ignore */ }
    for (const p of [store.storePath, store.storePath + '-wal', store.storePath + '-shm']) {
      try { await fs.unlink(p); } catch { /* absent */ }
    }
  }
}
@@ -16,21 +16,28 @@ function resolveProjectRoot() {
16
16
  const fromEnv = process.env.SWEET_SEARCH_PROJECT_ROOT?.trim();
17
17
  if (fromEnv) return path.resolve(fromEnv);
18
18
 
19
- // Walk up from cwd looking for .git or package.json to find the real
20
- // project root, so that running from a subdirectory still finds the
21
- // .sweet-search/ data dir and init config.
22
- let dir = process.cwd();
23
- while (true) {
19
+ const cwd = process.cwd();
20
+
21
+ // Prefer an existing sweet-search state dir so indexed corpus subdirectories
22
+ // do not get pulled back to an outer package root.
23
+ for (let dir = cwd; ; dir = path.dirname(dir)) {
24
+ if (existsSync(path.join(dir, '.sweet-search'))) return dir;
25
+ const parent = path.dirname(dir);
26
+ if (parent === dir) break;
27
+ }
28
+
29
+ // Fallback to the historical project markers for cold-start/init flows before
30
+ // .sweet-search/ exists.
31
+ for (let dir = cwd; ; dir = path.dirname(dir)) {
24
32
  if (existsSync(path.join(dir, '.git')) || existsSync(path.join(dir, 'package.json'))) {
25
33
  return dir;
26
34
  }
27
35
  const parent = path.dirname(dir);
28
36
  if (parent === dir) break; // filesystem root
29
- dir = parent;
30
37
  }
31
38
 
32
39
  // Fallback to cwd if no project marker found
33
- return process.cwd();
40
+ return cwd;
34
41
  }
35
42
 
36
43
  // Project root detection
@@ -82,9 +89,6 @@ export const DB_PATHS = {
82
89
  // Code graph (entities + relationships + FTS5 + summaries)
83
90
  codeGraph: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'code-graph.db'),
84
91
 
85
- // HNSW index (in-memory at query time)
86
- hnswIndex: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'codebase-hnsw.idx'),
87
-
88
92
  // Binary HNSW index (32x smaller, Hamming distance)
89
93
  binaryHnswIndex: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'codebase-binary-hnsw.idx'),
90
94
 
@@ -260,3 +260,40 @@ export function bestIntraOpThreads(options = {}) {
260
260
  : effectiveCores - reserveCores;
261
261
  return Math.max(1, Math.min(requested, maxThreads, logicalCores));
262
262
  }
263
+
264
/**
 * Intra-op thread count for the BACKGROUND/maintainer ORT profile.
 *
 * The maintainer daemon trades a little throughput for not spiking every
 * P-core during an idle-time reconcile tick. Encoder-only INT8 GEMM recovers
 * ~85–90% throughput at 4 threads (RESEARCH §B), so we run the background
 * encoder at a clamped 2–4 threads regardless of how wide the box is. This is
 * intentionally distinct from {@link bestIntraOpThreads} (the foreground /
 * full-index path), which scales with the hardware.
 *
 * Reads the SWEET_SEARCH_INTRA_OP_THREADS override (shared with the foreground
 * helper), but always clamps the result into [2, 4]; on a box with fewer than
 * 2 (or 4) logical cores the bounds shrink to the available logical-core count
 * so we never request more threads than exist. Affinity (E-core pinning) is
 * deliberately NOT attempted here:
 * `intra_op_thread_affinities` is a no-op on macOS (pthread_setaffinity_np
 * unavailable) — E-core routing comes from process-level taskpolicy -b (G5).
 *
 * The result is always a whole number: fractional `logicalCores` /
 * `targetThreads` values are truncated downward, and a NaN `logicalCores`
 * falls back to the detected core count.
 *
 * Override: SWEET_SEARCH_INTRA_OP_THREADS=N (still clamped to 2–4).
 *
 * @param {object} [options]
 * @param {number} [options.logicalCores]   core count to assume (default: os.cpus().length)
 * @param {number} [options.targetThreads]  requested threads; used only when the env override is absent/invalid
 * @returns {number} integer thread count within the clamped range
 */
export function backgroundIntraOpThreads(options = {}) {
  const LO = 2;
  const HI = 4;

  // NaN (explicit or from a non-numeric value) means "unknown": detect instead
  // of letting NaN poison every Math.min/Math.max below.
  const coreHint = Number(options.logicalCores ?? NaN);
  const detectedCores = Number.isNaN(coreHint) ? os.cpus().length : coreHint;
  const logicalCores = Math.max(1, Math.floor(detectedCores));

  const upper = Math.min(HI, logicalCores);
  const lower = Math.min(LO, upper);

  let requested = upper;
  const override = Number.parseInt(process.env.SWEET_SEARCH_INTRA_OP_THREADS ?? '', 10);
  if (Number.isFinite(override) && override > 0) {
    requested = override;
  } else if (Number.isFinite(options.targetThreads) && options.targetThreads > 0) {
    // Thread counts are whole numbers; truncate a fractional request.
    requested = Math.floor(options.targetThreads);
  }

  return Math.max(lower, Math.min(requested, upper));
}
@@ -58,7 +58,17 @@ export function listSparseGramDeltaSegments(baseArtifactPath, opts = {}) {
58
58
  export function resolveLatestSparseGramDeltaRecords(baseArtifactPath, opts = {}) {
59
59
  const latest = new Map();
60
60
  for (const seg of listSparseGramDeltaSegments(baseArtifactPath, opts)) {
61
- const raw = fs.readFileSync(seg.path, 'utf-8');
61
+ let raw;
62
+ try {
63
+ raw = fs.readFileSync(seg.path, 'utf-8');
64
+ } catch (err) {
65
+ // TOCTOU: a concurrent compaction/rotation can unlink a segment between
66
+ // listing (existsSync in parseDeltaSegment) and this read. A vanished
67
+ // segment is benign at query time — skip it rather than failing the whole
68
+ // overlay resolution. Surface any other error (EACCES, EISDIR, ...).
69
+ if (err && err.code === 'ENOENT') continue;
70
+ throw err;
71
+ }
62
72
  for (const line of raw.split('\n')) {
63
73
  const trimmed = line.trim();
64
74
  if (!trimmed) continue;