sweet-search 2.5.13 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -9
- package/core/cli.js +41 -3
- package/core/embedding/embedding-local-model.js +106 -10
- package/core/embedding/embedding-service.js +59 -1
- package/core/embedding/model-client.mjs +257 -0
- package/core/embedding/model-server.mjs +217 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
- package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
- package/core/incremental-indexing/application/operator-cli.mjs +14 -5
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
- package/core/incremental-indexing/application/reconciler.mjs +87 -15
- package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
- package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
- package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
- package/core/indexing/artifact-builder.js +1 -1
- package/core/indexing/dedup/dedup-phase.js +36 -17
- package/core/indexing/dedup/exemplar-selector.js +5 -0
- package/core/indexing/index-codebase-v21.js +37 -14
- package/core/indexing/index-maintainer.mjs +337 -6
- package/core/indexing/indexer-ann.js +27 -434
- package/core/indexing/indexer-build.js +30 -14
- package/core/indexing/indexer-manifest.js +0 -3
- package/core/indexing/indexer-phases.js +101 -25
- package/core/indexing/maintainer-launcher.mjs +22 -0
- package/core/indexing/maintainer-watcher.mjs +397 -0
- package/core/indexing/os-priority.mjs +160 -0
- package/core/indexing/rss-budget.mjs +425 -0
- package/core/indexing/streaming-vectors.js +450 -0
- package/core/infrastructure/config/platform.js +14 -10
- package/core/infrastructure/onnx-session-utils.js +37 -0
- package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
- package/core/ranking/late-interaction-index.js +58 -7
- package/core/search/daemon-registry.js +199 -0
- package/core/search/search-read-semantic.js +9 -3
- package/core/search/search-semantic.js +6 -29
- package/core/search/search-server.js +527 -27
- package/core/search/session-daemon-prewarm.mjs +110 -1
- package/core/search/sweet-search.js +0 -38
- package/core/vector-store/binary-hnsw-index.js +692 -78
- package/core/vector-store/index.js +1 -4
- package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
- package/eval/agent-read-workflows/bin/ss-read +2 -0
- package/mcp/tool-handlers.js +1 -2
- package/package.json +11 -8
- package/scripts/uninstall.js +2 -0
- package/core/vector-store/hnsw-index.js +0 -751
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streaming vectors + late-interaction builder (bounded-memory full rebuild).
|
|
3
|
+
*
|
|
4
|
+
* WHY THIS EXISTS
|
|
5
|
+
* ---------------
|
|
6
|
+
* The default in-memory vectors path (`buildVectorIndex` ‖ `buildLateInteractionIndex`
|
|
7
|
+
* driven from `buildVectorsAndArtifactsPhase`) materialises the ENTIRE chunk
|
|
8
|
+
* corpus at once: `chunkFiles()` returns every chunk + every embed-text, dedup
|
|
9
|
+
* annotates them in place, the embed pass holds all exemplar embeddings + insert
|
|
10
|
+
* rows, and the LI pass keeps every doc's per-token slab in `this.documents`.
|
|
11
|
+
* Peak heap is O(repo). On large repositories (e.g. tursodatabase/libsql ≈ 431k
|
|
12
|
+
* chunks, swc-project/swc ≈ 217k chunks / 180k exemplars) that blows the default
|
|
13
|
+
* ~4 GB Node heap / a RAM-limited box and the indexer crashes — regardless of
|
|
14
|
+
* the encoder backend (CUDA, Metal, CoreML, or ORT-CPU), because the hogs live
|
|
15
|
+
* in the JS layer, not the model.
|
|
16
|
+
*
|
|
17
|
+
* WHAT THIS DOES
|
|
18
|
+
* --------------
|
|
19
|
+
* Streams the same pipeline in bounded windows so peak heap is O(window):
|
|
20
|
+
*
|
|
21
|
+
* 1. PARSE+SPILL — parse files in file-windows (reusing `chunkFiles`), compute
|
|
22
|
+
* dedup fingerprints, apply the LI skip policy (content in hand), and spill
|
|
23
|
+
* each chunk to a temp SQLite store. Only lightweight per-chunk records
|
|
24
|
+
* (id, text length, path/hash, fingerprint, li-keep flag) stay resident.
|
|
25
|
+
* 2. DEDUP — cluster the resident fingerprints GLOBALLY (identical to the
|
|
26
|
+
* in-memory path — needed so dup-heavy repos keep their 94%-alias short-cut
|
|
27
|
+
* instead of re-embedding everything) and annotate the lightweight records.
|
|
28
|
+
* 3. EMBED — stream exemplars in chunk-range windows, hydrate from the
|
|
29
|
+
* store, and insert via the UNCHANGED `pipelinedEmbedAndInsert`
|
|
30
|
+
* (→ `callLocalModelBucketed`: the cache-aware compute-batching the README
|
|
31
|
+
* documents is untouched).
|
|
32
|
+
* 4. ALIAS — stream aliases in windows, copy exemplar vectors via the
|
|
33
|
+
* UNCHANGED `insertAliasVectors`.
|
|
34
|
+
* 5. LI — hand LI-lite records (exemplar token-text only) to the
|
|
35
|
+
* UNCHANGED `buildLateInteractionIndex` in bounded build mode, so per-token
|
|
36
|
+
* slabs are flushed to segments and evicted (peak O(one segment)).
|
|
37
|
+
*
|
|
38
|
+
* On-disk output is byte-for-byte the same format the in-memory path produces
|
|
39
|
+
* (codebase.db vectors + atomic swap; SSLX-v3 LI segments). Small repos and
|
|
40
|
+
* incremental runs keep the original in-memory path untouched (see the gate in
|
|
41
|
+
* buildVectorsAndArtifactsPhase), so benchmark indexes are unaffected.
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
import { createHash } from 'crypto';
|
|
45
|
+
import fs from 'fs/promises';
|
|
46
|
+
import { existsSync } from 'fs';
|
|
47
|
+
import path from 'path';
|
|
48
|
+
|
|
49
|
+
import {
|
|
50
|
+
DB_PATHS,
|
|
51
|
+
EMBEDDING_CONFIG,
|
|
52
|
+
PROJECT_ROOT,
|
|
53
|
+
DEDUP_CONFIG,
|
|
54
|
+
LATE_INTERACTION_CONFIG,
|
|
55
|
+
} from '../infrastructure/config/index.js';
|
|
56
|
+
import {
|
|
57
|
+
isDedupAvailable,
|
|
58
|
+
computeFingerprints,
|
|
59
|
+
clusterFingerprints,
|
|
60
|
+
} from '../infrastructure/index.js';
|
|
61
|
+
import { annotateDedupClusters } from './dedup/dedup-phase.js';
|
|
62
|
+
import {
|
|
63
|
+
chunkFiles,
|
|
64
|
+
createVectorSchema,
|
|
65
|
+
pipelinedEmbedAndInsert,
|
|
66
|
+
insertAliasVectors,
|
|
67
|
+
} from './indexer-build.js';
|
|
68
|
+
import { buildLateInteractionIndex } from './indexer-ann.js';
|
|
69
|
+
import {
|
|
70
|
+
configureJournalMode,
|
|
71
|
+
checkpointWal,
|
|
72
|
+
atomicSwapDatabase,
|
|
73
|
+
log,
|
|
74
|
+
logProgress,
|
|
75
|
+
} from './indexer-utils.js';
|
|
76
|
+
|
|
77
|
+
/**
 * Read a positive-integer tuning knob from the environment.
 *
 * Unset, empty, non-numeric, zero, negative and fractional values all fall
 * back to `fallback`. Rejecting negatives matters: a negative window size
 * would make the window generators below step backwards and never terminate.
 *
 * @param {string} name      environment variable name
 * @param {number} fallback  value used when the variable is unusable
 * @returns {number} a positive integer
 */
function positiveIntEnv(name, fallback) {
  const value = Number(process.env[name]);
  return Number.isInteger(value) && value > 0 ? value : fallback;
}

// Files parsed per chunkFiles() call. Bounds the transient parse working set.
const PARSE_FILE_WINDOW = positiveIntEnv('SWEET_SEARCH_STREAM_PARSE_FILES', 2000);
// Aliases hydrated per insert window. Bounds the transient alias working set.
// Alias rows just copy the exemplar's vector + deterministic metadata, so the
// window size does NOT affect the resulting index — only peak memory.
const HYDRATE_CHUNK_WINDOW = positiveIntEnv('SWEET_SEARCH_STREAM_HYDRATE_CHUNKS', 50_000);
// Exemplars embedded per call. The embedding written for a chunk is determined
// by callLocalModelBucketed's bucketing over the set it's handed, so to keep the
// index BYTE-IDENTICAL to the in-memory path (which embeds all exemplars in one
// call) the streaming path must also embed all exemplars in ONE call whenever
// they fit. This window is sized well above any repo that could have indexed
// in-memory before (~4 GB heap OOMs in-memory well under this exemplar count),
// so every repo with a valid "before" gets the identical single embed call.
// Only a repo too huge to ever have indexed in-memory splits into multiple
// embed windows — and that repo has no prior index to differ from. On CPU the
// per-chunk embedding is batch-independent (identical even when windowed); the
// single call only matters for GPU FP-reassociation across batch shapes.
const EMBED_WINDOW = positiveIntEnv('SWEET_SEARCH_STREAM_EMBED_WINDOW', 200_000);
|
|
95
|
+
|
|
96
|
+
// ── small replicas of indexer-build internals (kept private here) ──
|
|
97
|
+
|
|
98
|
+
/**
 * Mirror of chunkFiles()'s embed-text cap (ast-chunker getEmbedTextCap).
 *
 * Reads SWEET_SEARCH_EMBED_TEXT_CAP as a base-10 integer. Values that do not
 * parse, or that are below 500, are ignored in favour of the 2000 default.
 *
 * @returns {number} maximum embed-text length in characters
 */
function embedTextCap() {
  const raw = process.env.SWEET_SEARCH_EMBED_TEXT_CAP || '';
  const parsed = parseInt(raw, 10);
  if (!Number.isFinite(parsed) || parsed < 500) {
    return 2000;
  }
  return parsed;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Mirror of indexer-build.js chunkFilePath (not exported).
 *
 * Picks the first usable project-relative path out of the chunk's path-like
 * fields, in priority order: metadata.relative_path, metadata.path,
 * metadata.file_path, chunk.file, metadata.file. Each candidate is normalised
 * (backslashes → '/', leading './' dropped, repeated '/' collapsed) and then
 * rejected if it is empty, '.', absolute (POSIX or drive-letter), or climbs
 * out of the root via '..'.
 *
 * @param {object} chunk
 * @returns {string} normalised relative path, or '' when no field qualifies
 */
function chunkFilePath(chunk) {
  const meta = chunk?.metadata;
  const candidates = [meta?.relative_path, meta?.path, meta?.file_path, chunk?.file, meta?.file];

  const staysInsideRoot = (p) =>
    p !== '' &&
    p !== '.' &&
    !p.startsWith('/') &&
    !/^[A-Za-z]:\//.test(p) &&
    p !== '..' &&
    !p.startsWith('../') &&
    !p.includes('/../');

  const firstUsable = candidates
    .filter((value) => typeof value === 'string')
    .map((value) => value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/'))
    .find(staysInsideRoot);

  return firstUsable ?? '';
}
|
|
122
|
+
|
|
123
|
+
/**
 * Mirror of chunkFiles()'s per-chunk embed-text derivation.
 *
 * A chunk that already carries `embedding_text` uses it, truncated to `cap`.
 * Otherwise the text is synthesised as "<path> <symbol>\n<body>", where the
 * body is the first 1500 characters of `text` (or `content`).
 *
 * @param {object} chunk
 * @param {number} cap  maximum length applied to a precomputed embedding_text
 * @returns {string}
 */
function embedTextOf(chunk, cap) {
  const precomputed = chunk.embedding_text;
  if (precomputed) {
    return precomputed.slice(0, cap);
  }
  const filePath = chunkFilePath(chunk);
  const symbol = chunk.metadata?.symbol || '';
  const body = (chunk.text || chunk.content || '').slice(0, 1500);
  return `${filePath} ${symbol}\n${body}`;
}
|
|
128
|
+
|
|
129
|
+
// Names of the dedup annotations that annotateDedupClusters writes onto a
// lightweight record's metadata. They are copied back onto each hydrated chunk
// so downstream code sees the same annotations the in-memory path sets in place.
const DEDUP_FIELDS = ['simhash', 'isExemplar', 'exemplarId', 'clusterId', 'aliasJaccard', 'liReuseEligible'];

/**
 * Copy every dedup annotation from a record's metadata onto a hydrated chunk.
 *
 * The chunk is mutated in place (its `metadata` object is created if missing)
 * and every field in DEDUP_FIELDS is assigned — including as `undefined` when
 * the record does not carry it.
 *
 * @param {object} chunk    hydrated chunk to annotate
 * @param {object} recMeta  metadata of the matching lightweight record
 * @returns {object} the same `chunk` instance
 */
function mergeDedupMeta(chunk, recMeta) {
  if (!chunk.metadata) {
    chunk.metadata = {};
  }
  const target = chunk.metadata;
  DEDUP_FIELDS.forEach((field) => {
    target[field] = recMeta[field];
  });
  return chunk;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Yield consecutive half-open index ranges `[start, end)` that cover `0..n`
 * in steps of `size`; the last range is clipped to `n`.
 *
 * @param {number} n     total number of items
 * @param {number} size  window length; must be greater than zero
 * @yields {[number, number]} `[start, end)` pairs
 * @throws {RangeError} when `size` is zero, negative or NaN — such a step
 *   would otherwise never advance past `n` (or silently yield one bogus range)
 */
function* windows(n, size) {
  if (!(size > 0)) {
    throw new RangeError(`windows(): size must be a positive number, got ${size}`);
  }
  for (let start = 0; start < n; start += size) {
    yield [start, Math.min(start + size, n)];
  }
}
|
|
143
|
+
|
|
144
|
+
// ── temp spill store ──
|
|
145
|
+
|
|
146
|
+
/**
 * Create a fresh scratch SQLite database next to codebase.db that holds the
 * spilled chunk JSON, keyed by a monotonically increasing `seq`.
 *
 * Any leftover scratch file (and its -wal / -shm siblings) from a previous run
 * is removed first. Journaling and fsync are switched off: the file is a
 * throwaway that the caller deletes when the build finishes.
 *
 * @returns {Promise<{db: object, storePath: string, insertMany: Function, readRange: object}>}
 *   the open handle, its path, a transactional bulk-insert function taking
 *   `[{ seq, j }]`, and a prepared `[from, to)` range-read statement
 */
async function openSpillStore() {
  const { default: Database } = await import('better-sqlite3');
  const storePath = `${DB_PATHS.codebase}.staging-chunks.db`;

  // Start from a clean slate; a missing file is the normal case.
  for (const suffix of ['', '-wal', '-shm']) {
    try {
      await fs.unlink(`${storePath}${suffix}`);
    } catch { /* absent */ }
  }
  await fs.mkdir(path.dirname(storePath), { recursive: true });

  const db = new Database(storePath);
  // Fast, non-durable: this is a throwaway scratch file deleted at the end.
  db.pragma('journal_mode = OFF');
  db.pragma('synchronous = OFF');
  db.exec('CREATE TABLE c (seq INTEGER PRIMARY KEY, j TEXT NOT NULL)');

  const insertOne = db.prepare('INSERT INTO c (seq, j) VALUES (?, ?)');
  const insertMany = db.transaction((rows) => {
    rows.forEach((row) => insertOne.run(row.seq, row.j));
  });
  const readRange = db.prepare('SELECT seq, j FROM c WHERE seq >= ? AND seq < ? ORDER BY seq');

  return { db, storePath, insertMany, readRange };
}
|
|
163
|
+
|
|
164
|
+
// =============================================================================
|
|
165
|
+
// MAIN
|
|
166
|
+
// =============================================================================
|
|
167
|
+
|
|
168
|
+
/**
 * Build vectors (codebase.db) + the staged LI index for a full rebuild with
 * bounded memory. Returns `{ vectorStats, lateInteractionResult, liBuilt }`.
 * The caller (buildVectorsAndArtifactsPhase) promotes the staged LI index and
 * builds quantized artifacts from the swapped codebase.db, exactly as it does
 * for the in-memory path.
 *
 * Stages: (1) parse files in windows and spill every chunk to a scratch
 * SQLite store, keeping only lightweight records resident; (2) cluster the
 * fingerprints globally and annotate the records; (3) embed exemplars into
 * codebase.db.tmp; (4) copy vectors for aliases; atomic-swap the tmp database
 * into place; (5) build the late-interaction index from LI-lite records.
 *
 * Failure semantics:
 *  - If dedup annotation fails, dedup is treated as fully off: every chunk is
 *    embedded and no (possibly partial) annotation is consulted.
 *  - If embedding or alias insertion throws, the codebase.db.tmp handle is
 *    closed before the error propagates; the live codebase.db is untouched.
 *  - A failed LI build is non-fatal and reported via `lateInteractionResult`.
 *  - The scratch spill store is always closed and deleted.
 *
 * @param {object} opts
 * @param {string[]} opts.filesToIndex
 * @param {object} opts.modelInfo          getModelInfo()
 * @param {boolean} [opts.sqliteFastMode=false]
 * @param {boolean} [opts.noLateInteraction=false]
 * @param {object} [opts.li={}]            LI resource-plan knobs + staged paths
 * @returns {Promise<{vectorStats: {chunks: number, embeddings: number},
 *   lateInteractionResult: (object|null), liBuilt: boolean}>}
 */
export async function buildVectorsAndLiStreaming(opts) {
  const {
    filesToIndex,
    modelInfo,
    sqliteFastMode = false,
    noLateInteraction = false,
    li = {},
  } = opts;

  const cap = embedTextCap();
  const dedupOn = DEDUP_CONFIG.enabled && isDedupAvailable();
  const wantLi = !noLateInteraction && LATE_INTERACTION_CONFIG.enabled;

  log('\n━━━ Phase 2: Vectors + LI (streaming, bounded memory) ━━━', 'bright');
  log(`Streaming ${filesToIndex.length} files (parse window=${PARSE_FILE_WINDOW} files, hydrate window=${HYDRATE_CHUNK_WINDOW} chunks)`, 'dim');

  const store = await openSpillStore();
  try {
    // ── 1. PARSE + SPILL + FINGERPRINT + LI-SKIP ──
    const records = [];      // [{ id, _textLen, file, metadata, liKeep }]
    const fingerprints = []; // parallel to records (freed after clustering)
    let seq = 0;
    let parsed = 0;

    const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');

    for (const [fi, fj] of windows(filesToIndex.length, PARSE_FILE_WINDOW)) {
      const fileWindow = filesToIndex.slice(fi, fj);
      const { allChunks } = await chunkFiles(fileWindow);
      if (allChunks.length === 0) { parsed += fileWindow.length; continue; }

      // Fingerprints on raw text (matches runDedupPhase: c.text || c.content).
      // A failure here leaves null fingerprints, which disables global dedup
      // in stage 2 (the `every` check) rather than aborting the build.
      let fps = null;
      if (dedupOn) {
        try {
          fps = computeFingerprints(allChunks.map((c) => c.text || c.content || ''), DEDUP_CONFIG);
        } catch { fps = null; }
      }

      // LI skip policy needs chunk content — apply it here, while we have it,
      // and remember the keep decision so the LI stage can run on LI-lite records.
      // If the policy throws, every chunk in the window is kept for LI.
      let liKeptIds = null;
      if (wantLi) {
        try {
          const { kept } = applyIndexingChunkPolicy(allChunks, { projectRoot: PROJECT_ROOT });
          liKeptIds = new Set(kept.map((c) => c.id));
        } catch { liKeptIds = null; }
      }

      const rows = new Array(allChunks.length);
      for (let k = 0; k < allChunks.length; k++) {
        const chunk = allChunks[k];
        rows[k] = { seq, j: JSON.stringify(chunk) };
        records.push({
          id: chunk.id,
          _textLen: (chunk.text || chunk.content || '').length,
          file: chunk.file,
          // Carry only the fields selectExemplar reads; annotateDedupClusters
          // writes the dedup annotation fields onto this same object.
          metadata: {
            relative_path: chunk.metadata?.relative_path,
            path: chunk.metadata?.path,
            file_path: chunk.metadata?.file_path,
            file: chunk.metadata?.file,
            hash: chunk.metadata?.hash,
          },
          liKeep: liKeptIds ? liKeptIds.has(chunk.id) : wantLi,
        });
        fingerprints.push(fps ? fps[k] : null);
        seq++;
      }
      store.insertMany(rows);

      parsed += fileWindow.length;
      logProgress(parsed, filesToIndex.length, 'Parsing+spill');
    }

    const totalChunks = records.length;
    log(`\n✓ Spilled ${totalChunks} chunks to disk (lightweight records resident only)`, 'green');
    if (totalChunks === 0) {
      return { vectorStats: { chunks: 0, embeddings: 0 }, lateInteractionResult: null, liBuilt: false };
    }

    // ── 2. GLOBAL DEDUP (on lightweight signatures) ──
    let dedupStats = null;
    if (dedupOn && fingerprints.every((f) => f)) {
      try {
        const clusters = clusterFingerprints(fingerprints, DEDUP_CONFIG);
        dedupStats = annotateDedupClusters(records, fingerprints, clusters, DEDUP_CONFIG);
        const pct = ((dedupStats.totalAliases / totalChunks) * 100).toFixed(1);
        log(`Dedup: ${dedupStats.clustersWithSiblings} clusters, ${dedupStats.totalAliases} aliases (${pct}% of ${totalChunks})`, 'cyan');
      } catch (e) {
        // Treat dedup as fully off from here on. The records may carry a
        // partial annotation; it is ignored below because dedupStats is null.
        dedupStats = null;
        log(`Dedup skipped (${e.message}); embedding every chunk`, 'yellow');
      }
    }
    // Free the heaviest resident structure (≈0.5 KB/chunk) before embedding.
    fingerprints.length = 0;

    // Partition into exemplars (embedded) and aliases (vector copied). Without
    // a successful dedup pass every chunk is an exemplar — this keeps the
    // partition consistent with hydrateSeqs, which only merges dedup metadata
    // when dedupStats is set.
    const exemplarSeqs = [];
    const aliasSeqs = [];
    for (let s = 0; s < records.length; s++) {
      if (dedupStats && records[s].metadata.exemplarId) aliasSeqs.push(s); else exemplarSeqs.push(s);
    }
    log(`Embedding ${exemplarSeqs.length} exemplars, copying vectors for ${aliasSeqs.length} aliases`, 'dim');

    // Hydrate a specific list of seqs (batched IN query), merge dedup
    // annotations, and tag each chunk with its LI-keep flag. Returns chunks in
    // `seqs` order (= seq order = file order), so annotateChunksForVectorInsert's
    // per-file structural-id grouping stays correct (windows are cut at file
    // boundaries — see fileWindows). Hydrating ONLY the seqs a pass needs avoids
    // re-parsing the whole corpus (e.g. the embed pass parses 25k exemplar JSONs
    // for libsql, not all 431k) — the key "don't slow down" optimization.
    const HYDRATE_SUB = 4000;
    const hydrateSeqs = (seqs) => {
      const out = [];
      for (let i = 0; i < seqs.length; i += HYDRATE_SUB) {
        const batch = seqs.slice(i, i + HYDRATE_SUB);
        const rows = store.db
          .prepare(`SELECT seq, j FROM c WHERE seq IN (${batch.map(() => '?').join(',')})`)
          .all(...batch);
        const bySeq = new Map();
        for (const r of rows) bySeq.set(r.seq, r.j);
        for (const s of batch) {
          const j = bySeq.get(s);
          if (j === undefined) continue;
          const chunk = JSON.parse(j);
          if (dedupStats) mergeDedupMeta(chunk, records[s].metadata);
          chunk.__liKeep = records[s].liKeep; // transient; ignored by downstream
          out.push(chunk);
        }
      }
      return out;
    };

    // File-aligned windows over a seq list: cut at `size` but never mid-file,
    // so a file's chunks always land in one window.
    function* fileWindows(seqs, size) {
      let start = 0;
      while (start < seqs.length) {
        let end = Math.min(start + size, seqs.length);
        while (end < seqs.length && records[seqs[end]].file === records[seqs[end - 1]].file) end++;
        yield seqs.slice(start, end);
        start = end;
      }
    }

    // LI input is assembled DURING the embed/alias passes (no third parse pass).
    // Exemplars carry token text; LI-reuse-eligible aliases need only a pointer.
    const liExemplars = [];
    const liAliases = [];
    const liLite = (chunk, withText) => (withText
      ? {
        id: chunk.id,
        file: chunk.file,
        metadata: chunk.metadata || {},
        li_greedy_text: chunk.li_greedy_text,
        embedding_text: chunk.embedding_text,
        li_text: chunk.li_text,
        text: chunk.text,
      }
      : { id: chunk.id, file: chunk.file, metadata: chunk.metadata || {} });

    // ── open codebase.db.tmp once; stream inserts; atomic-swap at the end ──
    const Database = (await import('better-sqlite3')).default;
    await fs.mkdir(path.dirname(DB_PATHS.codebase), { recursive: true });
    const tmpPath = DB_PATHS.codebase + '.tmp';
    for (const p of [tmpPath, tmpPath + '-wal', tmpPath + '-shm']) {
      try { await fs.unlink(p); } catch { /* absent */ }
    }

    let embeddingCount = 0;
    const vdb = new Database(tmpPath);
    try {
      configureJournalMode(vdb, tmpPath, sqliteFastMode);
      createVectorSchema(vdb);

      const isLocal = modelInfo.provider === 'local';
      const writeFlushRows = EMBEDDING_CONFIG.indexerWriteFlushRows;
      const embeddingOptions = { useCache: false };
      let effectiveDim = modelInfo.dimension;
      if (modelInfo.isRemote) {
        const configuredOutputDim = parseInt(
          process.env.SWEET_SEARCH_INDEXING_OUTPUT_DIMENSION || `${modelInfo.hnswDimension}`, 10);
        if (Number.isFinite(configuredOutputDim) && configuredOutputDim > 0 && configuredOutputDim <= modelInfo.dimension) {
          embeddingOptions.providerOptions = {
            outputDimension: configuredOutputDim,
            inputType: 'document',
            concurrency: parseInt(process.env.SWEET_SEARCH_EMBEDDING_CONCURRENCY || '4', 10),
          };
          effectiveDim = configuredOutputDim;
        }
      }

      // ── 3. EMBED exemplars (only exemplar seqs hydrated; bucketing UNCHANGED) ──
      let embeddedSoFar = 0;
      for (const win of fileWindows(exemplarSeqs, EMBED_WINDOW)) {
        const exemplars = hydrateSeqs(win);
        if (exemplars.length === 0) continue;
        const exemplarTexts = exemplars.map((c) => embedTextOf(c, cap));
        // Local models get the whole window in one call; remote providers use
        // the configured indexer batch size.
        const batchSize = isLocal ? exemplarTexts.length : EMBEDDING_CONFIG.indexerBatchSize;
        embeddingCount += await pipelinedEmbedAndInsert(
          vdb, exemplars, exemplarTexts, batchSize, modelInfo,
          (done) => logProgress(embeddedSoFar + done, exemplarSeqs.length, 'Embedding'),
          embeddingOptions, log, writeFlushRows,
        );
        embeddedSoFar += exemplars.length;
        if (wantLi) for (const c of exemplars) if (c.__liKeep) liExemplars.push(liLite(c, true));
      }
      checkpointWal(vdb);
      log(`\n✓ Generated ${embeddingCount} embeddings (${effectiveDim}d)`, 'green');

      // ── 4. ALIAS inserts (only alias seqs hydrated; exemplar vectors present) ──
      if (aliasSeqs.length > 0) {
        let aliasInserted = 0;
        for (const win of fileWindows(aliasSeqs, HYDRATE_CHUNK_WINDOW)) {
          const aliases = hydrateSeqs(win);
          if (aliases.length === 0) continue;
          aliasInserted += insertAliasVectors(vdb, aliases, modelInfo, { skipOrphanPurge: true });
          if (wantLi) for (const c of aliases) if (c.__liKeep) liAliases.push(liLite(c, !c.metadata?.liReuseEligible));
        }
        log(` ✓ Inserted ${aliasInserted} alias vector(s) (embeddings copied from exemplars)`, 'dim');
      }

      checkpointWal(vdb);
      try { vdb.pragma('optimize'); } catch { /* best effort */ }
      vdb.close();
    } catch (err) {
      // Release the tmp database handle before propagating, so a failed build
      // does not leak it (an open handle can also block the next run's unlink
      // of codebase.db.tmp). Closing an already-closed handle is ignored.
      try { vdb.close(); } catch { /* already closed or unusable */ }
      throw err;
    }

    await atomicSwapDatabase(tmpPath, DB_PATHS.codebase);
    const vstat = await fs.stat(DB_PATHS.codebase);
    log(`✓ Saved codebase.db (${(vstat.size / 1024 / 1024).toFixed(2)} MB, ${totalChunks} vectors)`, 'green');

    const vectorStats = { chunks: totalChunks, embeddings: embeddingCount };

    // ── 5. LATE INTERACTION (LI-lite input + bounded build mode) ──
    // liExemplars/liAliases were assembled during the embed/alias passes above,
    // so no third parse pass. buildLateInteractionIndex partitions by
    // metadata.exemplarId — order in the input array doesn't matter.
    let lateInteractionResult = null;
    let liBuilt = false;
    if (wantLi) {
      const liChunks = liExemplars.concat(liAliases);

      try {
        lateInteractionResult = await buildLateInteractionIndex(liChunks, false, [], {
          poolFactor: li.poolFactor ?? 1,
          extendedSkiplist: li.extendedSkiplist ?? false,
          loadFromPath: li.loadFromPath ?? DB_PATHS.lateInteraction,
          saveToPath: li.saveToPath,
          finalIndexPath: li.finalIndexPath ?? DB_PATHS.lateInteraction,
          stagingSegmentDir: li.stagingSegmentDir,
          fullRebuild: true,
          workerCount: li.workerCount ?? 1,
          threadsPerWorker: li.threadsPerWorker ?? 0,
          batchSize: li.batchSize ?? 8,
          batchSizeUpperCap: li.batchSizeUpperCap,
          tokenBudget: li.tokenBudget ?? 8192,
          attentionBudget: li.attentionBudget ?? null,
          projectRoot: PROJECT_ROOT,
          buildEvict: true,
          skipPolicyAlreadyApplied: true,
        });
        liBuilt = true;
      } catch (err) {
        // Non-fatal: vectors (codebase.db) are already committed above, so a
        // failed LI build must not lose them. The caller invalidates/cleans the
        // staged LI index and continues — same contract as the in-memory path.
        log(`Late interaction build failed (non-fatal): ${err.message}`, 'yellow');
        lateInteractionResult = { error: err.message, invalidated: true };
        liBuilt = false;
      }
    }

    return { vectorStats, lateInteractionResult, liBuilt };
  } finally {
    // Always dispose of the scratch spill store, on success and on failure.
    try { store.db.close(); } catch { /* ignore */ }
    for (const p of [store.storePath, store.storePath + '-wal', store.storePath + '-shm']) {
      try { await fs.unlink(p); } catch { /* absent */ }
    }
  }
}
|
|
@@ -16,21 +16,28 @@ function resolveProjectRoot() {
|
|
|
16
16
|
const fromEnv = process.env.SWEET_SEARCH_PROJECT_ROOT?.trim();
|
|
17
17
|
if (fromEnv) return path.resolve(fromEnv);
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
//
|
|
22
|
-
|
|
23
|
-
|
|
19
|
+
const cwd = process.cwd();
|
|
20
|
+
|
|
21
|
+
// Prefer an existing sweet-search state dir so indexed corpus subdirectories
|
|
22
|
+
// do not get pulled back to an outer package root.
|
|
23
|
+
for (let dir = cwd; ; dir = path.dirname(dir)) {
|
|
24
|
+
if (existsSync(path.join(dir, '.sweet-search'))) return dir;
|
|
25
|
+
const parent = path.dirname(dir);
|
|
26
|
+
if (parent === dir) break;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
// Fallback to the historical project markers for cold-start/init flows before
|
|
30
|
+
// .sweet-search/ exists.
|
|
31
|
+
for (let dir = cwd; ; dir = path.dirname(dir)) {
|
|
24
32
|
if (existsSync(path.join(dir, '.git')) || existsSync(path.join(dir, 'package.json'))) {
|
|
25
33
|
return dir;
|
|
26
34
|
}
|
|
27
35
|
const parent = path.dirname(dir);
|
|
28
36
|
if (parent === dir) break; // filesystem root
|
|
29
|
-
dir = parent;
|
|
30
37
|
}
|
|
31
38
|
|
|
32
39
|
// Fallback to cwd if no project marker found
|
|
33
|
-
return
|
|
40
|
+
return cwd;
|
|
34
41
|
}
|
|
35
42
|
|
|
36
43
|
// Project root detection
|
|
@@ -82,9 +89,6 @@ export const DB_PATHS = {
|
|
|
82
89
|
// Code graph (entities + relationships + FTS5 + summaries)
|
|
83
90
|
codeGraph: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'code-graph.db'),
|
|
84
91
|
|
|
85
|
-
// HNSW index (in-memory at query time)
|
|
86
|
-
hnswIndex: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'codebase-hnsw.idx'),
|
|
87
|
-
|
|
88
92
|
// Binary HNSW index (32x smaller, Hamming distance)
|
|
89
93
|
binaryHnswIndex: path.join(PROJECT_ROOT, DATA_DIR_NAME, 'codebase-binary-hnsw.idx'),
|
|
90
94
|
|
|
@@ -260,3 +260,40 @@ export function bestIntraOpThreads(options = {}) {
|
|
|
260
260
|
: effectiveCores - reserveCores;
|
|
261
261
|
return Math.max(1, Math.min(requested, maxThreads, logicalCores));
|
|
262
262
|
}
|
|
263
|
+
|
|
264
|
+
/**
 * Intra-op thread count for the BACKGROUND/maintainer ORT profile.
 *
 * The maintainer daemon trades a little throughput for not spiking every
 * P-core during an idle-time reconcile tick. Encoder-only INT8 GEMM recovers
 * ~85–90% throughput at 4 threads (RESEARCH §B), so we run the background
 * encoder at a clamped 2–4 threads regardless of how wide the box is. This is
 * intentionally distinct from {@link bestIntraOpThreads} (the foreground /
 * full-index path), which scales with the hardware.
 *
 * Reads the SWEET_SEARCH_INTRA_OP_THREADS override (shared with the foreground
 * helper), but always clamps the result into [2, 4]; on a box with fewer than
 * two (or four) logical cores the bounds shrink to the core count so we never
 * request more threads than exist. Affinity (E-core pinning) is deliberately
 * NOT attempted here: `intra_op_thread_affinities` is a no-op on macOS
 * (pthread_setaffinity_np unavailable) — E-core routing comes from
 * process-level taskpolicy -b (G5).
 *
 * The result is always a whole number: fractional `logicalCores` /
 * `targetThreads` hints are floored, and a NaN `logicalCores` hint falls back
 * to the detected core count instead of propagating NaN.
 *
 * Override: SWEET_SEARCH_INTRA_OP_THREADS=N (still clamped to 2–4).
 *
 * @param {object} [options]
 * @param {number} [options.logicalCores]  core count to assume (defaults to os.cpus().length)
 * @param {number} [options.targetThreads] requested thread count, used when no env override is set
 * @returns {number} integer thread count within the clamped range
 */
export function backgroundIntraOpThreads(options = {}) {
  const LO = 2;
  const HI = 4;

  // Only consult the OS when the caller gave no usable hint.
  const hinted = Number(options.logicalCores ?? Number.NaN);
  const detected = Number.isNaN(hinted) ? os.cpus().length : hinted;
  const logicalCores = Math.max(1, Math.floor(detected));

  const upper = Math.min(HI, logicalCores);
  const lower = Math.min(LO, upper);

  let requested = upper;
  const override = Number.parseInt(process.env.SWEET_SEARCH_INTRA_OP_THREADS ?? '', 10);
  if (Number.isFinite(override) && override > 0) {
    requested = override;
  } else if (Number.isFinite(options.targetThreads) && options.targetThreads > 0) {
    requested = Math.floor(options.targetThreads);
  }

  return Math.max(lower, Math.min(requested, upper));
}
|
|
@@ -58,7 +58,17 @@ export function listSparseGramDeltaSegments(baseArtifactPath, opts = {}) {
|
|
|
58
58
|
export function resolveLatestSparseGramDeltaRecords(baseArtifactPath, opts = {}) {
|
|
59
59
|
const latest = new Map();
|
|
60
60
|
for (const seg of listSparseGramDeltaSegments(baseArtifactPath, opts)) {
|
|
61
|
-
|
|
61
|
+
let raw;
|
|
62
|
+
try {
|
|
63
|
+
raw = fs.readFileSync(seg.path, 'utf-8');
|
|
64
|
+
} catch (err) {
|
|
65
|
+
// TOCTOU: a concurrent compaction/rotation can unlink a segment between
|
|
66
|
+
// listing (existsSync in parseDeltaSegment) and this read. A vanished
|
|
67
|
+
// segment is benign at query time — skip it rather than failing the whole
|
|
68
|
+
// overlay resolution. Surface any other error (EACCES, EISDIR, ...).
|
|
69
|
+
if (err && err.code === 'ENOENT') continue;
|
|
70
|
+
throw err;
|
|
71
|
+
}
|
|
62
72
|
for (const line of raw.split('\n')) {
|
|
63
73
|
const trimmed = line.trim();
|
|
64
74
|
if (!trimmed) continue;
|