sweet-search 2.5.13 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -9
- package/core/cli.js +41 -3
- package/core/embedding/embedding-local-model.js +106 -10
- package/core/embedding/embedding-service.js +59 -1
- package/core/embedding/model-client.mjs +257 -0
- package/core/embedding/model-server.mjs +217 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
- package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
- package/core/incremental-indexing/application/operator-cli.mjs +14 -5
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
- package/core/incremental-indexing/application/reconciler.mjs +87 -15
- package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
- package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
- package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
- package/core/indexing/artifact-builder.js +1 -1
- package/core/indexing/dedup/dedup-phase.js +36 -17
- package/core/indexing/dedup/exemplar-selector.js +5 -0
- package/core/indexing/index-codebase-v21.js +37 -14
- package/core/indexing/index-maintainer.mjs +337 -6
- package/core/indexing/indexer-ann.js +27 -434
- package/core/indexing/indexer-build.js +30 -14
- package/core/indexing/indexer-manifest.js +0 -3
- package/core/indexing/indexer-phases.js +101 -25
- package/core/indexing/maintainer-launcher.mjs +22 -0
- package/core/indexing/maintainer-watcher.mjs +397 -0
- package/core/indexing/os-priority.mjs +160 -0
- package/core/indexing/rss-budget.mjs +425 -0
- package/core/indexing/streaming-vectors.js +450 -0
- package/core/infrastructure/config/platform.js +14 -10
- package/core/infrastructure/onnx-session-utils.js +37 -0
- package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
- package/core/ranking/late-interaction-index.js +58 -7
- package/core/search/daemon-registry.js +199 -0
- package/core/search/search-read-semantic.js +9 -3
- package/core/search/search-semantic.js +6 -29
- package/core/search/search-server.js +527 -27
- package/core/search/session-daemon-prewarm.mjs +110 -1
- package/core/search/sweet-search.js +0 -38
- package/core/vector-store/binary-hnsw-index.js +692 -78
- package/core/vector-store/index.js +1 -4
- package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
- package/eval/agent-read-workflows/bin/ss-read +2 -0
- package/mcp/tool-handlers.js +1 -2
- package/package.json +11 -8
- package/scripts/uninstall.js +2 -0
- package/core/vector-store/hnsw-index.js +0 -751
|
@@ -3,25 +3,17 @@
|
|
|
3
3
|
* Extracted from index-codebase-v21.js for file size compliance (<500 lines).
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
import { existsSync
|
|
6
|
+
import { existsSync } from 'fs';
|
|
7
7
|
import path from 'path';
|
|
8
8
|
|
|
9
|
-
import { DB_PATHS,
|
|
9
|
+
import { DB_PATHS, BINARY_HNSW_CONFIG } from '../infrastructure/config/index.js';
|
|
10
10
|
import { chunkedIn } from '../infrastructure/db-utils.js';
|
|
11
|
-
import { HNSWIndex } from '../vector-store/hnsw-index.js';
|
|
12
11
|
import { LateInteractionIndex } from '../ranking/late-interaction-index.js';
|
|
13
|
-
import { truncateForHNSW, getEmbeddings, getModelInfo, fisherYatesShuffle } from '../embedding/embedding-service.js';
|
|
14
12
|
import { buildFromCodebaseDb as buildQuantizedArtifacts, shouldSkipArtifactRebuild, updateArtifactState, ARTIFACT_THRESHOLDS } from './artifact-builder.js';
|
|
15
13
|
import { log, logProgress } from './indexer-utils.js';
|
|
16
14
|
import { JAVA_FAMILY } from './ast-chunker.js';
|
|
17
15
|
import { isIndexAcceleratorAvailable } from './model-pool.js';
|
|
18
16
|
|
|
19
|
-
// =============================================================================
|
|
20
|
-
// DURABLE WRITE HELPERS (Phase E — fsync ordering for checkpoint safety)
|
|
21
|
-
// =============================================================================
|
|
22
|
-
|
|
23
|
-
const CHECKPOINT_INTERVAL_SEC = 30;
|
|
24
|
-
const MIN_VECTORS_BETWEEN_SAVES = 1000;
|
|
25
17
|
|
|
26
18
|
/**
|
|
27
19
|
* v6.2: language-family-conditioned LI input routing.
|
|
@@ -84,38 +76,6 @@ function firstSafeRelativePath(...candidates) {
|
|
|
84
76
|
return null;
|
|
85
77
|
}
|
|
86
78
|
|
|
87
|
-
function fsyncFile(filePath) {
|
|
88
|
-
const fd = openSync(filePath, 'r');
|
|
89
|
-
try { fsyncSync(fd); } finally { closeSync(fd); }
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
function fsyncDirectory(dirPath) {
|
|
93
|
-
try {
|
|
94
|
-
const fd = openSync(dirPath, 'r');
|
|
95
|
-
try { fsyncSync(fd); } finally { closeSync(fd); }
|
|
96
|
-
} catch (_err) {
|
|
97
|
-
// Directory fsync not supported on all platforms (Windows) — best effort
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
function writeCheckpointSidecar(sidecarPath, data) {
|
|
102
|
-
writeFileSync(sidecarPath, JSON.stringify(data, null, 2));
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
function readCheckpointSidecar(sidecarPath) {
|
|
106
|
-
if (!existsSync(sidecarPath)) return null;
|
|
107
|
-
try {
|
|
108
|
-
return JSON.parse(readFileSync(sidecarPath, 'utf-8'));
|
|
109
|
-
} catch (_err) { return null; }
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
function cleanupCheckpoint(indexPath) {
|
|
113
|
-
const checkpointPath = `${indexPath}.checkpoint`;
|
|
114
|
-
const sidecarPath = `${indexPath}.checkpoint.json`;
|
|
115
|
-
try { unlinkSync(checkpointPath); } catch (_e) { /* noop */ }
|
|
116
|
-
try { unlinkSync(sidecarPath); } catch (_e) { /* noop */ }
|
|
117
|
-
}
|
|
118
|
-
|
|
119
79
|
// =============================================================================
|
|
120
80
|
// SQLITE VECTOR STREAMING (Phase B — eliminates O(n*d) in-memory arrays)
|
|
121
81
|
// =============================================================================
|
|
@@ -156,68 +116,6 @@ function vectorIndexWhere(db, alias = '') {
|
|
|
156
116
|
return `${aliasFilterSql(alias)} AND ${liveVectorSql(db, alias)}`;
|
|
157
117
|
}
|
|
158
118
|
|
|
159
|
-
function* streamVectorsFromDb(db, _dim, order = 'sequential') {
|
|
160
|
-
const vectorWhere = vectorIndexWhere(db);
|
|
161
|
-
if (order !== 'sequential') {
|
|
162
|
-
db.exec('CREATE TEMP TABLE IF NOT EXISTS hnsw_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
|
|
163
|
-
db.exec('DELETE FROM hnsw_order');
|
|
164
|
-
|
|
165
|
-
const rowidRows = db
|
|
166
|
-
.prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
|
|
167
|
-
.all();
|
|
168
|
-
let indices = rowidRows.map((r) => r.rowid);
|
|
169
|
-
|
|
170
|
-
if (order === 'shuffle') {
|
|
171
|
-
fisherYatesShuffle(indices);
|
|
172
|
-
} else if (order === 'diversity') {
|
|
173
|
-
const pathRows = db
|
|
174
|
-
.prepare(`SELECT rowid, file_path FROM vectors WHERE ${vectorWhere} ORDER BY rowid`)
|
|
175
|
-
.all();
|
|
176
|
-
const filePaths = pathRows.map((r) => r.file_path);
|
|
177
|
-
const permutationPositions = diversityFirstPermutationRowids(filePaths);
|
|
178
|
-
indices = permutationPositions.map((pos) => pathRows[pos - 1]?.rowid).filter(Boolean);
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
const insertOrder = db.prepare('INSERT INTO hnsw_order (pos, vector_rowid) VALUES (?, ?)');
|
|
182
|
-
db.transaction(() => {
|
|
183
|
-
for (let pos = 0; pos < indices.length; pos++) {
|
|
184
|
-
insertOrder.run(pos, indices[pos]);
|
|
185
|
-
}
|
|
186
|
-
})();
|
|
187
|
-
|
|
188
|
-
const stmt = db.prepare(`
|
|
189
|
-
SELECT v.rowid as rowid, v.id, v.file_path, v.embedding, v.metadata
|
|
190
|
-
FROM hnsw_order o
|
|
191
|
-
JOIN vectors v ON v.rowid = o.vector_rowid
|
|
192
|
-
ORDER BY o.pos
|
|
193
|
-
`);
|
|
194
|
-
for (const row of stmt.iterate()) {
|
|
195
|
-
yield {
|
|
196
|
-
rowid: row.rowid,
|
|
197
|
-
id: row.id,
|
|
198
|
-
file: row.file_path,
|
|
199
|
-
embedding: new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4),
|
|
200
|
-
metadata: row.metadata ? JSON.parse(row.metadata) : {},
|
|
201
|
-
};
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
db.exec('DROP TABLE IF EXISTS temp.hnsw_order');
|
|
205
|
-
} else {
|
|
206
|
-
const stmt = db.prepare(
|
|
207
|
-
`SELECT rowid, id, file_path, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`,
|
|
208
|
-
);
|
|
209
|
-
for (const row of stmt.iterate()) {
|
|
210
|
-
yield {
|
|
211
|
-
rowid: row.rowid,
|
|
212
|
-
id: row.id,
|
|
213
|
-
file: row.file_path,
|
|
214
|
-
embedding: new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4),
|
|
215
|
-
metadata: row.metadata ? JSON.parse(row.metadata) : {},
|
|
216
|
-
};
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
119
|
/**
|
|
222
120
|
* Pure decision function — should the hybrid CPU+GPU LI dispatcher arm?
|
|
223
121
|
*
|
|
@@ -362,327 +260,6 @@ function buildLateInteractionBatches(chunks, options = {}) {
|
|
|
362
260
|
return batches;
|
|
363
261
|
}
|
|
364
262
|
|
|
365
|
-
/** Diversity-first permutation returning 1-based rowid indices */
|
|
366
|
-
function diversityFirstPermutationRowids(filePaths) {
|
|
367
|
-
const buckets = new Map();
|
|
368
|
-
for (let i = 0; i < filePaths.length; i++) {
|
|
369
|
-
const dir = filePaths[i] ? filePaths[i].replace(/\/[^/]+$/, '') : '_unknown';
|
|
370
|
-
if (!buckets.has(dir)) buckets.set(dir, []);
|
|
371
|
-
buckets.get(dir).push(i + 1); // 1-based rowid
|
|
372
|
-
}
|
|
373
|
-
const dirs = [...buckets.keys()];
|
|
374
|
-
fisherYatesShuffle(dirs);
|
|
375
|
-
const order = [];
|
|
376
|
-
let remaining = filePaths.length;
|
|
377
|
-
while (remaining > 0) {
|
|
378
|
-
for (const dir of dirs) {
|
|
379
|
-
const bucket = buckets.get(dir);
|
|
380
|
-
if (bucket.length > 0) { order.push(bucket.shift()); remaining--; }
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
return order;
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
// =============================================================================
|
|
387
|
-
// INSERTION ORDER TUNING
|
|
388
|
-
// =============================================================================
|
|
389
|
-
|
|
390
|
-
// NOTE: applyInsertionOrder and diversityFirstPermutation (in-memory array permutation)
|
|
391
|
-
// removed in Phase B. Insertion order is now handled via SQLite temp tables in
|
|
392
|
-
// streamVectorsFromDb() and diversityFirstPermutationRowids().
|
|
393
|
-
|
|
394
|
-
// =============================================================================
|
|
395
|
-
// PHASE 3: HNSW INDEX (Incremental)
|
|
396
|
-
// =============================================================================
|
|
397
|
-
|
|
398
|
-
export async function incrementalUpdateHNSW(dbPath, changedFiles, dryRun = false) {
|
|
399
|
-
log('\n━━━ Phase 4: HNSW Index (Incremental) ━━━', 'bright');
|
|
400
|
-
|
|
401
|
-
if (dryRun) {
|
|
402
|
-
log('DRY RUN: Skipping HNSW incremental update', 'magenta');
|
|
403
|
-
return;
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
const modelInfo = getModelInfo();
|
|
407
|
-
const hnswDim = modelInfo.hnswDimension;
|
|
408
|
-
|
|
409
|
-
log('Loading existing HNSW index...', 'yellow');
|
|
410
|
-
const index = new HNSWIndex({
|
|
411
|
-
dimension: hnswDim,
|
|
412
|
-
M: HNSW_CONFIG.M,
|
|
413
|
-
efConstruction: HNSW_CONFIG.efConstruction,
|
|
414
|
-
efSearch: HNSW_CONFIG.efSearch,
|
|
415
|
-
});
|
|
416
|
-
|
|
417
|
-
let existingCount = 0;
|
|
418
|
-
try {
|
|
419
|
-
await index.load();
|
|
420
|
-
existingCount = index.nextKey;
|
|
421
|
-
log(`✓ Loaded existing index with ${existingCount} vectors`, 'green');
|
|
422
|
-
} catch (err) {
|
|
423
|
-
log(`No existing index found, creating new one`, 'yellow');
|
|
424
|
-
await index.init();
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
let removed = 0;
|
|
428
|
-
if (changedFiles && changedFiles.length > 0) {
|
|
429
|
-
log(`Removing entries for ${changedFiles.length} changed files...`, 'yellow');
|
|
430
|
-
|
|
431
|
-
const changedFileSet = new Set(changedFiles);
|
|
432
|
-
const idsToRemove = [];
|
|
433
|
-
for (const [id, metadata] of index.metadata.entries()) {
|
|
434
|
-
if (metadata.file && changedFileSet.has(metadata.file)) {
|
|
435
|
-
idsToRemove.push(id);
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
for (const id of idsToRemove) {
|
|
440
|
-
await index.remove(id);
|
|
441
|
-
removed++;
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
log(`✓ Removed ${removed} old entries`, 'green');
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
// Read new vectors for changed files from SQLite
|
|
448
|
-
const Database = (await import('better-sqlite3')).default;
|
|
449
|
-
const db = new Database(dbPath, { readonly: true });
|
|
450
|
-
|
|
451
|
-
const changedFileList = [...new Set(changedFiles || [])];
|
|
452
|
-
// Chunk the IN(?,?,...) clause to stay under SQLite's bound-parameter
|
|
453
|
-
// limit (default 32766, historic floor 999). Without chunking, a single
|
|
454
|
-
// indexing pass over >~32k changed files crashes with "too many SQL
|
|
455
|
-
// variables" — observed in production on CoSQA+ (51k docs) and BRIGHT
|
|
456
|
-
// (528k docs). See core/infrastructure/db-utils.js for the helper.
|
|
457
|
-
let rows = [];
|
|
458
|
-
if (changedFileList.length > 0) {
|
|
459
|
-
rows = chunkedIn(
|
|
460
|
-
db,
|
|
461
|
-
`SELECT rowid, id, file_path, embedding, metadata
|
|
462
|
-
FROM vectors
|
|
463
|
-
WHERE ${vectorIndexWhere(db)}
|
|
464
|
-
AND file_path IN (__IN_PLACEHOLDERS__)
|
|
465
|
-
ORDER BY rowid`,
|
|
466
|
-
changedFileList,
|
|
467
|
-
);
|
|
468
|
-
// Each batch is ORDER BY rowid internally, but batch boundaries break
|
|
469
|
-
// global monotonicity. The HNSW insertion loop below relies on rowid
|
|
470
|
-
// order for deterministic graph construction — re-sort explicitly.
|
|
471
|
-
rows.sort((a, b) => a.rowid - b.rowid);
|
|
472
|
-
}
|
|
473
|
-
const totalNew = rows.length;
|
|
474
|
-
|
|
475
|
-
log(`Adding ${totalNew} new entries...`, 'yellow');
|
|
476
|
-
let added = 0;
|
|
477
|
-
|
|
478
|
-
for (const row of rows) {
|
|
479
|
-
const embedding = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4);
|
|
480
|
-
if (!embedding || embedding.length === 0) continue;
|
|
481
|
-
|
|
482
|
-
const truncatedEmbedding = truncateForHNSW(embedding);
|
|
483
|
-
const metadata = row.metadata ? JSON.parse(row.metadata) : {};
|
|
484
|
-
|
|
485
|
-
await index.add(row.id, truncatedEmbedding, {
|
|
486
|
-
file: row.file_path,
|
|
487
|
-
name: metadata?.symbol,
|
|
488
|
-
type: metadata?.chunk_type,
|
|
489
|
-
});
|
|
490
|
-
|
|
491
|
-
added++;
|
|
492
|
-
|
|
493
|
-
if (added % 500 === 0 || added === totalNew) {
|
|
494
|
-
logProgress(added, totalNew, 'Adding to HNSW');
|
|
495
|
-
}
|
|
496
|
-
}
|
|
497
|
-
|
|
498
|
-
db.close();
|
|
499
|
-
|
|
500
|
-
log('\nSaving merged HNSW index...', 'yellow');
|
|
501
|
-
await index.save();
|
|
502
|
-
|
|
503
|
-
const stats = index.getStats();
|
|
504
|
-
log(`✓ HNSW index saved (${stats.totalVectors} total vectors, +${added} -${removed})`, 'green');
|
|
505
|
-
log(` Engine: ${stats.engine}, Dimension: ${hnswDim}d (Matryoshka)`, 'dim');
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
// =============================================================================
|
|
509
|
-
// PHASE 3: HNSW INDEX (Full Rebuild)
|
|
510
|
-
// =============================================================================
|
|
511
|
-
|
|
512
|
-
export async function buildHNSWIndex(dbPath, dryRun = false) {
|
|
513
|
-
log('\n━━━ Phase 4: HNSW Index ━━━', 'bright');
|
|
514
|
-
|
|
515
|
-
if (dryRun) {
|
|
516
|
-
log('DRY RUN: Skipping HNSW index', 'magenta');
|
|
517
|
-
return;
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
const Database = (await import('better-sqlite3')).default;
|
|
521
|
-
const orderMode = BINARY_HNSW_CONFIG.insertionOrder || 'sequential';
|
|
522
|
-
// Non-sequential orders require temp tables → can't use readonly
|
|
523
|
-
const db = new Database(dbPath, orderMode === 'sequential' ? { readonly: true } : {});
|
|
524
|
-
|
|
525
|
-
const totalVectors = db
|
|
526
|
-
.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorIndexWhere(db)}`)
|
|
527
|
-
.get().c;
|
|
528
|
-
if (totalVectors === 0) {
|
|
529
|
-
db.close();
|
|
530
|
-
log('No chunks to index', 'yellow');
|
|
531
|
-
return;
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
const modelInfo = getModelInfo();
|
|
535
|
-
const hnswDim = modelInfo.hnswDimension;
|
|
536
|
-
|
|
537
|
-
const index = new HNSWIndex({
|
|
538
|
-
dimension: hnswDim,
|
|
539
|
-
M: HNSW_CONFIG.M,
|
|
540
|
-
efConstruction: HNSW_CONFIG.efConstruction,
|
|
541
|
-
efSearch: HNSW_CONFIG.efSearch,
|
|
542
|
-
maxElements: Math.max(totalVectors * 2, HNSW_CONFIG.maxElements),
|
|
543
|
-
});
|
|
544
|
-
|
|
545
|
-
// Checkpoint resume is only safe with sequential order — non-sequential
|
|
546
|
-
// orders shuffle the stream so rowid is not a reliable resume boundary.
|
|
547
|
-
const canCheckpoint = orderMode === 'sequential';
|
|
548
|
-
|
|
549
|
-
const indexPath = DB_PATHS.hnswIndex;
|
|
550
|
-
const usearchPath = indexPath.replace('.idx', '.usearch');
|
|
551
|
-
const checkpointPath = `${usearchPath}.checkpoint`;
|
|
552
|
-
const sidecarPath = `${usearchPath}.checkpoint.json`;
|
|
553
|
-
const sidecar = canCheckpoint ? readCheckpointSidecar(sidecarPath) : null;
|
|
554
|
-
|
|
555
|
-
let resumeFromRowId = 0;
|
|
556
|
-
|
|
557
|
-
await index.init();
|
|
558
|
-
|
|
559
|
-
if (sidecar && existsSync(checkpointPath)) {
|
|
560
|
-
try {
|
|
561
|
-
if (index.index) {
|
|
562
|
-
// Load raw USearch graph from checkpoint
|
|
563
|
-
index.index.load(checkpointPath);
|
|
564
|
-
resumeFromRowId = sidecar.lastRowId || 0;
|
|
565
|
-
|
|
566
|
-
// Rebuild JS-side metadata (idMap, reverseMap, metadata, nextKey) for
|
|
567
|
-
// vectors already in the checkpoint. Without this, add() reuses keys
|
|
568
|
-
// from 0 and the final .meta.json would be incomplete.
|
|
569
|
-
const metaStmt = db.prepare(
|
|
570
|
-
`SELECT id, file_path, metadata
|
|
571
|
-
FROM vectors
|
|
572
|
-
WHERE rowid <= ? AND ${vectorIndexWhere(db)}
|
|
573
|
-
ORDER BY rowid`
|
|
574
|
-
);
|
|
575
|
-
let restoredKey = 0;
|
|
576
|
-
for (const row of metaStmt.iterate(resumeFromRowId)) {
|
|
577
|
-
const meta = row.metadata ? JSON.parse(row.metadata) : {};
|
|
578
|
-
const key = restoredKey++;
|
|
579
|
-
index.idMap.set(row.id, key);
|
|
580
|
-
index.reverseMap.set(key, row.id);
|
|
581
|
-
index.metadata.set(row.id, {
|
|
582
|
-
file: row.file_path,
|
|
583
|
-
name: meta?.symbol,
|
|
584
|
-
type: meta?.chunk_type,
|
|
585
|
-
});
|
|
586
|
-
}
|
|
587
|
-
index.nextKey = restoredKey;
|
|
588
|
-
|
|
589
|
-
log(`Resuming from checkpoint: ${sidecar.vectorsAdded} vectors, skipping rowid <= ${resumeFromRowId}`, 'green');
|
|
590
|
-
}
|
|
591
|
-
} catch (err) {
|
|
592
|
-
log(`Checkpoint found but could not load, starting fresh: ${err.message}`, 'yellow');
|
|
593
|
-
resumeFromRowId = 0;
|
|
594
|
-
// Reset any partial metadata restoration
|
|
595
|
-
index.idMap.clear();
|
|
596
|
-
index.reverseMap.clear();
|
|
597
|
-
index.metadata.clear();
|
|
598
|
-
index.nextKey = 0;
|
|
599
|
-
}
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
// Discard stale checkpoint from a previous non-sequential build
|
|
603
|
-
if (!canCheckpoint) {
|
|
604
|
-
cleanupCheckpoint(usearchPath);
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
log(`Building HNSW index (${modelInfo.dimension}d → ${hnswDim}d Matryoshka, M=${HNSW_CONFIG.M}, order=${orderMode})...`, 'yellow');
|
|
608
|
-
|
|
609
|
-
let added = resumeFromRowId > 0 ? (sidecar?.vectorsAdded || 0) : 0;
|
|
610
|
-
let lastCheckpointTime = Date.now();
|
|
611
|
-
let vectorsSinceCheckpoint = 0;
|
|
612
|
-
|
|
613
|
-
// try/finally guarantees the DB handle closes and stale checkpoint files
|
|
614
|
-
// get cleaned up even when the build loop throws. Without this, a failed
|
|
615
|
-
// build leaves .checkpoint + .checkpoint.json on disk and the NEXT run
|
|
616
|
-
// silently resumes from an indeterminate state (M5 fix).
|
|
617
|
-
let buildCompleted = false;
|
|
618
|
-
try {
|
|
619
|
-
for (const row of streamVectorsFromDb(db, hnswDim, orderMode)) {
|
|
620
|
-
// Skip already-checkpointed vectors on resume (only valid for sequential order)
|
|
621
|
-
if (resumeFromRowId > 0 && row.rowid <= resumeFromRowId) continue;
|
|
622
|
-
|
|
623
|
-
if (!row.embedding || row.embedding.length === 0) continue;
|
|
624
|
-
|
|
625
|
-
const truncatedEmbedding = truncateForHNSW(row.embedding);
|
|
626
|
-
|
|
627
|
-
await index.add(row.id, truncatedEmbedding, {
|
|
628
|
-
file: row.file,
|
|
629
|
-
name: row.metadata?.symbol,
|
|
630
|
-
type: row.metadata?.chunk_type,
|
|
631
|
-
});
|
|
632
|
-
|
|
633
|
-
added++;
|
|
634
|
-
vectorsSinceCheckpoint++;
|
|
635
|
-
|
|
636
|
-
// Time-based checkpoint: bounded data loss on crash (~30s max)
|
|
637
|
-
// Only for sequential order where rowid-based resume is valid.
|
|
638
|
-
if (canCheckpoint) {
|
|
639
|
-
const elapsed = (Date.now() - lastCheckpointTime) / 1000;
|
|
640
|
-
if (elapsed >= CHECKPOINT_INTERVAL_SEC && vectorsSinceCheckpoint >= MIN_VECTORS_BETWEEN_SAVES) {
|
|
641
|
-
if (!index.useFallback && index.index) {
|
|
642
|
-
index.index.save(checkpointPath);
|
|
643
|
-
fsyncFile(checkpointPath);
|
|
644
|
-
writeCheckpointSidecar(sidecarPath, {
|
|
645
|
-
vectorsAdded: added,
|
|
646
|
-
lastRowId: row.rowid,
|
|
647
|
-
version: row.rowid,
|
|
648
|
-
timestamp: new Date().toISOString(),
|
|
649
|
-
elapsedMs: Date.now() - lastCheckpointTime,
|
|
650
|
-
});
|
|
651
|
-
fsyncFile(sidecarPath);
|
|
652
|
-
fsyncDirectory(path.dirname(checkpointPath));
|
|
653
|
-
if (process.env.DEBUG) log(` checkpoint: ${added}/${totalVectors} vectors`, 'dim');
|
|
654
|
-
}
|
|
655
|
-
lastCheckpointTime = Date.now();
|
|
656
|
-
vectorsSinceCheckpoint = 0;
|
|
657
|
-
}
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
if (added % 500 === 0 || added === totalVectors) {
|
|
661
|
-
logProgress(added, totalVectors, 'Building HNSW');
|
|
662
|
-
}
|
|
663
|
-
}
|
|
664
|
-
|
|
665
|
-
await index.save();
|
|
666
|
-
await index.clearStaleBitmap();
|
|
667
|
-
buildCompleted = true;
|
|
668
|
-
|
|
669
|
-
// Clean up checkpoint files after successful completion
|
|
670
|
-
cleanupCheckpoint(usearchPath);
|
|
671
|
-
|
|
672
|
-
const stats = index.getStats();
|
|
673
|
-
log(`\n✓ HNSW index built: ${stats.totalVectors} vectors (${hnswDim}d)`, 'green');
|
|
674
|
-
log(` Using fallback: ${stats.useFallback}`, 'dim');
|
|
675
|
-
} finally {
|
|
676
|
-
try { db.close(); } catch (_err) { /* already closed */ }
|
|
677
|
-
if (!buildCompleted) {
|
|
678
|
-
// Build threw mid-stream. Remove stale checkpoint files so the next
|
|
679
|
-
// run starts from a known-good "no-resume" state rather than
|
|
680
|
-
// resuming against a different/new vector DB.
|
|
681
|
-
cleanupCheckpoint(usearchPath);
|
|
682
|
-
}
|
|
683
|
-
}
|
|
684
|
-
}
|
|
685
|
-
|
|
686
263
|
// =============================================================================
|
|
687
264
|
// PHASE 4: LATE INTERACTION INDEX
|
|
688
265
|
// =============================================================================
|
|
@@ -704,6 +281,14 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
704
281
|
attentionBudget = null,
|
|
705
282
|
segmentSize = null, // override SSLX-v3 segment threshold (default 10k)
|
|
706
283
|
projectRoot, // honored by LI skip policy for .sweet-search.config.json excludes
|
|
284
|
+
// Bounded-memory build (streaming path): evict each flushed segment's
|
|
285
|
+
// per-token slabs from the index's in-memory map so peak heap stays
|
|
286
|
+
// O(one segment) on huge repos. Safe only for from-scratch full rebuilds.
|
|
287
|
+
buildEvict = false,
|
|
288
|
+
// The streaming caller applies the LI skip policy once during its spill
|
|
289
|
+
// pass (where chunk content is in hand), so skip it here to avoid needing
|
|
290
|
+
// full chunk content resident a second time.
|
|
291
|
+
skipPolicyAlreadyApplied = false,
|
|
707
292
|
} = options;
|
|
708
293
|
log('\n━━━ Phase 3: Late Interaction Index (LateOn-Code) ━━━', 'bright');
|
|
709
294
|
|
|
@@ -718,7 +303,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
718
303
|
// LI-specific check globs can't do: content-based @generated markers.
|
|
719
304
|
// Disable via SWEET_SEARCH_LI_SKIP_DISABLE=1.
|
|
720
305
|
let skippedSummary = null;
|
|
721
|
-
if (Array.isArray(chunks) && chunks.length > 0) {
|
|
306
|
+
if (!skipPolicyAlreadyApplied && Array.isArray(chunks) && chunks.length > 0) {
|
|
722
307
|
const { applyIndexingChunkPolicy } = await import('./indexing-file-policy.js');
|
|
723
308
|
const { kept, stats } = applyIndexingChunkPolicy(chunks, { projectRoot });
|
|
724
309
|
if (stats.totalSkipped > 0) {
|
|
@@ -761,6 +346,7 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
761
346
|
modelId: LATE_INTERACTION_CONFIG.model,
|
|
762
347
|
indexPath: fullRebuild ? saveToPath : loadFromPath,
|
|
763
348
|
loadExisting: !fullRebuild,
|
|
349
|
+
buildEvict: buildEvict && fullRebuild,
|
|
764
350
|
...(segmentSize ? { segmentSize } : {}),
|
|
765
351
|
});
|
|
766
352
|
if (quantBits !== defaultQuantBits || whtSeed !== 0) {
|
|
@@ -1091,7 +677,10 @@ export async function buildLateInteractionIndex(chunks, dryRun = false, filesToR
|
|
|
1091
677
|
const exemplarId = alias.metadata?.exemplarId;
|
|
1092
678
|
const clusterId = alias.metadata?.clusterId;
|
|
1093
679
|
if (!exemplarId || !clusterId) continue;
|
|
1094
|
-
|
|
680
|
+
// hasDoc() (not documents.has()) so alias registration stays valid in
|
|
681
|
+
// bounded build mode, where the exemplar's per-token slab may already
|
|
682
|
+
// have been flushed to a segment and evicted from the live map.
|
|
683
|
+
if (!liIndex.hasDoc(exemplarId)) {
|
|
1095
684
|
orphaned++;
|
|
1096
685
|
continue;
|
|
1097
686
|
}
|
|
@@ -1148,18 +737,22 @@ export async function buildQuantizedArtifactsPhase(dryRun = false, options = {})
|
|
|
1148
737
|
|
|
1149
738
|
const skipCheck = await shouldSkipArtifactRebuild({ changedFiles, force });
|
|
1150
739
|
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
740
|
+
// usearch float HNSW was removed (commit c2a9817) — the binary HNSW is now
|
|
741
|
+
// the ONLY semantic search surface, and search dispatches to it whenever the
|
|
742
|
+
// artifact exists. So we can NO LONGER defer its rebuild on a sub-threshold
|
|
743
|
+
// change: that left vectors freshly committed to codebase.db invisible to
|
|
744
|
+
// 3-stage search until the next rebuild fired (the staleness Codex caught).
|
|
745
|
+
// Any actual change must rebuild the binary artifact to stay consistent with
|
|
746
|
+
// codebase.db; only a genuine no-op run (0 changed files) may skip. (The
|
|
747
|
+
// default daemon reconcile path maintains this per-tick via applyBinaryHNSWDelta.)
|
|
748
|
+
if (skipCheck.shouldSkip && (Number(changedFiles) || 0) === 0) {
|
|
749
|
+
log('Skipping binary artifacts: no files changed since last rebuild', 'dim');
|
|
1156
750
|
await updateArtifactState({
|
|
1157
751
|
rebuilt: false,
|
|
1158
752
|
changedFiles,
|
|
1159
753
|
previousState: skipCheck.state,
|
|
1160
754
|
});
|
|
1161
|
-
|
|
1162
|
-
return { binaryHnsw: null, int8: null, skipped: true, reason: skipCheck.reason };
|
|
755
|
+
return { binaryHnsw: null, int8: null, skipped: true, reason: 'no-changes' };
|
|
1163
756
|
}
|
|
1164
757
|
|
|
1165
758
|
log('Building quantized artifacts from codebase.db...', 'yellow');
|
|
@@ -160,8 +160,11 @@ export async function buildCodeGraph(files, dryRun = false) {
|
|
|
160
160
|
const content = await fs.readFile(filePath, 'utf-8');
|
|
161
161
|
const { entities, relationships } = await extractor.extractFromFile(files[i], content);
|
|
162
162
|
|
|
163
|
-
|
|
164
|
-
|
|
163
|
+
// Element-wise append, not push(...spread): a single generated mega-file
|
|
164
|
+
// (e.g. libsql's 250k-line SQLite amalgamation) can yield 65k+ entities,
|
|
165
|
+
// and spreading that many args into push() overflows the call stack.
|
|
166
|
+
for (let k = 0; k < entities.length; k++) entityBatch.push(entities[k]);
|
|
167
|
+
for (let k = 0; k < relationships.length; k++) relBatch.push(relationships[k]);
|
|
165
168
|
processed++;
|
|
166
169
|
} catch (err) {
|
|
167
170
|
errors++;
|
|
@@ -424,7 +427,7 @@ function prepareVectorInsert(db) {
|
|
|
424
427
|
* call this AFTER pipelinedEmbedAndInsert has written the exemplar rows.
|
|
425
428
|
* Returns the number of alias rows inserted.
|
|
426
429
|
*/
|
|
427
|
-
export function insertAliasVectors(db, aliases, modelInfo) {
|
|
430
|
+
export function insertAliasVectors(db, aliases, modelInfo, options = {}) {
|
|
428
431
|
if (!aliases || aliases.length === 0) return 0;
|
|
429
432
|
|
|
430
433
|
const fetchExemplar = db.prepare(
|
|
@@ -443,16 +446,24 @@ export function insertAliasVectors(db, aliases, modelInfo) {
|
|
|
443
446
|
// resolves to a live vectors row. This happens in incremental re-index
|
|
444
447
|
// when a file containing an exemplar is deleted but alias files in
|
|
445
448
|
// untouched paths still reference it.
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
449
|
+
//
|
|
450
|
+
// `skipOrphanPurge` is set by the streaming full-rebuild path, which calls
|
|
451
|
+
// this once per window into a FRESH temp db: there are no pre-existing rows
|
|
452
|
+
// to orphan, and the full-table json_extract scan would otherwise run once
|
|
453
|
+
// per window (O(windows × table)). A from-scratch build can never produce
|
|
454
|
+
// orphans, so skipping it is safe and keeps indexing fast.
|
|
455
|
+
if (!options.skipOrphanPurge) {
|
|
456
|
+
const orphanDelete = db.prepare(`
|
|
457
|
+
DELETE FROM vectors
|
|
458
|
+
WHERE json_extract(metadata, '$.exemplarId') IS NOT NULL
|
|
459
|
+
AND json_extract(metadata, '$.exemplarId') NOT IN (
|
|
460
|
+
SELECT id FROM vectors WHERE json_extract(metadata, '$.exemplarId') IS NULL
|
|
461
|
+
)
|
|
462
|
+
`);
|
|
463
|
+
const orphansRemoved = orphanDelete.run().changes;
|
|
464
|
+
if (orphansRemoved > 0) {
|
|
465
|
+
log(` ⚠ Purged ${orphansRemoved} orphan alias row(s) (exemplar absent)`, 'yellow');
|
|
466
|
+
}
|
|
456
467
|
}
|
|
457
468
|
|
|
458
469
|
const items = [];
|
|
@@ -584,7 +595,12 @@ export async function pipelinedEmbedAndInsert(db, allChunks, texts, batchSize, m
|
|
|
584
595
|
embeddingCount += batchEmbeddings.length;
|
|
585
596
|
|
|
586
597
|
const batchItems = buildInsertItems(batchChunks, batchEmbeddings, modelInfo, batchAnnotations);
|
|
587
|
-
writeBuffer.push(...batchItems)
|
|
598
|
+
// NOT `writeBuffer.push(...batchItems)`: for local models batchSize ==
|
|
599
|
+
// texts.length, so batchItems holds the WHOLE corpus in one batch. Spreading
|
|
600
|
+
// 100k+ args into push() overflows the call stack (V8 caps spread args at
|
|
601
|
+
// ~65k-125k) and crashed indexing on large repos (swc ~133k chunks, libsql).
|
|
602
|
+
// Append element-by-element so it stays O(n) and stack-safe at any size.
|
|
603
|
+
for (let k = 0; k < batchItems.length; k++) writeBuffer.push(batchItems[k]);
|
|
588
604
|
|
|
589
605
|
if (!useInternalProgress) {
|
|
590
606
|
logProgressFn(Math.min(i + batchSize, texts.length), texts.length, 'Embedding');
|
|
@@ -24,8 +24,6 @@ export function defaultIndexerManifestPaths() {
|
|
|
24
24
|
return {
|
|
25
25
|
codeGraph: basename(DB_PATHS.codeGraph),
|
|
26
26
|
vectors: basename(DB_PATHS.codebase),
|
|
27
|
-
hnsw: basename(DB_PATHS.hnswIndex),
|
|
28
|
-
hnswStale: basename(DB_PATHS.hnswIndex) + '.stale.bin',
|
|
29
27
|
binaryHnsw: basename(DB_PATHS.binaryHnswIndex),
|
|
30
28
|
liManifest: `${liBase}.segments/manifest.json`,
|
|
31
29
|
sparseBase: basename(DB_PATHS.sparseGramIndex),
|
|
@@ -45,7 +43,6 @@ export function publishIndexerManifest(options = {}) {
|
|
|
45
43
|
const defaultTiers = {
|
|
46
44
|
codeGraph: defaultManifest.codeGraph,
|
|
47
45
|
vectors: defaultManifest.vectors,
|
|
48
|
-
hnsw: defaultManifest.hnsw,
|
|
49
46
|
binaryHnsw: defaultManifest.binaryHnsw,
|
|
50
47
|
lateInteraction: defaultManifest.lateInteraction,
|
|
51
48
|
sparseGram: {
|