sweet-search 2.5.13 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -9
- package/core/cli.js +41 -3
- package/core/embedding/embedding-local-model.js +106 -10
- package/core/embedding/embedding-service.js +59 -1
- package/core/embedding/model-client.mjs +257 -0
- package/core/embedding/model-server.mjs +217 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
- package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
- package/core/incremental-indexing/application/operator-cli.mjs +14 -5
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
- package/core/incremental-indexing/application/reconciler.mjs +87 -15
- package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
- package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
- package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
- package/core/indexing/artifact-builder.js +1 -1
- package/core/indexing/dedup/dedup-phase.js +36 -17
- package/core/indexing/dedup/exemplar-selector.js +5 -0
- package/core/indexing/index-codebase-v21.js +37 -14
- package/core/indexing/index-maintainer.mjs +337 -6
- package/core/indexing/indexer-ann.js +27 -434
- package/core/indexing/indexer-build.js +30 -14
- package/core/indexing/indexer-manifest.js +0 -3
- package/core/indexing/indexer-phases.js +101 -25
- package/core/indexing/maintainer-launcher.mjs +22 -0
- package/core/indexing/maintainer-watcher.mjs +397 -0
- package/core/indexing/os-priority.mjs +160 -0
- package/core/indexing/rss-budget.mjs +425 -0
- package/core/indexing/streaming-vectors.js +450 -0
- package/core/infrastructure/config/platform.js +14 -10
- package/core/infrastructure/onnx-session-utils.js +37 -0
- package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
- package/core/ranking/late-interaction-index.js +58 -7
- package/core/search/daemon-registry.js +199 -0
- package/core/search/search-read-semantic.js +9 -3
- package/core/search/search-semantic.js +6 -29
- package/core/search/search-server.js +527 -27
- package/core/search/session-daemon-prewarm.mjs +110 -1
- package/core/search/sweet-search.js +0 -38
- package/core/vector-store/binary-hnsw-index.js +692 -78
- package/core/vector-store/index.js +1 -4
- package/eval/agent-read-workflows/bin/_ss-argparse.mjs +51 -5
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +95 -44
- package/eval/agent-read-workflows/bin/ss-read +2 -0
- package/mcp/tool-handlers.js +1 -2
- package/package.json +11 -8
- package/scripts/uninstall.js +2 -0
- package/core/vector-store/hnsw-index.js +0 -751
|
@@ -1,23 +1,34 @@
|
|
|
1
1
|
import fs from 'node:fs';
|
|
2
|
+
import os from 'node:os';
|
|
2
3
|
import path from 'node:path';
|
|
3
4
|
import { createHash } from 'node:crypto';
|
|
4
5
|
import Database from 'better-sqlite3';
|
|
5
6
|
|
|
6
7
|
import { Reconciler } from './reconciler.mjs';
|
|
7
|
-
import { enqueueMaintenanceJob } from './maintenance-worker.mjs';
|
|
8
|
+
import { enqueueMaintenanceJob, readMaintenanceQueue } from './maintenance-worker.mjs';
|
|
8
9
|
import { createAdmissionPolicy } from '../../indexing/admission-policy.js';
|
|
9
10
|
import { applyIndexingChunkPolicy } from '../../indexing/indexing-file-policy.js';
|
|
10
11
|
import { contentHashSync } from '../infrastructure/hashing.mjs';
|
|
11
12
|
import { readManifest, writeManifest } from '../infrastructure/manifest.mjs';
|
|
12
13
|
import { annotateChunksForDelta, snapshotFileRows, diffChunks, applyDiff } from '../infrastructure/vector-delta-writer.mjs';
|
|
13
14
|
import { appendDeltaRecord, FALLBACK_WEIGHTS_ID, fileIdFor, listDeltaSegments } from '../infrastructure/sparse-gram-delta.mjs';
|
|
14
|
-
import { fts5Merge } from '../infrastructure/sqlite-fts5.mjs';
|
|
15
|
-
import { insertEntity, insertRelationships, markBinaryStale, maintainFloatStore } from './production-reconciler-helpers.mjs';
|
|
15
|
+
import { fts5Merge, fts5MergeBudgetPages } from '../infrastructure/sqlite-fts5.mjs';
|
|
16
|
+
import { insertEntity, insertRelationships, markBinaryStale, maintainFloatStore, flushFloatStore } from './production-reconciler-helpers.mjs';
|
|
17
|
+
import {
|
|
18
|
+
chunkCutoffEnabled,
|
|
19
|
+
computeCutoffSignature,
|
|
20
|
+
signaturesMatch,
|
|
21
|
+
loadCutoffCache,
|
|
22
|
+
getFileSignature,
|
|
23
|
+
setFileSignature,
|
|
24
|
+
deleteFileSignature,
|
|
25
|
+
saveCutoffCache,
|
|
26
|
+
} from '../domain/cutoff-cache.mjs';
|
|
27
|
+
import { FloatVectorStore, getFloatStorePath } from '../../vector-store/float-vector-store.js';
|
|
16
28
|
import { createGraphSchema, GraphExtractor } from '../../graph/graph-extractor.js';
|
|
17
29
|
import { createVectorSchema, ensureVectorSchema, buildInsertItems, insertVectorItems } from '../../indexing/indexer-build.js';
|
|
18
30
|
import { ASTChunker, JAVA_FAMILY } from '../../indexing/ast-chunker.js';
|
|
19
31
|
import { getEmbeddings, getModelInfo } from '../../embedding/embedding-service.js';
|
|
20
|
-
import { HNSWIndex } from '../../vector-store/hnsw-index.js';
|
|
21
32
|
import { BinaryHNSWIndex } from '../../vector-store/binary-hnsw-index.js';
|
|
22
33
|
import { floatToBinary, normalizedFloatToInt8, truncateForHNSW } from '../../infrastructure/quantization.js';
|
|
23
34
|
import { extractSparseGramDeltaRecord } from '../../infrastructure/native-sparse-gram.js';
|
|
@@ -29,6 +40,149 @@ const PROCESSING_QUEUE = 'index-maintainer-queue.processing.jsonl';
|
|
|
29
40
|
const MERKLE_STATE = 'merkle-state.json';
|
|
30
41
|
const METRICS_FILE = 'reconcile-metrics.jsonl';
|
|
31
42
|
|
|
43
|
+
// ---- G2 lever flags ----------------------------------------------------------
// Every gate below re-reads `process.env` on each call.
//   flagOn(name)        — strict opt-in: true only when the value is exactly '1'.
//                         Used by levers that stay DEFAULT-OFF (a trade or
//                         unvalidated).
//   flagDefaultOn(name) — default-on: true unless the value is exactly '0'.
//                         Used by the PROVEN-safe levers; disable with `=0`.

/**
 * @param {string} name environment variable name
 * @returns {boolean} true only when the variable is exactly '1'
 */
function flagOn(name) {
  return process.env[name] === '1';
}

/**
 * @param {string} name environment variable name
 * @returns {boolean} true unless the variable is exactly '0'
 */
function flagDefaultOn(name) {
  return process.env[name] !== '0';
}

// DEFAULT-ON levers: batch tier writes, SQLite memory pragmas, budget-derived
// FTS5 merge. Each is disabled by setting its env var to '0'.
function batchTierWritesEnabled() {
  return flagDefaultOn('SWEET_SEARCH_RECONCILE_BATCH_TIER_WRITES');
}

function sqlitePragmasEnabled() {
  return flagDefaultOn('SWEET_SEARCH_RECONCILE_SQLITE_PRAGMAS');
}

function fts5BudgetEnabled() {
  return flagDefaultOn('SWEET_SEARCH_RECONCILE_FTS5_BUDGET');
}

// DEFAULT-OFF lever (freshness trade — defers HNSW disk saves so the on-disk
// graph lags the live one): strict opt-in with '1'.
function liveHnswEnabled() {
  return flagOn('SWEET_SEARCH_RECONCILE_LIVE_HNSW');
}
|
|
59
|
+
|
|
60
|
+
const BATCH_FLAG = 'SWEET_SEARCH_RECONCILE_BATCH_TIER_WRITES';
const DET_LEVELS_FLAG = 'SWEET_SEARCH_HNSW_DETERMINISTIC_LEVELS';

// Legacy one-time-warning latch. Nothing in this block sets it any more; it is
// only referenced (via `void`) so the binding stays in place.
let _batchForcedDetLevelsWarned = false;

/**
 * Guard against the one contradictory combination of the two HNSW-determinism
 * levers.
 *
 * Both levers are treated as default-on here: each counts as OFF only when its
 * env var is exactly '0'. The function reads the two env vars and never
 * mutates them.
 *
 *   - batch explicitly '0'                          → returns (no check made)
 *   - batch on (unset / anything but '0'), det-levels
 *     unset or anything but '0'                     → returns
 *   - batch on, det-levels explicitly '0'           → throws
 *
 * Per the original author's notes, batch tier writes only yield a
 * byte-identical graph when per-id deterministic levels are on (plan §0.5),
 * which is why that last combination is rejected loudly instead of ignored.
 *
 * @param {{warn?:Function}|null} [logger] accepted for signature
 *   compatibility; not used by the current implementation.
 * @returns {void}
 * @throws {Error} when batch is effectively on and det-levels is exactly '0'.
 */
export function normalizeHnswDeterminismFlags(logger = null) {
  void logger;
  const batchExplicitlyOff = process.env[BATCH_FLAG] === '0';
  const detLevelsExplicitlyOff = process.env[DET_LEVELS_FLAG] === '0';
  if (!batchExplicitlyOff && detLevelsExplicitlyOff) {
    const message = [
      `${BATCH_FLAG} is enabled (default-on) but requires ${DET_LEVELS_FLAG} to be `,
      `ON for a byte-identical HNSW graph — ${DET_LEVELS_FLAG} is explicitly '0'. `,
      `These are contradictory: batch tier writes only converge with the `,
      `per-file and compaction build paths when per-id deterministic levels are `,
      `enabled (see INDEX_MAINTAINER_EFFICIENCY_IMPLEMENTATION_PLAN §0.5). Either `,
      `leave ${DET_LEVELS_FLAG} default-on (omit it or set it to '1') or disable `,
      `the batch lever with ${BATCH_FLAG}=0.`,
    ].join('');
    throw new Error(message);
  }
  // Any other combination is already consistent; keep the latch referenced.
  void _batchForcedDetLevelsWarned;
}
|
|
125
|
+
|
|
126
|
+
// E.2: deletion-fraction threshold + insert cadence for the live (daemon-scoped)
// HNSW. Save to disk only on graceful shutdown, deletion-fraction >= this, or
// every N inserts.

/**
 * Parse a numeric tunable from an environment string.
 *
 * An unset or empty value yields `fallback`. A value the parser cannot read
 * (NaN) also yields `fallback`: a NaN threshold would make every `>=`
 * comparison against it false, silently disabling the threshold-triggered save.
 *
 * @param {string|undefined} raw raw environment value
 * @param {number} fallback value used when `raw` is missing or unparseable
 * @param {(text: string) => number} parse string-to-number parser
 * @returns {number}
 */
function parseLiveHnswTunable(raw, fallback, parse) {
  if (!raw) return fallback;
  const parsed = parse(raw);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const LIVE_HNSW_DELETION_FRACTION = parseLiveHnswTunable(
  process.env.SWEET_SEARCH_RECONCILE_LIVE_HNSW_DELETE_FRAC,
  0.15,
  Number.parseFloat,
);
const LIVE_HNSW_SAVE_EVERY_INSERTS = parseLiveHnswTunable(
  process.env.SWEET_SEARCH_RECONCILE_LIVE_HNSW_SAVE_EVERY,
  2000,
  (text) => Number.parseInt(text, 10),
);
|
|
131
|
+
|
|
132
|
+
/**
 * E.4: apply the memory-footprint pragmas to a SQLite connection.
 *
 * Does nothing when `sqlitePragmasEnabled()` is false. Otherwise it always
 * issues `cache_size = -32768` (a negative cache_size is a size in KiB, so
 * this caps the page cache at 32 MiB) and, for readonly connections only,
 * `mmap_size = 268435456` (256 MiB). Each pragma is best-effort: a failure is
 * swallowed and the next pragma is still attempted.
 *
 * `soft_heap_limit` is deliberately not set here; per the original notes it is
 * a process-wide knob owned by daemon startup.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {{readonly?: boolean}} [opts]
 * @returns {void}
 */
function applyMemoryPragmas(db, { readonly = false } = {}) {
  if (!sqlitePragmasEnabled()) return;
  const statements = ['cache_size = -32768'];
  // mmap_size is only requested on readonly maintainer connections.
  if (readonly) statements.push('mmap_size = 268435456');
  for (const statement of statements) {
    try {
      db.pragma(statement);
    } catch {
      // best-effort: an unsupported pragma must not break the caller
    }
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * E.2: module-level registry of live stores, shared by every reconciler
 * adapter constructed in this process. Keys are index paths (see
 * `_getLiveStore`); values are plain records:
 *   { indexPath, index, floatStore, insertsSinceSave, deletedCount,
 *     totalCount, dirty, loadPromise }
 * `index` / `floatStore` start out null and are filled in by `finalizeTick`.
 */
const liveStoreRegistry = new Map();

/**
 * Flush and release every live store.
 *
 * For each registry entry that is dirty and has an index, the index is saved
 * to `entry.indexPath`; if the entry also has a float store whose `loaded`
 * flag is truthy, that is saved to `getFloatStorePath(entry.indexPath)`. Save
 * failures are swallowed (best-effort shutdown flush). The entry is removed
 * from the registry whether or not the flush succeeded.
 *
 * @returns {Promise<void>}
 */
export async function shutdownLiveStores() {
  for (const [registryKey, store] of liveStoreRegistry) {
    try {
      const needsFlush = store.dirty && store.index;
      if (needsFlush) {
        await store.index.save(store.indexPath);
        const floats = store.floatStore;
        if (floats && floats.loaded) {
          await floats.save(getFloatStorePath(store.indexPath));
        }
      }
    } catch {
      /* best-effort flush on shutdown */
    }
    liveStoreRegistry.delete(registryKey);
  }
}
|
|
185
|
+
|
|
32
186
|
function relPath(projectRoot, filePath) {
|
|
33
187
|
const abs = path.isAbsolute(filePath) ? filePath : path.join(projectRoot, filePath);
|
|
34
188
|
const rel = path.relative(projectRoot, abs).replace(/\\/g, '/');
|
|
@@ -100,10 +254,21 @@ function pickLiInput(chunk) {
|
|
|
100
254
|
return chunk.li_greedy_text || chunk.embedding_text || chunk.li_text || chunk.text || chunk.content || '';
|
|
101
255
|
}
|
|
102
256
|
|
|
103
|
-
async function enrichChunksFromGraph(chunks, stateDir) {
|
|
257
|
+
async function enrichChunksFromGraph(chunks, stateDir, tickCtx = null) {
|
|
104
258
|
const dbPath = path.join(stateDir, 'code-graph.db');
|
|
105
|
-
if (
|
|
106
|
-
|
|
259
|
+
if (chunks.length === 0) return chunks;
|
|
260
|
+
// E.1: reuse the tick-scoped readonly connection when batching; else open a
|
|
261
|
+
// per-file readonly connection exactly as before.
|
|
262
|
+
let db;
|
|
263
|
+
let ownConn = false;
|
|
264
|
+
if (tickCtx?.graphRoDb) {
|
|
265
|
+
db = tickCtx.graphRoDb;
|
|
266
|
+
} else {
|
|
267
|
+
if (!fs.existsSync(dbPath)) return chunks;
|
|
268
|
+
db = new Database(dbPath, { readonly: true });
|
|
269
|
+
applyMemoryPragmas(db, { readonly: true });
|
|
270
|
+
ownConn = true;
|
|
271
|
+
}
|
|
107
272
|
try {
|
|
108
273
|
const entityStmt = db.prepare('SELECT type, name, start_line, end_line FROM entities WHERE file_path = ? AND epoch_retired IS NULL ORDER BY start_line ASC');
|
|
109
274
|
const fileEntityStmt = db.prepare('SELECT id FROM entities WHERE file_path = ? AND logical_entity_id = ? AND epoch_retired IS NULL ORDER BY epoch_written DESC LIMIT 1');
|
|
@@ -125,15 +290,35 @@ async function enrichChunksFromGraph(chunks, stateDir) {
|
|
|
125
290
|
} catch {
|
|
126
291
|
return chunks;
|
|
127
292
|
} finally {
|
|
128
|
-
db.close();
|
|
293
|
+
if (ownConn) db.close();
|
|
129
294
|
}
|
|
130
295
|
return chunks;
|
|
131
296
|
}
|
|
132
297
|
|
|
133
298
|
export function createProductionReconciler(options = {}) {
|
|
299
|
+
// Couple the batch + det-levels flags FIRST (before the adapter is built and
|
|
300
|
+
// before any tier write), so binary-hnsw-index.js — which reads
|
|
301
|
+
// SWEET_SEARCH_HNSW_DETERMINISTIC_LEVELS at insert time — never inserts under a
|
|
302
|
+
// contradictory flag pair. Nothing is forced or mutated: the call only throws when
|
|
303
|
+
// batch is effectively on (not '0') and det-levels is explicitly '0'. See `normalizeHnswDeterminismFlags`.
|
|
304
|
+
normalizeHnswDeterminismFlags(options.logger);
|
|
134
305
|
const projectRoot = path.resolve(options.projectRoot || process.env.SWEET_SEARCH_PROJECT_ROOT || process.cwd());
|
|
135
306
|
const stateDir = path.resolve(options.stateDir || process.env.SWEET_SEARCH_STATE_DIR || path.join(projectRoot, '.sweet-search'));
|
|
136
307
|
const adapter = new ProductionReconcileAdapter({ ...options, projectRoot, stateDir });
|
|
308
|
+
// A.4-config: feed the interval-autotune a real load signal. This is ONLY the
|
|
309
|
+
// config half — the daemon (G4) reads the tuned interval back into its sleep
|
|
310
|
+
// loop. DEFAULT-ON (disable with SWEET_SEARCH_RECONCILE_AUTOTUNE=0): when off,
|
|
311
|
+
// `autotuneInterval` stays false and the reconciler never re-tunes (today's
|
|
312
|
+
// fixed-interval behavior). Verified recall-neutral + soak == baseline.
|
|
313
|
+
const autotuneOn = flagDefaultOn('SWEET_SEARCH_RECONCILE_AUTOTUNE');
|
|
314
|
+
const cpuCount = Math.max(1, os.cpus().length);
|
|
315
|
+
const autotuneConfig = autotuneOn
|
|
316
|
+
? {
|
|
317
|
+
autotuneInterval: true,
|
|
318
|
+
cpuLoadAvg: os.loadavg()[0] / cpuCount,
|
|
319
|
+
maintenanceBacklog: adapter.maintenanceBacklog(),
|
|
320
|
+
}
|
|
321
|
+
: {};
|
|
137
322
|
return new Reconciler({
|
|
138
323
|
projectRoot,
|
|
139
324
|
stateDir,
|
|
@@ -142,6 +327,7 @@ export function createProductionReconciler(options = {}) {
|
|
|
142
327
|
config: {
|
|
143
328
|
filesPerTick: Number.parseInt(process.env.SWEET_SEARCH_RECONCILE_FILES_PER_TICK || '50', 10),
|
|
144
329
|
cpuBudgetMs: Number.parseInt(process.env.SWEET_SEARCH_RECONCILE_CPU_BUDGET_MS || '2000', 10),
|
|
330
|
+
...autotuneConfig,
|
|
145
331
|
...(options.config || {}),
|
|
146
332
|
},
|
|
147
333
|
logger: options.logger || console,
|
|
@@ -184,28 +370,311 @@ class ProductionReconcileAdapter {
|
|
|
184
370
|
this._liSkipFiles = new Set();
|
|
185
371
|
this.hashes = new Map();
|
|
186
372
|
this.touched = new Map();
|
|
373
|
+
// E.1: the active tick-scoped store context (null on the per-file path).
|
|
374
|
+
this._tickCtx = null;
|
|
375
|
+
// E.6: chunk-cutoff cache — loaded lazily at tick begin / first vector delta
|
|
376
|
+
// when the flag is on; null when disabled.
|
|
377
|
+
this._cutoffCache = null;
|
|
378
|
+
this._cutoffDirty = false;
|
|
187
379
|
}
|
|
188
380
|
|
|
189
381
|
progress(phase) {
|
|
190
382
|
this.onProgress?.(phase);
|
|
191
383
|
}
|
|
192
384
|
|
|
385
|
+
/**
|
|
386
|
+
* A.4-config: a coarse maintenance-backlog signal for the interval autotune —
|
|
387
|
+
* the depth of the rebuild queue. Best-effort; never throws.
|
|
388
|
+
* @returns {number}
|
|
389
|
+
*/
|
|
390
|
+
maintenanceBacklog() {
|
|
391
|
+
try { return readMaintenanceQueue(this.stateDir).length; } catch { return 0; }
|
|
392
|
+
}
|
|
393
|
+
|
|
193
394
|
adapters() {
|
|
194
|
-
|
|
395
|
+
const hooks = {
|
|
195
396
|
readDirtySet: () => this.readDirtySet(),
|
|
196
397
|
requeueDirtyFiles: (files) => this.requeueDirtyFiles(files),
|
|
197
398
|
hashFile: (file) => this.hashFile(file),
|
|
198
399
|
loadCurrentManifest: () => readManifest(this.stateDir),
|
|
199
400
|
persistManifest: (manifest) => this.persistManifest(manifest),
|
|
200
|
-
applyGraphDelta: (file, hashes, epoch) => this.applyGraphDelta(file, hashes, epoch),
|
|
201
|
-
applyVectorDelta: (file, chunks, hashes, epoch) => this.applyVectorDelta(file, chunks, hashes, epoch),
|
|
202
|
-
|
|
203
|
-
applyBinaryHNSWDelta: (file, ops, epoch) => this.applyBinaryHNSWDelta(file, ops, epoch),
|
|
401
|
+
applyGraphDelta: (file, hashes, epoch, ctx) => this.applyGraphDelta(file, hashes, epoch, ctx),
|
|
402
|
+
applyVectorDelta: (file, chunks, hashes, epoch, ctx) => this.applyVectorDelta(file, chunks, hashes, epoch, ctx),
|
|
403
|
+
applyBinaryHNSWDelta: (file, ops, epoch, ctx) => this.applyBinaryHNSWDelta(file, ops, epoch, ctx),
|
|
204
404
|
applyLIDelta: (file, ops, epoch) => this.applyLIDelta(file, ops, epoch),
|
|
205
405
|
applySparseGramDelta: (file, ops, epoch) => this.applySparseGramDelta(file, ops, epoch),
|
|
206
406
|
readMaintenanceState: () => this.readMaintenanceState(),
|
|
207
407
|
scheduleMaintenance: (job) => enqueueMaintenanceJob(this.stateDir, job),
|
|
208
408
|
};
|
|
409
|
+
// E.1: expose the batch lifecycle hooks ONLY when the flag is on, so the
|
|
410
|
+
// reconciler's `_batchTierWritesEnabled()` gate (which checks for the hooks)
|
|
411
|
+
// stays false by default and the per-file path is taken verbatim.
|
|
412
|
+
if (batchTierWritesEnabled()) {
|
|
413
|
+
hooks.beginTick = (info) => this.beginTick(info);
|
|
414
|
+
hooks.finalizeTick = (ctx, info) => this.finalizeTick(ctx, info);
|
|
415
|
+
hooks.disposeTick = (ctx) => this.disposeTick(ctx);
|
|
416
|
+
}
|
|
417
|
+
return hooks;
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
// ---- E.1/E.2 tick-scoped store context -----------------------------------
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* E.1: open the tick-scoped store context once at tick start. Opens RW
|
|
424
|
+
* `codebase.db` + `code-graph.db`, a RO `code-graph.db` for enrichment, loads
|
|
425
|
+
* the binary HNSW + float store once, and primes the cutoff cache (E.6).
|
|
426
|
+
*
|
|
427
|
+
* With E.2 (`SWEET_SEARCH_RECONCILE_LIVE_HNSW`) the HNSW + float store come
|
|
428
|
+
* from the daemon-scoped registry (loaded once, kept resident across ticks);
|
|
429
|
+
* otherwise they are loaded fresh and saved+closed at finalize.
|
|
430
|
+
*/
|
|
431
|
+
async beginTick() {
|
|
432
|
+
const codebaseDbPath = path.join(this.stateDir, 'codebase.db');
|
|
433
|
+
const graphDbPath = path.join(this.stateDir, 'code-graph.db');
|
|
434
|
+
const indexPath = path.join(this.stateDir, 'codebase-binary-hnsw.idx');
|
|
435
|
+
fs.mkdirSync(this.stateDir, { recursive: true });
|
|
436
|
+
|
|
437
|
+
const codebaseExisted = fs.existsSync(codebaseDbPath);
|
|
438
|
+
const codebaseDb = new Database(codebaseDbPath);
|
|
439
|
+
codebaseDb.pragma('journal_mode = WAL');
|
|
440
|
+
codebaseDb.pragma('synchronous = NORMAL');
|
|
441
|
+
applyMemoryPragmas(codebaseDb);
|
|
442
|
+
codebaseExisted ? ensureVectorSchema(codebaseDb) : createVectorSchema(codebaseDb);
|
|
443
|
+
|
|
444
|
+
const graphDb = new Database(graphDbPath);
|
|
445
|
+
graphDb.pragma('journal_mode = WAL');
|
|
446
|
+
graphDb.pragma('synchronous = NORMAL');
|
|
447
|
+
applyMemoryPragmas(graphDb);
|
|
448
|
+
const graphHasFts = createGraphSchema(graphDb);
|
|
449
|
+
migrateEntitiesSchema(graphDb);
|
|
450
|
+
migrateRelationshipsSchema(graphDb);
|
|
451
|
+
|
|
452
|
+
// Enrichment reads must observe THIS tick's graph writes. Because the
|
|
453
|
+
// batched path defers the SQLite COMMIT to finalize (persist-before-advance,
|
|
454
|
+
// see below), a SEPARATE readonly connection in WAL mode would NOT see the
|
|
455
|
+
// uncommitted in-tick graph rows. So enrichment reads from the SAME RW
|
|
456
|
+
// connection (`graphDb`) — a connection always sees its own uncommitted
|
|
457
|
+
// writes — preserving per-file enrichment semantics inside one tick.
|
|
458
|
+
const graphRoDb = graphDb;
|
|
459
|
+
|
|
460
|
+
// Resident HNSW + float store (E.1 load-once; E.2 daemon-scoped singleton).
|
|
461
|
+
const live = liveHnswEnabled() ? this._getLiveStore(indexPath) : null;
|
|
462
|
+
let index = live?.index || null;
|
|
463
|
+
let floatStore = live?.floatStore || null;
|
|
464
|
+
if (!index) {
|
|
465
|
+
index = new BinaryHNSWIndex({ indexPath, stalePath: `${indexPath}.stale.bin`, floatDimension: this.modelInfo.hnswDimension });
|
|
466
|
+
try { await index.load(indexPath); } catch { await index.init(); }
|
|
467
|
+
}
|
|
468
|
+
const binaryVectorsBefore = index.idToIndex?.size ?? 0;
|
|
469
|
+
if (!floatStore) {
|
|
470
|
+
floatStore = new FloatVectorStore();
|
|
471
|
+
try { await floatStore.loadOrInit(getFloatStorePath(indexPath), this.modelInfo.hnswDimension); } catch { /* fall back to fresh */ }
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
if (chunkCutoffEnabled() && !this._cutoffCache) {
|
|
475
|
+
this._cutoffCache = loadCutoffCache(this.stateDir);
|
|
476
|
+
this._cutoffDirty = false;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
// E.1 PERSIST-BEFORE-ADVANCE: defer the SQLite COMMIT to finalizeTick. Open
|
|
480
|
+
// an explicit outer transaction on each RW connection now; the per-file
|
|
481
|
+
// `db.transaction(fn)()` calls inside apply*Delta then run as SAVEPOINTs
|
|
482
|
+
// (better-sqlite3 nests automatically) and only become durable when
|
|
483
|
+
// finalizeTick COMMITs — which it does ONLY after the HNSW + float batch
|
|
484
|
+
// save fsyncs. A crash/throw before that point rolls the whole tick's SQLite
|
|
485
|
+
// writes back, so a restart re-reconciles from a consistent prior state and
|
|
486
|
+
// can never leave a SQLite-live row missing from the HNSW.
|
|
487
|
+
codebaseDb.exec('BEGIN');
|
|
488
|
+
graphDb.exec('BEGIN');
|
|
489
|
+
|
|
490
|
+
const ctx = {
|
|
491
|
+
indexPath,
|
|
492
|
+
tickStartMs: Date.now(),
|
|
493
|
+
txOpen: true,
|
|
494
|
+
codebaseDb,
|
|
495
|
+
graphDb,
|
|
496
|
+
graphRoDb,
|
|
497
|
+
graphHasFts,
|
|
498
|
+
index,
|
|
499
|
+
floatStore,
|
|
500
|
+
binaryVectorsBefore,
|
|
501
|
+
live: !!live,
|
|
502
|
+
// Accumulated across the tick:
|
|
503
|
+
floatUpserts: [],
|
|
504
|
+
floatRemoveIds: [],
|
|
505
|
+
append: 0,
|
|
506
|
+
tombstone: 0,
|
|
507
|
+
// Files whose ops are staged in this batch (provisional → promoted to
|
|
508
|
+
// merkle only after finalize fsyncs).
|
|
509
|
+
persistedFiles: new Set(),
|
|
510
|
+
pendingAdds: [],
|
|
511
|
+
};
|
|
512
|
+
this._tickCtx = ctx;
|
|
513
|
+
this._lastPersistedFiles = ctx.persistedFiles;
|
|
514
|
+
return ctx;
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
/**
|
|
518
|
+
* E.2: fetch (or lazily create) the daemon-scoped resident store entry.
|
|
519
|
+
*/
|
|
520
|
+
_getLiveStore(indexPath) {
|
|
521
|
+
let entry = liveStoreRegistry.get(indexPath);
|
|
522
|
+
if (!entry) {
|
|
523
|
+
entry = {
|
|
524
|
+
indexPath,
|
|
525
|
+
index: null,
|
|
526
|
+
floatStore: null,
|
|
527
|
+
insertsSinceSave: 0,
|
|
528
|
+
deletedCount: 0,
|
|
529
|
+
totalCount: 0,
|
|
530
|
+
dirty: false,
|
|
531
|
+
loadPromise: null,
|
|
532
|
+
};
|
|
533
|
+
liveStoreRegistry.set(indexPath, entry);
|
|
534
|
+
}
|
|
535
|
+
return entry;
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
/**
|
|
539
|
+
* E.1 PERSIST-BEFORE-ADVANCE: save the batched HNSW + float store once, fsync,
|
|
540
|
+
* then (per E.4) shrink_memory + wal_checkpoint(PASSIVE) on the tick-scoped
|
|
541
|
+
* connections. Returns the set of files whose ops are now persisted so the
|
|
542
|
+
* manifest publish only promotes those into the merkle.
|
|
543
|
+
*
|
|
544
|
+
* With E.2 the resident index is NOT saved every tick — only on a deletion
|
|
545
|
+
* fraction >= threshold, every N inserts, or graceful shutdown. The SQLite
|
|
546
|
+
* tiers always fsync here; the merkle then advances for files whose vector
|
|
547
|
+
* rows landed (HNSW reconverges from those rows on the next save / restart).
|
|
548
|
+
*/
|
|
549
|
+
async finalizeTick(ctx) {
|
|
550
|
+
if (!ctx) return { persistedFiles: new Set(), requeueFiles: [] };
|
|
551
|
+
let hnswSaved = false;
|
|
552
|
+
try {
|
|
553
|
+
// E.1: insert all staged adds into the resident index in a DETERMINISTIC
|
|
554
|
+
// order (sorted by id). Combined with G1's per-id deterministic levels and
|
|
555
|
+
// sorted-order compaction, this makes the batched graph reproducible and
|
|
556
|
+
// byte-identical across batch / rebuild / compaction construction paths.
|
|
557
|
+
const pending = ctx.pendingAdds || [];
|
|
558
|
+
if (pending.length > 0) {
|
|
559
|
+
pending.sort((a, b) => (a.addId < b.addId ? -1 : a.addId > b.addId ? 1 : 0));
|
|
560
|
+
let done = 0;
|
|
561
|
+
for (const op of pending) {
|
|
562
|
+
const truncated = truncateForHNSW(op.embedding, this.modelInfo.hnswDimension);
|
|
563
|
+
await ctx.index.add(op.addId, floatToBinary(truncated), op.metadata || {}, normalizedFloatToInt8(truncated));
|
|
564
|
+
ctx.floatUpserts.push({ id: op.addId, vector: truncated });
|
|
565
|
+
if ((++done) % 100 === 0) this.progress('production:binary-hnsw-loop');
|
|
566
|
+
}
|
|
567
|
+
this.progress('production:binary-hnsw-batched');
|
|
568
|
+
}
|
|
569
|
+
ctx.pendingAdds = [];
|
|
570
|
+
|
|
571
|
+
if (ctx.live) {
|
|
572
|
+
const entry = liveStoreRegistry.get(ctx.indexPath);
|
|
573
|
+
if (entry) {
|
|
574
|
+
entry.index = ctx.index;
|
|
575
|
+
entry.floatStore = ctx.floatStore;
|
|
576
|
+
entry.insertsSinceSave += ctx.append;
|
|
577
|
+
entry.deletedCount += ctx.tombstone;
|
|
578
|
+
entry.totalCount = ctx.index.idToIndex?.size ?? entry.totalCount;
|
|
579
|
+
if (ctx.append > 0 || ctx.tombstone > 0) entry.dirty = true;
|
|
580
|
+
const denom = Math.max(1, entry.totalCount + entry.deletedCount);
|
|
581
|
+
const deletionFraction = entry.deletedCount / denom;
|
|
582
|
+
const shouldSave = deletionFraction >= LIVE_HNSW_DELETION_FRACTION
|
|
583
|
+
|| entry.insertsSinceSave >= LIVE_HNSW_SAVE_EVERY_INSERTS;
|
|
584
|
+
if (shouldSave && entry.dirty) {
|
|
585
|
+
await ctx.index.save(ctx.indexPath);
|
|
586
|
+
await flushFloatStore({
|
|
587
|
+
binaryHnswPath: ctx.indexPath,
|
|
588
|
+
store: ctx.floatStore,
|
|
589
|
+
upserts: ctx.floatUpserts,
|
|
590
|
+
removeIds: ctx.floatRemoveIds,
|
|
591
|
+
binaryVectorsBefore: ctx.binaryVectorsBefore,
|
|
592
|
+
dimension: this.modelInfo.hnswDimension,
|
|
593
|
+
});
|
|
594
|
+
entry.insertsSinceSave = 0;
|
|
595
|
+
entry.deletedCount = 0;
|
|
596
|
+
entry.dirty = false;
|
|
597
|
+
hnswSaved = true;
|
|
598
|
+
} else if (ctx.floatUpserts.length || ctx.floatRemoveIds.length) {
|
|
599
|
+
// Keep the float store's in-memory delta consistent with the live
|
|
600
|
+
// index even when we skip the disk save (so a later threshold save
|
|
601
|
+
// writes the full set).
|
|
602
|
+
ctx.floatStore.applyDelta({ upserts: ctx.floatUpserts, removeIds: ctx.floatRemoveIds });
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
} else if (ctx.append > 0 || ctx.tombstone > 0) {
|
|
606
|
+
await ctx.index.save(ctx.indexPath);
|
|
607
|
+
this.progress('production:binary-hnsw-saved');
|
|
608
|
+
await flushFloatStore({
|
|
609
|
+
binaryHnswPath: ctx.indexPath,
|
|
610
|
+
store: ctx.floatStore,
|
|
611
|
+
upserts: ctx.floatUpserts,
|
|
612
|
+
removeIds: ctx.floatRemoveIds,
|
|
613
|
+
binaryVectorsBefore: ctx.binaryVectorsBefore,
|
|
614
|
+
dimension: this.modelInfo.hnswDimension,
|
|
615
|
+
});
|
|
616
|
+
this.progress('production:float-store-maintained');
|
|
617
|
+
hnswSaved = true;
|
|
618
|
+
}
|
|
619
|
+
// PERSIST-BEFORE-ADVANCE: the HNSW + float batch has now fsynced (or was
|
|
620
|
+
// intentionally not due-to-save under E.2). COMMIT the SQLite tiers ONLY
|
|
621
|
+
// now, so a crash before this point rolled the SQLite writes back too.
|
|
622
|
+
if (ctx.txOpen) {
|
|
623
|
+
ctx.codebaseDb.exec('COMMIT');
|
|
624
|
+
ctx.graphDb.exec('COMMIT');
|
|
625
|
+
ctx.txOpen = false;
|
|
626
|
+
}
|
|
627
|
+
} catch (err) {
|
|
628
|
+
// HNSW save (or COMMIT) failed → roll back the SQLite tiers so nothing is
|
|
629
|
+
// half-persisted, close connections, and surface the error. The manifest
|
|
630
|
+
// never advances for these files (persistedFiles is dropped) and the
|
|
631
|
+
// processing queue is left in place → the next tick re-reconciles them.
|
|
632
|
+
await this.disposeTick(ctx);
|
|
633
|
+
throw err;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
// E.4: return cache to the OS + checkpoint the WAL, then close. Safe now that
|
|
637
|
+
// the transaction is committed (a checkpoint inside a write tx is a no-op).
|
|
638
|
+
try {
|
|
639
|
+
if (sqlitePragmasEnabled()) {
|
|
640
|
+
try { ctx.codebaseDb.pragma('shrink_memory'); } catch {}
|
|
641
|
+
try { ctx.graphDb.pragma('shrink_memory'); } catch {}
|
|
642
|
+
}
|
|
643
|
+
try { ctx.codebaseDb.pragma('wal_checkpoint(PASSIVE)'); } catch {}
|
|
644
|
+
try { ctx.graphDb.pragma('wal_checkpoint(PASSIVE)'); } catch {}
|
|
645
|
+
} finally {
|
|
646
|
+
await this.disposeTick(ctx);
|
|
647
|
+
}
|
|
648
|
+
void hnswSaved;
|
|
649
|
+
// Stash for persistManifest (which runs after finalize disposed the ctx):
|
|
650
|
+
// only these files are promoted into the merkle (persist-before-advance).
|
|
651
|
+
this._lastPersistedFiles = ctx.persistedFiles;
|
|
652
|
+
return { persistedFiles: ctx.persistedFiles, requeueFiles: [] };
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
/**
|
|
656
|
+
* Close the tick-scoped connections. Idempotent + best-effort. With E.2 the
|
|
657
|
+
* resident index/float store are NOT closed (they belong to the registry).
|
|
658
|
+
*/
|
|
659
|
+
async disposeTick(ctx) {
|
|
660
|
+
if (!ctx) return;
|
|
661
|
+
// Roll back an uncommitted tick transaction (crash/throw before finalize's
|
|
662
|
+
// COMMIT) so the SQLite tiers are left at their prior consistent state.
|
|
663
|
+
if (ctx.txOpen) {
|
|
664
|
+
for (const key of ['codebaseDb', 'graphDb']) {
|
|
665
|
+
try { ctx[key]?.exec('ROLLBACK'); } catch {}
|
|
666
|
+
}
|
|
667
|
+
ctx.txOpen = false;
|
|
668
|
+
}
|
|
669
|
+
// graphRoDb aliases graphDb in the batched path; close each distinct handle
|
|
670
|
+
// once.
|
|
671
|
+
const closed = new Set();
|
|
672
|
+
for (const key of ['codebaseDb', 'graphDb', 'graphRoDb']) {
|
|
673
|
+
const db = ctx[key];
|
|
674
|
+
if (db && !closed.has(db)) { try { db.close(); } catch {} closed.add(db); }
|
|
675
|
+
ctx[key] = null;
|
|
676
|
+
}
|
|
677
|
+
if (this._tickCtx === ctx) this._tickCtx = null;
|
|
209
678
|
}
|
|
210
679
|
|
|
211
680
|
async readDirtySet() {
|
|
@@ -292,16 +761,28 @@ class ProductionReconcileAdapter {
|
|
|
292
761
|
return h;
|
|
293
762
|
}
|
|
294
763
|
|
|
295
|
-
async applyGraphDelta(file, hashes, epoch) {
|
|
764
|
+
async applyGraphDelta(file, hashes, epoch, ctx = null) {
|
|
296
765
|
const rel = typeof file === 'string' ? file : file.path;
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
766
|
+
// E.1: reuse the tick-scoped RW connection (schema already ensured) instead
|
|
767
|
+
// of opening + migrating + closing a connection per file.
|
|
768
|
+
let db;
|
|
769
|
+
let hasFts;
|
|
770
|
+
let ownConn = false;
|
|
771
|
+
if (ctx?.graphDb) {
|
|
772
|
+
db = ctx.graphDb;
|
|
773
|
+
hasFts = ctx.graphHasFts;
|
|
774
|
+
} else {
|
|
775
|
+
const dbPath = path.join(this.stateDir, 'code-graph.db');
|
|
776
|
+
fs.mkdirSync(this.stateDir, { recursive: true });
|
|
777
|
+
db = new Database(dbPath);
|
|
778
|
+
db.pragma('journal_mode = WAL');
|
|
779
|
+
db.pragma('synchronous = NORMAL');
|
|
780
|
+
applyMemoryPragmas(db);
|
|
781
|
+
hasFts = createGraphSchema(db);
|
|
782
|
+
migrateEntitiesSchema(db);
|
|
783
|
+
migrateRelationshipsSchema(db);
|
|
784
|
+
ownConn = true;
|
|
785
|
+
}
|
|
305
786
|
try {
|
|
306
787
|
const oldRows = db.prepare('SELECT rowid, id, logical_entity_id, signature_hash FROM entities WHERE file_path = ? AND epoch_retired IS NULL').all(rel);
|
|
307
788
|
const oldByLogical = new Map(oldRows.map((r) => [r.logical_entity_id || r.id, r]));
|
|
@@ -361,22 +842,41 @@ class ProductionReconcileAdapter {
|
|
|
361
842
|
});
|
|
362
843
|
tx();
|
|
363
844
|
this.progress('production:graph-written');
|
|
364
|
-
|
|
845
|
+
// E.5: budget-derived FTS5 merge. When the budget flag is off this is the
|
|
846
|
+
// fixed 16-page merge exactly as before; when on, a busy tick (elapsed >
|
|
847
|
+
// 1800ms) skips the merge to leave CPU for reconcile.
|
|
848
|
+
if (hasFts) {
|
|
849
|
+
const pages = fts5BudgetEnabled()
|
|
850
|
+
? fts5MergeBudgetPages({ elapsedMs: ctx ? Date.now() - ctx.tickStartMs : 0 })
|
|
851
|
+
: 16;
|
|
852
|
+
if (pages != null) {
|
|
853
|
+
for (const table of ['entities_fts', 'entities_trigram']) try { fts5Merge(db, table, pages); } catch {}
|
|
854
|
+
}
|
|
855
|
+
}
|
|
365
856
|
this.touched.set(rel, { ...(this.touched.get(rel) || {}), graphEntities: entities.length });
|
|
366
857
|
return { ops: { graph_upsert: upsert, graph_tombstone: tombstone }, manifest: { path: 'code-graph.db' } };
|
|
367
858
|
} finally {
|
|
368
|
-
db.close();
|
|
859
|
+
if (ownConn) db.close();
|
|
369
860
|
}
|
|
370
861
|
}
|
|
371
862
|
|
|
372
|
-
async applyVectorDelta(file, _chunks, hashes, epoch) {
|
|
863
|
+
async applyVectorDelta(file, _chunks, hashes, epoch, ctx = null) {
|
|
373
864
|
const rel = typeof file === 'string' ? file : file.path;
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
865
|
+
// E.1: reuse the tick-scoped RW connection (schema already ensured).
|
|
866
|
+
let db;
|
|
867
|
+
let ownConn = false;
|
|
868
|
+
if (ctx?.codebaseDb) {
|
|
869
|
+
db = ctx.codebaseDb;
|
|
870
|
+
} else {
|
|
871
|
+
const dbPath = path.join(this.stateDir, 'codebase.db');
|
|
872
|
+
const existed = fs.existsSync(dbPath);
|
|
873
|
+
db = new Database(dbPath);
|
|
874
|
+
db.pragma('journal_mode = WAL');
|
|
875
|
+
db.pragma('synchronous = NORMAL');
|
|
876
|
+
applyMemoryPragmas(db);
|
|
877
|
+
existed ? ensureVectorSchema(db) : createVectorSchema(db);
|
|
878
|
+
ownConn = true;
|
|
879
|
+
}
|
|
380
880
|
const vectorOps = [];
|
|
381
881
|
let chunks = [];
|
|
382
882
|
try {
|
|
@@ -386,12 +886,57 @@ class ProductionReconcileAdapter {
|
|
|
386
886
|
const summary = retire();
|
|
387
887
|
const retired = summary.retiredRows.map((r) => ({ retireId: r.oldId, file: rel }));
|
|
388
888
|
this.touched.set(rel, { ...(this.touched.get(rel) || {}), hash: hashes, chunkIds: [] });
|
|
889
|
+
// E.6: drop a deleted file's cutoff signature.
|
|
890
|
+
if (this._cutoffCache) { deleteFileSignature(this._cutoffCache, rel); this._cutoffDirty = true; }
|
|
389
891
|
return { ops: { vectors_delete: summary.retiredRows.length }, vectorOps: retired, tokenOps: retired, gramOps: [{ file: rel, deleted: true }] };
|
|
390
892
|
}
|
|
391
893
|
const parsed = await new ASTChunker({ projectRoot: this.projectRoot }).parseFile(rel, hashes.content);
|
|
392
894
|
this.progress('production:vector-parsed');
|
|
393
|
-
chunks = await enrichChunksFromGraph(parsed.map((chunk, i) => ({ ...chunk, file: rel, id: `${rel}:${chunk.metadata?.line_start || 0}-${chunk.metadata?.line_end || chunk.metadata?.line_start || 0}:${i}` })), this.stateDir);
|
|
895
|
+
chunks = await enrichChunksFromGraph(parsed.map((chunk, i) => ({ ...chunk, file: rel, id: `${rel}:${chunk.metadata?.line_start || 0}-${chunk.metadata?.line_end || chunk.metadata?.line_start || 0}:${i}` })), this.stateDir, ctx);
|
|
394
896
|
this.progress('production:vector-enriched');
|
|
897
|
+
// E.6 chunk-hash early-cutoff. The signature is the per-chunk encoder-input
|
|
898
|
+
// hashes (embedding_input_hash + li_input_hash) computed from the ENRICHED
|
|
899
|
+
// chunks — so cross-file enrichment (scope/imports injected above) folds in.
|
|
900
|
+
// If the file changed on disk but produces byte-identical encoder inputs
|
|
901
|
+
// (comment-only / reformat edits, or a dependency change that does NOT
|
|
902
|
+
// alter this file's enriched text), the encode + all tier writes are
|
|
903
|
+
// skipped. CORRECTNESS GATE: keyed ONLY on encoder-input hashes, never on
|
|
904
|
+
// the file's own chunk_text_hash / contentUnchanged.
|
|
905
|
+
if (chunkCutoffEnabled()) {
|
|
906
|
+
if (!this._cutoffCache) { this._cutoffCache = loadCutoffCache(this.stateDir); this._cutoffDirty = false; }
|
|
907
|
+
const signature = computeCutoffSignature(chunks);
|
|
908
|
+
const previous = getFileSignature(this._cutoffCache, rel);
|
|
909
|
+
if (signaturesMatch(previous, signature)) {
|
|
910
|
+
this.progress('production:vector-cutoff-skip');
|
|
911
|
+
// Provisional touched entry: keep the file's existing chunkIds so the
|
|
912
|
+
// merkle hash advances (the encoder inputs are unchanged) without any
|
|
913
|
+
// tier write. The merkle still records the new content hash so the
|
|
914
|
+
// file is not re-queued forever.
|
|
915
|
+
const prevTouched = this.touched.get(rel) || {};
|
|
916
|
+
const prevChunkIds = readJson(path.join(this.stateDir, MERKLE_STATE), { files: {} }).files?.[rel]?.chunkIds || prevTouched.chunkIds || [];
|
|
917
|
+
this.touched.set(rel, { ...prevTouched, hash: hashes, chunkIds: prevChunkIds, content: hashes.content });
|
|
918
|
+
if (ctx) ctx.persistedFiles.add(rel);
|
|
919
|
+
return {
|
|
920
|
+
ops: { vectors_upsert: 0, vectors_delete: 0 },
|
|
921
|
+
chunksTotal: chunks.length,
|
|
922
|
+
chunksEncoded: 0,
|
|
923
|
+
chunksReused: chunks.length,
|
|
924
|
+
chunksMetadataDirty: 0,
|
|
925
|
+
skipped: true,
|
|
926
|
+
vectorOps: [],
|
|
927
|
+
tokenOps: [],
|
|
928
|
+
// E.6 cutoff skips the EXPENSIVE encode + dense/HNSW/LI/graph tiers
|
|
929
|
+
// (encoder inputs are byte-identical), but sparse-grams are derived
|
|
930
|
+
// from RAW file content — a comment/reformat edit, or a change in an
|
|
931
|
+
// un-chunked region, leaves encoder inputs identical yet changes the
|
|
932
|
+
// raw content, so ss-grep/regex would go STALE if we dropped the
|
|
933
|
+
// sparse delta here. Always emit the content-derived sparse-gram delta
|
|
934
|
+
// on cutoff; the reconciler applies ONLY this tier when `skipped`.
|
|
935
|
+
gramOps: [{ file: rel, deleted: false, content: hashes.content, contentHash: hashes.contentHash }],
|
|
936
|
+
manifest: { path: 'codebase.db' },
|
|
937
|
+
};
|
|
938
|
+
}
|
|
939
|
+
}
|
|
395
940
|
// LI generated-content parity: decide ONCE, from the file's full chunk set
|
|
396
941
|
// (exactly like full indexing's per-file applyIndexingChunkPolicy), whether
|
|
397
942
|
// late interaction skips this file. Embeddings/graph/sparse still index it.
|
|
@@ -437,7 +982,89 @@ class ProductionReconcileAdapter {
|
|
|
437
982
|
const reused = delta.toReuse.find((item) => item.ann?.chunkStructId === row.chunkStructId);
|
|
438
983
|
if (reused?.chunk) tokenOps.push({ addId: row.newId, chunk: reused.chunk });
|
|
439
984
|
}
|
|
440
|
-
|
|
985
|
+
|
|
986
|
+
// CRASH-CONSISTENCY (default per-file path durability). On this path the
|
|
987
|
+
// SQLite vector COMMIT above (`production:vector-written`) lands BEFORE the
|
|
988
|
+
// dependent tiers (HNSW, then LI) are persisted, in separate adapter calls
|
|
989
|
+
// with no shared transaction. A SIGKILL between them leaves the vector rows
|
|
990
|
+
// DURABLE in codebase.db while the HNSW node + LI doc were never written —
|
|
991
|
+
// and the merkle did NOT advance (persistManifest never ran). On the next
|
|
992
|
+
// tick the file is re-reconciled, but its committed rows now hash-MATCH an
|
|
993
|
+
// EXACT reuse in diffChunks → zero re-encode → zero add ops → the chunk is
|
|
994
|
+
// QUERYABLE via FTS/SQLite yet permanently MISSING from the HNSW vector
|
|
995
|
+
// index AND the LI index (a real correctness hole: vector / late-interaction
|
|
996
|
+
// search silently never return it). Verified by the determinism harness
|
|
997
|
+
// `--kill-after-tick 2`: live HNSW + LI diverge (the new gamma.js chunk).
|
|
998
|
+
//
|
|
999
|
+
// FIX (minimal persist-before-advance for the per-file path): the published
|
|
1000
|
+
// merkle epoch is the advance authority — persistManifest only advances
|
|
1001
|
+
// `merkle.epoch` AFTER a tick wrote its downstream tiers, so any LIVE vector
|
|
1002
|
+
// row whose `epoch_written` exceeds the highest published merkle epoch was
|
|
1003
|
+
// committed by a tick that never published: a torn post-crash row whose
|
|
1004
|
+
// HNSW node / LI doc may be missing. Re-emit a repair op for each such row
|
|
1005
|
+
// under its EXISTING id (so the recovered index is identical to a clean
|
|
1006
|
+
// run, not a fresh re-encode id):
|
|
1007
|
+
// - HNSW: an ADD op. `index.add` on an already-present id updates in place
|
|
1008
|
+
// (no duplicate node, no graph mutation) → no-op when the node landed,
|
|
1009
|
+
// repair when it didn't.
|
|
1010
|
+
// - LI: a RETIRE+ADD pair. LI `add` appends to a fresh segment and is NOT
|
|
1011
|
+
// idempotent, so we tombstone any existing doc for the id first; the
|
|
1012
|
+
// add then yields exactly one live doc whether or not the doc had
|
|
1013
|
+
// landed. (Retiring a non-existent LI doc is a no-op.)
|
|
1014
|
+
//
|
|
1015
|
+
// The discriminator is exact and never fires on the happy path: a
|
|
1016
|
+
// successful tick always advances `merkle.epoch` to the epoch it wrote its
|
|
1017
|
+
// rows at, so on a non-crash tick every prior live row has
|
|
1018
|
+
// `epoch_written <= merkle.epoch` and the rows written THIS tick are already
|
|
1019
|
+
// in `newIds` (added above), never re-added here. Runs in BOTH the per-file
|
|
1020
|
+
// and the E.1 batched path: the batched path never CREATES a torn row (its
|
|
1021
|
+
// deferred COMMIT + merkle gating rolls a crashed batch back), but a LEGACY
|
|
1022
|
+
// orphan from a pre-batched per-file crash can still exist on disk when the
|
|
1023
|
+
// daemon later runs batched, so the repair must heal it under either mode.
|
|
1024
|
+
// In the batched path the repair ops join `vectorOps`/`tokenOps`, flow
|
|
1025
|
+
// through the tick-scoped adapters into `ctx.pendingAdds`, and are saved by
|
|
1026
|
+
// `finalizeTick` — and `db` reads see the committed orphan (prior-tick) row.
|
|
1027
|
+
// Byte-diff/behaviour on non-crash runs is UNCHANGED in both modes (no
|
|
1028
|
+
// orphan ⇒ no-op; verified by the determinism harness control + sweeps).
|
|
1029
|
+
const repairedIds = [];
|
|
1030
|
+
if (snap.size > 0) {
|
|
1031
|
+
const publishedEpoch = this._publishedEpoch();
|
|
1032
|
+
const alreadyAdded = new Set(newIds);
|
|
1033
|
+
const chunkByStructId = new Map();
|
|
1034
|
+
for (let i = 0; i < chunks.length; i += 1) {
|
|
1035
|
+
const sid = annotations[i]?.chunkStructId;
|
|
1036
|
+
if (sid != null) chunkByStructId.set(sid, chunks[i]);
|
|
1037
|
+
}
|
|
1038
|
+
for (const row of snap.values()) {
|
|
1039
|
+
if (row.epoch_retired != null) continue;
|
|
1040
|
+
if (!Number.isInteger(row.epoch_written) || row.epoch_written <= publishedEpoch) continue;
|
|
1041
|
+
if (alreadyAdded.has(row.id)) continue; // written THIS tick already
|
|
1042
|
+
const dbRow = db.prepare('SELECT id, embedding, metadata FROM vectors WHERE id = ?').get(row.id);
|
|
1043
|
+
if (!dbRow?.embedding) continue;
|
|
1044
|
+
vectorOps.push({ addId: dbRow.id, embedding: float32FromBuffer(dbRow.embedding), metadata: JSON.parse(dbRow.metadata || '{}') });
|
|
1045
|
+
const chunk = chunkByStructId.get(row.chunk_struct_id);
|
|
1046
|
+
if (chunk) {
|
|
1047
|
+
tokenOps.push({ retireId: dbRow.id, file: rel });
|
|
1048
|
+
tokenOps.push({ addId: dbRow.id, chunk });
|
|
1049
|
+
}
|
|
1050
|
+
// Record the repaired row in the merkle chunkIds so the file's recorded
|
|
1051
|
+
// chunk set reflects the rows actually live after recovery (a torn add
|
|
1052
|
+
// produced no `newIds`, so without this the merkle would advance with an
|
|
1053
|
+
// empty chunkIds for a file that does have a live, now-indexed chunk).
|
|
1054
|
+
repairedIds.push(dbRow.id);
|
|
1055
|
+
this.progress('production:vector-crash-recovery');
|
|
1056
|
+
}
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
const recordedChunkIds = repairedIds.length > 0 ? [...newIds, ...repairedIds] : newIds;
|
|
1060
|
+
this.touched.set(rel, { ...(this.touched.get(rel) || {}), hash: hashes, chunkIds: recordedChunkIds, content: hashes.content });
|
|
1061
|
+
// E.6: record this file's new cutoff signature (encoder-input hashes of
|
|
1062
|
+
// the enriched chunks) for next-tick comparison.
|
|
1063
|
+
if (this._cutoffCache) {
|
|
1064
|
+
setFileSignature(this._cutoffCache, rel, computeCutoffSignature(chunks));
|
|
1065
|
+
this._cutoffDirty = true;
|
|
1066
|
+
}
|
|
1067
|
+
if (ctx) ctx.persistedFiles.add(rel);
|
|
441
1068
|
return {
|
|
442
1069
|
ops: { vectors_upsert: newIds.length, vectors_delete: vectorOps.filter((o) => o.retireId).length },
|
|
443
1070
|
chunksTotal: chunks.length,
|
|
@@ -450,32 +1077,40 @@ class ProductionReconcileAdapter {
|
|
|
450
1077
|
manifest: { path: 'codebase.db' },
|
|
451
1078
|
};
|
|
452
1079
|
} finally {
|
|
453
|
-
db.close();
|
|
1080
|
+
if (ownConn) db.close();
|
|
454
1081
|
}
|
|
455
1082
|
}
|
|
456
1083
|
|
|
457
|
-
async
|
|
458
|
-
if (!Array.isArray(ops) || ops.length === 0) return { ops: {
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
1084
|
+
async applyBinaryHNSWDelta(_file, ops, _epoch, ctx = null) {
|
|
1085
|
+
if (!Array.isArray(ops) || ops.length === 0) return { ops: { binary_hnsw_append: 0, binary_hnsw_tombstone: 0 } };
|
|
1086
|
+
|
|
1087
|
+
// E.1 batched path: reuse the resident index, apply tombstones in place, and
|
|
1088
|
+
// ACCUMULATE the add ops onto the tick context. The actual `index.add()`
|
|
1089
|
+
// insertions are deferred to finalizeTick, where they run sorted-by-id so
|
|
1090
|
+
// the graph is reproducible (G1 byte-identity). We still report the per-file
|
|
1091
|
+
// append/tombstone counts here for the tick counters.
|
|
1092
|
+
if (ctx?.index) {
|
|
1093
|
+
const index = ctx.index;
|
|
1094
|
+
let append = 0; let tombstone = 0;
|
|
1095
|
+
for (const op of ops) {
|
|
1096
|
+
if (op.retireId) {
|
|
1097
|
+
if (markBinaryStale(index, op.retireId)) tombstone += 1;
|
|
1098
|
+
ctx.floatRemoveIds.push(op.retireId);
|
|
1099
|
+
}
|
|
1100
|
+
if (op.addId && op.embedding) {
|
|
1101
|
+
// Stage the add; insertion happens in finalize (sorted by id).
|
|
1102
|
+
ctx.pendingAdds = ctx.pendingAdds || [];
|
|
1103
|
+
ctx.pendingAdds.push(op);
|
|
1104
|
+
append += 1;
|
|
1105
|
+
}
|
|
469
1106
|
}
|
|
470
|
-
|
|
1107
|
+
ctx.tombstone += tombstone;
|
|
1108
|
+
// append is committed to ctx.append in finalize after the sorted inserts.
|
|
1109
|
+
ctx.append += append;
|
|
1110
|
+
return { ops: { binary_hnsw_append: append, binary_hnsw_tombstone: tombstone }, manifest: { path: 'codebase-binary-hnsw.idx' } };
|
|
471
1111
|
}
|
|
472
|
-
await index.save(indexPath);
|
|
473
|
-
this.progress('production:hnsw-saved');
|
|
474
|
-
return { ops: { hnsw_add: add, hnsw_tombstone: tombstone }, manifest: { path: 'codebase-hnsw.idx', stale: 'codebase-hnsw.idx.stale.bin' } };
|
|
475
|
-
}
|
|
476
1112
|
|
|
477
|
-
|
|
478
|
-
if (!Array.isArray(ops) || ops.length === 0) return { ops: { binary_hnsw_append: 0, binary_hnsw_tombstone: 0 } };
|
|
1113
|
+
// ---- Per-file path (flag off): exact current behavior. ----
|
|
479
1114
|
const indexPath = path.join(this.stateDir, 'codebase-binary-hnsw.idx');
|
|
480
1115
|
const index = new BinaryHNSWIndex({ indexPath, stalePath: `${indexPath}.stale.bin`, floatDimension: this.modelInfo.hnswDimension });
|
|
481
1116
|
try { await index.load(indexPath); } catch { await index.init(); }
|
|
@@ -527,6 +1162,18 @@ class ProductionReconcileAdapter {
|
|
|
527
1162
|
return { ops: { li_segment_append: appended, li_tombstone: tombstone }, manifest: { path: 'codebase-late-interaction.db', segments: 'codebase-late-interaction.db.segments/manifest.json' } };
|
|
528
1163
|
}
|
|
529
1164
|
|
|
1165
|
+
/**
|
|
1166
|
+
* Highest SUCCESSFULLY-published merkle epoch (the per-file crash-recovery
|
|
1167
|
+
* advance authority — see the torn-row repair in `applyVectorDelta`). Returns
|
|
1168
|
+
* -1 when no merkle has been published yet, so any committed row counts as
|
|
1169
|
+
* un-advanced. Best-effort + read-only.
|
|
1170
|
+
* @returns {number}
|
|
1171
|
+
*/
|
|
1172
|
+
_publishedEpoch() {
|
|
1173
|
+
const merkle = readJson(path.join(this.stateDir, MERKLE_STATE), {});
|
|
1174
|
+
return Number.isInteger(merkle?.epoch) ? merkle.epoch : -1;
|
|
1175
|
+
}
|
|
1176
|
+
|
|
530
1177
|
applySparseGramDelta(_file, ops, epoch) {
|
|
531
1178
|
if (!Array.isArray(ops) || ops.length === 0) return { ops: { sparse_gram_delta_upsert: 0 } };
|
|
532
1179
|
const base = path.join(this.stateDir, 'codebase-sparse-grams.idx');
|
|
@@ -563,14 +1210,27 @@ class ProductionReconcileAdapter {
|
|
|
563
1210
|
const merklePath = path.join(this.stateDir, MERKLE_STATE);
|
|
564
1211
|
const merkle = readJson(merklePath, { version: '2.4', files: {}, stats: {} });
|
|
565
1212
|
merkle.files ||= {};
|
|
1213
|
+
// E.1 PERSIST-BEFORE-ADVANCE: when batching, promote a file into the merkle
|
|
1214
|
+
// ONLY if its ops are in the persisted batch (recorded in finalizeTick). A
|
|
1215
|
+
// file touched this tick but absent from the persisted set (e.g. its HNSW
|
|
1216
|
+
// adds did not make the saved batch) is left at its prior merkle state and
|
|
1217
|
+
// re-reconciled next tick. Deletions always apply (no HNSW add to persist).
|
|
1218
|
+
const persisted = this._lastPersistedFiles;
|
|
1219
|
+
const gate = batchTierWritesEnabled() && persisted instanceof Set;
|
|
566
1220
|
for (const [file, data] of this.touched.entries()) {
|
|
567
1221
|
if (data.hash?.deleted) delete merkle.files[file];
|
|
568
|
-
else merkle.files[file] = { hash: data.hash.contentHash, ...data.hash.stat, epoch: manifest.epoch, chunkIds: data.chunkIds || [] };
|
|
1222
|
+
else if (!gate || persisted.has(file)) merkle.files[file] = { hash: data.hash.contentHash, ...data.hash.stat, epoch: manifest.epoch, chunkIds: data.chunkIds || [] };
|
|
569
1223
|
}
|
|
570
1224
|
merkle.lastIndex = new Date().toISOString();
|
|
571
1225
|
merkle.epoch = manifest.epoch;
|
|
572
1226
|
merkle.stats = { ...(merkle.stats || {}), totalFiles: Object.keys(merkle.files).length };
|
|
573
1227
|
safeWriteJson(merklePath, merkle);
|
|
1228
|
+
// E.6: persist the updated chunk-cutoff cache once per tick (after the
|
|
1229
|
+
// merkle advances). Best-effort; a failure only costs a redundant re-embed.
|
|
1230
|
+
if (this._cutoffCache && this._cutoffDirty) {
|
|
1231
|
+
saveCutoffCache(this.stateDir, this._cutoffCache);
|
|
1232
|
+
this._cutoffDirty = false;
|
|
1233
|
+
}
|
|
574
1234
|
try { fs.unlinkSync(path.join(this.stateDir, PROCESSING_QUEUE)); } catch {}
|
|
575
1235
|
fs.appendFileSync(path.join(this.stateDir, METRICS_FILE), JSON.stringify({ ...manifest, ts: Date.now() / 1000, epoch: manifest.epoch }) + '\n');
|
|
576
1236
|
}
|
|
@@ -580,4 +1240,8 @@ export const __testing = {
|
|
|
580
1240
|
ProductionReconcileAdapter,
|
|
581
1241
|
sparseGramRecord,
|
|
582
1242
|
markBinaryStale,
|
|
1243
|
+
normalizeHnswDeterminismFlags,
|
|
1244
|
+
// Reset the one-time forced-on warning latch so each test case observes the
|
|
1245
|
+
// warning independently.
|
|
1246
|
+
_resetDetLevelsWarnLatch() { _batchForcedDetLevelsWarned = false; },
|
|
583
1247
|
};
|