sweet-search 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tombstone bitmap (`*.stale.bin`).
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.3 (Float HNSW), § 7.5 (LI segments), § 34.4 (SIMD-ready layout).
|
|
5
|
+
*
|
|
6
|
+
* Each bit marks one element (HNSW key, LI doc, etc.) as stale. Files are
|
|
7
|
+
* **64-byte aligned** so AVX-512 / AVX2 / NEON masking kernels can scan
|
|
8
|
+
* eight bytes at a time without unaligned-load penalties.
|
|
9
|
+
*
|
|
10
|
+
* Layout:
|
|
11
|
+
*
|
|
12
|
+
* header (16 bytes of fields, zero-padded to 64 bytes on disk):
|
|
13
|
+
* 0..3 magic = 'SSTB'
|
|
14
|
+
* 4..7 version = uint32 (1)
|
|
15
|
+
* 8..15 capacity bits = uint64 (little-endian)
|
|
16
|
+
*
|
|
17
|
+
* payload:
|
|
18
|
+
* ceil(capacity_bits / 8) bytes, padded up to a 64-byte boundary.
|
|
19
|
+
*
|
|
20
|
+
* The header is part of the 64-byte alignment promise: callers that mmap
|
|
21
|
+
* the file and pass the payload pointer to a SIMD kernel must skip the
|
|
22
|
+
* first 64 bytes, not the first 16. Phase 6 implements the native SIMD
|
|
23
|
+
* masking kernel; Phase 3 ships the scalar JS path under the same layout.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import fs from 'node:fs';
|
|
27
|
+
import path from 'node:path';
|
|
28
|
+
|
|
29
|
+
// File signature stored in bytes 0..3 of the header.
const HEADER_MAGIC = Buffer.from('SSTB', 'ascii');
// Layout version stored as a little-endian uint32 in bytes 4..7.
const HEADER_VERSION = 1;
// Alignment unit shared by the header reservation and the payload padding.
const SIMD_ALIGNMENT = 64;
// Bytes reserved before the payload. Only the first 16 carry fields; the
// rest is padding so the payload itself starts on an aligned offset.
const HEADER_RESERVED = SIMD_ALIGNMENT;
const BITS_PER_BYTE = 8;

/**
 * Byte offset of the payload within a bitmap file.
 *
 * @returns {number}
 */
function payloadByteOffset() {
  return HEADER_RESERVED;
}

/**
 * Payload size in bytes for a bitmap holding `capacityBits` bits: the
 * minimum whole number of bytes, rounded up to the alignment unit so reads
 * at the tail stay inside the buffer.
 *
 * @param {number} capacityBits
 * @returns {number}
 */
function payloadByteLength(capacityBits) {
  const minBytes = Math.ceil(capacityBits / BITS_PER_BYTE);
  return Math.ceil(minBytes / SIMD_ALIGNMENT) * SIMD_ALIGNMENT;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Allocate a new in-memory tombstone bitmap. Nothing is written to disk;
 * pass the result to `saveBitmap` to persist it.
 *
 * @param {number} capacityBits Number of addressable bits; must be a positive integer.
 * @returns {{capacity:number, payload:Buffer}}
 * @throws {Error} When `capacityBits` is not a positive integer.
 */
export function createBitmap(capacityBits) {
  const isPositiveInteger = Number.isInteger(capacityBits) && capacityBits > 0;
  if (!isPositiveInteger) {
    throw new Error(`createBitmap: capacityBits must be a positive integer, got ${capacityBits}`);
  }
  const payload = Buffer.alloc(payloadByteLength(capacityBits));
  return { capacity: capacityBits, payload };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Load a tombstone bitmap from disk. Returns `null` if the file does not
 * exist (caller treats every key as live), including when it disappears
 * between the existence check and the read.
 *
 * @param {string} filePath
 * @returns {{capacity:number, payload:Buffer}|null}
 * @throws {Error} When the file is shorter than the reserved header, has the
 *   wrong magic or version, declares a capacity that cannot be represented
 *   exactly as a JS number, or holds fewer payload bytes than the capacity
 *   requires.
 */
export function loadBitmap(filePath) {
  if (!fs.existsSync(filePath)) return null;
  let raw;
  try {
    raw = fs.readFileSync(filePath);
  } catch (err) {
    // The file can be removed after the existence check; that is the same
    // "no bitmap" state, not a failure.
    if (err?.code === 'ENOENT') return null;
    throw err;
  }
  if (raw.length < HEADER_RESERVED) {
    throw new Error(`loadBitmap: ${filePath} too short (${raw.length} bytes)`);
  }
  if (!raw.subarray(0, 4).equals(HEADER_MAGIC)) {
    throw new Error(`loadBitmap: ${filePath} magic mismatch`);
  }
  const version = raw.readUInt32LE(4);
  if (version !== HEADER_VERSION) {
    throw new Error(`loadBitmap: unsupported version ${version}`);
  }
  const capacityField = raw.readBigUInt64LE(8);
  // Number() silently rounds values above 2^53 - 1; reject them instead of
  // continuing with a capacity that does not match the file.
  if (capacityField > BigInt(Number.MAX_SAFE_INTEGER)) {
    throw new Error(`loadBitmap: ${filePath} capacity ${capacityField} exceeds the safe integer range`);
  }
  const capacity = Number(capacityField);
  const expectedLength = HEADER_RESERVED + payloadByteLength(capacity);
  if (raw.length < expectedLength) {
    throw new Error(`loadBitmap: ${filePath} truncated payload (${raw.length} bytes, expected ${expectedLength})`);
  }
  return {
    capacity,
    // Copy so the returned payload does not share memory with the read buffer.
    payload: Buffer.from(raw.subarray(HEADER_RESERVED, expectedLength)),
  };
}
|
|
92
|
+
|
|
93
|
+
/**
 * Persist the bitmap atomically (`*.tmp` + fsync + rename).
 *
 * The header and payload are written to `<filePath>.tmp`, flushed, and then
 * renamed over `filePath`. If any step up to and including the rename fails,
 * the temp file is removed (best-effort) and the original error is rethrown.
 * The parent directory is fsynced afterwards on a best-effort basis.
 *
 * @param {string} filePath
 * @param {{capacity:number, payload:Buffer}} bitmap
 */
export function saveBitmap(filePath, bitmap) {
  const dir = path.dirname(filePath);
  fs.mkdirSync(dir, { recursive: true });
  const tmp = filePath + '.tmp';
  const header = Buffer.alloc(HEADER_RESERVED);
  HEADER_MAGIC.copy(header, 0);
  header.writeUInt32LE(HEADER_VERSION, 4);
  header.writeBigUInt64LE(BigInt(bitmap.capacity), 8);
  const out = Buffer.concat([header, bitmap.payload]);

  try {
    const fd = fs.openSync(tmp, 'w');
    try {
      // writeSync reports how many bytes it wrote and may stop short of the
      // request, so keep writing until the whole buffer is on the fd.
      let written = 0;
      while (written < out.length) {
        written += fs.writeSync(fd, out, written, out.length - written);
      }
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
    fs.renameSync(tmp, filePath);
  } catch (err) {
    try {
      fs.unlinkSync(tmp);
    } catch {
      // Best-effort cleanup: the temp file may never have been created.
    }
    throw err;
  }

  try {
    const dirFd = fs.openSync(dir, 'r');
    try { fs.fsyncSync(dirFd); } finally { fs.closeSync(dirFd); }
  } catch {
    // Best-effort: some tmpfs/container filesystems reject directory fsync.
  }
}
|
|
123
|
+
|
|
124
|
+
/**
 * Resize a bitmap (used when the underlying graph grows past capacity).
 * Preserves existing bits. The bitmap is grown in place: its `capacity` and
 * `payload` fields are reassigned and the same object is returned. A request
 * that does not exceed the current capacity is a no-op.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} newCapacityBits
 * @returns {{capacity:number, payload:Buffer}} The same `bitmap` object.
 * @throws {Error} When growth is requested with a non-integer capacity.
 */
export function resizeBitmap(bitmap, newCapacityBits) {
  if (newCapacityBits <= bitmap.capacity) return bitmap;
  // A fractional or non-numeric capacity would be stored as-is and only fail
  // later when the header is serialized; reject it here instead.
  if (!Number.isInteger(newCapacityBits)) {
    throw new Error(`resizeBitmap: newCapacityBits must be an integer, got ${newCapacityBits}`);
  }
  const grown = Buffer.alloc(payloadByteLength(newCapacityBits));
  bitmap.payload.copy(grown, 0, 0, bitmap.payload.length);
  bitmap.capacity = newCapacityBits;
  bitmap.payload = grown;
  return bitmap;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Map a bit index to its byte offset in the payload and the single-bit mask
 * within that byte (bit 0 of a byte is its least significant bit).
 *
 * Uses division/modulo rather than 32-bit shifts so indices at or above
 * 2^32 do not wrap around to low byte offsets.
 *
 * @param {number} index Non-negative integer bit index.
 * @returns {{byte:number, mask:number}}
 */
function byteAndMask(index) {
  const byte = Math.floor(index / 8);
  const mask = 1 << (index % 8);
  return { byte, mask };
}

/**
 * True when `index` is an integer inside `[0, bitmap.capacity)`.
 * `undefined`, `NaN` and fractional values compare false against both range
 * bounds, so they need the explicit integer check to be rejected.
 */
function isAddressableBit(bitmap, index) {
  return Number.isInteger(index) && index >= 0 && index < bitmap.capacity;
}

/**
 * Mark the element at `index` as stale.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index
 * @throws {RangeError} When `index` is not an integer within capacity.
 */
export function setBit(bitmap, index) {
  if (!isAddressableBit(bitmap, index)) {
    throw new RangeError(`setBit: ${index} outside bitmap capacity ${bitmap.capacity}`);
  }
  const { byte, mask } = byteAndMask(index);
  bitmap.payload[byte] |= mask;
}

/**
 * Mark the element at `index` as live again.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index
 * @throws {RangeError} When `index` is not an integer within capacity.
 */
export function clearBit(bitmap, index) {
  if (!isAddressableBit(bitmap, index)) {
    throw new RangeError(`clearBit: ${index} outside bitmap capacity ${bitmap.capacity}`);
  }
  const { byte, mask } = byteAndMask(index);
  bitmap.payload[byte] &= ~mask & 0xFF;
}

/**
 * Report whether the element at `index` is marked stale. Indices that are
 * out of range or not integers are reported as not set.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index
 * @returns {boolean}
 */
export function isSet(bitmap, index) {
  if (!isAddressableBit(bitmap, index)) return false;
  const { byte, mask } = byteAndMask(index);
  return (bitmap.payload[byte] & mask) !== 0;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Count the set bits that fall inside the bitmap's capacity. Plan § 7.3 uses
 * this as the numerator of tombstone_fraction.
 *
 * Bytes past `capacity` are alignment padding and are never counted; the
 * final partial byte is masked down to the bits that belong to the bitmap.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @returns {number}
 */
export function popcount(bitmap) {
  const { payload, capacity } = bitmap;
  const wholeBytes = Math.floor(capacity / BITS_PER_BYTE);
  const wordEnd = wholeBytes - (wholeBytes % 4);

  // Brian Kernighan's trick: each iteration clears the lowest set bit.
  const bitsIn = (value) => {
    let remaining = value;
    let ones = 0;
    while (remaining !== 0) {
      remaining &= remaining - 1;
      ones += 1;
    }
    return ones;
  };

  let total = 0;
  // Bulk of the payload, one little-endian 32-bit word at a time.
  for (let offset = 0; offset < wordEnd; offset += 4) {
    total += bitsIn(payload.readUInt32LE(offset));
  }
  // Up to three whole bytes that did not fill a word.
  for (let offset = wordEnd; offset < wholeBytes; offset += 1) {
    total += bitsIn(payload[offset]);
  }
  // Trailing partial byte, restricted to the in-capacity bits.
  const leftoverBits = capacity % BITS_PER_BYTE;
  if (leftoverBits > 0) {
    total += bitsIn(payload[wholeBytes] & ((1 << leftoverBits) - 1));
  }
  return total;
}
|
|
205
|
+
|
|
206
|
+
/**
 * Keep only the candidates whose bit is not set in the bitmap, preserving
 * input order. Scalar implementation; the header notes a SIMD path is
 * planned under the same layout.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number[]} candidates
 * @returns {number[]} A new array; `candidates` is not modified.
 */
export function filterLive(bitmap, candidates) {
  const survivors = [];
  for (const candidate of candidates) {
    if (isSet(bitmap, candidate)) continue;
    survivors.push(candidate);
  }
  return survivors;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Tombstone fraction for the watermark scheduler: set bits divided by
 * (live elements + set bits). Returns 0 when that sum is 0.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} liveTotal Total live elements (not bitmap capacity).
 * @returns {number}
 */
export function tombstoneFraction(bitmap, liveTotal) {
  const staleCount = popcount(bitmap);
  const population = liveTotal + staleCount;
  return population === 0 ? 0 : staleCount / population;
}
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector delta writer.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.2 + § 13 Phase 1. The reconcile tick translates a list of
|
|
5
|
+
* dirty chunks into per-row UPSERTs against `codebase.db::vectors`
|
|
6
|
+
* keyed on `(file_path, chunk_struct_id)`. Stable chunks whose
|
|
7
|
+
* `embedding_input_hash` and `li_input_hash` are unchanged keep their
|
|
8
|
+
* BLOB; only changed payloads run through the encoder.
|
|
9
|
+
*
|
|
10
|
+
* This module is intentionally narrow:
|
|
11
|
+
* - It knows the vectors-table column layout (post `migrateVectorsSchema`).
|
|
12
|
+
* - It does NOT call the encoder. The caller (reconcile application
|
|
13
|
+
* service) decides what to re-encode based on the diff result here.
|
|
14
|
+
* - It does NOT touch HNSW, LI, or sparse-gram artifacts; per-tier
|
|
15
|
+
* side effects are dispatched by the reconciler.
|
|
16
|
+
*
|
|
17
|
+
* The diff is the load-bearing API. Given the chunker output for a file
|
|
18
|
+
* and the current DB state, it returns:
|
|
19
|
+
*
|
|
20
|
+
* {
|
|
21
|
+
* toEncode: [ { chunk, denseNeeded, liNeeded } ],
|
|
22
|
+
* toReuse: [ { chunk, prevRow } ],
|
|
23
|
+
* toRetire: [ { rowId, chunkStructId } ],
|
|
24
|
+
* metadataDirty: [ chunk_struct_id ], // populated by reconciler
|
|
25
|
+
* counters: { hit, miss, ... },
|
|
26
|
+
* }
|
|
27
|
+
*
|
|
28
|
+
* The "retire" set covers chunks that existed in DB for this file but no
|
|
29
|
+
* longer have a matching `chunk_struct_id`. Per plan § 7.2, those rows are
|
|
30
|
+
* tombstoned in the same per-file transaction by setting
|
|
31
|
+
* `epoch_retired = ε+1`; the reconciler does the actual SQL write.
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
import { assignStructuralIds } from '../domain/chunk-identity.mjs';
|
|
35
|
+
import { chunkInputHashes } from '../domain/encoder-input.mjs';
|
|
36
|
+
|
|
37
|
+
/**
 * Pair every chunk with its structural identity and its per-consumer input
 * hashes. The result is index-aligned with the output of
 * `assignStructuralIds`; this function itself never writes to the chunks.
 *
 * @param {Array<object>} chunks
 * @param {string} filePath
 * @returns {Array<{chunkStructId:string, structural:boolean, occurrenceIndex:number|null, rollingHash:*, reason:*, hashes:{chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, dedup_fingerprint:string}}>}
 */
export function annotateChunksForDelta(chunks, filePath) {
  const identities = assignStructuralIds(chunks, filePath);
  return identities.map((identity, position) => {
    const { chunkStructId, structural, occurrenceIndex, rollingHash, reason } = identity;
    return {
      chunkStructId,
      structural,
      occurrenceIndex,
      rollingHash,
      reason,
      hashes: chunkInputHashes(chunks[position]),
    };
  });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Read the hash/epoch columns of the `vectors` rows for one file and index
 * them for O(1) lookup during the diff.
 *
 * Visibility:
 *   - With an integer `options.manifestEpoch`, a row is included when it was
 *     written at or before that epoch (or has no `epoch_written`) and is
 *     either not retired or retired after that epoch.
 *   - Otherwise only rows with `epoch_retired IS NULL` are included.
 *
 * Keys are the row's `chunk_struct_id`; rows whose structural ID is missing
 * or empty are keyed as `legacy:<id>` instead. Rows arrive ordered by
 * `epoch_written DESC` and the first row seen for a key wins.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {{manifestEpoch?: number}} [options]
 * @returns {Map<string, {id:string, chunk_struct_id:string, chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, epoch_written:number, epoch_retired:number|null}>}
 */
export function snapshotFileRows(db, filePath, options = {}) {
  const { manifestEpoch } = options;
  const pinned = Number.isInteger(manifestEpoch);

  let visibilitySql;
  const args = [filePath];
  if (pinned) {
    visibilitySql = `AND (epoch_written IS NULL OR epoch_written <= ?)
         AND (epoch_retired IS NULL OR epoch_retired > ?)`;
    args.push(manifestEpoch, manifestEpoch);
  } else {
    visibilitySql = `AND epoch_retired IS NULL`;
  }

  const statement = db.prepare(`
    SELECT id, chunk_struct_id, chunk_text_hash, embedding_input_hash,
           li_input_hash, metadata_fingerprint, epoch_written, epoch_retired
    FROM vectors
    WHERE file_path = ?
      ${visibilitySql}
    ORDER BY epoch_written DESC
  `);

  const byStructId = new Map();
  for (const row of statement.all(...args)) {
    const structId = row.chunk_struct_id;
    const key = structId && structId.length > 0 ? structId : `legacy:${row.id}`;
    // Newest-first ordering means the first row for a key is the one to keep.
    if (!byStructId.has(key)) byStructId.set(key, row);
  }
  return byStructId;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Classify every chunk of a file against the rows currently in the DB.
 *
 * Per chunk, in order:
 *   - no annotation or no structural ID → `toEncode` (dense + LI),
 *     reason `no-struct-id`.
 *   - structural ID not present in the snapshot → `toEncode` (dense + LI),
 *     reason `new`.
 *   - dense and LI input hashes both match the stored row → `toReuse`.
 *     When the text hash or metadata fingerprint differs, the entry also
 *     carries `metadataOnly` (metadata fingerprint changed) and `textOnly`
 *     (text hash changed); a changed metadata fingerprint bumps
 *     `counters.metadata_dirty`.
 *   - otherwise → `toEncode` with the previous row attached and
 *     `denseNeeded` / `liNeeded` set for whichever hash changed; reason is
 *     `li-only`, `dense-only` or `both`.
 *
 * A hash comparison only counts as a match when the new hash is non-empty
 * (the metadata fingerprint is compared as-is).
 *
 * Snapshot rows whose key was not claimed by any chunk and that are not
 * already retired end up in `toRetire`.
 *
 * @param {Array<object>} chunks Output of the chunker.
 * @param {Array<{chunkStructId:string, hashes:object}>} annotations From annotateChunksForDelta.
 * @param {Map<string, object>} dbSnapshot From snapshotFileRows.
 * @returns {{toEncode:Array, toReuse:Array, toRetire:Array, counters:{hit:number, miss:number, retire:number, metadata_dirty:number}}}
 */
export function diffChunks(chunks, annotations, dbSnapshot) {
  const counters = { hit: 0, miss: 0, retire: 0, metadata_dirty: 0 };
  const toEncode = [];
  const toReuse = [];
  const claimedKeys = new Set();

  // Equal and non-empty: an empty hash never qualifies for reuse.
  const sameNonEmpty = (stored, fresh) => stored === fresh && fresh !== '';

  // Queue a chunk that has no reusable row at all.
  const encodeFromScratch = (chunk, ann, reason) => {
    toEncode.push({
      chunk, ann,
      denseNeeded: true, liNeeded: true,
      reason,
    });
    counters.miss += 1;
  };

  for (let position = 0; position < chunks.length; position += 1) {
    const chunk = chunks[position];
    const ann = annotations[position];

    if (!ann || !ann.chunkStructId) {
      encodeFromScratch(chunk, ann, 'no-struct-id');
      continue;
    }

    const structId = ann.chunkStructId;
    claimedKeys.add(structId);
    const prev = dbSnapshot.get(structId);
    if (!prev) {
      encodeFromScratch(chunk, ann, 'new');
      continue;
    }

    const fresh = ann.hashes;
    const denseMatch = sameNonEmpty(prev.embedding_input_hash, fresh.embedding_input_hash);
    const liMatch = sameNonEmpty(prev.li_input_hash, fresh.li_input_hash);
    const textMatch = sameNonEmpty(prev.chunk_text_hash, fresh.chunk_text_hash);
    const metadataMatch = prev.metadata_fingerprint === fresh.metadata_fingerprint;

    if (denseMatch && liMatch) {
      const entry = { chunk, ann, prevRow: prev };
      if (!(textMatch && metadataMatch)) {
        // Encoder payloads are still valid, but the row's text and/or
        // metadata moved; flag it so the writer emits a new row version.
        entry.metadataOnly = !metadataMatch;
        entry.textOnly = !textMatch;
        if (!metadataMatch) counters.metadata_dirty += 1;
      }
      toReuse.push(entry);
      counters.hit += 1;
      continue;
    }

    // At least one encoder input changed; name which consumer(s) need work.
    let reason = 'both';
    if (denseMatch) {
      reason = 'li-only';
    } else if (liMatch) {
      reason = 'dense-only';
    }
    toEncode.push({
      chunk, ann, prevRow: prev,
      denseNeeded: !denseMatch,
      liNeeded: !liMatch,
      reason,
    });
    counters.miss += 1;
  }

  const toRetire = [];
  for (const [structKey, row] of dbSnapshot.entries()) {
    const stillPresent = claimedKeys.has(structKey);
    const alreadyTombstoned = row.epoch_retired != null;
    if (stillPresent || alreadyTombstoned) continue;
    toRetire.push({ rowId: row.id, chunkStructId: row.chunk_struct_id });
    counters.retire += 1;
  }

  return { toEncode, toReuse, toRetire, counters };
}
|
|
204
|
+
|
|
205
|
+
/**
 * Apply the writer side of the diff. Exact reuse rows are deliberately
 * left untouched: bumping `epoch_written` in place would make the row
 * disappear for readers pinned to the previous manifest while the SQLite
 * commit is visible but the epoch manifest is not.
 *
 * Text/metadata-only reuse writes a new row version (via
 * `insertReusedRowVersion`) and then retires the previous row at `epoch`.
 * Rows in `toEncode` that carry a previous row have that row retired here;
 * inserting their replacement is not done by this function. Rows in
 * `toRetire` are retired as-is.
 *
 * Retirement is idempotent: the UPDATE only touches rows that are not yet
 * retired or are retired at a later epoch.
 *
 * Any of `diff.toReuse`, `diff.toEncode` and `diff.toRetire` may be omitted
 * and is then treated as empty.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {object} diff Output of `diffChunks`.
 * @param {number} epoch ε+1 for this tick.
 * @returns {{versionedRows:Array<{oldId:string,newId:string,chunkStructId:string}>, replacedRows:Array<{oldId:string,chunkStructId:string}>, retiredRows:Array<{oldId:string,chunkStructId:string}>}}
 * @throws {Error} When `epoch` is not an integer.
 */
export function applyDiff(db, filePath, diff, epoch) {
  if (!Number.isInteger(epoch)) {
    throw new Error(`applyDiff: epoch must be an integer, got ${epoch}`);
  }
  const tombstoneStmt = db.prepare(`
    UPDATE vectors
    SET epoch_retired = ?
    WHERE id = ? AND (epoch_retired IS NULL OR epoch_retired > ?)
  `);
  const summary = { versionedRows: [], replacedRows: [], retiredRows: [] };

  for (const reused of diff.toReuse || []) {
    // Exact matches carry neither flag and must stay byte-for-byte as stored.
    if (!reused.metadataOnly && !reused.textOnly) continue;
    const { chunk, ann, prevRow } = reused;
    const newId = insertReusedRowVersion(db, filePath, chunk, ann, prevRow, epoch);
    tombstoneStmt.run(epoch, prevRow.id, epoch);
    if (newId) {
      summary.versionedRows.push({
        oldId: prevRow.id,
        newId,
        chunkStructId: ann.chunkStructId,
      });
    }
  }

  for (const encoded of diff.toEncode || []) {
    // Brand-new chunks have no previous row and therefore nothing to retire.
    if (encoded.prevRow?.id) {
      tombstoneStmt.run(epoch, encoded.prevRow.id, epoch);
      summary.replacedRows.push({
        oldId: encoded.prevRow.id,
        chunkStructId: encoded.ann?.chunkStructId ?? encoded.prevRow.chunk_struct_id,
      });
    }
  }

  for (const retired of diff.toRetire || []) {
    tombstoneStmt.run(epoch, retired.rowId, epoch);
    summary.retiredRows.push({
      oldId: retired.rowId,
      chunkStructId: retired.chunkStructId,
    });
  }
  return summary;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Derive a row id for a new version of `baseId` written at `epoch` that is
 * not yet present in `vectors`. Tries `<baseId>@e<epoch>` first, then
 * `<baseId>@e<epoch>.1`, `.2`, … until the lookup finds no existing row.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} baseId
 * @param {number} epoch
 * @returns {string}
 */
function uniqueVersionedId(db, baseId, epoch) {
  const stem = `${baseId}@e${epoch}`;
  const lookup = db.prepare('SELECT 1 FROM vectors WHERE id = ?');
  if (!lookup.get(stem)) return stem;
  for (let attemptNo = 1; ; attemptNo += 1) {
    const attempt = `${stem}.${attemptNo}`;
    if (!lookup.get(attempt)) return attempt;
  }
}
|
|
280
|
+
|
|
281
|
+
/**
 * Copies the stored `vectors` row for `prevRow.id` into a new row whose id is
 * versioned by `epoch`, refreshing path, text, metadata, hashes and epoch
 * fields for whichever of those columns the table actually has.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`.
 * @param {string} filePath - Path of the file the chunk belongs to.
 * @param {object} chunk - Current chunk; `content`/`text`/`metadata` are read.
 * @param {object} ann - Chunk annotation; `chunkStructId` and `hashes` are read.
 * @param {{id: string}} prevRow - Row being superseded.
 * @param {number} epoch - Epoch recorded as `epoch_written` on the new row.
 * @returns {string|null} Id of the inserted row, or null when no row with
 *   `prevRow.id` exists.
 */
function insertReusedRowVersion(db, filePath, chunk, ann, prevRow, epoch) {
  const original = db.prepare('SELECT * FROM vectors WHERE id = ?').get(prevRow.id);
  if (!original) {
    return null;
  }

  const columnNames = db
    .prepare('PRAGMA table_info(vectors)')
    .all()
    .map((info) => info.name);
  const present = new Set(columnNames);
  const row = { ...original, id: uniqueVersionedId(db, prevRow.id, epoch) };

  // Only touch columns the table has; the value is computed lazily so that
  // inputs for absent columns are never read.
  const assign = (column, compute) => {
    if (present.has(column)) {
      row[column] = compute();
    }
  };

  assign('file_path', () => coalescePath(filePath, original.file_path));
  assign('text', () => chunk?.content ?? chunk?.text ?? original.text ?? '');
  assign('metadata', () => {
    // Keep the stored metadata verbatim unless the chunk supplies its own.
    if (!Object.hasOwn(chunk ?? {}, 'metadata')) {
      return original.metadata;
    }
    return JSON.stringify(vectorRowMetadata(filePath, chunk, original.metadata, original.file_path));
  });
  assign('chunk_struct_id', () => ann.chunkStructId);
  assign('chunk_text_hash', () => ann.hashes.chunk_text_hash);
  assign('embedding_input_hash', () => ann.hashes.embedding_input_hash);
  assign('li_input_hash', () => ann.hashes.li_input_hash);
  assign('metadata_fingerprint', () => ann.hashes.metadata_fingerprint);
  assign('logical_chunk_id', () => original.logical_chunk_id || ann.chunkStructId);
  assign('epoch_written', () => epoch);
  assign('epoch_retired', () => null);

  const quoted = columnNames.map((name) => `"${name}"`).join(', ');
  const placeholders = Array(columnNames.length).fill('?').join(', ');
  const values = columnNames.map((name) => row[name]);
  db.prepare(`INSERT INTO vectors (${quoted}) VALUES (${placeholders})`).run(...values);
  return row.id;
}
|
|
312
|
+
|
|
313
|
+
/**
 * Parses `raw` as JSON and returns the result only when it is a non-array
 * object. Empty input, invalid JSON and any other JSON value yield `{}`.
 *
 * @param {string|null|undefined} raw - JSON text to parse.
 * @returns {object} The parsed object, or a fresh empty object.
 */
function parseJsonObject(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw || '{}');
  } catch {
    // Unparseable input is treated the same as "no data".
    return {};
  }
  const isPlainContainer = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainContainer ? parsed : {};
}
|
|
321
|
+
|
|
322
|
+
/**
 * Returns the first argument that is neither `undefined` nor `null`.
 * Falsy values such as `0`, `''` and `false` count as present.
 *
 * @param {...*} values - Candidates in priority order.
 * @returns {*} The first present candidate, or `null` when there is none.
 */
function coalesce(...values) {
  // `find` yields undefined when nothing matches, and a match is never
  // nullish, so `??` only kicks in for the "no candidate" case.
  return values.find((candidate) => candidate != null) ?? null;
}
|
|
328
|
+
|
|
329
|
+
/**
 * Returns the first candidate that is a usable relative path, normalized to
 * forward slashes with a leading `./` removed and repeated slashes collapsed.
 *
 * A candidate is skipped when it is not a string, is empty or `.`, is
 * absolute (leading `/` or a drive prefix such as `C:/`), or contains a `..`
 * segment anywhere, including a trailing one such as `a/..`.
 *
 * @param {...*} values - Candidate paths in priority order.
 * @returns {string|null} The normalized path, or `null` when no candidate
 *   qualifies.
 */
function coalescePath(...values) {
  for (const value of values) {
    if (typeof value !== 'string') continue;
    const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    // Windows drive-letter paths are absolute too.
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check so that `..` is caught at the start, middle and end,
    // while names that merely begin with dots (e.g. `..foo`) stay valid.
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
|
|
340
|
+
|
|
341
|
+
/**
 * Builds the metadata object for a vector row by layering values from the
 * chunk's metadata over the previously stored metadata.
 *
 * Every key of the stored metadata is carried over. For the keys listed
 * below the chunk's value wins when present, except `provider` and
 * `dimension`, where the stored value takes precedence.
 *
 * @param {string} filePath - Path of the file the chunk belongs to.
 * @param {object} chunk - Current chunk; `metadata` and `file` are read.
 * @param {string|null|undefined} previousRawMetadata - Stored metadata JSON.
 * @param {string|null|undefined} storedFilePath - `file_path` of the stored row.
 * @returns {object} Merged metadata object.
 */
function vectorRowMetadata(filePath, chunk, previousRawMetadata, storedFilePath) {
  const prior = parseJsonObject(previousRawMetadata);
  const incoming = chunk?.metadata ?? {};

  // Candidate paths in priority order; the first valid relative path wins.
  const fileCandidates = [
    incoming.relative_path,
    incoming.path,
    incoming.file_path,
    storedFilePath,
    filePath,
    chunk?.file,
    incoming.file,
    prior.file,
  ];

  const overrides = {
    file: coalescePath(...fileCandidates),
    type: coalesce(incoming.type, incoming.chunk_type, prior.type, 'code'),
    name: coalesce(incoming.name, incoming.symbol, prior.name),
    startLine: coalesce(incoming.startLine, incoming.line_start, prior.startLine),
    endLine: coalesce(incoming.endLine, incoming.line_end, prior.endLine),
    language: coalesce(incoming.language, prior.language),
    // For these two the stored value is preferred over the chunk's.
    provider: coalesce(prior.provider, incoming.provider),
    dimension: coalesce(prior.dimension, incoming.dimension),
    simhash: coalesce(incoming.simhash, prior.simhash),
    clusterId: coalesce(incoming.clusterId, prior.clusterId),
    exemplarId: coalesce(incoming.exemplarId, prior.exemplarId),
    isExemplar: coalesce(incoming.isExemplar, prior.isExemplar),
  };

  // Stored keys keep their position; override keys replace or append in order.
  return { ...prior, ...overrides };
}
|