sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,234 @@
1
+ /**
2
+ * Tombstone bitmap (`*.stale.bin`).
3
+ *
4
+ * Plan § 7.3 (Float HNSW), § 7.5 (LI segments), § 34.4 (SIMD-ready layout).
5
+ *
6
+ * Each bit marks one element (HNSW key, LI doc, etc.) as stale. Files are
7
+ * **64-byte aligned** so AVX-512 / AVX2 / NEON masking kernels can scan
8
+ * eight bytes at a time without unaligned-load penalties.
9
+ *
10
+ * Layout:
11
+ *
12
+ * header (16 bytes):
13
+ * 0..3 magic = 'SSTB'
14
+ * 4..7 version = uint32 (1)
15
+ * 8..15 capacity bits = uint64 (little-endian)
16
+ *
17
+ * payload:
18
+ * ceil(capacity_bits / 8) bytes, padded up to a 64-byte boundary.
19
+ *
20
+ * The header is part of the 64-byte alignment promise: callers that mmap
21
+ * the file and pass the payload pointer to a SIMD kernel must skip the
22
+ * first 64 bytes, not the first 16. Phase 6 implements the native SIMD
23
+ * masking kernel; Phase 3 ships the scalar JS path under the same layout.
24
+ */
25
+
26
+ import fs from 'node:fs';
27
+ import path from 'node:path';
28
+
29
// Four-byte ASCII tag at offset 0 of every bitmap file; loadBitmap rejects
// files whose first four bytes differ.
const HEADER_MAGIC = Buffer.from('SSTB', 'ascii');
// Layout version written by saveBitmap and the only one loadBitmap accepts.
const HEADER_VERSION = 1;
const HEADER_RESERVED = 64; // bytes reserved before payload (alignment)
// Number of bitmap bits stored in each payload byte.
const BITS_PER_BYTE = 8;
33
+
34
/**
 * Offset, in bytes, at which the payload starts inside a bitmap file.
 * The payload follows the whole reserved header region.
 *
 * @returns {number}
 */
function payloadByteOffset() {
  const offset = HEADER_RESERVED;
  return offset;
}
37
+
38
/**
 * Size of the payload buffer needed for `capacityBits` bits: one bit per
 * element, rounded up to whole bytes and then to a multiple of 64 bytes.
 *
 * @param {number} capacityBits
 * @returns {number} Payload length in bytes (always a multiple of 64).
 */
function payloadByteLength(capacityBits) {
  const ALIGNMENT = 64;
  const dataBytes = Math.ceil(capacityBits / BITS_PER_BYTE);
  // Pad the tail so block-wise reads never run past the buffer.
  const blocks = Math.ceil(dataBytes / ALIGNMENT);
  return blocks * ALIGNMENT;
}
43
+
44
/**
 * Allocate a new, all-clear tombstone bitmap in memory. Nothing is read
 * from or written to disk; callers persist with `saveBitmap`.
 *
 * @param {number} capacityBits Number of addressable bits; must be a positive integer.
 * @returns {{capacity:number, payload:Buffer}}
 * @throws {Error} When `capacityBits` is not a positive integer.
 */
export function createBitmap(capacityBits) {
  const valid = Number.isInteger(capacityBits) && capacityBits > 0;
  if (!valid) {
    throw new Error(`createBitmap: capacityBits must be a positive integer, got ${capacityBits}`);
  }
  // Buffer.alloc zero-fills, so every bit starts out clear.
  const payload = Buffer.alloc(payloadByteLength(capacityBits));
  return { capacity: capacityBits, payload };
}
60
+
61
/**
 * Load a tombstone bitmap from disk. Returns `null` if the file does not
 * exist (caller treats every key as live).
 *
 * The file is read directly rather than probed with `existsSync` first, so
 * a file removed between probe and read cannot surface as an unexpected
 * exception, and errors other than "not found" (e.g. permission denied)
 * are reported instead of being mistaken for "no tombstones".
 *
 * @param {string} filePath
 * @returns {{capacity:number, payload:Buffer}|null}
 * @throws {Error} On a short file, wrong magic, unsupported version,
 *   out-of-range capacity, truncated payload, or any read error other than
 *   the file being absent.
 */
export function loadBitmap(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath);
  } catch (err) {
    // Absent file (or a non-directory path component) means "no bitmap yet".
    if (err?.code === 'ENOENT' || err?.code === 'ENOTDIR') return null;
    throw err;
  }
  if (raw.length < HEADER_RESERVED) {
    throw new Error(`loadBitmap: ${filePath} too short (${raw.length} bytes)`);
  }
  if (!raw.subarray(0, 4).equals(HEADER_MAGIC)) {
    throw new Error(`loadBitmap: ${filePath} magic mismatch`);
  }
  const version = raw.readUInt32LE(4);
  if (version !== HEADER_VERSION) {
    throw new Error(`loadBitmap: unsupported version ${version}`);
  }
  const capacityField = raw.readBigUInt64LE(8);
  // Number() would silently round a uint64 above 2^53 - 1; reject it instead.
  if (capacityField > BigInt(Number.MAX_SAFE_INTEGER)) {
    throw new Error(`loadBitmap: ${filePath} capacity ${capacityField} exceeds the safe integer range`);
  }
  const capacity = Number(capacityField);
  const payloadLength = payloadByteLength(capacity);
  const expectedLength = HEADER_RESERVED + payloadLength;
  if (raw.length < expectedLength) {
    throw new Error(`loadBitmap: ${filePath} truncated payload (${raw.length} bytes, expected ${expectedLength})`);
  }
  return {
    capacity,
    // Copy so the returned payload does not alias the file read buffer.
    payload: Buffer.from(raw.subarray(HEADER_RESERVED, expectedLength)),
  };
}
92
+
93
/**
 * Persist the bitmap atomically (`*.tmp` + fsync + rename).
 *
 * The file is a `HEADER_RESERVED`-byte header (magic, version, capacity;
 * remaining bytes zero) followed by the payload. If writing or renaming
 * fails, the temporary file is removed (best-effort) and the original error
 * is rethrown, so a failed save does not leave a stale `*.tmp` behind.
 *
 * @param {string} filePath
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @throws {Error} Any filesystem error from creating the directory,
 *   writing, syncing or renaming the file.
 */
export function saveBitmap(filePath, bitmap) {
  const dir = path.dirname(filePath);
  fs.mkdirSync(dir, { recursive: true });
  const tmp = filePath + '.tmp';
  const header = Buffer.alloc(HEADER_RESERVED);
  HEADER_MAGIC.copy(header, 0);
  header.writeUInt32LE(HEADER_VERSION, 4);
  header.writeBigUInt64LE(BigInt(bitmap.capacity), 8);
  const out = Buffer.concat([header, bitmap.payload]);

  try {
    const fd = fs.openSync(tmp, 'w');
    try {
      // writeFileSync on a descriptor retries until every byte is written;
      // a single writeSync call may return after a partial write.
      fs.writeFileSync(fd, out);
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
    fs.renameSync(tmp, filePath);
  } catch (err) {
    try {
      fs.rmSync(tmp, { force: true });
    } catch {
      // Cleanup is best-effort; the original failure is what matters.
    }
    throw err;
  }

  try {
    const dirFd = fs.openSync(dir, 'r');
    try { fs.fsyncSync(dirFd); } finally { fs.closeSync(dirFd); }
  } catch {
    // Best-effort: some tmpfs/container filesystems reject directory fsync.
  }
}
123
+
124
/**
 * Resize a bitmap (used when the underlying graph grows past capacity).
 * Preserves existing bits. The bitmap is grown in place and returned; a
 * target that is not larger than the current capacity is a no-op.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} newCapacityBits
 * @returns {{capacity:number, payload:Buffer}} The same `bitmap` object.
 * @throws {Error} When growth is requested with a non-integer capacity.
 */
export function resizeBitmap(bitmap, newCapacityBits) {
  if (newCapacityBits <= bitmap.capacity) return bitmap;
  // A fractional or NaN capacity would be stored as-is and only fail later,
  // when saveBitmap converts it with BigInt(). Reject it before mutating.
  if (!Number.isInteger(newCapacityBits)) {
    throw new Error(`resizeBitmap: newCapacityBits must be an integer, got ${newCapacityBits}`);
  }
  const fresh = Buffer.alloc(payloadByteLength(newCapacityBits));
  bitmap.payload.copy(fresh, 0, 0, bitmap.payload.length);
  bitmap.capacity = newCapacityBits;
  bitmap.payload = fresh;
  return bitmap;
}
139
+
140
/**
 * Split a bit index into the payload byte that holds it and the mask that
 * selects the bit inside that byte (bit 0 is the least significant bit).
 *
 * Uses arithmetic division rather than `>>>`/`&`: the bitwise operators
 * coerce their operand to 32 bits, which would wrap indices at or above
 * 2^32 back onto the first bytes of the payload.
 *
 * @param {number} index Non-negative integer bit index (callers validate).
 * @returns {{byte:number, mask:number}}
 */
function byteAndMask(index) {
  const byte = Math.floor(index / 8);
  const mask = 1 << (index % 8);
  return { byte, mask };
}
145
+
146
/**
 * Mark one element as stale by setting its bit.
 *
 * Non-integer indices (NaN, undefined, fractions) are rejected: they pass
 * plain `<` / `>=` range comparisons and would otherwise be coerced by the
 * bit arithmetic, silently setting an unrelated bit.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index Integer in `[0, bitmap.capacity)`.
 * @throws {RangeError} When `index` is not an integer inside the capacity.
 */
export function setBit(bitmap, index) {
  if (!Number.isInteger(index) || index < 0 || index >= bitmap.capacity) {
    throw new RangeError(`setBit: ${index} outside bitmap capacity ${bitmap.capacity}`);
  }
  const { byte, mask } = byteAndMask(index);
  bitmap.payload[byte] |= mask;
}
153
+
154
/**
 * Mark one element as live again by clearing its bit.
 *
 * Non-integer indices (NaN, undefined, fractions) are rejected: they pass
 * plain `<` / `>=` range comparisons and would otherwise be coerced by the
 * bit arithmetic, silently clearing an unrelated bit.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index Integer in `[0, bitmap.capacity)`.
 * @throws {RangeError} When `index` is not an integer inside the capacity.
 */
export function clearBit(bitmap, index) {
  if (!Number.isInteger(index) || index < 0 || index >= bitmap.capacity) {
    throw new RangeError(`clearBit: ${index} outside bitmap capacity ${bitmap.capacity}`);
  }
  const { byte, mask } = byteAndMask(index);
  // Keep the inverted mask within one byte before applying it.
  bitmap.payload[byte] &= ~mask & 0xFF;
}
161
+
162
/**
 * Report whether an element is tombstoned.
 *
 * Anything that is not an integer inside `[0, bitmap.capacity)` is treated
 * as "not set". Without the integer check, NaN or undefined would slip past
 * the range comparisons and report the state of bit 0.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} index
 * @returns {boolean} `true` only when `index` is in range and its bit is set.
 */
export function isSet(bitmap, index) {
  if (!Number.isInteger(index) || index < 0 || index >= bitmap.capacity) return false;
  const { byte, mask } = byteAndMask(index);
  return (bitmap.payload[byte] & mask) !== 0;
}
167
+
168
/**
 * Count the set bits among the first `bitmap.capacity` bits of the payload.
 * Plan § 7.3 tombstone_fraction = count / total.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @returns {number}
 */
export function popcount(bitmap) {
  const { payload, capacity } = bitmap;
  const wholeBytes = Math.floor(capacity / BITS_PER_BYTE);
  const wordLimit = wholeBytes - (wholeBytes % 4);

  // Brian Kernighan's trick: each step clears the lowest set bit.
  const bitsIn = (value) => {
    let n = 0;
    for (let v = value; v !== 0; v &= v - 1) n += 1;
    return n;
  };

  let total = 0;
  let offset = 0;
  // Bulk of the data, 32 bits at a time.
  for (; offset < wordLimit; offset += 4) {
    total += bitsIn(payload.readUInt32LE(offset));
  }
  // Up to three leftover whole bytes.
  for (; offset < wholeBytes; offset += 1) {
    total += bitsIn(payload[offset]);
  }
  // Final partial byte: only bits below the capacity count; alignment
  // padding is not data.
  const spareBits = capacity % BITS_PER_BYTE;
  if (spareBits > 0) {
    total += bitsIn(payload[wholeBytes] & ((1 << spareBits) - 1));
  }
  return total;
}
205
+
206
/**
 * Keep only the candidates whose bit is not set in the bitmap, preserving
 * their order. Scalar implementation.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number[]} candidates
 * @returns {number[]} A new array; `candidates` is not modified.
 */
export function filterLive(bitmap, candidates) {
  return [...candidates].filter((candidate) => !isSet(bitmap, candidate));
}
221
+
222
/**
 * Fraction of elements that are tombstoned:
 * `tombstoned / (liveTotal + tombstoned)`, or 0 when that sum is 0.
 *
 * @param {{capacity:number, payload:Buffer}} bitmap
 * @param {number} liveTotal Total live elements (not bitmap capacity).
 * @returns {number}
 */
export function tombstoneFraction(bitmap, liveTotal) {
  const stale = popcount(bitmap);
  const total = liveTotal + stale;
  return total === 0 ? 0 : stale / total;
}
@@ -0,0 +1,359 @@
1
+ /**
2
+ * Vector delta writer.
3
+ *
4
+ * Plan § 7.2 + § 13 Phase 1. The reconcile tick translates a list of
5
+ * dirty chunks into per-row UPSERTs against `codebase.db::vectors`
6
+ * keyed on `(file_path, chunk_struct_id)`. Stable chunks whose
7
+ * `embedding_input_hash` and `li_input_hash` are unchanged keep their
8
+ * BLOB; only changed payloads run through the encoder.
9
+ *
10
+ * This module is intentionally narrow:
11
+ * - It knows the vectors-table column layout (post `migrateVectorsSchema`).
12
+ * - It does NOT call the encoder. The caller (reconcile application
13
+ * service) decides what to re-encode based on the diff result here.
14
+ * - It does NOT touch HNSW, LI, or sparse-gram artifacts; per-tier
15
+ * side effects are dispatched by the reconciler.
16
+ *
17
+ * The diff is the load-bearing API. Given the chunker output for a file
18
+ * and the current DB state, it returns:
19
+ *
20
+ * {
21
+ * toEncode: [ { chunk, denseNeeded, liNeeded } ],
22
+ * toReuse: [ { chunk, prevRow } ],
23
+ * toRetire: [ { rowId, chunkStructId } ],
24
+ * metadataDirty: [ chunk_struct_id ], // populated by reconciler
25
+ * counters: { hit, miss, ... },
26
+ * }
27
+ *
28
+ * The "retire" set covers chunks that existed in DB for this file but no
29
+ * longer have a matching `chunk_struct_id`. Per plan § 7.2, those rows are
30
+ * tombstoned in the same per-file transaction by setting
31
+ * `epoch_retired = ε+1`; the reconciler does the actual SQL write.
32
+ */
33
+
34
+ import { assignStructuralIds } from '../domain/chunk-identity.mjs';
35
+ import { chunkInputHashes } from '../domain/encoder-input.mjs';
36
+
37
/**
 * Build one annotation per chunk, pairing the identity fields returned by
 * `assignStructuralIds` with the hashes returned by `chunkInputHashes`.
 * The result is a parallel array; the chunk objects are not written to here.
 *
 * @param {Array<object>} chunks
 * @param {string} filePath
 * @returns {Array<{chunkStructId:string, structural:boolean, occurrenceIndex:number|null, rollingHash:*, reason:*, hashes:{chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, dedup_fingerprint:string}}>}
 */
export function annotateChunksForDelta(chunks, filePath) {
  const identities = assignStructuralIds(chunks, filePath);
  return identities.map(
    ({ chunkStructId, structural, occurrenceIndex, rollingHash, reason }, position) => ({
      chunkStructId,
      structural,
      occurrenceIndex,
      rollingHash,
      reason,
      hashes: chunkInputHashes(chunks[position]),
    }),
  );
}
56
+
57
/**
 * Read the hash/epoch columns of the `vectors` rows for one file and index
 * them by `chunk_struct_id`. Rows with a null or empty structural ID are
 * keyed as `legacy:<row id>` instead. Rows arrive newest `epoch_written`
 * first, and the first row seen for a key wins.
 *
 * Visibility: with an integer `options.manifestEpoch`, only rows written at
 * or before that epoch and not retired at or before it are returned;
 * otherwise only rows with `epoch_retired IS NULL`.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {{manifestEpoch?: number}} [options]
 * @returns {Map<string, {id:string, chunk_struct_id:string, chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, epoch_written:number, epoch_retired:number|null}>}
 */
export function snapshotFileRows(db, filePath, options = {}) {
  const { manifestEpoch } = options;
  const params = [filePath];
  let visibilitySql;
  if (Number.isInteger(manifestEpoch)) {
    visibilitySql = `AND (epoch_written IS NULL OR epoch_written <= ?)
      AND (epoch_retired IS NULL OR epoch_retired > ?)`;
    params.push(manifestEpoch, manifestEpoch);
  } else {
    visibilitySql = `AND epoch_retired IS NULL`;
  }

  const statement = db.prepare(`
    SELECT id, chunk_struct_id, chunk_text_hash, embedding_input_hash,
           li_input_hash, metadata_fingerprint, epoch_written, epoch_retired
    FROM vectors
    WHERE file_path = ?
      ${visibilitySql}
    ORDER BY epoch_written DESC
  `);

  const snapshot = new Map();
  for (const row of statement.all(...params)) {
    const structId = row.chunk_struct_id;
    const key = structId && structId.length > 0 ? structId : `legacy:${row.id}`;
    // Keep the first (newest) row for each key.
    if (!snapshot.has(key)) snapshot.set(key, row);
  }
  return snapshot;
}
94
+
95
+ /**
96
+ * Compute the diff between annotated chunks and the current DB rows.
97
+ *
98
+ * Decision rules per chunk:
99
+ * - structural ID matches an existing row + embedding_input_hash matches
100
+ * → dense reuse.
101
+ * - structural ID matches + li_input_hash matches → LI reuse.
102
+ * - structural ID matches + encoder hashes match but chunk_text_hash or
103
+ * metadata_fingerprint changed → version the row while reusing encoder
104
+ * payloads, so readers pinned to the next epoch see fresh text/metadata.
105
+ * - structural ID matches + only metadata_fingerprint changed →
106
+ * metadata-dirty (caller may need to re-run graph enrichment and
107
+ * re-hash; treat as "needs encode" defensively until the reconciler
108
+ * decides).
109
+ * - new chunk_struct_id → insert + encode both.
110
+ * - existing rows whose struct id is absent from the new chunk list →
111
+ * retire (tombstone in the same per-file transaction).
112
+ *
113
+ * @param {Array<object>} chunks Output of the chunker.
114
+ * @param {Array<{chunkStructId:string, hashes:object}>} annotations From annotateChunksForDelta.
115
+ * @param {Map<string, object>} dbSnapshot From snapshotFileRows.
116
+ * @returns {{toEncode:Array, toReuse:Array, toRetire:Array, counters:{hit:number, miss:number, retire:number, metadata_dirty:number}}}
117
+ */
118
+ export function diffChunks(chunks, annotations, dbSnapshot) {
119
+ const toEncode = [];
120
+ const toReuse = [];
121
+ const seenIds = new Set();
122
+ const counters = { hit: 0, miss: 0, retire: 0, metadata_dirty: 0 };
123
+
124
+ for (let i = 0; i < chunks.length; i++) {
125
+ const chunk = chunks[i];
126
+ const ann = annotations[i];
127
+ if (!ann || !ann.chunkStructId) {
128
+ // Fallback path: chunk has no structural ID. The reconciler still
129
+ // needs to encode + insert; structural reuse is impossible.
130
+ toEncode.push({
131
+ chunk, ann,
132
+ denseNeeded: true, liNeeded: true,
133
+ reason: 'no-struct-id',
134
+ });
135
+ counters.miss += 1;
136
+ continue;
137
+ }
138
+ const key = ann.chunkStructId;
139
+ seenIds.add(key);
140
+ const prev = dbSnapshot.get(key);
141
+
142
+ if (!prev) {
143
+ toEncode.push({
144
+ chunk, ann,
145
+ denseNeeded: true, liNeeded: true,
146
+ reason: 'new',
147
+ });
148
+ counters.miss += 1;
149
+ continue;
150
+ }
151
+
152
+ const denseMatch = prev.embedding_input_hash === ann.hashes.embedding_input_hash
153
+ && ann.hashes.embedding_input_hash !== '';
154
+ const liMatch = prev.li_input_hash === ann.hashes.li_input_hash
155
+ && ann.hashes.li_input_hash !== '';
156
+
157
+ const textMatch = prev.chunk_text_hash === ann.hashes.chunk_text_hash
158
+ && ann.hashes.chunk_text_hash !== '';
159
+ const metadataMatch = prev.metadata_fingerprint === ann.hashes.metadata_fingerprint;
160
+
161
+ if (denseMatch && liMatch && textMatch && metadataMatch) {
162
+ toReuse.push({ chunk, ann, prevRow: prev });
163
+ counters.hit += 1;
164
+ continue;
165
+ }
166
+
167
+ if (denseMatch && liMatch) {
168
+ // Text and/or metadata shifted, but encoder payloads are still valid.
169
+ // Write a new row version reusing the old embedding BLOB so old and new
170
+ // manifest epochs each see their matching row contents.
171
+ toReuse.push({
172
+ chunk,
173
+ ann,
174
+ prevRow: prev,
175
+ metadataOnly: !metadataMatch,
176
+ textOnly: !textMatch,
177
+ });
178
+ if (!metadataMatch) counters.metadata_dirty += 1;
179
+ counters.hit += 1;
180
+ continue;
181
+ }
182
+
183
+ // Partial reuse: dense XOR LI. The reconciler can choose to re-encode
184
+ // only the affected consumer.
185
+ toEncode.push({
186
+ chunk, ann, prevRow: prev,
187
+ denseNeeded: !denseMatch,
188
+ liNeeded: !liMatch,
189
+ reason: denseMatch ? 'li-only' : (liMatch ? 'dense-only' : 'both'),
190
+ });
191
+ counters.miss += 1;
192
+ }
193
+
194
+ const toRetire = [];
195
+ for (const [key, prev] of dbSnapshot.entries()) {
196
+ if (seenIds.has(key)) continue;
197
+ if (prev.epoch_retired != null) continue; // already tombstoned
198
+ toRetire.push({ rowId: prev.id, chunkStructId: prev.chunk_struct_id });
199
+ counters.retire += 1;
200
+ }
201
+
202
+ return { toEncode, toReuse, toRetire, counters };
203
+ }
204
+
205
/**
 * Apply the writer side of the diff. Exact reuse rows are deliberately
 * left untouched: bumping `epoch_written` in place would make the row
 * disappear for readers pinned to the previous manifest while the SQLite
 * commit is visible but the epoch manifest is not.
 *
 * Text/metadata-only reuse writes a new row version that reuses the previous
 * embedding BLOB, then retires the previous row at `epoch`. Rows in
 * `toEncode` that carry a previous row, and every row in `toRetire`, are
 * retired at `epoch` as well; inserting the newly encoded rows is not done
 * here.
 *
 * Each of `diff.toReuse`, `diff.toEncode` and `diff.toRetire` may be
 * omitted; a missing list is treated as empty.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {object} diff Output of `diffChunks`.
 * @param {number} epoch ε+1 for this tick.
 * @returns {{versionedRows:Array<{oldId:string,newId:string,chunkStructId:string}>, replacedRows:Array<{oldId:string,chunkStructId:string}>, retiredRows:Array<{oldId:string,chunkStructId:string}>}}
 * @throws {Error} When `epoch` is not an integer.
 */
export function applyDiff(db, filePath, diff, epoch) {
  if (!Number.isInteger(epoch)) {
    throw new Error(`applyDiff: epoch must be an integer, got ${epoch}`);
  }
  // Only touches rows that are still visible at `epoch`, so re-running the
  // statement for an already-retired row changes nothing.
  const tombstoneStmt = db.prepare(`
    UPDATE vectors
    SET epoch_retired = ?
    WHERE id = ? AND (epoch_retired IS NULL OR epoch_retired > ?)
  `);
  const summary = { versionedRows: [], replacedRows: [], retiredRows: [] };

  for (const reused of diff.toReuse || []) {
    // Exact reuse: leave the row exactly as it is.
    if (!reused.metadataOnly && !reused.textOnly) continue;
    const { chunk, ann, prevRow } = reused;
    const newId = insertReusedRowVersion(db, filePath, chunk, ann, prevRow, epoch);
    tombstoneStmt.run(epoch, prevRow.id, epoch);
    if (newId) {
      summary.versionedRows.push({
        oldId: prevRow.id,
        newId,
        chunkStructId: ann.chunkStructId,
      });
    }
  }

  for (const encoded of diff.toEncode || []) {
    if (encoded.prevRow?.id) {
      tombstoneStmt.run(epoch, encoded.prevRow.id, epoch);
      summary.replacedRows.push({
        oldId: encoded.prevRow.id,
        chunkStructId: encoded.ann?.chunkStructId ?? encoded.prevRow.chunk_struct_id,
      });
    }
  }

  for (const retired of diff.toRetire || []) {
    tombstoneStmt.run(epoch, retired.rowId, epoch);
    summary.retiredRows.push({
      oldId: retired.rowId,
      chunkStructId: retired.chunkStructId,
    });
  }
  return summary;
}
270
+
271
/**
 * Pick an unused `vectors.id` for a new version of row `baseId`: first
 * `<baseId>@e<epoch>`, then `<baseId>@e<epoch>.1`, `.2`, … until the lookup
 * finds no existing row.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} baseId
 * @param {number} epoch
 * @returns {string}
 */
function uniqueVersionedId(db, baseId, epoch) {
  const lookup = db.prepare('SELECT 1 FROM vectors WHERE id = ?');
  const stem = `${baseId}@e${epoch}`;
  if (!lookup.get(stem)) return stem;
  for (let n = 1; ; n += 1) {
    const candidate = `${stem}.${n}`;
    if (!lookup.get(candidate)) return candidate;
  }
}
280
+
281
/**
 * Insert a new version of an existing `vectors` row: every column is copied
 * from the stored row (so encoder payload columns are reused unchanged) and
 * then the id, path, text, metadata, hash and epoch columns that exist in
 * the table are overwritten with values for the new chunk.
 *
 * If `coalescePath` accepts neither the incoming path nor the stored one,
 * the stored `file_path` is kept as-is rather than written as NULL, so the
 * new version stays reachable by path once the old row is retired.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {object} chunk Chunker output for the new version.
 * @param {{chunkStructId:string, hashes:object}} ann Annotation for `chunk`.
 * @param {{id:string}} prevRow Row being superseded.
 * @param {number} epoch Epoch recorded as `epoch_written` on the new row.
 * @returns {string|null} The new row id, or `null` when the previous row no
 *   longer exists.
 */
function insertReusedRowVersion(db, filePath, chunk, ann, prevRow, epoch) {
  const source = db.prepare('SELECT * FROM vectors WHERE id = ?').get(prevRow.id);
  if (!source) return null;

  const columns = db.prepare('PRAGMA table_info(vectors)').all().map((c) => c.name);
  const next = { ...source };
  next.id = uniqueVersionedId(db, prevRow.id, epoch);
  if (columns.includes('file_path')) {
    next.file_path = coalescePath(filePath, source.file_path) ?? source.file_path;
  }
  if (columns.includes('text')) next.text = chunk?.content ?? chunk?.text ?? source.text ?? '';
  if (columns.includes('metadata')) {
    // Rebuild metadata only when the chunk carries its own metadata key.
    next.metadata = Object.hasOwn(chunk ?? {}, 'metadata')
      ? JSON.stringify(vectorRowMetadata(filePath, chunk, source.metadata, source.file_path))
      : source.metadata;
  }
  if (columns.includes('chunk_struct_id')) next.chunk_struct_id = ann.chunkStructId;
  if (columns.includes('chunk_text_hash')) next.chunk_text_hash = ann.hashes.chunk_text_hash;
  if (columns.includes('embedding_input_hash')) next.embedding_input_hash = ann.hashes.embedding_input_hash;
  if (columns.includes('li_input_hash')) next.li_input_hash = ann.hashes.li_input_hash;
  if (columns.includes('metadata_fingerprint')) next.metadata_fingerprint = ann.hashes.metadata_fingerprint;
  if (columns.includes('logical_chunk_id')) {
    next.logical_chunk_id = source.logical_chunk_id || ann.chunkStructId;
  }
  if (columns.includes('epoch_written')) next.epoch_written = epoch;
  if (columns.includes('epoch_retired')) next.epoch_retired = null;

  const quoted = columns.map((c) => `"${c}"`).join(', ');
  const placeholders = columns.map(() => '?').join(', ');
  db.prepare(`INSERT INTO vectors (${quoted}) VALUES (${placeholders})`)
    .run(...columns.map((c) => next[c]));
  return next.id;
}
312
+
313
/**
 * Parse `raw` as JSON and return it only if it is a non-array object.
 * Falsy input, unparseable input, arrays, primitives and `null` all yield a
 * fresh empty object.
 *
 * @param {*} raw
 * @returns {object}
 */
function parseJsonObject(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw || '{}');
  } catch {
    return {};
  }
  const isRecord = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
  return isRecord ? parsed : {};
}
321
+
322
/**
 * Return the first argument that is neither `undefined` nor `null`
 * (so `0`, `''` and `false` count as values), or `null` if there is none.
 *
 * @param {...*} values
 * @returns {*}
 */
function coalesce(...values) {
  const found = values.find((candidate) => candidate !== undefined && candidate !== null);
  return found === undefined ? null : found;
}
328
+
329
/**
 * Return the first argument that is a usable relative path, normalized to
 * forward slashes with one leading `./` removed and repeated slashes
 * collapsed.
 *
 * A candidate is skipped when it is not a string, is empty or `.`, is
 * absolute (leading `/` or a drive-letter prefix such as `C:/`), or has any
 * `..` path segment. Segments are checked individually so a trailing `..`
 * (`a/..`) is rejected just like a leading or middle one, while names that
 * merely contain dots (`..hidden`, `a..b`) are still accepted.
 *
 * @param {...*} values
 * @returns {string|null} The normalized path, or `null` if none qualifies.
 */
function coalescePath(...values) {
  for (const value of values) {
    if (typeof value !== 'string') continue;
    const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
340
+
341
/**
 * Build the metadata object for a new row version. Starts from the parsed
 * previous metadata and overrides a fixed set of keys. Most keys prefer the
 * chunk's metadata and fall back to the previous value; `provider` and
 * `dimension` prefer the previous value.
 *
 * @param {string} filePath
 * @param {object} chunk
 * @param {string} previousRawMetadata JSON text of the previous row's metadata.
 * @param {string} storedFilePath `file_path` of the previous row.
 * @returns {object}
 */
function vectorRowMetadata(filePath, chunk, previousRawMetadata, storedFilePath) {
  const prior = parseJsonObject(previousRawMetadata);
  const current = chunk?.metadata ?? {};

  const overrides = {
    file: coalescePath(
      current.relative_path,
      current.path,
      current.file_path,
      storedFilePath,
      filePath,
      chunk?.file,
      current.file,
      prior.file,
    ),
    type: coalesce(current.type, current.chunk_type, prior.type, 'code'),
    name: coalesce(current.name, current.symbol, prior.name),
    startLine: coalesce(current.startLine, current.line_start, prior.startLine),
    endLine: coalesce(current.endLine, current.line_end, prior.endLine),
    language: coalesce(current.language, prior.language),
    // Previous value wins for these two keys.
    provider: coalesce(prior.provider, current.provider),
    dimension: coalesce(prior.dimension, current.dimension),
    simhash: coalesce(current.simhash, prior.simhash),
    clusterId: coalesce(current.clusterId, prior.clusterId),
    exemplarId: coalesce(current.exemplarId, prior.exemplarId),
    isExemplar: coalesce(current.isExemplar, prior.isExemplar),
  };

  return { ...prior, ...overrides };
}