sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,519 @@
1
+ /**
2
+ * Maintenance handlers — real implementations for the four reclamation
3
+ * tiers the soak REPORT.md flagged as queue-only.
4
+ *
5
+ * Each handler is registered by `defaultMaintenanceHandlers()` in
6
+ * `maintenance-worker.mjs`. Handlers run inside the reconcile daemon's
7
+ * single-writer process, so on-disk artifacts have one writer at a time.
8
+ *
9
+ * Atomicity contract: per artifact-family, each handler writes its new
10
+ * artifacts via a path that sorts later than the existing ones (sparse
11
+ * gram), an explicit temp+rename (LI segments, HNSW meta/usearch/vectors),
12
+ * or via the existing `*.next` clean-build flag (Binary HNSW). After a
13
+ * successful publish the handler clears the tier's stale bitmap; on
14
+ * failure the previous artifacts remain readable.
15
+ *
16
+ * Manifest semantics:
17
+ * - sparse_gram, LI segment: the reconcile manifest is unchanged. New
18
+ * artifacts replace old ones at canonical paths read fresh per query.
19
+ * - HNSW (float / binary): canonical paths unchanged; the reconcile
20
+ * manifest stays at the current epoch. Cross-process readers that
21
+ * cache an HNSWIndex instance in memory MUST already invalidate on
22
+ * manifest change — but maintenance does not bump the epoch by
23
+ * itself. This matches the existing reconcile tick semantics; a
24
+ * follow-up workstream can add versioned tier paths if needed.
25
+ *
26
+ * The handlers degrade safely when artifacts are missing/corrupt — they
27
+ * throw a descriptive error which the worker converts into the standard
28
+ * retry/dead-letter path.
29
+ */
30
+
31
+ import fs from 'node:fs';
32
+ import path from 'node:path';
33
+ import Database from 'better-sqlite3';
34
+
35
+ import { BinaryHNSWIndex } from '../../vector-store/binary-hnsw-index.js';
36
+ import { HNSWIndex } from '../../vector-store/hnsw-index.js';
37
+ import { LateInteractionIndex } from '../../ranking/late-interaction-index.js';
38
+ import { compactDeltaSegments, listDeltaSegments } from '../infrastructure/sparse-gram-delta.mjs';
39
+ import { mergeLiSegments, LI_MERGE_GRACE_MS } from '../infrastructure/li-segment-merge.mjs';
40
+ import { runVectorGc } from '../infrastructure/vector-gc.mjs';
41
+ import { runGraphGc } from '../infrastructure/graph-gc.mjs';
42
+ import { minLiveEpoch } from '../infrastructure/reader-heartbeat.mjs';
43
+ import { readManifest, writeManifest } from '../infrastructure/manifest.mjs';
44
+ import {
45
+ loadBitmap, popcount, isSet, createBitmap, saveBitmap,
46
+ } from '../infrastructure/tombstone-bitmap.mjs';
47
+
48
/**
 * Best-effort file removal: unlink `p` and ignore any failure (for example
 * when the file is already gone).
 *
 * @param {string} p - Path of the file to remove.
 * @returns {void}
 */
function safeUnlink(p) {
  try {
    fs.unlinkSync(p);
  } catch {
    // Deliberately ignored: callers treat this removal as optional cleanup.
  }
}
49
/**
 * Normalize an optional progress callback into an always-callable reporter.
 *
 * @param {*} onProgress - Candidate callback; anything that is not a function
 *   yields a no-op reporter.
 * @returns {(phase: string) => void} Reporter that forwards only `phase` to
 *   the callback and discards the callback's return value.
 */
function progressFn(onProgress) {
  if (typeof onProgress !== 'function') {
    return () => {};
  }
  return (phase) => {
    onProgress(phase);
  };
}
54
+
55
/**
 * Decode a binary blob into a standalone `Float32Array`.
 *
 * The bytes are copied into a fresh `ArrayBuffer` before the typed array is
 * created, so the result does not alias the source memory and the source
 * view's `byteOffset` does not have to be 4-byte aligned.
 *
 * @param {ArrayBufferView} buffer - A `Buffer` or other typed-array/DataView
 *   holding raw float32 bytes.
 * @returns {Float32Array} Copy of the bytes viewed as 32-bit floats.
 * @throws {TypeError} When `buffer` is not an ArrayBuffer view (e.g. `null`).
 * @throws {RangeError} When the byte length is not a multiple of 4.
 */
function float32FromBuffer(buffer) {
  if (!ArrayBuffer.isView(buffer)) {
    const got = buffer === null ? 'null' : typeof buffer;
    throw new TypeError(`float32FromBuffer: expected a Buffer or typed array, got ${got}`);
  }
  const { byteOffset, byteLength } = buffer;
  const width = Float32Array.BYTES_PER_ELEMENT;
  if (byteLength % width !== 0) {
    // Fail with a message that names the offending length instead of the
    // engine's generic typed-array error.
    throw new RangeError(
      `float32FromBuffer: byte length ${byteLength} is not a multiple of ${width}`,
    );
  }
  const bytes = buffer.buffer.slice(byteOffset, byteOffset + byteLength);
  return new Float32Array(bytes);
}
59
+
60
+ /* ------------------------------------------------------------------ *
61
+ * sparse_gram *
62
+ * ------------------------------------------------------------------ */
63
+
64
/**
 * Compact the sparse-gram delta segments and publish the result.
 *
 * Order of operations:
 *   1. Run `compactDeltaSegments` with `deferDelete: true`, so the segments it
 *      consumed are reported back instead of being removed by the helper.
 *   2. If the reconcile manifest has a `sparseGram` entry, rewrite its
 *      `deltas` list to the segments currently listed minus the consumed ones.
 *   3. Unlink the consumed segments only when the manifest was rewritten, or
 *      when it has no `sparseGram` entry at all. A manifest read/write error
 *      is captured and returned as `manifestError`, not thrown.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory holding the index artifacts, plus an optional heartbeat
 *   callback.
 * @returns {Promise<object>} `{ skipped }` when compaction reports a skip;
 *   otherwise `{ tier, consumedSegments, recordsWritten, compactedPath,
 *   manifestUpdated, manifestError? }`, where `consumedSegments` counts the
 *   segments actually unlinked.
 */
export async function sparseGramHandler(job, { stateDir, onProgress = null }) {
  const progress = progressFn(onProgress);
  const base = path.join(stateDir, 'codebase-sparse-grams.idx');
  // Stage the compaction in deferred-delete mode: the consumed old segments
  // stay on disk until the reconcile manifest has been rewritten (or we have
  // confirmed nothing pins the old paths), so a cross-process reader holding
  // the OLD manifest's `sparseGram.deltas` list never resolves deleted files.
  const result = compactDeltaSegments(base, { dropTombstones: false, deferDelete: true });
  progress('maintenance:sparse-gram:compacted');
  if (result.skipped) {
    return { skipped: result.skipped };
  }

  const consumedSet = new Set(result.consumedSegmentPaths);
  let manifestUpdated = false;
  let manifestError = null;
  let hadSparseGramPin = false;
  try {
    const manifest = readManifest(stateDir);
    if (manifest?.sparseGram) {
      hadSparseGramPin = true;
      // Future-of-disk list: everything currently in the delta dir minus the
      // segments we are about to unlink. Filtering keeps the list correct even
      // if another segment appeared between compaction and this write.
      const remaining = listDeltaSegments(base).filter((seg) => !consumedSet.has(seg.path));
      manifest.sparseGram.deltas = remaining.map((seg) =>
        path.relative(stateDir, seg.path).replace(/\\/g, '/'),
      );
      writeManifest(stateDir, manifest);
      manifestUpdated = true;
    }
  } catch (err) {
    // Reported in the result; the old segments are kept so a later pass can
    // retry the compaction and the manifest publish.
    manifestError = err?.message || String(err);
  }

  // Publish gate: only delete the old segments once the new manifest is live
  // (or we know nobody is pinning the old paths).
  let unlinked = 0;
  const safeToUnlink = manifestUpdated || !hadSparseGramPin;
  if (safeToUnlink) {
    let attempted = 0;
    for (const segPath of result.consumedSegmentPaths) {
      try {
        fs.unlinkSync(segPath);
        unlinked += 1;
      } catch {
        // Tolerate concurrent deletion.
      }
      attempted += 1;
      // Heartbeat every 100 segments processed. Keyed on attempts, not on
      // successes, so failed unlinks neither flood nor starve the heartbeat.
      if (attempted % 100 === 0) progress('maintenance:sparse-gram:unlink');
    }
  }

  return {
    tier: 'sparse_gram',
    consumedSegments: unlinked,
    recordsWritten: result.recordsWritten,
    compactedPath: path.relative(stateDir, result.compactedPath).replace(/\\/g, '/'),
    manifestUpdated,
    ...(manifestError ? { manifestError } : {}),
  };
}
126
+
127
+ /* ------------------------------------------------------------------ *
128
+ * binary_hnsw *
129
+ * ------------------------------------------------------------------ */
130
+
131
/**
 * Read the set of live vector ids from `codebase.db` (`epoch_retired IS NULL`).
 *
 * Returns `null` — so the caller can fall back to the stale bitmap — whenever
 * the ids cannot be read: the file is missing, the database cannot be opened,
 * the `vectors` table has no `epoch_retired` column, or a query fails.
 *
 * @param {string} stateDir - Directory containing `codebase.db`.
 * @returns {Set<*> | null} Ids of rows whose `epoch_retired` is NULL, or
 *   `null` when they are unavailable.
 */
function readLiveVectorIds(stateDir) {
  const dbPath = path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(dbPath)) return null;

  let db = null;
  try {
    // Opening is inside the try so that an unreadable / corrupt file takes
    // the same `null` fallback as a failed query instead of throwing.
    db = new Database(dbPath, { readonly: true });
    const columns = db.prepare('PRAGMA table_info(vectors)').all().map((c) => c.name);
    if (!columns.includes('epoch_retired')) return null;
    const rows = db.prepare('SELECT id FROM vectors WHERE epoch_retired IS NULL').all();
    return new Set(rows.map((r) => r.id));
  } catch {
    return null;
  } finally {
    db?.close();
  }
}
152
+
153
/**
 * Rebuild the binary HNSW index without its stale vectors.
 *
 * Liveness comes from `readLiveVectorIds(stateDir)` when that returns a set;
 * otherwise the index's own stale bitmap decides. Surviving vectors are added
 * to a fresh `BinaryHNSWIndex` created with the loaded index's parameters,
 * which is then saved over the same `indexPath` with `_cleanBuild` set.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {Promise<object>} `{ skipped: 'no-index' }` when the meta file is
 *   absent, `{ skipped: 'no-stale-vectors', dropped: 0 }` when nothing is
 *   stale, otherwise `{ tier, kept, dropped, staleBitmapCleared, atomicPublish }`.
 */
export async function binaryHnswHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const indexPath = path.join(stateDir, 'codebase-binary-hnsw.idx');
  const metaPath = path.join(stateDir, 'codebase-binary-hnsw.meta.json');
  if (!fs.existsSync(metaPath)) {
    return { skipped: 'no-index' };
  }

  const current = new BinaryHNSWIndex({ indexPath });
  await current.load(indexPath);
  report('maintenance:binary-hnsw:loaded');

  // codebase.db is the liveness authority; the stale bitmap is consulted only
  // when the database could not provide the live id set.
  const liveIds = readLiveVectorIds(stateDir);
  const staleBitmap = current._loadStaleBitmap();
  const isStaleAt = (id, slot) => {
    if (liveIds) return !liveIds.has(id);
    return staleBitmap && isSet(staleBitmap, slot);
  };

  const survivors = [];
  for (let slot = 0; slot < current.vectors.length; slot += 1) {
    const { id, binary, metadata } = current.vectors[slot];
    if (isStaleAt(id, slot)) continue;
    survivors.push({
      id,
      binary,
      metadata,
      int8: current.int8Vectors.get(id) || null,
    });
    if (slot > 0 && slot % 1000 === 0) report('maintenance:binary-hnsw:scan');
  }

  const dropped = current.vectors.length - survivors.length;
  if (dropped === 0) {
    return { skipped: 'no-stale-vectors', dropped: 0 };
  }

  // Rebuild in memory, then publish through `BinaryHNSWIndex.save()`.
  const { floatDimension, M, efConstruction, efSearch, maxElements } = current;
  const rebuilt = new BinaryHNSWIndex({
    indexPath,
    floatDimension,
    M,
    efConstruction,
    efSearch,
    maxElements,
  });
  rebuilt.resetForBuild();
  for (const [position, entry] of survivors.entries()) {
    await rebuilt.add(entry.id, entry.binary, entry.metadata, entry.int8);
    if ((position + 1) % 500 === 0) report('maintenance:binary-hnsw:add');
  }
  rebuilt._cleanBuild = true;
  await rebuilt.save(indexPath);
  report('maintenance:binary-hnsw:saved');

  return {
    tier: 'binary_hnsw',
    kept: survivors.length,
    dropped,
    staleBitmapCleared: true,
    atomicPublish: true,
  };
}
216
+
217
+ /* ------------------------------------------------------------------ *
218
+ * float_hnsw *
219
+ * ------------------------------------------------------------------ */
220
+
221
/**
 * Float HNSW clean replacement.
 *
 * The existing index is loaded only to learn its parameters, stale-bitmap
 * path and current id count. The vectors themselves are re-read from
 * `codebase.db` (`vectors WHERE epoch_retired IS NULL`), added to a fresh
 * `HNSWIndex`, and saved over the same `indexPath`; the stale bitmap file is
 * then removed.
 *
 * The rebuild is skipped when no stale bitmap file exists and the number of
 * ids in the loaded index equals the number of live rows.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {Promise<object>} `{ skipped }` (`'no-index'`, `'no-vector-db'`,
 *   `'load-failed'` or `'no-stale-vectors'`), otherwise
 *   `{ tier, kept, dropped, staleBitmapCleared, atomicPublish }`.
 */
export async function floatHnswHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const indexPath = path.join(stateDir, 'codebase-hnsw.idx');
  const dbPath = path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(path.join(stateDir, 'codebase-hnsw.meta.json'))) {
    return { skipped: 'no-index' };
  }
  if (!fs.existsSync(dbPath)) {
    return { skipped: 'no-vector-db' };
  }

  // Load the current index to discover its dimension and build parameters.
  const current = new HNSWIndex({ indexPath });
  try {
    await current.load(indexPath);
  } catch {
    return { skipped: 'load-failed' };
  }
  report('maintenance:float-hnsw:loaded');
  const { dimension, stalePath } = current;

  const hadStaleBitmap = fs.existsSync(stalePath);
  const indexedIds = new Set(current.idMap.keys());

  // Live rows come straight from codebase.db.
  const db = new Database(dbPath, { readonly: true });
  let rows;
  try {
    const query = db.prepare(
      'SELECT id, embedding, metadata FROM vectors WHERE epoch_retired IS NULL'
    );
    rows = query.all();
  } finally {
    db.close();
  }

  // Counts line up and there is no stale bitmap: nothing to reclaim.
  if (!hadStaleBitmap && indexedIds.size === rows.length) {
    return { skipped: 'no-stale-vectors', dropped: 0 };
  }

  // Unparseable metadata degrades to an empty object.
  const parseMetadata = (raw) => {
    try {
      return JSON.parse(raw || '{}');
    } catch {
      return {};
    }
  };

  // Rebuild in memory, then publish through `HNSWIndex.save()`.
  const rebuilt = new HNSWIndex({
    indexPath,
    stalePath,
    dimension,
    maxElements: current.maxElements,
    M: current.M,
    efConstruction: current.efConstruction,
    efSearch: current.efSearch,
    metric: current.metric,
  });
  await rebuilt.init();
  for (const [position, row] of rows.entries()) {
    const decoded = float32FromBuffer(row.embedding);
    const metadata = parseMetadata(row.metadata);
    // Stored embeddings longer than the index dimension are cut down to it.
    const vector = decoded.length > dimension ? decoded.slice(0, dimension) : decoded;
    await rebuilt.add(row.id, vector, metadata);
    if (position > 0 && position % 500 === 0) report('maintenance:float-hnsw:add');
  }
  await rebuilt.save(indexPath);
  report('maintenance:float-hnsw:saved');
  // The old stale bitmap no longer applies to the rebuilt index.
  safeUnlink(stalePath);

  return {
    tier: 'float_hnsw',
    kept: rows.length,
    dropped: Math.max(0, indexedIds.size - rows.length),
    staleBitmapCleared: true,
    atomicPublish: true,
  };
}
302
+
303
+ /* ------------------------------------------------------------------ *
304
+ * li_segment *
305
+ * ------------------------------------------------------------------ */
306
+
307
/**
 * Per-segment recompaction of a late-interaction (LI) segment.
 *
 * Resolves `job.payload.segmentId` against the segment manifest, and — when
 * the segment's `.stale.bin` sidecar marks at least one tombstone — rewrites
 * the segment with the documents the loaded index still holds for it (in
 * `docIndex` order), resets the sidecar to an empty bitmap sized for the
 * surviving documents, and updates the manifest entry's `count` and the
 * manifest's `totalDocuments`.
 *
 * Both the segment and the manifest are staged to a temp file and renamed
 * into place. If staging or renaming fails, the temp file is removed and the
 * error is rethrown, so a failure before the segment rename leaves the old
 * segment, sidecar and manifest as they were.
 *
 * NOTE(review): the segment rename, the sidecar reset and the manifest update
 * are three separate steps; a crash between them can leave the old sidecar or
 * the old count next to the new segment — confirm readers and retries
 * tolerate that.
 *
 * @param {{ payload?: { segmentId?: string } }} job - Job naming the segment.
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {Promise<object>} `{ skipped, segmentId? }` when there is nothing
 *   to do, otherwise `{ tier, segmentId, kept, dropped, staleBitmapCleared }`.
 * @throws {Error} When `payload.segmentId` is missing or not a string, or
 *   when writing/publishing the compacted artifacts fails.
 */
export async function liSegmentHandler(job, { stateDir, onProgress = null }) {
  const progress = progressFn(onProgress);
  const segmentId = job?.payload?.segmentId;
  if (!segmentId || typeof segmentId !== 'string') {
    throw new Error('li_segment: missing payload.segmentId');
  }
  const stubPath = path.join(stateDir, 'codebase-late-interaction.db');
  if (!fs.existsSync(stubPath)) return { skipped: 'no-li-index' };
  let stub;
  try { stub = JSON.parse(fs.readFileSync(stubPath, 'utf-8')); } catch { return { skipped: 'corrupt-stub' }; }
  if (stub?.format !== 'segmented' || !stub.segmentDir) return { skipped: 'legacy-format' };
  const segmentDir = path.resolve(stateDir, stub.segmentDir);
  const manifestPath = path.join(segmentDir, 'manifest.json');
  if (!fs.existsSync(manifestPath)) return { skipped: 'no-segments-manifest' };
  let manifest;
  try { manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); } catch { return { skipped: 'corrupt-manifest' }; }
  if (!Array.isArray(manifest.segments)) return { skipped: 'corrupt-manifest' };

  // Only segments listed in the manifest are eligible.
  const segmentEntry = manifest.segments.find((s) => s?.path === segmentId);
  if (!segmentEntry) return { skipped: 'unknown-segment' };

  const segmentPath = path.join(segmentDir, segmentId);
  const staleSidecar = `${segmentPath}.stale.bin`;
  if (!fs.existsSync(staleSidecar)) {
    return { skipped: 'no-stale-bitmap', segmentId };
  }
  const bitmap = loadBitmap(staleSidecar);
  if (!bitmap) return { skipped: 'no-stale-bitmap', segmentId };
  const tombstoned = popcount(bitmap);
  if (tombstoned === 0) {
    // An all-zero bitmap carries no information; drop it.
    safeUnlink(staleSidecar);
    return { skipped: 'no-tombstones-after-bitmap-load', segmentId };
  }

  // Open the index. Per the original author's note the segmented loader
  // already drops tombstoned docs, so `index.documents` holds only live
  // entries whose quantized form can be reused verbatim.
  const index = new LateInteractionIndex({
    indexPath: stubPath,
    loadExisting: true,
    modelId: manifest.modelId || null,
  });
  await index.init();
  progress('maintenance:li-segment:loaded');

  // Collect this segment's documents and restore their on-disk order.
  const ordered = [];
  let scannedDocs = 0;
  for (const [docId, doc] of index.documents.entries()) {
    const position = index._docSegmentPositions?.get(docId);
    if (!position || position.segmentPath !== segmentPath) continue;
    ordered.push({ docIndex: position.docIndex, docId, doc });
    scannedDocs += 1;
    if (scannedDocs % 1000 === 0) progress('maintenance:li-segment:scan');
  }
  ordered.sort((a, b) => a.docIndex - b.docIndex);
  const liveDocs = new Map();
  for (const { docId, doc } of ordered) liveDocs.set(docId, doc);
  const droppedDocs = tombstoned;
  if (liveDocs.size === 0) {
    return { skipped: 'no-live-docs', segmentId };
  }

  // Use a writer purely as the segment serializer; `add()` is never called.
  const writer = new LateInteractionIndex({
    indexPath: stubPath,
    loadExisting: false,
    tokenDim: index.tokenDim,
    maxTokens: index.maxTokens,
    useInt8: index.useInt8,
    quantBits: index.quantBits,
    modelId: index.modelId,
    poolFactor: index.poolFactor,
    whtSeed: index.whtSeed,
    whtOrdering: index.whtOrdering,
    matryoshkaDim: index.matryoshkaDim,
  });
  await writer.init();

  const tmpSegPath = `${segmentPath}.compacting.tmp`;
  try {
    await writer._writeSegmentFile(tmpSegPath, liveDocs);
    progress('maintenance:li-segment:written');
    // Replace the segment file via rename.
    fs.renameSync(tmpSegPath, segmentPath);
  } catch (err) {
    // Do not leave a partially written staging file behind.
    safeUnlink(tmpSegPath);
    throw err;
  }

  // Reset the segment's stale bitmap to a fresh, zero-tombstone bitmap sized
  // for the new doc count (at least 1 here: the empty case returned above).
  safeUnlink(staleSidecar);
  saveBitmap(staleSidecar, createBitmap(liveDocs.size));

  // Publish the updated counts via temp file + rename.
  segmentEntry.count = liveDocs.size;
  manifest.totalDocuments = manifest.segments.reduce((sum, s) => sum + (s?.count || 0), 0);
  const tmpManifest = `${manifestPath}.tmp`;
  try {
    fs.writeFileSync(tmpManifest, JSON.stringify(manifest, null, 2));
    fs.renameSync(tmpManifest, manifestPath);
  } catch (err) {
    safeUnlink(tmpManifest);
    throw err;
  }

  return {
    tier: 'li_segment',
    segmentId,
    kept: liveDocs.size,
    dropped: droppedDocs,
    staleBitmapCleared: true,
  };
}
424
+
425
+ /* ------------------------------------------------------------------ *
426
+ * li_segments (batch merge) *
427
+ * ------------------------------------------------------------------ */
428
+
429
/**
 * Batch-merge LI segments by delegating to `mergeLiSegments`.
 *
 * The grace window comes from `SWEET_SEARCH_LI_MERGE_GRACE_MS` when it parses
 * to a non-negative integer, otherwise from `LI_MERGE_GRACE_MS`. A job whose
 * `reason` is `'pending_delete'` is run in sweep-only mode.
 *
 * @param {{ reason?: string }} job - Maintenance job descriptor.
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {Promise<*>} Whatever `mergeLiSegments` resolves to.
 */
export async function liSegmentsHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);

  const envGrace = Number.parseInt(process.env.SWEET_SEARCH_LI_MERGE_GRACE_MS || '', 10);
  let graceMs = LI_MERGE_GRACE_MS;
  if (Number.isFinite(envGrace) && envGrace >= 0) {
    graceMs = envGrace;
  }

  // A `pending_delete` re-fire only asks for the sweep, not a full merge.
  const sweepOnly = job?.reason === 'pending_delete';

  const outcome = await mergeLiSegments(stateDir, { graceMs, sweepOnly });
  report('maintenance:li-segments:merged');
  return outcome;
}
447
+
448
+ /* ------------------------------------------------------------------ *
449
+ * vector_gc (retired-row physical prune) *
450
+ * ------------------------------------------------------------------ */
451
+
452
/**
 * Run the vector garbage-collection pass by delegating to `runVectorGc`.
 *
 * `SWEET_SEARCH_VECTOR_GC_BATCH` and `SWEET_SEARCH_VECTOR_GC_MAX_ROWS` are
 * forwarded as `batchSize` / `maxRows` when they parse to a positive integer;
 * otherwise those options are passed as `undefined`.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {*} Whatever `runVectorGc` returns.
 */
export function vectorGcHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);

  // Positive integer from an env string, or undefined when unset/invalid.
  const positiveOrUndefined = (raw) => {
    const parsed = Number.parseInt(raw || '', 10);
    if (Number.isFinite(parsed) && parsed > 0) return parsed;
    return undefined;
  };
  const batchSize = positiveOrUndefined(process.env.SWEET_SEARCH_VECTOR_GC_BATCH);
  const maxRows = positiveOrUndefined(process.env.SWEET_SEARCH_VECTOR_GC_MAX_ROWS);

  const outcome = runVectorGc(stateDir, { minLiveEpoch, readManifest, batchSize, maxRows });
  report('maintenance:vector-gc:done');
  return outcome;
}
472
+
473
+ /* ------------------------------------------------------------------ *
474
+ * graph_gc (retired graph-row physical prune) *
475
+ * ------------------------------------------------------------------ */
476
+
477
/**
 * Run the graph garbage-collection pass by delegating to `runGraphGc`.
 *
 * `SWEET_SEARCH_GRAPH_GC_BATCH` and `SWEET_SEARCH_GRAPH_GC_MAX_ROWS` are
 * forwarded as `batchSize` / `maxRows` when they parse to a positive integer;
 * otherwise those options are passed as `undefined`.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: ((phase: string) => void) | null }} ctx
 *   State directory plus an optional heartbeat callback.
 * @returns {*} Whatever `runGraphGc` returns.
 */
export function graphGcHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);

  const requestedBatch = Number.parseInt(process.env.SWEET_SEARCH_GRAPH_GC_BATCH || '', 10);
  const requestedMax = Number.parseInt(process.env.SWEET_SEARCH_GRAPH_GC_MAX_ROWS || '', 10);

  // Unset, non-numeric, zero and negative values all leave the option undefined.
  let batchSize;
  if (Number.isFinite(requestedBatch) && requestedBatch > 0) {
    batchSize = requestedBatch;
  }
  let maxRows;
  if (Number.isFinite(requestedMax) && requestedMax > 0) {
    maxRows = requestedMax;
  }

  const outcome = runGraphGc(stateDir, { minLiveEpoch, readManifest, batchSize, maxRows });
  report('maintenance:graph-gc:done');
  return outcome;
}
498
+
499
+ /* ------------------------------------------------------------------ *
500
+ * Registry *
501
+ * ------------------------------------------------------------------ */
502
+
503
/**
 * Build the reclamation handler table for one state directory.
 *
 * Returns seven entries keyed by tier name (`sparse_gram`, `binary_hnsw`,
 * `float_hnsw`, `li_segment`, `li_segments`, `vector_gc`, `graph_gc`). Each
 * entry takes `(job, ctx)` and calls the matching handler with `stateDir`
 * bound and `ctx.onProgress` forwarded; `ctx` defaults to `{}`.
 *
 * @param {string} stateDir - State directory every handler operates on.
 * @returns {Record<string, (job: object, ctx?: { onProgress?: Function }) => *>}
 *   Map from tier name to bound handler.
 */
export function reclamationHandlers(stateDir) {
  const handlers = {
    sparse_gram: (job, { onProgress } = {}) => sparseGramHandler(job, { stateDir, onProgress }),
    binary_hnsw: (job, { onProgress } = {}) => binaryHnswHandler(job, { stateDir, onProgress }),
    float_hnsw: (job, { onProgress } = {}) => floatHnswHandler(job, { stateDir, onProgress }),
    li_segment: (job, { onProgress } = {}) => liSegmentHandler(job, { stateDir, onProgress }),
    li_segments: (job, { onProgress } = {}) => liSegmentsHandler(job, { stateDir, onProgress }),
    vector_gc: (job, { onProgress } = {}) => vectorGcHandler(job, { stateDir, onProgress }),
    graph_gc: (job, { onProgress } = {}) => graphGcHandler(job, { stateDir, onProgress }),
  };
  return handlers;
}