sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Maintenance handlers — real implementations for the four reclamation
|
|
3
|
+
* tiers the soak REPORT.md flagged as queue-only.
|
|
4
|
+
*
|
|
5
|
+
* Each handler is registered by `defaultMaintenanceHandlers()` in
|
|
6
|
+
* `maintenance-worker.mjs`. Handlers run inside the reconcile daemon's
|
|
7
|
+
* single-writer process, so on-disk artifacts have one writer at a time.
|
|
8
|
+
*
|
|
9
|
+
* Atomicity contract: per artifact-family, each handler writes its new
|
|
10
|
+
* artifacts via a path that sorts later than the existing ones (sparse
|
|
11
|
+
* gram), an explicit temp+rename (LI segments, HNSW meta/usearch/vectors),
|
|
12
|
+
* or via the existing `*.next` clean-build flag (Binary HNSW). After a
|
|
13
|
+
* successful publish the handler clears the tier's stale bitmap; on
|
|
14
|
+
* failure the previous artifacts remain readable.
|
|
15
|
+
*
|
|
16
|
+
* Manifest semantics:
|
|
17
|
+
* - sparse_gram, LI segment: the reconcile manifest epoch is unchanged (sparse_gram only rewrites its `sparseGram.deltas` list). New
|
|
18
|
+
* artifacts replace old ones at canonical paths read fresh per query.
|
|
19
|
+
* - HNSW (float / binary): canonical paths unchanged; the reconcile
|
|
20
|
+
* manifest stays at the current epoch. Cross-process readers that
|
|
21
|
+
* cache an HNSWIndex instance in memory MUST already invalidate on
|
|
22
|
+
* manifest change — but maintenance does not bump the epoch by
|
|
23
|
+
* itself. This matches the existing reconcile tick semantics; a
|
|
24
|
+
* follow-up workstream can add versioned tier paths if needed.
|
|
25
|
+
*
|
|
26
|
+
* The handlers degrade safely when artifacts are missing/corrupt — they
|
|
27
|
+
* throw a descriptive error which the worker converts into the standard
|
|
28
|
+
* retry/dead-letter path.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import fs from 'node:fs';
|
|
32
|
+
import path from 'node:path';
|
|
33
|
+
import Database from 'better-sqlite3';
|
|
34
|
+
|
|
35
|
+
import { BinaryHNSWIndex } from '../../vector-store/binary-hnsw-index.js';
|
|
36
|
+
import { HNSWIndex } from '../../vector-store/hnsw-index.js';
|
|
37
|
+
import { LateInteractionIndex } from '../../ranking/late-interaction-index.js';
|
|
38
|
+
import { compactDeltaSegments, listDeltaSegments } from '../infrastructure/sparse-gram-delta.mjs';
|
|
39
|
+
import { mergeLiSegments, LI_MERGE_GRACE_MS } from '../infrastructure/li-segment-merge.mjs';
|
|
40
|
+
import { runVectorGc } from '../infrastructure/vector-gc.mjs';
|
|
41
|
+
import { runGraphGc } from '../infrastructure/graph-gc.mjs';
|
|
42
|
+
import { minLiveEpoch } from '../infrastructure/reader-heartbeat.mjs';
|
|
43
|
+
import { readManifest, writeManifest } from '../infrastructure/manifest.mjs';
|
|
44
|
+
import {
|
|
45
|
+
loadBitmap, popcount, isSet, createBitmap, saveBitmap,
|
|
46
|
+
} from '../infrastructure/tombstone-bitmap.mjs';
|
|
47
|
+
|
|
48
|
+
/**
 * Best-effort file removal: try to delete `p` and ignore any failure
 * (for example when the file is already gone).
 *
 * @param {string} p - Path of the file to remove.
 * @returns {void}
 */
function safeUnlink(p) {
  try {
    fs.unlinkSync(p);
  } catch {
    // Deliberately ignored — removal is best-effort.
  }
}
|
|
49
|
+
/**
 * Normalise an optional progress callback into something that is always
 * safe to call.
 *
 * @param {unknown} onProgress - Candidate callback; anything that is not a
 *   function is treated as "no callback".
 * @returns {(phase: string) => void} A reporter that forwards only the
 *   `phase` argument to `onProgress` and discards its return value, or a
 *   no-op when no callback was supplied.
 */
function progressFn(onProgress) {
  if (typeof onProgress !== 'function') {
    return () => {};
  }
  return (phase) => {
    onProgress(phase);
  };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Decode a byte view (e.g. a Node `Buffer`) into a `Float32Array`.
 *
 * The bytes covered by the view are copied into a fresh `ArrayBuffer`
 * first, so the result does not alias the input and the input's
 * `byteOffset` does not have to be 4-byte aligned.
 *
 * @param {{ buffer: ArrayBuffer, byteOffset: number, byteLength: number }} buffer
 *   View whose bytes hold the float32 values.
 * @returns {Float32Array} Floats decoded from a private copy of the bytes.
 * @throws {RangeError} Raised by the `Float32Array` constructor when the
 *   byte length is not a multiple of 4.
 */
function float32FromBuffer(buffer) {
  const { buffer: backing, byteOffset, byteLength } = buffer;
  const copy = backing.slice(byteOffset, byteOffset + byteLength);
  return new Float32Array(copy);
}
|
|
59
|
+
|
|
60
|
+
/* ------------------------------------------------------------------ *
|
|
61
|
+
* sparse_gram *
|
|
62
|
+
* ------------------------------------------------------------------ */
|
|
63
|
+
|
|
64
|
+
/**
 * Maintenance handler for the `sparse_gram` tier.
 *
 * Steps, in order:
 *   1. Compact the delta segments next to `codebase-sparse-grams.idx` with
 *      `deferDelete: true`, so the consumed segments are left on disk.
 *   2. If the reconcile manifest has a `sparseGram` section, rewrite its
 *      `deltas` list to the segments still listed on disk minus the consumed
 *      ones, and write the manifest back.
 *   3. Unlink the consumed segments only when the manifest write succeeded
 *      or the manifest had no `sparseGram` section at all. If reading or
 *      writing the manifest failed, nothing is unlinked and the error
 *      message is reported in the result instead of being thrown.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: Function | null }} context
 *   State directory holding the artifacts, plus an optional progress callback.
 * @returns {Promise<object>} `{ skipped }` when the compaction reports a skip
 *   reason; otherwise a summary with `tier`, `consumedSegments` (files
 *   actually unlinked), `recordsWritten`, `compactedPath` (relative to
 *   `stateDir`, forward slashes), `manifestUpdated` and, on a manifest
 *   failure, `manifestError`.
 */
export async function sparseGramHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const basePath = path.join(stateDir, 'codebase-sparse-grams.idx');
  // Manifest entries are stored relative to stateDir with forward slashes.
  const toManifestPath = (absPath) => path.relative(stateDir, absPath).replace(/\\/g, '/');

  // Deferred-delete compaction: consumed segments must outlive the manifest
  // rewrite so a reader holding the old `sparseGram.deltas` list can still
  // open every path it references.
  const compaction = compactDeltaSegments(basePath, { dropTombstones: false, deferDelete: true });
  report('maintenance:sparse-gram:compacted');
  if (compaction.skipped) {
    return { skipped: compaction.skipped };
  }

  const consumed = new Set(compaction.consumedSegmentPaths);
  let manifestUpdated = false;
  let manifestError = null;
  let hadSparseGramPin = false;
  try {
    const manifest = readManifest(stateDir);
    if (manifest?.sparseGram) {
      hadSparseGramPin = true;
      // Re-list the directory instead of assuming only the compacted
      // segment remains: anything that appeared since compaction is kept.
      const survivors = [];
      for (const seg of listDeltaSegments(basePath)) {
        if (!consumed.has(seg.path)) {
          survivors.push(toManifestPath(seg.path));
        }
      }
      manifest.sparseGram.deltas = survivors;
      writeManifest(stateDir, manifest);
      manifestUpdated = true;
    }
  } catch (err) {
    manifestError = err?.message || String(err);
  }

  // Publish gate: old segments go away only once the new manifest is live,
  // or when the manifest never pinned sparse-gram paths in the first place.
  let unlinked = 0;
  if (manifestUpdated || !hadSparseGramPin) {
    for (const segPath of compaction.consumedSegmentPaths) {
      try {
        fs.unlinkSync(segPath);
        unlinked += 1;
      } catch {
        /* tolerate concurrent deletion */
      }
      if (unlinked % 100 === 0) {
        report('maintenance:sparse-gram:unlink');
      }
    }
  }

  const summary = {
    tier: 'sparse_gram',
    consumedSegments: unlinked,
    recordsWritten: compaction.recordsWritten,
    compactedPath: toManifestPath(compaction.compactedPath),
    manifestUpdated,
  };
  if (manifestError) {
    summary.manifestError = manifestError;
  }
  return summary;
}
|
|
126
|
+
|
|
127
|
+
/* ------------------------------------------------------------------ *
|
|
128
|
+
* binary_hnsw *
|
|
129
|
+
* ------------------------------------------------------------------ */
|
|
130
|
+
|
|
131
|
+
/**
 * Read the set of live vector ids from `codebase.db`
 * (rows of `vectors` where `epoch_retired IS NULL`).
 *
 * Returns `null` whenever that answer cannot be obtained, so the caller can
 * fall back to another liveness source: the DB file is missing, it cannot
 * be opened, the `vectors` table has no `epoch_retired` column, or a query
 * fails. Opening the database happens inside the `try` so that an
 * unreadable/corrupt file takes the same `null` path as a failed query
 * instead of throwing out of the caller.
 *
 * @param {string} stateDir - Directory that contains `codebase.db`.
 * @returns {Set<*> | null} Ids of the non-retired rows, or `null` when the
 *   database or column is unavailable.
 */
function readLiveVectorIds(stateDir) {
  const dbPath = path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(dbPath)) return null;

  let db = null;
  try {
    db = new Database(dbPath, { readonly: true });
    const hasRetiredColumn = db
      .prepare('PRAGMA table_info(vectors)')
      .all()
      .some((column) => column.name === 'epoch_retired');
    if (!hasRetiredColumn) return null;

    const rows = db.prepare('SELECT id FROM vectors WHERE epoch_retired IS NULL').all();
    return new Set(rows.map((row) => row.id));
  } catch {
    // Any open/query failure means "liveness unknown", not "no live vectors".
    return null;
  } finally {
    // `db` stays null when the open itself failed; a failing close on a
    // read-only handle must not replace the value being returned.
    try { db?.close(); } catch { /* nothing left to release */ }
  }
}
|
|
152
|
+
|
|
153
|
+
/**
 * Maintenance handler for the `binary_hnsw` tier: rebuild the binary HNSW
 * index without its dead vectors.
 *
 * Liveness comes from `readLiveVectorIds(stateDir)` when it yields a set —
 * a vector is dropped when its id is not in that set. Only when that lookup
 * returns `null` does the index's own stale bitmap decide, by slot position.
 *
 * When nothing is dropped the handler returns early without writing. Else
 * the surviving vectors are re-added to a new `BinaryHNSWIndex` built with
 * the loaded index's parameters and saved to the same `indexPath` with
 * `_cleanBuild` set. How `save()` publishes the files is defined by
 * `BinaryHNSWIndex` and is not visible from this handler.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: Function | null }} context
 *   State directory holding the index, plus an optional progress callback.
 * @returns {Promise<object>} `{ skipped: 'no-index' }` when the meta file is
 *   absent, `{ skipped: 'no-stale-vectors', dropped: 0 }` when every vector
 *   is live, otherwise `{ tier, kept, dropped, staleBitmapCleared,
 *   atomicPublish }` (the two flags are reported as constant `true`).
 */
export async function binaryHnswHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const indexPath = path.join(stateDir, 'codebase-binary-hnsw.idx');
  if (!fs.existsSync(path.join(stateDir, 'codebase-binary-hnsw.meta.json'))) {
    return { skipped: 'no-index' };
  }

  const current = new BinaryHNSWIndex({ indexPath });
  await current.load(indexPath);
  report('maintenance:binary-hnsw:loaded');

  // codebase.db is the liveness authority; the stale bitmap is consulted
  // only when the DB lookup is unavailable (null).
  const liveIds = readLiveVectorIds(stateDir);
  const staleBitmap = current._loadStaleBitmap();

  const survivors = [];
  for (let slot = 0; slot < current.vectors.length; slot += 1) {
    const entry = current.vectors[slot];
    const stale = liveIds
      ? !liveIds.has(entry.id)
      : (staleBitmap && isSet(staleBitmap, slot));
    if (!stale) {
      survivors.push({
        id: entry.id,
        binary: entry.binary,
        metadata: entry.metadata,
        int8: current.int8Vectors.get(entry.id) || null,
      });
      // Heartbeat is emitted only on live slots that land on the boundary.
      if (slot > 0 && slot % 1000 === 0) {
        report('maintenance:binary-hnsw:scan');
      }
    }
  }

  const dropped = current.vectors.length - survivors.length;
  if (dropped === 0) {
    return { skipped: 'no-stale-vectors', dropped: 0 };
  }

  // Fresh index with the same build parameters as the one just loaded.
  const rebuilt = new BinaryHNSWIndex({
    indexPath,
    floatDimension: current.floatDimension,
    M: current.M,
    efConstruction: current.efConstruction,
    efSearch: current.efSearch,
    maxElements: current.maxElements,
  });
  rebuilt.resetForBuild();

  let added = 0;
  for (const { id, binary, metadata, int8 } of survivors) {
    await rebuilt.add(id, binary, metadata, int8);
    added += 1;
    if (added % 500 === 0) {
      report('maintenance:binary-hnsw:add');
    }
  }

  rebuilt._cleanBuild = true;
  await rebuilt.save(indexPath);
  report('maintenance:binary-hnsw:saved');

  return {
    tier: 'binary_hnsw',
    kept: survivors.length,
    dropped,
    staleBitmapCleared: true,
    atomicPublish: true,
  };
}
|
|
216
|
+
|
|
217
|
+
/* ------------------------------------------------------------------ *
|
|
218
|
+
* float_hnsw *
|
|
219
|
+
* ------------------------------------------------------------------ */
|
|
220
|
+
|
|
221
|
+
/**
 * Maintenance handler for the `float_hnsw` tier: rebuild the float HNSW
 * index from the live rows of `codebase.db`.
 *
 * The existing index is loaded only to learn its parameters (dimension,
 * stale-bitmap path, build settings) and how many ids it currently maps.
 * The vectors themselves are re-read from the database
 * (`vectors WHERE epoch_retired IS NULL`), since the embeddings are taken
 * from the `embedding` blobs stored there. Embeddings longer than the index
 * dimension are truncated to it; unparsable `metadata` becomes `{}`.
 *
 * The rebuild is skipped when no stale-bitmap file exists and the number
 * of indexed ids equals the number of live rows. After a rebuild the
 * stale-bitmap file is removed.
 *
 * NOTE(review): the query assumes the `vectors` table exposes `id`,
 * `embedding`, `metadata` and `epoch_retired`; a missing column makes the
 * query throw rather than skip.
 *
 * @param {object} job - Maintenance job descriptor (not read by this handler).
 * @param {{ stateDir: string, onProgress?: Function | null }} context
 *   State directory holding the index and DB, plus an optional progress
 *   callback.
 * @returns {Promise<object>} A `{ skipped }` marker (`'no-index'`,
 *   `'no-vector-db'`, `'load-failed'`, or `'no-stale-vectors'` with
 *   `dropped: 0`), otherwise `{ tier, kept, dropped, staleBitmapCleared,
 *   atomicPublish }`.
 */
export async function floatHnswHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const indexPath = path.join(stateDir, 'codebase-hnsw.idx');
  const dbPath = path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(path.join(stateDir, 'codebase-hnsw.meta.json'))) {
    return { skipped: 'no-index' };
  }
  if (!fs.existsSync(dbPath)) {
    return { skipped: 'no-vector-db' };
  }

  // The old index is only a source of parameters and of the "before" count.
  const current = new HNSWIndex({ indexPath });
  try {
    await current.load(indexPath);
  } catch {
    return { skipped: 'load-failed' };
  }
  report('maintenance:float-hnsw:loaded');

  const { dimension, stalePath } = current;
  const stalePresent = fs.existsSync(stalePath);
  const indexedBefore = new Set(current.idMap.keys()).size;

  // Pull every live row in one read-only pass, then release the handle.
  const db = new Database(dbPath, { readonly: true });
  let liveRows;
  try {
    const liveQuery = db.prepare(
      'SELECT id, embedding, metadata FROM vectors WHERE epoch_retired IS NULL'
    );
    liveRows = liveQuery.all();
  } finally {
    db.close();
  }

  // Counts agree and there is no stale bitmap → the index is already clean.
  if (!stalePresent && indexedBefore === liveRows.length) {
    return { skipped: 'no-stale-vectors', dropped: 0 };
  }

  const parseMetadata = (raw) => {
    try {
      return JSON.parse(raw || '{}');
    } catch {
      return {};
    }
  };

  const rebuilt = new HNSWIndex({
    indexPath,
    stalePath,
    dimension,
    maxElements: current.maxElements,
    M: current.M,
    efConstruction: current.efConstruction,
    efSearch: current.efSearch,
    metric: current.metric,
  });
  await rebuilt.init();

  for (const [position, row] of liveRows.entries()) {
    const embedding = float32FromBuffer(row.embedding);
    const meta = parseMetadata(row.metadata);
    const vector = embedding.length > dimension ? embedding.slice(0, dimension) : embedding;
    await rebuilt.add(row.id, vector, meta);
    if (position > 0 && position % 500 === 0) {
      report('maintenance:float-hnsw:add');
    }
  }

  await rebuilt.save(indexPath);
  report('maintenance:float-hnsw:saved');
  // The rebuilt index holds live rows only, so the old bitmap no longer applies.
  safeUnlink(stalePath);

  return {
    tier: 'float_hnsw',
    kept: liveRows.length,
    dropped: Math.max(0, indexedBefore - liveRows.length),
    staleBitmapCleared: true,
    atomicPublish: true,
  };
}
|
|
302
|
+
|
|
303
|
+
/* ------------------------------------------------------------------ *
|
|
304
|
+
* li_segment *
|
|
305
|
+
* ------------------------------------------------------------------ */
|
|
306
|
+
|
|
307
|
+
/**
 * Maintenance handler for the `li_segment` tier: recompact one
 * late-interaction segment, dropping its tombstoned documents.
 *
 * The segment is named by `job.payload.segmentId`, which must match the
 * `path` of an entry in the segment manifest. The handler bails out with a
 * `{ skipped }` marker when the stub file, the segmented layout, the
 * manifest, the segment entry or the segment's `.stale.bin` sidecar is
 * missing/unusable, when the bitmap has no bits set (the sidecar is then
 * removed), or when no live document maps to the segment.
 *
 * Publish order, as implemented:
 *   1. write the surviving docs to `<segment>.compacting.tmp`;
 *   2. rename that file over the segment;
 *   3. remove the stale sidecar and save a fresh, empty bitmap sized to the
 *      surviving doc count;
 *   4. update the entry's `count` and `totalDocuments`, writing the
 *      manifest through `manifest.json.tmp` + rename.
 * A failure before step 2 leaves the original segment file in place.
 *
 * NOTE(review): the live docs are taken from a freshly loaded
 * `LateInteractionIndex`; this assumes its loader already excludes
 * tombstoned docs — confirm against `_loadSegmented`.
 *
 * @param {{ payload?: { segmentId?: string } }} job - Job carrying the
 *   segment id to compact.
 * @param {{ stateDir: string, onProgress?: Function | null }} context
 *   State directory holding the LI stub, plus an optional progress callback.
 * @returns {Promise<object>} A `{ skipped, ... }` marker, or
 *   `{ tier, segmentId, kept, dropped, staleBitmapCleared }` where `dropped`
 *   is the number of bits set in the stale bitmap.
 * @throws {Error} When `job.payload.segmentId` is missing or not a string.
 */
export async function liSegmentHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);
  const segmentId = job?.payload?.segmentId;
  if (!segmentId || typeof segmentId !== 'string') {
    throw new Error('li_segment: missing payload.segmentId');
  }

  // Wraps the parsed value so a legitimately falsy JSON document is still
  // distinguishable from a read/parse failure (null).
  const tryReadJson = (file) => {
    try {
      return { value: JSON.parse(fs.readFileSync(file, 'utf-8')) };
    } catch {
      return null;
    }
  };

  const stubPath = path.join(stateDir, 'codebase-late-interaction.db');
  if (!fs.existsSync(stubPath)) {
    return { skipped: 'no-li-index' };
  }
  const stubRead = tryReadJson(stubPath);
  if (!stubRead) {
    return { skipped: 'corrupt-stub' };
  }
  const stub = stubRead.value;
  if (stub?.format !== 'segmented' || !stub.segmentDir) {
    return { skipped: 'legacy-format' };
  }

  const segmentDir = path.resolve(stateDir, stub.segmentDir);
  const manifestPath = path.join(segmentDir, 'manifest.json');
  if (!fs.existsSync(manifestPath)) {
    return { skipped: 'no-segments-manifest' };
  }
  const manifestRead = tryReadJson(manifestPath);
  if (!manifestRead) {
    return { skipped: 'corrupt-manifest' };
  }
  const manifest = manifestRead.value;
  if (!Array.isArray(manifest.segments)) {
    return { skipped: 'corrupt-manifest' };
  }

  const segmentEntry = manifest.segments.find((entry) => entry?.path === segmentId);
  if (!segmentEntry) {
    return { skipped: 'unknown-segment' };
  }

  const segmentPath = path.join(segmentDir, segmentId);
  const staleSidecar = `${segmentPath}.stale.bin`;
  if (!fs.existsSync(staleSidecar)) {
    return { skipped: 'no-stale-bitmap', segmentId };
  }
  const bitmap = loadBitmap(staleSidecar);
  if (!bitmap) {
    return { skipped: 'no-stale-bitmap', segmentId };
  }
  const tombstoned = popcount(bitmap);
  if (tombstoned === 0) {
    // An all-zero bitmap carries no information; drop it.
    safeUnlink(staleSidecar);
    return { skipped: 'no-tombstones-after-bitmap-load', segmentId };
  }

  // Load the whole index and keep only the docs that live in this segment.
  const index = new LateInteractionIndex({
    indexPath: stubPath,
    loadExisting: true,
    modelId: manifest.modelId || null,
  });
  await index.init();
  report('maintenance:li-segment:loaded');

  const located = [];
  let scannedDocs = 0;
  for (const [docId, doc] of index.documents.entries()) {
    const position = index._docSegmentPositions?.get(docId);
    if (position && position.segmentPath === segmentPath) {
      located.push({ docIndex: position.docIndex, docId, doc });
      scannedDocs += 1;
      if (scannedDocs % 1000 === 0) {
        report('maintenance:li-segment:scan');
      }
    }
  }
  // Restore the docs' original in-segment order before rewriting.
  located.sort((left, right) => left.docIndex - right.docIndex);
  const liveDocs = new Map(located.map(({ docId, doc }) => [docId, doc]));
  if (liveDocs.size === 0) {
    return { skipped: 'no-live-docs', segmentId };
  }

  // Second instance is used purely to serialise the segment; nothing is
  // added to it.
  const writer = new LateInteractionIndex({
    indexPath: stubPath,
    loadExisting: false,
    tokenDim: index.tokenDim,
    maxTokens: index.maxTokens,
    useInt8: index.useInt8,
    quantBits: index.quantBits,
    modelId: index.modelId,
    poolFactor: index.poolFactor,
    whtSeed: index.whtSeed,
    whtOrdering: index.whtOrdering,
    matryoshkaDim: index.matryoshkaDim,
  });
  await writer.init();

  const tmpSegPath = `${segmentPath}.compacting.tmp`;
  await writer._writeSegmentFile(tmpSegPath, liveDocs);
  report('maintenance:li-segment:written');
  fs.renameSync(tmpSegPath, segmentPath);

  // Replace the old bitmap with an empty one matching the new doc count.
  // (The size guard cannot be false after the early return above.)
  safeUnlink(staleSidecar);
  if (liveDocs.size > 0) {
    saveBitmap(staleSidecar, createBitmap(Math.max(1, liveDocs.size)));
  }

  // Publish the new counts through a temp file + rename.
  segmentEntry.count = liveDocs.size;
  let totalDocuments = 0;
  for (const entry of manifest.segments) {
    totalDocuments += entry?.count || 0;
  }
  manifest.totalDocuments = totalDocuments;
  const tmpManifest = `${manifestPath}.tmp`;
  fs.writeFileSync(tmpManifest, JSON.stringify(manifest, null, 2));
  fs.renameSync(tmpManifest, manifestPath);

  return {
    tier: 'li_segment',
    segmentId,
    kept: liveDocs.size,
    dropped: tombstoned,
    staleBitmapCleared: true,
  };
}
|
|
424
|
+
|
|
425
|
+
/* ------------------------------------------------------------------ *
|
|
426
|
+
* li_segments (batch merge) *
|
|
427
|
+
* ------------------------------------------------------------------ */
|
|
428
|
+
|
|
429
|
+
/**
 * Maintenance handler for the `li_segments` tier: delegate a batch merge of
 * late-interaction segments to `mergeLiSegments`.
 *
 * The grace window passed to the merge comes from
 * `SWEET_SEARCH_LI_MERGE_GRACE_MS` when it parses (base 10) to a finite,
 * non-negative integer; otherwise `LI_MERGE_GRACE_MS` is used. A job whose
 * `reason` is `'pending_delete'` requests `sweepOnly` mode.
 *
 * @param {{ reason?: string }} job - Job descriptor; only `reason` is read.
 * @param {{ stateDir: string, onProgress?: Function | null }} context
 *   State directory handed to the merge, plus an optional progress callback.
 * @returns {Promise<*>} Whatever `mergeLiSegments` resolves to.
 */
export async function liSegmentsHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);

  const envGrace = Number.parseInt(process.env.SWEET_SEARCH_LI_MERGE_GRACE_MS || '', 10);
  let graceMs;
  if (Number.isFinite(envGrace) && envGrace >= 0) {
    graceMs = envGrace;
  } else {
    graceMs = LI_MERGE_GRACE_MS;
  }

  // A `pending_delete` re-fire only asks for the sweep, not a full merge.
  const sweepOnly = job?.reason === 'pending_delete';

  const outcome = await mergeLiSegments(stateDir, { graceMs, sweepOnly });
  report('maintenance:li-segments:merged');
  return outcome;
}
|
|
447
|
+
|
|
448
|
+
/* ------------------------------------------------------------------ *
|
|
449
|
+
* vector_gc (retired-row physical prune) *
|
|
450
|
+
* ------------------------------------------------------------------ */
|
|
451
|
+
|
|
452
|
+
/**
 * Maintenance handler for the `vector_gc` job.
 *
 * Asks `runVectorGc` to physically delete retired `codebase.db` vector rows
 * that no live or future reader can observe. Reader-safety and the
 * missing-DB behaviour are documented in `infrastructure/vector-gc.mjs`.
 *
 * Tuning knobs (both must parse to a positive integer, otherwise the option
 * is passed as `undefined`):
 *   - `SWEET_SEARCH_VECTOR_GC_BATCH`    -> `batchSize`
 *   - `SWEET_SEARCH_VECTOR_GC_MAX_ROWS` -> `maxRows`
 *
 * @param {object} job - Maintenance job; not inspected by this handler.
 * @param {object} deps
 * @param {string} deps.stateDir - Forwarded unchanged to `runVectorGc`.
 * @param {Function|null} [deps.onProgress] - Optional callback, wrapped by `progressFn`.
 * @returns {*} Whatever `runVectorGc` returns.
 */
export function vectorGcHandler(job, { stateDir, onProgress = null }) {
  const report = progressFn(onProgress);

  // Shared parse rule for both knobs: base-10 integer, strictly positive.
  const positiveIntOrUndefined = (raw) => {
    const parsed = Number.parseInt(raw || '', 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
  };

  const options = {
    minLiveEpoch,
    readManifest,
    batchSize: positiveIntOrUndefined(process.env.SWEET_SEARCH_VECTOR_GC_BATCH),
    maxRows: positiveIntOrUndefined(process.env.SWEET_SEARCH_VECTOR_GC_MAX_ROWS),
  };

  const outcome = runVectorGc(stateDir, options);
  report('maintenance:vector-gc:done');
  return outcome;
}
|
|
472
|
+
|
|
473
|
+
/* ------------------------------------------------------------------ *
|
|
474
|
+
* graph_gc (retired graph-row physical prune) *
|
|
475
|
+
* ------------------------------------------------------------------ */
|
|
476
|
+
|
|
477
|
+
/**
 * Maintenance handler for the `graph_gc` job.
 *
 * Asks `runGraphGc` to physically delete retired `code-graph.db` rows
 * (entities, relationships, and HCGS summaries) that no live or future
 * reader can observe, while keeping the external-content FTS5 indices
 * consistent. Reader-safety and the missing-DB behaviour are documented in
 * `infrastructure/graph-gc.mjs`.
 *
 * `SWEET_SEARCH_GRAPH_GC_BATCH` and `SWEET_SEARCH_GRAPH_GC_MAX_ROWS` tune the
 * batch size and per-run cap. A value that does not parse to a positive
 * integer leaves the corresponding option `undefined`.
 *
 * @param {object} job - Maintenance job; not inspected by this handler.
 * @param {object} deps
 * @param {string} deps.stateDir - Forwarded unchanged to `runGraphGc`.
 * @param {Function|null} [deps.onProgress] - Optional callback, wrapped by `progressFn`.
 * @returns {*} Whatever `runGraphGc` returns.
 */
export function graphGcHandler(job, { stateDir, onProgress = null }) {
  const notify = progressFn(onProgress);

  const requestedBatch = Number.parseInt(process.env.SWEET_SEARCH_GRAPH_GC_BATCH || '', 10);
  const requestedMax = Number.parseInt(process.env.SWEET_SEARCH_GRAPH_GC_MAX_ROWS || '', 10);

  // Both options stay `undefined` unless the env value is a positive integer.
  let batchSize;
  if (Number.isFinite(requestedBatch) && requestedBatch > 0) {
    batchSize = requestedBatch;
  }
  let maxRows;
  if (Number.isFinite(requestedMax) && requestedMax > 0) {
    maxRows = requestedMax;
  }

  const outcome = runGraphGc(stateDir, { minLiveEpoch, readManifest, batchSize, maxRows });
  notify('maintenance:graph-gc:done');
  return outcome;
}
|
|
498
|
+
|
|
499
|
+
/* ------------------------------------------------------------------ *
|
|
500
|
+
* Registry *
|
|
501
|
+
* ------------------------------------------------------------------ */
|
|
502
|
+
|
|
503
|
+
/**
 * Build the reclamation handler set used by the maintenance worker. The fts5
 * handler stays in maintenance-worker.mjs::defaultMaintenanceHandlers
 * (built-in to the same file as the worker); this returns the seven
 * additional handlers (`sparse_gram`, `binary_hnsw`, `float_hnsw`,
 * `li_segment`, `li_segments`, `vector_gc`, `graph_gc`) and lets the caller
 * merge them.
 *
 * Each entry is a thin adapter `(job, ctx) => handler(job, { stateDir, onProgress })`
 * that binds `stateDir` and forwards `ctx.onProgress`. A missing or `null`
 * `ctx` is tolerated and yields `onProgress: undefined`.
 *
 * @param {string} stateDir - State directory bound into every handler.
 * @returns {Record<string, (job: object, ctx?: object|null) => *>} Handlers keyed by job type.
 */
export function reclamationHandlers(stateDir) {
  // Optional chaining covers both an omitted and an explicitly-null context.
  const depsFor = (ctx) => ({ stateDir, onProgress: ctx?.onProgress });

  // Handlers are referenced inside the arrows (not captured up front) so they
  // are looked up at call time, exactly as before.
  return {
    sparse_gram: (job, ctx) => sparseGramHandler(job, depsFor(ctx)),
    binary_hnsw: (job, ctx) => binaryHnswHandler(job, depsFor(ctx)),
    float_hnsw: (job, ctx) => floatHnswHandler(job, depsFor(ctx)),
    li_segment: (job, ctx) => liSegmentHandler(job, depsFor(ctx)),
    li_segments: (job, ctx) => liSegmentsHandler(job, depsFor(ctx)),
    vector_gc: (job, ctx) => vectorGcHandler(job, depsFor(ctx)),
    graph_gc: (job, ctx) => graphGcHandler(job, depsFor(ctx)),
  };
}
|