sweet-search 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sparse-gram per-file delta overlay (SSGRMIDX v3).
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.6. The cold-build artifact `codebase-sparse-grams.idx` is
|
|
5
|
+
* immutable; the reconcile path writes per-file changes to
|
|
6
|
+
* `codebase-sparse-grams.idx.deltas/{epoch}-{seq}.ssgrmdelta` and the
|
|
7
|
+
* query path mmaps base ∪ deltas.
|
|
8
|
+
*
|
|
9
|
+
* Delta record (one per file, one JSON line — easy to parse; the native
|
|
10
|
+
* mmap-friendly binary form is Phase 6 work). Each record is keyed by
|
|
11
|
+
* the stable `file_id = xxhash3(canonical_path)` and carries:
|
|
12
|
+
*
|
|
13
|
+
* {
|
|
14
|
+
* "fileId": "<hex>",
|
|
15
|
+
* "filePath": "<relative path>",
|
|
16
|
+
* "contentHash": "<hex>",
|
|
17
|
+
* "deleted": false,
|
|
18
|
+
* "symbolMask": <int>,
|
|
19
|
+
* "weightsId": "<id>", // must match base artifact's weights id
|
|
20
|
+
* "grams": [ [gramId, freq], ... ]
|
|
21
|
+
* }
|
|
22
|
+
*
|
|
23
|
+
* The reader unions in two passes:
|
|
24
|
+
* 1. For each file_id with a newer delta record, mask the base
|
|
25
|
+
* postings for that file_id and read the delta record instead.
|
|
26
|
+
* 2. If `deleted=true`, the file_id is excluded from postings entirely.
|
|
27
|
+
*
|
|
28
|
+
* The delta directory grows over time; the watermark scheduler
|
|
29
|
+
* (`domain/watermark-scheduler.mjs`) triggers compaction when the
|
|
30
|
+
* `delta_size_ratio` or `delta_segment_count` thresholds cross. Compaction
|
|
31
|
+
* reads the latest delta record per file, merges with base postings for
|
|
32
|
+
* unchanged files, and emits a new base under `*.next`.
|
|
33
|
+
*
|
|
34
|
+
* Source-file retokenization for unchanged files is **forbidden** by plan
|
|
35
|
+
* § 7.6: the compactor copies postings, it does not re-gram.
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
import fs from 'node:fs';
|
|
39
|
+
import path from 'node:path';
|
|
40
|
+
import { contentHashSync } from './hashing.mjs';
|
|
41
|
+
import { DEFAULT_SPARSE_GRAM_WEIGHTS_ID } from './manifest.mjs';
|
|
42
|
+
|
|
43
|
+
// Suffix appended to the base artifact path to form its delta directory.
export const DELTA_DIR_SUFFIX = '.deltas';
|
|
44
|
+
// File extension of a delta segment; directory entries without it are not treated as segments.
export const DELTA_FILE_EXT = '.ssgrmdelta';
|
|
45
|
+
/**
|
|
46
|
+
* Versioned hardcoded common-code bigram-weight table. Plan § 7.6 empty-/
|
|
47
|
+
* tiny-codebase bootstrap. The actual bigram weights live in the Rust
|
|
48
|
+
* native crate (`crates/sweet-search-native/src/sparse_gram.rs`); this
|
|
49
|
+
* module only carries the *identifier* of the fallback table so the
|
|
50
|
+
* reconciler can stamp deltas with the same `weightsId` the base artifact
|
|
51
|
+
* used.
|
|
52
|
+
*/
|
|
53
|
+
// Alias of the manifest module's default weights id; also the default `weightsId` stamped on deletion records.
export const FALLBACK_WEIGHTS_ID = DEFAULT_SPARSE_GRAM_WEIGHTS_ID;
|
|
54
|
+
|
|
55
|
+
/**
 * Derive the `file_id` under which a file's delta records are keyed
 * (plan § 7.6 step 2).
 *
 * The argument is coerced to a string and handed to `contentHashSync`; the
 * helper's return value is the id.
 *
 * @param {string} canonicalPath Canonical path of the file.
 * @returns {string} Identifier produced by hashing the path.
 */
export function fileIdFor(canonicalPath) {
  const pathKey = String(canonicalPath);
  const fileId = contentHashSync(pathKey);
  return fileId;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Map a base artifact path to its delta directory by appending
 * `DELTA_DIR_SUFFIX` to the path.
 *
 * @param {string} baseArtifactPath
 * @returns {string}
 */
function deltaDirFor(baseArtifactPath) {
  return `${baseArtifactPath}${DELTA_DIR_SUFFIX}`;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Build the path of the `{epoch}-{seq}` segment file inside the delta
 * directory that belongs to `baseArtifactPath`.
 *
 * @param {string} baseArtifactPath
 * @param {number} epoch
 * @param {number} seq
 * @returns {string}
 */
function deltaSegmentPath(baseArtifactPath, epoch, seq) {
  const directory = deltaDirFor(baseArtifactPath);
  const segmentName = `${epoch}-${seq}${DELTA_FILE_EXT}`;
  return path.join(directory, segmentName);
}
|
|
72
|
+
|
|
73
|
+
/**
 * Append one delta record to the active delta segment for `epoch`.
 *
 * The record is serialized as a single JSON line and appended to the
 * `{epoch}-0` segment inside the base artifact's delta directory, which is
 * created on demand. The segment data is fsynced before its descriptor is
 * closed; the directory fsync that follows is best-effort. The reconciler is
 * expected to be the single writer.
 *
 * @param {string} baseArtifactPath Path to the immutable base sparse-gram artifact.
 * @param {number} epoch Non-negative safe integer; it becomes part of the segment file name.
 * @param {object} record `{ fileId, filePath, contentHash, deleted, symbolMask, weightsId, grams }`
 * @throws {Error} If `epoch` is not a non-negative safe integer or `record.fileId` is missing.
 */
export function appendDeltaRecord(baseArtifactPath, epoch, record) {
  // Segment names are only recognized when they look like `<digits>-<digits>`.
  // A negative or non-safe-integer epoch stringifies to something else
  // (`-1`, `1e+21`), so its records would be written but never read back.
  if (!Number.isSafeInteger(epoch) || epoch < 0) {
    throw new Error('appendDeltaRecord: epoch must be a non-negative safe integer');
  }
  if (!record || !record.fileId) {
    throw new Error('appendDeltaRecord: record.fileId is required');
  }

  const deltaDir = deltaDirFor(baseArtifactPath);
  fs.mkdirSync(deltaDir, { recursive: true });

  // The reconciler always appends to sequence 0 of the epoch.
  const segmentPath = deltaSegmentPath(baseArtifactPath, epoch, 0);
  const fd = fs.openSync(segmentPath, 'a');
  try {
    fs.writeSync(fd, `${JSON.stringify(record)}\n`);
    fs.fsyncSync(fd);
  } finally {
    fs.closeSync(fd);
  }

  try {
    const dirFd = fs.openSync(deltaDir, 'r');
    try {
      fs.fsyncSync(dirFd);
    } finally {
      fs.closeSync(dirFd);
    }
  } catch {
    // Some test/container filesystems reject directory fsync; the data fsync
    // above is the required durability boundary.
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * List the delta segment files that belong to a base artifact, ordered by
 * epoch and then by sequence number.
 *
 * Directory entries that lack the segment extension or do not match
 * `<digits>-<digits>.ssgrmdelta` are ignored. A missing delta directory
 * yields an empty list.
 *
 * @param {string} baseArtifactPath
 * @param {{maxEpoch?:number}} [opts] When `maxEpoch` is an integer, segments
 *   with a larger epoch are left out; any other value means "no limit".
 * @returns {Array<{path:string, epoch:number, seq:number}>}
 */
export function listDeltaSegments(baseArtifactPath, opts = {}) {
  const deltaDir = deltaDirFor(baseArtifactPath);
  if (!fs.existsSync(deltaDir)) {
    return [];
  }

  const epochCeiling = Number.isInteger(opts.maxEpoch) ? opts.maxEpoch : Infinity;

  const segments = fs.readdirSync(deltaDir).flatMap((entry) => {
    if (!entry.endsWith(DELTA_FILE_EXT)) return [];
    const parsed = /^(\d+)-(\d+)\.ssgrmdelta$/.exec(entry);
    if (parsed === null) return [];
    const epoch = Number(parsed[1]);
    if (epoch > epochCeiling) return [];
    return [{ path: path.join(deltaDir, entry), epoch, seq: Number(parsed[2]) }];
  });

  segments.sort((left, right) => {
    const byEpoch = left.epoch - right.epoch;
    return byEpoch !== 0 ? byEpoch : left.seq - right.seq;
  });
  return segments;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Read every delta segment and resolve the records to the latest one per
 * `fileId`.
 *
 * Segments are visited in the order `listDeltaSegments` returns them and
 * lines in file order, so a later record for the same `fileId` replaces an
 * earlier one (plan § 7.6 step 4: writing the same `(fileId, contentHash)`
 * twice is a no-op at query merge time; the *last* record wins).
 *
 * Lines that are blank, fail to parse as JSON, or parse to something without
 * a truthy `fileId` (including a bare `null`) are skipped.
 *
 * @param {string} baseArtifactPath
 * @param {{maxEpoch?:number}} [opts] Forwarded to `listDeltaSegments`.
 * @returns {Map<string, {record:object, segmentPath:string, epoch:number}>}
 */
export function resolveLatestRecords(baseArtifactPath, opts = {}) {
  const latest = new Map();
  for (const seg of listDeltaSegments(baseArtifactPath, opts)) {
    const raw = fs.readFileSync(seg.path, 'utf-8');
    for (const line of raw.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let record;
      try {
        record = JSON.parse(trimmed);
      } catch {
        continue; // skip torn / corrupt lines; the compactor will rewrite
      }
      // `JSON.parse` can legitimately return `null` (or another primitive);
      // treat that like any other unusable line instead of throwing on
      // property access.
      if (!record?.fileId) continue;
      latest.set(record.fileId, { record, segmentPath: seg.path, epoch: seg.epoch });
    }
  }
  return latest;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Measure how large the delta segments are relative to the base artifact,
 * for use by the watermark scheduler.
 *
 *   delta_size_ratio = sum(delta file sizes) / (base file size + sum)
 *
 * The denominator is clamped to at least 1, so an empty index reports a
 * ratio of 0. Segments that cannot be stat'ed are left out of both the byte
 * total and the segment count; a missing base artifact counts as 0 bytes.
 *
 * @param {string} baseArtifactPath
 * @returns {{ratio:number, deltaSegments:number, deltaBytes:number, baseBytes:number}}
 */
export function deltaSizeStats(baseArtifactPath) {
  const totals = { bytes: 0, segments: 0 };
  for (const { path: segmentPath } of listDeltaSegments(baseArtifactPath)) {
    let size;
    try {
      size = fs.statSync(segmentPath).size;
    } catch {
      // ignore vanished files
      continue;
    }
    totals.bytes += size;
    totals.segments += 1;
  }

  const baseBytes = fs.existsSync(baseArtifactPath)
    ? fs.statSync(baseArtifactPath).size
    : 0;
  const denominator = Math.max(1, baseBytes + totals.bytes);

  return {
    ratio: totals.bytes / denominator,
    deltaSegments: totals.segments,
    deltaBytes: totals.bytes,
    baseBytes,
  };
}
|
|
191
|
+
|
|
192
|
+
/**
 * Compact the delta directory in place.
 *
 * Reads all delta segments, resolves the latest record per fileId, writes
 * a single new segment that supersedes them, and (by default) deletes the
 * segments the compaction consumed. With zero or one segment there is
 * nothing to merge and the call returns `skipped: 'too-few-segments'`.
 *
 * Naming: the new segment uses `{maxEpoch}-{seq}` with `seq > 0` so it
 * sorts AFTER any existing `{maxEpoch}-0` segment the reconciler wrote.
 * Future reconcile ticks at epoch > maxEpoch keep monotonic ordering.
 * NOTE(review): an append at the SAME `maxEpoch` after compaction lands in
 * `{maxEpoch}-0`, which sorts before the compacted segment — presumably the
 * caller always advances the epoch first; confirm.
 *
 * Atomicity: write `*.compacting.tmp`, fsync, rename to the final name,
 * THEN delete the consumed segments. If writing or renaming fails, the
 * temporary file is removed (best-effort) and the error is rethrown. A crash
 * between rename and delete leaves the compacted file in place; the next
 * round consumes everything including the compacted file and resolves to
 * the same records (the compacted file's seq is highest, so its records
 * win).
 *
 * `deferDelete: true` stages the compaction without unlinking consumed
 * segments — the caller is responsible for deleting `consumedSegmentPaths`
 * once it has published whatever derived state needs to change atomically
 * with the unlink (e.g. the reconcile manifest's `sparseGram.deltas`
 * list). Until the caller deletes them, every reader — including readers
 * pinning the OLD manifest's segments — still resolves every record.
 *
 * Deleted-file records (`deleted: true`) are preserved by default — they
 * suppress base postings at query time. Pass `{ dropTombstones: true }`
 * to discard them; only safe when the caller has confirmed the matching
 * fileId is gone from the base artifact too.
 *
 * Lines that are blank, fail to parse as JSON, or parse to something without
 * a truthy `fileId` (including a bare `null`) are dropped.
 *
 * @param {string} baseArtifactPath
 * @param {{dropTombstones?:boolean, deferDelete?:boolean}} [opts]
 * @returns {{
 *   compactedPath: string|null,
 *   consumedSegments: number,
 *   consumedSegmentPaths: string[],
 *   recordsWritten: number,
 *   tombstonedDropped: number,
 *   skipped: 'too-few-segments'|null,
 * }}
 */
export function compactDeltaSegments(baseArtifactPath, opts = {}) {
  const dropTombstones = !!opts.dropTombstones;
  const deferDelete = !!opts.deferDelete;
  const segments = listDeltaSegments(baseArtifactPath);
  if (segments.length <= 1) {
    return {
      compactedPath: null,
      consumedSegments: 0,
      consumedSegmentPaths: [],
      recordsWritten: 0,
      tombstonedDropped: 0,
      skipped: 'too-few-segments',
    };
  }

  // Segments arrive sorted, so the last one carries the highest epoch.
  const maxEpoch = segments[segments.length - 1].epoch;
  const maxSeqAtMaxEpoch = segments
    .filter((s) => s.epoch === maxEpoch)
    .reduce((m, s) => Math.max(m, s.seq), -1);
  const compactSeq = Math.max(maxSeqAtMaxEpoch + 1, 1);

  // Later segments / later lines overwrite earlier ones for the same fileId.
  const latest = new Map();
  for (const seg of segments) {
    const raw = fs.readFileSync(seg.path, 'utf-8');
    for (const line of raw.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let record;
      try {
        record = JSON.parse(trimmed);
      } catch {
        continue; // torn / corrupt line
      }
      // `JSON.parse` may return `null` or a primitive; skip instead of
      // throwing on property access.
      if (!record?.fileId) continue;
      latest.set(record.fileId, record);
    }
  }

  let tombstonedDropped = 0;
  if (dropTombstones) {
    for (const [fileId, rec] of latest) {
      if (rec.deleted) {
        latest.delete(fileId);
        tombstonedDropped += 1;
      }
    }
  }

  const deltaDir = deltaDirFor(baseArtifactPath);
  const targetName = `${maxEpoch}-${compactSeq}${DELTA_FILE_EXT}`;
  const targetPath = path.join(deltaDir, targetName);
  const tmpPath = `${targetPath}.compacting.tmp`;

  const fd = fs.openSync(tmpPath, 'w');
  try {
    try {
      for (const record of latest.values()) {
        fs.writeSync(fd, `${JSON.stringify(record)}\n`);
      }
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
    fs.renameSync(tmpPath, targetPath);
  } catch (err) {
    // Do not leave a half-written staging file behind.
    try {
      fs.unlinkSync(tmpPath);
    } catch {
      /* already gone or never created */
    }
    throw err;
  }
  try {
    const dirFd = fs.openSync(deltaDir, 'r');
    try {
      fs.fsyncSync(dirFd);
    } finally {
      fs.closeSync(dirFd);
    }
  } catch {
    /* best-effort dir fsync */
  }

  const consumedSegmentPaths = segments
    .filter((seg) => seg.path !== targetPath)
    .map((seg) => seg.path);

  let consumed = 0;
  if (deferDelete) {
    consumed = consumedSegmentPaths.length;
  } else {
    for (const segPath of consumedSegmentPaths) {
      try {
        fs.unlinkSync(segPath);
        consumed += 1;
      } catch {
        /* tolerate concurrent deletion */
      }
    }
  }

  return {
    compactedPath: targetPath,
    consumedSegments: consumed,
    consumedSegmentPaths: deferDelete ? consumedSegmentPaths : [],
    recordsWritten: latest.size,
    tombstonedDropped,
    skipped: null,
  };
}
|
|
317
|
+
|
|
318
|
+
/**
 * Record that a file has been removed from the indexed corpus (plan § 22.8)
 * by appending a tombstone delta record for it.
 *
 * The tombstone carries `deleted: true`, an empty `contentHash`, a zero
 * `symbolMask` and no grams; its `fileId` is derived from `canonicalPath`.
 *
 * @param {string} baseArtifactPath
 * @param {number} epoch
 * @param {string} canonicalPath
 * @param {string} [weightsId=FALLBACK_WEIGHTS_ID] Weights id stamped on the record.
 */
export function recordFileDeletion(baseArtifactPath, epoch, canonicalPath, weightsId = FALLBACK_WEIGHTS_ID) {
  const tombstone = {
    fileId: fileIdFor(canonicalPath),
    filePath: canonicalPath,
    contentHash: '',
    deleted: true,
    symbolMask: 0,
    weightsId,
    grams: [],
  };
  appendDeltaRecord(baseArtifactPath, epoch, tombstone);
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* FTS5 introspection helpers used by the reconcile watermark scheduler.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.1.5 requires a `fts5SegmentCount(db, tableName)` helper so the
|
|
5
|
+
* watermark check (segment count > 64 → bounded `('merge', 500)`) lives in
|
|
6
|
+
* one place. SQLite's FTS5 keeps a structure record at rowid=10 of the
|
|
7
|
+
* `<name>_data` shadow table. The block format is documented in the FTS5
|
|
8
|
+
* source (fts5_index.c) and stable enough that we ship a tiny varint parser
|
|
9
|
+
* here. If a future SQLite version changes the layout, the helper switches
|
|
10
|
+
* to a fallback heuristic (leaf-page rowid bit shift) without losing the
|
|
11
|
+
* watermark.
|
|
12
|
+
*
|
|
13
|
+
* Reference: SQLite FTS5 docs §7 ("Internal storage of the index"); structure
|
|
14
|
+
* record format mirrors `Fts5StructureLevel` / `Fts5StructureSegment` in
|
|
15
|
+
* fts5_index.c.
|
|
16
|
+
*
|
|
17
|
+
* Plan § 0 / § 37.5: Phase 0 commits to verifying this empirically against
|
|
18
|
+
* the SQLite version in use. The verification record lives in
|
|
19
|
+
* INCREMENTAL_INDEXING_PREFLIGHT_RESULTS.md § 3.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
// `id` of the structure record row in the `<name>_data` shadow table.
const STRUCTURE_ROWID = 10;
|
|
23
|
+
|
|
24
|
+
/**
 * Decode one SQLite-style varint from `buf` starting at `offset`.
 *
 * The encoding is big-endian with a high-bit continuation flag: each of the
 * first eight bytes contributes its low 7 bits, and a ninth byte, when
 * reached, contributes all 8 bits.
 *
 * @param {Uint8Array} buf Bytes to read from.
 * @param {number} offset Index of the varint's first byte.
 * @returns {{value: bigint, consumed: number}} Decoded value and byte count.
 * @throws {Error} If the buffer ends before the varint terminates.
 */
function readVarint(buf, offset) {
  const MAX_BYTES = 9;
  let acc = 0n;
  for (let idx = 0; idx < MAX_BYTES; idx += 1) {
    const pos = offset + idx;
    if (pos >= buf.length) {
      throw new Error(`fts5 varint truncated at offset ${pos} (buffer len ${buf.length})`);
    }
    const octet = buf[pos];
    if (idx === MAX_BYTES - 1) {
      // Final byte of a full-length varint: no continuation bit, 8 payload bits.
      return { value: (acc << 8n) | BigInt(octet), consumed: MAX_BYTES };
    }
    acc = (acc << 7n) | BigInt(octet & 0x7f);
    const hasMore = (octet & 0x80) !== 0;
    if (!hasMore) {
      return { value: acc, consumed: idx + 1 };
    }
  }
  // Not reachable: the loop always returns or throws by the ninth byte.
  return { value: acc, consumed: 0 };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Return the number of segments stored in an FTS5 index.
 *
 * Implementation: count the distinct segment ids encoded in the rowids of
 * the `<name>_data` shadow table. Per the FTS5 source (fts5_index.c), a
 * segment page rowid is laid out as
 *
 *   rowid = (segid  << (FTS5_DATA_PAGE_B + FTS5_DATA_HEIGHT_B + FTS5_DATA_DLI_B))
 *         | (dlidx  << (FTS5_DATA_PAGE_B + FTS5_DATA_HEIGHT_B))
 *         | (height << FTS5_DATA_PAGE_B)
 *         | pgno
 *
 * with `FTS5_DATA_PAGE_B = 31`, `FTS5_DATA_HEIGHT_B = 5` and
 * `FTS5_DATA_DLI_B = 1`. The segment id therefore starts at bit 37; bit 36
 * is the doclist-index flag and must not be part of the key, otherwise a
 * segment that owns doclist-index pages is counted twice.
 *
 * This function does not parse the structure record; `fts5StructureDescribe`
 * exposes that for diagnostics.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName Name of the FTS5 virtual table (not the shadow).
 * @returns {number} Distinct segment count; 0 when the shadow table does not
 *   exist or holds no segment pages.
 * @throws {Error} If `tableName` is not a plain identifier, or on any
 *   database error other than "no such table".
 */
export function fts5SegmentCount(db, tableName) {
  // The name is interpolated into SQL below, so only plain identifiers pass.
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName)) {
    throw new Error(`fts5SegmentCount: invalid table name ${tableName}`);
  }
  const shadow = `${tableName}_data`;
  let rowids;
  try {
    // Low ids hold bookkeeping rows (the structure record sits at id = 10);
    // segment pages carry the segid in their high bits and are far above 100.
    rowids = db.prepare(`SELECT id FROM ${shadow} WHERE id >= 100`).all();
  } catch (err) {
    if (/no such table/i.test(err?.message)) return 0;
    throw err;
  }

  // 31 page bits + 5 height bits + 1 doclist-index flag bit.
  const SEGID_SHIFT = 37n;
  const segids = new Set();
  for (const row of rowids) {
    // BigInt keeps the shift exact whether the driver returns numbers or bigints.
    segids.add(BigInt(row.id) >> SEGID_SHIFT);
  }
  return segids.size;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Internal: parse the FTS5 structure record at id=10 and return the per-level
 * segment counts. Exported only for tests / diagnostic tooling.
 *
 * Record layout followed here (per `fts5StructureDecode` /
 * `fts5StructureWrite` in fts5_index.c — NOTE(review): confirm against the
 * SQLite version actually bundled):
 *
 *   cookie (4 bytes, big-endian)
 *   [optional 4-byte V2 marker FF 00 00 01]
 *   nLevel, nSegment, nWriteCounter            (varints)
 *   per level:   nMerge, nSeg                  (varints)
 *   per segment: segid, pgnoFirst, pgnoLast    (varints)
 *                + iOrigin1, iOrigin2, nPgTombstone, nEntryTombstone, nEntry
 *                  when the V2 marker is present
 *
 * Returns `{ cookie, nLevel, nSegment, levels: [{ nMerge, nSeg }] }` on
 * success, or `null` if the shadow table or the structure record is missing
 * or the block is shorter than 6 bytes. If the record ends early while the
 * levels are being read, `levels` holds whatever was parsed up to that point.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName
 * @returns {{cookie:number, nLevel:number, nSegment:number, levels:Array<{nMerge:number, nSeg:number}>}|null}
 * @throws {Error} If `tableName` is not a plain identifier, on database
 *   errors other than "no such table", or if the `nLevel` / `nSegment`
 *   header varints are truncated.
 */
export function fts5StructureDescribe(db, tableName) {
  // The name is interpolated into SQL below, so only plain identifiers pass.
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName)) {
    throw new Error(`fts5StructureDescribe: invalid table name ${tableName}`);
  }
  const shadow = `${tableName}_data`;
  let row;
  try {
    row = db.prepare(`SELECT block FROM ${shadow} WHERE id = ?`).get(STRUCTURE_ROWID);
  } catch (err) {
    if (/no such table/i.test(err?.message)) return null;
    throw err;
  }
  if (!row || !row.block) return null;

  const buf = Buffer.isBuffer(row.block) ? row.block : Buffer.from(row.block);
  if (buf.length < 6) return null;
  const cookie = buf.readUInt32BE(0);
  let offset = 4;

  // A V2 record carries a 4-byte marker between the cookie and the header
  // varints, and five extra varints per segment.
  const isV2 =
    buf.length >= 8 &&
    buf[4] === 0xff &&
    buf[5] === 0x00 &&
    buf[6] === 0x00 &&
    buf[7] === 0x01;
  if (isV2) offset += 4;
  const varintsPerSegment = isV2 ? 8 : 3;

  const nextVarint = () => {
    const decoded = readVarint(buf, offset);
    offset += decoded.consumed;
    return Number(decoded.value);
  };

  const nLevel = nextVarint();
  const nSegment = nextVarint();
  const levels = [];
  try {
    // nWriteCounter sits between the header counts and the first level;
    // skipping it keeps the per-level fields aligned.
    nextVarint();
    for (let level = 0; level < nLevel; level += 1) {
      const nMerge = nextVarint();
      const nSeg = nextVarint();
      levels.push({ nMerge, nSeg });
      // Segment fields are not reported; they are only stepped over.
      for (let skipped = 0; skipped < nSeg * varintsPerSegment; skipped += 1) {
        nextVarint();
      }
    }
  } catch {
    // Partial parse; return what we have. Phase 0 preflight asserts the
    // structure-record parse matches the rowid-shift count; mismatch is a
    // documented version-skew failure mode.
  }
  return { cookie, nLevel, nSegment, levels };
}
|
|
152
|
+
|
|
153
|
+
/**
 * Run one bounded merge ("incremental compaction") on an FTS5 table by
 * issuing the `'merge'` special command with a page budget.
 *
 * Plan § 7.1.5: every reconcile tick calls `('merge', 16)`; the watermark
 * scheduler calls `('merge', 500)`.
 *
 * `('optimize')` is deliberately never issued here because that produces a
 * single-transaction rewrite of the FTS5 index, which trips the 256 MiB WAL
 * bloat alarm on a populated table.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName Name of the FTS5 virtual table; must be a plain identifier.
 * @param {number} pages Page budget per merge call (plan defaults: 16 or 500).
 * @throws {Error} If `tableName` is not a plain identifier or `pages` is not a positive integer.
 */
export function fts5Merge(db, tableName, pages) {
  // The table name is interpolated into the statement, so restrict it to
  // identifier characters.
  const nameIsPlainIdentifier = /^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName);
  if (!nameIsPlainIdentifier) {
    throw new Error(`fts5Merge: invalid table name ${tableName}`);
  }

  const budgetIsValid = Number.isInteger(pages) && pages > 0;
  if (!budgetIsValid) {
    throw new Error(`fts5Merge: pages must be a positive integer, got ${pages}`);
  }

  const sql = `INSERT INTO ${tableName}(${tableName}, rank) VALUES('merge', ?)`;
  const statement = db.prepare(sql);
  statement.run(pages);
}
|
|
175
|
+
|
|
176
|
+
// Bundles `readVarint` and `STRUCTURE_ROWID` under a single `__testing`
// export; the name suggests it is meant for test code only, not as a
// supported public API (NOTE(review): confirm against the test suite).
export const __testing = { readVarint, STRUCTURE_ROWID };
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI staleness footer.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 19.1. Sweet-search shows a three-tier alert based on how long
|
|
5
|
+
* since the manifest was published, how many files are sitting dirty,
|
|
6
|
+
* and whether the maintenance queue is backed up:
|
|
7
|
+
*
|
|
8
|
+
* green < 60 s, 0 dirty → hidden
|
|
9
|
+
* yellow 60-300 s, <= 5 dirty → one-liner footer
|
|
10
|
+
* red > 300 s, > 5 dirty, or → explicit warning
|
|
11
|
+
* maintenance-queue backlog
|
|
12
|
+
*
|
|
13
|
+
* The display is informational. The reconciler does not block any CLI
|
|
14
|
+
* command on it; users decide whether to wait for the next tick or
|
|
15
|
+
* proceed against the slightly stale index.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// Tier labels returned by stalenessTier().
const GREEN = 'green';
const YELLOW = 'yellow';
const RED = 'red';

// Manifest-age thresholds, in milliseconds.
const YELLOW_AGE_MS = 60 * 1000; // one minute
const RED_AGE_MS = 5 * 60 * 1000; // five minutes

// Dirty-file count thresholds.
const YELLOW_DIRTY = 1;
const RED_DIRTY = 5;

// Pending maintenance-job threshold for the red tier.
const RED_BACKLOG = 4;
|
|
27
|
+
|
|
28
|
+
/**
 * Classify the staleness tier given the inputs.
 *
 * Tiers, checked from most to least severe:
 *   red     age > RED_AGE_MS, dirty files > RED_DIRTY, or
 *           maintenance backlog > RED_BACKLOG
 *   yellow  age >= YELLOW_AGE_MS or dirty files >= YELLOW_DIRTY
 *   green   everything else (fresh manifest and no dirty files)
 *
 * Missing fields (or a missing `input`) are treated as 0.
 *
 * @param {object} [input]
 * @param {number} [input.ageMs=0]               How long since manifest publish.
 * @param {number} [input.dirtyFiles=0]          Current dirty-set size.
 * @param {number} [input.maintenanceBacklog=0]  Pending maintenance jobs.
 * @returns {'green'|'yellow'|'red'}
 */
export function stalenessTier(input = {}) {
  const { ageMs = 0, dirtyFiles = 0, maintenanceBacklog = 0 } = input;

  if (ageMs > RED_AGE_MS || dirtyFiles > RED_DIRTY || maintenanceBacklog > RED_BACKLOG) {
    return RED;
  }

  // Yellow thresholds are inclusive: green is documented as "< 60 s, 0 dirty",
  // so a single dirty file or an age of exactly 60 s must already be yellow.
  if (ageMs >= YELLOW_AGE_MS || dirtyFiles >= YELLOW_DIRTY) {
    return YELLOW;
  }

  return GREEN;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * Picks the coarsest fitting form: `"<n>ms"` below one second, `"<n>s"`
 * below one minute, `"<m>m <s>s"` below one hour, otherwise `"<h>h <m>m"`.
 * Values are floored at each unit; sub-second input is echoed unrounded.
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string}
 */
function humaniseAge(ms) {
  if (ms < 1000) {
    return `${ms}ms`;
  }

  const totalSeconds = Math.floor(ms / 1000);
  if (totalSeconds < 60) {
    return `${totalSeconds}s`;
  }

  const totalMinutes = Math.floor(totalSeconds / 60);
  if (totalMinutes < 60) {
    const leftoverSeconds = totalSeconds % 60;
    return `${totalMinutes}m ${leftoverSeconds}s`;
  }

  const hours = Math.floor(totalMinutes / 60);
  const leftoverMinutes = totalMinutes % 60;
  return `${hours}h ${leftoverMinutes}m`;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Build the footer string. Empty string when tier=green AND the caller
 * did not pass `forceShow`.
 *
 * Missing `ageMs`, `dirtyFiles` and `maintenanceBacklog` fall back to 0, the
 * same defaults `stalenessTier` applies, so the rendered numbers always agree
 * with the tier that was computed from them. The "last maintenance" entry is
 * emitted only when `lastMaintenanceTier` is truthy, and its "... ago" suffix
 * only when `lastMaintenanceAgeMs` is a finite number.
 *
 * @param {object} input
 * @param {number} input.epoch
 * @param {number} [input.ageMs=0]
 * @param {number} [input.dirtyFiles=0]
 * @param {string|null} [input.lastMaintenanceTier]
 * @param {number} [input.lastMaintenanceAgeMs]
 * @param {number} [input.maintenanceBacklog=0]
 * @param {boolean} [input.forceShow=false]
 * @returns {string}
 */
export function formatStalenessFooter(input) {
  const {
    epoch,
    ageMs = 0,
    dirtyFiles = 0,
    maintenanceBacklog = 0,
    lastMaintenanceTier,
    lastMaintenanceAgeMs,
    forceShow = false,
  } = input;

  const tier = stalenessTier(input);
  if (tier === GREEN && !forceShow) return '';

  const parts = [
    `index epoch: ${epoch}`,
    `age: ${humaniseAge(ageMs)}`,
    `dirty files: ${dirtyFiles}`,
  ];

  if (lastMaintenanceTier) {
    // Without a usable age, show the tier alone rather than "NaNh NaNm ago".
    const agoSuffix = Number.isFinite(lastMaintenanceAgeMs)
      ? ` ${humaniseAge(lastMaintenanceAgeMs)} ago`
      : '';
    parts.push(`last maintenance: ${lastMaintenanceTier}${agoSuffix}`);
  }

  if (maintenanceBacklog > 0) {
    parts.push(`backlog: ${maintenanceBacklog}`);
  }

  const body = parts.join(' ');
  // Only the red tier gets the explicit warning prefix.
  const prefix = tier === RED ? '[sweet-search] ⚠ stale index — ' : '[sweet-search] ';
  return prefix + body;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Produce the footer as display lines: a separator rule followed by the
 * footer text (plan § 19.1 mock-up format). Yields an empty array when
 * `formatStalenessFooter` returns an empty string for `input`.
 *
 * @param {object} input Passed through unchanged to `formatStalenessFooter`.
 * @returns {string[]} Either `[]` or `[separator, footer]`.
 */
export function renderStalenessLines(input) {
  const footerText = formatStalenessFooter(input);
  return footerText ? ['─────────', footerText] : [];
}
|
|
101
|
+
|
|
102
|
+
// Groups the module's threshold constants and tier labels under one
// `__testing` export; the name suggests it is intended for test code.
export const __testing = {
  YELLOW_AGE_MS,
  RED_AGE_MS,
  YELLOW_DIRTY,
  RED_DIRTY,
  RED_BACKLOG,
  GREEN,
  YELLOW,
  RED,
};
|