sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,335 @@
1
+ /**
2
+ * Sparse-gram per-file delta overlay (SSGRMIDX v3).
3
+ *
4
+ * Plan § 7.6. The cold-build artifact `codebase-sparse-grams.idx` is
5
+ * immutable; the reconcile path writes per-file changes to
6
+ * `codebase-sparse-grams.idx.deltas/{epoch}-{seq}.ssgrmdelta` and the
7
+ * query path mmaps base ∪ deltas.
8
+ *
9
+ * Delta record (one per file, one JSON line — easy to parse; the native
10
+ * mmap-friendly binary form is Phase 6 work). Each record is keyed by
11
+ * the stable `file_id = xxhash3(canonical_path)` and carries:
12
+ *
13
+ * {
14
+ * "fileId": "<hex>",
15
+ * "filePath": "<relative path>",
16
+ * "contentHash": "<hex>",
17
+ * "deleted": false,
18
+ * "symbolMask": <int>,
19
+ * "weightsId": "<id>", // must match base artifact's weights id
20
+ * "grams": [ [gramId, freq], ... ]
21
+ * }
22
+ *
23
+ * The reader unions in two passes:
24
+ * 1. For each file_id with a newer delta record, mask the base
25
+ * postings for that file_id and read the delta record instead.
26
+ * 2. If `deleted=true`, the file_id is excluded from postings entirely.
27
+ *
28
+ * The delta directory grows over time; the watermark scheduler
29
+ * (`domain/watermark-scheduler.mjs`) triggers compaction when the
30
+ * `delta_size_ratio` or `delta_segment_count` thresholds cross. Compaction
31
+ * reads the latest delta record per file, merges with base postings for
32
+ * unchanged files, and emits a new base under `*.next`.
33
+ *
34
+ * Source-file retokenization for unchanged files is **forbidden** by plan
35
+ * § 7.6: the compactor copies postings, it does not re-gram.
36
+ */
37
+
38
+ import fs from 'node:fs';
39
+ import path from 'node:path';
40
+ import { contentHashSync } from './hashing.mjs';
41
+ import { DEFAULT_SPARSE_GRAM_WEIGHTS_ID } from './manifest.mjs';
42
+
43
+ export const DELTA_DIR_SUFFIX = '.deltas';
44
+ export const DELTA_FILE_EXT = '.ssgrmdelta';
45
+ /**
46
+ * Versioned hardcoded common-code bigram-weight table. Plan § 7.6 empty-/
47
+ * tiny-codebase bootstrap. The actual bigram weights live in the Rust
48
+ * native crate (`crates/sweet-search-native/src/sparse_gram.rs`); this
49
+ * module only carries the *identifier* of the fallback table so the
50
+ * reconciler can stamp deltas with the same `weightsId` the base artifact
51
+ * used.
52
+ */
53
+ export const FALLBACK_WEIGHTS_ID = DEFAULT_SPARSE_GRAM_WEIGHTS_ID;
54
+
55
/**
 * Derive the `file_id` under which a file's delta records are keyed.
 * Plan § 7.6 step 2.
 *
 * The argument is coerced to a string and handed to `contentHashSync`; the
 * value that helper returns is used as the id unchanged.
 *
 * @param {string} canonicalPath Canonical path of the indexed file.
 * @returns {string} The id produced by `contentHashSync` for that path.
 */
export function fileIdFor(canonicalPath) {
  const pathText = String(canonicalPath);
  return contentHashSync(pathText);
}
64
+
65
/**
 * Path of the directory that holds a base artifact's delta segments: the
 * artifact path with `DELTA_DIR_SUFFIX` appended.
 */
function deltaDirFor(baseArtifactPath) {
  const deltaDir = baseArtifactPath + DELTA_DIR_SUFFIX;
  return deltaDir;
}
68
+
69
/**
 * Full path of the `{epoch}-{seq}` delta segment that belongs to a base
 * artifact.
 */
function deltaSegmentPath(baseArtifactPath, epoch, seq) {
  const parentDir = deltaDirFor(baseArtifactPath);
  const segmentName = `${epoch}-${seq}${DELTA_FILE_EXT}`;
  return path.join(parentDir, segmentName);
}
72
+
73
/**
 * Append one delta record to the active delta segment (`{epoch}-0`).
 *
 * The record is serialised as a single JSON line, written to the segment
 * (opened in append mode) and fsynced before the call returns. The delta
 * directory is created on demand and fsynced afterwards on a best-effort
 * basis. The reconciler is expected to be the single writer.
 *
 * Torn-tail protection: if the segment is non-empty and does not end with a
 * newline (e.g. a previous append was cut short by a crash), a newline is
 * written first. Without it the new record would be glued onto the torn
 * fragment and readers, which skip unparsable lines, would silently drop it.
 *
 * @param {string} baseArtifactPath Path to the immutable base sparse-gram artifact.
 * @param {number} epoch Integer epoch that names the segment file.
 * @param {object} record `{ fileId, filePath, contentHash, deleted, symbolMask, weightsId, grams }`
 * @throws {Error} If `epoch` is not an integer or `record.fileId` is missing.
 */
export function appendDeltaRecord(baseArtifactPath, epoch, record) {
  if (!Number.isInteger(epoch)) {
    throw new Error('appendDeltaRecord: epoch must be an integer');
  }
  if (!record || !record.fileId) {
    throw new Error('appendDeltaRecord: record.fileId is required');
  }
  // Serialise before touching the filesystem so an unserialisable record
  // cannot leave an empty segment behind.
  const line = `${JSON.stringify(record)}\n`;

  const deltaDir = deltaDirFor(baseArtifactPath);
  fs.mkdirSync(deltaDir, { recursive: true });
  const segmentPath = deltaSegmentPath(baseArtifactPath, epoch, 0);

  // 'a+' keeps append semantics for writes while allowing the tail probe.
  const fd = fs.openSync(segmentPath, 'a+');
  try {
    const { size } = fs.fstatSync(fd);
    let needsSeparator = false;
    if (size > 0) {
      const tail = Buffer.alloc(1);
      const bytesRead = fs.readSync(fd, tail, 0, 1, size - 1);
      needsSeparator = bytesRead === 1 && tail[0] !== 0x0a;
    }
    const payload = Buffer.from(needsSeparator ? `\n${line}` : line, 'utf-8');
    // writeSync may report a short write; keep going until the whole line is out.
    let written = 0;
    while (written < payload.length) {
      written += fs.writeSync(fd, payload, written, payload.length - written);
    }
    fs.fsyncSync(fd);
  } finally {
    fs.closeSync(fd);
  }

  try {
    const dirFd = fs.openSync(deltaDir, 'r');
    try {
      fs.fsyncSync(dirFd);
    } finally {
      fs.closeSync(dirFd);
    }
  } catch {
    // Some test/container filesystems reject directory fsync; the data fsync
    // above is the required durability boundary.
  }
}
107
+
108
/**
 * List the delta segment files of a base artifact, ordered by epoch and then
 * by sequence number.
 *
 * Only directory entries named `{epoch}-{seq}.ssgrmdelta` (both parts decimal
 * digits) are reported. A missing delta directory yields an empty list.
 *
 * @param {string} baseArtifactPath
 * @param {{maxEpoch?:number}} [opts] When `maxEpoch` is an integer, segments
 *   with a larger epoch are left out.
 * @returns {Array<{path:string, epoch:number, seq:number}>}
 */
export function listDeltaSegments(baseArtifactPath, opts = {}) {
  const dir = deltaDirFor(baseArtifactPath);
  if (!fs.existsSync(dir)) return [];

  const epochCeiling = Number.isInteger(opts.maxEpoch) ? opts.maxEpoch : Infinity;
  const segments = [];
  fs.readdirSync(dir).forEach((entry) => {
    if (!entry.endsWith(DELTA_FILE_EXT)) return;
    const parsed = /^(\d+)-(\d+)\.ssgrmdelta$/.exec(entry);
    if (parsed === null) return;
    const [, epochText, seqText] = parsed;
    const epoch = Number(epochText);
    if (epoch > epochCeiling) return;
    segments.push({ path: path.join(dir, entry), epoch, seq: Number(seqText) });
  });

  segments.sort((left, right) => {
    const byEpoch = left.epoch - right.epoch;
    return byEpoch !== 0 ? byEpoch : left.seq - right.seq;
  });
  return segments;
}
134
+
135
/**
 * Read every delta segment and keep only the latest record per `fileId`.
 * Plan § 7.6 step 4: writing the same `(fileId, contentHash)` twice is a
 * no-op at query merge time; the *last* record wins.
 *
 * Segments are visited in the order returned by `listDeltaSegments` and
 * lines in file order, so later segments and later lines override earlier
 * ones. Blank lines, lines that are not valid JSON, and JSON values that are
 * not objects carrying a `fileId` are skipped.
 *
 * @param {string} baseArtifactPath
 * @param {{maxEpoch?:number}} [opts] Forwarded to `listDeltaSegments`.
 * @returns {Map<string, {record:object, segmentPath:string, epoch:number}>}
 */
export function resolveLatestRecords(baseArtifactPath, opts = {}) {
  const latest = new Map();
  for (const seg of listDeltaSegments(baseArtifactPath, opts)) {
    const raw = fs.readFileSync(seg.path, 'utf-8');
    for (const line of raw.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let record;
      try {
        record = JSON.parse(trimmed);
      } catch {
        continue; // skip torn / corrupt lines; the compactor will rewrite
      }
      // `null` is valid JSON, so it survives the parse above; reading
      // `.fileId` from it would throw instead of skipping the bad line.
      if (record === null || typeof record !== 'object' || !record.fileId) continue;
      latest.set(record.fileId, { record, segmentPath: seg.path, epoch: seg.epoch });
    }
  }
  return latest;
}
163
+
164
/**
 * Compute the delta-size ratio used by the watermark scheduler.
 *
 *   delta_size_ratio = sum(delta file sizes) / (base file size + sum)
 *
 * Files that cannot be stat'ed (for example because they vanished between
 * listing and stat) contribute nothing: a delta segment is left out of both
 * the byte and segment counts, and a missing base artifact counts as 0 bytes.
 * The denominator is clamped to at least 1 so an empty index yields ratio 0.
 *
 * @param {string} baseArtifactPath
 * @returns {{ratio:number, deltaSegments:number, deltaBytes:number, baseBytes:number}}
 */
export function deltaSizeStats(baseArtifactPath) {
  let deltaBytes = 0;
  let deltaSegments = 0;
  for (const seg of listDeltaSegments(baseArtifactPath)) {
    try {
      const stat = fs.statSync(seg.path);
      deltaBytes += stat.size;
      deltaSegments += 1;
    } catch {
      // ignore vanished files
    }
  }

  let baseBytes = 0;
  try {
    // Stat directly rather than existsSync-then-stat: the base artifact can
    // disappear between the two calls, which used to throw here.
    baseBytes = fs.statSync(baseArtifactPath).size;
  } catch {
    // Missing or unreadable base artifact counts as empty, matching what the
    // former existsSync guard reported.
  }

  const ratio = deltaBytes / Math.max(1, baseBytes + deltaBytes);
  return { ratio, deltaSegments, deltaBytes, baseBytes };
}
191
+
192
/**
 * Compact the delta directory in place.
 *
 * Reads all delta segments, resolves the latest record per fileId, writes a
 * single new segment that supersedes them, and (by default) deletes the
 * segments the compaction consumed. With fewer than two segments nothing is
 * done and `skipped` is `'too-few-segments'`.
 *
 * Naming: the new segment uses `{maxEpoch}-{seq}` with `seq > 0` and greater
 * than any existing sequence at `maxEpoch`, so it sorts AFTER every segment
 * it consumed. Future reconcile ticks at epoch > maxEpoch keep monotonic
 * ordering.
 *
 * Atomicity: write `*.compacting.tmp`, fsync, rename to the final name, THEN
 * delete the consumed segments. A crash between rename and delete leaves the
 * compacted file in place; the next round consumes everything including the
 * compacted file and resolves to the same records. If writing or renaming
 * fails, the temporary file is removed (best effort) and the error rethrown.
 *
 * `deferDelete: true` stages the compaction without unlinking consumed
 * segments — the caller is responsible for deleting `consumedSegmentPaths`
 * once it has published whatever derived state needs to change together with
 * the unlink (e.g. the reconcile manifest's `sparseGram.deltas` list).
 *
 * Deleted-file records (`deleted: true`) are preserved by default — they
 * suppress base postings at query time. Pass `{ dropTombstones: true }` to
 * discard them; only safe when the caller has confirmed the matching fileId
 * is gone from the base artifact too.
 *
 * @param {string} baseArtifactPath
 * @param {{dropTombstones?:boolean, deferDelete?:boolean}} [opts]
 * @returns {{
 *   compactedPath: string|null,
 *   consumedSegments: number,
 *   consumedSegmentPaths: string[],
 *   recordsWritten: number,
 *   tombstonedDropped: number,
 *   skipped: 'too-few-segments'|null,
 * }} `consumedSegmentPaths` is populated only when `deferDelete` is set.
 */
export function compactDeltaSegments(baseArtifactPath, opts = {}) {
  const dropTombstones = !!opts.dropTombstones;
  const deferDelete = !!opts.deferDelete;
  const segments = listDeltaSegments(baseArtifactPath);
  if (segments.length <= 1) {
    return {
      compactedPath: null,
      consumedSegments: 0,
      consumedSegmentPaths: [],
      recordsWritten: 0,
      tombstonedDropped: 0,
      skipped: 'too-few-segments',
    };
  }

  // Segments arrive sorted, so the last one carries the highest epoch.
  const maxEpoch = segments[segments.length - 1].epoch;
  const maxSeqAtMaxEpoch = segments
    .filter((s) => s.epoch === maxEpoch)
    .reduce((m, s) => Math.max(m, s.seq), -1);
  const compactSeq = Math.max(maxSeqAtMaxEpoch + 1, 1);

  // Last record per fileId wins, in segment order then line order.
  const latest = new Map();
  for (const seg of segments) {
    const raw = fs.readFileSync(seg.path, 'utf-8');
    for (const line of raw.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let record;
      try {
        record = JSON.parse(trimmed);
      } catch {
        continue; // torn / corrupt line
      }
      // `null` parses as valid JSON; guard before dereferencing it.
      if (record === null || typeof record !== 'object' || !record.fileId) continue;
      latest.set(record.fileId, record);
    }
  }

  let tombstonedDropped = 0;
  if (dropTombstones) {
    for (const [fileId, rec] of latest) {
      if (rec.deleted) {
        latest.delete(fileId);
        tombstonedDropped += 1;
      }
    }
  }

  const deltaDir = deltaDirFor(baseArtifactPath);
  const targetName = `${maxEpoch}-${compactSeq}${DELTA_FILE_EXT}`;
  const targetPath = path.join(deltaDir, targetName);
  const tmpPath = targetPath + '.compacting.tmp';

  try {
    const fd = fs.openSync(tmpPath, 'w');
    try {
      for (const record of latest.values()) {
        fs.writeSync(fd, JSON.stringify(record) + '\n');
      }
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
    fs.renameSync(tmpPath, targetPath);
  } catch (err) {
    // Do not leave a half-written staging file behind on failure.
    try {
      fs.unlinkSync(tmpPath);
    } catch {
      // nothing was staged, or it is already gone
    }
    throw err;
  }

  try {
    const dirFd = fs.openSync(deltaDir, 'r');
    try {
      fs.fsyncSync(dirFd);
    } finally {
      fs.closeSync(dirFd);
    }
  } catch {
    // best-effort dir fsync
  }

  const consumedSegmentPaths = segments
    .filter((seg) => seg.path !== targetPath)
    .map((seg) => seg.path);

  let consumed = 0;
  if (deferDelete) {
    consumed = consumedSegmentPaths.length;
  } else {
    for (const segPath of consumedSegmentPaths) {
      try {
        fs.unlinkSync(segPath);
        consumed += 1;
      } catch {
        // tolerate concurrent deletion
      }
    }
  }

  return {
    compactedPath: targetPath,
    consumedSegments: consumed,
    consumedSegmentPaths: deferDelete ? consumedSegmentPaths : [],
    recordsWritten: latest.size,
    tombstonedDropped,
    skipped: null,
  };
}
317
+
318
/**
 * Record that a file has been removed from the indexed corpus by appending a
 * tombstone record (`deleted: true`, empty `contentHash`, no grams) for it.
 * Plan § 22.8.
 *
 * @param {string} baseArtifactPath
 * @param {number} epoch
 * @param {string} canonicalPath Path of the deleted file; also used to derive its `fileId`.
 * @param {string} [weightsId=FALLBACK_WEIGHTS_ID] Weights id stamped on the record.
 */
export function recordFileDeletion(baseArtifactPath, epoch, canonicalPath, weightsId = FALLBACK_WEIGHTS_ID) {
  const tombstone = {
    fileId: fileIdFor(canonicalPath),
    filePath: canonicalPath,
    contentHash: '',
    deleted: true,
    symbolMask: 0,
    weightsId,
    grams: [],
  };
  appendDeltaRecord(baseArtifactPath, epoch, tombstone);
}
@@ -0,0 +1,176 @@
1
+ /**
2
+ * FTS5 introspection helpers used by the reconcile watermark scheduler.
3
+ *
4
+ * Plan § 7.1.5 requires a `fts5SegmentCount(db, tableName)` helper so the
5
+ * watermark check (segment count > 64 → bounded `('merge', 500)`) lives in
6
+ * one place. SQLite's FTS5 keeps a structure record at rowid=10 of the
7
+ * `<name>_data` shadow table. The block format is documented in the FTS5
8
+ * source (fts5_index.c) and stable enough that we ship a tiny varint parser
9
+ * here. If a future SQLite version changes the layout, the helper switches
10
+ * to a fallback heuristic (leaf-page rowid bit shift) without losing the
11
+ * watermark.
12
+ *
13
+ * Reference: SQLite FTS5 docs §7 ("Internal storage of the index"); structure
14
+ * record format mirrors `Fts5StructureLevel` / `Fts5StructureSegment` in
15
+ * fts5_index.c.
16
+ *
17
+ * Plan § 0 / § 37.5: Phase 0 commits to verifying this empirically against
18
+ * the SQLite version in use. The verification record lives in
19
+ * INCREMENTAL_INDEXING_PREFLIGHT_RESULTS.md § 3.
20
+ */
21
+
22
+ const STRUCTURE_ROWID = 10;
23
+
24
/**
 * Decode one SQLite-style varint from `buf` starting at `offset`.
 *
 * Bytes are consumed most-significant first. Each of the first eight bytes
 * contributes its low 7 bits and uses the high bit as a "more follows" flag;
 * a ninth byte, if reached, contributes all 8 bits.
 *
 * @param {Uint8Array} buf
 * @param {number} offset
 * @returns {{value: bigint, consumed: number}} Decoded value and byte count.
 * @throws {Error} If the buffer ends before the varint is complete.
 */
function readVarint(buf, offset) {
  let value = 0n;
  for (let index = 0; index < 9; index += 1) {
    const position = offset + index;
    if (position >= buf.length) {
      throw new Error(`fts5 varint truncated at offset ${position} (buffer len ${buf.length})`);
    }
    const byte = buf[position];
    if (index === 8) {
      // The 9th byte uses all 8 bits and always terminates the varint.
      return { value: (value << 8n) | BigInt(byte), consumed: 9 };
    }
    value = (value << 7n) | BigInt(byte & 0x7F);
    const isLastByte = (byte & 0x80) === 0;
    if (isLastByte) {
      return { value, consumed: index + 1 };
    }
  }
  // Not reachable: the loop returns on the 9th byte at the latest.
  return { value, consumed: 0 };
}
47
+
48
/**
 * Return the number of segments stored in an FTS5 index.
 *
 * Implementation: count the distinct segment ids encoded in the rowids of
 * the `<name>_data` shadow table (rows with `id >= 100`; the low ids hold the
 * averages and structure records). Per the FTS5 source (`fts5_dri` in
 * fts5_index.c) a data rowid is laid out as
 *
 *   rowid = (segid  << (PAGE_B + HEIGHT_B + DLI_B))
 *         | (dlidx  << (PAGE_B + HEIGHT_B))
 *         | (height << PAGE_B)
 *         | pgno
 *
 * with `PAGE_B = 31`, `HEIGHT_B = 5` and `DLI_B = 1`, so the segment id
 * starts at bit 37. Bit 36 is the doclist-index flag: shifting by only 36
 * would count a segment twice as soon as it owns doclist-index pages.
 *
 * The id is additionally masked to 16 bits (`FTS5_DATA_ID_B`). NOTE(review):
 * recent SQLite versions store tombstone-hash pages for contentless-delete
 * tables under `segid + (1 << 16)`; the mask folds those back onto their
 * segment — confirm against the SQLite version in use.
 *
 * This function does not parse the structure record; use
 * `fts5StructureDescribe` for that.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName Name of the FTS5 virtual table (not the shadow).
 * @returns {number} Distinct segment count; 0 when the shadow table is
 *   missing or holds no segment pages.
 * @throws {Error} If `tableName` is not a plain identifier, or on any
 *   database error other than "no such table".
 */
export function fts5SegmentCount(db, tableName) {
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName)) {
    throw new Error(`fts5SegmentCount: invalid table name ${tableName}`);
  }
  // Safe to interpolate: the name was just validated as a bare identifier.
  const shadow = `${tableName}_data`;
  let rows;
  try {
    rows = db.prepare(`SELECT id FROM ${shadow} WHERE id >= 100`).all();
  } catch (err) {
    if (/no such table/i.test(err.message)) return 0;
    throw err;
  }
  if (rows.length === 0) return 0;

  const SEGID_SHIFT = 37n; // PAGE_B (31) + HEIGHT_B (5) + DLI_B (1)
  const SEGID_MASK = 0xFFFFn; // FTS5_DATA_ID_B = 16 bits
  const segmentIds = new Set();
  for (const row of rows) {
    const rowid = BigInt(row.id);
    segmentIds.add(Number((rowid >> SEGID_SHIFT) & SEGID_MASK));
  }
  return segmentIds.size;
}
95
+
96
/**
 * Internal: parse the FTS5 structure record at id=10 and return the per-level
 * segment counts. Exported only for tests / diagnostic tooling.
 *
 * Record layout followed here (per `fts5StructureDecode` in fts5_index.c —
 * NOTE(review): confirm against the SQLite version in use):
 *
 *   - 4-byte big-endian configuration cookie
 *   - optional 4-byte V2 marker `FF 00 00 01`
 *   - varints: number of levels, total number of segments, write counter
 *   - per level: nMerge, nSeg, then per segment: segid, first leaf page,
 *     last leaf page, plus five more varints in a V2 record (origin counters,
 *     tombstone page count, tombstone entry count, entry count)
 *
 * The write counter and the V2 fields are skipped, not returned. If the
 * per-level section is truncated or malformed, the levels decoded so far are
 * returned.
 *
 * Returns `{ cookie, nLevel, nSegment, levels: [{ nMerge, nSeg }] }` on
 * success, or `null` if the table has no structure record yet (missing
 * table, missing row, or a block shorter than 6 bytes).
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName
 * @throws {Error} If `tableName` is not a plain identifier, if the header
 *   varints are truncated, or on database errors other than "no such table".
 */
export function fts5StructureDescribe(db, tableName) {
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName)) {
    throw new Error(`fts5StructureDescribe: invalid table name ${tableName}`);
  }
  const shadow = `${tableName}_data`;
  let row;
  try {
    row = db.prepare(`SELECT block FROM ${shadow} WHERE id = ?`).get(STRUCTURE_ROWID);
  } catch (err) {
    if (/no such table/i.test(err.message)) return null;
    throw err;
  }
  if (!row || !row.block) return null;

  const buf = Buffer.isBuffer(row.block) ? row.block : Buffer.from(row.block);
  if (buf.length < 6) return null;
  const cookie = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) >>> 0;
  let offset = 4;

  // A V2 record announces itself with a marker that cannot be mistaken for a
  // sane level count; skip it so the varints that follow line up.
  const V2_MARKER = [0xFF, 0x00, 0x00, 0x01];
  const isV2 = buf.length >= offset + V2_MARKER.length
    && V2_MARKER.every((byte, i) => buf[offset + i] === byte);
  if (isV2) offset += V2_MARKER.length;

  // Decode the next varint and advance the cursor.
  const nextVarint = () => {
    const decoded = readVarint(buf, offset);
    offset += decoded.consumed;
    return decoded.value;
  };

  const nLevel = Number(nextVarint());
  const nSegment = Number(nextVarint());
  const levels = [];
  try {
    // The write counter sits between the header counts and the first level;
    // leaving it unread shifts every per-level field by one varint.
    nextVarint();
    const varintsPerSegment = isV2 ? 8 : 3;
    for (let level = 0; level < nLevel; level++) {
      const nMerge = Number(nextVarint());
      const nSeg = Number(nextVarint());
      levels.push({ nMerge, nSeg });
      for (let s = 0; s < nSeg; s++) {
        for (let field = 0; field < varintsPerSegment; field++) {
          nextVarint(); // per-segment fields are skipped, not reported
        }
      }
    }
  } catch {
    // Partial parse; return what we have. Phase 0 preflight asserts the
    // structure-record parse matches the rowid-shift count; mismatch is a
    // documented version-skew failure mode.
  }
  return {
    cookie,
    nLevel,
    nSegment,
    levels,
  };
}
152
+
153
/**
 * Run one bounded FTS5 merge ("incremental compaction") on a table by
 * issuing the special `INSERT ... VALUES('merge', pages)` command.
 * Plan § 7.1.5: every reconcile tick calls `('merge', 16)`; the watermark
 * scheduler calls `('merge', 500)`.
 *
 * `('optimize')` is deliberately never issued: it produces a single-
 * transaction rewrite of the FTS5 index, which trips the 256 MiB WAL bloat
 * alarm on a populated table.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName Plain identifier naming the FTS5 table.
 * @param {number} pages Page budget per merge call (plan defaults: 16 or 500).
 * @throws {Error} If `tableName` is not a plain identifier or `pages` is not
 *   a positive integer.
 */
export function fts5Merge(db, tableName, pages) {
  const isPlainIdentifier = /^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName);
  if (!isPlainIdentifier) {
    throw new Error(`fts5Merge: invalid table name ${tableName}`);
  }
  const isPositiveInteger = Number.isInteger(pages) && pages > 0;
  if (!isPositiveInteger) {
    throw new Error(`fts5Merge: pages must be a positive integer, got ${pages}`);
  }
  const sql = `INSERT INTO ${tableName}(${tableName}, rank) VALUES('merge', ?)`;
  const statement = db.prepare(sql);
  statement.run(pages);
}
175
+
176
+ export const __testing = { readVarint, STRUCTURE_ROWID };
@@ -0,0 +1,105 @@
1
+ /**
2
+ * CLI staleness footer.
3
+ *
4
+ * Plan § 19.1. Sweet-search shows a three-tier alert based on how long
5
+ * since the manifest was published, how many files are sitting dirty,
6
+ * and whether the maintenance queue is backed up:
7
+ *
8
+ * green < 60 s, 0 dirty → hidden
9
+ * yellow 60-300 s, ≤ 5 dirty → one-liner footer
10
+ * red > 300 s, > 5 dirty, or → explicit warning
11
+ * maintenance-queue backlog
12
+ *
13
+ * The display is informational. The reconciler does not block any CLI
14
+ * command on it; users decide whether to wait for the next tick or
15
+ * proceed against the slightly stale index.
16
+ */
17
+
18
// Tier labels returned by `stalenessTier`.
const GREEN = 'green';
const YELLOW = 'yellow';
const RED = 'red';

// Manifest age above which the index is yellow / red.
const YELLOW_AGE_MS = 60_000;
const RED_AGE_MS = 300_000;
// Smallest dirty-file count that makes the index yellow (green means 0 dirty).
const YELLOW_DIRTY = 1;
// Dirty-file count and maintenance backlog that must be EXCEEDED for red.
const RED_DIRTY = 5;
const RED_BACKLOG = 4;

/**
 * Classify the staleness tier given the inputs.
 *
 *   red     age > 300 s, or more than 5 dirty files, or backlog > 4
 *   yellow  otherwise, age > 60 s or at least 1 dirty file
 *   green   everything else (fresh manifest and no dirty files)
 *
 * Missing fields default to 0, so an empty (or omitted) input is green.
 *
 * @param {object} [input]
 * @param {number} [input.ageMs=0] How long since manifest publish.
 * @param {number} [input.dirtyFiles=0] Current dirty-set size.
 * @param {number} [input.maintenanceBacklog=0] Pending maintenance jobs.
 * @returns {'green'|'yellow'|'red'}
 */
export function stalenessTier(input = {}) {
  const { ageMs = 0, dirtyFiles = 0, maintenanceBacklog = 0 } = input;
  if (ageMs > RED_AGE_MS || dirtyFiles > RED_DIRTY || maintenanceBacklog > RED_BACKLOG) {
    return RED;
  }
  // `>=` on the dirty count: green is documented as "0 dirty", so a single
  // dirty file must already surface the footer.
  if (ageMs > YELLOW_AGE_MS || dirtyFiles >= YELLOW_DIRTY) {
    return YELLOW;
  }
  return GREEN;
}
47
+
48
/**
 * Format a duration in milliseconds as a short human-readable age:
 * `123ms`, `42s`, `3m 7s`, or `2h 15m`.
 *
 * The input is coerced with `Number()`. Values that are not finite numbers
 * (missing, NaN, ±Infinity) render as `unknown` instead of `NaNh NaNm`;
 * negative values (e.g. clock skew) are clamped to 0 and fractional
 * milliseconds are floored.
 *
 * @param {number} ms
 * @returns {string}
 */
function humaniseAge(ms) {
  const numeric = Number(ms);
  if (!Number.isFinite(numeric)) return 'unknown';
  const wholeMs = Math.max(0, Math.floor(numeric));
  if (wholeMs < 1000) return `${wholeMs}ms`;
  const s = Math.floor(wholeMs / 1000);
  if (s < 60) return `${s}s`;
  const m = Math.floor(s / 60);
  if (m < 60) return `${m}m ${s % 60}s`;
  const h = Math.floor(m / 60);
  return `${h}h ${m % 60}m`;
}
57
+
58
/**
 * Build the one-line staleness footer.
 *
 * Returns the empty string when the tier is green and `forceShow` is not
 * set. Otherwise the line lists the index epoch, manifest age and dirty-file
 * count, followed by the last maintenance run (only when
 * `lastMaintenanceTier` is truthy) and the backlog (only when positive). A
 * red tier gets the "stale index" warning prefix.
 *
 * @param {object} input
 * @param {number} input.epoch
 * @param {number} input.ageMs
 * @param {number} input.dirtyFiles
 * @param {string|null} input.lastMaintenanceTier
 * @param {number} input.lastMaintenanceAgeMs
 * @param {number} input.maintenanceBacklog
 * @param {boolean} [input.forceShow=false]
 * @returns {string}
 */
export function formatStalenessFooter(input) {
  const tier = stalenessTier(input);
  const hidden = tier === GREEN && !input.forceShow;
  if (hidden) return '';

  const fields = [
    `index epoch: ${input.epoch}`,
    `age: ${humaniseAge(input.ageMs)}`,
    `dirty files: ${input.dirtyFiles}`,
  ];
  if (input.lastMaintenanceTier) {
    const sinceMaintenance = humaniseAge(input.lastMaintenanceAgeMs);
    fields.push(`last maintenance: ${input.lastMaintenanceTier} ${sinceMaintenance} ago`);
  }
  if (input.maintenanceBacklog > 0) {
    fields.push(`backlog: ${input.maintenanceBacklog}`);
  }

  let lead;
  if (tier === RED) {
    lead = '[sweet-search] ⚠ stale index — ';
  } else {
    lead = '[sweet-search] ';
  }
  return lead + fields.join(' ');
}
89
+
90
/**
 * Render the footer as display lines: a separator rule followed by the
 * footer text, or no lines at all when the footer is empty.
 * Plan § 19.1 mock-up format.
 *
 * @param {object} input Same shape as accepted by `formatStalenessFooter`.
 * @returns {string[]}
 */
export function renderStalenessLines(input) {
  const footer = formatStalenessFooter(input);
  return footer ? ['─────────', footer] : [];
}
101
+
102
+ export const __testing = {
103
+ YELLOW_AGE_MS, RED_AGE_MS, YELLOW_DIRTY, RED_DIRTY, RED_BACKLOG,
104
+ GREEN, YELLOW, RED,
105
+ };