sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,119 @@
1
+ /**
2
+ * Single-writer lockfile with stale-owner recovery.
3
+ *
4
+ * Plan § 8.5 + § 8.6. The reconcile daemon, full reindex command, and
5
+ * maintenance worker all coordinate through `.sweet-search/
6
+ * index-maintainer.lock`. Each holder writes `{ pid, bootId, acquiredAt }`
7
+ * into the file; acquirers check whether the owner is still alive and
8
+ * release stale locks automatically.
9
+ *
10
+ * Stale-recovery rules (plan § 8.6 + § 11):
11
+ * - If `bootId` differs from current → stale; clear.
12
+ * - If `bootId` matches but `kill -0 pid` returns ESRCH → stale; clear.
13
+ * - If alive → fail-fast with conflicting pid.
14
+ *
15
+ * Plan § 8.6 "Crash-leak recovery": when a stale lock is cleared, the
16
+ * caller enqueues an immediate Float HNSW background replacement with
17
+ * `reason = "crash_recovery"` so the duplicate-append window from an
18
+ * interrupted tick self-heals within minutes.
19
+ */
20
+
21
+ import fs from 'node:fs';
22
+ import path from 'node:path';
23
+ import { bootIdStub } from './reader-heartbeat.mjs';
24
+
25
/** Basename of the single-writer lock file; `lockPath()` joins it onto the state directory. */
export const LOCK_FILENAME = 'index-maintainer.lock';
26
+
27
/**
 * Error thrown when the maintainer lock cannot be taken because another
 * owner record is in the way.
 *
 * The owner record that blocked the acquisition is exposed as `owner` so
 * callers can report the conflicting pid / bootId.
 */
class LockHeldError extends Error {
  /**
   * @param {{pid:number, bootId:string}} owner Record describing the current holder.
   */
  constructor(owner) {
    const { pid, bootId } = owner;
    super(`index-maintainer.lock held by pid ${pid} (bootId=${bootId})`);
    Object.assign(this, { name: 'LockHeldError', owner });
  }
}
34
+
35
/**
 * Location of the lock file inside a state directory.
 *
 * @param {string} stateDir
 * @returns {string} `stateDir` joined with `LOCK_FILENAME`.
 */
function lockPath(stateDir) {
  const segments = [stateDir, LOCK_FILENAME];
  return path.join(...segments);
}
38
+
39
/**
 * Read and parse the lock file in `stateDir`.
 *
 * @param {string} stateDir
 * @returns {object|null} The parsed owner record; `null` when no lock file
 *   exists; `{ corrupted: true }` when the file exists but cannot be read or
 *   does not contain a JSON object.
 */
function readLock(stateDir) {
  let raw;
  try {
    // Read directly instead of existsSync + read: a file that disappears in
    // between must be reported as "no lock", not as a corrupted lock.
    raw = fs.readFileSync(lockPath(stateDir), 'utf-8');
  } catch (err) {
    if (err.code === 'ENOENT' || err.code === 'ENOTDIR') return null;
    return { corrupted: true };
  }
  try {
    const parsed = JSON.parse(raw);
    // Valid JSON that is not an object (`null`, `0`, `false`, `""`, arrays)
    // cannot be an owner record. Returning it as-is would make falsy values
    // look like "no lock" even though the file is still on disk.
    const isRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isRecord ? parsed : { corrupted: true };
  } catch {
    return { corrupted: true };
  }
}
48
+
49
/**
 * Probe whether a process with the given pid currently exists.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal. `EPERM` means the process exists but belongs to
 * someone else, so it still counts as alive.
 *
 * @param {number} pid
 * @returns {boolean} `true` when the process exists; `false` otherwise or
 *   when `pid` is not a positive integer.
 */
function processAlive(pid) {
  // pid 0 and negative pids address process groups rather than one process,
  // so the probe would succeed for them and a lock recording such a pid
  // could never be recovered. Only positive integers name a single owner.
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err.code === 'EPERM';
  }
}
57
+
58
/**
 * Try to acquire the lockfile. Returns a token the caller passes to
 * `releaseLock()` when done. Throws `LockHeldError` when a live holder
 * has it, or when another acquirer creates the lock file first.
 *
 * An existing lock is treated as stale and removed when its content is
 * corrupted, when its `bootId` differs from the current one, or when its
 * pid is no longer alive. `staleCleared` on the returned token records
 * that such a removal happened.
 *
 * @param {string} stateDir
 * @returns {{pid:number, bootId:string, acquiredAt:string, staleCleared:boolean}}
 * @throws {LockHeldError} When the lock is owned by a live process, or the
 *   exclusive create loses to a concurrent acquirer (original error on `cause`).
 */
export function acquireLock(stateDir) {
  fs.mkdirSync(stateDir, { recursive: true });
  const target = lockPath(stateDir);
  const currentBoot = bootIdStub();
  const existing = readLock(stateDir);
  let staleCleared = false;
  if (existing) {
    const ownerAlive = !existing.corrupted
      && existing.bootId === currentBoot
      && processAlive(existing.pid);
    if (ownerAlive) throw new LockHeldError(existing);
    // Corrupted, different boot, or dead pid: the previous owner is gone.
    try { fs.unlinkSync(target); } catch { /* already removed */ }
    staleCleared = true;
  }
  const token = {
    pid: process.pid,
    bootId: currentBoot,
    acquiredAt: new Date().toISOString(),
    staleCleared,
  };
  let fd;
  try {
    // O_EXCL semantics via wx flag: exactly one concurrent acquirer wins.
    fd = fs.openSync(target, 'wx');
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
    // The file appeared (or could not be cleared) after the check above.
    // Report it through the documented error type instead of a raw EEXIST.
    const winner = readLock(stateDir);
    const owner = winner && !winner.corrupted ? winner : { pid: null, bootId: null };
    const held = new LockHeldError(owner);
    held.cause = err;
    throw held;
  }
  try {
    try {
      fs.writeSync(fd, JSON.stringify(token));
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
  } catch (err) {
    // Do not leave an empty or torn lock file behind when the write fails.
    try { fs.unlinkSync(target); } catch { /* best effort */ }
    throw err;
  }
  return token;
}
102
+
103
/**
 * Drop the lock if, and only if, the file on disk still names the caller.
 *
 * Nothing happens when the lock file is missing, unreadable, or records a
 * different pid / bootId than `token`; failures while unlinking are ignored.
 *
 * @param {string} stateDir
 * @param {{pid:number, bootId:string}} token Token returned by `acquireLock()`.
 */
export function releaseLock(stateDir, token) {
  const current = readLock(stateDir);
  const readable = Boolean(current) && !current.corrupted;
  // Someone else may have taken the lock since; never unlink theirs.
  const ours = readable && current.pid === token.pid && current.bootId === token.bootId;
  if (!ours) return;
  try {
    fs.unlinkSync(lockPath(stateDir));
  } catch {
    // Already gone: release stays idempotent.
  }
}
118
+
119
+ export { LockHeldError };
@@ -0,0 +1,283 @@
1
+ /**
2
+ * Maintenance-state readers.
3
+ *
4
+ * The reconciler invokes `readMaintenanceState()` at the end of each tick;
5
+ * `domain/watermark-scheduler.mjs::evaluateWatermarks` consumes the output
6
+ * and emits jobs when a tier crosses its threshold. Prior to this module,
7
+ * the production adapter reported only FTS5 and sparse-gram counters, so
8
+ * Float HNSW, Binary HNSW, and LI maintenance jobs never fired (see
9
+ * `eval/results/incremental-soak/REPORT.md` § 6.2 Finding 1).
10
+ *
11
+ * Each reader is intentionally cheap (small JSON + small bitmap reads) and
12
+ * degrades safely to a zero-metric object when an artifact is missing,
13
+ * legacy, or unreadable. Readers MUST NOT throw — the maintenance loop
14
+ * tolerates partial state, not exceptions.
15
+ */
16
+ import fs from 'node:fs';
17
+ import path from 'node:path';
18
+ import Database from 'better-sqlite3';
19
+
20
+ import { loadBitmap, popcount, tombstoneFraction } from './tombstone-bitmap.mjs';
21
+ import { deltaSizeStats } from './sparse-gram-delta.mjs';
22
+ import { evaluateSegmentRatios, LI_SEGMENT_SIZE } from './li-segment-state.mjs';
23
+ import { fts5SegmentCount } from './sqlite-fts5.mjs';
24
+
25
/**
 * Load and parse a JSON file, swallowing every failure.
 *
 * @param {string} filePath
 * @returns {*} The parsed value, or `null` when the file cannot be read or parsed.
 */
function readJson(filePath) {
  try {
    const text = fs.readFileSync(filePath, 'utf-8');
    return JSON.parse(text);
  } catch {
    return null;
  }
}
28
+
29
/**
 * FTS5: max segment count across the two graph FTS tables
 * (`entities_fts`, `entities_trigram`).
 *
 * Never throws: a missing, unopenable, or unmigrated graph DB yields
 * `{ segmentCount: 0 }`.
 *
 * @param {string} stateDir
 * @returns {{segmentCount:number}}
 */
export function readFts5State(stateDir) {
  const graph = path.join(stateDir, 'code-graph.db');
  if (!fs.existsSync(graph)) return { segmentCount: 0 };
  let db = null;
  try {
    // Opening happens inside the try so an unopenable DB degrades to zero
    // instead of escaping into the maintenance loop.
    db = new Database(graph, { readonly: true });
    const segmentCount = Math.max(
      fts5SegmentCount(db, 'entities_fts'),
      fts5SegmentCount(db, 'entities_trigram'),
    );
    return { segmentCount };
  } catch {
    return { segmentCount: 0 };
  } finally {
    try { db?.close(); } catch { /* closing is best effort */ }
  }
}
49
+
50
/**
 * Sparse-gram delta metrics for the compaction watermark.
 *
 * Asks `deltaSizeStats` about `codebase-sparse-grams.idx` and maps its
 * `ratio` / `deltaSegments` onto the watermark field names. Any failure
 * produces the zero-metric object.
 *
 * @param {string} stateDir
 * @returns {{deltaSizeRatio:number, deltaSegmentCount:number}}
 */
export function readSparseGramState(stateDir) {
  try {
    const basePath = path.join(stateDir, 'codebase-sparse-grams.idx');
    const { ratio, deltaSegments } = deltaSizeStats(basePath);
    return { deltaSizeRatio: ratio, deltaSegmentCount: deltaSegments };
  } catch {
    return { deltaSizeRatio: 0, deltaSegmentCount: 0 };
  }
}
62
+
63
/**
 * Float HNSW tombstone metrics.
 *
 * The live total is the length of `idMap` in `codebase-hnsw.meta.json`;
 * the soft-delete state is the bitmap at `codebase-hnsw.idx.stale.bin`.
 * Both are handed to `tombstoneFraction`. A missing or unreadable meta
 * file, or a bitmap that cannot be loaded, yields a fraction of 0.
 *
 * `deleteCycles` is always reported as 0 and `liveCandidateShortfall` is
 * left undefined — that signal must come from a query-path counter
 * (plan § 7.3), not from offline state.
 *
 * @param {string} stateDir
 * @returns {{tombstoneFraction:number, deleteCycles:number}}
 */
export function readFloatHnswState(stateDir) {
  const zeroState = { tombstoneFraction: 0, deleteCycles: 0 };
  const metaFile = path.join(stateDir, 'codebase-hnsw.meta.json');
  const meta = fs.existsSync(metaFile) ? readJson(metaFile) : null;
  if (!meta) return zeroState;

  const liveTotal = Array.isArray(meta.idMap) ? meta.idMap.length : 0;

  let staleBits;
  try {
    staleBits = loadBitmap(path.join(stateDir, 'codebase-hnsw.idx.stale.bin'));
  } catch {
    staleBits = null;
  }
  if (!staleBits) return { tombstoneFraction: 0, deleteCycles: 0 };
  return { tombstoneFraction: tombstoneFraction(staleBits, liveTotal), deleteCycles: 0 };
}
85
+
86
/**
 * Binary HNSW dead-row metrics.
 *
 * `vectorCount` in `codebase-binary-hnsw.meta.json` is the row total
 * (never pruned — see `binary-hnsw-index.js::save`). Two dead counts are
 * combined:
 *
 *  - `markedDead`: popcount of the stale bitmap at
 *    `codebase-binary-hnsw.idx.stale.bin`;
 *  - `divergentDead`: `total` minus the rows in `codebase.db` whose
 *    `epoch_retired` is NULL, i.e. rows no longer live there whether or
 *    not their stale bit was ever set.
 *
 * Never throws: a missing/unreadable meta file gives all zeros, and an
 * unopenable or unmigrated `codebase.db` falls back to the bitmap popcount.
 *
 * @param {string} stateDir
 * @returns {{deadDocRatio:number, unexplainedDead:number}}
 *   `deadDocRatio` = max(markedDead, divergentDead) / total;
 *   `unexplainedDead` = rows retired in codebase.db whose stale bit was
 *   never set (the signature of a retire that did not reach this tier).
 */
export function readBinaryHnswState(stateDir) {
  // Same shape on every path so consumers can read both fields unconditionally.
  const empty = { deadDocRatio: 0, unexplainedDead: 0 };
  const metaPath = path.join(stateDir, 'codebase-binary-hnsw.meta.json');
  if (!fs.existsSync(metaPath)) return empty;
  const meta = readJson(metaPath);
  if (!meta) return empty;
  const total = Number(meta.vectorCount) || 0;
  if (total <= 0) return empty;

  let markedDead = 0;
  try {
    const bitmap = loadBitmap(path.join(stateDir, 'codebase-binary-hnsw.idx.stale.bin'));
    markedDead = bitmap ? popcount(bitmap) : 0;
  } catch {
    markedDead = 0;
  }

  let divergentDead = 0;
  const dbPath = path.join(stateDir, 'codebase.db');
  if (fs.existsSync(dbPath)) {
    let db = null;
    try {
      // Open inside the try: an unopenable DB must degrade to the bitmap
      // fallback rather than throw out of a reader.
      db = new Database(dbPath, { readonly: true });
      const cols = db.prepare('PRAGMA table_info(vectors)').all().map((c) => c.name);
      if (cols.includes('epoch_retired')) {
        const live = db.prepare('SELECT COUNT(*) AS n FROM vectors WHERE epoch_retired IS NULL').get().n || 0;
        divergentDead = Math.max(0, total - live);
      }
    } catch {
      divergentDead = 0;
    } finally {
      try { db?.close(); } catch { /* closing is best effort */ }
    }
  }

  const dead = Math.max(markedDead, divergentDead);
  const unexplainedDead = Math.max(0, divergentDead - markedDead);
  return { deadDocRatio: dead / total, unexplainedDead };
}
127
+
128
/**
 * Per-segment stale ratios for the late-interaction (LI) index.
 *
 * Follows the `codebase-late-interaction.db` stub to its segment
 * directory, reads `manifest.json` there, and passes a map of
 * `segment file path -> doc count` to `evaluateSegmentRatios`. Manifest
 * entries without a string `path` and a finite `count` are ignored.
 *
 * @param {string} stateDir
 * @returns {Array} Whatever `evaluateSegmentRatios` returns, or `[]` for
 *   legacy unsegmented indices, missing files, an empty manifest, or an
 *   evaluation failure.
 */
export function readLiSegmentsState(stateDir) {
  const stubPath = path.join(stateDir, 'codebase-late-interaction.db');
  const stub = fs.existsSync(stubPath) ? readJson(stubPath) : null;
  const isSegmented = stub && stub.format === 'segmented' && stub.segmentDir;
  if (!isSegmented) return [];

  const segmentDir = path.resolve(stateDir, stub.segmentDir);
  const manifestPath = path.join(segmentDir, 'manifest.json');
  const manifest = fs.existsSync(manifestPath) ? readJson(manifestPath) : null;
  if (!manifest || !Array.isArray(manifest.segments)) return [];

  const docCounts = new Map(
    manifest.segments
      .filter((seg) => seg && typeof seg.path === 'string' && Number.isFinite(seg.count))
      .map((seg) => [path.join(segmentDir, seg.path), seg.count]),
  );
  if (docCounts.size === 0) return [];

  try {
    return evaluateSegmentRatios(segmentDir, docCounts);
  } catch {
    return [];
  }
}
155
+
156
/**
 * LI segment-count stats for the batch-merge watermark.
 *
 * - `segmentCount`: number of entries in the segment manifest.
 * - `smallSegmentCount`: entries whose finite `count` is below
 *   `LI_SEGMENT_SIZE`.
 * - `pendingDeleteFiles`: total length of the `paths` arrays recorded in
 *   `pending-delete.jsonl` (quarantined files awaiting a sweep); blank or
 *   unparseable lines are skipped and a missing journal counts as 0.
 *
 * Legacy / missing indices return all zeros.
 *
 * @param {string} stateDir
 * @returns {{segmentCount:number, smallSegmentCount:number, pendingDeleteFiles:number}}
 */
export function readLiSegmentStats(stateDir) {
  const zeros = { segmentCount: 0, smallSegmentCount: 0, pendingDeleteFiles: 0 };

  const stubPath = path.join(stateDir, 'codebase-late-interaction.db');
  const stub = fs.existsSync(stubPath) ? readJson(stubPath) : null;
  const isSegmented = stub && stub.format === 'segmented' && stub.segmentDir;
  if (!isSegmented) return zeros;

  const segmentDir = path.resolve(stateDir, stub.segmentDir);
  const manifestPath = path.join(segmentDir, 'manifest.json');
  const manifest = fs.existsSync(manifestPath) ? readJson(manifestPath) : null;
  if (!manifest || !Array.isArray(manifest.segments)) return zeros;

  const smallSegmentCount = manifest.segments
    .filter((seg) => Number.isFinite(seg?.count) && seg.count < LI_SEGMENT_SIZE)
    .length;

  // Surfaced so the watermark can re-trigger the merge handler to drain
  // quarantined files even when no segment-count merge is otherwise due.
  let pendingDeleteFiles = 0;
  try {
    const journal = fs.readFileSync(path.join(segmentDir, 'pending-delete.jsonl'), 'utf-8');
    for (const rawLine of journal.split('\n')) {
      const text = rawLine.trim();
      if (text.length === 0) continue;
      let entry;
      try {
        entry = JSON.parse(text);
      } catch {
        continue; // torn line
      }
      if (entry && Array.isArray(entry.paths)) pendingDeleteFiles += entry.paths.length;
    }
  } catch {
    // no pending-delete journal
  }

  return { segmentCount: manifest.segments.length, smallSegmentCount, pendingDeleteFiles };
}
194
+
195
/**
 * Retired-vector counts in `codebase.db` for the physical-GC watermark.
 *
 * `retiredCount` = rows of `vectors` with a non-null `epoch_retired`;
 * `totalCount` = all rows; `retiredRatio` = retired / total (0 when the
 * table is empty).
 *
 * Never throws: returns zeros when the DB is absent or cannot be opened,
 * or when the table / column is missing.
 *
 * @param {string} stateDir
 * @returns {{retiredCount:number, retiredRatio:number, totalCount:number}}
 */
export function readVectorGcState(stateDir) {
  const empty = { retiredCount: 0, retiredRatio: 0, totalCount: 0 };
  const dbPath = path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(dbPath)) return empty;
  let db = null;
  try {
    // Open inside the try so an unopenable DB degrades to zeros.
    db = new Database(dbPath, { readonly: true });
    const cols = db.prepare('PRAGMA table_info(vectors)').all().map((c) => c.name);
    if (!cols.includes('epoch_retired')) return empty;
    const total = db.prepare('SELECT COUNT(*) AS n FROM vectors').get().n || 0;
    const retired = db.prepare('SELECT COUNT(*) AS n FROM vectors WHERE epoch_retired IS NOT NULL').get().n || 0;
    return { retiredCount: retired, retiredRatio: total > 0 ? retired / total : 0, totalCount: total };
  } catch {
    return empty;
  } finally {
    try { db?.close(); } catch { /* closing is best effort */ }
  }
}
217
+
218
/**
 * Retired-row counts in `code-graph.db` for the graph physical-GC watermark.
 *
 * `retiredEntities` / `retiredRelationships` / `retiredSummaries` count rows
 * with a non-null `epoch_retired` in `entities`, `relationships` and
 * `hcgs_summary_metadata`; each stays 0 when its table lacks the column.
 * `retiredRows` is the combined entity + relationship tombstone count used
 * by the count watermark (summaries are not included).
 * `retiredEntityRatio` = retiredEntities / totalEntities (0 when empty).
 *
 * Never throws: returns zeros when the DB is absent or cannot be opened,
 * or when a query fails.
 *
 * @param {string} stateDir
 * @returns {{retiredEntities:number, retiredRelationships:number,
 *   retiredSummaries:number, retiredRows:number, totalEntities:number,
 *   retiredEntityRatio:number}}
 */
export function readGraphGcState(stateDir) {
  const empty = {
    retiredEntities: 0, retiredRelationships: 0, retiredSummaries: 0,
    retiredRows: 0, totalEntities: 0, retiredEntityRatio: 0,
  };
  const dbPath = path.join(stateDir, 'code-graph.db');
  if (!fs.existsSync(dbPath)) return empty;
  let db = null;
  try {
    // Open inside the try so an unopenable DB degrades to zeros.
    db = new Database(dbPath, { readonly: true });
    // `table` only ever receives the literal names below, never external input.
    const tableHas = (table, column) => {
      try { return db.prepare(`PRAGMA table_info(${table})`).all().some((c) => c.name === column); }
      catch { return false; }
    };
    let retiredEntities = 0;
    let totalEntities = 0;
    if (tableHas('entities', 'epoch_retired')) {
      totalEntities = db.prepare('SELECT COUNT(*) AS n FROM entities').get().n || 0;
      retiredEntities = db.prepare('SELECT COUNT(*) AS n FROM entities WHERE epoch_retired IS NOT NULL').get().n || 0;
    }
    let retiredRelationships = 0;
    if (tableHas('relationships', 'epoch_retired')) {
      retiredRelationships = db.prepare('SELECT COUNT(*) AS n FROM relationships WHERE epoch_retired IS NOT NULL').get().n || 0;
    }
    let retiredSummaries = 0;
    if (tableHas('hcgs_summary_metadata', 'epoch_retired')) {
      retiredSummaries = db.prepare('SELECT COUNT(*) AS n FROM hcgs_summary_metadata WHERE epoch_retired IS NOT NULL').get().n || 0;
    }
    return {
      retiredEntities,
      retiredRelationships,
      retiredSummaries,
      retiredRows: retiredEntities + retiredRelationships,
      totalEntities,
      retiredEntityRatio: totalEntities > 0 ? retiredEntities / totalEntities : 0,
    };
  } catch {
    return empty;
  } finally {
    try { db?.close(); } catch { /* closing is best effort */ }
  }
}
266
+
267
/**
 * One-shot bundle for the reconciler adapter: runs every tier reader
 * against `stateDir` and returns their results under the keys
 * `evaluateWatermarks` expects (`fts5 / sparseGram / floatHnsw /
 * binaryHnsw / liSegments / liSegmentStats / vectors / graph`).
 *
 * @param {string} stateDir
 * @returns {object}
 */
export function readMaintenanceState(stateDir) {
  const fts5 = readFts5State(stateDir);
  const sparseGram = readSparseGramState(stateDir);
  const floatHnsw = readFloatHnswState(stateDir);
  const binaryHnsw = readBinaryHnswState(stateDir);
  const liSegments = readLiSegmentsState(stateDir);
  const liSegmentStats = readLiSegmentStats(stateDir);
  const vectors = readVectorGcState(stateDir);
  const graph = readGraphGcState(stateDir);
  return {
    fts5,
    sparseGram,
    floatHnsw,
    binaryHnsw,
    liSegments,
    liSegmentStats,
    vectors,
    graph,
  };
}
@@ -0,0 +1,194 @@
1
+ /**
2
+ * Reconcile manifest reader / writer.
3
+ *
4
+ * Plan § 8.1. The manifest is the single source of truth for "which tier
5
+ * artifacts and sidecars belong to which reconcile epoch". Readers pin
6
+ * one manifest at query start and use only the paths it names for the
7
+ * full query duration; the reconciler stages all per-tier writes under
8
+ * `<artifact>.next` (or per-tier append paths) and publishes a new
9
+ * manifest atomically via `fsync + rename`.
10
+ *
11
+ * Manifest shape (plan § 8.1):
12
+ *
13
+ * {
14
+ * "epoch": 12847,
15
+ * "publishedAt": "2026-05-15T23:00:00.000Z",
16
+ * "codeGraph": { "path": "code-graph.db", "epoch": 12847 },
17
+ * "vectors": { "path": "codebase.db", "epoch": 12847 },
18
+ * "hnsw": { "path": "codebase-hnsw.idx",
19
+ * "stale": "codebase-hnsw.idx.stale.bin",
20
+ * "epoch": 12847 },
21
+ * "binaryHnsw": { "path": "codebase-binary-hnsw.idx", "epoch": 12847 },
22
+ * "lateInteraction":{ "manifest": "codebase-late-interaction.db.segments/manifest.json",
23
+ * "epoch": 12847 },
24
+ * "sparseGram": { "base": "codebase-sparse-grams.idx",
25
+ * "deltas": ["codebase-sparse-grams.idx.deltas/12847-0.ssgrmdelta"],
26
+ * "weightsId": "common-code-bigram-v1",
27
+ * "epoch": 12847 }
28
+ * }
29
+ *
30
+ * The manifest is written to `.sweet-search/reconcile-manifest.json` with
31
+ * a `*.tmp` staging file + fsync + rename + parent-dir fsync. Readers MUST
32
+ * load the manifest fresh per query (cheap — single JSON read of a small
33
+ * file); the reconciler is the only writer.
34
+ */
35
+
36
+ import fs from 'node:fs';
37
+ import path from 'node:path';
38
+
39
/** Basename of the reconcile manifest inside the state directory. */
export const MANIFEST_FILENAME = 'reconcile-manifest.json';

/** Fallback `sparseGram.weightsId` that `zeroManifest` uses when the caller supplies none. */
export const DEFAULT_SPARSE_GRAM_WEIGHTS_ID = 'common-code-bigram-v1';
41
+
42
/**
 * Location of the live manifest inside a state directory.
 *
 * @param {string} stateDir
 * @returns {string} `stateDir` joined with `MANIFEST_FILENAME`.
 */
function manifestPath(stateDir) {
  const segments = [stateDir, MANIFEST_FILENAME];
  return path.join(...segments);
}
45
+
46
/**
 * Location of the `.tmp` staging file that `writeManifest` fills before
 * renaming it over the live manifest.
 *
 * @param {string} stateDir
 * @returns {string}
 */
function tempManifestPath(stateDir) {
  const stagingName = `${MANIFEST_FILENAME}.tmp`;
  return path.join(stateDir, stagingName);
}
49
+
50
/**
 * Build a zero-state manifest for a brand-new index. Plan § 25.2:
 * the manifest is created at first reconcile tick alongside the
 * STATE_VERSION bump.
 *
 * Every tier starts at epoch 0. Each filename falls back to its default
 * when the corresponding key is missing (or falsy) in `paths`, so the
 * argument may be omitted entirely.
 *
 * @param {object} [paths] per-tier filename map relative to stateDir.
 *   Recognised keys: `codeGraph`, `vectors`, `hnsw`, `hnswStale`,
 *   `binaryHnsw`, `liManifest`, `sparseBase`, `sparseDeltas`, `weightsId`.
 * @returns {object}
 */
export function zeroManifest(paths = {}) {
  // Also tolerate an explicit `null`: all fields have defaults anyway.
  const overrides = paths ?? {};
  return {
    epoch: 0,
    publishedAt: new Date().toISOString(),
    codeGraph: { path: overrides.codeGraph || 'code-graph.db', epoch: 0 },
    vectors: { path: overrides.vectors || 'codebase.db', epoch: 0 },
    hnsw: {
      path: overrides.hnsw || 'codebase-hnsw.idx',
      stale: overrides.hnswStale || 'codebase-hnsw.idx.stale.bin',
      epoch: 0,
    },
    binaryHnsw: {
      path: overrides.binaryHnsw || 'codebase-binary-hnsw.idx',
      epoch: 0,
    },
    lateInteraction: {
      manifest: overrides.liManifest || 'codebase-late-interaction.db.segments/manifest.json',
      epoch: 0,
    },
    sparseGram: {
      base: overrides.sparseBase || 'codebase-sparse-grams.idx',
      deltas: overrides.sparseDeltas || [],
      weightsId: overrides.weightsId || DEFAULT_SPARSE_GRAM_WEIGHTS_ID,
      epoch: 0,
    },
  };
}
85
+
86
/**
 * Read the current manifest. Returns `null` when none has been written
 * yet — callers fall back to the legacy "open whatever file exists"
 * behaviour until the reconciler runs at least once.
 *
 * A manifest that cannot be read, cannot be parsed, or does not contain a
 * JSON object is also reported as `null`: a corrupted manifest is
 * recoverable because the next reconcile tick rebuilds it from the
 * per-tier artifacts on disk.
 *
 * @param {string} stateDir
 * @returns {object|null}
 */
export function readManifest(stateDir) {
  let raw;
  try {
    raw = fs.readFileSync(manifestPath(stateDir), 'utf-8');
  } catch {
    // Missing or unreadable: same outcome as "not written yet".
    return null;
  }
  try {
    const parsed = JSON.parse(raw);
    // Valid JSON that is not an object (a number, string, array, ...) is
    // not a manifest; honour the `object|null` contract instead of leaking it.
    const isRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isRecord ? parsed : null;
  } catch {
    return null;
  }
}
107
+
108
/**
 * Atomic write of a new manifest. Plan § 8.1 step 4 requires:
 *
 *   1. write `*.tmp`
 *   2. fsync the temp file
 *   3. atomically rename to the live name
 *   4. fsync the parent directory so the rename survives a power loss
 *
 * If staging or the rename fails, the temp file is removed again and the
 * original error is rethrown; the live manifest is left untouched.
 *
 * @param {string} stateDir
 * @param {object} manifest
 * @throws {Error} Whatever `JSON.stringify` or the underlying fs call raised.
 */
export function writeManifest(stateDir, manifest) {
  fs.mkdirSync(stateDir, { recursive: true });
  const tmp = tempManifestPath(stateDir);
  const live = manifestPath(stateDir);
  const data = JSON.stringify(manifest, null, 2);

  try {
    const fd = fs.openSync(tmp, 'w');
    try {
      // writeFileSync on a descriptor keeps writing until the whole payload
      // is out, whereas a single writeSync may legally stop short.
      fs.writeFileSync(fd, data);
      fs.fsyncSync(fd);
    } finally {
      fs.closeSync(fd);
    }
    fs.renameSync(tmp, live);
  } catch (err) {
    // Never leave a stale staging file behind on failure.
    try { fs.unlinkSync(tmp); } catch { /* nothing to clean up */ }
    throw err;
  }

  // Best-effort dir fsync. Some filesystems (e.g. tmpfs in containers)
  // throw EINVAL on directory fsync; that is harmless because no power
  // loss can affect them.
  try {
    const dirFd = fs.openSync(stateDir, 'r');
    try { fs.fsyncSync(dirFd); } finally { fs.closeSync(dirFd); }
  } catch {
    // ignore
  }
}
143
+
144
/**
 * Derive the manifest for the next epoch.
 *
 * Starts from a shallow copy of `prev`, stamps the new `epoch` and a
 * fresh `publishedAt`, then rebuilds each known tier descriptor as
 * `previous descriptor + override (if any) + new epoch`. Tiers without
 * an override are carried forward with only their epoch advanced. The
 * reconciler calls this once per tick after the per-tier writes finish
 * staging.
 *
 * @param {object} prev Previous manifest (use `zeroManifest` on first tick).
 * @param {{epoch:number, tiers:object}} delta
 * @returns {object}
 * @throws {Error} When `delta.epoch` is not an integer.
 */
export function buildNextManifest(prev, delta) {
  const { epoch } = delta;
  if (!Number.isInteger(epoch)) {
    throw new Error('buildNextManifest: delta.epoch must be an integer');
  }
  const next = { ...prev, epoch, publishedAt: new Date().toISOString() };
  const overrides = delta.tiers || {};
  const tierKeys = ['codeGraph', 'vectors', 'hnsw', 'binaryHnsw', 'lateInteraction', 'sparseGram'];
  for (const tier of tierKeys) {
    // A missing override spreads as nothing, which is exactly "carry forward".
    next[tier] = { ...(prev[tier] || {}), ...(overrides[tier] || {}), epoch };
  }
  return next;
}
173
+
174
/**
 * SQL predicate fragment selecting the rows visible at a given manifest
 * epoch (plan § 8.1.1 / § 7.2):
 *
 *     epoch_written <= :manifestEpoch
 *     AND (epoch_retired IS NULL OR epoch_retired > :manifestEpoch)
 *
 * The fragment uses the named parameter `:manifestEpoch` so callers can
 * bind the value once and reuse the prepared statement across queries.
 *
 * @param {string} [alias] Optional table alias, with or without the
 *   trailing dot (e.g. `'v.'` or `'e'`).
 * @returns {string}
 */
export function epochVisibilityPredicate(alias = '') {
  // Accept both 'v' and 'v.': strip one trailing dot, then re-add it.
  const bare = alias.replace(/\.$/, '');
  const prefix = bare.length > 0 ? `${bare}.` : '';
  const written = `${prefix}epoch_written`;
  const retired = `${prefix}epoch_retired`;
  return [
    `${written} <= :manifestEpoch`,
    `(${retired} IS NULL OR ${retired} > :manifestEpoch)`,
  ].join(' AND ');
}