sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,182 @@
1
+ /**
2
+ * HCGS summary invalidation.
3
+ *
4
+ * Plan § 7.7. HCGS summaries are not one of the five first-stage indices,
5
+ * but stale summaries can still poison reader trust. The reconcile path
6
+ * therefore owns HCGS invalidation in v1:
7
+ *
8
+ * 1. Each summary records `(source_entity_ids, source_chunk_struct_ids,
9
+ * source_hashes, epoch_written, epoch_retired)`.
10
+ * 2. When graph/vector deltas retire or replace any source entity/chunk,
11
+ * mark the dependent summary retired in the same manifest epoch.
12
+ * 3. Search and MCP must never serve a summary whose source epoch is not
13
+ * visible in the pinned manifest. If a fresh summary is missing,
14
+ * omit it or trigger existing on-demand regeneration; do not serve
15
+ * the stale text.
16
+ * 4. Regeneration is low-priority CPU / existing provider policy and
17
+ * happens outside the reconcile tick. Invalidation is the
18
+ * correctness requirement; eager LLM regeneration is not.
19
+ *
20
+ * The current HCGS implementation (`core/graph/hcgs-generator.js`) stores
21
+ * `summary` and `summary_embedding` directly on the `entities` table. We
22
+ * add a sidecar table `hcgs_summary_metadata` that the reconciler updates
23
+ * in lock-step with the entity/vector deltas. The HCGS query path consults
24
+ * the sidecar to decide whether the live summary is visible at the pinned
25
+ * manifest epoch.
26
+ *
27
+ * Schema:
28
+ * CREATE TABLE hcgs_summary_metadata (
29
+ * entity_id TEXT PRIMARY KEY,
30
+ * source_entity_ids TEXT NOT NULL, -- JSON array
31
+ * source_chunk_struct_ids TEXT NOT NULL, -- JSON array
32
+ * source_hashes TEXT NOT NULL, -- JSON object {entity_id -> hash}
33
+ * epoch_written INTEGER NOT NULL DEFAULT 0,
34
+ * epoch_retired INTEGER
35
+ * ) WITHOUT ROWID;
36
+ *
37
+ * The reconciler writes one row per summary it observes; the maintenance
38
+ * worker prunes rows whose `epoch_retired` is older than `minLiveEpoch`.
39
+ */
40
+
41
/**
 * Create the HCGS sidecar table and its epoch indexes when they are missing.
 *
 * Three statements are issued through three separate `db.exec` calls: the
 * `hcgs_summary_metadata` table, an index on `epoch_written`, and a partial
 * index on `epoch_retired` restricted to rows where it is non-NULL. Every
 * statement carries `IF NOT EXISTS`.
 *
 * @param {import('better-sqlite3').Database} db
 */
export function ensureHcgsSidecarSchema(db) {
  const tableDdl = `
    CREATE TABLE IF NOT EXISTS hcgs_summary_metadata (
      entity_id TEXT PRIMARY KEY,
      source_entity_ids TEXT NOT NULL,
      source_chunk_struct_ids TEXT NOT NULL,
      source_hashes TEXT NOT NULL,
      epoch_written INTEGER NOT NULL DEFAULT 0,
      epoch_retired INTEGER
    ) WITHOUT ROWID
  `;
  const writtenIndexDdl = `
    CREATE INDEX IF NOT EXISTS idx_hcgs_meta_epoch_written
    ON hcgs_summary_metadata (epoch_written);
  `;
  const retiredIndexDdl = `
    CREATE INDEX IF NOT EXISTS idx_hcgs_meta_epoch_retired
    ON hcgs_summary_metadata (epoch_retired)
    WHERE epoch_retired IS NOT NULL;
  `;
  // Table first: both indexes reference it.
  for (const ddl of [tableDdl, writtenIndexDdl, retiredIndexDdl]) {
    db.exec(ddl);
  }
}
67
+
68
/**
 * Upsert the source-dependency snapshot for one summary.
 *
 * Inserts a row keyed by `entityId`; when a row already exists, its source
 * lists, hashes and `epoch_written` are overwritten and `epoch_retired` is
 * reset to NULL. Missing list/hash fields in `payload` are stored as `[]` /
 * `{}`; `payload.epoch` is coerced with `Number()`.
 *
 * This function does not open a transaction itself.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} entityId
 * @param {{sourceEntityIds:string[], sourceChunkStructIds:string[], sourceHashes:object, epoch:number}} payload
 */
export function recordSummary(db, entityId, payload) {
  const upsert = db.prepare(`
    INSERT INTO hcgs_summary_metadata (
      entity_id, source_entity_ids, source_chunk_struct_ids, source_hashes,
      epoch_written, epoch_retired
    ) VALUES (?, ?, ?, ?, ?, NULL)
    ON CONFLICT(entity_id) DO UPDATE SET
      source_entity_ids = excluded.source_entity_ids,
      source_chunk_struct_ids = excluded.source_chunk_struct_ids,
      source_hashes = excluded.source_hashes,
      epoch_written = excluded.epoch_written,
      epoch_retired = NULL
  `);
  // Bound in the same order as the VALUES placeholders above.
  const boundValues = [
    entityId,
    JSON.stringify(payload.sourceEntityIds ?? []),
    JSON.stringify(payload.sourceChunkStructIds ?? []),
    JSON.stringify(payload.sourceHashes ?? {}),
    Number(payload.epoch),
  ];
  upsert.run(...boundValues);
}
98
+
99
/**
 * Retire summaries whose source entities or chunks changed in this tick.
 *
 * Scans every live row (`epoch_retired IS NULL`) of `hcgs_summary_metadata`
 * and stamps `epoch_retired = epoch` on each row whose recorded source
 * entity ids or source chunk struct ids intersect the retired sets.
 *
 * A row whose stored source list cannot be parsed, or does not parse to an
 * array, is retired as well. Its dependencies are unknowable, so it is
 * treated as stale instead of throwing midway and leaving later rows
 * un-retired.
 *
 * This function does not open a transaction itself.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {{retiredEntityIds:Set<string>|Array<string>, retiredChunkStructIds:Set<string>|Array<string>}} sources
 * @param {number} epoch Manifest epoch to record as `epoch_retired`.
 * @returns {number} How many summaries were retired.
 */
export function retireDependentSummaries(db, sources, epoch) {
  const entitySet = new Set(Array.from(sources?.retiredEntityIds ?? []));
  const chunkSet = new Set(Array.from(sources?.retiredChunkStructIds ?? []));
  // Nothing retired this tick: skip the table scan entirely.
  if (entitySet.size === 0 && chunkSet.size === 0) return 0;

  const rows = db.prepare(`
    SELECT entity_id, source_entity_ids, source_chunk_struct_ids
    FROM hcgs_summary_metadata
    WHERE epoch_retired IS NULL
  `).all();
  const retireStmt = db.prepare(`
    UPDATE hcgs_summary_metadata
    SET epoch_retired = ?
    WHERE entity_id = ? AND epoch_retired IS NULL
  `);

  // Returns the parsed id array, or null when the stored value is unusable.
  const parseIdList = (raw) => {
    try {
      const parsed = JSON.parse(raw || '[]');
      return Array.isArray(parsed) ? parsed : null;
    } catch {
      return null;
    }
  };

  let count = 0;
  for (const row of rows) {
    const srcEnts = parseIdList(row.source_entity_ids);
    const srcChunks = parseIdList(row.source_chunk_struct_ids);
    const unreadable = srcEnts === null || srcChunks === null;
    const dependsOnRetired = unreadable
      || srcEnts.some((id) => entitySet.has(id))
      || srcChunks.some((id) => chunkSet.has(id));
    if (!dependsOnRetired) continue;
    retireStmt.run(epoch, row.entity_id);
    count += 1;
  }
  return count;
}
144
+
145
/**
 * Build the SQL predicate that keeps only sidecar rows visible at a manifest
 * epoch bound as the named parameter `:manifestEpoch`.
 *
 * The result is
 * `epoch_written <= :manifestEpoch
 *  AND (epoch_retired IS NULL OR epoch_retired > :manifestEpoch)`,
 * with each column prefixed by `<alias>.` when an alias is supplied. A single
 * trailing dot on the alias is tolerated and stripped before the prefix is
 * rebuilt. The alias is interpolated verbatim; it is not escaped.
 *
 * @param {string} [alias] Table alias, with or without one trailing `.`.
 * @returns {string}
 */
export function summaryVisibilityPredicate(alias = '') {
  const rawAlias = String(alias || '');
  const bareAlias = rawAlias.endsWith('.') ? rawAlias.slice(0, -1) : rawAlias;
  let prefix = '';
  if (bareAlias.length > 0) {
    prefix = `${bareAlias}.`;
  }
  const writtenClause = `${prefix}epoch_written <= :manifestEpoch`;
  const retiredClause = `AND (${prefix}epoch_retired IS NULL OR ${prefix}epoch_retired > :manifestEpoch)`;
  return [writtenClause, retiredClause].join(' ');
}
164
+
165
/**
 * Delete retired sidecar rows at or before the prune frontier.
 *
 * Removes every row whose `epoch_retired` is non-NULL and
 * `<= pruneFrontier`. Rows that were never retired are left alone.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {number} pruneFrontier Integer epoch; a non-integer throws.
 * @returns {number} Rows deleted, as reported by the statement (`0` when the
 *   driver reports no `changes` value).
 * @throws {Error} When `pruneFrontier` is not an integer.
 */
export function pruneRetiredSummaries(db, pruneFrontier) {
  const frontierIsInteger = Number.isInteger(pruneFrontier);
  if (!frontierIsInteger) {
    throw new Error('pruneRetiredSummaries: pruneFrontier must be an integer');
  }
  const deleteStmt = db.prepare(`
    DELETE FROM hcgs_summary_metadata
    WHERE epoch_retired IS NOT NULL
      AND epoch_retired <= ?
  `);
  const { changes } = deleteStmt.run(pruneFrontier);
  return changes ?? 0;
}
@@ -0,0 +1,278 @@
1
+ /**
2
+ * LI segment batch merge / compaction.
3
+ *
4
+ * The reconcile write path (`application/production-li-delta.mjs`) seals one
5
+ * new SSLX segment per tick that has any add op, so segment count grows
6
+ * ~1/tick forever. The per-segment `li_segment` handler only drops
7
+ * tombstoned docs *within* a segment — it never reduces the segment count.
8
+ * This module is the missing segment-count bound: it merges many small
9
+ * live segments into `ceil(liveDocs / SEGMENT_SIZE)` segments, dropping
10
+ * tombstoned docs, while leaving already-full sealed segments untouched
11
+ * (those are handled by the per-segment stale-ratio path).
12
+ *
13
+ * Publish safety (no mixed-epoch / no ENOENT for in-flight readers):
14
+ * 1. Sweep crash orphans (segment files not referenced by the manifest
15
+ * and not already quarantined) — makes a crashed prior merge idempotent.
16
+ * 2. Sweep the quarantine journal — physically delete consumed segment
17
+ * files whose grace window has elapsed.
18
+ * 3. Write the merged segment files under unique new names via
19
+ * `*.compacting.tmp` + rename (new names never collide with live ones,
20
+ * so a reader on the OLD manifest never sees a half-written file).
21
+ * 4. Atomically publish `manifest.json` (tmp + rename) listing the kept
22
+ * full segments + the new merged segments only.
23
+ * 5. Quarantine the consumed small-segment files (+ stale sidecars) — they
24
+ * stay readable for any reader still mid-`_loadSegmented` on the old
25
+ * manifest; physical deletion is deferred to a later pass once the
26
+ * grace window has elapsed.
27
+ *
28
+ * Crash recovery:
29
+ * - crash before (4): old manifest still references the old segments; the
30
+ * new files are orphans cleaned by step 1 on the next run.
31
+ * - crash after (4) before (5): new manifest is live; old files become
32
+ * orphans cleaned by step 1 on the next run (>= 1 tick later, well past
33
+ * any reader load window).
34
+ * - crash after (5): the quarantine sweep (step 2) finishes the deletion.
35
+ *
36
+ * Segment naming is collision-proof via a monotonic `nextSeq` persisted in
37
+ * the manifest (see `nextSegmentSeq`); the write path uses the same counter.
38
+ */
39
+
40
+ import fs from 'node:fs';
41
+ import fsp from 'node:fs/promises';
42
+ import path from 'node:path';
43
+ import { LateInteractionIndex } from '../../ranking/late-interaction-index.js';
44
+ import { STALE_SIDECAR_EXT, nextSegmentSeq, LI_SEGMENT_SIZE } from './li-segment-state.mjs';
45
+
46
+ export { nextSegmentSeq, LI_SEGMENT_SIZE };
47
+
48
+ /** Default quarantine grace before a consumed segment file is unlinked. */
49
+ export const LI_MERGE_GRACE_MS = 60_000;
50
+ const QUARANTINE_FILE = 'pending-delete.jsonl';
51
+
52
/**
 * Read and parse a JSON file, returning `fallback` on any failure
 * (missing file, unreadable file, or invalid JSON).
 *
 * @param {string} filePath
 * @param {*} [fallback=null]
 * @returns {*} Parsed JSON value, or `fallback`.
 */
function readJson(filePath, fallback = null) {
  let parsed;
  try {
    const text = fs.readFileSync(filePath, 'utf-8');
    parsed = JSON.parse(text);
  } catch {
    // Best-effort read: callers treat an unreadable file like an absent one.
    return fallback;
  }
  return parsed;
}
55
+
56
/**
 * Unlink a file without throwing.
 *
 * @param {string} filePath
 * @returns {boolean} `true` when the unlink succeeded, `false` when it threw
 *   for any reason (including the file not existing).
 */
function safeUnlink(filePath) {
  let removed = true;
  try {
    fs.unlinkSync(filePath);
  } catch {
    // Deliberately best-effort: the caller only needs to know it did not happen.
    removed = false;
  }
  return removed;
}
59
+
60
/**
 * Write `payload` as pretty-printed JSON to `filePath` via a sibling temp
 * file followed by a rename, so readers never observe a half-written file.
 *
 * The temp file is named `<filePath>.tmp.<pid>`. If the write or the rename
 * fails, the temp file is removed (best effort) before the original error is
 * rethrown, so a failed publish does not leave debris next to the target.
 *
 * @param {string} filePath Destination path.
 * @param {*} payload JSON-serializable value.
 * @returns {Promise<void>}
 * @throws Rethrows the underlying write/rename error.
 */
async function writeJsonAtomic(filePath, payload) {
  const tmp = `${filePath}.tmp.${process.pid}`;
  try {
    await fsp.writeFile(tmp, JSON.stringify(payload, null, 2));
    await fsp.rename(tmp, filePath);
  } catch (err) {
    // `force: true` makes this a no-op when the temp file was never created.
    await fsp.rm(tmp, { force: true }).catch(() => {});
    throw err;
  }
}
65
+
66
/**
 * Locate the segmented LI layout starting from the stub file
 * `codebase-late-interaction.db` inside `stateDir`.
 *
 * Returns `null` when the stub is absent or unreadable, when it is not a
 * `format: 'segmented'` stub with a `segmentDir`, or when the segment
 * directory's `manifest.json` is unreadable or lacks a `segments` array.
 *
 * @param {string} stateDir
 * @returns {{stubPath:string, segmentDir:string, manifestPath:string, manifest:object}|null}
 */
export function resolveSegmentedLayout(stateDir) {
  const stubPath = path.join(stateDir, 'codebase-late-interaction.db');
  if (!fs.existsSync(stubPath)) {
    return null;
  }

  const stub = readJson(stubPath);
  const stubIsSegmented = Boolean(stub) && stub.format === 'segmented' && Boolean(stub.segmentDir);
  if (!stubIsSegmented) {
    return null;
  }

  // `segmentDir` in the stub is resolved relative to the state directory.
  const segmentDir = path.resolve(stateDir, stub.segmentDir);
  const manifestPath = path.join(segmentDir, 'manifest.json');
  const manifest = readJson(manifestPath);
  const manifestUsable = Boolean(manifest) && Array.isArray(manifest.segments);
  if (!manifestUsable) {
    return null;
  }

  return { stubPath, segmentDir, manifestPath, manifest };
}
81
+
82
/**
 * Path of the quarantine journal file inside a segment directory.
 *
 * @param {string} segmentDir
 * @returns {string}
 */
function quarantinePath(segmentDir) {
  const journalPath = path.join(segmentDir, QUARANTINE_FILE);
  return journalPath;
}
85
+
86
/**
 * Load the quarantine journal (one JSON object per line).
 *
 * Blank lines and lines that fail to parse are skipped; parsed values are
 * kept only when they carry an array-valued `paths` property. A missing
 * journal file yields an empty list.
 *
 * @param {string} segmentDir
 * @returns {Array<object>} Journal entries in file order.
 */
function readQuarantine(segmentDir) {
  const journalPath = quarantinePath(segmentDir);
  if (!fs.existsSync(journalPath)) return [];

  const entries = [];
  const rawLines = fs.readFileSync(journalPath, 'utf-8').split('\n');
  rawLines.forEach((rawLine) => {
    const text = rawLine.trim();
    if (text === '') return;
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch {
      // A line that does not parse is ignored rather than failing the read.
      return;
    }
    if (parsed && Array.isArray(parsed.paths)) {
      entries.push(parsed);
    }
  });
  return entries;
}
100
+
101
/**
 * Replace the quarantine journal with `entries` (one JSON object per line).
 *
 * An empty list removes the journal file. Otherwise the new content is
 * written to `<journal>.tmp.<pid>` and renamed over the journal, so a crash
 * mid-write leaves the previous journal intact instead of a truncated file.
 * On failure the temp file is removed (best effort) and the error rethrown.
 *
 * @param {string} segmentDir
 * @param {Array<object>} entries
 * @returns {void}
 */
function writeQuarantine(segmentDir, entries) {
  const p = quarantinePath(segmentDir);
  if (entries.length === 0) { safeUnlink(p); return; }
  const body = entries.map((e) => JSON.stringify(e)).join('\n') + '\n';
  // The temp name ends in neither `.bin` nor the stale-sidecar suffix, so the
  // orphan sweep does not treat it as a segment file.
  const tmp = `${p}.tmp.${process.pid}`;
  try {
    fs.writeFileSync(tmp, body);
    fs.renameSync(tmp, p);
  } catch (err) {
    safeUnlink(tmp);
    throw err;
  }
}
106
+
107
/**
 * Unlink quarantined files whose grace window has passed and rewrite the
 * journal with the entries that are still waiting.
 *
 * An entry is expired when `now - retiredAtMs > graceMs`; a missing or falsy
 * `retiredAtMs` is treated as `0`. Expired entries are dropped from the
 * journal whether or not each unlink succeeded.
 *
 * @param {string} segmentDir
 * @param {number} [graceMs=LI_MERGE_GRACE_MS]
 * @param {number} [now=Date.now()]
 * @returns {number} files unlinked
 */
export function sweepQuarantine(segmentDir, graceMs = LI_MERGE_GRACE_MS, now = Date.now()) {
  const journal = readQuarantine(segmentDir);
  if (journal.length === 0) return 0;

  const stillPending = [];
  let unlinked = 0;
  for (const record of journal) {
    const ageMs = now - (record.retiredAtMs || 0);
    const expired = ageMs > graceMs;
    if (!expired) {
      stillPending.push(record);
      continue;
    }
    for (const rel of record.paths) {
      const removed = safeUnlink(path.join(segmentDir, rel));
      if (removed) unlinked += 1;
    }
  }
  writeQuarantine(segmentDir, stillPending);
  return unlinked;
}
130
+
131
/**
 * Delete crash-orphan files from a segment directory.
 *
 * Removes:
 *   - `*.bin` segment files that the manifest does not reference and that
 *     are not listed in the quarantine journal;
 *   - stale sidecars (`<segment><STALE_SIDECAR_EXT>`) whose base segment is
 *     neither referenced nor quarantined;
 *   - leftover `*.bin.compacting.tmp` files. The merge writes each new
 *     segment under that suffix before renaming it, so one that survives to
 *     a later sweep belongs to a merge that did not finish.
 *
 * Relies on the same assumption as the rest of this module: readers only
 * open paths the manifest lists, and there is a single writer, so nothing
 * swept here is in use.
 *
 * @param {string} segmentDir
 * @param {object} manifest Segment manifest (`segments[].path` are protected).
 * @returns {number} files unlinked
 */
export function sweepOrphanSegments(segmentDir, manifest) {
  // Suffix used by the merge for in-progress segment writes.
  const COMPACTING_TMP_SUFFIX = '.bin.compacting.tmp';

  let names;
  try { names = fs.readdirSync(segmentDir); } catch { return 0; }

  const referenced = new Set((manifest.segments || []).map((s) => s?.path));
  const quarantined = new Set();
  for (const entry of readQuarantine(segmentDir)) {
    for (const rel of entry.paths) quarantined.add(rel);
  }
  const isProtected = (segName) => referenced.has(segName) || quarantined.has(segName);
  const unlinkInDir = (name) => safeUnlink(path.join(segmentDir, name));

  let deleted = 0;
  for (const name of names) {
    if (name.endsWith(COMPACTING_TMP_SUFFIX)) {
      // Never listed by any manifest; always debris from an unfinished merge.
      if (unlinkInDir(name)) deleted += 1;
      continue;
    }
    if (name.endsWith(STALE_SIDECAR_EXT)) {
      // A sidecar lives and dies with its base segment file.
      const base = name.slice(0, -STALE_SIDECAR_EXT.length);
      if (!isProtected(base) && unlinkInDir(name)) deleted += 1;
      continue;
    }
    if (!name.endsWith('.bin')) continue;
    if (!isProtected(name) && unlinkInDir(name)) deleted += 1;
  }
  return deleted;
}
160
+
161
/**
 * Merge small live LI segments into fewer larger segments.
 *
 * Sequence:
 *   1. Sweep orphan segment files and the quarantine journal (always).
 *   2. With `sweepOnly: true`, stop here and report the sweep counts without
 *      loading or rewriting the index.
 *   3. Otherwise pick the "small" segments (finite `count` below
 *      `segmentSize`); skip when fewer than `minSmallSegments` exist.
 *   4. Load the index, collect the documents whose recorded segment position
 *      points into a small segment, and rewrite them in batches of
 *      `segmentSize` under fresh `segment-<seq>.bin` names (written as
 *      `*.compacting.tmp`, then renamed).
 *   5. Publish a new manifest listing the untouched segments plus the new
 *      ones, then journal the consumed segment files (and any stale
 *      sidecars) for deferred deletion.
 *
 * Safety additions over a plain merge:
 *   - If the loaded index exposes no per-document segment position map, the
 *     merge is skipped. Without it no document could be attributed to a
 *     small segment, and publishing would drop those segments from the
 *     manifest with nothing carried over.
 *   - If writing the new segments or publishing the manifest fails, the
 *     files produced by this run are removed before the error is rethrown.
 *     Once the manifest is published they are live and are never removed
 *     here.
 *
 * @param {string} stateDir
 * @param {{segmentSize?:number, graceMs?:number, minSmallSegments?:number, sweepOnly?:boolean}} [opts]
 * @returns {Promise<object>} Summary object; returns a `skipped` summary (does
 *   not throw) for a missing/legacy index.
 */
export async function mergeLiSegments(stateDir, opts = {}) {
  const segmentSize = Number.isInteger(opts.segmentSize) && opts.segmentSize > 0 ? opts.segmentSize : LI_SEGMENT_SIZE;
  const graceMs = Number.isFinite(opts.graceMs) && opts.graceMs >= 0 ? opts.graceMs : LI_MERGE_GRACE_MS;
  const minSmall = Number.isInteger(opts.minSmallSegments) && opts.minSmallSegments > 0 ? opts.minSmallSegments : 2;

  const layout = resolveSegmentedLayout(stateDir);
  if (!layout) return { skipped: 'no-segmented-index' };
  const { stubPath, segmentDir, manifestPath, manifest } = layout;

  // Step 1 + 2: housekeeping that must run regardless of whether we merge.
  const orphansDeleted = sweepOrphanSegments(segmentDir, manifest);
  const quarantineDeleted = sweepQuarantine(segmentDir, graceMs);

  if (opts.sweepOnly) {
    return { swept: true, orphansDeleted, quarantineDeleted, segmentCount: manifest.segments.length };
  }

  const isSmall = (s) => Number.isFinite(s.count) && s.count < segmentSize;
  const small = manifest.segments.filter(isSmall);
  if (small.length < minSmall) {
    return { skipped: 'too-few-small-segments', orphansDeleted, quarantineDeleted, segmentCount: manifest.segments.length };
  }
  const keptFull = manifest.segments.filter((s) => !isSmall(s));
  const smallNames = new Set(small.map((s) => s.path));
  const smallAbs = new Set(small.map((s) => path.join(segmentDir, s.path)));

  // Load the live docs (the loader drops tombstoned docs per the stale bitmap).
  const index = new LateInteractionIndex({ indexPath: stubPath, loadExisting: true, modelId: manifest.modelId || null });
  await index.init();

  // Without the position map every document would be skipped below and the
  // small segments would be published away empty-handed; refuse to merge.
  const positions = index._docSegmentPositions;
  if (!positions || typeof positions.get !== 'function') {
    return { skipped: 'no-doc-segment-positions', orphansDeleted, quarantineDeleted, segmentCount: manifest.segments.length };
  }

  const collected = [];
  for (const [docId, doc] of index.documents.entries()) {
    const pos = positions.get(docId);
    if (!pos || !smallAbs.has(pos.segmentPath)) continue;
    collected.push({ segmentPath: pos.segmentPath, docIndex: pos.docIndex, docId, doc });
  }
  // Deterministic output order: by source segment path, then position in it.
  collected.sort((a, b) => (a.segmentPath < b.segmentPath ? -1 : a.segmentPath > b.segmentPath ? 1
    : a.docIndex - b.docIndex));

  // Writer used purely as the SSLX serializer (model/quant params copied verbatim).
  const writer = new LateInteractionIndex({
    indexPath: stubPath,
    loadExisting: false,
    tokenDim: index.tokenDim,
    maxTokens: index.maxTokens,
    useInt8: index.useInt8,
    quantBits: index.quantBits,
    modelId: index.modelId,
    poolFactor: index.poolFactor,
    whtSeed: index.whtSeed,
    whtOrdering: index.whtOrdering,
    matryoshkaDim: index.matryoshkaDim,
  });
  await writer.init();

  let seq = nextSegmentSeq(manifest);
  const newSegments = [];
  const writtenFinals = [];
  let pendingTmp = null;
  let nextManifest;
  try {
    // Step 3: write merged segments under fresh names via tmp + rename.
    for (let i = 0; i < collected.length; i += segmentSize) {
      const batch = new Map();
      for (const { docId, doc } of collected.slice(i, i + segmentSize)) batch.set(docId, doc);
      const segName = `segment-${String(seq).padStart(4, '0')}.bin`;
      seq += 1;
      const finalPath = path.join(segmentDir, segName);
      const tmpPath = finalPath + '.compacting.tmp';
      pendingTmp = tmpPath;
      await writer._writeSegmentFile(tmpPath, batch);
      fs.renameSync(tmpPath, finalPath);
      pendingTmp = null;
      writtenFinals.push(finalPath);
      newSegments.push({ path: segName, count: batch.size });
    }

    // Step 4: atomic manifest publish (kept full segments + new merged segments).
    nextManifest = {
      ...manifest,
      segments: [...keptFull, ...newSegments],
      nextSeq: seq,
    };
    nextManifest.totalDocuments = nextManifest.segments.reduce((sum, s) => sum + (s?.count || 0), 0);
    await writeJsonAtomic(manifestPath, nextManifest);
  } catch (err) {
    // The manifest was not published, so nothing written by this run is
    // referenced anywhere; remove it instead of leaving it for a later sweep.
    if (pendingTmp) safeUnlink(pendingTmp);
    for (const finalPath of writtenFinals) safeUnlink(finalPath);
    throw err;
  }

  // Step 5: quarantine the consumed small-segment files (+ stale sidecars).
  const consumedPaths = [];
  for (const name of smallNames) {
    consumedPaths.push(name);
    if (fs.existsSync(path.join(segmentDir, name + STALE_SIDECAR_EXT))) {
      consumedPaths.push(name + STALE_SIDECAR_EXT);
    }
  }
  if (consumedPaths.length > 0) {
    const journal = readQuarantine(segmentDir);
    journal.push({ retiredAtMs: Date.now(), paths: consumedPaths });
    writeQuarantine(segmentDir, journal);
  }

  return {
    tier: 'li_segments',
    mergedFrom: small.length,
    mergedInto: newSegments.length,
    keptFull: keptFull.length,
    keptDocs: collected.length,
    segmentCountBefore: manifest.segments.length,
    segmentCountAfter: nextManifest.segments.length,
    orphansDeleted,
    quarantineDeleted,
    quarantined: consumedPaths.length,
  };
}
@@ -0,0 +1,173 @@
1
+ /**
2
+ * LI segment state and per-segment tombstone tracking.
3
+ *
4
+ * Plan § 7.5. Sweet-search's late-interaction index ships as SSLX v3
5
+ * segments of ≤ 10 K docs each. The newest segment is the "growing"
6
+ * write target; sealed segments are immutable. Edits to docs in a sealed
7
+ * segment produce two writes:
8
+ *
9
+ * - the old doc's bit is set in the sealed segment's `*.stale.bin`,
10
+ * - the new doc is appended to the growing segment.
11
+ *
12
+ * This module tracks per-segment stale counts and decides which segments
13
+ * cross the per-segment watermark (`stale_doc_ratio > 0.20` per plan
14
+ * § 7.5 step 3 / `domain/watermark-scheduler.mjs`).
15
+ *
16
+ * The stale bitmap for each segment lives at `<segmentPath>.stale.bin`
17
+ * and uses the layout in `infrastructure/tombstone-bitmap.mjs`.
18
+ */
19
+
20
+ import fs from 'node:fs';
21
+ import path from 'node:path';
22
+ import {
23
+ createBitmap, loadBitmap, saveBitmap,
24
+ resizeBitmap, setBit, isSet, filterLive,
25
+ } from './tombstone-bitmap.mjs';
26
+
27
+ export const STALE_SIDECAR_EXT = '.stale.bin';
28
+ /** SSLX sealed-segment capacity (docs). Mirrors LI_SEGMENT_SIZE in
29
+ * core/ranking/late-interaction-index.js; a segment with fewer docs is
30
+ * "small" and a candidate for batch merge. */
31
+ export const LI_SEGMENT_SIZE = 10_000;
32
+
33
/**
 * Path of the stale-bitmap sidecar for a segment file: the segment path
 * with `STALE_SIDECAR_EXT` appended.
 *
 * @param {string} segmentPath
 * @returns {string}
 */
function staleSidecarPath(segmentPath) {
  const sidecarPath = segmentPath + STALE_SIDECAR_EXT;
  return sidecarPath;
}
36
+
37
/**
 * Next monotonic segment sequence number for a segment manifest.
 *
 * The floor is one past the highest `segment-<n>.bin` index listed in
 * `manifest.segments` (`0` when none match). When the manifest carries an
 * integer `nextSeq`, the larger of `nextSeq` and that floor is returned.
 * Clamping means a stale or under-counted `nextSeq` can never produce a name
 * that collides with a segment the manifest still references.
 *
 * @param {object} manifest
 * @returns {number}
 */
export function nextSegmentSeq(manifest) {
  const segments = Array.isArray(manifest?.segments) ? manifest.segments : [];
  let maxIdx = -1;
  for (const seg of segments) {
    const m = typeof seg?.path === 'string' ? seg.path.match(/segment-(\d+)\.bin$/) : null;
    if (m) maxIdx = Math.max(maxIdx, Number(m[1]));
  }
  const floor = maxIdx + 1;
  return Number.isInteger(manifest?.nextSeq) ? Math.max(manifest.nextSeq, floor) : floor;
}
56
+
57
/**
 * Build the per-segment state object used while applying deltas.
 *
 * Loads the segment's stale bitmap from its sidecar. When none is loaded, a
 * fresh bitmap sized `max(1, docCount)` is created; when the loaded bitmap's
 * `capacity` is below `docCount`, it is resized to `docCount`.
 *
 * @param {string} segmentPath
 * @param {number} docCount
 * @returns {{segmentPath:string, docCount:number, bitmap:object}}
 */
export function openSegmentState(segmentPath, docCount) {
  const loaded = loadBitmap(staleSidecarPath(segmentPath));
  let bitmap;
  if (!loaded) {
    bitmap = createBitmap(Math.max(1, docCount));
  } else if (loaded.capacity < docCount) {
    bitmap = resizeBitmap(loaded, docCount);
  } else {
    bitmap = loaded;
  }
  return { segmentPath, docCount, bitmap };
}
76
+
77
/**
 * Mark a doc tombstoned by setting its bit in the segment's in-memory stale
 * bitmap.
 *
 * This does NOT write to disk. Call `persistSegmentState` afterwards to save
 * the bitmap so the tombstone survives a crash.
 *
 * @param {object} segmentState
 * @param {number} docIndex
 */
export function tombstoneDoc(segmentState, docIndex) {
  const { bitmap } = segmentState;
  setBit(bitmap, docIndex);
}
87
+
88
/**
 * Save the segment's stale bitmap to its sidecar file
 * (`<segmentPath><STALE_SIDECAR_EXT>`) through `saveBitmap`.
 *
 * @param {object} segmentState
 */
export function persistSegmentState(segmentState) {
  const { segmentPath, bitmap } = segmentState;
  const sidecarPath = staleSidecarPath(segmentPath);
  saveBitmap(sidecarPath, bitmap);
}
96
+
97
/**
 * Fraction of a segment's docs that are tombstoned, for the watermark check.
 *
 * Counts set bits for indices `0 .. docCount - 1` and divides by `docCount`.
 * A segment with `docCount === 0` reports `0`.
 *
 * @param {object} segmentState
 * @returns {number}
 */
export function staleDocRatio(segmentState) {
  const { docCount, bitmap } = segmentState;
  if (docCount === 0) return 0;

  let staleCount = 0;
  let idx = 0;
  while (idx < docCount) {
    if (isSet(bitmap, idx)) staleCount += 1;
    idx += 1;
  }
  return staleCount / docCount;
}
111
+
112
/**
 * List the segment files in a directory as full paths.
 *
 * A segment file is any entry ending in `.bin` that does not end in
 * `STALE_SIDECAR_EXT`. Names are sorted with the default (lexicographic)
 * sort before being joined onto `segmentsDir`. A missing directory yields an
 * empty list.
 *
 * @param {string} segmentsDir
 * @returns {string[]}
 */
export function listSegments(segmentsDir) {
  if (!fs.existsSync(segmentsDir)) return [];

  const segmentNames = [];
  for (const entry of fs.readdirSync(segmentsDir)) {
    const isBin = entry.endsWith('.bin');
    const isSidecar = entry.endsWith(STALE_SIDECAR_EXT);
    if (isBin && !isSidecar) segmentNames.push(entry);
  }
  segmentNames.sort();
  return segmentNames.map((entry) => path.join(segmentsDir, entry));
}
128
+
129
/**
 * Compute the stale ratio of every segment in a directory that has a known,
 * non-zero doc count.
 *
 * Doc counts are looked up in `docCountsBySegment` by the segment's full
 * path (as returned by `listSegments`). Segments with no entry, or with a
 * count of `0`, are left out of the result.
 *
 * @param {string} segmentsDir
 * @param {Map<string, number>} [docCountsBySegment] Doc count per segment path.
 * @returns {Array<{segmentId:string, staleDocRatio:number}>} `segmentId` is
 *   the segment file's base name.
 */
export function evaluateSegmentRatios(segmentsDir, docCountsBySegment = new Map()) {
  return listSegments(segmentsDir).reduce((ratios, segmentPath) => {
    const docCount = docCountsBySegment.get(segmentPath) ?? 0;
    if (docCount !== 0) {
      const state = openSegmentState(segmentPath, docCount);
      ratios.push({
        segmentId: path.basename(segmentPath),
        staleDocRatio: staleDocRatio(state),
      });
    }
    return ratios;
  }, []);
}
151
+
152
/**
 * Run a candidate doc-index list through `filterLive` against the segment's
 * stale bitmap and return its result.
 *
 * @param {object} segmentState
 * @param {number[]} candidates
 * @returns {number[]}
 */
export function filterLiveDocs(segmentState, candidates) {
  const { bitmap } = segmentState;
  return filterLive(bitmap, candidates);
}
163
+
164
/**
 * Report whether a doc's bit is set in the segment's stale bitmap.
 *
 * @param {object} segmentState
 * @param {number} docIndex
 * @returns {boolean} `true` when tombstoned, `false` when alive.
 */
export function isDocTombstoned(segmentState, docIndex) {
  const { bitmap } = segmentState;
  return isSet(bitmap, docIndex);
}