sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,314 @@
1
+ /**
2
+ * Retired-row physical GC for `code-graph.db`.
3
+ *
4
+ * The reconcile path only *tombstones* superseded graph rows: `applyGraphDelta`
5
+ * sets `epoch_retired` on retired entities/relationships (and HCGS summaries),
6
+ * but nothing ever deletes them, so `code-graph.db` grows without bound under a
7
+ * long-lived daemon. The post-fix soak ended with ~90% of entity rows retired
8
+ * (3466/3870, ~3.8 MB), which is the dominant long-run growth source once the
9
+ * vector / LI / sparse tiers are bounded. This module physically removes
10
+ * retired graph rows once no reader can still observe them.
11
+ *
12
+ * Safety (strict visibility — mirrors `vector-gc.mjs` § 8.1.1):
13
+ * An entity row is visible to a reader pinned at manifest epoch E iff
14
+ * (epoch_written IS NULL OR epoch_written <= E)
15
+ * AND (epoch_retired IS NULL OR epoch_retired > E)
16
+ * AND (stale_since IS NULL OR (epoch_retired IS NOT NULL AND epoch_retired > E))
17
+ * Relationships and HCGS summaries use the same epoch_written/epoch_retired
18
+ * rule. So a row with `epoch_retired = R` is invisible to every reader whose
19
+ * pinned epoch E satisfies E >= R. The smallest epoch any live reader pins is
20
+ * `minLiveEpoch` (from reader heartbeats); the repository always re-syncs to
21
+ * the latest manifest, so a reader's query epoch never drops below its
22
+ * heartbeat epoch. Deleting rows with `epoch_retired <= frontier`, where
23
+ * `frontier = minLiveEpoch ?? currentManifestEpoch`, can never remove a row
24
+ * any reader still sees:
25
+ * - readers present → frontier = minLiveEpoch <= every reader's epoch.
26
+ * - no readers → frontier = currentManifestEpoch; any future reader
27
+ * reads a manifest at epoch >= currentManifestEpoch (monotonic), so the
28
+ * deleted rows (retired <= currentManifestEpoch <= future E) are already
29
+ * invisible to it.
30
+ *
31
+ * Reference integrity: deleting a retired entity cannot orphan a *live*
32
+ * relationship. The read path LEFT JOINs relationships to entities under the
33
+ * entity-visibility filter; a retired entity (epoch_retired <= frontier <= E)
34
+ * is already filtered out of that join at every visible epoch, so any live
35
+ * relationship pointing at it already resolves to NULL and is dropped from
36
+ * results. Removing the already-invisible entity changes no query result.
37
+ * A retired entity's own outgoing relationships are retired in the same
38
+ * reconcile transaction at the same epoch, so they fall under the same
39
+ * frontier and are GC'd alongside it.
40
+ *
41
+ * FTS5 consistency: `entities_fts` and `entities_trigram` are external-content
42
+ * FTS5 tables (`content='entities'`, `content_rowid='rowid'`). Deleting an
43
+ * entity content row does NOT update the index, so we must issue the FTS5
44
+ * `'delete'` command (rowid + the originally-indexed column values, read from
45
+ * the still-present row) before unlinking the content row, all in one
46
+ * transaction. Retired rows are immutable apart from `epoch_retired` /
47
+ * `stale_since` (neither is an FTS column), so the current column values equal
48
+ * the indexed values and the delete is exact.
49
+ */
50
+
51
+ import fs from 'node:fs';
52
+ import path from 'node:path';
53
+ import Database from 'better-sqlite3';
54
+
55
/** Batch size used when the caller supplies no positive-integer `batchSize`. */
export const DEFAULT_GRAPH_GC_BATCH = 2000;

/** Per-call row cap used when the caller supplies no positive-integer `maxRows`. */
export const DEFAULT_GRAPH_GC_MAX_ROWS = 100_000;
57
+
58
/**
 * Report whether `sqlite_master` lists a table called `name`.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} name
 * @returns {boolean}
 */
function tableExists(db, name) {
  const lookup = db.prepare(
    "SELECT 1 FROM sqlite_master WHERE type='table' AND name = ?",
  );
  const hit = lookup.get(name);
  return Boolean(hit);
}
63
+
64
/**
 * Report whether `PRAGMA table_info` for `table` lists a column named `column`.
 * Any error raised while querying or scanning the result is treated as
 * "column absent" and yields `false`.
 *
 * `table` is interpolated into the PRAGMA text, so it must be a trusted
 * identifier (every caller in this module passes a literal).
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} table
 * @param {string} column
 * @returns {boolean}
 */
function hasColumn(db, table, column) {
  try {
    const columns = db.prepare(`PRAGMA table_info(${table})`).all();
    const matchesName = (info) => info.name === column;
    return columns.some(matchesName);
  } catch {
    return false;
  }
}
71
+
72
/**
 * Resolve the effective batch size and row cap from caller options. A value
 * is honoured only when it is a positive integer; anything else falls back to
 * the module default.
 *
 * @param {{batchSize?:number, maxRows?:number}} opts
 * @returns {{batchSize:number, maxRows:number}}
 */
function normalizeBatchOpts(opts) {
  const isPositiveInt = (candidate) => Number.isInteger(candidate) && candidate > 0;

  let batchSize = opts.batchSize;
  if (!isPositiveInt(batchSize)) {
    batchSize = DEFAULT_GRAPH_GC_BATCH;
  }

  let maxRows = opts.maxRows;
  if (!isPositiveInt(maxRows)) {
    maxRows = DEFAULT_GRAPH_GC_MAX_ROWS;
  }

  return { batchSize, maxRows };
}
79
+
80
/**
 * Physically remove `relationships` rows whose `epoch_retired` is set and is
 * at or below `frontier`, working in bounded batches.
 *
 * Each batch deletes via a `rowid IN (SELECT … LIMIT ?)` subquery, so it does
 * not depend on SQLite being built with SQLITE_ENABLE_UPDATE_DELETE_LIMIT.
 * Batching stops when a batch removes fewer rows than requested (drained) or
 * when `maxRows` rows have been removed in total.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {number} frontier  highest retired epoch that may be deleted
 * @param {{batchSize?:number, maxRows?:number}} [opts]
 * @returns {{deleted:number, batches:number, hitCap:boolean, skipped?:string}}
 *   `hitCap` is true when the loop stopped because `maxRows` was reached.
 * @throws {Error} when `frontier` is not an integer
 */
export function pruneRetiredRelationships(db, frontier, opts = {}) {
  if (!Number.isInteger(frontier)) {
    throw new Error(`pruneRetiredRelationships: frontier must be an integer, got ${frontier}`);
  }
  // Pre-epoch schema: nothing can be retired, so there is nothing to prune.
  if (!hasColumn(db, 'relationships', 'epoch_retired')) {
    return { deleted: 0, batches: 0, hitCap: false, skipped: 'no-epoch-column' };
  }

  const { batchSize, maxRows } = normalizeBatchOpts(opts);
  const deleteBatch = db.prepare(`
    DELETE FROM relationships
    WHERE rowid IN (
      SELECT rowid FROM relationships
      WHERE epoch_retired IS NOT NULL AND epoch_retired <= ?
      LIMIT ?
    )
  `);

  let deleted = 0;
  let batches = 0;
  let drained = false;
  while (deleted < maxRows) {
    const take = Math.min(batchSize, maxRows - deleted);
    const removed = deleteBatch.run(frontier, take).changes ?? 0;
    deleted += removed;
    batches += 1;
    if (removed < take) {
      drained = true; // short batch: no eligible rows remain
      break;
    }
  }
  // Leaving the loop without draining means the row cap stopped us.
  return { deleted, batches, hitCap: !drained };
}
121
+
122
/**
 * Physically remove `entities` rows whose `epoch_retired` is set and is at or
 * below `frontier`, working in bounded batches.
 *
 * When `entities_fts` / `entities_trigram` exist, each row's FTS5 `'delete'`
 * command (rowid plus the indexed column values read from the row) is issued
 * before the row itself is deleted. A whole batch runs inside one
 * `db.transaction`. Failures of the FTS commands are swallowed on purpose so
 * index drift cannot block the row delete.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {number} frontier  highest retired epoch that may be deleted
 * @param {{batchSize?:number, maxRows?:number}} [opts]
 * @returns {{deleted:number, batches:number, hitCap:boolean, ftsDeleted:number, skipped?:string}}
 *   `ftsDeleted` counts successful `entities_fts` delete commands only.
 * @throws {Error} when `frontier` is not an integer
 */
export function pruneRetiredEntities(db, frontier, opts = {}) {
  if (!Number.isInteger(frontier)) {
    throw new Error(`pruneRetiredEntities: frontier must be an integer, got ${frontier}`);
  }
  if (!hasColumn(db, 'entities', 'epoch_retired')) {
    return { deleted: 0, batches: 0, hitCap: false, ftsDeleted: 0, skipped: 'no-epoch-column' };
  }

  const { batchSize, maxRows } = normalizeBatchOpts(opts);
  const ftsPresent = tableExists(db, 'entities_fts');
  const trigramPresent = tableExists(db, 'entities_trigram');

  const pickRetired = db.prepare(`
    SELECT rowid AS rid, name, name_alias, signature, doc_comment
    FROM entities
    WHERE epoch_retired IS NOT NULL AND epoch_retired <= ?
    LIMIT ?
  `);
  let dropFromFts = null;
  if (ftsPresent) {
    dropFromFts = db.prepare(
      `INSERT INTO entities_fts(entities_fts, rowid, name, name_alias, signature, doc_comment)
       VALUES('delete', ?, ?, ?, ?, ?)`,
    );
  }
  let dropFromTrigram = null;
  if (trigramPresent) {
    dropFromTrigram = db.prepare(
      `INSERT INTO entities_trigram(entities_trigram, rowid, name, signature)
       VALUES('delete', ?, ?, ?)`,
    );
  }
  const dropRow = db.prepare('DELETE FROM entities WHERE rowid = ?');

  // One transaction per batch: index cleanup and row removal commit together.
  const purgeBatch = db.transaction((limit) => {
    const victims = pickRetired.all(frontier, limit);
    let ftsRemoved = 0;
    for (const row of victims) {
      if (dropFromFts) {
        try {
          dropFromFts.run(row.rid, row.name, row.name_alias, row.signature, row.doc_comment);
          ftsRemoved += 1;
        } catch {
          /* index drift — tolerate */
        }
      }
      if (dropFromTrigram) {
        try {
          dropFromTrigram.run(row.rid, row.name, row.signature);
        } catch {
          /* tolerate */
        }
      }
      dropRow.run(row.rid);
    }
    return { count: victims.length, ftsRemoved };
  });

  let deleted = 0;
  let batches = 0;
  let ftsDeleted = 0;
  let drained = false;
  while (deleted < maxRows) {
    const take = Math.min(batchSize, maxRows - deleted);
    const outcome = purgeBatch(take);
    deleted += outcome.count;
    ftsDeleted += outcome.ftsRemoved;
    batches += 1;
    if (outcome.count < take) {
      drained = true; // short batch: no eligible rows remain
      break;
    }
  }
  return { deleted, batches, hitCap: !drained, ftsDeleted };
}
188
+
189
/**
 * Physically remove `hcgs_summary_metadata` rows whose `epoch_retired` is set
 * and is at or below `frontier`, working in bounded batches.
 *
 * Batches are bounded through an `entity_id IN (SELECT … LIMIT ?)` subquery.
 * A missing table or a missing `epoch_retired` column is reported as
 * `skipped: 'no-summary-table'` rather than as an error.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {number} frontier  highest retired epoch that may be deleted
 * @param {{batchSize?:number, maxRows?:number}} [opts]
 * @returns {{deleted:number, batches:number, hitCap:boolean, skipped?:string}}
 * @throws {Error} when `frontier` is not an integer
 */
export function pruneRetiredGraphSummaries(db, frontier, opts = {}) {
  if (!Number.isInteger(frontier)) {
    throw new Error(`pruneRetiredGraphSummaries: frontier must be an integer, got ${frontier}`);
  }
  const prunable = tableExists(db, 'hcgs_summary_metadata')
    && hasColumn(db, 'hcgs_summary_metadata', 'epoch_retired');
  if (!prunable) {
    return { deleted: 0, batches: 0, hitCap: false, skipped: 'no-summary-table' };
  }

  const { batchSize, maxRows } = normalizeBatchOpts(opts);
  const deleteBatch = db.prepare(`
    DELETE FROM hcgs_summary_metadata
    WHERE entity_id IN (
      SELECT entity_id FROM hcgs_summary_metadata
      WHERE epoch_retired IS NOT NULL AND epoch_retired <= ?
      LIMIT ?
    )
  `);

  let deleted = 0;
  let batches = 0;
  let drained = false;
  while (deleted < maxRows) {
    const take = Math.min(batchSize, maxRows - deleted);
    const removed = deleteBatch.run(frontier, take).changes ?? 0;
    deleted += removed;
    batches += 1;
    if (removed < take) {
      drained = true; // short batch: no eligible rows remain
      break;
    }
  }
  return { deleted, batches, hitCap: !drained };
}
229
+
230
/**
 * Run retired-row GC against `<stateDir>/code-graph.db` (or `deps.dbPath`).
 *
 * The prune frontier is `deps.minLiveEpoch(stateDir)` when that returns an
 * integer, otherwise the `epoch` of `deps.readManifest(stateDir)`. Retired
 * relationships are pruned first, then entities (with FTS cleanup), then HCGS
 * summaries; all three draw from one shared per-run row budget. If anything
 * was deleted, a best-effort PASSIVE WAL checkpoint is issued afterwards.
 *
 * Returns `{ skipped }` when the DB file is missing (`'no-graph-db'`) or no
 * integer frontier can be determined (`'no-frontier'`).
 *
 * @param {string} stateDir
 * @param {{
 *   dbPath?:string, batchSize?:number, maxRows?:number,
 *   minLiveEpoch?:(dir:string)=>(number|null),
 *   readManifest?:(dir:string)=>(object|null),
 * }} [deps]
 * @returns {{
 *   deletedEntities:number, deletedRelationships:number, deletedSummaries:number,
 *   ftsDeleted:number, frontier:number, hadReaders:boolean, batches:number,
 *   hitCap:boolean
 * } | {skipped:string}}
 * @throws {Error} when the DB file exists but `deps.minLiveEpoch` or
 *   `deps.readManifest` is not a function; errors raised while opening or
 *   querying the database also propagate.
 */
export function runGraphGc(stateDir, deps = {}) {
  const dbPath = deps.dbPath || path.join(stateDir, 'code-graph.db');
  if (!fs.existsSync(dbPath)) {
    return { skipped: 'no-graph-db' };
  }

  const { minLiveEpoch: minLiveEpochFn, readManifest: readManifestFn } = deps;
  if (typeof minLiveEpochFn !== 'function' || typeof readManifestFn !== 'function') {
    throw new Error('runGraphGc: minLiveEpoch and readManifest deps are required');
  }

  // Live readers pin the frontier; with none, fall back to the manifest epoch.
  const live = minLiveEpochFn(stateDir);
  const hadReaders = Number.isInteger(live);
  let frontier = null;
  if (hadReaders) {
    frontier = live;
  } else {
    const manifest = readManifestFn(stateDir);
    if (Number.isInteger(manifest?.epoch)) {
      frontier = manifest.epoch;
    }
  }
  if (!Number.isInteger(frontier)) {
    return { skipped: 'no-frontier' };
  }

  const { batchSize, maxRows } = normalizeBatchOpts(deps);
  const untouched = () => ({ deleted: 0, batches: 0, hitCap: false });
  const db = new Database(dbPath);
  try {
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');

    // Each stage receives whatever budget the previous stages left over, so
    // the total number of rows deleted in one run never exceeds `maxRows`.
    let budget = maxRows;

    let rel = untouched();
    if (tableExists(db, 'relationships')) {
      rel = pruneRetiredRelationships(db, frontier, { batchSize, maxRows: budget });
    }
    budget -= rel.deleted;

    let ent = untouched();
    if (budget > 0 && tableExists(db, 'entities')) {
      ent = pruneRetiredEntities(db, frontier, { batchSize, maxRows: budget });
    }
    budget -= ent.deleted;

    let sum = untouched();
    if (budget > 0) {
      sum = pruneRetiredGraphSummaries(db, frontier, { batchSize, maxRows: budget });
    }

    const removedAnything = rel.deleted + ent.deleted + sum.deleted > 0;
    if (removedAnything) {
      try {
        db.pragma('wal_checkpoint(PASSIVE)');
      } catch {
        /* best-effort */
      }
    }

    return {
      deletedEntities: ent.deleted,
      deletedRelationships: rel.deleted,
      deletedSummaries: sum.deleted,
      ftsDeleted: ent.ftsDeleted ?? 0,
      frontier,
      hadReaders,
      batches: rel.batches + ent.batches + sum.batches,
      hitCap: Boolean(rel.hitCap || ent.hitCap || sum.hitCap),
    };
  } finally {
    db.close();
  }
}
@@ -0,0 +1,298 @@
1
+ /**
2
+ * Content-hashing wrapper for incremental indexing.
3
+ *
4
+ * Sweet-search uses content hashes for local dedup (per-file content, per-chunk
5
+ * content, exact encoder inputs). The original incremental-tracker.js path
6
+ * truncates SHA-256 to 16 hex chars (8 bytes / 64 bits). That is enough for
7
+ * collision avoidance on local corpora but the throughput (~1-2 GiB/s on cores
8
+ * without SHA-NI / ARMv8 Crypto Extensions) becomes meaningful on branch-switch
9
+ * storms that touch tens of thousands of files.
10
+ *
11
+ * Plan § 7.2 / § 21 specifies xxHash3-64 as the default, gated behind
12
+ * SWEET_SEARCH_HASH_ALGORITHM with a SHA-256 truncation fallback for
13
+ * compliance / auditing. xxHash3 is collision-resistant well beyond our
14
+ * working set (Cyan4973/xxHash + SMHasher3 benchmarks) and runs 15-30 GiB/s
15
+ * with native SIMD.
16
+ *
17
+ * Resolution order:
18
+ * 1. native crate: crates/sweet-search-native exports xxhash3_64 — fastest.
19
+ * 2. @node-rs/xxhash — fast prebuilt N-API binding.
20
+ * 3. pure-JS xxHash64 — present here as a portable last resort; ~3× faster
21
+ * than SHA-256 truncation but slower than the native paths.
22
+ * 4. SHA-256 truncate-16 — current behaviour, kept as the compliance
23
+ * override and the fallback when the algorithm switch is `sha256`.
24
+ *
25
+ * The output of every path is a 16-hex-char string so consumers (logs,
26
+ * SQLite columns of TEXT type, JSON state files) can be swapped without
27
+ * cascading changes.
28
+ */
29
+
30
+ import crypto from 'node:crypto';
31
+ import { createRequire } from 'node:module';
32
+
33
// Algorithm switch read once at module load; anything other than `sha256`
// selects the xxHash path.
const ALGO_ENV = (process.env.SWEET_SEARCH_HASH_ALGORITHM || 'xxhash3').toLowerCase();
export const HASH_ALGORITHM = ALGO_ENV === 'sha256' ? 'sha256' : 'xxhash3';

// CommonJS-style loader so backends can be probed synchronously from an ES module.
const require = createRequire(import.meta.url);

// Multiplicative constants used by the pure-JS xxHash64 fallback.
const PURE_JS_PRIME64_1 = 0x9E3779B185EBCA87n;
const PURE_JS_PRIME64_2 = 0xC2B2AE3D27D4EB4Fn;
const PURE_JS_PRIME64_3 = 0x165667B19E3779F9n;
const PURE_JS_PRIME64_4 = 0x85EBCA77C2B2AE63n;
const PURE_JS_PRIME64_5 = 0x27D4EB2F165667C5n;

// Low-64-bit mask: BigInt arithmetic is unbounded, so results are wrapped by hand.
const MASK64 = 0xFFFFFFFFFFFFFFFFn;

// Backend cache, filled by resolveBackendsSync() / resolveBackends().
let nativeXxh3 = null;
let nodeRsXxh3 = null;
let resolved = false;
46
+
47
/**
 * Normalise a backend's 64-bit hash result to a 16-character lowercase hex
 * string.
 *
 * - Strings are lowercased, then left-padded with `0` to 16 characters or cut
 *   down to their last 16 characters.
 * - BigInt / Number values are reduced to their unsigned 64-bit
 *   representation first, so a binding that reports the hash as a signed
 *   value (e.g. `-1n`) still yields `ffffffffffffffff` instead of a
 *   `-`-prefixed, wrongly padded string.
 *
 * @param {string|bigint|number} value
 * @returns {string} exactly 16 lowercase hex characters for numeric input
 */
function toHex64(value) {
  if (typeof value === 'string') {
    const lower = value.toLowerCase();
    return lower.length >= 16 ? lower.slice(-16) : lower.padStart(16, '0');
  }
  const wide = typeof value === 'bigint' ? value : BigInt(value);
  // asUintN wraps negatives (two's complement) and drops bits above 63.
  return BigInt.asUintN(64, wide).toString(16).padStart(16, '0');
}
53
+
54
/**
 * Rotate the 64-bit value `x` left by `r` bits.
 *
 * @param {bigint} x  value already confined to 64 bits
 * @param {number} r  rotation distance in bits
 * @returns {bigint} rotated value, wrapped to 64 bits
 */
function rotl64(x, r) {
  const shiftedUp = x << BigInt(r);
  const wrappedAround = x >> BigInt(64 - r);
  return BigInt.asUintN(64, shiftedUp | wrappedAround);
}
57
+
58
/**
 * Read eight bytes starting at `offset` as an unsigned little-endian 64-bit
 * integer.
 *
 * @param {Uint8Array} buf
 * @param {number} offset
 * @returns {bigint}
 */
function readUInt64LE(buf, offset) {
  // `>>> 0` reinterprets each 32-bit half as unsigned before widening to BigInt.
  const lowHalf = (
    buf[offset]
    | (buf[offset + 1] << 8)
    | (buf[offset + 2] << 16)
    | (buf[offset + 3] << 24)
  ) >>> 0;
  const highHalf = (
    buf[offset + 4]
    | (buf[offset + 5] << 8)
    | (buf[offset + 6] << 16)
    | (buf[offset + 7] << 24)
  ) >>> 0;
  return (BigInt(highHalf) << 32n) | BigInt(lowHalf);
}
63
+
64
/**
 * Read four bytes starting at `offset` as an unsigned little-endian 32-bit
 * integer, returned as a BigInt.
 *
 * @param {Uint8Array} buf
 * @param {number} offset
 * @returns {bigint}
 */
function readUInt32LE(buf, offset) {
  const b0 = buf[offset];
  const b1 = buf[offset + 1] << 8;
  const b2 = buf[offset + 2] << 16;
  const b3 = buf[offset + 3] << 24;
  // Number bitwise ops are signed 32-bit, so a set top bit would read as
  // negative; `>>> 0` reinterprets the word as unsigned before widening.
  const word = (b0 | b1 | b2 | b3) >>> 0;
  return BigInt(word);
}
71
+
72
/**
 * Pure-JS xxHash64 over `buf`, used only when no other hashing backend has
 * been loaded.
 *
 * This is xxHash64 rather than xxHash3, so its digests differ from the ones
 * an xxHash3 backend produces for the same bytes. All arithmetic is done on
 * BigInt and wrapped to 64 bits by hand with `MASK64`.
 *
 * @param {Uint8Array} buf   bytes to hash
 * @param {bigint} [seed=0n]
 * @returns {bigint} unsigned 64-bit hash
 */
function xxh64PureJs(buf, seed = 0n) {
  const mul64 = (a, b) => (a * b) & MASK64;
  // Accumulator step: fold one 64-bit lane into `acc`.
  const round = (acc, lane) => mul64(
    rotl64((acc + mul64(lane, PURE_JS_PRIME64_2)) & MASK64, 31),
    PURE_JS_PRIME64_1,
  );
  // Convergence step: fold a finished accumulator into the running hash.
  const mergeRound = (hash, acc) => (
    ((hash ^ round(0n, acc)) * PURE_JS_PRIME64_1 + PURE_JS_PRIME64_4) & MASK64
  );

  const len = buf.length;
  let pos = 0;
  let h64;

  if (len >= 32) {
    // Four parallel accumulators consume the input in 32-byte stripes.
    const acc = [
      (seed + PURE_JS_PRIME64_1 + PURE_JS_PRIME64_2) & MASK64,
      (seed + PURE_JS_PRIME64_2) & MASK64,
      seed,
      (seed - PURE_JS_PRIME64_1) & MASK64,
    ];
    for (; pos + 32 <= len; pos += 32) {
      for (let lane = 0; lane < 4; lane += 1) {
        acc[lane] = round(acc[lane], readUInt64LE(buf, pos + lane * 8));
      }
    }

    h64 = (
      rotl64(acc[0], 1) + rotl64(acc[1], 7) + rotl64(acc[2], 12) + rotl64(acc[3], 18)
    ) & MASK64;
    for (const finished of acc) {
      h64 = mergeRound(h64, finished);
    }
  } else {
    h64 = (seed + PURE_JS_PRIME64_5) & MASK64;
  }

  h64 = (h64 + BigInt(len)) & MASK64;

  // Remaining whole 8-byte words.
  for (; pos + 8 <= len; pos += 8) {
    h64 ^= round(0n, readUInt64LE(buf, pos));
    h64 = (rotl64(h64, 27) * PURE_JS_PRIME64_1 + PURE_JS_PRIME64_4) & MASK64;
  }

  // At most one remaining 4-byte word.
  if (pos + 4 <= len) {
    h64 = (h64 ^ mul64(readUInt32LE(buf, pos), PURE_JS_PRIME64_1)) & MASK64;
    h64 = (rotl64(h64, 23) * PURE_JS_PRIME64_2 + PURE_JS_PRIME64_3) & MASK64;
    pos += 4;
  }

  // Trailing single bytes.
  for (; pos < len; pos += 1) {
    h64 = (h64 ^ (BigInt(buf[pos]) * PURE_JS_PRIME64_5)) & MASK64;
    h64 = mul64(rotl64(h64, 11), PURE_JS_PRIME64_1);
  }

  // Final avalanche.
  h64 ^= h64 >> 33n;
  h64 = mul64(h64, PURE_JS_PRIME64_2);
  h64 ^= h64 >> 29n;
  h64 = mul64(h64, PURE_JS_PRIME64_3);
  h64 ^= h64 >> 32n;

  return h64;
}
162
+
163
/**
 * Coerce hash input to a Node `Buffer`.
 *
 * Buffers are returned as-is, `Uint8Array`s are wrapped as a view over the
 * same memory (no copy), and strings are UTF-8 encoded.
 *
 * @param {Buffer|Uint8Array|string} input
 * @returns {Buffer}
 * @throws {TypeError} for any other input type
 */
function bufFromInput(input) {
  if (Buffer.isBuffer(input)) {
    return input;
  }
  if (input instanceof Uint8Array) {
    const { buffer, byteOffset, byteLength } = input;
    return Buffer.from(buffer, byteOffset, byteLength);
  }
  if (typeof input !== 'string') {
    throw new TypeError(`hashing: unsupported input type ${typeof input}`);
  }
  return Buffer.from(input, 'utf8');
}
169
+
170
/**
 * Build a hashing function from the native crate module.
 *
 * @param {object|null|undefined} mod  loaded module, if any
 * @returns {((buf: Buffer) => string)|null} hasher returning 16 hex chars,
 *   or `null` when `mod` has no `xxhash3_64` function
 */
function makeNativeXxh3(mod) {
  const usable = Boolean(mod) && typeof mod.xxhash3_64 === 'function';
  if (!usable) {
    return null;
  }
  return (buf) => {
    const raw = mod.xxhash3_64(buf);
    return toHex64(raw);
  };
}
174
+
175
/**
 * Build a hashing function from a loaded `@node-rs/xxhash` module, probing
 * several export shapes in priority order:
 *   1. `xxh3.xxh64(buf)`
 *   2. `Xxh3.oneShotHashU64(buf)`
 *   3. `Xxh3.h64(buf)`
 *   4. `Xxh3.withSeed(0n)` streaming hasher (`update` + `digest`)
 * where `Xxh3` is the module's `Xxh3` export, or `xxh3` if that is absent.
 * The `default` export is used as the namespace when it carries `xxh3` or
 * `Xxh3` (the shape a dynamic `import()` of a CommonJS package produces).
 *
 * @param {object|null|undefined} mod
 * @returns {((buf: Buffer) => string)|null} hasher returning 16 hex chars,
 *   or `null` when no supported shape is found
 */
function makeNodeRsXxh3(mod) {
  if (!mod) {
    return null;
  }

  const viaDefault = mod.default && (mod.default.xxh3 || mod.default.Xxh3);
  const ns = viaDefault ? mod.default : mod;

  if (ns.xxh3 && typeof ns.xxh3.xxh64 === 'function') {
    return (buf) => toHex64(ns.xxh3.xxh64(buf));
  }

  const Xxh3 = ns.Xxh3 || ns.xxh3;
  if (!Xxh3) {
    return null;
  }

  // One-shot entry points, most specific first.
  for (const method of ['oneShotHashU64', 'h64']) {
    if (typeof Xxh3[method] === 'function') {
      return (buf) => toHex64(Xxh3[method](buf));
    }
  }

  if (typeof Xxh3.withSeed === 'function') {
    return (buf) => {
      const hasher = Xxh3.withSeed(0n);
      hasher.update(buf);
      return toHex64(hasher.digest());
    };
  }

  return null;
}
197
+
198
/**
 * Synchronously probe for a fast xxHash3 backend via `require`.
 *
 * The native crate is tried first; `@node-rs/xxhash` is tried only if the
 * crate yielded no hasher. `resolved` ends up true when a backend was found
 * (or immediately when the algorithm is `sha256`, which needs none); if it
 * stays false, the async `resolveBackends()` will still attempt its own probe.
 */
function resolveBackendsSync() {
  if (ALGO_ENV === 'sha256') {
    // SHA-256 mode never consults an xxHash backend.
    resolved = true;
    return;
  }

  try {
    const nativeModule = require('../../../crates/sweet-search-native/index.js');
    nativeXxh3 = makeNativeXxh3(nativeModule);
  } catch {
    // Crate absent, unbuilt, or not loadable via require; try the package next.
  }

  if (!nativeXxh3) {
    try {
      const packageModule = require('@node-rs/xxhash');
      nodeRsXxh3 = makeNodeRsXxh3(packageModule);
    } catch {
      // Package missing or its native binary failed to load; the async
      // resolver and the portable fallback keep hashing functional.
    }
  }

  resolved = Boolean(nativeXxh3 || nodeRsXxh3);
}
218
+
219
// In-flight (or completed) async backend probe, shared by all callers so
// nobody hashes before the probe has settled.
let backendResolution = null;

/**
 * Asynchronously probe for a fast xxHash3 backend via dynamic `import()`.
 *
 * A no-op once `resolved` is true. Otherwise the probe runs exactly once:
 * every caller — including callers that arrive while the imports are still
 * pending — awaits the same promise, and `resolved` flips to true only after
 * the probe finishes. This stops a concurrent caller from hashing with the
 * pure-JS fallback while a faster backend is about to be installed, which
 * would give the same content two different digests within one process.
 *
 * Import failures are expected (backend not installed) and are swallowed;
 * the pure-JS fallback then remains in effect.
 *
 * @returns {Promise<void>}
 */
async function resolveBackends() {
  if (resolved) return;

  if (!backendResolution) {
    backendResolution = (async () => {
      // Native crate (preferred).
      try {
        const mod = await import('../../../crates/sweet-search-native/index.js');
        nativeXxh3 = makeNativeXxh3(mod);
      } catch {
        // Native crate not built or missing the export; fall through.
      }

      // @node-rs/xxhash (second preference).
      if (!nativeXxh3) {
        try {
          const mod = await import('@node-rs/xxhash');
          nodeRsXxh3 = makeNodeRsXxh3(mod);
        } catch {
          // Package not installed; pure-JS fallback wins.
        }
      }

      resolved = true;
    })();
  }

  await backendResolution;
}
241
+
242
/**
 * Hash a string or byte buffer to a 16-hex-char digest, first making sure the
 * async backend probe has been given the chance to run.
 *
 * @param {Buffer|Uint8Array|string} input
 * @returns {Promise<string>}
 */
export async function contentHash(input) {
  await resolveBackends();
  const digest = contentHashSync(input);
  return digest;
}
252
+
253
/**
 * Synchronous hashing with whichever backend is loaded at call time.
 *
 * Order: truncated SHA-256 when the algorithm switch is `sha256`; otherwise
 * the native crate hasher, then the `@node-rs/xxhash` hasher, then the
 * pure-JS xxHash64 fallback. Calling this before a backend has been loaded
 * therefore uses the pure-JS path.
 *
 * @param {Buffer|Uint8Array|string} input
 * @returns {string} 16-hex-char digest
 */
export function contentHashSync(input) {
  const bytes = bufFromInput(input);

  if (ALGO_ENV === 'sha256') {
    const fullDigest = crypto.createHash('sha256').update(bytes).digest('hex');
    return fullDigest.slice(0, 16);
  }

  const fastBackend = nativeXxh3 || nodeRsXxh3;
  if (fastBackend) {
    return fastBackend(bytes);
  }

  const fallbackHash = xxh64PureJs(bytes);
  return fallbackHash.toString(16).padStart(16, '0');
}
267
+
268
/**
 * Deterministic JSON serialisation used for metadata fingerprints: object
 * keys are sorted recursively, so objects that differ only in key order
 * produce identical text.
 *
 * Apart from key order, the output follows `JSON.stringify` semantics, so a
 * value and its JSON round-trip serialise identically:
 *   - objects with a `toJSON` method (e.g. `Date`) are serialised via it
 *     rather than collapsing to `{}`;
 *   - object properties whose value is `undefined`, a function, or a symbol
 *     are omitted;
 *   - such values inside arrays (and array holes) become `null`.
 *
 * As with `JSON.stringify`, a top-level `undefined`, function, or symbol
 * yields `undefined`, and BigInt values throw.
 *
 * @param {*} value
 * @returns {string}
 */
export function stableStringify(value) {
  const hasToJson = value !== null
    && typeof value === 'object'
    && typeof value.toJSON === 'function';
  const plain = hasToJson ? value.toJSON() : value;

  if (plain === null || typeof plain !== 'object') {
    return JSON.stringify(plain);
  }

  if (Array.isArray(plain)) {
    // Array.from visits holes too, so sparse slots serialise as null.
    const items = Array.from(plain, (item) => {
      const encoded = stableStringify(item);
      return encoded === undefined ? 'null' : encoded;
    });
    return `[${items.join(',')}]`;
  }

  const members = [];
  for (const key of Object.keys(plain).sort()) {
    const encoded = stableStringify(plain[key]);
    // Unserialisable values are dropped, exactly as JSON.stringify does.
    if (encoded !== undefined) {
      members.push(`${JSON.stringify(key)}:${encoded}`);
    }
  }
  return `{${members.join(',')}}`;
}
281
+
282
/**
 * Fingerprint a metadata object: serialise it with sorted keys, then hash the
 * resulting text.
 *
 * @param {object} payload
 * @returns {Promise<string>} 16-hex-char digest
 */
export async function metadataFingerprint(payload) {
  const canonicalText = stableStringify(payload);
  return contentHash(canonicalText);
}
291
+
292
// Internal helpers exposed for tests only.
export const __testing = {
  xxh64PureJs,
  stableStringify,
  resolveBackends,
};

// Probe for a fast backend at module load so synchronous callers can use it
// without first awaiting contentHash().
resolveBackendsSync();