sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,260 @@
1
+ /**
2
+ * Stable AST-structural chunk identity.
3
+ *
4
+ * Plan § 7.2 makes the positional `chunk_id` from `ast-chunker.js` the
5
+ * single biggest defeat of incremental encode-skip: inserting one
6
+ * function at the top of a file shifts every downstream `parentCounter`
7
+ * → every chunk ID changes → cache-miss on every chunk → projected 10×
8
+ * CPU saving collapses to 0. The fix is a content-anchored identity that
9
+ * survives whitespace edits, import shuffles, and insertions.
10
+ *
11
+ * Two regimes:
12
+ *
13
+ * 1. **Symbol-attached chunks** (functions, methods, classes, structs,
14
+ * etc.). The identity binds to the containing symbol path plus the
15
+ * whitespace-normalised signature line:
16
+ *
17
+ * struct_id = hash(file_path || parent_symbol_path || symbol_name || signature_norm)
18
+ *
19
+ * Renaming the function flips the ID for the renamed chunk only.
20
+ * Reordering siblings, inserting a new function above, or
21
+ * re-indenting the file is invisible.
22
+ *
23
+ * 2. **Anonymous chunks** (file headers, JSX blocks, free-form
24
+ * statements, regex-split sub-chunks). The identity binds to a
25
+ * rolling hash of normalised content under the parent symbol path,
26
+ * PLUS a mandatory `occurrence_index_in_parent` suffix:
27
+ *
28
+ * struct_id = hash(file_path || parent_symbol_path || rolling_hash) || '_' || occurrence_index
29
+ *
30
+ * The occurrence index is the *count of preceding siblings with the
31
+ * same `(rolling_hash, parent_symbol_path)` tuple* — not the absolute
32
+ * sibling index. This guarantees that two distinct sibling chunks
33
+ * keep distinct IDs even when they sit between identical pairs, and
34
+ * that two identical siblings keep distinct IDs because their
35
+ * occurrence-in-population is different (`_0`, `_1`, ...). See plan
36
+ * § 7.2 "occurrence index ... is mandatory" and § 33 unit-test
37
+ * coverage requirements for the renamed-one-of-two case.
38
+ *
39
+ * Fallback: parser failure or no AST metadata → use the existing
40
+ * positional `chunk_id` and set `structural = false`. Downstream
41
+ * dedup paths still hash the chunk content, so the worst-case is no
42
+ * structural savings for that one chunk.
43
+ *
44
+ * This module is pure domain logic. It must not import the chunker,
45
+ * SQLite, the encoder pool, or any infrastructure adapter.
46
+ */
47
+
48
+ import { contentHashSync } from '../infrastructure/hashing.mjs';
49
+
50
/**
 * Canonicalise a signature line so that purely cosmetic spacing
 * differences hash to the same value.
 *
 * Steps, applied in this order:
 *   1. every run of whitespace becomes a single space;
 *   2. a trailing `{` (with any whitespace around it) is removed, so
 *      `foo() {` and `foo()` agree — an arrow's `=>` itself is kept;
 *   3. whitespace touching any of `( ) , : ; < > = [ ]` is removed;
 *   4. the result is trimmed.
 *
 * Example: `async function foo (a , b ) {` and
 * `async function foo(a, b) {` both become `async function foo(a,b)`.
 *
 * @param {string} signature Raw signature text.
 * @returns {string} Normalised signature; `''` for non-string input.
 */
export function normalizeSignature(signature) {
  if (typeof signature !== 'string') {
    return '';
  }
  const singleSpaced = signature.replace(/[\s ]+/g, ' ');
  const withoutOpeningBrace = singleSpaced.replace(/\s*\{\s*$/, '');
  const tightPunctuation = withoutOpeningBrace.replace(/\s*([(),:;<>=\[\]])\s*/g, '$1');
  return tightPunctuation.trim();
}
69
+
70
/**
 * Canonicalise the text of an anonymous chunk before it is hashed.
 *
 * Runs of spaces/tabs shrink to one space, every line is trimmed, and
 * lines that end up empty are discarded. Line breaks between the
 * surviving lines are kept, so a multi-line form and its one-line
 * equivalent still normalise differently.
 *
 * Comments are not stripped: they remain part of the normalised text.
 *
 * @param {string} content Raw chunk text.
 * @returns {string} Normalised text; `''` for non-string input.
 */
export function normalizeAnonymousContent(content) {
  if (typeof content !== 'string') return '';

  const keptLines = [];
  // Horizontal whitespace is squeezed first; newlines are untouched so the
  // split below still sees the original line structure.
  for (const rawLine of content.replace(/[ \t]+/g, ' ').split('\n')) {
    const line = rawLine.trim();
    // Trimming every kept line also trims the text as a whole.
    if (line) keptLines.push(line);
  }
  return keptLines.join('\n');
}
94
+
95
/**
 * Produce the parent-symbol path string for a chunk's metadata.
 *
 * Resolution order:
 *   1. `metadata.parent_path`, when it is a non-empty string, is returned
 *      unchanged (it is expected to be pre-joined with `/`);
 *   2. otherwise `parent_type:parent_symbol` when both are present;
 *   3. otherwise `:parent_symbol` when only the symbol is present;
 *   4. otherwise `''` (top-level, or no metadata at all).
 *
 * A `parent_type` without a `parent_symbol` is ignored.
 *
 * @param {object} metadata Chunk metadata; may be null/undefined.
 * @returns {string}
 */
export function parentSymbolPath(metadata) {
  if (!metadata) return '';

  const prejoined = metadata.parent_path;
  if (typeof prejoined === 'string' && prejoined.length > 0) {
    return prejoined;
  }

  const { parent_symbol: name, parent_type: kind } = metadata;
  if (!name) return '';
  return kind ? `${kind}:${name}` : `:${name}`;
}
121
+
122
/**
 * Decide whether a chunk takes the symbol-attached identity regime.
 *
 * A chunk qualifies when its metadata carries
 *   - a `symbol` that is truthy and is neither `'unknown'` nor `'code'`, and
 *   - a truthy `chunk_type` other than `'code'`, `'plain'`, `'doc'` or
 *     `'text'`.
 * Every other chunk type (e.g. 'function', 'method', 'class') counts as
 * symbol-attached.
 *
 * @param {object} chunk Chunk with an optional `metadata` object.
 * @returns {boolean}
 */
export function isSymbolAttached(chunk) {
  const metadata = chunk?.metadata;
  if (!metadata) return false;

  const { symbol, chunk_type: chunkType } = metadata;
  if (!symbol || symbol === 'unknown' || symbol === 'code') return false;

  switch (chunkType) {
    // Chunk kinds that never name a symbol.
    case 'code':
    case 'plain':
    case 'doc':
    case 'text':
      return false;
    default:
      // A missing/empty chunk type is not symbol-attached either.
      return Boolean(chunkType);
  }
}
140
+
141
/**
 * Hash an anonymous chunk's text for identity purposes.
 *
 * The text is first passed through `normalizeAnonymousContent`, and the
 * normalised form is then hashed with `contentHashSync`.
 *
 * @param {string} content Raw chunk text.
 * @returns {string} Hash of the normalised text.
 */
export function rollingContentHash(content) {
  const canonical = normalizeAnonymousContent(content);
  return contentHashSync(canonical);
}
150
+
151
/**
 * Compute the structural identity record for one chunk.
 *
 * Result shape:
 *   {
 *     chunkStructId: string,          // '' on fallback
 *     structural: boolean,            // false on fallback
 *     rollingHash: string | null,     // set only for anonymous chunks
 *     reason: 'symbol' | 'anonymous' | 'fallback',
 *   }
 *
 * Regimes:
 *   - symbol: the ID is the hash of file path, parent path, symbol name and
 *     the normalised signature, joined with NUL separators. When no
 *     signature is available the string `chunk_type:symbol` stands in.
 *   - anonymous: the ID is the hash of file path, parent path and the
 *     rolling content hash, followed by `_<occurrenceIndex>`.
 *   - fallback: returned when `chunk` is falsy, `filePath` is not a string,
 *     or an anonymous chunk arrives without a finite, non-negative
 *     occurrence index.
 *
 * `assignStructuralIds(chunks, filePath)` computes occurrence indices for
 * a whole file and is the recommended entry point.
 *
 * @param {object} chunk
 * @param {string} filePath
 * @param {number|null} occurrenceIndex Mandatory for anonymous chunks.
 * @returns {{chunkStructId:string, structural:boolean, rollingHash:string|null, reason:'symbol'|'anonymous'|'fallback'}}
 */
export function deriveStructuralId(chunk, filePath, occurrenceIndex) {
  const fallback = () => ({
    chunkStructId: '',
    structural: false,
    rollingHash: null,
    reason: 'fallback',
  });

  if (!chunk || typeof filePath !== 'string') return fallback();

  const metadata = chunk.metadata || {};
  const parentPath = parentSymbolPath(metadata);

  if (!isSymbolAttached(chunk)) {
    // Anonymous regime. A missing or invalid occurrence index means the
    // caller bypassed the wrapper incorrectly; no ID is produced for it.
    const indexUsable =
      occurrenceIndex != null && Number.isFinite(occurrenceIndex) && occurrenceIndex >= 0;
    if (!indexUsable) return fallback();

    const rollingHash = rollingContentHash(chunk.content || chunk.text || '');
    const scopedHash = contentHashSync(`${filePath}\0${parentPath}\0${rollingHash}`);
    return {
      chunkStructId: `${scopedHash}_${occurrenceIndex}`,
      structural: true,
      rollingHash,
      reason: 'anonymous',
    };
  }

  // Symbol regime.
  const rawSignature = metadata.signature || metadata.symbol_signature || '';
  const normalisedSignature = normalizeSignature(rawSignature);
  const sigSource =
    normalisedSignature.length > 0
      ? normalisedSignature
      : `${metadata.chunk_type || ''}:${metadata.symbol || ''}`;

  return {
    chunkStructId: contentHashSync(
      `${filePath}\0${parentPath}\0${metadata.symbol}\0${sigSource}`,
    ),
    structural: true,
    rollingHash: null,
    reason: 'symbol',
  };
}
213
+
214
/**
 * Assign structural IDs to every chunk of one file, in input order.
 *
 * The list is walked once. Symbol-attached chunks are derived directly and
 * carry `occurrenceIndex: null`. Each anonymous chunk is numbered by how
 * many earlier chunks in this call shared its `(parent path, rolling hash)`
 * pair, and that count is passed on as its occurrence index.
 *
 * The input chunks are not mutated; the returned array is aligned with the
 * input (same length, same order). A non-array input yields `[]`.
 *
 * NOTE(review): symbol-attached chunks are not disambiguated here — two
 * chunks with the same parent path, symbol and signature would receive the
 * same ID. Confirm the chunker never emits such pairs.
 *
 * @param {Array<object>} chunks
 * @param {string} filePath
 * @returns {Array<{chunkStructId:string, structural:boolean, rollingHash:string|null, reason:'symbol'|'anonymous'|'fallback', occurrenceIndex:number|null}>}
 */
export function assignStructuralIds(chunks, filePath) {
  if (!Array.isArray(chunks)) return [];

  const seenPerPopulation = new Map();
  const records = [];

  for (const chunk of chunks) {
    if (isSymbolAttached(chunk)) {
      records.push({ ...deriveStructuralId(chunk, filePath, null), occurrenceIndex: null });
      continue;
    }

    // Anonymous chunk: identical text under the same parent forms one
    // population, numbered 0, 1, 2, ... in order of appearance.
    const parentPath = parentSymbolPath(chunk?.metadata || {});
    const rolling = rollingContentHash(chunk?.content || chunk?.text || '');
    const populationKey = `${parentPath}\0${rolling}`;
    const occurrenceIndex = seenPerPopulation.get(populationKey) ?? 0;
    seenPerPopulation.set(populationKey, occurrenceIndex + 1);

    records.push({ ...deriveStructuralId(chunk, filePath, occurrenceIndex), occurrenceIndex });
  }

  return records;
}
259
+
260
// Bundle of helpers exposed under a single `__testing` export.
export const __testing = {
  isSymbolAttached,
  parentSymbolPath,
  normalizeSignature,
  normalizeAnonymousContent,
};
@@ -0,0 +1,193 @@
1
+ /**
2
+ * Encoder-input dependency registry.
3
+ *
4
+ * Plan § 7.2.1, § 12.4. The encoder-input hashes in
5
+ * `encoder-input.mjs` answer the question "given the fully built
6
+ * `embedding_text` / `li_text`, do I still need to encode this chunk?"
7
+ * They do **not** answer "which chunks need their `embedding_text`
8
+ * rebuilt when something outside the chunk body changes?"
9
+ *
10
+ * Today, the production policy keeps cross-file metadata out of encoder
11
+ * inputs: changing file A's callee body does not re-embed unchanged caller
12
+ * file B. But the chunker DOES inject same-file scope / defines / uses
13
+ * enrichment plus import names into `embedding_text`. When any of those
14
+ * facts changes for a stable chunk, the reconciler must:
15
+ *
16
+ * 1. Mark the chunk **metadata-dirty**.
17
+ * 2. Re-run graph enrichment for that chunk to rebuild `embedding_text`
18
+ * / `li_text` / `li_greedy_text`.
19
+ * 3. Compute the exact input hash and reuse the previous payload only
20
+ * on byte-identical match.
21
+ *
22
+ * The registry is a small key→consumer table seeded by the reconcile
23
+ * tick whenever a chunk is encoded. When a key fires (e.g. a same-file
24
+ * scope change), the registry yields every dependent chunk so they can
25
+ * be added to the dirty set.
26
+ *
27
+ * Dependency-key vocabulary (all string keys; readers should treat them
28
+ * opaquely):
29
+ *
30
+ * * `path:<relative_path>` — file-level identity facts.
31
+ * * `lang:<relative_path>` — language detection result for the file.
32
+ * * `chunk-type:<relative_path>` — chunk kind selected by the chunker.
33
+ * * `symbol:<relative_path>` — primary chunk symbol metadata.
34
+ * * `signature:<relative_path>` — AST signature metadata.
35
+ * * `additional-symbols:<relative_path>` — sibling symbols injected into text.
36
+ * * `policy:embed:<n>` — bumps when embed-text policy changes.
37
+ * * `policy:li:<n>` — bumps when LI input policy changes.
38
+ * * `parent:<relative_path>:<parent>` — same-file parent symbol identity.
39
+ * * `same-file-symbols:<relative_path>` — set of symbols defined in this file.
40
+ * * `same-file-imports:<relative_path>` — set of import target names in this file.
41
+ * * `same-file-scope:<relative_path>` — same-file scope/defines/uses enrichment.
42
+ * * `dedup-cluster:<cluster_id>` — dedup cluster membership.
43
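+ * * `dedup-exemplar:<exemplar_id>` — dedup alias target.
+ * * `dedup-li-reuse[:<cluster_id>]` — registered when chunk metadata carries `liReuseEligible`.
+ * * `parent-type:<relative_path>:<parent_type>` — same-file parent symbol type.
+ * * `policy:dedup:<n>` — bumps when dedup input policy changes.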
44
+ * * `entity:<entity_id>` — future cross-file rule (plan § 7.2.1).
45
+ * * `relationship:<source_entity_id>` — future cross-file rule.
46
+ * * `file-exports:<relative_path>` — future cross-file rule.
47
+ * * `graph-centrality:<entity_id>` — future cross-file rule.
48
+ *
49
+ * Consumers (`consumer` column of `encoder_input_dependencies`):
50
+ *
51
+ * * `dense` — affects `embedding_text` only.
52
+ * * `li` — affects `pickLiInput` only.
53
+ * * `dedup` — affects dedup signals.
54
+ *
55
+ * The same chunk can register multiple `(key, consumer)` pairs.
56
+ */
57
+
58
+ import {
59
+ DEDUP_INPUT_POLICY_VERSION,
60
+ EMBED_TEXT_POLICY_VERSION,
61
+ LI_INPUT_POLICY_VERSION,
62
+ } from './encoder-input.mjs';
63
+
64
+ const MAX_DEPENDENCY_KEYS_PER_QUERY = 900; // Batch size for the `IN (...)` lookup in dependentsOf: at most this many keys are bound per statement. NOTE(review): presumably chosen to stay under SQLite's bound-parameter limit — confirm.
65
+
66
/**
 * List the `(dependency_key, consumer)` pairs a chunk should register.
 *
 * The file-relative path is taken from the first truthy value among
 * `metadata.relative_path`, `metadata.path`, `metadata.file_path`,
 * `chunk.file` and `metadata.file`. When a path is found, every per-file
 * key is registered for both encoder consumers (`dense` and `li`), as are
 * the `parent:` / `parent-type:` keys when the metadata provides them.
 * Dedup keys depend only on `clusterId`, `exemplarId` and the presence of
 * a `liReuseEligible` property. The three policy-version keys are always
 * appended last.
 *
 * The doc on the original states that the reconcile tick calls this after
 * graph enrichment; this function itself only reads the fields above.
 *
 * @param {object} chunk Enriched chunk (post graph-enrichment).
 * @returns {Array<{dependency_key: string, consumer: 'dense'|'li'|'dedup'}>}
 */
export function collectChunkDependencies(chunk) {
  if (!chunk) return [];

  const meta = chunk.metadata || {};
  const rel = meta.relative_path || meta.path || meta.file_path || chunk.file || meta.file || '';
  const parent = meta.parent_symbol || '';
  const parentType = meta.parent_type || '';
  const clusterId = meta.clusterId || '';
  const exemplarId = meta.exemplarId || '';

  const deps = [];
  const add = (dependencyKey, consumer) => {
    deps.push({ dependency_key: dependencyKey, consumer });
  };
  const addForEncoders = (dependencyKey) => {
    add(dependencyKey, 'dense');
    add(dependencyKey, 'li');
  };

  if (rel) {
    const perFilePrefixes = [
      'path',
      'lang',
      'chunk-type',
      'symbol',
      'signature',
      'additional-symbols',
      'same-file-symbols',
      'same-file-imports',
      'same-file-scope',
    ];
    for (const prefix of perFilePrefixes) {
      addForEncoders(`${prefix}:${rel}`);
    }
    if (parent) addForEncoders(`parent:${rel}:${parent}`);
    if (parentType) addForEncoders(`parent-type:${rel}:${parentType}`);
  }

  if (clusterId) add(`dedup-cluster:${clusterId}`, 'dedup');
  if (exemplarId) add(`dedup-exemplar:${exemplarId}`, 'dedup');
  if (Object.hasOwn(meta, 'liReuseEligible')) {
    // Only the property's presence matters here, not its value.
    add(clusterId ? `dedup-li-reuse:${clusterId}` : 'dedup-li-reuse', 'dedup');
  }

  // Policy fingerprints: one key per consumer, carrying the current
  // policy version so a version bump produces a different key.
  add(`policy:embed:${EMBED_TEXT_POLICY_VERSION}`, 'dense');
  add(`policy:li:${LI_INPUT_POLICY_VERSION}`, 'li');
  add(`policy:dedup:${DEDUP_INPUT_POLICY_VERSION}`, 'dedup');

  return deps;
}
125
+
126
/**
 * Replace the stored dependency rows of one chunk.
 *
 * All existing `encoder_input_dependencies` rows for the
 * `(filePath, chunkStructId)` pair are deleted, then one row per entry of
 * `deps` is inserted with `INSERT OR IGNORE`. Nothing is done when
 * `chunkStructId` is falsy. No transaction is opened here; the caller is
 * responsible for wrapping the call if atomicity is required.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {string} chunkStructId
 * @param {Array<{dependency_key:string, consumer:string}>} deps
 */
export function persistDependencies(db, filePath, chunkStructId, deps) {
  if (!chunkStructId) {
    return;
  }

  const deleteExisting = db.prepare(`
    DELETE FROM encoder_input_dependencies
    WHERE file_path = ? AND chunk_struct_id = ?
  `);
  const insertRow = db.prepare(`
    INSERT OR IGNORE INTO encoder_input_dependencies
      (dependency_key, file_path, chunk_struct_id, consumer)
    VALUES (?, ?, ?, ?)
  `);

  deleteExisting.run(filePath, chunkStructId);
  for (const { dependency_key: dependencyKey, consumer } of deps) {
    insertRow.run(dependencyKey, filePath, chunkStructId, consumer);
  }
}
151
+
152
/**
 * Find every `(file_path, chunk_struct_id, consumer)` row registered under
 * any of the given dependency keys.
 *
 * Keys are queried in batches of at most `MAX_DEPENDENCY_KEYS_PER_QUERY`.
 * Each batch is returned by the database ordered by file path, chunk ID
 * and consumer; rows already produced by an earlier batch are dropped, so
 * the result holds each triple once, in first-seen order.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} keys
 * @returns {Array<{file_path:string, chunk_struct_id:string, consumer:string}>}
 *   `[]` when `keys` is not an array or is empty.
 */
export function dependentsOf(db, keys) {
  if (!Array.isArray(keys) || keys.length === 0) return [];

  // Map insertion order doubles as the output order.
  const firstSeen = new Map();

  for (let start = 0; start < keys.length; start += MAX_DEPENDENCY_KEYS_PER_QUERY) {
    const batch = keys.slice(start, start + MAX_DEPENDENCY_KEYS_PER_QUERY);
    const placeholders = batch.map(() => '?').join(',');
    const statement = db.prepare(`
      SELECT DISTINCT file_path, chunk_struct_id, consumer
      FROM encoder_input_dependencies
      WHERE dependency_key IN (${placeholders})
      ORDER BY file_path, chunk_struct_id, consumer
    `);

    for (const row of statement.all(...batch)) {
      const identity = `${row.file_path}\0${row.chunk_struct_id}\0${row.consumer}`;
      if (!firstSeen.has(identity)) {
        firstSeen.set(identity, row);
      }
    }
  }

  return [...firstSeen.values()];
}
183
+
184
/**
 * Delete every `encoder_input_dependencies` row belonging to a file.
 *
 * Per the module's own description this is used when a file is deleted or
 * its structural identity has been replaced wholesale.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 */
export function forgetFile(db, filePath) {
  const dropRows = db.prepare('DELETE FROM encoder_input_dependencies WHERE file_path = ?');
  dropRows.run(filePath);
}
@@ -0,0 +1,225 @@
1
+ /**
2
+ * Exact encoder-input hashes for the dense, LI, and dedup consumers.
3
+ *
4
+ * Plan § 7.2 + § 7.2.1 + § 12.4. The reconcile path must reuse a previously
5
+ * computed dense embedding only when the **exact bytes** sent to the encoder
6
+ * have not changed. The "bytes" depend on far more than the raw chunk body:
7
+ *
8
+ * * Dense (`embedding_text`):
9
+ * - chunk body
10
+ * - relative path, language, chunk type, primary symbol
11
+ * - parent symbol + parent type
12
+ * - AST signature (when the active variant uses signatures)
13
+ * - sibling `additional_symbols`
14
+ * - active embedding-text policy fingerprint
15
+ * - same-file scope / defines / uses enrichment
16
+ *
17
+ * * Late-interaction (`pickLiInput`):
18
+ * - same body
19
+ * - language-routed input variant (Python and Java-family use
20
+ * `li_text`; JS/TS/etc. use the greedy form)
21
+ * - LI input policy fingerprint
22
+ *
23
+ * * Dedup (`simhash` / `clusterId` / `exemplarId` / `liReuseEligible`):
24
+ * - chunk content normalised for dedup
25
+ * - cluster exemplar identity (a member whose exemplar changed is
26
+ * "dedup-dirty" even if its own content is stable)
27
+ *
28
+ * This module is pure domain logic. It accepts plain JS objects and returns
29
+ * 16-hex-char hash strings. The Phase 1 reconciler calls these helpers
30
+ * **after** the existing graph-enrichment pass has built `embedding_text` /
31
+ * `li_text` / `li_greedy_text` on each chunk, so the inputs to the hashers
32
+ * are already metadata-aware.
33
+ */
34
+
35
+ import { contentHashSync, stableStringify, metadataFingerprint } from '../infrastructure/hashing.mjs';
36
+
37
/**
 * Version counters for the encoder-input recipes. Each counter is mixed into
 * the corresponding hash input, so bumping one invalidates the cached
 * payloads for that consumer without renaming any column. Plain integers are
 * used on purpose.
 *
 * - EMBED_TEXT_POLICY_VERSION: dense `embedding_text` recipe. Mirrors the
 *   cold-path `pipelineVersion` bump in
 *   `core/indexing/incremental-tracker.js::buildConfigFingerprint`.
 * - LI_INPUT_POLICY_VERSION: late-interaction input routing. Kept as a
 *   separate counter because LI routing can change independently of the
 *   dense input.
 * - DEDUP_INPUT_POLICY_VERSION: dedup fingerprint recipe.
 */
export const EMBED_TEXT_POLICY_VERSION = 1;
export const LI_INPUT_POLICY_VERSION = 1;
export const DEDUP_INPUT_POLICY_VERSION = 1;
51
+
52
/**
 * Languages whose late-interaction input is routed to `li_text`. This is a
 * local copy of the taxonomy used by the production helper in
 * `core/indexing/indexer-ann.js`, kept here so the incremental domain logic
 * stays dependency-light. When the production taxonomy changes, update this
 * set AND bump LI_INPUT_POLICY_VERSION.
 */
const LI_TEXT_LANGS = new Set([
  'python',
  'java',
  'php',
  'csharp',
  'c#',
  'kotlin',
  'scala',
]);
// Any language not listed above (JS/TS/JSX/TSX, Ruby, Go, C/C++/Rust,
// unknown) takes the default route: li_greedy_text → embedding_text → li_text.
62
+
63
/**
 * Strip a trailing generated `_<hex>` suffix (six or more lowercase hex
 * digits) that sits directly before the file extension. Nothing else in the
 * path is touched. Mirrors `core/indexing/ast-chunker.js::normalizePathSlug`.
 *
 * Falsy input (empty string, null, undefined) is returned unchanged.
 *
 * @param {string} relativePath
 */
export function normalizePathSlug(relativePath) {
  if (relativePath) {
    const generatedSuffix = /_[0-9a-f]{6,}(\.[a-zA-Z0-9]+)$/;
    // Keep only the captured extension; the `_<hex>` run is dropped.
    return String(relativePath).replace(generatedSuffix, (_whole, extension) => extension);
  }
  return relativePath;
}
74
+
75
/**
 * Choose the late-interaction input text for a chunk based on its language.
 * Mirrors `pickLiInput()` in `core/indexing/indexer-ann.js`.
 *
 * Routing:
 *   - language in LI_TEXT_LANGS: `li_text`, falling back to the dense text
 *   - anything else:             `li_greedy_text` → dense text → `li_text`
 *
 * "Dense text" is `embedding_text`, else `text`, else `content`. Empty
 * strings count as missing at every step.
 *
 * @param {object} chunk  Enriched chunk with `metadata.language`,
 *                        `li_text`, `li_greedy_text`, `embedding_text`.
 * @returns {string} The selected text, or '' when the chunk is missing or
 *                   carries no usable text.
 */
export function pickLiInputText(chunk) {
  if (!chunk) return '';

  const { language } = chunk.metadata || {};
  const denseText = chunk.embedding_text || chunk.text || chunk.content || '';
  const liText = chunk.li_text || '';

  if (LI_TEXT_LANGS.has(language)) {
    // `li_text` already carries the per-language shaping applied by the
    // chunker (Python path-line omission, Java-family slug stripping), so it
    // is used verbatim rather than re-deriving a second policy here.
    return liText || denseText;
  }

  const greedyText = chunk.li_greedy_text || '';
  return greedyText || denseText || liText;
}
101
+
102
/**
 * Hash the dense encoder input for a chunk (`embedding_input_hash`).
 *
 * The hashed payload is the chunk's `embedding_text` (falling back to `text`,
 * then `content`, then the empty string) followed by the dense policy
 * version, so a policy bump changes every hash.
 *
 * @param {object} chunk
 * @returns {string} The hash, or '' when no chunk is given.
 */
export function denseInputHash(chunk) {
  if (!chunk) return '';
  const encoderText = chunk.embedding_text || chunk.text || chunk.content || '';
  const payload = `${encoderText}|policy:${EMBED_TEXT_POLICY_VERSION}`;
  return contentHashSync(payload);
}
115
+
116
/**
 * Hash the late-interaction encoder input for a chunk (`li_input_hash`).
 *
 * The text is selected by `pickLiInputText` (language-routed) and the LI
 * policy version is appended before hashing.
 *
 * @param {object} chunk
 * @returns {string} The hash, or '' when no chunk is given.
 */
export function liInputHash(chunk) {
  return chunk
    ? contentHashSync(`${pickLiInputText(chunk)}|policy:${LI_INPUT_POLICY_VERSION}`)
    : '';
}
128
+
129
/**
 * Fingerprint the dedup-related metadata of a chunk. Plan § 7.2.1 treats
 * dedup as its own consumer: a cluster member must be re-aliased when its
 * exemplar changes, even if its own `embedding_input_hash` and
 * `li_input_hash` did not move.
 *
 * Fields read from `chunk.metadata` (a missing or nullish field hashes as
 * `null`):
 *   - `simhash`          chunk-local similarity hash
 *   - `clusterId`        cluster membership
 *   - `exemplarId`       target of the alias
 *   - `isExemplar`       whether this row is the exemplar
 *   - `aliasJaccard`     similarity score (informational)
 *   - `liReuseEligible`  drives the LI-side alias decision
 *
 * These fields plus DEDUP_INPUT_POLICY_VERSION are serialised with
 * `stableStringify` and hashed.
 *
 * @param {object} chunk
 * @returns {string}
 */
export function dedupFingerprint(chunk) {
  const meta = chunk?.metadata || {};
  const dedupFields = [
    'simhash',
    'clusterId',
    'exemplarId',
    'isExemplar',
    'aliasJaccard',
    'liReuseEligible',
  ];

  const signals = {};
  for (const field of dedupFields) {
    // `??` keeps legitimate falsy values such as `false` and `0`.
    signals[field] = meta[field] ?? null;
  }
  signals.policy = DEDUP_INPUT_POLICY_VERSION;

  return contentHashSync(stableStringify(signals));
}
161
+
162
/**
 * Resolve a chunk's repository-relative path from the places it may be
 * stored. The first candidate accepted by `firstSafeRelativePath` wins.
 *
 * Candidate order: `meta.relative_path`, `meta.path`, `meta.file_path`,
 * `chunk.file`, `meta.file`.
 *
 * @param {object} chunk
 * @param {object} [meta] Defaults to `chunk.metadata`, or `{}` when absent.
 * @returns {string|null}
 */
function chunkRelativePath(chunk, meta = chunk?.metadata || {}) {
  const candidates = [
    meta.relative_path,
    meta.path,
    meta.file_path,
    chunk?.file,
    meta.file,
  ];
  return firstSafeRelativePath(...candidates);
}
171
+
172
/**
 * Return the first candidate that is a safe, normalised relative path.
 *
 * Normalisation: backslashes become `/`, one leading `./` is removed, and
 * runs of `/` collapse to a single `/`.
 *
 * A candidate is skipped when it is not a string, normalises to '' or '.',
 * is absolute (leading `/` or a drive prefix such as `C:/`), or contains a
 * `..` segment anywhere, including as the final segment (`a/..`).
 *
 * @param {...*} candidates Values to try, in priority order.
 * @returns {string|null} The normalised path, or null when none qualifies.
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    const normalized = candidate.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check so that a trailing `..` ("a/..") is rejected too,
    // while names that merely start with dots ("..hidden") stay valid.
    if (normalized.split('/').includes('..')) continue;
    return normalized;
  }
  return null;
}
183
+
184
/**
 * Compute every per-chunk hash in one call: the raw text hash, the dense and
 * LI encoder-input hashes, the encoder-affecting metadata fingerprint and
 * the dedup fingerprint.
 *
 * `metadata_fingerprint` is the "encoder-input-affecting metadata hash" that
 * plan § 7.2 / § 33 requires for the column of the same name. It covers the
 * relative path, language, chunk type, symbol, parent symbol/type, signature,
 * additional symbols and both policy versions. It is NOT the same value as
 * `dedup_fingerprint`; both are returned so the reconcile path can store
 * each (the dedup fingerprint travels through the encoder-input dependency
 * registry; see `core/incremental-indexing/domain/encoder-deps.mjs`).
 *
 * @param {object} chunk
 * @returns {{chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, dedup_fingerprint:string}}
 */
export function chunkInputHashes(chunk) {
  const rawText = chunk?.content || chunk?.text || '';
  const meta = chunk?.metadata || {};

  const chunkTextHash = contentHashSync(rawText);
  const embeddingInputHash = denseInputHash(chunk);
  const liInputHashValue = liInputHash(chunk);

  // Only metadata that feeds the encoder text is fingerprinted here.
  const encoderMetadata = {
    relative_path: chunkRelativePath(chunk, meta),
    language: meta.language ?? null,
    chunk_type: meta.chunk_type ?? null,
    symbol: meta.symbol ?? null,
    parent_symbol: meta.parent_symbol ?? null,
    parent_type: meta.parent_type ?? null,
    signature: meta.signature ?? null,
    additional_symbols: meta.additional_symbols ?? null,
    policy_embed: EMBED_TEXT_POLICY_VERSION,
    policy_li: LI_INPUT_POLICY_VERSION,
  };
  const metadataHash = contentHashSync(stableStringify(encoderMetadata));

  const dedupHash = dedupFingerprint(chunk);

  return {
    chunk_text_hash: chunkTextHash,
    embedding_input_hash: embeddingInputHash,
    li_input_hash: liInputHashValue,
    metadata_fingerprint: metadataHash,
    dedup_fingerprint: dedupHash,
  };
}
220
+
221
+ /**
222
+ * Re-export metadataFingerprint for callers that want the generic stable-
223
+ * stringify hash without the encoder-specific field selection.
224
+ */
225
+ export { metadataFingerprint };