sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stable AST-structural chunk identity.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.2 makes the positional `chunk_id` from `ast-chunker.js` the
|
|
5
|
+
* single biggest defeat of incremental encode-skip: inserting one
|
|
6
|
+
* function at the top of a file shifts every downstream `parentCounter`
|
|
7
|
+
* → every chunk ID changes → cache-miss on every chunk → projected 10×
|
|
8
|
+
* CPU saving collapses to 0. The fix is a content-anchored identity that
|
|
9
|
+
* survives whitespace edits, import shuffles, and insertions.
|
|
10
|
+
*
|
|
11
|
+
* Two regimes:
|
|
12
|
+
*
|
|
13
|
+
* 1. **Symbol-attached chunks** (functions, methods, classes, structs,
|
|
14
|
+
* etc.). The identity binds to the containing symbol path plus the
|
|
15
|
+
* whitespace-normalised signature line:
|
|
16
|
+
*
|
|
17
|
+
* struct_id = hash(file_path || parent_symbol_path || symbol_name || signature_norm)
|
|
18
|
+
*
|
|
19
|
+
* Renaming the function flips the ID for the renamed chunk only.
|
|
20
|
+
* Reordering siblings, inserting a new function above, or
|
|
21
|
+
* re-indenting the file is invisible.
|
|
22
|
+
*
|
|
23
|
+
* 2. **Anonymous chunks** (file headers, JSX blocks, free-form
|
|
24
|
+
* statements, regex-split sub-chunks). The identity binds to a
|
|
25
|
+
* rolling hash of normalised content under the parent symbol path,
|
|
26
|
+
* PLUS a mandatory `occurrence_index_in_parent` suffix:
|
|
27
|
+
*
|
|
28
|
+
* struct_id = hash(file_path || parent_symbol_path || rolling_hash) || '_' || occurrence_index
|
|
29
|
+
*
|
|
30
|
+
* The occurrence index is the *count of preceding siblings with the
|
|
31
|
+
* same `(rolling_hash, parent_symbol_path)` tuple* — not the absolute
|
|
32
|
+
* sibling index. This guarantees that two distinct sibling chunks
|
|
33
|
+
* keep distinct IDs even when they sit between identical pairs, and
|
|
34
|
+
* that two identical siblings keep distinct IDs because their
|
|
35
|
+
* occurrence-in-population is different (`_0`, `_1`, ...). See plan
|
|
36
|
+
* § 7.2 "occurrence index ... is mandatory" and § 33 unit-test
|
|
37
|
+
* coverage requirements for the renamed-one-of-two case.
|
|
38
|
+
*
|
|
39
|
+
* Fallback: parser failure or no AST metadata → use the existing
|
|
40
|
+
* positional `chunk_id` and set `structural = false`. Downstream
|
|
41
|
+
* dedup paths still hash the chunk content, so the worst-case is no
|
|
42
|
+
* structural savings for that one chunk.
|
|
43
|
+
*
|
|
44
|
+
* This module is pure domain logic. It must not import the chunker,
|
|
45
|
+
* SQLite, the encoder pool, or any infrastructure adapter.
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
import { contentHashSync } from '../infrastructure/hashing.mjs';
|
|
49
|
+
|
|
50
|
+
/**
 * Canonicalise a signature line so that purely cosmetic spacing
 * differences do not change the value that gets hashed.
 *
 * Steps, applied in this order:
 *   1. every run of whitespace becomes a single space;
 *   2. a trailing `{` (plus any whitespace around it) is removed;
 *   3. whitespace hugging any of `( ) , : ; < > = [ ]` is removed;
 *   4. the result is trimmed.
 *
 * Only the brace is stripped: an arrow token in front of it is kept, so
 * `(a, b) => {` normalises to `(a,b)=>`. Both
 *   `async function foo (a , b ) {`
 *   `async function foo(a, b) {`
 * normalise to `async function foo(a,b)`.
 *
 * @param {string} signature Raw signature text.
 * @returns {string} Normalised signature, or `''` for non-string input.
 */
export function normalizeSignature(signature) {
  if (typeof signature !== 'string') {
    return '';
  }
  const singleSpaced = signature.replace(/[\s ]+/g, ' ');
  const withoutOpeningBrace = singleSpaced.replace(/\s*\{\s*$/, '');
  const tightPunctuation = withoutOpeningBrace.replace(/\s*([(),:;<>=\[\]])\s*/g, '$1');
  return tightPunctuation.trim();
}
|
|
69
|
+
|
|
70
|
+
/**
 * Canonicalise the body of an anonymous chunk before it is hashed.
 *
 * Runs of spaces/tabs are collapsed to one space, every line is trimmed,
 * and lines that end up empty are dropped. Line breaks between the
 * surviving lines are kept, so a multi-line form and its one-line
 * equivalent still normalise differently.
 *
 * Comments are not stripped: comment text is part of the normalised
 * output.
 *
 * @param {string} content Raw chunk text.
 * @returns {string} Normalised text, or `''` for non-string input.
 */
export function normalizeAnonymousContent(content) {
  if (typeof content !== 'string') {
    return '';
  }
  const keptLines = [];
  for (const rawLine of content.replace(/[ \t]+/g, ' ').split('\n')) {
    // trim() also removes a trailing '\r', so CRLF input matches LF input.
    const line = rawLine.trim();
    if (line) {
      keptLines.push(line);
    }
  }
  return keptLines.join('\n');
}
|
|
94
|
+
|
|
95
|
+
/**
 * Serialise the parent-symbol location of a chunk into a single string.
 *
 * Resolution order:
 *   - a non-empty string `metadata.parent_path` is returned verbatim;
 *   - otherwise `parent_symbol` with `parent_type` yields `type:name`;
 *   - `parent_symbol` alone yields `:name`;
 *   - anything else (including missing metadata) yields `''`, i.e. the
 *     chunk is treated as top-level.
 *
 * @param {object} metadata Chunk metadata (may be null/undefined).
 * @returns {string}
 */
export function parentSymbolPath(metadata) {
  if (!metadata) {
    return '';
  }
  const { parent_path: joinedPath, parent_symbol: name, parent_type: kind } = metadata;
  if (typeof joinedPath === 'string' && joinedPath.length > 0) {
    return joinedPath;
  }
  if (!name) {
    // A parent type without a parent name carries no usable identity.
    return '';
  }
  return kind ? `${kind}:${name}` : `:${name}`;
}
|
|
121
|
+
|
|
122
|
+
/**
 * Decide whether a chunk uses the symbol-attached identity regime.
 *
 * A chunk qualifies when its metadata carries a real symbol name (not
 * empty, `'unknown'` or `'code'`) and a chunk type other than the
 * non-symbol kinds `'code'`, `'plain'`, `'doc'` and `'text'`.
 *
 * @param {object} chunk
 * @returns {boolean}
 */
export function isSymbolAttached(chunk) {
  const metadata = chunk?.metadata;
  if (!metadata) {
    return false;
  }
  const { symbol, chunk_type: chunkType } = metadata;
  const hasRealSymbol = Boolean(symbol) && symbol !== 'unknown' && symbol !== 'code';
  if (!hasRealSymbol || !chunkType) {
    return false;
  }
  // Every remaining chunk type (function, method, class, ...) names a symbol.
  return !['code', 'plain', 'doc', 'text'].includes(chunkType);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Hash an anonymous chunk's content after normalisation.
 *
 * The text is first passed through `normalizeAnonymousContent`, then
 * through `contentHashSync`, so whitespace-only differences that the
 * normaliser removes do not affect the result.
 *
 * @param {string} content Raw chunk text.
 * @returns {string} Hash of the normalised text.
 */
export function rollingContentHash(content) {
  const normalised = normalizeAnonymousContent(content);
  return contentHashSync(normalised);
}
|
|
150
|
+
|
|
151
|
+
/**
 * Compute the structural identity record for one chunk.
 *
 * Result shape:
 *   {
 *     chunkStructId: string,
 *     structural: boolean,
 *     rollingHash: string | null,
 *     reason: 'symbol' | 'anonymous' | 'fallback',
 *   }
 *
 * Symbol-attached chunks hash `filePath`, the parent path, the symbol
 * name and the normalised signature; when no signature is available,
 * `chunk_type:symbol` stands in for it. Anonymous chunks hash
 * `filePath`, the parent path and the rolling content hash, then append
 * `_<occurrenceIndex>`.
 *
 * A fallback record (empty id, `structural: false`) is returned when
 * `chunk` is missing, `filePath` is not a string, or an anonymous chunk
 * arrives without a finite, non-negative `occurrenceIndex`. The last
 * case is a caller bug; `assignStructuralIds` always supplies the index
 * and is the recommended entry point.
 *
 * @param {object} chunk
 * @param {string} filePath
 * @param {number|null} occurrenceIndex Mandatory for anonymous chunks.
 * @returns {{chunkStructId:string, structural:boolean, rollingHash:string|null, reason:'symbol'|'anonymous'|'fallback'}}
 */
export function deriveStructuralId(chunk, filePath, occurrenceIndex) {
  const fallback = () => ({
    chunkStructId: '',
    structural: false,
    rollingHash: null,
    reason: 'fallback',
  });

  if (!chunk || typeof filePath !== 'string') {
    return fallback();
  }

  const metadata = chunk.metadata || {};
  const parentPath = parentSymbolPath(metadata);

  if (!isSymbolAttached(chunk)) {
    // Anonymous regime: the occurrence index is required, not optional.
    const indexUsable =
      occurrenceIndex != null && Number.isFinite(occurrenceIndex) && occurrenceIndex >= 0;
    if (!indexUsable) {
      return fallback();
    }
    const rollingHash = rollingContentHash(chunk.content || chunk.text || '');
    const digest = contentHashSync(`${filePath}\0${parentPath}\0${rollingHash}`);
    return {
      chunkStructId: digest + '_' + occurrenceIndex,
      structural: true,
      rollingHash,
      reason: 'anonymous',
    };
  }

  // Symbol regime: bind the id to the symbol path and signature.
  const rawSignature = metadata.signature || metadata.symbol_signature || '';
  const normalisedSignature = normalizeSignature(rawSignature);
  const signaturePart =
    normalisedSignature.length > 0
      ? normalisedSignature
      : `${metadata.chunk_type || ''}:${metadata.symbol || ''}`;
  return {
    chunkStructId: contentHashSync(
      `${filePath}\0${parentPath}\0${metadata.symbol}\0${signaturePart}`,
    ),
    structural: true,
    rollingHash: null,
    reason: 'symbol',
  };
}
|
|
213
|
+
|
|
214
|
+
/**
 * Produce structural-id records for every chunk of one file, in order.
 *
 * The list is walked once. Symbol-attached chunks are derived directly
 * with a `null` occurrence index. Each anonymous chunk is numbered by
 * how many earlier anonymous chunks shared its
 * `(parent_path, rolling_hash)` pair, and that count is passed to
 * `deriveStructuralId` as the occurrence index.
 *
 * The input chunks are not mutated; the returned array is aligned
 * index-for-index with `chunks`, and the caller decides where the ids
 * are stored.
 *
 * @param {Array<object>} chunks
 * @param {string} filePath
 * @returns {Array<{chunkStructId:string, structural:boolean, rollingHash:string|null, reason:'symbol'|'anonymous'|'fallback', occurrenceIndex:number|null}>}
 */
export function assignStructuralIds(chunks, filePath) {
  if (!Array.isArray(chunks)) {
    return [];
  }
  const seenPerPopulation = new Map();

  // Returns how many identical anonymous siblings came before this chunk
  // and records this one in the tally.
  const nextOccurrence = (chunk) => {
    const parentPath = parentSymbolPath(chunk?.metadata || {});
    const rolling = rollingContentHash(chunk?.content || chunk?.text || '');
    const populationKey = `${parentPath}\0${rolling}`;
    const occurrence = seenPerPopulation.get(populationKey) || 0;
    seenPerPopulation.set(populationKey, occurrence + 1);
    return occurrence;
  };

  // Array.from visits every index, including holes, like an index loop.
  return Array.from(chunks, (chunk) => {
    const occurrenceIndex = isSymbolAttached(chunk) ? null : nextOccurrence(chunk);
    return { ...deriveStructuralId(chunk, filePath, occurrenceIndex), occurrenceIndex };
  });
}
|
|
259
|
+
|
|
260
|
+
// Bundle of internal helpers exposed under a single test-facing name.
export const __testing = {
  isSymbolAttached,
  parentSymbolPath,
  normalizeSignature,
  normalizeAnonymousContent,
};
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Encoder-input dependency registry.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.2.1, § 12.4. The encoder-input hashes in
|
|
5
|
+
* `encoder-input.mjs` answer the question "given the fully built
|
|
6
|
+
* `embedding_text` / `li_text`, do I still need to encode this chunk?"
|
|
7
|
+
* They do **not** answer "which chunks need their `embedding_text`
|
|
8
|
+
* rebuilt when something outside the chunk body changes?"
|
|
9
|
+
*
|
|
10
|
+
* Today, the production policy keeps cross-file metadata out of encoder
|
|
11
|
+
* inputs: changing file A's callee body does not re-embed unchanged caller
|
|
12
|
+
* file B. But the chunker DOES inject same-file scope / defines / uses
|
|
13
|
+
* enrichment plus import names into `embedding_text`. When any of those
|
|
14
|
+
* facts changes for a stable chunk, the reconciler must:
|
|
15
|
+
*
|
|
16
|
+
* 1. Mark the chunk **metadata-dirty**.
|
|
17
|
+
* 2. Re-run graph enrichment for that chunk to rebuild `embedding_text`
|
|
18
|
+
* / `li_text` / `li_greedy_text`.
|
|
19
|
+
* 3. Compute the exact input hash and reuse the previous payload only
|
|
20
|
+
* on byte-identical match.
|
|
21
|
+
*
|
|
22
|
+
* The registry is a small key→consumer table seeded by the reconcile
|
|
23
|
+
* tick whenever a chunk is encoded. When a key fires (e.g. a same-file
|
|
24
|
+
* scope change), the registry yields every dependent chunk so they can
|
|
25
|
+
* be added to the dirty set.
|
|
26
|
+
*
|
|
27
|
+
* Dependency-key vocabulary (all string keys; readers should treat them
|
|
28
|
+
* opaquely):
|
|
29
|
+
*
|
|
30
|
+
* * `path:<relative_path>` — file-level identity facts.
|
|
31
|
+
* * `lang:<relative_path>` — language detection result for the file.
|
|
32
|
+
* * `chunk-type:<relative_path>` — chunk kind selected by the chunker.
|
|
33
|
+
* * `symbol:<relative_path>` — primary chunk symbol metadata.
|
|
34
|
+
* * `signature:<relative_path>` — AST signature metadata.
|
|
35
|
+
* * `additional-symbols:<relative_path>` — sibling symbols injected into text.
|
|
36
|
+
* * `policy:embed:<n>` — bumps when embed-text policy changes.
|
|
37
|
+
* * `policy:li:<n>` — bumps when LI input policy changes.
* * `policy:dedup:<n>` — bumps when dedup input policy changes.
|
|
38
|
+
* * `parent:<relative_path>:<parent>` — same-file parent symbol identity.
* * `parent-type:<relative_path>:<parent_type>` — same-file parent symbol type.
|
|
39
|
+
* * `same-file-symbols:<relative_path>` — set of symbols defined in this file.
|
|
40
|
+
* * `same-file-imports:<relative_path>` — set of import target names in this file.
|
|
41
|
+
* * `same-file-scope:<relative_path>` — same-file scope/defines/uses enrichment.
|
|
42
|
+
* * `dedup-cluster:<cluster_id>` — dedup cluster membership.
|
|
43
|
+
* * `dedup-exemplar:<exemplar_id>` — dedup alias target.
* * `dedup-li-reuse[:<cluster_id>]` — registered when chunk metadata carries `liReuseEligible`.
|
|
44
|
+
* * `entity:<entity_id>` — future cross-file rule (plan § 7.2.1).
|
|
45
|
+
* * `relationship:<source_entity_id>` — future cross-file rule.
|
|
46
|
+
* * `file-exports:<relative_path>` — future cross-file rule.
|
|
47
|
+
* * `graph-centrality:<entity_id>` — future cross-file rule.
|
|
48
|
+
*
|
|
49
|
+
* Consumers (`consumer` column of `encoder_input_dependencies`):
|
|
50
|
+
*
|
|
51
|
+
* * `dense` — affects `embedding_text` only.
|
|
52
|
+
* * `li` — affects `pickLiInput` only.
|
|
53
|
+
* * `dedup` — affects dedup signals.
|
|
54
|
+
*
|
|
55
|
+
* The same chunk can register multiple `(key, consumer)` pairs.
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
import {
|
|
59
|
+
DEDUP_INPUT_POLICY_VERSION,
|
|
60
|
+
EMBED_TEXT_POLICY_VERSION,
|
|
61
|
+
LI_INPUT_POLICY_VERSION,
|
|
62
|
+
} from './encoder-input.mjs';
|
|
63
|
+
|
|
64
|
+
// Maximum number of dependency keys bound into a single `IN (...)` query by
// `dependentsOf`; longer key lists are split into batches of this size.
// NOTE(review): presumably sized to stay under SQLite's bound-parameter
// limit — confirm against the SQLite build in use.
const MAX_DEPENDENCY_KEYS_PER_QUERY = 900;
|
|
65
|
+
|
|
66
|
+
/**
 * List the `(dependency_key, consumer)` pairs one chunk depends on.
 *
 * Emitted, in this order:
 *   - when a relative path is known: nine file-scoped keys (`path`,
 *     `lang`, `chunk-type`, `symbol`, `signature`, `additional-symbols`,
 *     `same-file-symbols`, `same-file-imports`, `same-file-scope`), then
 *     `parent:` and `parent-type:` keys if the chunk has them — each
 *     registered for both `dense` and `li`;
 *   - `dedup-cluster:` / `dedup-exemplar:` keys when the metadata has a
 *     cluster / exemplar id, and a `dedup-li-reuse` key when the
 *     metadata has its own `liReuseEligible` property — all for `dedup`;
 *   - the three policy-version keys, always.
 *
 * The file's header comment says the reconcile tick calls this after
 * graph enrichment.
 *
 * @param {object} chunk Enriched chunk (post graph-enrichment).
 * @returns {Array<{dependency_key: string, consumer: 'dense'|'li'|'dedup'}>}
 */
export function collectChunkDependencies(chunk) {
  if (!chunk) {
    return [];
  }

  const meta = chunk.metadata || {};
  const rel = meta.relative_path || meta.path || meta.file_path || chunk.file || meta.file || '';
  const parent = meta.parent_symbol || '';
  const parentType = meta.parent_type || '';
  const clusterId = meta.clusterId || '';
  const exemplarId = meta.exemplarId || '';

  const collected = [];
  const record = (dependency_key, consumer) => {
    collected.push({ dependency_key, consumer });
  };
  // Encoder-facing facts feed both the dense and the LI input.
  const recordForEncoders = (dependencyKey) => {
    record(dependencyKey, 'dense');
    record(dependencyKey, 'li');
  };

  if (rel) {
    const fileScopedKeys = [
      `path:${rel}`,
      `lang:${rel}`,
      `chunk-type:${rel}`,
      `symbol:${rel}`,
      `signature:${rel}`,
      `additional-symbols:${rel}`,
      `same-file-symbols:${rel}`,
      `same-file-imports:${rel}`,
      `same-file-scope:${rel}`,
    ];
    for (const fileKey of fileScopedKeys) {
      recordForEncoders(fileKey);
    }
    if (parent) {
      recordForEncoders(`parent:${rel}:${parent}`);
    }
    if (parentType) {
      recordForEncoders(`parent-type:${rel}:${parentType}`);
    }
  }

  if (clusterId) {
    record(`dedup-cluster:${clusterId}`, 'dedup');
  }
  if (exemplarId) {
    record(`dedup-exemplar:${exemplarId}`, 'dedup');
  }
  if (Object.hasOwn(meta, 'liReuseEligible')) {
    record(clusterId ? `dedup-li-reuse:${clusterId}` : 'dedup-li-reuse', 'dedup');
  }

  // Policy fingerprints: the key itself changes when a policy version is bumped.
  record(`policy:embed:${EMBED_TEXT_POLICY_VERSION}`, 'dense');
  record(`policy:li:${LI_INPUT_POLICY_VERSION}`, 'li');
  record(`policy:dedup:${DEDUP_INPUT_POLICY_VERSION}`, 'dedup');

  return collected;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Replace the stored dependency rows of one chunk.
 *
 * All existing `encoder_input_dependencies` rows for
 * `(filePath, chunkStructId)` are deleted, then one row per entry of
 * `deps` is inserted with `INSERT OR IGNORE`. Nothing happens when
 * `chunkStructId` is empty. No transaction is opened here; wrapping the
 * call in one is the caller's responsibility.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 * @param {string} chunkStructId
 * @param {Array<{dependency_key:string, consumer:string}>} deps
 */
export function persistDependencies(db, filePath, chunkStructId, deps) {
  if (!chunkStructId) {
    return;
  }

  // Both statements are prepared before anything is executed.
  const clearExisting = db.prepare(`
    DELETE FROM encoder_input_dependencies
    WHERE file_path = ? AND chunk_struct_id = ?
  `);
  const addRow = db.prepare(`
    INSERT OR IGNORE INTO encoder_input_dependencies
    (dependency_key, file_path, chunk_struct_id, consumer)
    VALUES (?, ?, ?, ?)
  `);

  clearExisting.run(filePath, chunkStructId);
  for (const { dependency_key: dependencyKey, consumer } of deps) {
    addRow.run(dependencyKey, filePath, chunkStructId, consumer);
  }
}
|
|
151
|
+
|
|
152
|
+
/**
 * Find every `(file_path, chunk_struct_id, consumer)` row registered
 * against any of the given dependency keys.
 *
 * Keys are queried in batches of at most
 * `MAX_DEPENDENCY_KEYS_PER_QUERY`. Rows within a batch come back sorted
 * by the query's `ORDER BY`; a row already returned by an earlier batch
 * is not repeated. The combined result is therefore duplicate-free but
 * only sorted per batch.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} keys
 * @returns {Array<{file_path:string, chunk_struct_id:string, consumer:string}>}
 */
export function dependentsOf(db, keys) {
  if (!Array.isArray(keys) || keys.length === 0) {
    return [];
  }

  const dependents = [];
  const emitted = new Set();
  let offset = 0;

  while (offset < keys.length) {
    const batch = keys.slice(offset, offset + MAX_DEPENDENCY_KEYS_PER_QUERY);
    offset += MAX_DEPENDENCY_KEYS_PER_QUERY;

    // One `?` per key; the keys themselves are always bound, never inlined.
    const placeholders = Array.from(batch, () => '?').join(',');
    const statement = db.prepare(`
      SELECT DISTINCT file_path, chunk_struct_id, consumer
      FROM encoder_input_dependencies
      WHERE dependency_key IN (${placeholders})
      ORDER BY file_path, chunk_struct_id, consumer
    `);

    for (const row of statement.all(...batch)) {
      const identity = `${row.file_path}\0${row.chunk_struct_id}\0${row.consumer}`;
      if (!emitted.has(identity)) {
        emitted.add(identity);
        dependents.push(row);
      }
    }
  }

  return dependents;
}
|
|
183
|
+
|
|
184
|
+
/**
 * Delete every `encoder_input_dependencies` row recorded for a file.
 * Per the original note, this is used when a file is deleted or its
 * structural identity has been replaced wholesale.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} filePath
 */
export function forgetFile(db, filePath) {
  const purge = db.prepare('DELETE FROM encoder_input_dependencies WHERE file_path = ?');
  purge.run(filePath);
}
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Exact encoder-input hashes for the dense, LI, and dedup consumers.
|
|
3
|
+
*
|
|
4
|
+
* Plan § 7.2 + § 7.2.1 + § 12.4. The reconcile path must reuse a previously
|
|
5
|
+
* computed dense embedding only when the **exact bytes** sent to the encoder
|
|
6
|
+
* have not changed. The "bytes" depend on far more than the raw chunk body:
|
|
7
|
+
*
|
|
8
|
+
* * Dense (`embedding_text`):
|
|
9
|
+
* - chunk body
|
|
10
|
+
* - relative path, language, chunk type, primary symbol
|
|
11
|
+
* - parent symbol + parent type
|
|
12
|
+
* - AST signature (when the active variant uses signatures)
|
|
13
|
+
* - sibling `additional_symbols`
|
|
14
|
+
* - active embedding-text policy fingerprint
|
|
15
|
+
* - same-file scope / defines / uses enrichment
|
|
16
|
+
*
|
|
17
|
+
* * Late-interaction (`pickLiInput`):
|
|
18
|
+
* - same body
|
|
19
|
+
* - language-routed input variant (Python and Java-family use
|
|
20
|
+
* `li_text`; JS/TS/etc. use the greedy form)
|
|
21
|
+
* - LI input policy fingerprint
|
|
22
|
+
*
|
|
23
|
+
* * Dedup (`simhash` / `clusterId` / `exemplarId` / `liReuseEligible`):
|
|
24
|
+
* - chunk content normalised for dedup
|
|
25
|
+
* - cluster exemplar identity (a member whose exemplar changed is
|
|
26
|
+
* "dedup-dirty" even if its own content is stable)
|
|
27
|
+
*
|
|
28
|
+
* This module is pure domain logic. It accepts plain JS objects and returns
|
|
29
|
+
* 16-hex-char hash strings. The Phase 1 reconciler calls these helpers
|
|
30
|
+
* **after** the existing graph-enrichment pass has built `embedding_text` /
|
|
31
|
+
* `li_text` / `li_greedy_text` on each chunk, so the inputs to the hashers
|
|
32
|
+
* are already metadata-aware.
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
import { contentHashSync, stableStringify, metadataFingerprint } from '../infrastructure/hashing.mjs';
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Policy fingerprint constants. Bump these when the encoder-text recipe
|
|
39
|
+
* changes in a way that invalidates cached payloads. The values are
|
|
40
|
+
* intentionally simple integers; reconcile mixes them into the hash inputs
|
|
41
|
+
* so changing the variant flushes the cache without renaming columns.
|
|
42
|
+
*
|
|
43
|
+
* EMBED_TEXT_POLICY_VERSION mirrors the cold-path `pipelineVersion` bump in
|
|
44
|
+
* `core/indexing/incremental-tracker.js::buildConfigFingerprint`; LI policy
|
|
45
|
+
* has its own counter because LI input routing can change independently of
|
|
46
|
+
* dense input.
|
|
47
|
+
*/
|
|
48
|
+
export const EMBED_TEXT_POLICY_VERSION = 1; // dense recipe version; mixed into denseInputHash() and the metadata fingerprint
|
|
49
|
+
export const LI_INPUT_POLICY_VERSION = 1; // late-interaction recipe version; mixed into liInputHash() and the metadata fingerprint
|
|
50
|
+
export const DEDUP_INPUT_POLICY_VERSION = 1; // dedup recipe version; mixed into dedupFingerprint()
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Language taxonomy used by `pickLiInput`. The reconcile path mirrors the
|
|
54
|
+
* production helper at `core/indexing/indexer-ann.js`; we keep a local
|
|
55
|
+
* mirror so the incremental domain logic remains dependency-light. Any
|
|
56
|
+
* change to the production taxonomy must update this table AND bump
|
|
57
|
+
* LI_INPUT_POLICY_VERSION.
|
|
58
|
+
*/
|
|
59
|
+
const LI_TEXT_LANGS = new Set(['python', 'java', 'php', 'csharp', 'c#', 'kotlin', 'scala']);
|
|
60
|
+
// Everything else (JS/TS/JSX/TSX, Ruby, Go, C/C++/Rust, unknown) uses
|
|
61
|
+
// li_greedy_text → embedding_text → li_text.
|
|
62
|
+
|
|
63
|
+
/**
 * Strip the generated `_<hex>` suffix (six or more lowercase hex digits)
 * that sits immediately before the file extension, for use as the LI
 * Java-family path slug. Mirrors
 * `core/indexing/ast-chunker.js::normalizePathSlug`. Only a suffix at the
 * very end of the path is removed; anything else is left untouched.
 *
 * @param {string} relativePath
 * @returns {string} The normalised path. Falsy inputs are returned unchanged.
 */
export function normalizePathSlug(relativePath) {
  if (relativePath) {
    const generatedSuffix = /_[0-9a-f]{6,}(\.[a-zA-Z0-9]+)$/;
    // Keep only the captured extension, dropping the `_<hex>` part.
    return String(relativePath).replace(generatedSuffix, (_whole, extension) => extension);
  }
  return relativePath;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Choose the text that feeds the late-interaction encoder for a chunk,
 * following the language taxonomy. Mirrors `pickLiInput()` in
 * `core/indexing/indexer-ann.js`.
 *
 * Routing:
 *   - language in LI_TEXT_LANGS: `li_text`, else the dense text
 *   - anything else:             `li_greedy_text`, else the dense text,
 *                                else `li_text`
 * where "dense text" is `embedding_text`, then `text`, then `content`.
 *
 * @param {object} chunk  Enriched chunk carrying `metadata.language`,
 *                        `li_text`, `li_greedy_text`, `embedding_text`.
 * @returns {string} The selected text, or '' when nothing is available.
 */
export function pickLiInputText(chunk) {
  if (!chunk) return '';

  const denseText = chunk.embedding_text || chunk.text || chunk.content || '';
  const routesToLiText = LI_TEXT_LANGS.has((chunk.metadata || {}).language);

  // For the `li_text` route, Python's path-line omission and the Java-family
  // slug-stripping were already applied when the chunker built `li_text`;
  // re-deriving either here would introduce a second, diverging policy.
  return routesToLiText
    ? chunk.li_text || denseText
    : chunk.li_greedy_text || denseText || chunk.li_text || '';
}
|
|
101
|
+
|
|
102
|
+
/**
 * Hash the exact text handed to the dense encoder, producing the
 * `embedding_input_hash`. The text is the chunk's `embedding_text` (falling
 * back to `text`, then `content`) with EMBED_TEXT_POLICY_VERSION appended,
 * so bumping the policy version changes every hash.
 *
 * @param {object} chunk
 * @returns {string} The hash, or '' when `chunk` is falsy.
 */
export function denseInputHash(chunk) {
  if (!chunk) return '';

  const encoderText = chunk.embedding_text || chunk.text || chunk.content || '';
  const hashInput = `${encoderText}|policy:${EMBED_TEXT_POLICY_VERSION}`;
  return contentHashSync(hashInput);
}
|
|
115
|
+
|
|
116
|
+
/**
 * Hash the late-interaction encoder input, producing the `li_input_hash`.
 * The text comes from `pickLiInputText` (language-routed) and is combined
 * with LI_INPUT_POLICY_VERSION before hashing.
 *
 * @param {object} chunk
 * @returns {string} The hash, or '' when `chunk` is falsy.
 */
export function liInputHash(chunk) {
  return chunk
    ? contentHashSync(`${pickLiInputText(chunk)}|policy:${LI_INPUT_POLICY_VERSION}`)
    : '';
}
|
|
128
|
+
|
|
129
|
+
/**
 * Hash the dedup-relevant metadata of a chunk. Plan § 7.2.1 treats dedup as
 * a separate consumer so that a cluster member is re-aliased when its
 * exemplar changes, even if the member's own `embedding_input_hash` and
 * `li_input_hash` are unchanged.
 *
 * Signals read from `chunk.metadata` (a missing value is hashed as null):
 *   - `simhash`          chunk-local; recomputed when content changes
 *   - `clusterId`        cluster membership
 *   - `exemplarId`       target of the alias
 *   - `isExemplar`       whether this row is the exemplar
 *   - `aliasJaccard`     similarity score; informational
 *   - `liReuseEligible`  drives the LI-side alias decision
 *
 * These fields plus DEDUP_INPUT_POLICY_VERSION are serialised with
 * `stableStringify` and hashed.
 *
 * @param {object} chunk
 * @returns {string}
 */
export function dedupFingerprint(chunk) {
  const meta = chunk?.metadata || {};
  const signalNames = [
    'simhash',
    'clusterId',
    'exemplarId',
    'isExemplar',
    'aliasJaccard',
    'liReuseEligible',
  ];

  const signals = {};
  for (const name of signalNames) {
    // `??` rather than `||`: false and 0 are meaningful values here.
    signals[name] = meta[name] ?? null;
  }
  signals.policy = DEDUP_INPUT_POLICY_VERSION;

  return contentHashSync(stableStringify(signals));
}
|
|
161
|
+
|
|
162
|
+
/**
 * Resolve the relative path used in the metadata fingerprint. Candidates are
 * tried in priority order and the first one `firstSafeRelativePath` accepts
 * is returned.
 *
 * @param {object} chunk
 * @param {object} [meta]  Defaults to `chunk.metadata` (or an empty object).
 * @returns {string|null}
 */
function chunkRelativePath(chunk, meta = chunk?.metadata || {}) {
  const candidatesByPriority = [
    meta.relative_path,
    meta.path,
    meta.file_path,
    chunk?.file,
    meta.file,
  ];
  return firstSafeRelativePath(...candidatesByPriority);
}
|
|
171
|
+
|
|
172
|
+
/**
 * Return the first candidate that is a usable repository-relative path, in
 * normalised form, or null when none qualifies.
 *
 * Normalisation: backslashes become `/`, runs of `/` collapse to one, and a
 * single leading `./` is dropped. Slashes are collapsed BEFORE the `./` is
 * stripped so that `.//a` normalises to `a` instead of being mistaken for
 * the absolute path `/a`.
 *
 * A candidate is skipped when it is not a string, is empty or `.`, is
 * absolute (leading `/` or a `X:/` drive prefix), or contains a `..` segment
 * anywhere, including a trailing `x/..`.
 *
 * @param {...*} candidates  Values to try, in priority order.
 * @returns {string|null}
 */
function firstSafeRelativePath(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;

    const normalized = candidate
      .replace(/\\/g, '/')
      .replace(/\/+/g, '/')
      .replace(/^\.\//, '');

    if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
    // Windows drive-letter absolute path, e.g. `C:/repo/file.js`.
    if (/^[A-Za-z]:\//.test(normalized)) continue;
    // Segment-wise check also covers the trailing `x/..` form.
    if (normalized.split('/').includes('..')) continue;

    return normalized;
  }
  return null;
}
|
|
183
|
+
|
|
184
|
+
/**
 * One-stop helper: compute the raw `chunk_text_hash`, the dense and LI
 * encoder-input hashes, the `metadata_fingerprint`, and the dedup
 * fingerprint for a chunk.
 *
 * `metadata_fingerprint` here is the "encoder-input-affecting metadata hash"
 * required by plan § 7.2 / § 33 for the column of the same name. It hashes
 * the path, language, chunk type, symbols, signature and both encoder policy
 * versions. It is NOT the dedup fingerprint: both must be stored when the
 * reconcile path lands the row (the dedup fingerprint is carried inside the
 * encoder-input dependency registry; see
 * `core/incremental-indexing/domain/encoder-deps.mjs`).
 *
 * @param {object} chunk
 * @returns {{chunk_text_hash:string, embedding_input_hash:string, li_input_hash:string, metadata_fingerprint:string, dedup_fingerprint:string}}
 */
export function chunkInputHashes(chunk) {
  const rawText = chunk?.content || chunk?.text || '';
  const meta = chunk?.metadata || {};
  const orNull = (value) => value ?? null;

  const chunkTextHash = contentHashSync(rawText);
  const embeddingInputHash = denseInputHash(chunk);
  const liHash = liInputHash(chunk);

  // Only metadata that influences the encoder inputs belongs in this record.
  const encoderMetadata = {
    relative_path: chunkRelativePath(chunk, meta),
    language: orNull(meta.language),
    chunk_type: orNull(meta.chunk_type),
    symbol: orNull(meta.symbol),
    parent_symbol: orNull(meta.parent_symbol),
    parent_type: orNull(meta.parent_type),
    signature: orNull(meta.signature),
    additional_symbols: orNull(meta.additional_symbols),
    policy_embed: EMBED_TEXT_POLICY_VERSION,
    policy_li: LI_INPUT_POLICY_VERSION,
  };
  const metadataHash = contentHashSync(stableStringify(encoderMetadata));

  return {
    chunk_text_hash: chunkTextHash,
    embedding_input_hash: embeddingInputHash,
    li_input_hash: liHash,
    metadata_fingerprint: metadataHash,
    dedup_fingerprint: dedupFingerprint(chunk),
  };
}
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Re-export metadataFingerprint for callers that want the generic stable-
|
|
223
|
+
* stringify hash without the encoder-specific field selection.
|
|
224
|
+
*/
|
|
225
|
+
export { metadataFingerprint };
|