sweet-search 2.4.2 → 2.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +43 -5
- package/core/embedding/embedding-cache.js +266 -18
- package/core/embedding/embedding-service.js +45 -9
- package/core/graph/graph-expansion.js +52 -12
- package/core/graph/graph-extractor.js +30 -1
- package/core/indexing/ast-chunker.js +331 -16
- package/core/indexing/chunking/chunk-builder.js +34 -1
- package/core/indexing/index-codebase-v21.js +31 -2
- package/core/indexing/index.js +6 -3
- package/core/indexing/indexer-ann.js +45 -6
- package/core/indexing/indexer-build.js +9 -1
- package/core/indexing/indexer-phases.js +6 -4
- package/core/indexing/indexing-file-policy.js +140 -0
- package/core/indexing/li-skip-policy.js +11 -220
- package/core/infrastructure/codebase-repository.js +21 -0
- package/core/infrastructure/config/embedding.js +20 -1
- package/core/infrastructure/config/graph.js +2 -2
- package/core/infrastructure/config/ranking.js +10 -0
- package/core/infrastructure/config/vector-store.js +1 -1
- package/core/infrastructure/coreml-cascade.js +236 -30
- package/core/infrastructure/coreml-cascade.json +25 -0
- package/core/infrastructure/index.js +17 -0
- package/core/infrastructure/init-config.js +216 -0
- package/core/infrastructure/language-patterns/registry-core.js +18 -0
- package/core/infrastructure/model-registry.js +12 -0
- package/core/infrastructure/native-inference.js +143 -51
- package/core/infrastructure/tree-sitter-provider.js +92 -2
- package/core/ranking/cascaded-scorer.js +6 -2
- package/core/ranking/file-kind-ranking.js +264 -0
- package/core/ranking/late-interaction-index.js +10 -4
- package/core/ranking/late-interaction-policy.js +304 -0
- package/core/search/context-expander.js +267 -28
- package/core/search/index.js +4 -0
- package/core/search/search-cli.js +3 -1
- package/core/search/search-pattern.js +4 -3
- package/core/search/search-postprocess.js +189 -8
- package/core/search/search-read-semantic.js +734 -0
- package/core/search/search-read.js +481 -0
- package/core/search/search-server.js +153 -5
- package/core/search/sweet-search.js +133 -16
- package/core/start-server.js +13 -2
- package/mcp/server.js +41 -0
- package/mcp/tool-handlers.js +117 -6
- package/package.json +9 -7
- package/scripts/init.js +386 -5
- package/scripts/uninstall.js +152 -6
package/core/cli.js
CHANGED
|
@@ -20,15 +20,44 @@ if (args[0] === 'init') {
|
|
|
20
20
|
} else if (args[0] === 'prewarm-vocab') {
|
|
21
21
|
const { handlePrewarmVocabCli } = await import('./vocabulary/index.js');
|
|
22
22
|
await handlePrewarmVocabCli(args.slice(1));
|
|
23
|
+
} else if (args[0] === 'read') {
|
|
24
|
+
// Filesystem-grounded reader; runs in JS (no native equivalent yet).
|
|
25
|
+
const { handleReadCli } = await import('./search/search-read.js');
|
|
26
|
+
await handleReadCli(args.slice(1));
|
|
27
|
+
} else if (args[0] === 'read-semantic') {
|
|
28
|
+
// Hybrid span-selection reader; runs in JS (depends on LI index + ranking).
|
|
29
|
+
const { handleReadSemanticCli } = await import('./search/search-read-semantic.js');
|
|
30
|
+
await handleReadSemanticCli(args.slice(1));
|
|
31
|
+
} else if (args[0] === 'index') {
|
|
32
|
+
// Indexing pipeline. Forwarded to index-codebase-v21.js::main(), which
|
|
33
|
+
// reads its own flags via process.argv. Setting argv here is required
|
|
34
|
+
// because the indexer's parseArgs reads process.argv.slice(2) by default.
|
|
35
|
+
// Without this subcommand, npm-installed users had no way to invoke
|
|
36
|
+
// indexing — `node ./node_modules/sweet-search/core/indexing/index-codebase-v21.js`
|
|
37
|
+
// was a silent no-op (direct-run guard mismatched under symlinked installs)
|
|
38
|
+
// and the bin had no `index` entry at all. Forwards every argument after
|
|
39
|
+
// `index` so existing flag combos (--full / --graph-only / --vectors-only /
|
|
40
|
+
// --files-from-stdin / --late-interaction-model=… / etc.) all work.
|
|
41
|
+
const indexerArgs = args.slice(1);
|
|
42
|
+
process.argv = [process.argv[0], 'index-codebase-v21.js', ...indexerArgs];
|
|
43
|
+
const { main: runIndexer } = await import('./indexing/index-codebase-v21.js');
|
|
44
|
+
await runIndexer();
|
|
45
|
+
} else if (args[0] === '--serve' || args[0] === '--stop') {
|
|
46
|
+
// Warm search server lifecycle is implemented in JS.
|
|
47
|
+
const { runCli } = await import('./search/index.js');
|
|
48
|
+
await runCli(args);
|
|
23
49
|
} else if (args[0] === '--help' || args[0] === '-h' || args.length === 0) {
|
|
24
50
|
console.log(`sweet-search — hybrid code search engine
|
|
25
51
|
|
|
26
52
|
Usage:
|
|
27
|
-
sweet-search <query>
|
|
28
|
-
sweet-search
|
|
29
|
-
sweet-search
|
|
30
|
-
sweet-search
|
|
31
|
-
sweet-search
|
|
53
|
+
sweet-search <query> Search the indexed codebase
|
|
54
|
+
sweet-search read <file...> Filesystem-grounded read (1-20 files)
|
|
55
|
+
sweet-search read-semantic <f> <q> Return only file spans relevant to a query
|
|
56
|
+
sweet-search index [options] Build / update the codebase index
|
|
57
|
+
sweet-search init [options] Set up runtime assets and models
|
|
58
|
+
sweet-search uninstall [opts] Remove local state created by init
|
|
59
|
+
sweet-search prewarm-vocab [file] Pre-warm vocabulary cache with terms
|
|
60
|
+
sweet-search --help Show this help
|
|
32
61
|
|
|
33
62
|
Options:
|
|
34
63
|
--mode <mode> Search mode: auto, lexical, semantic, hybrid, pattern
|
|
@@ -36,6 +65,15 @@ Options:
|
|
|
36
65
|
--json Output results as JSON
|
|
37
66
|
--cold Force cold start (skip warm server)
|
|
38
67
|
|
|
68
|
+
Indexing flags (sweet-search index ...):
|
|
69
|
+
--full Full reindex from scratch
|
|
70
|
+
--graph-only Build code graph only
|
|
71
|
+
--vectors-only Build vectors + HNSW only (skips code graph)
|
|
72
|
+
--files-from-stdin Read newline-delimited paths from stdin
|
|
73
|
+
--late-interaction-model=ID Override the LI variant for this run
|
|
74
|
+
--no-late-interaction Skip LI index build
|
|
75
|
+
--quiet | --verbose Logging verbosity
|
|
76
|
+
|
|
39
77
|
Run 'sweet-search init --help' or 'sweet-search uninstall --help' for subcommand options.`);
|
|
40
78
|
} else {
|
|
41
79
|
const { resolveNativeBinary } = await import('./infrastructure/index.js');
|
|
@@ -45,6 +45,7 @@ export class LRUCache {
|
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
has(key) { return this.cache.has(key); }
|
|
48
|
+
delete(key) { this.hitCount.delete(key); return this.cache.delete(key); }
|
|
48
49
|
getHitCount(key) { return this.hitCount.get(key) || 0; }
|
|
49
50
|
size() { return this.cache.size; }
|
|
50
51
|
clear() { this.cache.clear(); this.hitCount.clear(); }
|
|
@@ -60,6 +61,67 @@ export class LRUCache {
|
|
|
60
61
|
}
|
|
61
62
|
}
|
|
62
63
|
|
|
64
|
+
// =============================================================================
|
|
65
|
+
// ATOMIC JSON WRITER — serialised tmp-file-and-rename
|
|
66
|
+
// =============================================================================
|
|
67
|
+
//
|
|
68
|
+
// Both Vocabulary.save() and QueryStats.save() are fired as background
|
|
69
|
+
// promises from the embedding hot path (`vocabulary.save().catch(()=>{})`
|
|
70
|
+
// inside getEmbedding). Under concurrent benchmarks (12+ in-flight queries)
|
|
71
|
+
// multiple save() calls overlap on the same file. The previous direct
|
|
72
|
+
// `fs.writeFile` was non-atomic — interleaving writes produced invalid JSON,
|
|
73
|
+
// which then poisoned every subsequent `.load()` call and silently degraded
|
|
74
|
+
// retrieval quality to near-zero.
|
|
75
|
+
//
|
|
76
|
+
// `writeJsonAtomic` writes to a unique temp file then atomically renames it
|
|
77
|
+
// into place. `serialiseAtomicWrite` chains an instance's writes so at most
|
|
78
|
+
// one is in flight at a time — and at most one extra coalesced write is
|
|
79
|
+
// queued behind the in-flight one. Bursts of N saves collapse into 2 writes,
|
|
80
|
+
// each producing a fully-formed file.
|
|
81
|
+
|
|
82
|
+
/**
 * Write `json` to a uniquely named temp file next to `targetPath`, then
 * rename it into place. The parent directory is created when missing.
 *
 * @param {string} targetPath - final location of the file
 * @param {string} json - already-serialised payload handed to `fs.writeFile`
 * @returns {Promise<void>}
 * @throws rethrows the write/rename error after a best-effort temp-file cleanup
 */
async function writeJsonAtomic(targetPath, json) {
  const parentDir = path.dirname(targetPath);
  await fs.mkdir(parentDir, { recursive: true });

  // pid + timestamp + random suffix keeps overlapping writers on distinct temp files.
  const stamp = Date.now().toString(36);
  const nonce = Math.random().toString(36).slice(2, 8);
  const scratchPath = `${targetPath}.tmp.${process.pid}.${stamp}.${nonce}`;

  const discardScratch = async () => {
    try {
      await fs.unlink(scratchPath);
    } catch {
      /* may not exist */
    }
  };

  try {
    await fs.writeFile(scratchPath, json);
    await fs.rename(scratchPath, targetPath);
  } catch (failure) {
    await discardScratch();
    throw failure;
  }
}
|
|
93
|
+
|
|
94
|
+
/**
 * Serialise calls to `produce()` for a given owner. At most one write is
 * in flight; bursts coalesce so callers arriving before a queued save has
 * started all share that single queued save.
 *
 * State lives on the owner in two slots:
 *  - `_atomicPending`  — promise for the queued (not yet started) save, or null
 *  - `_atomicInFlight` — promise for the save currently running, or null
 *
 * @param {object} owner - instance with mutable `_atomicInFlight` / `_atomicPending` slots
 * @param {() => Promise<void>} produce - async writer that captures the
 *   current state at call time and writes it atomically
 * @returns {Promise<void>} settles with the outcome of the save this caller shares
 */
function serialiseAtomicWrite(owner, produce) {
  // A save is already queued but has not started: piggy-back on it.
  if (owner._atomicPending) return owner._atomicPending;

  const previous = owner._atomicInFlight || Promise.resolve();
  owner._atomicPending = previous
    .catch(() => { /* a previous save's failure must not block the next one */ })
    .then(() => {
      // Move from "pending" to "in-flight" before doing the actual write,
      // so additional save() callers arriving during the write start a new
      // pending entry rather than piggy-backing on this one (otherwise they
      // would not see their latest state on disk).
      owner._atomicPending = null;
      // Track the SAME promise that is stored in the slot. Comparing the slot
      // against the raw produce() promise can never match, which left a
      // settled promise parked in `_atomicInFlight` forever.
      const tracked = produce().finally(() => {
        if (owner._atomicInFlight === tracked) owner._atomicInFlight = null;
      });
      owner._atomicInFlight = tracked;
      return tracked;
    });
  return owner._atomicPending;
}
|
|
124
|
+
|
|
63
125
|
// =============================================================================
|
|
64
126
|
// QUERY STATS (Cross-session usage tracking)
|
|
65
127
|
// =============================================================================
|
|
@@ -89,10 +151,20 @@ export class QueryStats {
|
|
|
89
151
|
|
|
90
152
|
async save() {
|
|
91
153
|
if (!this.dirty) return;
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
154
|
+
return serialiseAtomicWrite(this, async () => {
|
|
155
|
+
// Re-check dirty inside the queued task: if a coalesced earlier write
|
|
156
|
+
// already persisted everything, there is nothing left to do.
|
|
157
|
+
if (!this.dirty) return;
|
|
158
|
+
const data = { queries: Object.fromEntries(this.stats), lastUpdated: new Date().toISOString() };
|
|
159
|
+
this.dirty = false;
|
|
160
|
+
try {
|
|
161
|
+
await writeJsonAtomic(this.statsPath, JSON.stringify(data));
|
|
162
|
+
} catch (err) {
|
|
163
|
+
// Re-mark dirty so a future save retries this state
|
|
164
|
+
this.dirty = true;
|
|
165
|
+
throw err;
|
|
166
|
+
}
|
|
167
|
+
});
|
|
96
168
|
}
|
|
97
169
|
|
|
98
170
|
increment(query) {
|
|
@@ -112,28 +184,169 @@ export class QueryStats {
|
|
|
112
184
|
// VOCABULARY
|
|
113
185
|
// =============================================================================
|
|
114
186
|
|
|
187
|
+
// Schema version for the persisted vocabulary file. Bump when the on-disk
|
|
188
|
+
// shape changes in a way that should invalidate previously-saved files.
|
|
189
|
+
// v2: { metadata: { created, lastUpdated, version, provider }, terms: {...} }
|
|
190
|
+
// v3: { metadata: { ..., model, dimension, schemaVersion: 3 }, terms: {...} }
|
|
191
|
+
// — adds full embedding fingerprint so a cache produced under one
|
|
192
|
+
// model is not silently served when a different model is active.
|
|
193
|
+
const VOCAB_SCHEMA_VERSION = 3;
|
|
194
|
+
|
|
195
|
+
/**
 * Coerce an input value into a Float32Array suitable for downstream embedding
 * math (truncateForHNSW, late-interaction MaxSim, cosine similarity).
 *
 * Why this exists: persisted vocabularies are JSON-serialised. JSON.stringify
 * on a Float32Array produces an indexed object `{"0": v0, "1": v1, ...}`,
 * not an array. After `JSON.parse`, the value has `.length === undefined`,
 * `.slice === undefined`, and crashes any downstream consumer that calls
 * vector methods. This helper repairs the value at the cache boundary so
 * the rest of the embedding pipeline can rely on a uniform vector contract.
 *
 * Accepted inputs:
 *   - Float32Array                 → returned as-is (not re-validated)
 *   - Array<number>                → copied into a Float32Array
 *   - Other numeric typed arrays   → copied into a Float32Array
 *   - Plain object with stringly-keyed numeric indices ("0","1",...,"N-1")
 *                                  → reconstructed as Float32Array of length N
 *
 * Every element of an array, typed array or indexed object must be a finite
 * number. Anything else (strings, `null`, NaN, ±Infinity, BigInt elements,
 * holes) makes the whole value uninterpretable and yields null — never a
 * thrown error and never a vector containing NaN.
 *
 * Returns null when the input cannot be sensibly interpreted as a vector
 * (callers should drop the cache entry and re-derive).
 *
 * @param {*} value
 * @returns {Float32Array|null}
 */
export function coerceToFloat32Vector(value) {
  if (value == null) return null;
  if (value instanceof Float32Array) return value;

  const isFiniteNumber = (v) => typeof v === 'number' && Number.isFinite(v);

  // Arrays and non-Float32 typed arrays: validate, then copy. The explicit
  // index loop (rather than `every`) also rejects holes in sparse arrays, and
  // the typeof check rejects BigInt elements that would make
  // Float32Array.from throw.
  const isIndexedView = ArrayBuffer.isView(value) && typeof value.length === 'number';
  if (Array.isArray(value) || isIndexedView) {
    for (let i = 0; i < value.length; i++) {
      if (!isFiniteNumber(value[i])) return null;
    }
    return Float32Array.from(value);
  }

  // Plain object form from a JSON-deserialised Float32Array.
  if (typeof value === 'object') {
    const keys = Object.keys(value);
    if (keys.length === 0) return null;
    // All keys must be string-encoded non-negative integers and contiguous
    // from 0 to length-1. (We do not try to "fill gaps" — that would silently
    // mask a real bug.)
    const slots = new Array(keys.length);
    for (let i = 0; i < keys.length; i++) {
      const k = keys[i];
      // Reject anything that isn't an integer-shaped key.
      if (!/^\d+$/.test(k)) return null;
      const n = +k;
      if (!Number.isInteger(n) || n < 0 || n >= keys.length) return null;
      slots[n] = value[k];
    }
    // A slot left unfilled (gap or aliased key) is `undefined` and fails here.
    for (let i = 0; i < slots.length; i++) {
      if (!isFiniteNumber(slots[i])) return null;
    }
    return Float32Array.from(slots);
  }

  return null;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Describe the embedding setup a persisted vocabulary file is expected to
 * match: the schema version plus the provider / model / dimension read from
 * `EMBEDDING_CONFIG` at call time.
 */
function currentVocabFingerprint() {
  const { provider, model, dimension } = EMBEDDING_CONFIG;
  return { schemaVersion: VOCAB_SCHEMA_VERSION, provider, model, dimension };
}
|
|
260
|
+
|
|
261
|
+
/**
 * Check a persisted vocabulary file's metadata against an embedding
 * fingerprint (by default the active configuration).
 *
 * Checks run in order — schema version, provider, model, dimension — and the
 * first mismatch wins. The schema version must match exactly. `provider` and
 * `model` are only compared when the file records a truthy value, and
 * `dimension` only when both sides are finite numbers.
 *
 * @param {object} metadata - the `metadata` object read from the file
 * @param {{schemaVersion: number, provider: *, model: *, dimension: *}} [fingerprint]
 * @returns {{compatible: boolean, reason?: string}}
 */
export function isVocabFingerprintCompatible(metadata, fingerprint = currentVocabFingerprint()) {
  const incompatible = (reason) => ({ compatible: false, reason });

  const hasMetadata = Boolean(metadata) && typeof metadata === 'object';
  if (!hasMetadata) return incompatible('missing-metadata');

  // Pre-v3 files store `version` rather than `schemaVersion` and carry no
  // `model` / `dimension`; they fail here because nothing ties their
  // embeddings to the active model.
  const fileSchema = metadata.schemaVersion ?? metadata.version;
  if (fileSchema !== fingerprint.schemaVersion) {
    return incompatible(`schema-version (file=${fileSchema} expected=${fingerprint.schemaVersion})`);
  }

  for (const field of ['provider', 'model']) {
    const recorded = metadata[field];
    if (recorded && recorded !== fingerprint[field]) {
      return incompatible(`${field} (file=${recorded} expected=${fingerprint[field]})`);
    }
  }

  const fileDim = metadata.dimension;
  const wantDim = fingerprint.dimension;
  const bothNumeric = Number.isFinite(fileDim) && Number.isFinite(wantDim);
  if (bothNumeric && fileDim !== wantDim) {
    return incompatible(`dimension (file=${fileDim} expected=${wantDim})`);
  }

  return { compatible: true };
}
|
|
292
|
+
|
|
115
293
|
export class Vocabulary {
|
|
116
294
|
constructor(vocabPath) {
|
|
117
295
|
this.vocabPath = vocabPath;
|
|
118
296
|
this.terms = new Map();
|
|
119
|
-
this.metadata = {
|
|
297
|
+
this.metadata = {
|
|
298
|
+
created: null,
|
|
299
|
+
lastUpdated: null,
|
|
300
|
+
schemaVersion: VOCAB_SCHEMA_VERSION,
|
|
301
|
+
provider: null,
|
|
302
|
+
model: null,
|
|
303
|
+
dimension: null,
|
|
304
|
+
};
|
|
120
305
|
this.loaded = false;
|
|
121
306
|
}
|
|
122
307
|
|
|
308
|
+
/**
|
|
309
|
+
* Whether `getEmbedding` should consult this vocabulary at all.
|
|
310
|
+
* Reads from `EMBEDDING_CONFIG.cache.useVocabulary` at call time so
|
|
311
|
+
* tests / benchmarks that toggle the env var see the change without
|
|
312
|
+
* having to re-import the module.
|
|
313
|
+
*/
|
|
314
|
+
static isEnabled() {
|
|
315
|
+
return EMBEDDING_CONFIG.cache?.useVocabulary !== false;
|
|
316
|
+
}
|
|
317
|
+
|
|
123
318
|
async load() {
|
|
124
319
|
if (this.loaded) return;
|
|
125
320
|
try {
|
|
126
321
|
if (existsSync(this.vocabPath)) {
|
|
127
322
|
const data = JSON.parse(await fs.readFile(this.vocabPath, 'utf-8'));
|
|
128
|
-
|
|
129
|
-
|
|
323
|
+
const compat = isVocabFingerprintCompatible(data.metadata);
|
|
324
|
+
if (!compat.compatible) {
|
|
325
|
+
console.log(`Vocabulary: Ignoring incompatible cache (${compat.reason})`);
|
|
130
326
|
this.terms.clear();
|
|
131
327
|
} else {
|
|
132
|
-
this.metadata = data.metadata ||
|
|
133
|
-
|
|
134
|
-
|
|
328
|
+
this.metadata = { ...this.metadata, ...(data.metadata || {}) };
|
|
329
|
+
let normalized = 0;
|
|
330
|
+
let dropped = 0;
|
|
331
|
+
for (const [term, raw] of Object.entries(data.terms || {})) {
|
|
332
|
+
// Coerce to Float32Array. Persisted vocabs JSON-serialise typed
|
|
333
|
+
// arrays as indexed objects (`{"0": v0, ...}`), which otherwise
|
|
334
|
+
// crash downstream `embedding.slice(...)` calls (see
|
|
335
|
+
// `truncateForHNSW`). Reject any entry we cannot interpret as a
|
|
336
|
+
// vector — better to re-embed than to surface a corrupt vector.
|
|
337
|
+
const vec = coerceToFloat32Vector(raw);
|
|
338
|
+
if (vec) {
|
|
339
|
+
this.terms.set(term, vec);
|
|
340
|
+
normalized++;
|
|
341
|
+
} else {
|
|
342
|
+
dropped++;
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
if (dropped > 0) {
|
|
346
|
+
console.log(`Vocabulary: Loaded ${normalized} pre-computed embeddings (dropped ${dropped} unrecognised)`);
|
|
347
|
+
} else {
|
|
348
|
+
console.log(`Vocabulary: Loaded ${normalized} pre-computed embeddings`);
|
|
135
349
|
}
|
|
136
|
-
console.log(`Vocabulary: Loaded ${this.terms.size} pre-computed embeddings`);
|
|
137
350
|
}
|
|
138
351
|
}
|
|
139
352
|
} catch (err) {
|
|
@@ -143,20 +356,55 @@ export class Vocabulary {
|
|
|
143
356
|
}
|
|
144
357
|
|
|
145
358
|
async save() {
|
|
146
|
-
this
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
359
|
+
return serialiseAtomicWrite(this, async () => {
|
|
360
|
+
// Snapshot mutable state INSIDE the queued task so the file written
|
|
361
|
+
// here matches the latest set/has state at write time, not whatever
|
|
362
|
+
// it was when this save() was first scheduled.
|
|
363
|
+
this.metadata.lastUpdated = new Date().toISOString();
|
|
364
|
+
this.metadata.schemaVersion = VOCAB_SCHEMA_VERSION;
|
|
365
|
+
this.metadata.provider = EMBEDDING_CONFIG.provider;
|
|
366
|
+
this.metadata.model = EMBEDDING_CONFIG.model;
|
|
367
|
+
this.metadata.dimension = EMBEDDING_CONFIG.dimension;
|
|
368
|
+
if (!this.metadata.created) this.metadata.created = this.metadata.lastUpdated;
|
|
369
|
+
// Normalise to plain arrays so JSON.stringify produces a compact,
|
|
370
|
+
// round-trippable form. Float32Array would otherwise serialise as
|
|
371
|
+
// an indexed object ({"0": v0, "1": v1, ...}) which load() can read
|
|
372
|
+
// (via coerceToFloat32Vector) but which is wasteful and was the
|
|
373
|
+
// shape that originally caused the `embedding.slice` bug.
|
|
374
|
+
const termsOut = {};
|
|
375
|
+
for (const [term, vec] of this.terms.entries()) {
|
|
376
|
+
termsOut[term] = vec instanceof Float32Array || ArrayBuffer.isView(vec)
|
|
377
|
+
? Array.from(vec)
|
|
378
|
+
: vec;
|
|
379
|
+
}
|
|
380
|
+
const data = { metadata: this.metadata, terms: termsOut };
|
|
381
|
+
await writeJsonAtomic(this.vocabPath, JSON.stringify(data, null, 2));
|
|
382
|
+
});
|
|
152
383
|
}
|
|
153
384
|
|
|
154
|
-
get(term) {
|
|
385
|
+
get(term) {
|
|
386
|
+
if (!Vocabulary.isEnabled()) return null;
|
|
387
|
+
return this.terms.get(this.normalize(term)) || null;
|
|
388
|
+
}
|
|
155
389
|
set(term, embedding) { this.terms.set(this.normalize(term), embedding); }
|
|
156
390
|
has(term) { return this.terms.has(this.normalize(term)); }
|
|
391
|
+
delete(term) { return this.terms.delete(this.normalize(term)); }
|
|
157
392
|
normalize(term) { return term.toLowerCase().trim(); }
|
|
158
393
|
size() { return this.terms.size; }
|
|
159
394
|
|
|
395
|
+
/**
|
|
396
|
+
* Whether the vocabulary is at or above the configured max-terms cap.
|
|
397
|
+
* Auto-expansion in `getEmbedding` is gated on this so a long-running
|
|
398
|
+
* benchmark cannot inflate the file unbounded. Explicit
|
|
399
|
+
* `addToVocabulary` / `expandVocabulary` calls (admin / pre-warm
|
|
400
|
+
* paths) bypass the cap.
|
|
401
|
+
*/
|
|
402
|
+
isFull() {
|
|
403
|
+
const cap = EMBEDDING_CONFIG.cache?.maxTerms;
|
|
404
|
+
if (!Number.isFinite(cap) || cap <= 0) return false;
|
|
405
|
+
return this.terms.size >= cap;
|
|
406
|
+
}
|
|
407
|
+
|
|
160
408
|
async addDefaultTerms(embedFn) {
|
|
161
409
|
const defaultTerms = [
|
|
162
410
|
'AuthService', 'EmployeeService', 'LoginService', 'UserService',
|
|
@@ -45,6 +45,7 @@ import {
|
|
|
45
45
|
queryDeduplicator,
|
|
46
46
|
queryStats,
|
|
47
47
|
cacheStats,
|
|
48
|
+
coerceToFloat32Vector,
|
|
48
49
|
getCacheStats as _getCacheStats,
|
|
49
50
|
getSemanticCacheStats,
|
|
50
51
|
clearCache,
|
|
@@ -63,6 +64,9 @@ export { TimeWindowRateLimiter };
|
|
|
63
64
|
// UNIFIED EMBEDDING SERVICE (hub functions)
|
|
64
65
|
// =============================================================================
|
|
65
66
|
|
|
67
|
+
// Process-scoped flag so the "vocab is full" message logs once, not per-query.
|
|
68
|
+
let _vocabFullLogged = false;
|
|
69
|
+
|
|
66
70
|
/** Generate embedding using the active provider with circuit breaker */
|
|
67
71
|
async function generateEmbedding(text, provider = EMBEDDING_CONFIG.provider, isQuery = false) {
|
|
68
72
|
const localText = isQuery ? applyLocalQueryPrefix(text) : text;
|
|
@@ -202,17 +206,38 @@ export async function getEmbedding(text, options = {}) {
|
|
|
202
206
|
if (useCache && EMBEDDING_CONFIG.cache?.enabled) {
|
|
203
207
|
const cached = queryCache.get(cacheKey);
|
|
204
208
|
if (cached) {
|
|
205
|
-
|
|
206
|
-
|
|
209
|
+
// Defensive guard: a cache value MUST be a vector with .length and
|
|
210
|
+
// .slice. Persisted vocabularies that round-tripped through JSON
|
|
211
|
+
// produce indexed-object shapes which crash downstream consumers.
|
|
212
|
+
// Coerce; if unrecoverable, drop the entry and fall through.
|
|
213
|
+
const cachedVec = coerceToFloat32Vector(cached);
|
|
214
|
+
if (cachedVec) {
|
|
215
|
+
if (cachedVec !== cached) queryCache.set(cacheKey, cachedVec);
|
|
216
|
+
cacheStats.hits++;
|
|
217
|
+
return { embedding: cachedVec, cached: true, source: 'lru', latency_us: Math.round((performance.now() - start) * 1000) };
|
|
218
|
+
}
|
|
219
|
+
queryCache.delete?.(cacheKey);
|
|
220
|
+
console.warn(`[embedding] LRU cache held non-vector for "${cacheKey.slice(0, 60)}"; regenerating`);
|
|
207
221
|
}
|
|
208
222
|
|
|
209
|
-
if (isQuery) {
|
|
223
|
+
if (isQuery && EMBEDDING_CONFIG.cache?.useVocabulary !== false) {
|
|
210
224
|
await vocabulary.load();
|
|
211
225
|
const vocabHit = vocabulary.get(text);
|
|
212
226
|
if (vocabHit) {
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
227
|
+
const vocabVec = coerceToFloat32Vector(vocabHit);
|
|
228
|
+
if (vocabVec) {
|
|
229
|
+
// Backfill the in-memory vocab map with the typed-array form so
|
|
230
|
+
// subsequent hits skip re-coercion.
|
|
231
|
+
if (vocabVec !== vocabHit) vocabulary.set?.(text, vocabVec);
|
|
232
|
+
cacheStats.vocabularyHits++;
|
|
233
|
+
queryCache.set(cacheKey, vocabVec);
|
|
234
|
+
return { embedding: vocabVec, cached: true, source: 'vocabulary', latency_us: Math.round((performance.now() - start) * 1000) };
|
|
235
|
+
}
|
|
236
|
+
// Unrecoverable vocab entry — drop it and continue. (load() now
|
|
237
|
+
// normalises on read, so this branch should be unreachable in
|
|
238
|
+
// practice; it is the belt-and-braces for older code paths.)
|
|
239
|
+
vocabulary.delete?.(text);
|
|
240
|
+
console.warn(`[embedding] vocabulary held non-vector for "${text.slice(0, 60)}"; dropping and regenerating`);
|
|
216
241
|
}
|
|
217
242
|
}
|
|
218
243
|
}
|
|
@@ -259,9 +284,20 @@ export async function getEmbedding(text, options = {}) {
|
|
|
259
284
|
queryStats.save().catch(() => {});
|
|
260
285
|
const threshold = EMBEDDING_CONFIG.cache?.expansionThreshold || 3;
|
|
261
286
|
if (usageCount >= threshold && !vocabulary.has(text)) {
|
|
262
|
-
vocabulary.
|
|
263
|
-
|
|
264
|
-
|
|
287
|
+
if (vocabulary.isFull()) {
|
|
288
|
+
// Cap reached: skip auto-promotion and log once per process (the
|
|
289
|
+
// queryStats counter still increments so we don't lose the
|
|
290
|
+
// signal — explicit `addToVocabulary` can still write through).
|
|
291
|
+
if (!_vocabFullLogged) {
|
|
292
|
+
const cap = EMBEDDING_CONFIG.cache?.maxTerms;
|
|
293
|
+
console.log(`Vocabulary: Auto-expand cap reached (${cap} terms); skipping further auto-promotion. Override via SWEET_SEARCH_VOCAB_MAX_TERMS.`);
|
|
294
|
+
_vocabFullLogged = true;
|
|
295
|
+
}
|
|
296
|
+
} else {
|
|
297
|
+
vocabulary.set(text, embedding);
|
|
298
|
+
vocabulary.save().catch(() => {});
|
|
299
|
+
console.log(`Vocabulary: Auto-added "${text}" (used ${usageCount}x)`);
|
|
300
|
+
}
|
|
265
301
|
}
|
|
266
302
|
}
|
|
267
303
|
}
|
|
@@ -276,16 +276,26 @@ export function expandResults(db, results, options = {}) {
|
|
|
276
276
|
*/
|
|
277
277
|
function collectSeedIds(db, results) {
|
|
278
278
|
const seedIds = new Set();
|
|
279
|
+
const needsLineMatch = [];
|
|
280
|
+
|
|
281
|
+
// Distinguish chunk ids from entity ids by shape: chunk ids look like
|
|
282
|
+
// `path/to/file.ext:start-end:n` (always contain `:`), entity ids are
|
|
283
|
+
// hex hashes / opaque tokens that never contain `:`. Treating chunk ids
|
|
284
|
+
// as entity ids feeds them into the relationships SQL and yields zero
|
|
285
|
+
// neighbours — which silently disabled graph expansion on HNSW results.
|
|
286
|
+
const looksLikeEntityId = (s) => typeof s === 'string' && !s.includes(':');
|
|
279
287
|
|
|
280
288
|
for (const r of results) {
|
|
281
289
|
if (r.entity_id) seedIds.add(r.entity_id);
|
|
282
290
|
else if (r.metadata?.entity_id) seedIds.add(r.metadata.entity_id);
|
|
283
|
-
else if (r.id) seedIds.add(r.id);
|
|
291
|
+
else if (r.is_expanded && r.id) seedIds.add(r.id);
|
|
292
|
+
else if (r.id && looksLikeEntityId(r.id)) seedIds.add(r.id);
|
|
293
|
+
else needsLineMatch.push(r);
|
|
284
294
|
}
|
|
285
295
|
|
|
286
|
-
if (
|
|
296
|
+
if (needsLineMatch.length === 0) return seedIds;
|
|
287
297
|
|
|
288
|
-
//
|
|
298
|
+
// Line-range fallback for chunk-id keyed results.
|
|
289
299
|
let entityLookup;
|
|
290
300
|
try {
|
|
291
301
|
entityLookup = db.prepare(`
|
|
@@ -296,18 +306,48 @@ function collectSeedIds(db, results) {
|
|
|
296
306
|
return seedIds;
|
|
297
307
|
}
|
|
298
308
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
309
|
+
// Chunk-id pattern: `path/to/file.ext:<start>-<end>:<n>`. When metadata
|
|
310
|
+
// doesn't carry file_path / line numbers (older indexes can be sparse),
|
|
311
|
+
// parse them out of the id itself.
|
|
312
|
+
const parseChunkId = (id) => {
|
|
313
|
+
if (typeof id !== 'string' || !id.includes(':')) return null;
|
|
314
|
+
const m = id.match(/^(.+):(\d+)-(\d+):(\d+)$/);
|
|
315
|
+
if (!m) return null;
|
|
316
|
+
return { file: m[1], startLine: parseInt(m[2], 10), endLine: parseInt(m[3], 10) };
|
|
317
|
+
};
|
|
303
318
|
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
319
|
+
for (const r of needsLineMatch) {
|
|
320
|
+
let filePath = r.file_path || r.file || r.metadata?.file || r.metadata?.path;
|
|
321
|
+
let lineStart = r.start_line || r.startLine
|
|
322
|
+
|| r.metadata?.line_start || r.metadata?.startLine || r.metadata?.start_line;
|
|
323
|
+
let lineEnd = r.end_line || r.endLine
|
|
324
|
+
|| r.metadata?.line_end || r.metadata?.endLine || r.metadata?.end_line;
|
|
325
|
+
if (!filePath || lineStart == null || lineEnd == null) {
|
|
326
|
+
const parsed = parseChunkId(r.id);
|
|
327
|
+
if (parsed) {
|
|
328
|
+
filePath = filePath || parsed.file;
|
|
329
|
+
lineStart = lineStart ?? parsed.startLine;
|
|
330
|
+
lineEnd = lineEnd ?? parsed.endLine;
|
|
309
331
|
}
|
|
310
332
|
}
|
|
333
|
+
if (!filePath || lineStart == null) continue;
|
|
334
|
+
// If we still don't have an end line, treat the chunk as a single line.
|
|
335
|
+
if (lineEnd == null) lineEnd = lineStart;
|
|
336
|
+
|
|
337
|
+
// Find the SMALLEST entity that overlaps the chunk's [start, end] —
|
|
338
|
+
// smaller entities (functions/methods) are more meaningful seeds than
|
|
339
|
+
// file-level container entities. Cap to one seed per result to avoid
|
|
340
|
+
// unbounded seed-set blow-up that can break the relationships SQL.
|
|
341
|
+
let bestId = null;
|
|
342
|
+
let bestSize = Infinity;
|
|
343
|
+
for (const e of entityLookup) {
|
|
344
|
+
if (e.file_path !== filePath) continue;
|
|
345
|
+
if (e.start_line == null || e.end_line == null) continue;
|
|
346
|
+
if (e.start_line > lineEnd || e.end_line < lineStart) continue;
|
|
347
|
+
const size = (e.end_line - e.start_line) + 1;
|
|
348
|
+
if (size < bestSize) { bestSize = size; bestId = e.id; }
|
|
349
|
+
}
|
|
350
|
+
if (bestId) seedIds.add(bestId);
|
|
311
351
|
}
|
|
312
352
|
|
|
313
353
|
return seedIds;
|
|
@@ -253,9 +253,34 @@ export const GENERIC_RELATIONSHIP_MAPPING = Object.freeze({
|
|
|
253
253
|
mixin: 'extends',
|
|
254
254
|
with: 'extends',
|
|
255
255
|
category: 'extends',
|
|
256
|
+
// TS: interface extends interface(s) is a true `extends` edge in
|
|
257
|
+
// the graph (separate pattern key because the registry regex needs
|
|
258
|
+
// to match on the `interface` keyword, not `class`).
|
|
259
|
+
interfaceExtends: 'extends',
|
|
256
260
|
implements: 'implements',
|
|
257
261
|
protocol: 'implements',
|
|
258
262
|
implFor: 'implements',
|
|
263
|
+
// TS: type-only imports/re-exports are still module-level
|
|
264
|
+
// dependencies, so they map to the same `imports` edge.
|
|
265
|
+
typeImport: 'imports',
|
|
266
|
+
typeReexport: 'imports',
|
|
267
|
+
// TS: `<T extends Foo>` is a type reference, not an inheritance
|
|
268
|
+
// edge — emit it as a `uses` relationship (consistent with how
|
|
269
|
+
// decorators and method-of references are handled).
|
|
270
|
+
genericConstraint: 'uses',
|
|
271
|
+
// FOLLOW-UP (documented, NOT implemented): per-line type references
|
|
272
|
+
// in function/method/property signatures (e.g. `function foo(x: User):
|
|
273
|
+
// Result` → `uses` edges to User and Result; `field: Token` → `uses`
|
|
274
|
+
// edge to Token). Intentionally not added at the regex layer — the
|
|
275
|
+
// false-positive surface (matching identifiers in comments, strings,
|
|
276
|
+
// and unrelated positions) is too high. Two prerequisites before
|
|
277
|
+
// shipping:
|
|
278
|
+
// 1. AST-level type-reference extractor (walk `type_annotation` /
|
|
279
|
+
// `parameter` / `return_type` nodes via tree-sitter, not regex)
|
|
280
|
+
// 2. Graph-density benchmark showing retrieval benefit without
|
|
281
|
+
// precision loss (the new `uses` edges should improve graph
|
|
282
|
+
// expansion recall without adding noise that hurts MRR).
|
|
283
|
+
// See May-2026 design discussion in chat history for details.
|
|
259
284
|
decorator: 'uses',
|
|
260
285
|
embed: 'uses',
|
|
261
286
|
extend: 'uses',
|
|
@@ -274,6 +299,9 @@ const escapeRegexLiteral = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&
|
|
|
274
299
|
// Built once at module scope so lookups do not allocate a Set per call.
const MULTI_TARGET_TYPES = new Set([
  'plainImport',
  'implements',
  'inherit',
  'protocol',
  'with',
  // TS: `interface Foo extends Bar, Baz<T>` — comma-separated
  // parents, generics handled by expandRelationshipTargets.
  'interfaceExtends',
]);
|
|
278
306
|
|
|
279
307
|
export const TREE_SITTER_ENTITY_PRIORITY = Object.freeze({
|
|
@@ -1328,7 +1356,8 @@ export class GraphExtractor {
|
|
|
1328
1356
|
return { targets: [source], filtered: false };
|
|
1329
1357
|
}
|
|
1330
1358
|
|
|
1331
|
-
if (isJsTs && (relType === 'require' || relType === 'reexport' || relType === 'dynamicImport'
|
|
1359
|
+
if (isJsTs && (relType === 'require' || relType === 'reexport' || relType === 'dynamicImport'
|
|
1360
|
+
|| relType === 'typeImport' || relType === 'typeReexport')) {
|
|
1332
1361
|
const source = match[1]?.trim();
|
|
1333
1362
|
if (!source) return { targets: [], filtered: false };
|
|
1334
1363
|
if (source.startsWith('.')) return { targets: [], filtered: true };
|