opencode-diane 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +180 -0
- package/LICENSE +21 -0
- package/README.md +206 -0
- package/WIKI.md +1430 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +1632 -0
- package/dist/ingest/adaptive.d.ts +47 -0
- package/dist/ingest/adaptive.js +182 -0
- package/dist/ingest/code-health.d.ts +58 -0
- package/dist/ingest/code-health.js +202 -0
- package/dist/ingest/code-map.d.ts +71 -0
- package/dist/ingest/code-map.js +670 -0
- package/dist/ingest/cross-refs.d.ts +59 -0
- package/dist/ingest/cross-refs.js +1207 -0
- package/dist/ingest/docs.d.ts +49 -0
- package/dist/ingest/docs.js +325 -0
- package/dist/ingest/git.d.ts +77 -0
- package/dist/ingest/git.js +390 -0
- package/dist/ingest/live-session.d.ts +101 -0
- package/dist/ingest/live-session.js +173 -0
- package/dist/ingest/project-notes.d.ts +28 -0
- package/dist/ingest/project-notes.js +102 -0
- package/dist/ingest/project.d.ts +35 -0
- package/dist/ingest/project.js +430 -0
- package/dist/ingest/session-snapshot.d.ts +63 -0
- package/dist/ingest/session-snapshot.js +94 -0
- package/dist/ingest/sessions.d.ts +29 -0
- package/dist/ingest/sessions.js +164 -0
- package/dist/ingest/tables.d.ts +52 -0
- package/dist/ingest/tables.js +360 -0
- package/dist/mining/skill-miner.d.ts +53 -0
- package/dist/mining/skill-miner.js +234 -0
- package/dist/search/bm25.d.ts +81 -0
- package/dist/search/bm25.js +334 -0
- package/dist/search/e5-embedder.d.ts +30 -0
- package/dist/search/e5-embedder.js +91 -0
- package/dist/search/embed-pass.d.ts +26 -0
- package/dist/search/embed-pass.js +43 -0
- package/dist/search/embedder.d.ts +58 -0
- package/dist/search/embedder.js +85 -0
- package/dist/search/inverted-index.d.ts +51 -0
- package/dist/search/inverted-index.js +139 -0
- package/dist/search/ppr.d.ts +44 -0
- package/dist/search/ppr.js +118 -0
- package/dist/search/tokenize.d.ts +26 -0
- package/dist/search/tokenize.js +98 -0
- package/dist/store/eviction.d.ts +16 -0
- package/dist/store/eviction.js +37 -0
- package/dist/store/repository.d.ts +222 -0
- package/dist/store/repository.js +420 -0
- package/dist/store/sqlite-store.d.ts +89 -0
- package/dist/store/sqlite-store.js +252 -0
- package/dist/store/vector-store.d.ts +66 -0
- package/dist/store/vector-store.js +160 -0
- package/dist/types.d.ts +385 -0
- package/dist/types.js +9 -0
- package/dist/utils/file-log.d.ts +87 -0
- package/dist/utils/file-log.js +215 -0
- package/dist/utils/peer-detection.d.ts +45 -0
- package/dist/utils/peer-detection.js +90 -0
- package/dist/utils/shell.d.ts +43 -0
- package/dist/utils/shell.js +110 -0
- package/dist/utils/usage-skill.d.ts +42 -0
- package/dist/utils/usage-skill.js +129 -0
- package/dist/utils/xlsx.d.ts +36 -0
- package/dist/utils/xlsx.js +270 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-css.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-html.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-json.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +80 -0
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 retrieval over the in-memory inverted index.
|
|
3
|
+
*
|
|
4
|
+
* Hierarchical filtering: callers can narrow candidates by category
|
|
5
|
+
* and/or subject before scoring. If neither filter is provided, all
|
|
6
|
+
* docs that contain any query term are considered.
|
|
7
|
+
*
|
|
8
|
+
* k1=1.2, b=0.75 — standard defaults that work well on short docs.
|
|
9
|
+
*/
|
|
10
|
+
import { personalizedPageRank } from "./ppr.js";
|
|
11
|
+
import { tokenize } from "./tokenize.js";
|
|
12
|
+
const K1 = 1.2;
|
|
13
|
+
const B = 0.75;
|
|
14
|
+
/**
|
|
15
|
+
* Fraction of a seed hit's BM25 score given to memories pulled in via
|
|
16
|
+
* the co-change graph. Deliberately small (< 1): a co-change-surfaced
|
|
17
|
+
* file always ranks below the strongest textual match, but above nothing.
|
|
18
|
+
*/
|
|
19
|
+
const COCHANGE_BOOST = 0.25;
|
|
20
|
+
/**
|
|
21
|
+
* Only the top-N textual hits act as seeds for co-change propagation.
|
|
22
|
+
* Keeps the boost focused on what the query actually matched and the
|
|
23
|
+
* extra work bounded.
|
|
24
|
+
*/
|
|
25
|
+
const SEED_LIMIT = 5;
|
|
26
|
+
/**
|
|
27
|
+
* Per neighbour file, pull in at most this many memories (ranked by
|
|
28
|
+
* useCount). Stops a hot file with hundreds of commit memories from
|
|
29
|
+
* flooding the result set.
|
|
30
|
+
*/
|
|
31
|
+
const PROPAGATE_PER_FILE = 3;
|
|
32
|
+
/**
 * Rough token estimate for a string, using the heuristic of about four
 * characters per token. The result is rounded up.
 *
 * @param {string} s - Text to measure.
 * @returns {number} Estimated token count (0 for the empty string).
 */
export function estimateTokens(s) {
    const CHARS_PER_TOKEN = 4;
    const charCount = s.length;
    return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
36
|
+
/**
 * Pack ranked hits into a token budget.
 *
 * Hits are taken in order until the next one would push the running
 * estimate over `budget`; packing stops at that point (later, smaller
 * hits are not considered). The first hit is always kept: an empty result
 * would be worse than a slightly-over one. When that first hit alone
 * exceeds the budget, its `memory.content` is truncated (with a trailing
 * "…") and it is the only hit returned.
 *
 * @param {Array<{score: number, memory: {content: string}}>} hits - Ranked
 *   hits, best first.
 * @param {number} budget - Token ceiling for the rendered output.
 * @param {(hit: object) => string} format - Renders one hit to the string
 *   that will actually be shown, so the estimate matches the real output.
 * @returns {{kept: object[], omitted: number}} The kept hits (the first
 *   possibly a content-truncated shallow clone) and the number dropped.
 */
export function packToTokenBudget(hits, budget, format) {
    if (hits.length === 0)
        return { kept: [], omitted: 0 };
    const kept = [];
    let used = 0;
    for (const h of hits) {
        // Render once and reuse the string for both the cost estimate and
        // the overhead calculation below.
        const rendered = format(h);
        const cost = estimateTokens(rendered) + 1; // +1 for the joining newline
        if (kept.length === 0 && cost > budget) {
            // Mandatory first hit overflows on its own — keep it, but trim
            // its content. A floor of 40 tokens / 20 content chars keeps the
            // hit useful even for a tiny budget.
            const room = Math.max(40, budget) * 4; // budget back in chars
            const content = h.memory.content;
            // Everything `format` adds around the content (assumes the
            // rendered string embeds the content verbatim).
            const overhead = rendered.length - content.length;
            const contentRoom = Math.max(20, room - overhead);
            let trimmedContent = content;
            if (content.length > contentRoom) {
                let cut = contentRoom - 1; // leave one char for the ellipsis
                // Never cut between the two halves of a surrogate pair: a
                // dangling high surrogate is not valid text.
                const lastUnit = content.charCodeAt(cut - 1);
                if (lastUnit >= 0xd800 && lastUnit <= 0xdbff)
                    cut -= 1;
                trimmedContent = content.slice(0, cut) + "…";
            }
            // Shallow clones, so the stored hit and Memory objects are
            // untouched and any additional hit fields are carried over.
            kept.push({ ...h, memory: { ...h.memory, content: trimmedContent } });
            break;
        }
        if (kept.length > 0 && used + cost > budget)
            break;
        kept.push(h);
        used += cost;
    }
    return { kept, omitted: hits.length - kept.length };
}
|
|
79
|
+
/**
 * BM25 search over the in-memory inverted index.
 *
 * Pipeline: tokenize the query, collect candidate ids from the postings
 * of every query token, narrow them by the optional category/subject
 * filters, score each candidate with BM25 plus a small popularity bias,
 * optionally lift co-change-related memories, apply the optional
 * `prefer` multiplier, then sort and cut to `limit`.
 *
 * @param {object} index - Inverted index; this function reads `postings`,
 *   `byCategory`, `bySubject`, `docs`, `docCount`, `avgDocLength()` and
 *   `coChange`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {object} opts - `query` (required), and optionally `limit`
 *   (default 10), `category`, `subject`, `personalizedPageRank`, `prefer`.
 * @returns {Array<{memory: object, score: number}>} Hits, best first.
 */
export function search(index, byId, opts) {
    const queryTokens = tokenize(opts.query);
    if (queryTokens.length === 0)
        return [];
    const limit = opts.limit ?? 10;
    // 1) Compute candidate set: union of postings for query terms,
    //    further intersected with category/subject filters if any.
    const candidates = new Set();
    for (const token of queryTokens) {
        const ids = index.postings.get(token);
        if (!ids)
            continue;
        for (const id of ids)
            candidates.add(id);
    }
    const categoryIds = opts.category ? index.byCategory.get(opts.category) : undefined;
    const subjectIds = opts.subject ? index.bySubject.get(opts.subject) : undefined;
    if (opts.category)
        intersectInPlace(candidates, categoryIds);
    if (opts.subject)
        intersectInPlace(candidates, subjectIds);
    if (candidates.size === 0)
        return [];
    // 2) Compute IDF for each unique query term.
    const N = index.docCount || 1;
    const idf = new Map();
    const uniqueQueryTokens = Array.from(new Set(queryTokens));
    for (const t of uniqueQueryTokens) {
        const df = index.postings.get(t)?.size ?? 0;
        // BM25+ style: log((N - df + 0.5) / (df + 0.5) + 1) — always positive.
        idf.set(t, Math.log((N - df + 0.5) / (df + 0.5) + 1));
    }
    // 3) Score each candidate.
    const avgdl = index.avgDocLength() || 1;
    const scoreById = new Map();
    for (const id of candidates) {
        const doc = index.docs.get(id);
        const mem = byId.get(id);
        if (!doc || !mem)
            continue;
        let score = 0;
        for (const t of uniqueQueryTokens) {
            const tf = doc.tf.get(t) ?? 0;
            if (tf === 0)
                continue;
            const wt = idf.get(t) ?? 0;
            const num = tf * (K1 + 1);
            const denom = tf + K1 * (1 - B + (B * doc.length) / avgdl);
            score += wt * (num / denom);
        }
        // Small popularity bias so that, all else equal, frequently-used
        // entries come first. (Only `useCount` feeds this; there is no
        // recency term.)
        score += Math.log1p(mem.useCount) * 0.05;
        if (score > 0)
            scoreById.set(id, score);
    }
    // 3b) Co-change graph boost. A well-scoring memory about file X pulls
    //     in memories about files X is historically modified together
    //     with, even when those files don't textually match the query.
    //
    //     Two strategies, selected by `opts.personalizedPageRank`:
    //       - default (off): ONE HOP — boost the direct co-change
    //         neighbours of the top textual hits.
    //       - on: PERSONALIZED PAGERANK — a restart-biased random walk
    //         over the whole co-change graph, reaching multi-hop files.
    //
    //     Either way only the top SEED_LIMIT textual hits seed it, and a
    //     memory that is pulled in without a textual score of its own
    //     ends up below the strongest textual match.
    if (scoreById.size > 0 && index.coChange.size > 0) {
        const seeds = Array.from(scoreById.entries())
            .sort((a, b) => b[1] - a[1])
            .slice(0, SEED_LIMIT);
        if (opts.personalizedPageRank) {
            applyPprBoost(index, byId, scoreById, seeds);
        }
        else {
            applyOneHopBoost(index, byId, scoreById, seeds);
        }
        // The boost looks memories up by file, not through the caller's
        // filters, so it can add ids from another category or subject.
        // Re-apply the filters so a filtered search only ever returns
        // memories inside the requested category/subject.
        if (opts.category || opts.subject) {
            for (const id of Array.from(scoreById.keys())) {
                const outsideCategory = opts.category && !categoryIds?.has(id);
                const outsideSubject = opts.subject && !subjectIds?.has(id);
                if (outsideCategory || outsideSubject)
                    scoreById.delete(id);
            }
        }
    }
    // 3c) Intent lean — query-dependent and agent-supplied. Applied last,
    //     as a mild multiplier, so it nudges ranking without ever
    //     filtering anything out.
    if (opts.prefer && opts.prefer !== "any") {
        for (const [id, s] of scoreById) {
            const mem = byId.get(id);
            if (!mem)
                continue;
            const m = preferMultiplier(opts.prefer, mem);
            if (m !== 1)
                scoreById.set(id, s * m);
        }
    }
    const hits = [];
    for (const [id, score] of scoreById) {
        const mem = byId.get(id);
        if (mem)
            hits.push({ memory: mem, score });
    }
    hits.sort((a, b) => b.score - a.score);
    return hits.slice(0, limit);
}
|
|
187
|
+
/**
|
|
188
|
+
* (Documents `subjectLooksLikeTest`, defined further below.) Whether a memory's subject path looks test-related. Deliberately
|
|
189
|
+
* minimal and language-neutral: it asks only whether the *word*
|
|
190
|
+
* "test"/"tests" appears as a token of the path. The plugin's own
|
|
191
|
+
* tokenizer extracts that word equally from a `test/` directory, a
|
|
192
|
+
* `_test.go` / `.test.ts` filename, or a `test_x.py` one — so this is
|
|
193
|
+
* one universal signal, not a per-ecosystem catalogue to overfit. It
|
|
194
|
+
* checks tokens, not substrings, so `latest`, `contest`, `testimony`
|
|
195
|
+
* are not mistaken for tests.
|
|
196
|
+
*/
|
|
197
|
+
/**
 * One-hop co-change boost (the default strategy).
 *
 * For every seed hit whose subject resolves to a file, look up that
 * file's direct co-change neighbours and lift the most-used memories
 * about each neighbour (at most `PROPAGATE_PER_FILE` per file) to
 * `seedScore × COCHANGE_BOOST`. A memory that already scores at least
 * that much is left alone — scores are only ever raised.
 *
 * @param {object} index - Provides `coChangeNeighbors(file)` and `bySubject`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {Map<string, number>} scoreById - Scores, updated in place.
 * @param {Array<[string, number]>} seeds - `[memoryId, score]` pairs.
 */
function applyOneHopBoost(index, byId, scoreById, seeds) {
    seeds.forEach(([seedId, seedScore]) => {
        const seedMemory = byId.get(seedId);
        if (!seedMemory)
            return;
        const originFile = fileOfSubject(seedMemory.subject);
        if (!originFile)
            return;
        const neighbourFiles = index.coChangeNeighbors(originFile);
        if (!neighbourFiles)
            return;
        // Same value for every neighbour of this seed.
        const lifted = seedScore * COCHANGE_BOOST;
        for (const neighbourFile of neighbourFiles) {
            const idsForFile = index.bySubject.get(neighbourFile);
            if (!idsForFile)
                continue;
            // Collect the memories that still exist, most-used first.
            const byUsage = [];
            for (const id of idsForFile) {
                const memory = byId.get(id);
                if (memory !== undefined)
                    byUsage.push(memory);
            }
            byUsage.sort((left, right) => right.useCount - left.useCount);
            for (const memory of byUsage.slice(0, PROPAGATE_PER_FILE)) {
                const existing = scoreById.get(memory.id) ?? 0;
                // Only lift; an existing higher score is never lowered.
                if (lifted > existing)
                    scoreById.set(memory.id, lifted);
            }
        }
    });
}
|
|
234
|
+
/**
 * Personalized PageRank co-change boost (opt-in).
 *
 * Builds a personalization vector from the seed hits (each seed's file,
 * weighted by the summed scores of the seeds on that file), runs
 * `personalizedPageRank` over `index.coChange`, and adds a boost to the
 * most-used memories (at most `PROPAGATE_PER_FILE` per file) of every
 * file in the result. The boost is proportional to the file's value
 * relative to the largest value returned, scaled so the top file
 * receives `topSeedScore × COCHANGE_BOOST`. Unlike the one-hop variant
 * the boost is additive, on top of any score the memory already has.
 *
 * @param {object} index - Provides `coChange` and `bySubject`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {Map<string, number>} scoreById - Scores, updated in place.
 * @param {Array<[string, number]>} seeds - `[memoryId, score]` pairs; the
 *   first entry's score is used as the top seed score.
 */
function applyPprBoost(index, byId, scoreById, seeds) {
    // Restart distribution: seed file -> accumulated seed score.
    const restart = new Map();
    seeds.forEach(([seedId, seedScore]) => {
        const seedFile = fileOfSubject(byId.get(seedId)?.subject ?? "");
        if (!seedFile)
            return;
        restart.set(seedFile, (restart.get(seedFile) ?? 0) + seedScore);
    });
    if (restart.size === 0)
        return;
    const walk = personalizedPageRank(index.coChange, restart);
    let peak = 0;
    for (const value of walk.values()) {
        if (value > peak)
            peak = value;
    }
    if (peak <= 0)
        return;
    const topSeedScore = seeds[0][1];
    walk.forEach((prob, file) => {
        // Normalise against the peak so the top file gets
        // topSeedScore × COCHANGE_BOOST and everything else less.
        const boost = (prob / peak) * topSeedScore * COCHANGE_BOOST;
        if (boost <= 1e-9)
            return; // negligible — not worth touching the score map
        const idsForFile = index.bySubject.get(file);
        if (!idsForFile)
            return;
        const byUsage = [];
        for (const id of idsForFile) {
            const memory = byId.get(id);
            if (memory !== undefined)
                byUsage.push(memory);
        }
        byUsage.sort((left, right) => right.useCount - left.useCount);
        for (const memory of byUsage.slice(0, PROPAGATE_PER_FILE)) {
            const existing = scoreById.get(memory.id) ?? 0;
            scoreById.set(memory.id, existing + boost);
        }
    });
}
|
|
283
|
+
/**
 * Whether a memory's subject looks test-related: true when `tokenize`
 * yields a token that is exactly "test" or "tests". Matching whole
 * tokens rather than substrings means a token such as "latest" does not
 * count.
 *
 * @param {string} subject - The memory's subject (typically a path).
 * @returns {boolean}
 */
function subjectLooksLikeTest(subject) {
    return [...tokenize(subject)].some((token) => token === "test" || token === "tests");
}
|
|
290
|
+
/**
 * Ranking multiplier for the caller's `prefer` hint. The factors are
 * gentle constants — a nudge, never a filter.
 *
 *   - "tests":   1.5 for test-looking subjects, otherwise 1.0
 *   - "history": 1.3 for the "git-history" category, otherwise 1.0
 *   - anything else (i.e. "code"): 1.25 for the "code-map" category,
 *     and additionally × 0.6 for test-looking subjects
 *
 * @param {string} prefer - The requested lean.
 * @param {{subject: string, category: string}} mem - Memory being scored.
 * @returns {number} Factor to multiply the memory's score by.
 */
function preferMultiplier(prefer, mem) {
    const testLike = subjectLooksLikeTest(mem.subject);
    switch (prefer) {
        case "tests":
            return testLike ? 1.5 : 1.0;
        case "history":
            return mem.category === "git-history" ? 1.3 : 1.0;
        default: {
            // "code": lean toward structural code, away from tests.
            const base = mem.category === "code-map" ? 1.25 : 1.0;
            return testLike ? base * 0.6 : base;
        }
    }
}
|
|
307
|
+
/**
 * Extract the file path a memory's subject is "about", or null.
 *
 *   - `co-change:<path>` and `churn:<path>` yield `<path>`
 *   - any other subject containing a colon (for example `tree:…` or
 *     `recency:…`) yields null
 *   - a subject without a colon is returned as-is, except that the empty
 *     string yields null
 *
 * @param {string} subject - A memory's subject.
 * @returns {string | null} The file path, or null for non-file subjects.
 */
function fileOfSubject(subject) {
    const FILE_PREFIXES = ["co-change:", "churn:"];
    for (const prefix of FILE_PREFIXES) {
        if (subject.startsWith(prefix))
            return subject.slice(prefix.length);
    }
    // Every remaining prefixed subject is a non-file subject.
    if (subject.includes(":"))
        return null;
    return subject === "" ? null : subject;
}
|
|
326
|
+
/**
 * Reduce `target` to its intersection with `other`, mutating `target`.
 * A missing `other` is treated as the empty set, so `target` is emptied.
 *
 * @param {Set<unknown>} target - Set to narrow in place.
 * @param {Set<unknown> | undefined} other - Set of ids allowed to remain.
 * @returns {void}
 */
function intersectInPlace(target, other) {
    if (other) {
        target.forEach((id) => {
            if (!other.has(id))
                target.delete(id);
        });
        return;
    }
    target.clear();
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e5-embedder.ts — the real multilingual embedder, backed by the
|
|
3
|
+
* `intfloat/e5` family via transformers.js (ONNX).
|
|
4
|
+
*
|
|
5
|
+
* This file is the ONLY place that touches the model runtime, and it
|
|
6
|
+
* is imported dynamically — and only when `enableSemanticSearch` is
|
|
7
|
+
* on. `@huggingface/transformers` is an OPTIONAL peer dependency: it is
|
|
8
|
+
* not installed by a default `bun install`, so a user who never
|
|
9
|
+
* enables semantic search never downloads it. The import specifier is
|
|
10
|
+
* deliberately indirected through a `string`-typed constant so the
|
|
11
|
+
* TypeScript build does not require the package to be present.
|
|
12
|
+
*
|
|
13
|
+
* `createE5Embedder` throws if the runtime or the model cannot be
|
|
14
|
+
* loaded (package missing, network blocked, etc.); the caller is
|
|
15
|
+
* expected to catch that, log it, and fall back to lexical-only
|
|
16
|
+
* search — enabling the flag must never break the plugin.
|
|
17
|
+
*
|
|
18
|
+
* e5 is asymmetric: a query and the passage it should match take
|
|
19
|
+
* different prefixes ("query: " / "passage: "). Getting that wrong
|
|
20
|
+
* quietly degrades retrieval, so it is baked into the two methods.
|
|
21
|
+
*/
|
|
22
|
+
import { DEFAULT_EMBEDDING_MODEL, type Embedder } from "./embedder.js";
|
|
23
|
+
export { DEFAULT_EMBEDDING_MODEL };
|
|
24
|
+
/**
|
|
25
|
+
* Build an e5 embedder. Loads the transformers.js runtime and the
|
|
26
|
+
* model (downloaded and cached on first use). Throws — with an
|
|
27
|
+
* actionable message — if the optional dependency is missing or the
|
|
28
|
+
* model cannot be fetched.
|
|
29
|
+
*/
|
|
30
|
+
export declare function createE5Embedder(modelId?: string): Promise<Embedder>;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e5-embedder.ts — the real multilingual embedder, backed by the
|
|
3
|
+
* `intfloat/e5` family via transformers.js (ONNX).
|
|
4
|
+
*
|
|
5
|
+
* This file is the ONLY place that touches the model runtime, and it
|
|
6
|
+
* is imported dynamically — and only when `enableSemanticSearch` is
|
|
7
|
+
* on. `@huggingface/transformers` is an OPTIONAL peer dependency: it is
|
|
8
|
+
* not installed by a default `bun install`, so a user who never
|
|
9
|
+
* enables semantic search never downloads it. The import specifier is
|
|
10
|
+
* deliberately indirected through a `string`-typed constant so the
|
|
11
|
+
* TypeScript build does not require the package to be present.
|
|
12
|
+
*
|
|
13
|
+
* `createE5Embedder` throws if the runtime or the model cannot be
|
|
14
|
+
* loaded (package missing, network blocked, etc.); the caller is
|
|
15
|
+
* expected to catch that, log it, and fall back to lexical-only
|
|
16
|
+
* search — enabling the flag must never break the plugin.
|
|
17
|
+
*
|
|
18
|
+
* e5 is asymmetric: a query and the passage it should match take
|
|
19
|
+
* different prefixes ("query: " / "passage: "). Getting that wrong
|
|
20
|
+
* quietly degrades retrieval, so it is baked into the two methods.
|
|
21
|
+
*/
|
|
22
|
+
import { DEFAULT_EMBEDDING_MODEL } from "./embedder.js";
|
|
23
|
+
export { DEFAULT_EMBEDDING_MODEL };
|
|
24
|
+
/** Passages longer than this are truncated before embedding (e5 caps at 512 tokens anyway). */
|
|
25
|
+
const MAX_CHARS = 2000;
|
|
26
|
+
/** Embed in batches of this many texts to bound peak memory. */
|
|
27
|
+
const BATCH = 32;
|
|
28
|
+
/**
 * Embedder that delegates to a feature-extraction callable and applies
 * the "query: " / "passage: " prefixes to its inputs.
 */
class E5Embedder {
    /** Model identifier this embedder was built for. */
    id;
    /** Callable invoked as `extract(texts, options)` to run the model. */
    extract;
    constructor(id, extract) {
        this.id = id;
        this.extract = extract;
    }
    /**
     * Embed a single search query (clipped, then prefixed with "query: ").
     * @param {string} text
     * @returns {Promise<Float32Array>}
     */
    async embedQuery(text) {
        const vectors = await this.run([`query: ${clip(text)}`]);
        return vectors[0];
    }
    /**
     * Embed passages in batches of `BATCH`, each text clipped and
     * prefixed with "passage: ".
     * @param {string[]} texts
     * @returns {Promise<Float32Array[]>} One vector per input, in order.
     */
    async embedPassages(texts) {
        const vectors = [];
        let start = 0;
        while (start < texts.length) {
            const prefixed = texts
                .slice(start, start + BATCH)
                .map((passage) => `passage: ${clip(passage)}`);
            const batchVectors = await this.run(prefixed);
            vectors.push(...batchVectors);
            // Yield between batches so a large embedding pass never
            // starves the event loop.
            await new Promise((resume) => setTimeout(resume, 0));
            start += BATCH;
        }
        return vectors;
    }
    /**
     * Run the model on a batch and split its flat output into one vector
     * per text. The vector width is the last entry of the result's `dims`.
     * @param {string[]} texts
     * @returns {Promise<Float32Array[]>}
     */
    async run(texts) {
        const tensor = await this.extract(texts, { pooling: "mean", normalize: true });
        const flat = tensor.data instanceof Float32Array
            ? tensor.data
            : Float32Array.from(tensor.data);
        const width = tensor.dims[tensor.dims.length - 1];
        // `slice` copies, so each vector owns its own storage.
        return Array.from({ length: texts.length }, (_, row) => flat.slice(row * width, (row + 1) * width));
    }
}
|
|
62
|
+
/**
 * Build an e5 embedder. Dynamically imports `@huggingface/transformers`
 * and creates a "feature-extraction" pipeline for `modelId`.
 *
 * @param {string} [modelId=DEFAULT_EMBEDDING_MODEL] - Model to load.
 * @returns {Promise<E5Embedder>} The ready embedder.
 * @throws {Error} With an actionable message if the optional dependency
 *   cannot be imported, or if the pipeline cannot be created. In both
 *   cases the underlying failure is attached as `error.cause`, so a
 *   caller that logs the error can still see why the import or the model
 *   load actually failed.
 */
export async function createE5Embedder(modelId = DEFAULT_EMBEDDING_MODEL) {
    // Indirected through a constant rather than a literal specifier: the
    // TS build does not try to resolve the optional dependency at
    // compile time.
    const spec = "@huggingface/transformers";
    let mod;
    try {
        mod = (await import(spec));
    }
    catch (e) {
        // The import can fail for reasons other than "not installed";
        // keep the original error reachable instead of discarding it.
        throw new Error("semantic search needs the optional dependency '@huggingface/transformers' — " +
            "install it with `bun add @huggingface/transformers` (or `npm install @huggingface/transformers`)", { cause: e });
    }
    let extract;
    try {
        extract = await mod.pipeline("feature-extraction", modelId);
    }
    catch (e) {
        throw new Error(`could not load embedding model '${modelId}': ${e instanceof Error ? e.message : String(e)}`, { cause: e });
    }
    return new E5Embedder(modelId, extract);
}
|
|
89
|
+
/**
 * Cap a text at `MAX_CHARS` UTF-16 code units; shorter texts are
 * returned unchanged.
 *
 * @param {string} text
 * @returns {string}
 */
function clip(text) {
    if (text.length <= MAX_CHARS)
        return text;
    return text.slice(0, MAX_CHARS);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embed-pass.ts — populate the vector store for memories that don't
|
|
3
|
+
* yet have an embedding.
|
|
4
|
+
*
|
|
5
|
+
* Runs only when semantic search is enabled, in the background, after
|
|
6
|
+
* prefill. It is incremental and crash-safe: vectors are persisted in
|
|
7
|
+
* chunks, so a memory embedded once is never re-embedded across
|
|
8
|
+
* sessions, and an interrupted pass simply resumes. A recall issued
|
|
9
|
+
* before the pass finishes still works — it just fuses against the
|
|
10
|
+
* vectors that exist so far (gracefully degrading toward pure lexical
|
|
11
|
+
* search when few are ready).
|
|
12
|
+
*/
|
|
13
|
+
import type { Embedder } from "./embedder.js";
|
|
14
|
+
import type { MemoryRepository } from "../store/repository.js";
|
|
15
|
+
import type { VectorStore } from "../store/vector-store.js";
|
|
16
|
+
/**
|
|
17
|
+
* Embed every memory that lacks a vector, and drop vectors for
|
|
18
|
+
* memories that no longer exist (evicted or replaced). Returns the
|
|
19
|
+
* counts. Never throws into the caller — an embedding failure is
|
|
20
|
+
* reported via `log` and ends the pass cleanly, leaving lexical
|
|
21
|
+
* search fully functional.
|
|
22
|
+
*/
|
|
23
|
+
export declare function embedMissingMemories(repo: MemoryRepository, vectorStore: VectorStore, embedder: Embedder, log?: (msg: string) => void): Promise<{
|
|
24
|
+
embedded: number;
|
|
25
|
+
pruned: number;
|
|
26
|
+
}>;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embed-pass.ts — populate the vector store for memories that don't
|
|
3
|
+
* yet have an embedding.
|
|
4
|
+
*
|
|
5
|
+
* Runs only when semantic search is enabled, in the background, after
|
|
6
|
+
* prefill. It is incremental and crash-safe: vectors are persisted in
|
|
7
|
+
* chunks, so a memory embedded once is never re-embedded across
|
|
8
|
+
* sessions, and an interrupted pass simply resumes. A recall issued
|
|
9
|
+
* before the pass finishes still works — it just fuses against the
|
|
10
|
+
* vectors that exist so far (gracefully degrading toward pure lexical
|
|
11
|
+
* search when few are ready).
|
|
12
|
+
*/
|
|
13
|
+
/** Memories are embedded (and persisted) this many at a time. */
const CHUNK = 64;
/**
 * Embed every memory that lacks a vector, and drop vectors for memories
 * that no longer exist.
 *
 * Work proceeds in chunks of `CHUNK`: each chunk is embedded and handed
 * to `vectorStore.putMany` before the next one starts, so progress made
 * before a failure is kept. The text embedded for a memory is its
 * subject and content joined by a newline.
 *
 * Never throws into the caller: any failure — listing memories, pruning,
 * embedding, or storing — is reported via `log` and ends the pass with
 * the counts reached so far. An embedder that returns a different number
 * of vectors than it was given passages is treated as a failure, so a
 * missing vector is never stored.
 *
 * @param {object} repo - Provides `allMemories()`.
 * @param {object} vectorStore - Provides `prune(ids)`, `has(id)` and
 *   `putMany(items)`.
 * @param {object} embedder - Provides `embedPassages(texts)`.
 * @param {(msg: string) => void} [log] - Optional sink for the failure note.
 * @returns {Promise<{embedded: number, pruned: number}>} How many memories
 *   were embedded and how many stale vectors were pruned.
 */
export async function embedMissingMemories(repo, vectorStore, embedder, log) {
    let embedded = 0;
    let pruned = 0;
    let total = 0;
    try {
        const memories = repo.allMemories();
        const validIds = new Set(memories.map((m) => m.id));
        pruned = vectorStore.prune(validIds);
        const todo = memories.filter((m) => !vectorStore.has(m.id));
        total = todo.length;
        for (let i = 0; i < todo.length; i += CHUNK) {
            const chunk = todo.slice(i, i + CHUNK);
            const texts = chunk.map((m) => `${m.subject}\n${m.content}`);
            const vecs = await embedder.embedPassages(texts);
            // Pairing below is positional; a short result would persist
            // `undefined` as a vector, so stop instead.
            if (vecs?.length !== chunk.length) {
                throw new Error(`embedder returned ${vecs?.length ?? 0} vectors for ${chunk.length} passages`);
            }
            vectorStore.putMany(chunk.map((m, j) => ({ id: m.id, vec: vecs[j] })));
            embedded += chunk.length;
        }
    }
    catch (e) {
        log?.(`semantic: embedding pass stopped after ${embedded}/${total} — ` +
            `${e instanceof Error ? e.message : String(e)} (lexical search unaffected)`);
    }
    return { embedded, pruned };
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embedder.ts — the small, dependency-free core of optional semantic
|
|
3
|
+
* search: the `Embedder` contract, vector math, and reciprocal-rank
|
|
4
|
+
* fusion.
|
|
5
|
+
*
|
|
6
|
+
* Nothing here imports a model runtime. The real e5 implementation
|
|
7
|
+
* lives in `e5-embedder.ts` and is only loaded when semantic search is
|
|
8
|
+
* switched on; tests substitute a deterministic stub. Keeping the
|
|
9
|
+
* contract and the math here means the fusion/retrieval logic is
|
|
10
|
+
* unit-testable without downloading a 100 MB model.
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Turns text into a vector. e5 expects asymmetric prefixes — a query
|
|
14
|
+
* and the passage it should match are embedded differently — so the
|
|
15
|
+
* contract has two methods rather than one.
|
|
16
|
+
*/
|
|
17
|
+
export interface Embedder {
|
|
18
|
+
/** Stable model identifier, e.g. "Xenova/multilingual-e5-small". */
|
|
19
|
+
readonly id: string;
|
|
20
|
+
/** Embed a search query. */
|
|
21
|
+
embedQuery(text: string): Promise<Float32Array>;
|
|
22
|
+
/** Embed a batch of documents/passages. */
|
|
23
|
+
embedPassages(texts: string[]): Promise<Float32Array[]>;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Default embedding model — small (~120 MB quantized), ~384-dim, and
|
|
27
|
+
* trained on 100+ languages, which is what makes cross-lingual recall
|
|
28
|
+
* (a query in one language, code/comments in another) work.
|
|
29
|
+
*/
|
|
30
|
+
export declare const DEFAULT_EMBEDDING_MODEL = "Xenova/multilingual-e5-small";
|
|
31
|
+
/** L2-normalise a vector in place and return it. A zero vector is left as-is. */
|
|
32
|
+
export declare function normalize(v: Float32Array): Float32Array;
|
|
33
|
+
/**
|
|
34
|
+
* Cosine similarity of two equal-length vectors, in [-1, 1]. Returns 0
|
|
35
|
+
* on a length mismatch or a zero vector rather than NaN — a recall
|
|
36
|
+
* must never crash on a malformed vector.
|
|
37
|
+
*/
|
|
38
|
+
export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
|
|
39
|
+
/** Dot product — equals cosine similarity when both vectors are L2-normalised. */
|
|
40
|
+
export declare function dot(a: Float32Array, b: Float32Array): number;
|
|
41
|
+
/** An id with a fused score, highest first. */
|
|
42
|
+
export interface FusedItem {
|
|
43
|
+
id: string;
|
|
44
|
+
score: number;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Reciprocal-rank fusion of several ranked id-lists into one ranking.
|
|
48
|
+
*
|
|
49
|
+
* score(id) = Σ_lists 1 / (k + rank_in_list) rank is 1-based
|
|
50
|
+
*
|
|
51
|
+
* RRF is the standard way to merge a lexical (BM25) ranking with a
|
|
52
|
+
* semantic (vector) ranking: it needs no score calibration between the
|
|
53
|
+
* two — only the *positions* — and `k` (60 by convention) damps the
|
|
54
|
+
* influence of low ranks. An id absent from a list simply contributes
|
|
55
|
+
* nothing for that list. Deterministic; ties broken by first
|
|
56
|
+
* appearance so the result is stable.
|
|
57
|
+
*/
|
|
58
|
+
export declare function reciprocalRankFusion(lists: string[][], k?: number): FusedItem[];
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embedder.ts — the small, dependency-free core of optional semantic
|
|
3
|
+
* search: the `Embedder` contract, vector math, and reciprocal-rank
|
|
4
|
+
* fusion.
|
|
5
|
+
*
|
|
6
|
+
* Nothing here imports a model runtime. The real e5 implementation
|
|
7
|
+
* lives in `e5-embedder.ts` and is only loaded when semantic search is
|
|
8
|
+
* switched on; tests substitute a deterministic stub. Keeping the
|
|
9
|
+
* contract and the math here means the fusion/retrieval logic is
|
|
10
|
+
* unit-testable without downloading a 100 MB model.
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Default embedding model — small (~120 MB quantized), ~384-dim, and
|
|
14
|
+
* trained on 100+ languages, which is what makes cross-lingual recall
|
|
15
|
+
* (a query in one language, code/comments in another) work.
|
|
16
|
+
*/
|
|
17
|
+
export const DEFAULT_EMBEDDING_MODEL = "Xenova/multilingual-e5-small";
|
|
18
|
+
/**
 * Scale a vector to unit L2 length, in place.
 *
 * @param {Float32Array} v - Vector to normalise; mutated.
 * @returns {Float32Array} The same `v`. A vector whose norm is not
 *   positive (for example all zeros) is returned unchanged.
 */
export function normalize(v) {
    const size = v.length;
    let squares = 0;
    for (let k = 0; k < size; k++) {
        squares += v[k] * v[k];
    }
    const magnitude = Math.sqrt(squares);
    if (!(magnitude > 0))
        return v;
    for (let k = 0; k < size; k++) {
        v[k] = v[k] / magnitude;
    }
    return v;
}
|
|
30
|
+
/**
 * Cosine similarity of two vectors.
 *
 * @param {Float32Array} a
 * @param {Float32Array} b
 * @returns {number} `a·b / (|a| |b|)`, or 0 when the lengths differ, the
 *   vectors are empty, or the product of the norms is not positive —
 *   so a malformed vector yields 0 rather than NaN.
 */
export function cosineSimilarity(a, b) {
    const size = a.length;
    if (size === 0 || size !== b.length)
        return 0;
    let cross = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    for (let k = 0; k < size; k++) {
        const x = a[k];
        const y = b[k];
        cross += x * y;
        sumSqA += x * x;
        sumSqB += y * y;
    }
    const scale = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
    if (scale > 0)
        return cross / scale;
    return 0;
}
|
|
49
|
+
/**
 * Dot product of two vectors. For L2-normalised inputs this equals
 * their cosine similarity.
 *
 * @param {Float32Array} a
 * @param {Float32Array} b
 * @returns {number} Σ a[i]·b[i], or 0 when the lengths differ.
 */
export function dot(a, b) {
    if (a.length !== b.length)
        return 0;
    return a.reduce((total, value, position) => total + value * b[position], 0);
}
|
|
58
|
+
/**
 * Reciprocal-rank fusion of several ranked id-lists into one ranking.
 *
 *   score(id) = Σ over lists of 1 / (k + rank)      (rank is 1-based)
 *
 * Only positions matter, so rankings whose raw scores are not comparable
 * can be merged without calibration. An id missing from a list simply
 * gains nothing from that list. Ties are broken by the order in which
 * ids were first encountered, which keeps the output deterministic.
 *
 * @param {string[][]} lists - Ranked id lists, best first.
 * @param {number} [k=60] - Damping constant; larger values flatten the
 *   difference between ranks.
 * @returns {Array<{id: string, score: number}>} Fused ranking, best first.
 */
export function reciprocalRankFusion(lists, k = 60) {
    const totals = new Map();
    // id -> ordinal of its first appearance across all lists.
    const arrival = new Map();
    for (const ranking of lists) {
        ranking.forEach((id, position) => {
            const previous = totals.get(id) ?? 0;
            totals.set(id, previous + 1 / (k + position + 1));
            if (!arrival.has(id))
                arrival.set(id, arrival.size);
        });
    }
    const fused = [];
    for (const [id, score] of totals) {
        fused.push({ id, score });
    }
    fused.sort((left, right) => {
        const byScore = right.score - left.score;
        if (byScore)
            return byScore;
        return arrival.get(left.id) - arrival.get(right.id);
    });
    return fused;
}
|