opencode-diane 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +180 -0
  2. package/LICENSE +21 -0
  3. package/README.md +206 -0
  4. package/WIKI.md +1430 -0
  5. package/dist/index.d.ts +28 -0
  6. package/dist/index.js +1632 -0
  7. package/dist/ingest/adaptive.d.ts +47 -0
  8. package/dist/ingest/adaptive.js +182 -0
  9. package/dist/ingest/code-health.d.ts +58 -0
  10. package/dist/ingest/code-health.js +202 -0
  11. package/dist/ingest/code-map.d.ts +71 -0
  12. package/dist/ingest/code-map.js +670 -0
  13. package/dist/ingest/cross-refs.d.ts +59 -0
  14. package/dist/ingest/cross-refs.js +1207 -0
  15. package/dist/ingest/docs.d.ts +49 -0
  16. package/dist/ingest/docs.js +325 -0
  17. package/dist/ingest/git.d.ts +77 -0
  18. package/dist/ingest/git.js +390 -0
  19. package/dist/ingest/live-session.d.ts +101 -0
  20. package/dist/ingest/live-session.js +173 -0
  21. package/dist/ingest/project-notes.d.ts +28 -0
  22. package/dist/ingest/project-notes.js +102 -0
  23. package/dist/ingest/project.d.ts +35 -0
  24. package/dist/ingest/project.js +430 -0
  25. package/dist/ingest/session-snapshot.d.ts +63 -0
  26. package/dist/ingest/session-snapshot.js +94 -0
  27. package/dist/ingest/sessions.d.ts +29 -0
  28. package/dist/ingest/sessions.js +164 -0
  29. package/dist/ingest/tables.d.ts +52 -0
  30. package/dist/ingest/tables.js +360 -0
  31. package/dist/mining/skill-miner.d.ts +53 -0
  32. package/dist/mining/skill-miner.js +234 -0
  33. package/dist/search/bm25.d.ts +81 -0
  34. package/dist/search/bm25.js +334 -0
  35. package/dist/search/e5-embedder.d.ts +30 -0
  36. package/dist/search/e5-embedder.js +91 -0
  37. package/dist/search/embed-pass.d.ts +26 -0
  38. package/dist/search/embed-pass.js +43 -0
  39. package/dist/search/embedder.d.ts +58 -0
  40. package/dist/search/embedder.js +85 -0
  41. package/dist/search/inverted-index.d.ts +51 -0
  42. package/dist/search/inverted-index.js +139 -0
  43. package/dist/search/ppr.d.ts +44 -0
  44. package/dist/search/ppr.js +118 -0
  45. package/dist/search/tokenize.d.ts +26 -0
  46. package/dist/search/tokenize.js +98 -0
  47. package/dist/store/eviction.d.ts +16 -0
  48. package/dist/store/eviction.js +37 -0
  49. package/dist/store/repository.d.ts +222 -0
  50. package/dist/store/repository.js +420 -0
  51. package/dist/store/sqlite-store.d.ts +89 -0
  52. package/dist/store/sqlite-store.js +252 -0
  53. package/dist/store/vector-store.d.ts +66 -0
  54. package/dist/store/vector-store.js +160 -0
  55. package/dist/types.d.ts +385 -0
  56. package/dist/types.js +9 -0
  57. package/dist/utils/file-log.d.ts +87 -0
  58. package/dist/utils/file-log.js +215 -0
  59. package/dist/utils/peer-detection.d.ts +45 -0
  60. package/dist/utils/peer-detection.js +90 -0
  61. package/dist/utils/shell.d.ts +43 -0
  62. package/dist/utils/shell.js +110 -0
  63. package/dist/utils/usage-skill.d.ts +42 -0
  64. package/dist/utils/usage-skill.js +129 -0
  65. package/dist/utils/xlsx.d.ts +36 -0
  66. package/dist/utils/xlsx.js +270 -0
  67. package/grammars/tree-sitter-c.wasm +0 -0
  68. package/grammars/tree-sitter-c_sharp.wasm +0 -0
  69. package/grammars/tree-sitter-cpp.wasm +0 -0
  70. package/grammars/tree-sitter-css.wasm +0 -0
  71. package/grammars/tree-sitter-go.wasm +0 -0
  72. package/grammars/tree-sitter-html.wasm +0 -0
  73. package/grammars/tree-sitter-java.wasm +0 -0
  74. package/grammars/tree-sitter-javascript.wasm +0 -0
  75. package/grammars/tree-sitter-json.wasm +0 -0
  76. package/grammars/tree-sitter-php.wasm +0 -0
  77. package/grammars/tree-sitter-python.wasm +0 -0
  78. package/grammars/tree-sitter-rust.wasm +0 -0
  79. package/grammars/tree-sitter-typescript.wasm +0 -0
  80. package/package.json +80 -0
@@ -0,0 +1,334 @@
1
+ /**
2
+ * BM25 retrieval over the in-memory inverted index.
3
+ *
4
+ * Hierarchical filtering: callers can narrow candidates by category
5
+ * and/or subject before scoring. If neither filter is provided, all
6
+ * docs that contain any query term are considered.
7
+ *
8
+ * k1=1.2, b=0.75 — standard defaults that work well on short docs.
9
+ */
10
+ import { personalizedPageRank } from "./ppr.js";
11
+ import { tokenize } from "./tokenize.js";
12
+ const K1 = 1.2;
13
+ const B = 0.75;
14
+ /**
15
+ * Fraction of a seed hit's BM25 score given to memories pulled in via
16
+ * the co-change graph. Deliberately small (< 1): a co-change-surfaced
17
+ * file always ranks below a direct textual match, but above nothing.
18
+ */
19
+ const COCHANGE_BOOST = 0.25;
20
+ /**
21
+ * Only the top-N textual hits act as seeds for co-change propagation.
22
+ * Keeps the boost focused on what the query actually matched and the
23
+ * extra work bounded.
24
+ */
25
+ const SEED_LIMIT = 5;
26
+ /**
27
+ * Per neighbour file, pull in at most this many memories (ranked by
28
+ * useCount). Stops a hot file with hundreds of commit memories from
29
+ * flooding the result set.
30
+ */
31
+ const PROPAGATE_PER_FILE = 3;
32
/**
 * Rough token estimate for a string: one token per ~4 characters,
 * rounded up.
 *
 * @param {string} s - Text to measure.
 * @returns {number} Estimated token count (0 for the empty string).
 */
export function estimateTokens(s) {
    const CHARS_PER_TOKEN = 4;
    const charCount = s.length;
    return Math.ceil(charCount / CHARS_PER_TOKEN);
}
36
/**
 * Shorten `content` to at most `maxChars` UTF-16 code units, ending in
 * an ellipsis, without leaving half of a surrogate pair at the cut.
 *
 * @param {string} content - Text to shorten.
 * @param {number} maxChars - Maximum length of the result (callers pass >= 20).
 * @returns {string} `content` itself when it already fits, otherwise the cut text plus "…".
 */
function truncateContent(content, maxChars) {
    if (content.length <= maxChars)
        return content;
    let end = maxChars - 1; // leave one unit for the ellipsis
    const last = content.charCodeAt(end - 1);
    // A trailing high surrogate means the cut landed inside an astral
    // character (emoji, rare CJK, ...); drop it rather than emit a lone
    // surrogate that renders as U+FFFD.
    if (last >= 0xd800 && last <= 0xdbff)
        end -= 1;
    return `${content.slice(0, end)}…`;
}
/**
 * Pack ranked hits into a token budget.
 *
 * `format` renders one hit to the string that will actually be shown,
 * so the estimate is taken on the real output. Each kept hit costs its
 * estimated tokens plus one for the joining newline. Hits are taken in
 * order until the next one would exceed `budget`.
 *
 * The first hit is always kept, so a non-empty input never produces an
 * empty result. If that first hit alone exceeds the budget, a shallow
 * clone of it is kept with `memory.content` truncated (the original
 * hit and memory objects are not modified) and packing stops there.
 *
 * @param {Array<{score: number, memory: {content: string}}>} hits - Ranked hits, best first.
 * @param {number} budget - Token budget for the rendered result.
 * @param {(hit: object) => string} format - Renders a hit to its output string.
 * @returns {{kept: Array<object>, omitted: number}} Kept hits and the number dropped.
 */
export function packToTokenBudget(hits, budget, format) {
    if (hits.length === 0)
        return { kept: [], omitted: 0 };
    const kept = [];
    let used = 0;
    for (const h of hits) {
        // Render once; both the cost and the overhead below derive from it.
        const rendered = format(h);
        const cost = estimateTokens(rendered) + 1; // +1 for the joining newline
        if (kept.length === 0 && cost > budget) {
            // Mandatory first hit overflows on its own: keep it, but trim
            // its content. Budgets below 40 tokens are treated as 40 and
            // the content always keeps at least 20 characters.
            const room = Math.max(40, budget) * 4; // budget back in chars
            const overhead = rendered.length - h.memory.content.length;
            const contentRoom = Math.max(20, room - overhead);
            kept.push({
                score: h.score,
                memory: {
                    ...h.memory,
                    content: truncateContent(h.memory.content, contentRoom),
                },
            });
            break;
        }
        if (kept.length > 0 && used + cost > budget)
            break;
        kept.push(h);
        used += cost;
    }
    return { kept, omitted: hits.length - kept.length };
}
79
/**
 * Rank memories against a free-text query using BM25 over the
 * inverted index, then apply the co-change boost and the optional
 * `prefer` lean.
 *
 * @param {object} index - Inverted index; this function reads `postings`,
 *   `byCategory`, `bySubject`, `docs`, `docCount`, `avgDocLength()` and `coChange`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {object} opts - `query` (required), plus optional `limit` (default 10),
 *   `category`, `subject`, `personalizedPageRank` and `prefer`.
 * @returns {Array<{memory: object, score: number}>} Hits sorted by descending score,
 *   at most `limit` of them; empty when the query has no tokens or nothing matches.
 */
export function search(index, byId, opts) {
    const queryTokens = tokenize(opts.query);
    if (queryTokens.length === 0) {
        return [];
    }
    const maxHits = opts.limit ?? 10;

    // 1) Candidates: every doc containing at least one query term,
    //    narrowed by the category / subject filters when given.
    const candidates = new Set();
    for (const token of queryTokens) {
        for (const id of index.postings.get(token) || []) {
            candidates.add(id);
        }
    }
    if (opts.category) {
        intersectInPlace(candidates, index.byCategory.get(opts.category));
    }
    if (opts.subject) {
        intersectInPlace(candidates, index.bySubject.get(opts.subject));
    }
    if (candidates.size === 0) {
        return [];
    }

    // 2) One [term, idf] pair per distinct query term. The "+ 1" inside
    //    the log keeps the weight positive even for very common terms.
    const docTotal = index.docCount || 1;
    const weightedTerms = Array.from(new Set(queryTokens), (term) => {
        const df = index.postings.get(term)?.size ?? 0;
        return [term, Math.log((docTotal - df + 0.5) / (df + 0.5) + 1)];
    });

    // 3) BM25 score per candidate.
    const avgdl = index.avgDocLength() || 1;
    const scoreById = new Map();
    for (const id of candidates) {
        const doc = index.docs.get(id);
        const mem = byId.get(id);
        if (!doc || !mem) {
            continue;
        }
        // Length normalisation is the same for every term of this doc.
        const lengthNorm = K1 * (1 - B + (B * doc.length) / avgdl);
        let score = 0;
        for (const [term, weight] of weightedTerms) {
            const tf = doc.tf.get(term) ?? 0;
            if (tf !== 0) {
                score += weight * ((tf * (K1 + 1)) / (tf + lengthNorm));
            }
        }
        // Small popularity bias: all else equal, frequently-used
        // entries come first.
        score += Math.log1p(mem.useCount) * 0.05;
        if (score > 0) {
            scoreById.set(id, score);
        }
    }

    // 3b) Co-change boost, seeded by the top SEED_LIMIT textual hits.
    //     `opts.personalizedPageRank` picks the whole-graph walk;
    //     otherwise only direct neighbours of the seeds are lifted.
    if (scoreById.size > 0 && index.coChange.size > 0) {
        const seeds = [...scoreById]
            .sort((left, right) => right[1] - left[1])
            .slice(0, SEED_LIMIT);
        const boost = opts.personalizedPageRank ? applyPprBoost : applyOneHopBoost;
        boost(index, byId, scoreById, seeds);
    }

    // 3c) Intent lean: a multiplier applied last, so it reorders but
    //     never removes a hit.
    if (opts.prefer && opts.prefer !== "any") {
        for (const [id, current] of scoreById) {
            const mem = byId.get(id);
            if (!mem) {
                continue;
            }
            const factor = preferMultiplier(opts.prefer, mem);
            if (factor !== 1) {
                scoreById.set(id, current * factor);
            }
        }
    }

    const hits = [];
    for (const [id, score] of scoreById) {
        const memory = byId.get(id);
        if (memory) {
            hits.push({ memory, score });
        }
    }
    hits.sort((x, y) => y.score - x.score);
    return hits.slice(0, maxHits);
}
187
+ /**
188
+ * Whether a memory's subject path looks test-related. Deliberately
189
+ * minimal and language-neutral: it asks only whether the *word*
190
+ * "test"/"tests" appears as a token of the path. The plugin's own
191
+ * tokenizer extracts that word equally from a `test/` directory, a
192
+ * `_test.go` / `.test.ts` filename, or a `test_x.py` one — so this is
193
+ * one universal signal, not a per-ecosystem catalogue to overfit. It
194
+ * checks tokens, not substrings, so `latest`, `contest`, `testimony`
195
+ * are not mistaken for tests.
196
+ */
197
/**
 * One-hop co-change boost (the default strategy).
 *
 * For each seed hit, looks up the files that co-change with the seed's
 * file and lifts the most-used memories about each of them (at most
 * `PROPAGATE_PER_FILE` per file) to `seedScore × COCHANGE_BOOST`.
 * A memory's existing score is never lowered: the lift only applies
 * when it is higher than what the memory already has.
 *
 * @param {object} index - Provides `coChangeNeighbors(file)` and `bySubject`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {Map<string, number>} scoreById - Scores so far; updated in place.
 * @param {Array<[string, number]>} seeds - `[memoryId, score]` pairs of the top textual hits.
 * @returns {void}
 */
function applyOneHopBoost(index, byId, scoreById, seeds) {
    const byUseCountDesc = (left, right) => right.useCount - left.useCount;
    for (const [seedId, seedScore] of seeds) {
        const seedMemory = byId.get(seedId);
        const seedFile = seedMemory ? fileOfSubject(seedMemory.subject) : null;
        const neighbors = seedFile ? index.coChangeNeighbors(seedFile) : null;
        if (!neighbors) {
            continue;
        }
        // The lifted score depends only on the seed, not on the neighbour.
        const lifted = seedScore * COCHANGE_BOOST;
        for (const neighborFile of neighbors) {
            const ids = index.bySubject.get(neighborFile);
            if (!ids) {
                continue;
            }
            // Resolve ids to memories, dropping ids with no memory behind them.
            const aboutNeighbor = [];
            for (const id of ids) {
                const memory = byId.get(id);
                if (memory !== undefined) {
                    aboutNeighbor.push(memory);
                }
            }
            aboutNeighbor.sort(byUseCountDesc);
            for (const memory of aboutNeighbor.slice(0, PROPAGATE_PER_FILE)) {
                const existing = scoreById.get(memory.id) ?? 0;
                if (lifted > existing) {
                    scoreById.set(memory.id, lifted);
                }
            }
        }
    }
}
234
/**
 * Personalized-PageRank co-change boost (opt-in strategy).
 *
 * Builds a personalization vector from the seed hits (each seed's file,
 * weighted by the sum of the seed scores on that file), hands it with
 * the co-change graph to `personalizedPageRank`, and adds a boost to
 * the most-used memories (at most `PROPAGATE_PER_FILE` per file) of
 * every file in the result. The boost is proportional to the file's
 * returned value, scaled so the highest-valued file receives
 * `topSeedScore × COCHANGE_BOOST`. Unlike the one-hop strategy, the
 * boost is added on top of any score a memory already has.
 *
 * @param {object} index - Provides `coChange` and `bySubject`.
 * @param {Map<string, object>} byId - Memory lookup by id.
 * @param {Map<string, number>} scoreById - Scores so far; updated in place.
 * @param {Array<[string, number]>} seeds - Non-empty `[memoryId, score]` pairs, best first.
 * @returns {void}
 */
function applyPprBoost(index, byId, scoreById, seeds) {
    // Seed files weighted by accumulated BM25 score.
    const personalization = new Map();
    for (const [seedId, seedScore] of seeds) {
        const seedFile = fileOfSubject(byId.get(seedId)?.subject ?? "");
        if (!seedFile) {
            continue;
        }
        const soFar = personalization.get(seedFile) ?? 0;
        personalization.set(seedFile, soFar + seedScore);
    }
    if (personalization.size === 0) {
        return;
    }

    const ppr = personalizedPageRank(index.coChange, personalization);
    let peak = 0;
    for (const value of ppr.values()) {
        peak = value > peak ? value : peak;
    }
    if (peak <= 0) {
        return;
    }

    const [, topSeedScore] = seeds[0];
    const byUseCountDesc = (left, right) => right.useCount - left.useCount;
    for (const [file, prob] of ppr) {
        const boost = (prob / peak) * topSeedScore * COCHANGE_BOOST;
        if (boost <= 1e-9) {
            continue; // negligible lift, not worth touching the scores
        }
        const ids = index.bySubject.get(file);
        if (!ids) {
            continue;
        }
        const aboutFile = [];
        for (const id of ids) {
            const memory = byId.get(id);
            if (memory !== undefined) {
                aboutFile.push(memory);
            }
        }
        aboutFile.sort(byUseCountDesc);
        for (const memory of aboutFile.slice(0, PROPAGATE_PER_FILE)) {
            // Additive: each file appears once in the result, so each
            // memory is lifted once per call.
            const existing = scoreById.get(memory.id) ?? 0;
            scoreById.set(memory.id, existing + boost);
        }
    }
}
283
/**
 * Whether a memory's subject looks test-related: true when tokenizing
 * the subject yields the token "test" or "tests". The check is on whole
 * tokens, not substrings.
 *
 * @param {string} subject - The memory's subject.
 * @returns {boolean} True if one of the tokens is exactly "test" or "tests".
 */
function subjectLooksLikeTest(subject) {
    const tokens = new Set(tokenize(subject));
    return tokens.has("test") || tokens.has("tests");
}
290
/**
 * Score multiplier implementing the `prefer` ranking lean.
 *
 * - "tests": 1.5 for test-looking subjects, otherwise 1.
 * - "history": 1.3 for the "git-history" category, otherwise 1.
 * - anything else (the "code" lean): 1.25 for the "code-map" category,
 *   and a further ×0.6 for test-looking subjects.
 *
 * @param {string} prefer - The requested lean.
 * @param {{subject: string, category: string}} mem - Memory being scored.
 * @returns {number} Multiplier to apply to the memory's score.
 */
function preferMultiplier(prefer, mem) {
    const isTest = subjectLooksLikeTest(mem.subject);
    switch (prefer) {
        case "tests":
            return isTest ? 1.5 : 1.0;
        case "history":
            return mem.category === "git-history" ? 1.3 : 1.0;
        default: {
            // "code": lean toward structural code, away from tests.
            let factor = 1.0;
            if (mem.category === "code-map") {
                factor = 1.25;
            }
            if (isTest) {
                factor *= 0.6;
            }
            return factor;
        }
    }
}
307
/**
 * Extract the file path a memory's subject refers to, or null.
 *
 * A subject prefixed with `co-change:` or `churn:` yields whatever
 * follows the prefix. Any other subject containing a colon (for example
 * `tree:…` or `recency:…`) is treated as a non-file subject and yields
 * null, as does the empty string. A subject without a colon is returned
 * unchanged as the path.
 *
 * @param {string} subject - The memory's subject.
 * @returns {string | null} The file path, or null when the subject is not about a file.
 */
function fileOfSubject(subject) {
    const FILE_PREFIXES = ["co-change:", "churn:"];
    for (const prefix of FILE_PREFIXES) {
        if (subject.startsWith(prefix)) {
            return subject.slice(prefix.length);
        }
    }
    // Every remaining "<category>:<rest>" subject is not a file.
    if (subject.includes(":")) {
        return null;
    }
    return subject === "" ? null : subject;
}
326
/**
 * Reduce `target` to its intersection with `other`, mutating `target`.
 * A missing `other` is treated as the empty set, so `target` is emptied.
 *
 * @param {Set<unknown>} target - Set to narrow in place.
 * @param {Set<unknown> | undefined} other - Set of allowed members, if any.
 * @returns {void}
 */
function intersectInPlace(target, other) {
    if (!other) {
        target.clear();
        return;
    }
    // Iterate over a snapshot so the deletions happen outside the walk.
    const members = [...target];
    for (const member of members) {
        if (other.has(member)) {
            continue;
        }
        target.delete(member);
    }
}
@@ -0,0 +1,30 @@
1
+ /**
2
+ * e5-embedder.ts — the real multilingual embedder, backed by the
3
+ * `intfloat/e5` family via transformers.js (ONNX).
4
+ *
5
+ * This file is the ONLY place that touches the model runtime, and it
6
+ * is imported dynamically — and only when `enableSemanticSearch` is
7
+ * on. `@huggingface/transformers` is an OPTIONAL peer dependency: it is
8
+ * not installed by a default `bun install`, so a user who never
9
+ * enables semantic search never downloads it. The import specifier is
10
+ * deliberately indirected through a `string`-typed constant so the
11
+ * TypeScript build does not require the package to be present.
12
+ *
13
+ * `createE5Embedder` throws if the runtime or the model cannot be
14
+ * loaded (package missing, network blocked, etc.); the caller is
15
+ * expected to catch that, log it, and fall back to lexical-only
16
+ * search — enabling the flag must never break the plugin.
17
+ *
18
+ * e5 is asymmetric: a query and the passage it should match take
19
+ * different prefixes ("query: " / "passage: "). Getting that wrong
20
+ * quietly degrades retrieval, so it is baked into the two methods.
21
+ */
22
+ import { DEFAULT_EMBEDDING_MODEL, type Embedder } from "./embedder.js";
23
+ export { DEFAULT_EMBEDDING_MODEL };
24
+ /**
25
+ * Build an e5 embedder. Loads the transformers.js runtime and the
26
+ * model (downloaded and cached on first use). Throws — with an
27
+ * actionable message — if the optional dependency is missing or the
28
+ * model cannot be fetched.
29
+ */
30
+ export declare function createE5Embedder(modelId?: string): Promise<Embedder>;
@@ -0,0 +1,91 @@
1
+ /**
2
+ * e5-embedder.ts — the real multilingual embedder, backed by the
3
+ * `intfloat/e5` family via transformers.js (ONNX).
4
+ *
5
+ * This file is the ONLY place that touches the model runtime, and it
6
+ * is imported dynamically — and only when `enableSemanticSearch` is
7
+ * on. `@huggingface/transformers` is an OPTIONAL peer dependency: it is
8
+ * not installed by a default `bun install`, so a user who never
9
+ * enables semantic search never downloads it. The import specifier is
10
+ * deliberately indirected through a `string`-typed constant so the
11
+ * TypeScript build does not require the package to be present.
12
+ *
13
+ * `createE5Embedder` throws if the runtime or the model cannot be
14
+ * loaded (package missing, network blocked, etc.); the caller is
15
+ * expected to catch that, log it, and fall back to lexical-only
16
+ * search — enabling the flag must never break the plugin.
17
+ *
18
+ * e5 is asymmetric: a query and the passage it should match take
19
+ * different prefixes ("query: " / "passage: "). Getting that wrong
20
+ * quietly degrades retrieval, so it is baked into the two methods.
21
+ */
22
+ import { DEFAULT_EMBEDDING_MODEL } from "./embedder.js";
23
+ export { DEFAULT_EMBEDDING_MODEL };
24
+ /** Passages longer than this are truncated before embedding (e5 caps at 512 tokens anyway). */
25
+ const MAX_CHARS = 2000;
26
+ /** Embed in batches of this many texts to bound peak memory. */
27
+ const BATCH = 32;
28
/**
 * Embedder that wraps a feature-extraction callable and applies the
 * "query: " / "passage: " text prefixes before embedding.
 */
class E5Embedder {
    /** Model identifier this embedder was built for. */
    id;
    /** Callable `(texts, options) => Promise<{ data, dims }>` that runs the model. */
    extract;
    /**
     * @param {string} id - Model identifier.
     * @param {Function} extract - Feature-extraction callable.
     */
    constructor(id, extract) {
        this.id = id;
        this.extract = extract;
    }
    /**
     * Embed a single search query.
     *
     * @param {string} text - Query text (clipped before embedding).
     * @returns {Promise<Float32Array>} The query vector.
     */
    async embedQuery(text) {
        const vectors = await this.run([`query: ${clip(text)}`]);
        return vectors[0];
    }
    /**
     * Embed passages, `BATCH` at a time, in input order.
     *
     * @param {string[]} texts - Passage texts (each clipped before embedding).
     * @returns {Promise<Float32Array[]>} One vector per input text.
     */
    async embedPassages(texts) {
        const vectors = [];
        let start = 0;
        while (start < texts.length) {
            const prefixed = texts
                .slice(start, start + BATCH)
                .map((passage) => `passage: ${clip(passage)}`);
            for (const vector of await this.run(prefixed)) {
                vectors.push(vector);
            }
            // Hand control back to the event loop after every batch so a
            // long pass does not monopolise it.
            await new Promise((resume) => setTimeout(resume, 0));
            start += BATCH;
        }
        return vectors;
    }
    /**
     * Run the model on one batch and split its flat output into one
     * vector per text. The vector width is the last entry of `dims`.
     *
     * @param {string[]} texts - Already-prefixed texts.
     * @returns {Promise<Float32Array[]>} Independent copies, one per text.
     */
    async run(texts) {
        const tensor = await this.extract(texts, { pooling: "mean", normalize: true });
        const flat = tensor.data instanceof Float32Array
            ? tensor.data
            : Float32Array.from(tensor.data);
        const width = tensor.dims[tensor.dims.length - 1];
        return Array.from(texts, (_, row) => flat.slice(row * width, (row + 1) * width));
    }
}
62
/**
 * Build an e5 embedder.
 *
 * Dynamically imports `@huggingface/transformers` and creates a
 * "feature-extraction" pipeline for `modelId`.
 *
 * @param {string} [modelId=DEFAULT_EMBEDDING_MODEL] - Model to load.
 * @returns {Promise<E5Embedder>} An embedder wrapping the loaded pipeline.
 * @throws {Error} If the runtime cannot be imported or the model cannot
 *   be loaded. The message is actionable, and the underlying failure is
 *   kept on `error.cause` so the real reason (package missing vs. a
 *   broken install, network error, ...) is not lost.
 */
export async function createE5Embedder(modelId = DEFAULT_EMBEDDING_MODEL) {
    // Held in a variable rather than written inline in `import(...)`,
    // so the optional dependency is only resolved at run time.
    const spec = "@huggingface/transformers";
    let mod;
    try {
        mod = await import(spec);
    }
    catch (e) {
        throw new Error("semantic search needs the optional dependency '@huggingface/transformers' — " +
            "install it with `bun add @huggingface/transformers` (or `npm install @huggingface/transformers`)", { cause: e });
    }
    let extract;
    try {
        extract = await mod.pipeline("feature-extraction", modelId);
    }
    catch (e) {
        throw new Error(`could not load embedding model '${modelId}': ${e instanceof Error ? e.message : String(e)}`, { cause: e });
    }
    return new E5Embedder(modelId, extract);
}
89
/**
 * Cap a text at `MAX_CHARS` characters before it is embedded.
 *
 * @param {string} text - Text to cap.
 * @returns {string} `text` itself when it fits, otherwise its first `MAX_CHARS` characters.
 */
function clip(text) {
    if (text.length <= MAX_CHARS) {
        return text;
    }
    return text.slice(0, MAX_CHARS);
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * embed-pass.ts — populate the vector store for memories that don't
3
+ * yet have an embedding.
4
+ *
5
+ * Runs only when semantic search is enabled, in the background, after
6
+ * prefill. It is incremental and crash-safe: vectors are persisted in
7
+ * chunks, so a memory embedded once is never re-embedded across
8
+ * sessions, and an interrupted pass simply resumes. A recall issued
9
+ * before the pass finishes still works — it just fuses against the
10
+ * vectors that exist so far (gracefully degrading toward pure lexical
11
+ * search when few are ready).
12
+ */
13
+ import type { Embedder } from "./embedder.js";
14
+ import type { MemoryRepository } from "../store/repository.js";
15
+ import type { VectorStore } from "../store/vector-store.js";
16
+ /**
17
+ * Embed every memory that lacks a vector, and drop vectors for
18
+ * memories that no longer exist (evicted or replaced). Returns the
19
+ * counts. Never throws into the caller — an embedding failure is
20
+ * reported via `log` and ends the pass cleanly, leaving lexical
21
+ * search fully functional.
22
+ */
23
+ export declare function embedMissingMemories(repo: MemoryRepository, vectorStore: VectorStore, embedder: Embedder, log?: (msg: string) => void): Promise<{
24
+ embedded: number;
25
+ pruned: number;
26
+ }>;
@@ -0,0 +1,43 @@
1
+ /**
2
+ * embed-pass.ts — populate the vector store for memories that don't
3
+ * yet have an embedding.
4
+ *
5
+ * Runs only when semantic search is enabled, in the background, after
6
+ * prefill. It is incremental and crash-safe: vectors are persisted in
7
+ * chunks, so a memory embedded once is never re-embedded across
8
+ * sessions, and an interrupted pass simply resumes. A recall issued
9
+ * before the pass finishes still works — it just fuses against the
10
+ * vectors that exist so far (gracefully degrading toward pure lexical
11
+ * search when few are ready).
12
+ */
13
+ /** Memories are embedded (and persisted) this many at a time. */
14
+ const CHUNK = 64;
15
/**
 * Embed every memory that lacks a vector, and drop vectors for
 * memories that no longer exist.
 *
 * Work proceeds in chunks of `CHUNK` memories; each chunk is embedded
 * (subject and content joined by a newline) and stored before the next
 * one starts, so progress made before a failure is kept. A failure
 * while embedding or storing is reported via `log` and ends the pass;
 * it is not rethrown.
 *
 * @param {object} repo - Provides `allMemories()`.
 * @param {object} vectorStore - Provides `prune(validIds)`, `has(id)` and `putMany(items)`.
 * @param {object} embedder - Provides `embedPassages(texts)`.
 * @param {(msg: string) => void} [log] - Optional sink for the failure message.
 * @returns {Promise<{embedded: number, pruned: number}>} Memories embedded in this
 *   pass and the value returned by `vectorStore.prune`.
 */
export async function embedMissingMemories(repo, vectorStore, embedder, log) {
    const memories = repo.allMemories();
    const validIds = new Set(memories.map((m) => m.id));
    const pruned = vectorStore.prune(validIds);
    const todo = memories.filter((m) => !vectorStore.has(m.id));
    let embedded = 0;
    try {
        for (let i = 0; i < todo.length; i += CHUNK) {
            const chunk = todo.slice(i, i + CHUNK);
            const texts = chunk.map((m) => `${m.subject}\n${m.content}`);
            const vecs = await embedder.embedPassages(texts);
            // Guard the id-to-vector pairing: a short or malformed result
            // would otherwise store `undefined` vectors under real ids.
            if (!Array.isArray(vecs) || vecs.length !== chunk.length) {
                const got = Array.isArray(vecs) ? vecs.length : typeof vecs;
                throw new Error(`embedder returned ${got} vectors for ${chunk.length} passages`);
            }
            vectorStore.putMany(chunk.map((m, j) => ({ id: m.id, vec: vecs[j] })));
            embedded += chunk.length;
        }
    }
    catch (e) {
        log?.(`semantic: embedding pass stopped after ${embedded}/${todo.length} — ` +
            `${e instanceof Error ? e.message : String(e)} (lexical search unaffected)`);
    }
    return { embedded, pruned };
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * embedder.ts — the small, dependency-free core of optional semantic
3
+ * search: the `Embedder` contract, vector math, and reciprocal-rank
4
+ * fusion.
5
+ *
6
+ * Nothing here imports a model runtime. The real e5 implementation
7
+ * lives in `e5-embedder.ts` and is only loaded when semantic search is
8
+ * switched on; tests substitute a deterministic stub. Keeping the
9
+ * contract and the math here means the fusion/retrieval logic is
10
+ * unit-testable without downloading a 100 MB model.
11
+ */
12
+ /**
13
+ * Turns text into a vector. e5 expects asymmetric prefixes — a query
14
+ * and the passage it should match are embedded differently — so the
15
+ * contract has two methods rather than one.
16
+ */
17
+ export interface Embedder {
18
+ /** Stable model identifier, e.g. "Xenova/multilingual-e5-small". */
19
+ readonly id: string;
20
+ /** Embed a search query. */
21
+ embedQuery(text: string): Promise<Float32Array>;
22
+ /** Embed a batch of documents/passages. */
23
+ embedPassages(texts: string[]): Promise<Float32Array[]>;
24
+ }
25
+ /**
26
+ * Default embedding model — small (~120 MB quantized), ~384-dim, and
27
+ * trained on 100+ languages, which is what makes cross-lingual recall
28
+ * (a query in one language, code/comments in another) work.
29
+ */
30
+ export declare const DEFAULT_EMBEDDING_MODEL = "Xenova/multilingual-e5-small";
31
+ /** L2-normalise a vector in place and return it. A zero vector is left as-is. */
32
+ export declare function normalize(v: Float32Array): Float32Array;
33
+ /**
34
+ * Cosine similarity of two equal-length vectors, in [-1, 1]. Returns 0
35
+ * on a length mismatch or a zero vector rather than NaN — a recall
36
+ * must never crash on a malformed vector.
37
+ */
38
+ export declare function cosineSimilarity(a: Float32Array, b: Float32Array): number;
39
+ /** Dot product — equals cosine similarity when both vectors are L2-normalised. */
40
+ export declare function dot(a: Float32Array, b: Float32Array): number;
41
+ /** An id with a fused score, highest first. */
42
+ export interface FusedItem {
43
+ id: string;
44
+ score: number;
45
+ }
46
+ /**
47
+ * Reciprocal-rank fusion of several ranked id-lists into one ranking.
48
+ *
49
+ * score(id) = Σ_lists 1 / (k + rank_in_list) rank is 1-based
50
+ *
51
+ * RRF is the standard way to merge a lexical (BM25) ranking with a
52
+ * semantic (vector) ranking: it needs no score calibration between the
53
+ * two — only the *positions* — and `k` (60 by convention) damps the
54
+ * influence of low ranks. An id absent from a list simply contributes
55
+ * nothing for that list. Deterministic; ties broken by first
56
+ * appearance so the result is stable.
57
+ */
58
+ export declare function reciprocalRankFusion(lists: string[][], k?: number): FusedItem[];
@@ -0,0 +1,85 @@
1
+ /**
2
+ * embedder.ts — the small, dependency-free core of optional semantic
3
+ * search: the `Embedder` contract, vector math, and reciprocal-rank
4
+ * fusion.
5
+ *
6
+ * Nothing here imports a model runtime. The real e5 implementation
7
+ * lives in `e5-embedder.ts` and is only loaded when semantic search is
8
+ * switched on; tests substitute a deterministic stub. Keeping the
9
+ * contract and the math here means the fusion/retrieval logic is
10
+ * unit-testable without downloading a 100 MB model.
11
+ */
12
+ /**
13
+ * Default embedding model — small (~120 MB quantized), ~384-dim, and
14
+ * trained on 100+ languages, which is what makes cross-lingual recall
15
+ * (a query in one language, code/comments in another) work.
16
+ */
17
+ export const DEFAULT_EMBEDDING_MODEL = "Xenova/multilingual-e5-small";
18
/**
 * Scale a vector to unit L2 length, in place.
 *
 * @param {Float32Array} v - Vector to normalise; modified in place.
 * @returns {Float32Array} The same `v`. A vector whose norm is not
 *   positive (e.g. all zeros) is returned unchanged.
 */
export function normalize(v) {
    let squares = 0;
    for (const component of v) {
        squares += component * component;
    }
    const length = Math.sqrt(squares);
    if (!(length > 0)) {
        return v;
    }
    for (const i of v.keys()) {
        v[i] = v[i] / length;
    }
    return v;
}
30
/**
 * Cosine similarity of two vectors.
 *
 * @param {Float32Array} a - First vector.
 * @param {Float32Array} b - Second vector.
 * @returns {number} `a·b / (|a| |b|)`, or 0 when the lengths differ,
 *   the vectors are empty, or the product of the norms is not positive.
 */
export function cosineSimilarity(a, b) {
    const size = a.length;
    if (size === 0 || size !== b.length) {
        return 0;
    }
    let product = 0;
    let squaresA = 0;
    let squaresB = 0;
    for (const [i, x] of a.entries()) {
        const y = b[i];
        product += x * y;
        squaresA += x * x;
        squaresB += y * y;
    }
    const scale = Math.sqrt(squaresA) * Math.sqrt(squaresB);
    if (scale > 0) {
        return product / scale;
    }
    return 0;
}
49
/**
 * Dot product of two vectors.
 *
 * @param {Float32Array} a - First vector.
 * @param {Float32Array} b - Second vector.
 * @returns {number} Sum of the element-wise products, or 0 when the lengths differ.
 */
export function dot(a, b) {
    if (a.length !== b.length) {
        return 0;
    }
    let total = 0;
    for (const [i, x] of a.entries()) {
        total += x * b[i];
    }
    return total;
}
58
/**
 * Reciprocal-rank fusion of several ranked id-lists into one ranking.
 *
 *   score(id) = Σ over lists of 1 / (k + rank)      (rank is 1-based)
 *
 * Only positions are used, never the lists' own scores. An id missing
 * from a list adds nothing for that list. Equal scores are ordered by
 * which id was encountered first while reading the lists in order.
 *
 * @param {string[][]} lists - Ranked id-lists, best first.
 * @param {number} [k=60] - Damping constant added to every rank.
 * @returns {Array<{id: string, score: number}>} Fused ranking, highest score first.
 */
export function reciprocalRankFusion(lists, k = 60) {
    const totals = new Map();
    const arrival = new Map();
    for (const list of lists) {
        for (const [position, id] of list.entries()) {
            const prior = totals.get(id) ?? 0;
            totals.set(id, prior + 1 / (k + position + 1));
            if (!arrival.has(id)) {
                // Number of ids seen so far = this id's arrival order.
                arrival.set(id, arrival.size);
            }
        }
    }
    const fused = Array.from(totals, ([id, score]) => ({ id, score }));
    fused.sort((x, y) => y.score - x.score || (arrival.get(x.id) - arrival.get(y.id)));
    return fused;
}