@loreai/core 0.0.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +26 -5
  3. package/dist/bun/agents-file.d.ts +59 -0
  4. package/dist/bun/agents-file.d.ts.map +1 -0
  5. package/dist/bun/config.d.ts +58 -0
  6. package/dist/bun/config.d.ts.map +1 -0
  7. package/dist/bun/curator.d.ts +35 -0
  8. package/dist/bun/curator.d.ts.map +1 -0
  9. package/dist/bun/db/driver.bun.d.ts +5 -0
  10. package/dist/bun/db/driver.bun.d.ts.map +1 -0
  11. package/dist/bun/db/driver.node.d.ts +15 -0
  12. package/dist/bun/db/driver.node.d.ts.map +1 -0
  13. package/dist/bun/db.d.ts +22 -0
  14. package/dist/bun/db.d.ts.map +1 -0
  15. package/dist/bun/distillation.d.ts +32 -0
  16. package/dist/bun/distillation.d.ts.map +1 -0
  17. package/dist/bun/embedding.d.ts +90 -0
  18. package/dist/bun/embedding.d.ts.map +1 -0
  19. package/dist/bun/gradient.d.ts +73 -0
  20. package/dist/bun/gradient.d.ts.map +1 -0
  21. package/dist/bun/index.d.ts +19 -0
  22. package/dist/bun/index.d.ts.map +1 -0
  23. package/dist/bun/index.js +28236 -0
  24. package/dist/bun/index.js.map +7 -0
  25. package/dist/bun/lat-reader.d.ts +69 -0
  26. package/dist/bun/lat-reader.d.ts.map +1 -0
  27. package/dist/bun/log.d.ts +17 -0
  28. package/dist/bun/log.d.ts.map +1 -0
  29. package/dist/bun/ltm.d.ts +138 -0
  30. package/dist/bun/ltm.d.ts.map +1 -0
  31. package/dist/bun/markdown.d.ts +37 -0
  32. package/dist/bun/markdown.d.ts.map +1 -0
  33. package/dist/bun/prompt.d.ts +47 -0
  34. package/dist/bun/prompt.d.ts.map +1 -0
  35. package/dist/bun/recall.d.ts +41 -0
  36. package/dist/bun/recall.d.ts.map +1 -0
  37. package/dist/bun/search.d.ts +113 -0
  38. package/dist/bun/search.d.ts.map +1 -0
  39. package/dist/bun/temporal.d.ts +66 -0
  40. package/dist/bun/temporal.d.ts.map +1 -0
  41. package/dist/bun/types.d.ts +180 -0
  42. package/dist/bun/types.d.ts.map +1 -0
  43. package/dist/bun/worker.d.ts +6 -0
  44. package/dist/bun/worker.d.ts.map +1 -0
  45. package/dist/node/agents-file.d.ts +59 -0
  46. package/dist/node/agents-file.d.ts.map +1 -0
  47. package/dist/node/config.d.ts +58 -0
  48. package/dist/node/config.d.ts.map +1 -0
  49. package/dist/node/curator.d.ts +35 -0
  50. package/dist/node/curator.d.ts.map +1 -0
  51. package/dist/node/db/driver.bun.d.ts +5 -0
  52. package/dist/node/db/driver.bun.d.ts.map +1 -0
  53. package/dist/node/db/driver.node.d.ts +15 -0
  54. package/dist/node/db/driver.node.d.ts.map +1 -0
  55. package/dist/node/db.d.ts +22 -0
  56. package/dist/node/db.d.ts.map +1 -0
  57. package/dist/node/distillation.d.ts +32 -0
  58. package/dist/node/distillation.d.ts.map +1 -0
  59. package/dist/node/embedding.d.ts +90 -0
  60. package/dist/node/embedding.d.ts.map +1 -0
  61. package/dist/node/gradient.d.ts +73 -0
  62. package/dist/node/gradient.d.ts.map +1 -0
  63. package/dist/node/index.d.ts +19 -0
  64. package/dist/node/index.d.ts.map +1 -0
  65. package/dist/node/index.js +28253 -0
  66. package/dist/node/index.js.map +7 -0
  67. package/dist/node/lat-reader.d.ts +69 -0
  68. package/dist/node/lat-reader.d.ts.map +1 -0
  69. package/dist/node/log.d.ts +17 -0
  70. package/dist/node/log.d.ts.map +1 -0
  71. package/dist/node/ltm.d.ts +138 -0
  72. package/dist/node/ltm.d.ts.map +1 -0
  73. package/dist/node/markdown.d.ts +37 -0
  74. package/dist/node/markdown.d.ts.map +1 -0
  75. package/dist/node/prompt.d.ts +47 -0
  76. package/dist/node/prompt.d.ts.map +1 -0
  77. package/dist/node/recall.d.ts +41 -0
  78. package/dist/node/recall.d.ts.map +1 -0
  79. package/dist/node/search.d.ts +113 -0
  80. package/dist/node/search.d.ts.map +1 -0
  81. package/dist/node/temporal.d.ts +66 -0
  82. package/dist/node/temporal.d.ts.map +1 -0
  83. package/dist/node/types.d.ts +180 -0
  84. package/dist/node/types.d.ts.map +1 -0
  85. package/dist/node/worker.d.ts +6 -0
  86. package/dist/node/worker.d.ts.map +1 -0
  87. package/dist/types/agents-file.d.ts +59 -0
  88. package/dist/types/agents-file.d.ts.map +1 -0
  89. package/dist/types/config.d.ts +58 -0
  90. package/dist/types/config.d.ts.map +1 -0
  91. package/dist/types/curator.d.ts +35 -0
  92. package/dist/types/curator.d.ts.map +1 -0
  93. package/dist/types/db/driver.bun.d.ts +5 -0
  94. package/dist/types/db/driver.bun.d.ts.map +1 -0
  95. package/dist/types/db/driver.node.d.ts +15 -0
  96. package/dist/types/db/driver.node.d.ts.map +1 -0
  97. package/dist/types/db.d.ts +22 -0
  98. package/dist/types/db.d.ts.map +1 -0
  99. package/dist/types/distillation.d.ts +32 -0
  100. package/dist/types/distillation.d.ts.map +1 -0
  101. package/dist/types/embedding.d.ts +90 -0
  102. package/dist/types/embedding.d.ts.map +1 -0
  103. package/dist/types/gradient.d.ts +73 -0
  104. package/dist/types/gradient.d.ts.map +1 -0
  105. package/dist/types/index.d.ts +19 -0
  106. package/dist/types/index.d.ts.map +1 -0
  107. package/dist/types/lat-reader.d.ts +69 -0
  108. package/dist/types/lat-reader.d.ts.map +1 -0
  109. package/dist/types/log.d.ts +17 -0
  110. package/dist/types/log.d.ts.map +1 -0
  111. package/dist/types/ltm.d.ts +138 -0
  112. package/dist/types/ltm.d.ts.map +1 -0
  113. package/dist/types/markdown.d.ts +37 -0
  114. package/dist/types/markdown.d.ts.map +1 -0
  115. package/dist/types/prompt.d.ts +47 -0
  116. package/dist/types/prompt.d.ts.map +1 -0
  117. package/dist/types/recall.d.ts +41 -0
  118. package/dist/types/recall.d.ts.map +1 -0
  119. package/dist/types/search.d.ts +113 -0
  120. package/dist/types/search.d.ts.map +1 -0
  121. package/dist/types/temporal.d.ts +66 -0
  122. package/dist/types/temporal.d.ts.map +1 -0
  123. package/dist/types/types.d.ts +180 -0
  124. package/dist/types/types.d.ts.map +1 -0
  125. package/dist/types/worker.d.ts +6 -0
  126. package/dist/types/worker.d.ts.map +1 -0
  127. package/package.json +48 -5
  128. package/src/agents-file.ts +406 -0
  129. package/src/config.ts +132 -0
  130. package/src/curator.ts +220 -0
  131. package/src/db/driver.bun.ts +18 -0
  132. package/src/db/driver.node.ts +54 -0
  133. package/src/db.ts +433 -0
  134. package/src/distillation.ts +433 -0
  135. package/src/embedding.ts +528 -0
  136. package/src/gradient.ts +1387 -0
  137. package/src/index.ts +109 -0
  138. package/src/lat-reader.ts +374 -0
  139. package/src/log.ts +27 -0
  140. package/src/ltm.ts +861 -0
  141. package/src/markdown.ts +129 -0
  142. package/src/prompt.ts +454 -0
  143. package/src/recall.ts +446 -0
  144. package/src/search.ts +330 -0
  145. package/src/temporal.ts +379 -0
  146. package/src/types.ts +199 -0
  147. package/src/worker.ts +26 -0
@@ -0,0 +1,528 @@
1
+ /**
2
+ * Embedding integration for vector search.
3
+ *
4
+ * Supports multiple embedding providers (Voyage AI, OpenAI) behind a common
5
+ * interface. Provides embedding generation, pure-JS cosine similarity, and
6
+ * vector search over the knowledge and distillation tables. All operations
7
+ * are gated behind `search.embeddings.enabled` config + the provider's API
8
+ * key env var — falls back silently to FTS-only when unavailable.
9
+ */
10
+
11
+ import { db } from "./db";
12
+ import { config } from "./config";
13
+ import * as log from "./log";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Provider interface
17
+ // ---------------------------------------------------------------------------
18
+
19
/**
 * Contract shared by every embedding backend in this module.
 *
 * - `maxBatchSize` is the batch size the backfill loops use when slicing rows
 *   into requests for this provider.
 * - `embed` converts `texts` into vectors; `inputType` tells the backend
 *   whether the texts are documents being stored or a search query.
 */
export interface EmbeddingProvider {
  readonly maxBatchSize: number;
  embed(
    texts: string[],
    inputType: "document" | "query",
  ): Promise<Float32Array[]>;
}
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Voyage AI provider
26
+ // ---------------------------------------------------------------------------
27
+
28
/** Voyage AI embeddings endpoint. */
const VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings";

/** The parts of the Voyage embeddings response body that this module reads. */
type VoyageResponse = {
  data: Array<{ embedding: number[]; index: number }>;
  model: string;
  usage: { total_tokens: number };
};

/**
 * Embedding provider backed by the Voyage AI REST API.
 *
 * Sends `input`, `model`, `input_type` and `output_dimension` in a single
 * POST and returns the embeddings ordered by the `index` field of the
 * response so that result `i` belongs to `texts[i]`.
 */
class VoyageProvider implements EmbeddingProvider {
  readonly maxBatchSize = 128;
  private readonly apiKey: string;
  private readonly model: string;
  private readonly dimensions: number;

  constructor(apiKey: string, model: string, dimensions: number) {
    this.apiKey = apiKey;
    this.model = model;
    this.dimensions = dimensions;
  }

  /**
   * Embed `texts` with the configured model.
   *
   * @param texts     Texts to embed; an empty array resolves to `[]` without
   *                  issuing a request.
   * @param inputType Forwarded to the API as `input_type`.
   * @returns One `Float32Array` per input text, in input order.
   * @throws On a non-2xx response, or when the response does not contain
   *         exactly one embedding per input text.
   */
  async embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]> {
    // Nothing to embed — avoid a pointless (and likely rejected) request.
    if (texts.length === 0) return [];

    const res = await fetch(VOYAGE_API_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        input: texts,
        model: this.model,
        input_type: inputType,
        output_dimension: this.dimensions,
      }),
    });

    if (!res.ok) {
      const body = await res.text().catch(() => "");
      throw new Error(`Voyage API ${res.status}: ${body}`);
    }

    const json = (await res.json()) as Partial<VoyageResponse> | null;
    const data = json?.data;
    // Callers pair result[i] with texts[i]; a short or malformed response
    // would silently attach vectors to the wrong rows, so fail loudly instead.
    if (!Array.isArray(data) || data.length !== texts.length) {
      const got = Array.isArray(data) ? String(data.length) : "no";
      throw new Error(`Voyage API returned ${got} embeddings for ${texts.length} inputs`);
    }

    // Copy before sorting so the parsed response object is left untouched.
    const sorted = [...data].sort((a, b) => a.index - b.index);
    return sorted.map((d) => new Float32Array(d.embedding));
  }
}
73
+
74
+ // ---------------------------------------------------------------------------
75
+ // OpenAI provider
76
+ // ---------------------------------------------------------------------------
77
+
78
/** OpenAI embeddings endpoint. */
const OPENAI_API_URL = "https://api.openai.com/v1/embeddings";

/** The parts of the OpenAI embeddings response body that this module reads. */
type OpenAIResponse = {
  data: Array<{ embedding: number[]; index: number }>;
  model: string;
  usage: { prompt_tokens: number; total_tokens: number };
};

/**
 * Embedding provider backed by the OpenAI REST API.
 *
 * The `dimensions` request field is only sent for models whose name starts
 * with `text-embedding-3`; the document/query distinction is not forwarded.
 */
class OpenAIProvider implements EmbeddingProvider {
  readonly maxBatchSize = 2048;
  private readonly apiKey: string;
  private readonly model: string;
  private readonly dimensions: number;

  constructor(apiKey: string, model: string, dimensions: number) {
    this.apiKey = apiKey;
    this.model = model;
    this.dimensions = dimensions;
  }

  /**
   * Embed `texts` with the configured model.
   *
   * @param texts      Texts to embed; an empty array resolves to `[]` without
   *                   issuing a request.
   * @param _inputType Accepted for interface compatibility; unused here.
   * @returns One `Float32Array` per input text, in input order.
   * @throws On a non-2xx response, or when the response does not contain
   *         exactly one embedding per input text.
   */
  async embed(texts: string[], _inputType: "document" | "query"): Promise<Float32Array[]> {
    // Nothing to embed — avoid a pointless (and likely rejected) request.
    if (texts.length === 0) return [];

    const body: Record<string, unknown> = {
      input: texts,
      model: this.model,
    };
    // OpenAI supports dimensions parameter for text-embedding-3-* models
    if (this.model.startsWith("text-embedding-3")) {
      body.dimensions = this.dimensions;
    }

    const res = await fetch(OPENAI_API_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify(body),
    });

    if (!res.ok) {
      const responseBody = await res.text().catch(() => "");
      throw new Error(`OpenAI API ${res.status}: ${responseBody}`);
    }

    const json = (await res.json()) as Partial<OpenAIResponse> | null;
    const data = json?.data;
    // Callers pair result[i] with texts[i]; a short or malformed response
    // would silently attach vectors to the wrong rows, so fail loudly instead.
    if (!Array.isArray(data) || data.length !== texts.length) {
      const got = Array.isArray(data) ? String(data.length) : "no";
      throw new Error(`OpenAI API returned ${got} embeddings for ${texts.length} inputs`);
    }

    // Copy before sorting so the parsed response object is left untouched.
    const sorted = [...data].sort((a, b) => a.index - b.index);
    return sorted.map((d) => new Float32Array(d.embedding));
  }
}
127
+
128
+ // ---------------------------------------------------------------------------
129
+ // Provider resolution
130
+ // ---------------------------------------------------------------------------
131
+
132
/**
 * Default model name and vector dimensionality for each provider,
 * keyed by provider name.
 */
const PROVIDER_DEFAULTS: Record<string, { model: string; dimensions: number }> = {
  openai: { dimensions: 1536, model: "text-embedding-3-small" },
  voyage: { dimensions: 1024, model: "voyage-code-3" },
};
137
+
138
/** Maps a provider name to the environment variable holding its API key. */
const PROVIDER_ENV_KEYS: Record<string, string> = {
  voyage: "VOYAGE_API_KEY",
  openai: "OPENAI_API_KEY",
};

/**
 * Look up the API key for `provider` in the process environment.
 *
 * @returns The value of the provider's env var, or `undefined` when the
 *          provider has no entry in `PROVIDER_ENV_KEYS` or the var is unset.
 */
function getProviderApiKey(provider: string): string | undefined {
  const variableName = PROVIDER_ENV_KEYS[provider];
  if (!variableName) {
    return undefined;
  }
  return process.env[variableName];
}
148
+
149
/**
 * Memoized result of provider resolution.
 * `undefined` = not resolved yet, `null` = resolved but unavailable.
 */
let cachedProvider: EmbeddingProvider | null | undefined;

/**
 * Resolve (and cache) the embedding provider described by
 * `config().search.embeddings`.
 *
 * Resolution yields `null` when embeddings are explicitly disabled, when the
 * provider name is not recognised (this case is logged), or when the
 * provider's API key env var is unset. The outcome — including `null` — is
 * cached until `cachedProvider` is reset to `undefined`.
 *
 * Model and dimensions come from config; the per-provider entry in
 * `PROVIDER_DEFAULTS` is used only when the config value is null/undefined.
 */
function getProvider(): EmbeddingProvider | null {
  if (cachedProvider !== undefined) return cachedProvider;

  const cfg = config().search.embeddings;
  if (cfg.enabled === false) {
    cachedProvider = null;
    return null;
  }

  const providerName: string = cfg.provider;

  // Validate the name BEFORE the API-key lookup: an unrecognised provider has
  // no env-key mapping either, so checking the key first would return early
  // and the misconfiguration would never be reported.
  const defaults = PROVIDER_DEFAULTS[providerName];
  if (!defaults) {
    log.info(`unknown embedding provider: ${providerName}`);
    cachedProvider = null;
    return null;
  }

  const apiKey = getProviderApiKey(providerName);
  if (!apiKey) {
    cachedProvider = null;
    return null;
  }

  // Explicit config wins; provider defaults only fill in missing values.
  const model = cfg.model ?? defaults.model;
  const dimensions = cfg.dimensions ?? defaults.dimensions;

  switch (providerName) {
    case "voyage":
      cachedProvider = new VoyageProvider(apiKey, model, dimensions);
      break;
    case "openai":
      cachedProvider = new OpenAIProvider(apiKey, model, dimensions);
      break;
    default:
      // Name has defaults and an env key but no implementation wired up here.
      log.info(`unknown embedding provider: ${providerName}`);
      cachedProvider = null;
  }

  return cachedProvider;
}
185
+
186
/**
 * Forget the memoized provider so that the next lookup resolves it again
 * from the current config and environment. Intended to be called after a
 * config change.
 */
export function resetProvider(): void {
  cachedProvider = undefined;
}
190
+
191
+ // ---------------------------------------------------------------------------
192
+ // Availability
193
+ // ---------------------------------------------------------------------------
194
+
195
/**
 * Report whether embeddings can currently be generated.
 *
 * True exactly when provider resolution yields a provider, i.e. `getProvider()`
 * does not return `null`.
 */
export function isAvailable(): boolean {
  const provider = getProvider();
  return provider !== null;
}
201
+
202
+ // ---------------------------------------------------------------------------
203
+ // Public embed API
204
+ // ---------------------------------------------------------------------------
205
+
206
/**
 * Generate embeddings for the given texts using the configured provider.
 *
 * Inputs larger than the provider's `maxBatchSize` are split into
 * consecutive batches that are sent one after another; the results are
 * concatenated so that result `i` still corresponds to `texts[i]`.
 *
 * @param texts     Array of texts to embed
 * @param inputType "document" for storage, "query" for search
 * @returns Float32Array per input text
 * @throws On API errors or missing provider
 */
export async function embed(
  texts: string[],
  inputType: "document" | "query",
): Promise<Float32Array[]> {
  const provider = getProvider();
  if (!provider) throw new Error("No embedding provider available");

  // Guard against a zero/negative batch size so the loop below always advances.
  const batchSize = Math.max(1, provider.maxBatchSize);

  // Common case: everything fits in one request.
  if (texts.length <= batchSize) return provider.embed(texts, inputType);

  const vectors: Float32Array[] = [];
  for (let start = 0; start < texts.length; start += batchSize) {
    const chunk = await provider.embed(texts.slice(start, start + batchSize), inputType);
    for (const vec of chunk) vectors.push(vec);
  }
  return vectors;
}
222
+
223
+ // ---------------------------------------------------------------------------
224
+ // Cosine similarity (pure JS)
225
+ // ---------------------------------------------------------------------------
226
+
227
/**
 * Cosine similarity of two vectors.
 *
 * Only the first `min(a.length, b.length)` components are compared. The
 * result lies in [-1, 1], with 1 meaning the same direction. When either
 * compared prefix has zero magnitude (including empty vectors) the result
 * is 0 rather than NaN.
 */
export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  const count = Math.min(a.length, b.length);

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  let index = 0;
  while (index < count) {
    const x = a[index];
    const y = b[index];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    index += 1;
  }

  const magnitude = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  return magnitude === 0 ? 0 : dotProduct / magnitude;
}
246
+
247
+ // ---------------------------------------------------------------------------
248
+ // BLOB conversion
249
+ // ---------------------------------------------------------------------------
250
+
251
/**
 * Wrap the bytes of a Float32Array in a Buffer for SQLite BLOB storage.
 * The Buffer is created over the array's own ArrayBuffer, limited to the
 * array's byte offset and byte length.
 */
export function toBlob(arr: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = arr;
  return Buffer.from(buffer, byteOffset, byteLength);
}
255
+
256
/**
 * Decode a SQLite BLOB (Buffer/Uint8Array) into a Float32Array.
 * The bytes are first copied into a freshly allocated Uint8Array, and the
 * Float32Array is created over that copy with `byteLength / 4` elements.
 */
export function fromBlob(blob: Buffer | Uint8Array): Float32Array {
  const copy = new Uint8Array(blob.byteLength);
  copy.set(blob);
  const floatCount = copy.byteLength / 4;
  return new Float32Array(copy.buffer, copy.byteOffset, floatCount);
}
261
+
262
+ // ---------------------------------------------------------------------------
263
+ // Vector search — knowledge
264
+ // ---------------------------------------------------------------------------
265
+
266
/** A row id paired with its cosine similarity to the query embedding. */
type VectorHit = { id: string; similarity: number };

/**
 * Rank knowledge entries against a query embedding.
 *
 * Loads every `knowledge` row that has an embedding and `confidence > 0.2`,
 * scores each one with cosine similarity, and returns the `limit` best hits
 * in descending similarity order. This is a linear scan over all such rows.
 */
export function vectorSearch(
  queryEmbedding: Float32Array,
  limit = 10,
): VectorHit[] {
  const candidates = db()
    .query("SELECT id, embedding FROM knowledge WHERE embedding IS NOT NULL AND confidence > 0.2")
    .all() as Array<{ id: string; embedding: Buffer }>;

  return candidates
    .map(
      ({ id, embedding }): VectorHit => ({
        id,
        similarity: cosineSimilarity(queryEmbedding, fromBlob(embedding)),
      }),
    )
    .sort((left, right) => right.similarity - left.similarity)
    .slice(0, limit);
}
291
+
292
+ // ---------------------------------------------------------------------------
293
+ // Vector search — distillations
294
+ // ---------------------------------------------------------------------------
295
+
296
/**
 * Rank distillations against a query embedding.
 *
 * Loads every `distillations` row that has an embedding and `archived = 0`,
 * scores each one with cosine similarity, and returns the `limit` best hits
 * in descending similarity order. This is a linear scan over all such rows.
 */
export function vectorSearchDistillations(
  queryEmbedding: Float32Array,
  limit = 10,
): VectorHit[] {
  const candidates = db()
    .query(
      "SELECT id, embedding FROM distillations WHERE embedding IS NOT NULL AND archived = 0",
    )
    .all() as Array<{ id: string; embedding: Buffer }>;

  const hits = candidates.map(({ id, embedding }): VectorHit => {
    const similarity = cosineSimilarity(queryEmbedding, fromBlob(embedding));
    return { id, similarity };
  });

  hits.sort((left, right) => right.similarity - left.similarity);
  return hits.slice(0, limit);
}
321
+
322
+ // ---------------------------------------------------------------------------
323
+ // Fire-and-forget embedding
324
+ // ---------------------------------------------------------------------------
325
+
326
/**
 * Embed one knowledge entry (`title`, newline, `content`) and write the
 * vector to its `knowledge.embedding` column.
 *
 * Runs in the background: the function returns immediately and any failure
 * while embedding or storing is logged rather than thrown.
 */
export function embedKnowledgeEntry(
  id: string,
  title: string,
  content: string,
): void {
  const text = `${title}\n${content}`;

  void (async () => {
    try {
      const [vec] = await embed([text], "document");
      db()
        .query("UPDATE knowledge SET embedding = ? WHERE id = ?")
        .run(toBlob(vec), id);
    } catch (err) {
      log.info("embedding failed for knowledge entry", id, ":", err);
    }
  })();
}
347
+
348
/**
 * Embed a distillation's observations and write the vector to its
 * `distillations.embedding` column.
 *
 * Fire-and-forget — errors are logged, never thrown.
 * Empty observations are skipped without a request, matching the
 * distillation backfill query, which also excludes `observations = ''`.
 */
export function embedDistillation(
  id: string,
  observations: string,
): void {
  // Nothing to embed; sending an empty string would only produce an API
  // error and a log line. NOTE(review): assumes providers reject empty input.
  if (observations === "") return;

  embed([observations], "document")
    .then(([vec]) => {
      // Fail with a clear message instead of a TypeError inside toBlob.
      if (!vec) throw new Error("provider returned no embedding");
      db()
        .query("UPDATE distillations SET embedding = ? WHERE id = ?")
        .run(toBlob(vec), id);
    })
    .catch((err) => {
      log.info("embedding failed for distillation", id, ":", err);
    });
}
367
+
368
+ // ---------------------------------------------------------------------------
369
+ // Config change detection
370
+ // ---------------------------------------------------------------------------
371
+
372
/**
 * Produce the `provider:model:dimensions` string for the current embedding
 * config. Two configs with the same fingerprint are treated as producing
 * compatible embeddings; a different fingerprint marks stored vectors stale.
 */
function configFingerprint(): string {
  const { provider, model, dimensions } = config().search.embeddings;
  return `${provider}:${model}:${dimensions}`;
}
381
+
382
/** `kv_meta` key under which the last-seen embedding fingerprint is stored. */
const EMBEDDING_CONFIG_KEY = "lore:embedding_config";

/**
 * Compare the current embedding config fingerprint with the one stored in
 * `kv_meta` and bring the database in line with it.
 *
 * - Stored fingerprint equals the current one: nothing happens, returns false.
 * - A different fingerprint is stored: if any knowledge or distillation row
 *   has an embedding, all embeddings in both tables are set to NULL and the
 *   change is logged.
 * - No fingerprint is stored yet: existing embeddings are left untouched.
 *
 * In the last two cases the current fingerprint is written to `kv_meta` and
 * the function returns true — note that this includes the first run and the
 * case where there was nothing to clear.
 */
export function checkConfigChange(): boolean {
  const previous = db()
    .query("SELECT value FROM kv_meta WHERE key = ?")
    .get(EMBEDDING_CONFIG_KEY) as { value: string } | null;

  const current = configFingerprint();

  const unchanged = Boolean(previous) && previous!.value === current;
  if (unchanged) {
    return false;
  }

  // Only a *changed* fingerprint invalidates vectors; a missing one does not.
  if (previous) {
    const inKnowledge = db()
      .query("SELECT COUNT(*) as n FROM knowledge WHERE embedding IS NOT NULL")
      .get() as { n: number };
    const inDistillations = db()
      .query("SELECT COUNT(*) as n FROM distillations WHERE embedding IS NOT NULL")
      .get() as { n: number };
    const staleCount = inKnowledge.n + inDistillations.n;

    if (staleCount > 0) {
      db().query("UPDATE knowledge SET embedding = NULL").run();
      db().query("UPDATE distillations SET embedding = NULL").run();
      log.info(
        `embedding config changed (${previous.value} → ${current}), cleared ${staleCount} stale embeddings`,
      );
    }
  }

  // Upsert the fingerprint so the next call sees the config as unchanged.
  db()
    .query(
      "INSERT INTO kv_meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = ?",
    )
    .run(EMBEDDING_CONFIG_KEY, current, current);

  return true;
}
428
+
429
+ // ---------------------------------------------------------------------------
430
+ // Backfill — knowledge
431
+ // ---------------------------------------------------------------------------
432
+
433
/**
 * Embed all knowledge entries that are missing embeddings.
 *
 * First runs `checkConfigChange()`, which clears stale embeddings when the
 * provider/model/dimensions fingerprint changed, then embeds every
 * `knowledge` row with `embedding IS NULL AND confidence > 0.2` in batches of
 * the provider's `maxBatchSize`. A failing batch is logged and skipped; the
 * remaining batches still run.
 *
 * @returns The number of entries embedded (0 when no provider is available).
 */
export async function backfillEmbeddings(): Promise<number> {
  // Detect config changes and clear stale embeddings
  checkConfigChange();

  const provider = getProvider();
  if (!provider) return 0;

  const rows = db()
    .query("SELECT id, title, content FROM knowledge WHERE embedding IS NULL AND confidence > 0.2")
    .all() as Array<{ id: string; title: string; content: string }>;

  if (!rows.length) return 0;

  // Never let a zero/negative batch size stall the loop.
  const batchSize = Math.max(1, provider.maxBatchSize);
  // Prepared once and reused for every row of every batch.
  const update = db().prepare(
    "UPDATE knowledge SET embedding = ? WHERE id = ?",
  );
  let embedded = 0;

  for (let i = 0; i < rows.length; i += batchSize) {
    const batch = rows.slice(i, i + batchSize);
    const texts = batch.map((r) => `${r.title}\n${r.content}`);

    try {
      const vectors = await embed(texts, "document");
      // Vectors are matched to rows by position; refuse to write anything if
      // the counts differ, otherwise rows could receive another row's vector.
      if (vectors.length !== batch.length) {
        throw new Error(`expected ${batch.length} embeddings, got ${vectors.length}`);
      }

      for (let j = 0; j < batch.length; j++) {
        update.run(toBlob(vectors[j]), batch[j].id);
        embedded++;
      }
    } catch (err) {
      log.info(`embedding backfill batch ${i}-${i + batch.length} failed:`, err);
    }
  }

  if (embedded > 0) {
    log.info(`embedded ${embedded} knowledge entries`);
  }
  return embedded;
}
480
+
481
+ // ---------------------------------------------------------------------------
482
+ // Backfill — distillations
483
+ // ---------------------------------------------------------------------------
484
+
485
/**
 * Embed all non-archived distillations that are missing embeddings.
 *
 * Runs `checkConfigChange()` first so that a standalone call after a
 * provider/model/dimensions change does not mix fresh vectors with stale
 * ones; when the knowledge backfill already ran, this is a no-op lookup.
 * Then embeds every `distillations` row with `embedding IS NULL`,
 * `archived = 0` and non-empty observations in batches of the provider's
 * `maxBatchSize`. A failing batch is logged and skipped.
 *
 * @returns The number of distillations embedded (0 when no provider is
 *          available).
 */
export async function backfillDistillationEmbeddings(): Promise<number> {
  // Detect config changes and clear stale embeddings
  checkConfigChange();

  const provider = getProvider();
  if (!provider) return 0;

  const rows = db()
    .query(
      "SELECT id, observations FROM distillations WHERE embedding IS NULL AND archived = 0 AND observations != ''",
    )
    .all() as Array<{ id: string; observations: string }>;

  if (!rows.length) return 0;

  // Never let a zero/negative batch size stall the loop.
  const batchSize = Math.max(1, provider.maxBatchSize);
  // Prepared once and reused for every row of every batch.
  const update = db().prepare(
    "UPDATE distillations SET embedding = ? WHERE id = ?",
  );
  let embedded = 0;

  for (let i = 0; i < rows.length; i += batchSize) {
    const batch = rows.slice(i, i + batchSize);
    const texts = batch.map((r) => r.observations);

    try {
      const vectors = await embed(texts, "document");
      // Vectors are matched to rows by position; refuse to write anything if
      // the counts differ, otherwise rows could receive another row's vector.
      if (vectors.length !== batch.length) {
        throw new Error(`expected ${batch.length} embeddings, got ${vectors.length}`);
      }

      for (let j = 0; j < batch.length; j++) {
        update.run(toBlob(vectors[j]), batch[j].id);
        embedded++;
      }
    } catch (err) {
      log.info(`distillation embedding backfill batch ${i}-${i + batch.length} failed:`, err);
    }
  }

  if (embedded > 0) {
    log.info(`embedded ${embedded} distillations`);
  }
  return embedded;
}