@loreai/core 0.0.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +26 -5
- package/dist/bun/agents-file.d.ts +59 -0
- package/dist/bun/agents-file.d.ts.map +1 -0
- package/dist/bun/config.d.ts +58 -0
- package/dist/bun/config.d.ts.map +1 -0
- package/dist/bun/curator.d.ts +35 -0
- package/dist/bun/curator.d.ts.map +1 -0
- package/dist/bun/db/driver.bun.d.ts +5 -0
- package/dist/bun/db/driver.bun.d.ts.map +1 -0
- package/dist/bun/db/driver.node.d.ts +15 -0
- package/dist/bun/db/driver.node.d.ts.map +1 -0
- package/dist/bun/db.d.ts +22 -0
- package/dist/bun/db.d.ts.map +1 -0
- package/dist/bun/distillation.d.ts +32 -0
- package/dist/bun/distillation.d.ts.map +1 -0
- package/dist/bun/embedding.d.ts +90 -0
- package/dist/bun/embedding.d.ts.map +1 -0
- package/dist/bun/gradient.d.ts +73 -0
- package/dist/bun/gradient.d.ts.map +1 -0
- package/dist/bun/index.d.ts +19 -0
- package/dist/bun/index.d.ts.map +1 -0
- package/dist/bun/index.js +28236 -0
- package/dist/bun/index.js.map +7 -0
- package/dist/bun/lat-reader.d.ts +69 -0
- package/dist/bun/lat-reader.d.ts.map +1 -0
- package/dist/bun/log.d.ts +17 -0
- package/dist/bun/log.d.ts.map +1 -0
- package/dist/bun/ltm.d.ts +138 -0
- package/dist/bun/ltm.d.ts.map +1 -0
- package/dist/bun/markdown.d.ts +37 -0
- package/dist/bun/markdown.d.ts.map +1 -0
- package/dist/bun/prompt.d.ts +47 -0
- package/dist/bun/prompt.d.ts.map +1 -0
- package/dist/bun/recall.d.ts +41 -0
- package/dist/bun/recall.d.ts.map +1 -0
- package/dist/bun/search.d.ts +113 -0
- package/dist/bun/search.d.ts.map +1 -0
- package/dist/bun/temporal.d.ts +66 -0
- package/dist/bun/temporal.d.ts.map +1 -0
- package/dist/bun/types.d.ts +180 -0
- package/dist/bun/types.d.ts.map +1 -0
- package/dist/bun/worker.d.ts +6 -0
- package/dist/bun/worker.d.ts.map +1 -0
- package/dist/node/agents-file.d.ts +59 -0
- package/dist/node/agents-file.d.ts.map +1 -0
- package/dist/node/config.d.ts +58 -0
- package/dist/node/config.d.ts.map +1 -0
- package/dist/node/curator.d.ts +35 -0
- package/dist/node/curator.d.ts.map +1 -0
- package/dist/node/db/driver.bun.d.ts +5 -0
- package/dist/node/db/driver.bun.d.ts.map +1 -0
- package/dist/node/db/driver.node.d.ts +15 -0
- package/dist/node/db/driver.node.d.ts.map +1 -0
- package/dist/node/db.d.ts +22 -0
- package/dist/node/db.d.ts.map +1 -0
- package/dist/node/distillation.d.ts +32 -0
- package/dist/node/distillation.d.ts.map +1 -0
- package/dist/node/embedding.d.ts +90 -0
- package/dist/node/embedding.d.ts.map +1 -0
- package/dist/node/gradient.d.ts +73 -0
- package/dist/node/gradient.d.ts.map +1 -0
- package/dist/node/index.d.ts +19 -0
- package/dist/node/index.d.ts.map +1 -0
- package/dist/node/index.js +28253 -0
- package/dist/node/index.js.map +7 -0
- package/dist/node/lat-reader.d.ts +69 -0
- package/dist/node/lat-reader.d.ts.map +1 -0
- package/dist/node/log.d.ts +17 -0
- package/dist/node/log.d.ts.map +1 -0
- package/dist/node/ltm.d.ts +138 -0
- package/dist/node/ltm.d.ts.map +1 -0
- package/dist/node/markdown.d.ts +37 -0
- package/dist/node/markdown.d.ts.map +1 -0
- package/dist/node/prompt.d.ts +47 -0
- package/dist/node/prompt.d.ts.map +1 -0
- package/dist/node/recall.d.ts +41 -0
- package/dist/node/recall.d.ts.map +1 -0
- package/dist/node/search.d.ts +113 -0
- package/dist/node/search.d.ts.map +1 -0
- package/dist/node/temporal.d.ts +66 -0
- package/dist/node/temporal.d.ts.map +1 -0
- package/dist/node/types.d.ts +180 -0
- package/dist/node/types.d.ts.map +1 -0
- package/dist/node/worker.d.ts +6 -0
- package/dist/node/worker.d.ts.map +1 -0
- package/dist/types/agents-file.d.ts +59 -0
- package/dist/types/agents-file.d.ts.map +1 -0
- package/dist/types/config.d.ts +58 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/curator.d.ts +35 -0
- package/dist/types/curator.d.ts.map +1 -0
- package/dist/types/db/driver.bun.d.ts +5 -0
- package/dist/types/db/driver.bun.d.ts.map +1 -0
- package/dist/types/db/driver.node.d.ts +15 -0
- package/dist/types/db/driver.node.d.ts.map +1 -0
- package/dist/types/db.d.ts +22 -0
- package/dist/types/db.d.ts.map +1 -0
- package/dist/types/distillation.d.ts +32 -0
- package/dist/types/distillation.d.ts.map +1 -0
- package/dist/types/embedding.d.ts +90 -0
- package/dist/types/embedding.d.ts.map +1 -0
- package/dist/types/gradient.d.ts +73 -0
- package/dist/types/gradient.d.ts.map +1 -0
- package/dist/types/index.d.ts +19 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/lat-reader.d.ts +69 -0
- package/dist/types/lat-reader.d.ts.map +1 -0
- package/dist/types/log.d.ts +17 -0
- package/dist/types/log.d.ts.map +1 -0
- package/dist/types/ltm.d.ts +138 -0
- package/dist/types/ltm.d.ts.map +1 -0
- package/dist/types/markdown.d.ts +37 -0
- package/dist/types/markdown.d.ts.map +1 -0
- package/dist/types/prompt.d.ts +47 -0
- package/dist/types/prompt.d.ts.map +1 -0
- package/dist/types/recall.d.ts +41 -0
- package/dist/types/recall.d.ts.map +1 -0
- package/dist/types/search.d.ts +113 -0
- package/dist/types/search.d.ts.map +1 -0
- package/dist/types/temporal.d.ts +66 -0
- package/dist/types/temporal.d.ts.map +1 -0
- package/dist/types/types.d.ts +180 -0
- package/dist/types/types.d.ts.map +1 -0
- package/dist/types/worker.d.ts +6 -0
- package/dist/types/worker.d.ts.map +1 -0
- package/package.json +48 -5
- package/src/agents-file.ts +406 -0
- package/src/config.ts +132 -0
- package/src/curator.ts +220 -0
- package/src/db/driver.bun.ts +18 -0
- package/src/db/driver.node.ts +54 -0
- package/src/db.ts +433 -0
- package/src/distillation.ts +433 -0
- package/src/embedding.ts +528 -0
- package/src/gradient.ts +1387 -0
- package/src/index.ts +109 -0
- package/src/lat-reader.ts +374 -0
- package/src/log.ts +27 -0
- package/src/ltm.ts +861 -0
- package/src/markdown.ts +129 -0
- package/src/prompt.ts +454 -0
- package/src/recall.ts +446 -0
- package/src/search.ts +330 -0
- package/src/temporal.ts +379 -0
- package/src/types.ts +199 -0
- package/src/worker.ts +26 -0
package/src/embedding.ts
ADDED
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding integration for vector search.
|
|
3
|
+
*
|
|
4
|
+
* Supports multiple embedding providers (Voyage AI, OpenAI) behind a common
|
|
5
|
+
* interface. Provides embedding generation, pure-JS cosine similarity, and
|
|
6
|
+
* vector search over the knowledge and distillation tables. All operations
|
|
7
|
+
* are gated behind `search.embeddings.enabled` config + the provider's API
|
|
8
|
+
* key env var — falls back silently to FTS-only when unavailable.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { db } from "./db";
|
|
12
|
+
import { config } from "./config";
|
|
13
|
+
import * as log from "./log";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Provider interface
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
/**
 * Contract shared by every embedding backend in this module.
 */
export interface EmbeddingProvider {
  /** Batch size the backfill loops use when slicing rows for this provider. */
  readonly maxBatchSize: number;

  /**
   * Turn `texts` into embedding vectors.
   *
   * @param texts     Texts to embed.
   * @param inputType "document" for text being stored, "query" for search text.
   * @returns A promise resolving to the embedding vectors.
   */
  embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]>;
}
|
|
23
|
+
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Voyage AI provider
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
const VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings";

/** Fields of the Voyage embeddings response body that this module reads. */
type VoyageResponse = {
  data: Array<{ embedding: number[]; index: number }>;
  model: string;
  usage: { total_tokens: number };
};

/**
 * Embedding provider that POSTs to the Voyage AI embeddings endpoint.
 */
class VoyageProvider implements EmbeddingProvider {
  readonly maxBatchSize = 128;
  private apiKey: string;
  private model: string;
  private dimensions: number;

  /**
   * @param apiKey     Bearer token sent in the Authorization header.
   * @param model      Model name sent as `model`.
   * @param dimensions Vector size sent as `output_dimension`.
   */
  constructor(apiKey: string, model: string, dimensions: number) {
    this.apiKey = apiKey;
    this.model = model;
    this.dimensions = dimensions;
  }

  /**
   * Embed `texts` with a single API request.
   *
   * @param texts     Texts to embed; an empty array resolves to `[]` without a request.
   * @param inputType Forwarded to the API as `input_type`.
   * @returns One Float32Array per input, ordered by the response's `index` field.
   * @throws Error on a non-2xx response, or when the response does not contain
   *         exactly one embedding per input text.
   */
  async embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]> {
    // Nothing to embed — avoid a pointless (and possibly rejected) request.
    if (texts.length === 0) return [];

    const res = await fetch(VOYAGE_API_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        input: texts,
        model: this.model,
        input_type: inputType,
        output_dimension: this.dimensions,
      }),
    });

    if (!res.ok) {
      const body = await res.text().catch(() => "");
      throw new Error(`Voyage API ${res.status}: ${body}`);
    }

    const json = (await res.json()) as VoyageResponse;
    const received = Array.isArray(json?.data) ? json.data.length : 0;
    // Callers pair vectors with inputs by position, so a short or long
    // response must fail loudly instead of misaligning rows.
    if (received !== texts.length) {
      throw new Error(
        `Voyage API returned ${received} embeddings for ${texts.length} inputs`,
      );
    }

    // Sort a copy by `index` so output order matches input order.
    const sorted = [...json.data].sort((a, b) => a.index - b.index);
    return sorted.map((d) => new Float32Array(d.embedding));
  }
}
|
|
73
|
+
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
// OpenAI provider
|
|
76
|
+
// ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
const OPENAI_API_URL = "https://api.openai.com/v1/embeddings";

/** Fields of the OpenAI embeddings response body that this module reads. */
type OpenAIResponse = {
  data: Array<{ embedding: number[]; index: number }>;
  model: string;
  usage: { prompt_tokens: number; total_tokens: number };
};

/**
 * Embedding provider that POSTs to the OpenAI embeddings endpoint.
 */
class OpenAIProvider implements EmbeddingProvider {
  readonly maxBatchSize = 2048;
  private apiKey: string;
  private model: string;
  private dimensions: number;

  /**
   * @param apiKey     Bearer token sent in the Authorization header.
   * @param model      Model name sent as `model`.
   * @param dimensions Vector size; only sent for `text-embedding-3*` models.
   */
  constructor(apiKey: string, model: string, dimensions: number) {
    this.apiKey = apiKey;
    this.model = model;
    this.dimensions = dimensions;
  }

  /**
   * Embed `texts` with a single API request.
   *
   * @param texts      Texts to embed; an empty array resolves to `[]` without a request.
   * @param _inputType Accepted for interface compatibility; not sent to the API.
   * @returns One Float32Array per input, ordered by the response's `index` field.
   * @throws Error on a non-2xx response, or when the response does not contain
   *         exactly one embedding per input text.
   */
  async embed(texts: string[], _inputType: "document" | "query"): Promise<Float32Array[]> {
    // Nothing to embed — avoid a pointless (and possibly rejected) request.
    if (texts.length === 0) return [];

    const body: Record<string, unknown> = {
      input: texts,
      model: this.model,
    };
    // OpenAI supports dimensions parameter for text-embedding-3-* models
    if (this.model.startsWith("text-embedding-3")) {
      body.dimensions = this.dimensions;
    }

    const res = await fetch(OPENAI_API_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify(body),
    });

    if (!res.ok) {
      const responseBody = await res.text().catch(() => "");
      throw new Error(`OpenAI API ${res.status}: ${responseBody}`);
    }

    const json = (await res.json()) as OpenAIResponse;
    const received = Array.isArray(json?.data) ? json.data.length : 0;
    // Callers pair vectors with inputs by position, so a short or long
    // response must fail loudly instead of misaligning rows.
    if (received !== texts.length) {
      throw new Error(
        `OpenAI API returned ${received} embeddings for ${texts.length} inputs`,
      );
    }

    // Sort a copy by `index` so output order matches input order.
    const sorted = [...json.data].sort((a, b) => a.index - b.index);
    return sorted.map((d) => new Float32Array(d.embedding));
  }
}
|
|
127
|
+
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
// Provider resolution
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
/** Model name and vector size associated with one provider. */
type ProviderDefaults = { model: string; dimensions: number };

/** Default model and dimensions for each known provider, keyed by provider name. */
const PROVIDER_DEFAULTS: Record<string, ProviderDefaults> = {
  voyage: {
    model: "voyage-code-3",
    dimensions: 1024,
  },
  openai: {
    model: "text-embedding-3-small",
    dimensions: 1536,
  },
};
|
|
137
|
+
|
|
138
|
+
/** Maps a provider name to the environment variable that holds its API key. */
const PROVIDER_ENV_KEYS: Record<string, string> = {
  voyage: "VOYAGE_API_KEY",
  openai: "OPENAI_API_KEY",
};

/**
 * Read the API key for `provider` from the process environment.
 *
 * @param provider Provider name used as the lookup key in PROVIDER_ENV_KEYS.
 * @returns The env var's value, or undefined when the provider has no
 *          registered env var or the variable is not set.
 */
function getProviderApiKey(provider: string): string | undefined {
  const variableName = PROVIDER_ENV_KEYS[provider];
  if (!variableName) {
    return undefined;
  }
  return process.env[variableName];
}
|
|
148
|
+
|
|
149
|
+
/**
 * Memoized provider: `undefined` = not resolved yet, `null` = resolved to
 * "no provider", otherwise the live provider instance.
 */
let cachedProvider: EmbeddingProvider | null | undefined;

/**
 * Resolve (and cache) the embedding provider described by
 * `config().search.embeddings`.
 *
 * Resolution order:
 *  1. `enabled === false`            → null
 *  2. provider name not recognised   → null, logged
 *  3. provider's API key env var unset → null (silent)
 *  4. otherwise construct the provider with the configured model/dimensions,
 *     falling back to the provider's defaults when the config leaves them unset.
 *
 * @returns The provider instance, or null when embeddings are unavailable.
 */
function getProvider(): EmbeddingProvider | null {
  if (cachedProvider !== undefined) return cachedProvider;

  const cfg = config().search.embeddings;
  if (cfg.enabled === false) {
    cachedProvider = null;
    return null;
  }

  const providerName = cfg.provider;

  // Validate the provider name BEFORE the API-key check: an unknown provider
  // has no env key either, so checking the key first would hide this message.
  const defaults = Object.prototype.hasOwnProperty.call(PROVIDER_DEFAULTS, providerName)
    ? PROVIDER_DEFAULTS[providerName]
    : undefined;
  if (!defaults) {
    log.info(`unknown embedding provider: ${providerName}`);
    cachedProvider = null;
    return null;
  }

  const apiKey = getProviderApiKey(providerName);
  if (!apiKey) {
    cachedProvider = null;
    return null;
  }

  // Configured values win; provider defaults only fill in unset values.
  const model = cfg.model ?? defaults.model;
  const dimensions = cfg.dimensions ?? defaults.dimensions;

  switch (providerName) {
    case "voyage":
      cachedProvider = new VoyageProvider(apiKey, model, dimensions);
      break;
    case "openai":
      cachedProvider = new OpenAIProvider(apiKey, model, dimensions);
      break;
    default:
      // Listed in PROVIDER_DEFAULTS but has no implementation here.
      log.info(`unknown embedding provider: ${providerName}`);
      cachedProvider = null;
  }

  return cachedProvider;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Forget the memoized provider so the next lookup resolves it again.
 * Intended to be called when the configuration changes.
 */
export function resetProvider(): void {
  // `undefined` (not `null`) marks the cache as "not resolved yet".
  cachedProvider = undefined;
}
|
|
190
|
+
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Availability
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
/**
 * Report whether an embedding provider can be resolved.
 *
 * True when the configured provider's API key is set and embeddings are not
 * explicitly disabled via `search.embeddings.enabled: false` in .lore.json.
 */
export function isAvailable(): boolean {
  const provider = getProvider();
  return provider !== null;
}
|
|
201
|
+
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
// Public embed API
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
/**
 * Generate embeddings for the given texts using the configured provider.
 *
 * Inputs larger than the provider's `maxBatchSize` are split into consecutive
 * chunks that are embedded one request at a time; results are concatenated in
 * input order.
 *
 * @param texts Array of texts to embed
 * @param inputType "document" for storage, "query" for search
 * @returns Float32Array per input text
 * @throws On API errors or missing provider
 */
export async function embed(
  texts: string[],
  inputType: "document" | "query",
): Promise<Float32Array[]> {
  const provider = getProvider();
  if (!provider) throw new Error("No embedding provider available");

  const batchSize = provider.maxBatchSize;
  // Common case (and guard against a non-positive batch size): one request.
  if (!(batchSize > 0) || texts.length <= batchSize) {
    return provider.embed(texts, inputType);
  }

  const vectors: Float32Array[] = [];
  for (let start = 0; start < texts.length; start += batchSize) {
    const chunk = texts.slice(start, start + batchSize);
    vectors.push(...(await provider.embed(chunk, inputType)));
  }
  return vectors;
}
|
|
222
|
+
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
// Cosine similarity (pure JS)
|
|
225
|
+
// ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
/**
 * Cosine of the angle between two vectors.
 *
 * Only the first `min(a.length, b.length)` components are compared.
 *
 * @returns A value in [-1, 1] (1 = same direction), or 0 when either compared
 *          prefix has zero magnitude (including empty vectors).
 */
export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  const count = a.length < b.length ? a.length : b.length;

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  let idx = 0;
  while (idx < count) {
    const x = a[idx];
    const y = b[idx];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    idx += 1;
  }

  const magnitude = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  return magnitude === 0 ? 0 : dotProduct / magnitude;
}
|
|
246
|
+
|
|
247
|
+
// ---------------------------------------------------------------------------
|
|
248
|
+
// BLOB conversion
|
|
249
|
+
// ---------------------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
/**
 * Wrap a Float32Array's bytes in a Buffer for SQLite BLOB storage.
 * Only the bytes covered by `arr` are included, even if it is a view into a
 * larger ArrayBuffer.
 */
export function toBlob(arr: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = arr;
  return Buffer.from(buffer, byteOffset, byteLength);
}
|
|
255
|
+
|
|
256
|
+
/**
 * Decode a SQLite BLOB (Buffer/Uint8Array) into a Float32Array.
 * The bytes are copied into a fresh Uint8Array first, then reinterpreted as
 * 32-bit floats.
 */
export function fromBlob(blob: Buffer | Uint8Array): Float32Array {
  const copied = new Uint8Array(blob);
  const floatCount = copied.byteLength / Float32Array.BYTES_PER_ELEMENT;
  return new Float32Array(copied.buffer, copied.byteOffset, floatCount);
}
|
|
261
|
+
|
|
262
|
+
// ---------------------------------------------------------------------------
|
|
263
|
+
// Vector search — knowledge
|
|
264
|
+
// ---------------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
/** A row id paired with its cosine similarity to the query vector. */
type VectorHit = { id: string; similarity: number };

/**
 * Rank knowledge entries against `queryEmbedding` by cosine similarity.
 *
 * Loads every knowledge row that has an embedding and confidence above 0.2,
 * scores each one in JS, and returns the best matches.
 *
 * @param queryEmbedding Vector to compare against stored embeddings.
 * @param limit          Maximum number of hits to return (default 10).
 * @returns Hits sorted by similarity, highest first.
 */
export function vectorSearch(
  queryEmbedding: Float32Array,
  limit = 10,
): VectorHit[] {
  const candidates = db()
    .query("SELECT id, embedding FROM knowledge WHERE embedding IS NOT NULL AND confidence > 0.2")
    .all() as Array<{ id: string; embedding: Buffer }>;

  const hits = candidates.map(
    ({ id, embedding }): VectorHit => ({
      id,
      similarity: cosineSimilarity(queryEmbedding, fromBlob(embedding)),
    }),
  );

  hits.sort((left, right) => right.similarity - left.similarity);
  return hits.slice(0, limit);
}
|
|
291
|
+
|
|
292
|
+
// ---------------------------------------------------------------------------
|
|
293
|
+
// Vector search — distillations
|
|
294
|
+
// ---------------------------------------------------------------------------
|
|
295
|
+
|
|
296
|
+
/**
 * Rank non-archived distillations against `queryEmbedding` by cosine similarity.
 *
 * Loads every distillation row with an embedding and `archived = 0`, scores
 * each one in JS, and returns the best matches.
 *
 * @param queryEmbedding Vector to compare against stored embeddings.
 * @param limit          Maximum number of hits to return (default 10).
 * @returns Hits sorted by similarity, highest first.
 */
export function vectorSearchDistillations(
  queryEmbedding: Float32Array,
  limit = 10,
): VectorHit[] {
  const candidates = db()
    .query(
      "SELECT id, embedding FROM distillations WHERE embedding IS NOT NULL AND archived = 0",
    )
    .all() as Array<{ id: string; embedding: Buffer }>;

  const hits = candidates.map(
    ({ id, embedding }): VectorHit => ({
      id,
      similarity: cosineSimilarity(queryEmbedding, fromBlob(embedding)),
    }),
  );

  hits.sort((left, right) => right.similarity - left.similarity);
  return hits.slice(0, limit);
}
|
|
321
|
+
|
|
322
|
+
// ---------------------------------------------------------------------------
|
|
323
|
+
// Fire-and-forget embedding
|
|
324
|
+
// ---------------------------------------------------------------------------
|
|
325
|
+
|
|
326
|
+
/**
 * Embed one knowledge entry in the background and persist the vector.
 *
 * The text sent for embedding is the title and content joined by a newline.
 * Returns immediately; any failure (embedding or DB write) is logged and
 * swallowed, leaving the row's embedding untouched.
 *
 * @param id      Knowledge row id to update.
 * @param title   Entry title.
 * @param content Entry body.
 */
export function embedKnowledgeEntry(
  id: string,
  title: string,
  content: string,
): void {
  const text = `${title}\n${content}`;

  void (async () => {
    try {
      const [vec] = await embed([text], "document");
      db()
        .query("UPDATE knowledge SET embedding = ? WHERE id = ?")
        .run(toBlob(vec), id);
    } catch (err) {
      log.info("embedding failed for knowledge entry", id, ":", err);
    }
  })();
}
|
|
347
|
+
|
|
348
|
+
/**
 * Embed one distillation in the background and persist the vector.
 *
 * Returns immediately; any failure (embedding or DB write) is logged and
 * swallowed, leaving the row's embedding untouched.
 *
 * @param id           Distillation row id to update.
 * @param observations Text to embed.
 */
export function embedDistillation(
  id: string,
  observations: string,
): void {
  void (async () => {
    try {
      const [vec] = await embed([observations], "document");
      db()
        .query("UPDATE distillations SET embedding = ? WHERE id = ?")
        .run(toBlob(vec), id);
    } catch (err) {
      log.info("embedding failed for distillation", id, ":", err);
    }
  })();
}
|
|
367
|
+
|
|
368
|
+
// ---------------------------------------------------------------------------
|
|
369
|
+
// Config change detection
|
|
370
|
+
// ---------------------------------------------------------------------------
|
|
371
|
+
|
|
372
|
+
/**
 * Serialize the embedding settings that determine vector compatibility
 * (provider, model, dimensions) into a single `provider:model:dimensions`
 * string, so a change in any of them can be detected by string comparison.
 */
function configFingerprint(): string {
  const { provider, model, dimensions } = config().search.embeddings;
  return `${provider}:${model}:${dimensions}`;
}
|
|
381
|
+
|
|
382
|
+
/** kv_meta key under which the last-seen embedding config fingerprint is stored. */
const EMBEDDING_CONFIG_KEY = "lore:embedding_config";

/**
 * Compare the current embedding config fingerprint with the one stored in
 * kv_meta and reconcile the database with it.
 *
 * - Fingerprint unchanged: nothing is touched, returns false.
 * - Fingerprint differs from a previously stored one: every embedding in the
 *   knowledge and distillations tables is set to NULL (if any exist) and the
 *   change is logged.
 * - No fingerprint stored yet: existing embeddings are left alone.
 *
 * In the last two cases the current fingerprint is written to kv_meta.
 *
 * @returns false when the stored fingerprint already matches; true otherwise
 *          (including the first run, where nothing is cleared).
 */
export function checkConfigChange(): boolean {
  const previous = db()
    .query("SELECT value FROM kv_meta WHERE key = ?")
    .get(EMBEDDING_CONFIG_KEY) as { value: string } | null;

  const current = configFingerprint();

  const unchanged = Boolean(previous) && previous!.value === current;
  if (unchanged) {
    return false;
  }

  // A different fingerprint was recorded earlier: stored vectors were produced
  // with another provider/model/dimension and can no longer be compared.
  if (previous) {
    const knowledgeRow = db()
      .query("SELECT COUNT(*) as n FROM knowledge WHERE embedding IS NOT NULL")
      .get() as { n: number };
    const distillationRow = db()
      .query("SELECT COUNT(*) as n FROM distillations WHERE embedding IS NOT NULL")
      .get() as { n: number };

    const staleCount = knowledgeRow.n + distillationRow.n;
    if (staleCount > 0) {
      db().query("UPDATE knowledge SET embedding = NULL").run();
      db().query("UPDATE distillations SET embedding = NULL").run();
      log.info(
        `embedding config changed (${previous.value} → ${current}), cleared ${staleCount} stale embeddings`,
      );
    }
  }

  // Upsert the fingerprint so the next call sees the config as unchanged.
  db()
    .query(
      "INSERT INTO kv_meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = ?",
    )
    .run(EMBEDDING_CONFIG_KEY, current, current);

  return true;
}
|
|
428
|
+
|
|
429
|
+
// ---------------------------------------------------------------------------
|
|
430
|
+
// Backfill — knowledge
|
|
431
|
+
// ---------------------------------------------------------------------------
|
|
432
|
+
|
|
433
|
+
/**
 * Embed all knowledge entries that are missing embeddings.
 *
 * Runs the config-change check first, so embeddings produced under a
 * different provider/model/dimensions are cleared and then regenerated here.
 * Rows are processed in batches of the provider's `maxBatchSize`; a failing
 * batch is logged and skipped, and later batches still run.
 *
 * @returns The number of entries embedded (0 when no provider is available).
 */
export async function backfillEmbeddings(): Promise<number> {
  // Detect config changes and clear stale embeddings
  checkConfigChange();

  const provider = getProvider();
  if (!provider) return 0;

  const rows = db()
    .query("SELECT id, title, content FROM knowledge WHERE embedding IS NULL AND confidence > 0.2")
    .all() as Array<{ id: string; title: string; content: string }>;

  if (!rows.length) return 0;

  const batchSize = provider.maxBatchSize;
  let embedded = 0;

  for (let i = 0; i < rows.length; i += batchSize) {
    const batch = rows.slice(i, i + batchSize);
    const texts = batch.map((r) => `${r.title}\n${r.content}`);

    try {
      const vectors = await embed(texts, "document");
      // Vectors are matched to rows by position; refuse to write anything if
      // the counts disagree rather than storing a partial, misaligned batch.
      if (vectors.length !== batch.length) {
        throw new Error(
          `expected ${batch.length} embeddings, received ${vectors.length}`,
        );
      }

      const update = db().prepare(
        "UPDATE knowledge SET embedding = ? WHERE id = ?",
      );

      for (let j = 0; j < batch.length; j++) {
        update.run(toBlob(vectors[j]), batch[j].id);
        embedded++;
      }
    } catch (err) {
      log.info(`embedding backfill batch ${i}-${i + batch.length} failed:`, err);
    }
  }

  if (embedded > 0) {
    log.info(`embedded ${embedded} knowledge entries`);
  }
  return embedded;
}
|
|
480
|
+
|
|
481
|
+
// ---------------------------------------------------------------------------
|
|
482
|
+
// Backfill — distillations
|
|
483
|
+
// ---------------------------------------------------------------------------
|
|
484
|
+
|
|
485
|
+
/**
 * Embed all non-archived distillations that are missing embeddings.
 *
 * Runs the config-change check first (a no-op when the fingerprint is already
 * current), so the result does not depend on whether the knowledge backfill
 * ran before this function. Rows with empty `observations` are skipped.
 * Rows are processed in batches of the provider's `maxBatchSize`; a failing
 * batch is logged and skipped, and later batches still run.
 *
 * @returns The number of distillations embedded (0 when no provider is available).
 */
export async function backfillDistillationEmbeddings(): Promise<number> {
  // Without this, calling this function before backfillEmbeddings() would
  // write fresh vectors that the later config-change check then wipes.
  checkConfigChange();

  const provider = getProvider();
  if (!provider) return 0;

  const rows = db()
    .query(
      "SELECT id, observations FROM distillations WHERE embedding IS NULL AND archived = 0 AND observations != ''",
    )
    .all() as Array<{ id: string; observations: string }>;

  if (!rows.length) return 0;

  const batchSize = provider.maxBatchSize;
  let embedded = 0;

  for (let i = 0; i < rows.length; i += batchSize) {
    const batch = rows.slice(i, i + batchSize);
    const texts = batch.map((r) => r.observations);

    try {
      const vectors = await embed(texts, "document");
      // Vectors are matched to rows by position; refuse to write anything if
      // the counts disagree rather than storing a partial, misaligned batch.
      if (vectors.length !== batch.length) {
        throw new Error(
          `expected ${batch.length} embeddings, received ${vectors.length}`,
        );
      }

      const update = db().prepare(
        "UPDATE distillations SET embedding = ? WHERE id = ?",
      );

      for (let j = 0; j < batch.length; j++) {
        update.run(toBlob(vectors[j]), batch[j].id);
        embedded++;
      }
    } catch (err) {
      log.info(`distillation embedding backfill batch ${i}-${i + batch.length} failed:`, err);
    }
  }

  if (embedded > 0) {
    log.info(`embedded ${embedded} distillations`);
  }
  return embedded;
}
|