audrey 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,242 +1,265 @@
1
- import { generateId } from './ulid.js';
2
- import { buildPrincipleExtractionPrompt } from './prompts.js';
3
-
4
- function clusterViaKNN(db, episodes, similarityThreshold, minClusterSize) {
5
- const n = episodes.length;
6
- const k = Math.min(50, n);
7
- const idToIndex = new Map(episodes.map((ep, i) => [ep.id, i]));
8
-
9
- const parent = new Array(n);
10
- for (let i = 0; i < n; i++) parent[i] = i;
11
-
12
- function find(x) {
13
- while (parent[x] !== x) {
14
- parent[x] = parent[parent[x]];
15
- x = parent[x];
16
- }
17
- return x;
18
- }
19
-
20
- function union(a, b) {
21
- const ra = find(a);
22
- const rb = find(b);
23
- if (ra !== rb) parent[ra] = rb;
24
- }
25
-
26
- const getEmbedding = db.prepare('SELECT embedding FROM vec_episodes WHERE id = ?');
27
- const knnQuery = db.prepare(`
28
- SELECT id, distance
29
- FROM vec_episodes
30
- WHERE embedding MATCH ? AND k = ? AND consolidated = 0
31
- `);
32
-
33
- for (let i = 0; i < n; i++) {
34
- const ep = episodes[i];
35
- const vecRow = getEmbedding.get(ep.id);
36
- if (!vecRow) continue;
37
-
38
- const neighbors = knnQuery.all(vecRow.embedding, k);
39
- for (const neighbor of neighbors) {
40
- if (neighbor.id === ep.id) continue;
41
- const j = idToIndex.get(neighbor.id);
42
- if (j === undefined) continue;
43
- const similarity = 1.0 - neighbor.distance;
44
- if (similarity >= similarityThreshold) {
45
- union(i, j);
46
- }
47
- }
48
- }
49
-
50
- const groups = new Map();
51
- for (let i = 0; i < n; i++) {
52
- const root = find(i);
53
- if (!groups.has(root)) groups.set(root, []);
54
- groups.get(root).push(episodes[i]);
55
- }
56
-
57
- const clusters = [];
58
- for (const group of groups.values()) {
59
- if (group.length >= minClusterSize) {
60
- clusters.push(group);
61
- }
62
- }
63
- return clusters;
64
- }
65
-
66
- /**
67
- * @param {import('better-sqlite3').Database} db
68
- * @param {import('./embedding.js').EmbeddingProvider} embeddingProvider
69
- * @param {{ similarityThreshold?: number, minClusterSize?: number }} [options]
70
- * @returns {Array<Array<Object>>}
71
- */
72
- export function clusterEpisodes(db, embeddingProvider, options = {}) {
73
- const {
74
- similarityThreshold = 0.85,
75
- minClusterSize = 3,
76
- } = options;
77
-
78
- const episodes = db.prepare(
79
- 'SELECT * FROM episodes WHERE consolidated = 0 AND superseded_by IS NULL AND embedding IS NOT NULL'
80
- ).all();
81
-
82
- if (episodes.length === 0) return [];
83
-
84
- return clusterViaKNN(db, episodes, similarityThreshold, minClusterSize);
85
- }
86
-
87
- function defaultExtractPrinciple(episodes) {
88
- const uniqueContents = [...new Set(episodes.map(e => e.content))];
89
- return {
90
- content: `Recurring pattern: ${uniqueContents.join('; ')}`,
91
- type: 'semantic',
92
- };
93
- }
94
-
95
- async function llmExtractPrinciple(llmProvider, episodes) {
96
- const messages = buildPrincipleExtractionPrompt(episodes);
97
- return llmProvider.json(messages);
98
- }
99
-
100
- /**
101
- * @param {import('better-sqlite3').Database} db
102
- * @param {import('./embedding.js').EmbeddingProvider} embeddingProvider
103
- * @param {{ similarityThreshold?: number, minClusterSize?: number, extractPrinciple?: function, llmProvider?: Object }} [options]
104
- * @returns {Promise<{ runId: string, episodesEvaluated: number, clustersFound: number, principlesExtracted: number }>}
105
- */
106
- export async function runConsolidation(db, embeddingProvider, options = {}) {
107
- const {
108
- similarityThreshold = 0.85,
109
- minClusterSize = 3,
110
- extractPrinciple,
111
- llmProvider,
112
- } = options;
113
-
114
- const runId = generateId();
115
- const now = new Date().toISOString();
116
-
117
- db.prepare(`
118
- INSERT INTO consolidation_runs (id, started_at, status, input_episode_ids, output_memory_ids, consolidation_model)
119
- VALUES (?, ?, 'running', '[]', '[]', ?)
120
- `).run(runId, now, llmProvider?.modelName || null);
121
-
122
- try {
123
- const clusters = clusterEpisodes(db, embeddingProvider, { similarityThreshold, minClusterSize });
124
-
125
- const episodesEvaluated = db.prepare(
126
- 'SELECT COUNT(*) as count FROM episodes WHERE consolidated = 0 AND superseded_by IS NULL AND embedding IS NOT NULL'
127
- ).get().count;
128
-
129
- const clusterData = [];
130
- for (const cluster of clusters) {
131
- let principle;
132
- if (extractPrinciple) {
133
- principle = extractPrinciple(cluster);
134
- } else if (llmProvider) {
135
- principle = await llmExtractPrinciple(llmProvider, cluster);
136
- } else {
137
- principle = defaultExtractPrinciple(cluster);
138
- }
139
-
140
- if (!principle || !principle.content) continue;
141
-
142
- const clusterIds = cluster.map(ep => ep.id);
143
- const sourceTypes = new Set(cluster.map(ep => ep.source));
144
- const vector = await embeddingProvider.embed(principle.content);
145
- const embeddingBuffer = embeddingProvider.vectorToBuffer(vector);
146
-
147
- clusterData.push({
148
- cluster,
149
- principle,
150
- clusterIds,
151
- sourceTypeDiversity: sourceTypes.size,
152
- embeddingBuffer,
153
- semanticId: generateId(),
154
- semanticNow: new Date().toISOString(),
155
- maxSalience: Math.max(...cluster.map(ep => ep.salience ?? 0.5)),
156
- });
157
- }
158
-
159
- const allInputIds = [];
160
- const allOutputIds = [];
161
- let principlesExtracted = 0;
162
-
163
- const promoteAll = db.transaction(() => {
164
- for (const entry of clusterData) {
165
- allInputIds.push(...entry.clusterIds);
166
-
167
- db.prepare(`
168
- INSERT INTO semantics (
169
- id, content, embedding, state, evidence_episode_ids,
170
- evidence_count, supporting_count, source_type_diversity,
171
- consolidation_checkpoint, embedding_model, embedding_version,
172
- consolidation_model, created_at, salience
173
- ) VALUES (?, ?, ?, 'active', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
174
- `).run(
175
- entry.semanticId,
176
- entry.principle.content,
177
- entry.embeddingBuffer,
178
- JSON.stringify(entry.clusterIds),
179
- entry.cluster.length,
180
- entry.cluster.length,
181
- entry.sourceTypeDiversity,
182
- runId,
183
- embeddingProvider.modelName,
184
- embeddingProvider.modelVersion,
185
- llmProvider?.modelName || null,
186
- entry.semanticNow,
187
- entry.maxSalience,
188
- );
189
-
190
- db.prepare('INSERT INTO vec_semantics(id, embedding, state) VALUES (?, ?, ?)').run(
191
- entry.semanticId, entry.embeddingBuffer, 'active'
192
- );
193
-
194
- allOutputIds.push(entry.semanticId);
195
- principlesExtracted++;
196
-
197
- const markStmt = db.prepare('UPDATE episodes SET consolidated = 1 WHERE id = ?');
198
- const markVecStmt = db.prepare('UPDATE vec_episodes SET consolidated = ? WHERE id = ?');
199
- for (const ep of entry.cluster) {
200
- markStmt.run(ep.id);
201
- markVecStmt.run(BigInt(1), ep.id);
202
- }
203
- }
204
-
205
- const completedAt = new Date().toISOString();
206
- db.prepare(`
207
- UPDATE consolidation_runs
208
- SET status = 'completed',
209
- completed_at = ?,
210
- input_episode_ids = ?,
211
- output_memory_ids = ?
212
- WHERE id = ?
213
- `).run(completedAt, JSON.stringify(allInputIds), JSON.stringify(allOutputIds), runId);
214
- });
215
-
216
- promoteAll();
217
-
218
- db.prepare(`
219
- INSERT INTO consolidation_metrics (id, run_id, min_cluster_size, similarity_threshold,
220
- episodes_evaluated, clusters_found, principles_extracted, created_at)
221
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
222
- `).run(
223
- generateId(), runId, minClusterSize, similarityThreshold,
224
- episodesEvaluated, clusters.length, principlesExtracted, new Date().toISOString(),
225
- );
226
-
227
- return {
228
- runId,
229
- episodesEvaluated,
230
- clustersFound: clusters.length,
231
- principlesExtracted,
232
- };
233
- } catch (err) {
234
- const failedAt = new Date().toISOString();
235
- db.prepare(`
236
- UPDATE consolidation_runs
237
- SET status = 'failed', completed_at = ?
238
- WHERE id = ?
239
- `).run(failedAt, runId);
240
- throw err;
241
- }
242
- }
1
+ import { generateId } from './ulid.js';
2
+ import { buildPrincipleExtractionPrompt } from './prompts.js';
3
+
4
/**
 * Groups episodes into clusters using a nearest-neighbour search plus union-find.
 *
 * For each episode the stored embedding is read from `vec_episodes`, its nearest
 * un-consolidated neighbours are queried, and every neighbour whose similarity
 * (computed as `1 - distance`) reaches `similarityThreshold` is merged into the
 * same set. Because merging is transitive, two episodes can share a cluster
 * without being direct neighbours of each other.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Array<Object>} episodes - Candidate episodes; each must expose an `id`.
 *   Neighbours whose id is not in this list are ignored.
 * @param {number} similarityThreshold - Minimum `1 - distance` for two episodes to be linked.
 * @param {number} minClusterSize - Sets with fewer members are dropped from the result.
 * @param {number} [maxNeighbors=50] - Upper bound on neighbours requested per episode.
 * @returns {Array<Array<Object>>} Clusters of episode objects, in input order within a cluster.
 * @throws {RangeError} If `maxNeighbors` is not a positive integer.
 */
function clusterViaKNN(db, episodes, similarityThreshold, minClusterSize, maxNeighbors = 50) {
  if (!Number.isInteger(maxNeighbors) || maxNeighbors < 1) {
    throw new RangeError('maxNeighbors must be a positive integer');
  }

  const n = episodes.length;
  // Never ask for more neighbours than there are candidates.
  const k = Math.min(maxNeighbors, n);
  const idToIndex = new Map(episodes.map((ep, i) => [ep.id, i]));

  // Union-find forest over episode indices; every index starts as its own root.
  const parent = Array.from({ length: n }, (_, i) => i);

  function find(x) {
    // Path halving: point each visited node at its grandparent while walking up.
    while (parent[x] !== x) {
      parent[x] = parent[parent[x]];
      x = parent[x];
    }
    return x;
  }

  function union(a, b) {
    const ra = find(a);
    const rb = find(b);
    if (ra !== rb) parent[ra] = rb;
  }

  const getEmbedding = db.prepare('SELECT embedding FROM vec_episodes WHERE id = ?');
  const knnQuery = db.prepare(`
    SELECT id, distance
    FROM vec_episodes
    WHERE embedding MATCH ? AND k = ? AND consolidated = 0
  `);

  for (let i = 0; i < n; i++) {
    const ep = episodes[i];
    const vecRow = getEmbedding.get(ep.id);
    // An episode without a vec row cannot be searched; it can still be linked
    // when it shows up as somebody else's neighbour.
    if (!vecRow) continue;

    const neighbors = knnQuery.all(vecRow.embedding, k);
    for (const neighbor of neighbors) {
      if (neighbor.id === ep.id) continue;
      const j = idToIndex.get(neighbor.id);
      if (j === undefined) continue;
      // NOTE(review): treats `distance` as a value where 1 - distance is a
      // similarity (e.g. cosine distance) — confirm against the vec table setup.
      const similarity = 1.0 - neighbor.distance;
      if (similarity >= similarityThreshold) {
        union(i, j);
      }
    }
  }

  const groups = new Map();
  for (let i = 0; i < n; i++) {
    const root = find(i);
    if (!groups.has(root)) groups.set(root, []);
    groups.get(root).push(episodes[i]);
  }

  return [...groups.values()].filter((group) => group.length >= minClusterSize);
}
65
+
66
/**
 * Loads every episode that is still eligible for consolidation (not yet
 * consolidated, not superseded, and carrying an embedding) and clusters them.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {import('./embedding.js').EmbeddingProvider} embeddingProvider - Accepted for
 *   signature compatibility; this function does not read it.
 * @param {{ similarityThreshold?: number, minClusterSize?: number }} [options]
 * @returns {Array<Array<Object>>} One array of episode rows per cluster; empty when
 *   there are no eligible episodes.
 */
export function clusterEpisodes(db, embeddingProvider, options = {}) {
  const { similarityThreshold: threshold = 0.85, minClusterSize: minSize = 3 } = options;

  const selectEligible = db.prepare(
    'SELECT * FROM episodes WHERE consolidated = 0 AND superseded_by IS NULL AND embedding IS NOT NULL'
  );
  const candidates = selectEligible.all();

  // Nothing to cluster: skip the neighbour search entirely.
  return candidates.length > 0
    ? clusterViaKNN(db, candidates, threshold, minSize)
    : [];
}
86
+
87
/**
 * Builds a principle without any model: the distinct episode contents are
 * joined with '; ' behind a fixed "Recurring pattern: " prefix.
 *
 * @param {Array<{ content: string }>} episodes
 * @returns {{ content: string, type: string }} Always typed 'semantic'.
 */
function defaultExtractPrinciple(episodes) {
  // A Set keeps first-seen order while dropping repeated contents.
  const distinct = new Set(episodes.map(({ content }) => content));
  const summary = Array.from(distinct).join('; ');
  return {
    content: `Recurring pattern: ${summary}`,
    type: 'semantic',
  };
}
94
+
95
/**
 * Asks the LLM provider to derive a principle for a cluster of episodes.
 *
 * @param {Object} llmProvider - Must expose a `json(messages)` method.
 * @param {Array<Object>} episodes - Passed unchanged to the prompt builder.
 * @returns {Promise<*>} Whatever `llmProvider.json` returns for the built prompt.
 */
async function llmExtractPrinciple(llmProvider, episodes) {
  return llmProvider.json(buildPrincipleExtractionPrompt(episodes));
}
99
+
100
/**
 * Runs one consolidation pass: clusters eligible episodes, derives one principle
 * per cluster, and stores each principle as a semantic or procedural memory.
 *
 * The pass is split in two phases:
 *   1. Asynchronous: principle extraction and embedding for every cluster. No
 *      transaction is open here, so slow model calls never hold the write lock
 *      and unrelated statements cannot slip into the transaction between awaits.
 *   2. Synchronous: a single `BEGIN IMMEDIATE` transaction that inserts the
 *      memories, marks the source episodes as consolidated, completes the run
 *      row and records metrics. Any failure rolls the whole batch back.
 *
 * Principle source, in priority order: `options.extractPrinciple` (sync or
 * async), then `options.llmProvider`, then the built-in default extractor.
 * Clusters whose principle is missing or has no `content` are skipped.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {import('./embedding.js').EmbeddingProvider} embeddingProvider
 * @param {{ similarityThreshold?: number, minClusterSize?: number, extractPrinciple?: function, llmProvider?: Object }} [options]
 * @returns {Promise<{ runId: string, episodesEvaluated: number, clustersFound: number, principlesExtracted: number, semanticsCreated: number, proceduresCreated: number }>}
 * @throws Rethrows any error raised after the run row was created, once that row
 *   has been marked 'failed'.
 */
export async function runConsolidation(db, embeddingProvider, options = {}) {
  const {
    similarityThreshold = 0.85,
    minClusterSize = 3,
    extractPrinciple,
    llmProvider,
  } = options;

  const runId = generateId();
  const now = new Date().toISOString();
  const consolidationModel = llmProvider?.modelName || null;

  db.prepare(`
    INSERT INTO consolidation_runs (
      id, started_at, status, input_episode_ids, output_memory_ids, consolidation_model, checkpoint_cursor
    )
    VALUES (?, ?, 'running', '[]', '[]', ?, ?)
  `).run(runId, now, consolidationModel, now);

  try {
    const clusters = clusterEpisodes(db, embeddingProvider, { similarityThreshold, minClusterSize });

    const episodesEvaluated = db.prepare(
      'SELECT COUNT(*) as count FROM episodes WHERE consolidated = 0 AND superseded_by IS NULL AND embedding IS NOT NULL'
    ).get().count;

    // Prepare every statement up front so schema problems surface before any
    // model call is paid for.
    const insertProcedure = db.prepare(`
      INSERT INTO procedures (
        id, content, embedding, state, trigger_conditions,
        evidence_episode_ids, success_count, failure_count,
        embedding_model, embedding_version, created_at, salience
      ) VALUES (?, ?, ?, 'active', ?, ?, 0, 0, ?, ?, ?, ?)
    `);
    const insertVecProcedure = db.prepare('INSERT INTO vec_procedures(id, embedding, state) VALUES (?, ?, ?)');
    const insertSemantic = db.prepare(`
      INSERT INTO semantics (
        id, content, embedding, state, evidence_episode_ids,
        evidence_count, supporting_count, source_type_diversity,
        consolidation_checkpoint, embedding_model, embedding_version,
        consolidation_model, created_at, salience
      ) VALUES (?, ?, ?, 'active', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);
    const insertVecSemantic = db.prepare('INSERT INTO vec_semantics(id, embedding, state) VALUES (?, ?, ?)');
    const markEpisode = db.prepare('UPDATE episodes SET consolidated = 1 WHERE id = ?');
    const markVecEpisode = db.prepare('UPDATE vec_episodes SET consolidated = ? WHERE id = ?');
    const updateRunCompleted = db.prepare(`
      UPDATE consolidation_runs
      SET status = 'completed',
          completed_at = ?,
          input_episode_ids = ?,
          output_memory_ids = ?
      WHERE id = ?
    `);
    const insertMetrics = db.prepare(`
      INSERT INTO consolidation_metrics (id, run_id, min_cluster_size, similarity_threshold,
        episodes_evaluated, clusters_found, principles_extracted, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    `);

    // Phase 1: all awaited work happens before the transaction is opened.
    const pending = [];
    for (const cluster of clusters) {
      let principle;
      if (extractPrinciple) {
        // Awaited so that async extractors are honoured; a plain value passes through.
        principle = await extractPrinciple(cluster);
      } else if (llmProvider) {
        principle = await llmExtractPrinciple(llmProvider, cluster);
      } else {
        principle = defaultExtractPrinciple(cluster);
      }

      if (!principle || !principle.content) continue;

      const vector = await embeddingProvider.embed(principle.content);
      pending.push({
        cluster,
        principle,
        clusterIds: cluster.map((ep) => ep.id),
        sourceTypeDiversity: new Set(cluster.map((ep) => ep.source)).size,
        embeddingBuffer: embeddingProvider.vectorToBuffer(vector),
        memoryId: generateId(),
        createdAt: new Date().toISOString(),
        // Episodes without a salience count as 0.5; reduce avoids spreading a
        // potentially large cluster into Math.max's argument list.
        maxSalience: cluster.reduce((max, ep) => Math.max(max, ep.salience ?? 0.5), -Infinity),
      });
    }

    // Phase 2: purely synchronous writes inside one transaction.
    const allInputIds = [];
    const allOutputIds = [];
    let proceduresExtracted = 0;

    db.exec('BEGIN IMMEDIATE');
    try {
      for (const entry of pending) {
        for (const id of entry.clusterIds) allInputIds.push(id);

        if (entry.principle.type === 'procedural') {
          insertProcedure.run(
            entry.memoryId,
            entry.principle.content,
            entry.embeddingBuffer,
            entry.principle.conditions ? JSON.stringify(entry.principle.conditions) : null,
            JSON.stringify(entry.clusterIds),
            embeddingProvider.modelName,
            embeddingProvider.modelVersion,
            entry.createdAt,
            entry.maxSalience,
          );
          insertVecProcedure.run(entry.memoryId, entry.embeddingBuffer, 'active');
          proceduresExtracted++;
        } else {
          insertSemantic.run(
            entry.memoryId,
            entry.principle.content,
            entry.embeddingBuffer,
            JSON.stringify(entry.clusterIds),
            entry.cluster.length,
            entry.cluster.length,
            entry.sourceTypeDiversity,
            runId,
            embeddingProvider.modelName,
            embeddingProvider.modelVersion,
            consolidationModel,
            entry.createdAt,
            entry.maxSalience,
          );
          insertVecSemantic.run(entry.memoryId, entry.embeddingBuffer, 'active');
        }

        allOutputIds.push(entry.memoryId);

        for (const ep of entry.cluster) {
          markEpisode.run(ep.id);
          markVecEpisode.run(BigInt(1), ep.id);
        }
      }

      const completedAt = new Date().toISOString();
      updateRunCompleted.run(completedAt, JSON.stringify(allInputIds), JSON.stringify(allOutputIds), runId);
      insertMetrics.run(
        generateId(), runId, minClusterSize, similarityThreshold,
        episodesEvaluated, clusters.length, pending.length, completedAt,
      );
      db.exec('COMMIT');
    } catch (err) {
      if (db.inTransaction) {
        db.exec('ROLLBACK');
      }
      throw err;
    }

    const principlesExtracted = pending.length;
    return {
      runId,
      episodesEvaluated,
      clustersFound: clusters.length,
      principlesExtracted,
      semanticsCreated: principlesExtracted - proceduresExtracted,
      proceduresCreated: proceduresExtracted,
    };
  } catch (err) {
    const failedAt = new Date().toISOString();
    db.prepare(`
      UPDATE consolidation_runs
      SET status = 'failed', completed_at = ?
      WHERE id = ?
    `).run(failedAt, runId);
    throw err;
  }
}
package/src/context.js CHANGED
@@ -1,15 +1,15 @@
1
- export function contextMatchRatio(encodingContext, retrievalContext) {
2
- if (!encodingContext || !retrievalContext) return 0;
3
- const retrievalKeys = Object.keys(retrievalContext);
4
- if (retrievalKeys.length === 0) return 0;
5
- const sharedKeys = retrievalKeys.filter(k => k in encodingContext);
6
- if (sharedKeys.length === 0) return 0;
7
- const matches = sharedKeys.filter(k => encodingContext[k] === retrievalContext[k]).length;
8
- return matches / retrievalKeys.length;
9
- }
10
-
11
- export function contextModifier(encodingContext, retrievalContext, weight = 0.3) {
12
- if (!encodingContext || !retrievalContext) return 1.0;
13
- const ratio = contextMatchRatio(encodingContext, retrievalContext);
14
- return 1.0 + (weight * ratio);
15
- }
1
/**
 * Fraction of the retrieval context's keys whose values are strictly equal in
 * the encoding context.
 *
 * Only the encoding context's own properties count: inherited members such as
 * `constructor` or `toString` never produce a match.
 *
 * @param {Object|null|undefined} encodingContext - Context captured when the memory was stored.
 * @param {Object|null|undefined} retrievalContext - Context of the current lookup.
 * @returns {number} A value in [0, 1]; 0 when either context is missing or the
 *   retrieval context has no keys.
 */
export function contextMatchRatio(encodingContext, retrievalContext) {
  if (!encodingContext || !retrievalContext) return 0;

  const retrievalKeys = Object.keys(retrievalContext);
  if (retrievalKeys.length === 0) return 0;

  const hasOwn = Object.prototype.hasOwnProperty;
  const matches = retrievalKeys.filter(
    (key) => hasOwn.call(encodingContext, key) && encodingContext[key] === retrievalContext[key],
  ).length;

  // The denominator is every retrieval key, so keys absent from the encoding
  // context lower the ratio.
  return matches / retrievalKeys.length;
}
10
+
11
/**
 * Turns the context match ratio into a multiplicative boost.
 *
 * @param {Object|null|undefined} encodingContext
 * @param {Object|null|undefined} retrievalContext
 * @param {number} [weight=0.3] - Boost added on top of 1.0 for a full match.
 * @returns {number} `1.0 + weight * ratio`, or exactly 1.0 when either context is missing.
 */
export function contextModifier(encodingContext, retrievalContext, weight = 0.3) {
  const bothPresent = Boolean(encodingContext) && Boolean(retrievalContext);
  // Short-circuit so a missing context yields a neutral 1.0 regardless of weight.
  if (!bothPresent) {
    return 1.0;
  }
  const boost = weight * contextMatchRatio(encodingContext, retrievalContext);
  return 1.0 + boost;
}
package/src/db.js CHANGED
@@ -213,6 +213,33 @@ function migrateEmbeddingsToVec0(db, dimensions) {
213
213
  });
214
214
  }
215
215
 
216
/**
 * Reports, for episodes, semantics and procedures, how many base-table rows
 * carry an embedding and how many rows the matching vec table holds.
 *
 * Each vec table is probed independently: a table that cannot be queried
 * counts as 0 without affecting the counts of the other vec tables.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{ episodes: number, vecEpisodes: number, semantics: number, vecSemantics: number, procedures: number, vecProcedures: number }}
 */
function getEmbeddingSyncCounts(db) {
  const countRows = (sql) => db.prepare(sql).get().c;

  const countVecRows = (sql) => {
    try {
      return countRows(sql);
    } catch {
      // vec tables may not exist yet
      return 0;
    }
  };

  return {
    episodes: countRows('SELECT COUNT(*) as c FROM episodes WHERE embedding IS NOT NULL'),
    vecEpisodes: countVecRows('SELECT COUNT(*) as c FROM vec_episodes'),
    semantics: countRows('SELECT COUNT(*) as c FROM semantics WHERE embedding IS NOT NULL'),
    vecSemantics: countVecRows('SELECT COUNT(*) as c FROM vec_semantics'),
    procedures: countRows('SELECT COUNT(*) as c FROM procedures WHERE embedding IS NOT NULL'),
    vecProcedures: countVecRows('SELECT COUNT(*) as c FROM vec_procedures'),
  };
}
242
+
216
243
  function addColumnIfMissing(db, table, column, definition) {
217
244
  const columns = db.pragma(`table_info(${table})`);
218
245
  const exists = columns.some(col => col.name === column);
@@ -305,6 +332,16 @@ export function createDatabase(dataDir, options = {}) {
305
332
 
306
333
  if (!migrated) {
307
334
  migrateEmbeddingsToVec0(db, dimensions);
335
+ const sync = getEmbeddingSyncCounts(db);
336
+ if (
337
+ sync.episodes !== sync.vecEpisodes
338
+ || sync.semantics !== sync.vecSemantics
339
+ || sync.procedures !== sync.vecProcedures
340
+ ) {
341
+ // Legacy blobs exist but could not be copied cleanly into vec0.
342
+ // Mark the store for lazy re-embedding so the next encode/recall repairs it.
343
+ migrated = true;
344
+ }
308
345
  }
309
346
  }
310
347