@qearlyao/familiar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +31 -0
- package/HEARTBEAT.md +23 -0
- package/LICENSE +21 -0
- package/MEMORY.md +1 -0
- package/README.md +245 -0
- package/SOUL.md +13 -0
- package/USER.md +13 -0
- package/config.example.toml +221 -0
- package/dist/agent-events.js +167 -0
- package/dist/agent.js +590 -0
- package/dist/browser-tools.js +638 -0
- package/dist/chat-log.js +130 -0
- package/dist/cli.js +168 -0
- package/dist/config.js +804 -0
- package/dist/data-retention.js +54 -0
- package/dist/discord.js +1203 -0
- package/dist/generated-media.js +86 -0
- package/dist/image-derivatives.js +102 -0
- package/dist/image-gen.js +440 -0
- package/dist/inbound-attachments.js +266 -0
- package/dist/index.js +10 -0
- package/dist/media-understanding.js +120 -0
- package/dist/memory/diary/ambient-injector.js +180 -0
- package/dist/memory/diary/ambient.js +124 -0
- package/dist/memory/diary/chunks.js +231 -0
- package/dist/memory/diary/index.js +3 -0
- package/dist/memory/diary/indexer.js +93 -0
- package/dist/memory/doctor.js +250 -0
- package/dist/memory/index/chunk-indexer.js +151 -0
- package/dist/memory/index/embedding-provider.js +119 -0
- package/dist/memory/index/fts-query.js +18 -0
- package/dist/memory/index/retrieval.js +246 -0
- package/dist/memory/index/schema.js +157 -0
- package/dist/memory/index/store.js +513 -0
- package/dist/memory/index/vec.js +72 -0
- package/dist/memory/index/vector-codec.js +27 -0
- package/dist/memory/lcm/backfill.js +247 -0
- package/dist/memory/lcm/condense.js +146 -0
- package/dist/memory/lcm/context-transformer.js +662 -0
- package/dist/memory/lcm/context.js +421 -0
- package/dist/memory/lcm/eviction-score.js +38 -0
- package/dist/memory/lcm/index.js +6 -0
- package/dist/memory/lcm/indexer.js +200 -0
- package/dist/memory/lcm/normalize.js +235 -0
- package/dist/memory/lcm/schema.js +188 -0
- package/dist/memory/lcm/segment-manager.js +136 -0
- package/dist/memory/lcm/store.js +722 -0
- package/dist/memory/lcm/summarizer.js +258 -0
- package/dist/memory/lcm/types.js +1 -0
- package/dist/memory/operator.js +477 -0
- package/dist/memory/service.js +202 -0
- package/dist/memory/tools.js +205 -0
- package/dist/models.js +165 -0
- package/dist/persona.js +54 -0
- package/dist/runtime.js +493 -0
- package/dist/scheduler.js +200 -0
- package/dist/settings.js +116 -0
- package/dist/skills.js +38 -0
- package/dist/tts.js +143 -0
- package/dist/web-auth.js +105 -0
- package/dist/web-events.js +114 -0
- package/dist/web-http.js +29 -0
- package/dist/web-static.js +106 -0
- package/dist/web-tools.js +940 -0
- package/dist/web-types.js +2 -0
- package/dist/web.js +844 -0
- package/package.json +60 -0
- package/web/dist/assets/index-ClgkMgaq.css +2 -0
- package/web/dist/assets/index-Cu2QquuR.js +59 -0
- package/web/dist/favicon.svg +1 -0
- package/web/dist/icons.svg +24 -0
- package/web/dist/index.html +20 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
 * Runs every memory health check and gathers the results into one report.
 *
 * Each check receives the same `stores` bundle and appends zero or more
 * finding objects to a shared list, in the order the checks are listed below.
 *
 * @param {object} stores - Store bundle passed through to every check.
 * @param {object} [opts] - Accepted but currently ignored.
 * @returns {{ findings: object[], clean: boolean }} All findings, plus
 *   `clean: true` when no check reported anything.
 */
export function runDoctor(stores, opts = {}) {
    void opts;
    const findings = [];
    const checks = [
        findDanglingIndexSources,
        findOrphanEmptySegments,
        findStaleLcmIndexRows,
        findBrokenContextOrdering,
        findSummaryFkViolations,
        findMissingPrunedSnapshots,
        findRequiresReindex,
        findEmbeddingMismatches,
    ];
    for (const check of checks) {
        check(stores, findings);
    }
    const clean = findings.length === 0;
    return { findings, clean };
}
|
|
14
|
+
/**
 * Applies the automatic repairs for the fixable doctor findings and reports
 * how many rows were touched.
 *
 * Repairs performed (always, regardless of what `report` contains):
 *  - index db: delete `memory_index_sources` rows whose chunk is missing, then
 *    delete LCM-backed sources whose record/summary no longer exists;
 *  - lcm db: delete non-active segments referenced by no record or summary,
 *    then renumber each session's context items to ordinals 0..n-1.
 *
 * Each database's repairs run inside its own immediate transaction unless the
 * connection is already inside one, in which case they join it.
 *
 * `report` is only used to decide which "not fixed automatically" warnings to
 * append to the summary.
 *
 * @param {object} stores - Store bundle exposing `index` and `lcm`, each with a `db` handle.
 * @param {{ findings: Array<{ kind: string }> }} report - Report from a prior doctor run.
 * @returns {{ fixed: number, summary: string }} Count of repaired rows and a
 *   human-readable summary including any warnings.
 */
export function applyDoctorFixes(stores, report) {
    let fixed = 0;

    // Runs `work` atomically, joining an already-open transaction if there is one.
    const runAtomically = (db, work) => {
        if (db.inTransaction) {
            work();
        }
        else {
            db.transaction(work).immediate();
        }
    };

    const runIndexFixes = () => {
        const indexDb = stores.index.db;
        const deleteDanglingSql = [
            "DELETE FROM memory_index_sources",
            "WHERE chunk_id NOT IN (SELECT id FROM memory_chunks)",
        ].join("\n");
        fixed += indexDb.prepare(deleteDanglingSql).run().changes;

        // DISTINCT: a source has one row per chunk, but existence only needs
        // to be checked (and the source deleted) once per (corpus, source_id).
        const lcmSourcesSql = [
            "SELECT DISTINCT corpus, source_id",
            "FROM memory_index_sources",
            "WHERE corpus IN ('lcm_record', 'lcm_summary')",
        ].join("\n");
        const candidates = indexDb.prepare(lcmSourcesSql).all();
        for (const { corpus, source_id: sourceId } of candidates) {
            if (lcmSourceExists(stores, corpus, sourceId)) {
                continue;
            }
            // Count before deleting so the tally reflects the rows removed.
            const before = countIndexSourceRows(stores.index, corpus, sourceId);
            stores.index.deleteBySourceUnsafe(corpus, sourceId);
            fixed += before;
        }
    };
    runAtomically(stores.index.db, runIndexFixes);

    const runLcmFixes = () => {
        const lcmDb = stores.lcm.db;
        const deleteOrphansSql = [
            "DELETE FROM lcm_segments",
            "WHERE status != 'active'",
            "AND id NOT IN (SELECT DISTINCT segment_id FROM lcm_records)",
            "AND id NOT IN (SELECT DISTINCT segment_id FROM lcm_summaries)",
        ].join("\n");
        fixed += lcmDb.prepare(deleteOrphansSql).run().changes;

        const sessions = lcmDb
            .prepare("SELECT DISTINCT session_key FROM lcm_context_items ORDER BY session_key")
            .all();
        if (sessions.length === 0) {
            return;
        }

        // Prepared once and reused for every session / row.
        const selectItems = lcmDb.prepare([
            "SELECT rowid AS rowid, ordinal",
            "FROM lcm_context_items",
            "WHERE session_key = ?",
            "ORDER BY ordinal, rowid",
        ].join("\n"));
        const updateOrdinal = lcmDb.prepare("UPDATE lcm_context_items SET ordinal = ?, updated_at = unixepoch() WHERE rowid = ?");

        for (const session of sessions) {
            const rows = selectItems.all(session.session_key);
            for (const [index, row] of rows.entries()) {
                if (row.ordinal === index) {
                    continue;
                }
                updateOrdinal.run(index, row.rowid);
                fixed += 1;
            }
        }
    };
    runAtomically(stores.lcm.db, runLcmFixes);

    // Finding kinds that are never repaired here, with the advice to surface.
    const manualFollowUps = [
        ["summary_fk_violation", "summary FK violations were not modified; inspect LCM summary lineage manually"],
        ["missing_pruned_summary_snapshot", "missing pruned summary snapshots were not modified; inspect retained summaries manually"],
        ["embedding_mismatch", "embedding mismatches were not rebuilt; run 'familiar memory reindex'"],
        ["requires_reindex", "reindex requirement was not cleared; run 'familiar memory reindex --force'"],
    ];
    const reportedKinds = new Set(report.findings.map((finding) => finding.kind));
    const warnings = manualFollowUps
        .filter(([kind]) => reportedKinds.has(kind))
        .map(([, message]) => message);

    const summary = [`fixed ${fixed} item(s)`, ...warnings].join("; ");
    return { fixed, summary };
}
|
|
85
|
+
/**
 * Reports index source rows whose `chunk_id` has no matching row in
 * `memory_chunks`.
 *
 * @param {object} stores - Store bundle; reads `stores.index.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findDanglingIndexSources(stores, findings) {
    const sql = [
        "SELECT chunk_id, corpus, source_id, chunk_index",
        "FROM memory_index_sources",
        "WHERE chunk_id NOT IN (SELECT id FROM memory_chunks)",
        "ORDER BY corpus, source_id, chunk_index",
    ].join("\n");
    const dangling = stores.index.db.prepare(sql).all();
    dangling.forEach(({ chunk_id, corpus, source_id, chunk_index }) => {
        const detail = `${corpus}:${source_id}#${chunk_index} references missing chunk ${chunk_id}`;
        findings.push({ kind: "dangling_index_source", detail, fixable: true });
    });
}
|
|
100
|
+
/**
 * Reports segments whose status is not 'active' and that are referenced by
 * neither `lcm_records` nor `lcm_summaries`.
 *
 * @param {object} stores - Store bundle; reads `stores.lcm.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findOrphanEmptySegments(stores, findings) {
    const sql = [
        "SELECT id",
        "FROM lcm_segments",
        "WHERE status != 'active'",
        "AND id NOT IN (SELECT DISTINCT segment_id FROM lcm_records)",
        "AND id NOT IN (SELECT DISTINCT segment_id FROM lcm_summaries)",
        "ORDER BY started_at, id",
    ].join("\n");
    const orphans = stores.lcm.db.prepare(sql).all();
    orphans.forEach(({ id }) => {
        findings.push({
            kind: "orphan_empty_segment",
            detail: `closed segment ${id} has no records`,
            fixable: true,
        });
    });
}
|
|
117
|
+
/**
 * Reports index source rows in the 'lcm_record' / 'lcm_summary' corpora whose
 * backing LCM source can no longer be found (as decided by `lcmSourceExists`).
 *
 * @param {object} stores - Store bundle; reads `stores.index.db` and is passed to `lcmSourceExists`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findStaleLcmIndexRows(stores, findings) {
    const sql = [
        "SELECT corpus, source_id, chunk_index",
        "FROM memory_index_sources",
        "WHERE corpus IN ('lcm_record', 'lcm_summary')",
        "ORDER BY corpus, source_id, chunk_index",
    ].join("\n");
    const lcmRows = stores.index.db.prepare(sql).all();
    for (const { corpus, source_id, chunk_index } of lcmRows) {
        const stillPresent = lcmSourceExists(stores, corpus, source_id);
        if (!stillPresent) {
            findings.push({
                kind: "stale_lcm_index_source",
                detail: `${corpus}:${source_id}#${chunk_index} points at missing LCM source`,
                fixable: true,
            });
        }
    }
}
|
|
134
|
+
/**
 * Reports sessions whose context-item ordinals are not exactly 0, 1, 2, ...
 * when read in ascending ordinal order (gaps, duplicates, or a non-zero start).
 *
 * @param {object} stores - Store bundle; reads `stores.lcm.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findBrokenContextOrdering(stores, findings) {
    const db = stores.lcm.db;
    const sessions = db
        .prepare("SELECT DISTINCT session_key FROM lcm_context_items ORDER BY session_key")
        .all();
    if (sessions.length === 0) {
        return;
    }
    // Prepared once and reused for every session.
    const selectOrdinals = db.prepare([
        "SELECT ordinal",
        "FROM lcm_context_items",
        "WHERE session_key = ?",
        "ORDER BY ordinal",
    ].join("\n"));
    for (const { session_key: sessionKey } of sessions) {
        const ordinals = selectOrdinals.all(sessionKey).map((row) => row.ordinal);
        // ordinal === index at every position already rules out duplicates,
        // so no separate uniqueness check is needed.
        const isDense = ordinals.every((ordinal, index) => ordinal === index);
        if (isDense) {
            continue;
        }
        findings.push({
            kind: "broken_context_ordering",
            detail: `session ${sessionKey} ordinals are ${ordinals.join(",")}`,
            fixable: true,
        });
    }
}
|
|
155
|
+
/**
 * Reports broken references in the summary lineage tables:
 *  - `lcm_summary_sources` rows pointing at a record id absent from `lcm_records`;
 *  - `lcm_summary_parents` rows pointing at a parent id absent from `lcm_summaries`.
 *
 * These findings are marked as not fixable.
 *
 * @param {object} stores - Store bundle; reads `stores.lcm.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findSummaryFkViolations(stores, findings) {
    const reportViolation = (detail) => {
        findings.push({ kind: "summary_fk_violation", detail, fixable: false });
    };

    const missingRecordSql = [
        "SELECT summary_id, ord, record_id",
        "FROM lcm_summary_sources",
        "WHERE record_id IS NOT NULL",
        "AND record_id NOT IN (SELECT id FROM lcm_records)",
        "ORDER BY summary_id, ord",
    ].join("\n");
    const missingRecords = stores.lcm.db.prepare(missingRecordSql).all();
    missingRecords.forEach(({ summary_id, ord, record_id }) => {
        reportViolation(`summary ${summary_id} source ${ord} references missing record ${record_id}`);
    });

    const missingParentSql = [
        "SELECT summary_id, parent_summary_id",
        "FROM lcm_summary_parents",
        "WHERE parent_summary_id NOT IN (SELECT id FROM lcm_summaries)",
        "ORDER BY summary_id, parent_summary_id",
    ].join("\n");
    const missingParents = stores.lcm.db.prepare(missingParentSql).all();
    missingParents.forEach(({ summary_id, parent_summary_id }) => {
        reportViolation(`summary ${summary_id} references missing parent summary ${parent_summary_id}`);
    });
}
|
|
184
|
+
/**
 * Reports summaries where both `covers_from_record_id` and `snapshot_json`
 * are NULL. These findings are marked as not fixable.
 *
 * @param {object} stores - Store bundle; reads `stores.lcm.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findMissingPrunedSnapshots(stores, findings) {
    const sql = [
        "SELECT id",
        "FROM lcm_summaries",
        "WHERE covers_from_record_id IS NULL",
        "AND snapshot_json IS NULL",
        "ORDER BY id",
    ].join("\n");
    const summaries = stores.lcm.db.prepare(sql).all();
    summaries.forEach(({ id }) => {
        findings.push({
            kind: "missing_pruned_summary_snapshot",
            detail: `summary ${id} has pruned raw coverage without snapshot_json`,
            fixable: false,
        });
    });
}
|
|
200
|
+
/**
 * Reports a pending reindex when the `requires_reindex` key in `memory_meta`
 * holds the string "1". Any other value, or a missing row, reports nothing.
 *
 * @param {object} stores - Store bundle; reads `stores.index.db`.
 * @param {object[]} findings - Mutable list that the finding is appended to.
 * @returns {void}
 */
function findRequiresReindex(stores, findings) {
    const flag = stores.index.db
        .prepare("SELECT v FROM memory_meta WHERE k = 'requires_reindex'")
        .get();
    const reindexPending = flag?.v === "1";
    if (reindexPending) {
        findings.push({
            kind: "requires_reindex",
            detail: "memory index was cleared after embedding config changed; run 'familiar memory reindex --force'",
            fixable: false,
        });
    }
}
|
|
210
|
+
/**
 * Reports chunks whose stored embedding model or dimension count differs from
 * the index store's current embedding configuration. These findings are
 * marked as not fixable.
 *
 * @param {object} stores - Store bundle; reads `stores.index.embeddingConfig()` and `stores.index.db`.
 * @param {object[]} findings - Mutable list that findings are appended to.
 * @returns {void}
 */
function findEmbeddingMismatches(stores, findings) {
    const current = stores.index.embeddingConfig();
    const sql = [
        "SELECT id, corpus, embedding_model, embedding_dimensions",
        "FROM memory_chunks",
        "WHERE embedding_model != ? OR embedding_dimensions != ?",
        "ORDER BY id",
    ].join("\n");
    const mismatched = stores.index.db.prepare(sql).all(current.model, current.dimensions);
    for (const chunk of mismatched) {
        const stored = `${chunk.embedding_model}/${chunk.embedding_dimensions}`;
        const expected = `${current.model}/${current.dimensions}`;
        findings.push({
            kind: "embedding_mismatch",
            detail: `chunk ${chunk.id} (${chunk.corpus}) has ${stored}; current is ${expected}`,
            fixable: false,
        });
    }
}
|
|
227
|
+
/**
 * Tells whether the LCM row behind an index source id still exists.
 *
 * For the 'lcm_record' and 'lcm_summary' corpora the id is parsed out of
 * `sourceId` and looked up through `stores.lcm`; an unparseable id counts as
 * missing. Every other corpus is treated as existing.
 *
 * @param {object} stores - Store bundle; uses `stores.lcm.getRecord` / `getSummary`.
 * @param {string} corpus - Corpus name of the index source.
 * @param {string} sourceId - Source id in `<corpus>:<id>` form.
 * @returns {boolean} False only when an LCM source is unparseable or its lookup returns null.
 */
function lcmSourceExists(stores, corpus, sourceId) {
    switch (corpus) {
        case "lcm_record": {
            const recordId = parseIndexSourceId(sourceId, "lcm_record");
            if (recordId === null) {
                return false;
            }
            return stores.lcm.getRecord(recordId) !== null;
        }
        case "lcm_summary": {
            const summaryId = parseIndexSourceId(sourceId, "lcm_summary");
            if (summaryId === null) {
                return false;
            }
            return stores.lcm.getSummary(summaryId) !== null;
        }
        default:
            return true;
    }
}
|
|
238
|
+
/**
 * Counts the `memory_index_sources` rows for one (corpus, source id) pair.
 *
 * @param {object} store - Index store exposing a `db` handle.
 * @param {string} corpus - Corpus to match.
 * @param {string} sourceId - Source id to match.
 * @returns {number} The `COUNT(*)` value returned by the query.
 */
function countIndexSourceRows(store, corpus, sourceId) {
    const statement = store.db.prepare("SELECT COUNT(*) AS n FROM memory_index_sources WHERE corpus = ? AND source_id = ?");
    const { n } = statement.get(corpus, sourceId);
    return n;
}
|
|
244
|
+
/**
 * Extracts the numeric id from a `<prefix>:<id>` source id.
 *
 * @param {string | null | undefined} value - Source id to parse.
 * @param {string} prefix - Expected prefix, without the trailing colon.
 * @returns {number | null} The id when the prefix matches and the remainder
 *   converts (via `Number`) to a positive integer; otherwise null.
 */
function parseIndexSourceId(value, prefix) {
    const marker = `${prefix}:`;
    const hasMarker = value?.startsWith(marker);
    if (!hasMarker) {
        return null;
    }
    const parsed = Number(value.slice(marker.length));
    if (!Number.isInteger(parsed) || parsed <= 0) {
        return null;
    }
    return parsed;
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { createMemoryContentHash } from "./store.js";
|
|
2
|
+
/**
 * Writes text chunks into the memory index store, requesting embeddings only
 * for content the store does not already hold.
 *
 * Collaborators (both supplied through the constructor options):
 *  - `store`: must provide `embeddingConfig()`, `whichHashesPresent()`,
 *    `recordSourceMappings()`, `insertChunks()` and
 *    `deleteBySourceExceptMappings()`;
 *  - `embeddingProvider`: must provide an async `embed(texts, signal)`.
 */
export class ChunkIndexer {
    store;
    embeddingProvider;
    /**
     * @param {{ store: object, embeddingProvider: object }} options
     */
    constructor(options) {
        this.store = options.store;
        this.embeddingProvider = options.embeddingProvider;
    }
    /**
     * Indexes the given chunks. Inputs whose text is blank are counted as skipped.
     *
     * @param {object[]} inputs - Chunk descriptors (`corpus`, `text`, optional `sourceId`, `chunkIndex`, `embedding`, ...).
     * @param {AbortSignal} [signal] - Forwarded to the embedding provider.
     * @returns {Promise<{ ids: Array, embedded: number, reused: number, skipped: number }>}
     */
    async indexChunks(inputs, signal) {
        const prepared = this.prepare(inputs);
        if (prepared.length > 0) {
            const skipped = inputs.length - prepared.length;
            return this.insertPrepared(prepared, skipped, signal);
        }
        return { ids: [], embedded: 0, reused: 0, skipped: inputs.length };
    }
    /**
     * Replaces everything indexed for one source: mappings that are not part
     * of the new chunk set are deleted first, then the new chunks are indexed.
     *
     * @param {string} corpus - Corpus applied to every input.
     * @param {string} sourceId - Source id applied to every input.
     * @param {object[]} inputs - New chunk descriptors for the source.
     * @param {AbortSignal} [signal] - Forwarded to the embedding provider.
     * @returns {Promise<{ ids: Array, embedded: number, reused: number, skipped: number }>}
     */
    async replaceSource(corpus, sourceId, inputs, signal) {
        const scopedInputs = inputs.map((input) => ({ ...input, corpus, sourceId }));
        const prepared = this.prepare(scopedInputs);
        const keepMappings = prepared.map(({ contentHash, chunkIndex }) => ({ contentHash, chunkIndex }));
        this.store.deleteBySourceExceptMappings(corpus, sourceId, keepMappings);
        return await this.insertPrepared(prepared, inputs.length - prepared.length, signal);
    }
    /**
     * Normalises inputs into work items: trims text, drops blank entries,
     * defaults `chunkIndex` to 0 and `sourceId` to null, and computes the
     * content hash from corpus, text and the current embedding configuration.
     *
     * @param {Iterable<object>} inputs - Raw chunk descriptors.
     * @returns {object[]} Work items for `insertPrepared`.
     */
    prepare(inputs) {
        const embeddingConfig = this.store.embeddingConfig();
        const workItems = [];
        for (const input of inputs) {
            const text = input.text.trim();
            if (text) {
                const contentHash = createMemoryContentHash({
                    corpus: input.corpus,
                    text,
                    embeddingModel: embeddingConfig.model,
                    embeddingDimensions: embeddingConfig.dimensions,
                });
                workItems.push({
                    input,
                    text,
                    chunkIndex: input.chunkIndex ?? 0,
                    sourceId: input.sourceId ?? null,
                    contentHash,
                    existingId: null,
                    embedding: input.embedding,
                });
            }
        }
        return workItems;
    }
    /**
     * Stores prepared work items.
     *
     * Items whose hash the store already has are recorded as source mappings
     * only; the rest are inserted as new chunks. Embeddings are requested once
     * per distinct hash, and only when no caller-supplied embedding exists.
     *
     * @param {object[]} prepared - Work items from `prepare`; `existingId` and `embedding` are updated in place.
     * @param {number} skipped - Count of inputs dropped before this call; echoed in the result.
     * @param {AbortSignal} [signal] - Forwarded to the embedding provider.
     * @returns {Promise<{ ids: Array, embedded: number, reused: number, skipped: number }>}
     *   `ids` is aligned with `prepared`.
     * @throws {Error} When the provider returns the wrong number of embeddings,
     *   an empty slot, or a chunk ends up without an embedding.
     */
    async insertPrepared(prepared, skipped, signal) {
        const startedAt = Date.now();
        if (prepared.length === 0) {
            return { ids: [], embedded: 0, reused: 0, skipped };
        }
        const knownIds = this.store.whichHashesPresent(prepared.map((item) => item.contentHash));
        prepared.forEach((item) => {
            item.existingId = knownIds.get(item.contentHash) ?? null;
        });

        // Decide which hashes need a provider call. A caller-supplied
        // embedding always wins, even if the hash was queued earlier.
        const pendingEmbeddings = new Map();
        const suppliedByHash = new Map();
        for (const item of prepared) {
            const { contentHash } = item;
            if (item.embedding) {
                suppliedByHash.set(contentHash, item.embedding);
                pendingEmbeddings.delete(contentHash);
            }
            else if (item.existingId === null &&
                !pendingEmbeddings.has(contentHash) &&
                !suppliedByHash.has(contentHash)) {
                pendingEmbeddings.set(contentHash, item);
            }
        }

        const itemsToEmbed = Array.from(pendingEmbeddings.values());
        let embeddings = [];
        let embeddingCost = 0;
        if (itemsToEmbed.length > 0) {
            const texts = [];
            for (const item of itemsToEmbed) {
                // Cost is measured in characters of text sent to the provider.
                embeddingCost += item.text.length;
                texts.push(item.text);
            }
            embeddings = await this.embeddingProvider.embed(texts, signal);
        }
        if (embeddings.length !== itemsToEmbed.length) {
            throw new Error(`Embedding count mismatch: expected ${itemsToEmbed.length}, got ${embeddings.length}`);
        }

        const embeddedByHash = new Map(suppliedByHash);
        itemsToEmbed.forEach((item, index) => {
            const embedding = embeddings[index];
            if (!embedding) {
                throw new Error(`Embedding provider returned no result for chunk ${index}`);
            }
            item.embedding = embedding;
            embeddedByHash.set(item.contentHash, embedding);
        });

        // Shared row shape for both new chunks and mappings to existing ones.
        const toRow = (item, embedding) => ({
            corpus: item.input.corpus,
            sourceId: item.sourceId,
            sourceRef: item.input.sourceRef ?? null,
            chunkIndex: item.chunkIndex,
            text: item.text,
            snippet: item.input.snippet,
            tokenCount: item.input.tokenCount ?? null,
            metadata: item.input.metadata ?? null,
            embedding,
        });

        const ids = new Array(prepared.length);
        const toInsert = [];
        const insertPositions = [];
        const existingMappings = [];
        prepared.forEach((item, position) => {
            if (item.existingId !== null) {
                ids[position] = item.existingId;
                // Existing chunks without a supplied embedding get a zero
                // vector of the configured dimension count.
                const placeholder = item.embedding ?? new Float32Array(this.store.embeddingConfig().dimensions);
                existingMappings.push(toRow(item, placeholder));
                return;
            }
            const embedding = item.embedding ?? embeddedByHash.get(item.contentHash);
            if (!embedding) {
                throw new Error("Missing embedding for memory chunk");
            }
            insertPositions.push(position);
            toInsert.push(toRow(item, embedding));
        });

        this.store.recordSourceMappings(existingMappings);
        const insertedIds = this.store.insertChunks(toInsert);
        insertPositions.forEach((position, index) => {
            ids[position] = insertedIds[index];
        });

        const result = {
            ids,
            embedded: itemsToEmbed.length,
            reused: prepared.length - toInsert.length,
            skipped,
        };
        logMemoryIndexBatch({
            chunks: prepared.length,
            durationMs: Date.now() - startedAt,
            embeddingCost,
        });
        return result;
    }
}
/**
 * Emits one JSON line describing an indexing batch on stderr, but only when
 * the `DEBUG` environment variable is exactly "memory-index".
 *
 * @param {object} payload - Fields merged into the logged event.
 * @returns {void}
 */
function logMemoryIndexBatch(payload) {
    const debugEnabled = process.env.DEBUG === "memory-index";
    if (debugEnabled) {
        console.error(JSON.stringify({ event: "memory_index_batch", ...payload }));
    }
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
 * Builds the embedding provider selected by `config.memory.embedding`.
 *
 * The wire format is taken from `format`, falling back to `api`. Only
 * "gemini" has an implementation; any other value throws.
 *
 * @param {object} config - Application config; reads `config.memory.embedding`.
 * @param {{ fetchFn?: Function }} [options] - Optional fetch override; the global `fetch` is used otherwise.
 * @returns {object} A Gemini-backed embedding provider.
 * @throws {Error} When the selected format is not "gemini".
 */
export function createEmbeddingProvider(config, options = {}) {
    const format = config.memory.embedding.format ?? config.memory.embedding.api;
    if (format !== "gemini") {
        throw new Error(`NotImplementedError: memory.embedding.format=${format} is recognized but only gemini is implemented in v0`);
    }
    const fetchFn = options.fetchFn ?? fetch;
    return new GeminiEmbeddingProvider(config, fetchFn);
}
|
|
7
|
+
/**
 * Embedding provider that posts to a Gemini-style `:batchEmbedContents`
 * endpoint and returns one `Float32Array` per input.
 *
 * All settings come from `config.memory.embedding`. The API key is read from
 * the environment variable named by `apiKeyEnv` at request time; when it is
 * absent the request is sent without the `x-goog-api-key` header.
 */
class GeminiEmbeddingProvider {
    api = "gemini";
    provider;
    model;
    dimensions;
    baseUrl;
    apiKeyEnv;
    batchSize;
    fetchFn;
    /**
     * @param {object} config - Application config; reads `config.memory.embedding`.
     * @param {Function} fetchFn - fetch-compatible function used for requests.
     * @throws {Error} When the configured batch size is not a number >= 1.
     */
    constructor(config, fetchFn) {
        const settings = config.memory.embedding;
        this.provider = settings.provider;
        this.model = settings.model;
        this.dimensions = settings.dimensions;
        // Drop trailing slashes so buildUrl() can join with a single "/".
        this.baseUrl = settings.baseUrl.replace(/\/+$/, "");
        this.apiKeyEnv = settings.apiKeyEnv;
        this.batchSize = settings.batchSize;
        // Written as !(x >= 1) so NaN and undefined are rejected too; with
        // such a value embed() would send one empty batch and return [].
        if (!(this.batchSize >= 1)) {
            throw new Error(`Embedding batch size must be >= 1, got ${this.batchSize}`);
        }
        this.fetchFn = fetchFn;
    }
    /**
     * Embeds a single input.
     *
     * @param {string | { parts: object[] }} input - Text or multi-part input.
     * @param {AbortSignal} [signal] - Passed to the fetch call.
     * @returns {Promise<Float32Array>}
     * @throws {Error} When the provider yields no embedding.
     */
    async embedOne(input, signal) {
        const [embedding] = await this.embed([input], signal);
        if (!embedding) {
            throw new Error("Embedding provider returned no result");
        }
        return embedding;
    }
    /**
     * Embeds many inputs, `batchSize` at a time, preserving input order.
     *
     * @param {Array<string | { parts: object[] }>} inputs
     * @param {AbortSignal} [signal] - Passed to each fetch call.
     * @returns {Promise<Float32Array[]>} One embedding per input; `[]` for no inputs.
     */
    async embed(inputs, signal) {
        if (inputs.length === 0) {
            return [];
        }
        const embeddings = [];
        for (let index = 0; index < inputs.length; index += this.batchSize) {
            const chunk = inputs.slice(index, index + this.batchSize);
            // Sequential batches are gentle on hosted rate limits; add bounded
            // concurrency later if indexing throughput becomes a bottleneck.
            embeddings.push(...(await this.embedBatch(chunk, signal)));
        }
        return embeddings;
    }
    /**
     * Sends one batch request and validates the response.
     *
     * @param {Array<string | { parts: object[] }>} inputs - Inputs for this request.
     * @param {AbortSignal} [signal] - Passed to the fetch call.
     * @returns {Promise<Float32Array[]>}
     * @throws {Error} On a non-OK HTTP status, a result-count mismatch, or malformed embedding values.
     */
    async embedBatch(inputs, signal) {
        const apiKey = this.apiKey();
        const response = await this.fetchFn(this.buildUrl(), {
            method: "POST",
            headers: this.buildHeaders(apiKey),
            body: JSON.stringify({
                requests: inputs.map((input) => ({
                    model: this.modelResourceName(),
                    content: { parts: embeddingInputParts(input) },
                    outputDimensionality: this.dimensions,
                })),
            }),
            signal,
        });
        const { body, rawText } = await parseJsonResponse(response);
        if (!response.ok) {
            // `body` may be any JSON value (including null), so every access
            // is optional; fall back to the raw text or the status text.
            const message = typeof body?.error?.message === "string"
                ? body.error.message
                : truncate(rawText.trim() || response.statusText);
            throw new Error(`Embedding request failed: HTTP ${response.status} ${message}`.trim());
        }
        // Accept both the batch shape (`embeddings`) and a single `embedding`.
        let rawEmbeddings = [];
        if (Array.isArray(body?.embeddings)) {
            rawEmbeddings = body.embeddings;
        }
        else if (body?.embedding) {
            rawEmbeddings = [body.embedding];
        }
        if (rawEmbeddings.length !== inputs.length) {
            throw new Error(`Embedding response count mismatch: expected ${inputs.length}, got ${rawEmbeddings.length}`);
        }
        // A null/undefined entry reaches parseEmbeddingValues as `undefined`
        // and is reported as a descriptive error rather than a TypeError.
        return rawEmbeddings.map((embedding, index) => this.parseEmbeddingValues(embedding?.values, index));
    }
    /** @returns {string} Full URL of the batch embedding endpoint for the configured model. */
    buildUrl() {
        return `${this.baseUrl}/${this.modelResourceName()}:batchEmbedContents`;
    }
    /**
     * @param {string | undefined} apiKey - Key to send, if any.
     * @returns {Record<string, string>} Request headers; the key header is omitted when `apiKey` is falsy.
     */
    buildHeaders(apiKey) {
        const headers = { "content-type": "application/json" };
        if (apiKey) {
            headers["x-goog-api-key"] = apiKey;
        }
        return headers;
    }
    /** @returns {string | undefined} Value of the env var named by `apiKeyEnv`, if configured. */
    apiKey() {
        return this.apiKeyEnv ? process.env[this.apiKeyEnv] : undefined;
    }
    /** @returns {string} The model name, prefixed with "models/" unless it already is. */
    modelResourceName() {
        return this.model.startsWith("models/") ? this.model : `models/${this.model}`;
    }
    /**
     * Validates one embedding payload and converts it to a typed array.
     *
     * @param {unknown} values - Candidate array of numbers.
     * @param {number} index - Position within the batch, used in error messages.
     * @returns {Float32Array}
     * @throws {Error} When `values` is not an array of finite numbers or has the wrong length.
     */
    parseEmbeddingValues(values, index) {
        if (!Array.isArray(values) || values.some((value) => typeof value !== "number" || !Number.isFinite(value))) {
            throw new Error(`Embedding response ${index} did not contain numeric values`);
        }
        if (values.length !== this.dimensions) {
            throw new Error(`Embedding dimension mismatch for result ${index}: expected ${this.dimensions}, got ${values.length}`);
        }
        return new Float32Array(values);
    }
}
|
|
97
|
+
/**
 * Converts an embedding input into the `parts` array sent in the request.
 *
 * A plain string becomes a single text part. Otherwise each entry of
 * `input.parts` is mapped: `type === "text"` entries to `{ text }`, and all
 * other entries to `{ inlineData: { mimeType, data } }`.
 *
 * @param {string | { parts: object[] }} input
 * @returns {object[]} Request-ready parts.
 */
function embeddingInputParts(input) {
    if (typeof input !== "string") {
        return input.parts.map((part) => part.type === "text"
            ? { text: part.text }
            : { inlineData: { mimeType: part.mimeType, data: part.data } });
    }
    return [{ text: input }];
}
|
|
106
|
+
/**
 * Reads a response body as text and makes a best-effort attempt to parse it
 * as JSON.
 *
 * `body` is always safe to read properties from: an empty body, invalid JSON,
 * or JSON that is not an object (for example `null`, a number, or a string)
 * all yield `{}`. The raw text is returned alongside so callers can still
 * show it in error messages.
 *
 * @param {{ text: () => Promise<string> }} response - fetch-style response.
 * @returns {Promise<{ body: object, rawText: string }>}
 */
async function parseJsonResponse(response) {
    const rawText = await response.text();
    if (!rawText.trim()) {
        return { body: {}, rawText };
    }
    let parsed;
    try {
        parsed = JSON.parse(rawText);
    }
    catch {
        // Deliberate: non-JSON bodies (e.g. HTML error pages) are reported
        // through rawText instead of failing here.
        return { body: {}, rawText };
    }
    // JSON.parse can return null or a primitive; callers dereference `body`,
    // so normalise anything that is not an object.
    const body = typeof parsed === "object" && parsed !== null ? parsed : {};
    return { body, rawText };
}
|
|
117
|
+
/**
 * Shortens text to at most `maxLength` UTF-16 code units, appending "..."
 * when anything was cut off.
 *
 * @param {string} text - Text to shorten.
 * @param {number} [maxLength=300] - Maximum length kept before the ellipsis.
 * @returns {string}
 */
function truncate(text, maxLength = 300) {
    if (text.length > maxLength) {
        return `${text.slice(0, maxLength)}...`;
    }
    return text;
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * Turns free-form user text into a quoted-token match query.
 *
 * The query is NFKC-normalised and split on whitespace. From each word, runs
 * of letters, digits and underscores are extracted and each run is emitted as
 * a double-quoted token. A word ending in `*` keeps the `*` on its last
 * extracted token only. Tokens are joined with single spaces.
 *
 * @param {string} query - Raw search text.
 * @returns {string | null} The normalised query, or null when no token survives.
 */
export function normalizeFtsMatchQuery(query) {
    const words = query.normalize("NFKC").split(/\s+/u).filter((word) => word !== "");
    const quotedTokens = words.flatMap((word) => {
        const wantsPrefixMatch = word.endsWith("*");
        const stem = wantsPrefixMatch ? word.slice(0, -1) : word;
        const pieces = stem.match(/[\p{L}\p{N}_]+/gu) ?? [];
        const lastPosition = pieces.length - 1;
        return pieces.map((piece, position) => {
            const quoted = `"${piece.replaceAll('"', '""')}"`;
            return wantsPrefixMatch && position === lastPosition ? `${quoted}*` : quoted;
        });
    });
    if (quotedTokens.length === 0) {
        return null;
    }
    return quotedTokens.join(" ");
}
|