@agfpd/iapeer-memory-core 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/db.ts ADDED
@@ -0,0 +1,550 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { Database } from "bun:sqlite";
4
+ // sqlite-vec ships its own .d.ts; runtime API is just `load(db)`.
5
+ import * as sqliteVec from "sqlite-vec";
6
+ import type { CoreConfig } from "./config.js";
7
+ import { prepareSqliteRuntime } from "./sqlite-loader.js";
8
+ import { fromJson, toJson } from "./utils.js";
9
+
/**
 * A `bun:sqlite` connection carrying the extra capability flag that
 * `openDatabase` attaches before handing the connection out.
 */
export type CoreDb = Database & {
  /**
   * Whether the `sqlite-vec` backed `vec_chunks` table may be used on this
   * connection. `openDatabase` sets it to `false` first and flips it to `true`
   * only after the extension loaded and the `CREATE VIRTUAL TABLE` succeeded.
   * While it is `false`, callers must not touch `vec_chunks`; vector search
   * has to fall back to the brute-force `SELECT embedding FROM chunks` path.
   */
  vecAvailable: boolean;
};
19
+
/**
 * In-memory shape of one row of the `documents` table, as accepted by
 * `upsertDocument`.
 */
export type IndexedDocumentRow = {
  /** Document path; primary key of the `documents` table. */
  path: string;
  /** Document title. */
  title: string;
  /** Value for the `type` column, or null when absent. */
  type: string | null;
  /** Value for the `status` column, or null when absent. */
  status: string | null;
  /** Tag list; serialised with `toJson` when written to the `tags` column. */
  tags: string[];
  /** Stored in `content_hash`; read back by `getStoredHash`. */
  contentHash: string;
  /** Frontmatter object; serialised with `toJson` when written. */
  frontmatter: Record<string, unknown>;
  /** Value for the `created` column (text) — presumably a timestamp; format not visible here. */
  created: string | null;
  /** Value for the `updated` column (text) — presumably a timestamp; format not visible here. */
  updated: string | null;
  /** Value for the `indexed_at` column (text) — presumably a timestamp; format not visible here. */
  indexedAt: string;
};
32
+
/** One search hit, as returned by `searchDocuments`. */
export type SearchRow = {
  /** Path of the matching document. */
  path: string;
  /** Title of the matching document. */
  title: string;
  /**
   * Relevance score. In `searchDocuments` this is the FTS5 `rank` value and
   * results are ordered ascending on it (smaller sorts first).
   */
  score: number;
  /** Excerpt of the matching chunk; `searchDocuments` wraps matches in `[` `]`. */
  snippet: string;
};
39
+
/** Optional switches accepted by `openDatabase`. */
export type OpenDatabaseOptions = {
  /**
   * Writer-only switch, off unless explicitly set.
   *
   * When enabled and an existing `vec_chunks` table was created at a width
   * other than `config.embedding.dimensions` (the embedder was replaced by one
   * with a different output size — e.g. Qwen3-Embedding-8B@4096 →
   * Qwen3-Embedding-4B@2560), the table is DROPped so that the subsequent
   * CREATE rebuilds it at the new width. `CREATE VIRTUAL TABLE IF NOT EXISTS`
   * on its own would keep the stale width forever, and every embedding INSERT
   * would then fail with "Dimension mismatch ... Expected N ... received M",
   * crash-looping the writer's vault scan.
   *
   * Dropping the mirror loses nothing: the same swap also invalidates the
   * embeddings themselves (`checkEmbeddingModelChanged` nulls
   * `chunks.embedding`), so the re-embed pass repopulates both tables.
   *
   * Left off by default so that read-only MCP frontends (`src/server.ts`)
   * never mutate the table — the single writer daemon owns this migration. A
   * reader that opens during the swap window simply sees an empty/stale
   * `vec_chunks` and degrades to BM25 until the writer is done re-embedding.
   */
  migrateVecDimension?: boolean;
};
60
+
/**
 * Open (creating it if necessary) the index database at `config.index.dbPath`
 * and bring its schema up to date.
 *
 * Steps, in order:
 *  1. create the parent directory;
 *  2. prepare the process-wide SQLite runtime (extension-capable build);
 *  3. open the connection in strict mode and restrict file permissions;
 *  4. enable WAL + foreign keys, drop a legacy schema if one is detected,
 *     and create all core tables;
 *  5. if the runtime supports extensions and embeddings are configured, load
 *     `sqlite-vec` and create the `vec_chunks` mirror (best effort).
 *
 * @param config  Core configuration; `index.dbPath` and `embedding` are read.
 * @param options See `OpenDatabaseOptions`; defaults to no writer-only migration.
 * @returns The open connection, with `vecAvailable` reflecting whether
 *          `vec_chunks` is usable on it.
 * @throws Whatever the filesystem / SQLite calls of steps 1–4 throw. If the
 *         failure happens after the connection was opened, the connection is
 *         closed before the error is rethrown so the handle does not leak.
 *         Failures in step 5 are logged to stderr and do NOT throw.
 */
export function openDatabase(config: CoreConfig, options: OpenDatabaseOptions = {}): CoreDb {
  const dbPath = config.index.dbPath;
  fs.mkdirSync(path.dirname(dbPath), { recursive: true });

  // Process-wide: swap bun's stripped sqlite for one that supports extension
  // loading (homebrew on macOS, distro libsqlite3 on Linux). Idempotent —
  // safe to call from every openDatabase, the helper caches the decision.
  const runtime = prepareSqliteRuntime();

  // strict: true — bind named params (`@a`, `$a`, `:a`) by key without prefix.
  // Without strict mode bun silently inserts NULL for `VALUES (@a) RUN { a: "x" }`.
  const db = new Database(dbPath, { create: true, strict: true }) as CoreDb;
  db.vecAvailable = false;

  try {
    restrictDbFilePermissions(dbPath);
    db.run("PRAGMA journal_mode = WAL");
    db.run("PRAGMA foreign_keys = ON");
    dropLegacyDiarySchema(db);
    createCoreSchema(db);
  } catch (err) {
    // Mandatory setup failed: release the handle before propagating, otherwise
    // the caller never receives a connection it could close itself.
    try {
      db.close();
    } catch {
      // Ignore — the setup error is the one worth reporting.
    }
    throw err;
  }

  // If the runtime SQLite has no extension support (no homebrew sqlite, or
  // OMIT_LOAD_EXTENSION compiled in), skip the virtual table entirely; callers
  // then use the brute-force path because `vecAvailable` stays false.
  if (runtime.available && config.embedding) {
    setUpVecChunks(db, config.embedding.dimensions, options.migrateVecDimension ?? false);
  }

  return db;
}

/**
 * Best-effort chmod 0600 on the database file and its WAL/SHM sidecars.
 *
 * The DB stores verbatim chunks of private vault content; with a permissive
 * default mode other local users on a shared machine could read it. Files that
 * do not exist yet (sidecars appear on first write) or filesystems without
 * chmod support (FAT32, some network mounts) are silently skipped.
 */
function restrictDbFilePermissions(dbPath: string): void {
  for (const suffix of ["", "-wal", "-shm"]) {
    try {
      fs.chmodSync(dbPath + suffix, 0o600);
    } catch {
      // Best effort — see function comment.
    }
  }
}

/**
 * Drop the four content tables when `documents` still has the legacy
 * `source` / `agent_id` columns.
 *
 * Prior versions carried those columns for a diary channel that was never
 * wired in the MCP build; they were removed in v0.7. `source` was NOT NULL, so
 * inserts against the legacy schema fail. `fullScanOnStartup` (default true)
 * rebuilds the dropped tables. `meta` is preserved so embedding / parser
 * fingerprints survive the upgrade.
 *
 * The drops run inside one transaction: `documents` is the table the detection
 * looks at, so a crash after dropping only `documents` would otherwise hide
 * the legacy marker and leave the remaining legacy tables behind for good.
 */
function dropLegacyDiarySchema(db: CoreDb): void {
  const docCols = db.prepare("PRAGMA table_info(documents)").all() as Array<{ name: string }>;
  const hasLegacyDiary = docCols.some((c) => c.name === "source" || c.name === "agent_id");
  if (!hasLegacyDiary) return;

  const dropAll = db.transaction(() => {
    db.exec(`
      DROP TABLE IF EXISTS documents;
      DROP TABLE IF EXISTS chunk_fts;
      DROP TABLE IF EXISTS chunks;
      DROP TABLE IF EXISTS edges;
    `);
  });
  dropAll();
}

/** Create every core table that does not exist yet (idempotent). */
function createCoreSchema(db: CoreDb): void {
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      path TEXT PRIMARY KEY,
      title TEXT,
      type TEXT,
      status TEXT,
      tags TEXT,
      content_hash TEXT,
      frontmatter TEXT,
      created TEXT,
      updated TEXT,
      indexed_at TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS chunk_fts USING fts5(
      chunk_text,
      doc_path UNINDEXED,
      chunk_index UNINDEXED,
      tokenize='porter unicode61'
    );

    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      doc_path TEXT NOT NULL,
      chunk_index INTEGER NOT NULL,
      chunk_text TEXT NOT NULL,
      embedding BLOB,
      UNIQUE(doc_path, chunk_index)
    );

    CREATE TABLE IF NOT EXISTS edges (
      source_path TEXT NOT NULL,
      target_path TEXT NOT NULL,
      context_snippet TEXT,
      PRIMARY KEY (source_path, target_path)
    );

    -- Wikilinks that could not be resolved to a real note. Kept first-class
    -- instead of being silently dropped from edges: a missing/ambiguous link
    -- is a vault health signal (surfaced via vault_map orphan_wikilinks +
    -- the Index nightly health-check). reason ∈ 'missing' | 'ambiguous'.
    CREATE TABLE IF NOT EXISTS unresolved_links (
      source_path TEXT NOT NULL,
      raw_target TEXT NOT NULL,
      reason TEXT NOT NULL,
      context_snippet TEXT,
      PRIMARY KEY (source_path, raw_target)
    );

    CREATE TABLE IF NOT EXISTS meta (
      key TEXT PRIMARY KEY,
      value TEXT
    );
  `);
}

/**
 * Load `sqlite-vec` and make sure the `vec_chunks` KNN mirror of
 * `chunks.embedding` exists at width `dim`; sets `db.vecAvailable` on success.
 *
 * `vec_chunks.rowid` == `chunks.id`. The table uses the cosine distance
 * metric; per the original design note this matches the JS-side
 * cosineSimilarity of the legacy vector search, so ranking semantics stay the
 * same. `dim` must match what the embedder returns or INSERTs fail.
 *
 * Any failure is logged to stderr and swallowed on purpose: the degraded
 * non-vec path still works, and `vecAvailable` simply stays false.
 *
 * @param migrateDimension Writer-only: drop a `vec_chunks` created at another
 *        width first, because IF NOT EXISTS would keep the old width. The DROP
 *        must happen with sqlite-vec loaded — dropping a vec0 virtual table
 *        needs the module resolvable.
 */
function setUpVecChunks(db: CoreDb, dim: number, migrateDimension: boolean): void {
  try {
    sqliteVec.load(db);
    if (migrateDimension && dropVecChunksIfDimensionMismatch(db, dim)) {
      process.stderr.write(
        `[iapeer-memory] vec_chunks dimension changed → float[${dim}]; dropped stale table, ` +
          `re-embed will repopulate it\n`,
      );
    }
    db.exec(
      `CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(embedding float[${dim}] distance_metric=cosine)`,
    );
    db.vecAvailable = true;
  } catch (err) {
    // No logger parameter here, to keep the openDatabase signature stable for
    // callers/tests — stderr is the only channel available.
    process.stderr.write(
      `[iapeer-memory] sqlite-vec load failed: ${String(err)} — vector search falls back to brute-force\n`,
    );
  }
}
208
+
/**
 * Read the width `vec_chunks` was created with.
 *
 * The width is parsed out of the table's stored CREATE statement in
 * `sqlite_master` (the `float[N]` fragment). The reflected definition is the
 * single source of truth — the width is deliberately not duplicated in `meta`,
 * so it cannot drift from the real column width.
 *
 * @param db Connection to inspect.
 * @returns `N`, or null when the table is absent or no `float[N]` is found.
 */
export function vecChunksDimension(db: Database): number | null {
  const definition = db
    .prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='vec_chunks'")
    .get() as { sql?: string } | null;

  const createSql = definition?.sql;
  if (!createSql) return null;

  const widthMatch = /float\[(\d+)\]/.exec(createSql);
  if (widthMatch === null) return null;
  return Number(widthMatch[1]);
}
224
+
/**
 * Drop `vec_chunks` if — and only if — it exists at a width other than `dim`.
 *
 * Per the original design note, vec0 tables own shadow tables
 * (`vec_chunks_chunks`, `vec_chunks_rowids`, …) and DROP TABLE on the virtual
 * table removes those as well, making this a complete reset. The caller MUST
 * have run `sqliteVec.load(db)` beforehand, because the DROP needs the vec0
 * module to be resolvable.
 *
 * @param db  Connection with sqlite-vec already loaded.
 * @param dim Width the table is expected to have.
 * @returns true when a stale table was dropped; false when the table is
 *          absent, its width could not be determined, or it already matches.
 */
function dropVecChunksIfDimensionMismatch(db: CoreDb, dim: number): boolean {
  const onDisk = vecChunksDimension(db);
  const isStale = onDisk !== null && onDisk !== dim;
  if (!isStale) return false;

  db.exec("DROP TABLE IF EXISTS vec_chunks");
  return true;
}
241
+
/**
 * Look up a value in the `meta` key/value table.
 *
 * @param db  Open connection.
 * @param key Meta key to read.
 * @returns The stored value, or null when the key is absent or its value is NULL.
 */
export function getMeta(db: CoreDb, key: string): string | null {
  const lookup = db.prepare("SELECT value FROM meta WHERE key = ?");
  const hit = lookup.get(key) as { value?: string } | null;
  if (hit == null || hit.value == null) return null;
  return hit.value;
}
246
+
/**
 * Write a value into the `meta` key/value table, overwriting any existing
 * value stored under the same key.
 *
 * @param db    Open connection.
 * @param key   Meta key to write.
 * @param value Value to store.
 */
export function setMeta(db: CoreDb, key: string, value: string): void {
  const upsert = db.prepare(
    "INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value",
  );
  upsert.run(key, value);
}
250
+
/**
 * Detect an embedding-model change since the last indexation and, if there
 * was one, invalidate every stored embedding.
 *
 * The model identity is the fingerprint `"<model>:<dimensions>"`, kept in
 * `meta` under `embedding_fingerprint`. When the stored fingerprint differs
 * (or none is stored yet), this:
 *  - sets `chunks.embedding` to NULL for every chunk,
 *  - empties the `vec_chunks` mirror when it is available on this connection,
 *  - stores the new fingerprint.
 *
 * Clearing the mirror matters when the model changes but the width does not:
 * no dimension migration drops the table in that case, so without the DELETE
 * the old model's vectors would remain searchable and their rowids would still
 * be occupied when the re-embed pass writes the new vectors.
 *
 * All three writes run in one transaction so a crash cannot leave the new
 * fingerprint recorded next to un-invalidated embeddings, or vice versa.
 *
 * @param db     Open (writer) connection.
 * @param config Active embedding settings, or null when embeddings are off.
 * @returns true when previously fingerprinted embeddings were invalidated;
 *          false when nothing changed, embeddings are off, or this is the
 *          first run (no fingerprint stored yet — the reset still happens).
 */
export function checkEmbeddingModelChanged(db: CoreDb, config: { model: string; dimensions: number } | null): boolean {
  if (!config) return false;

  const fingerprint = `${config.model}:${config.dimensions}`;
  const stored = getMeta(db, "embedding_fingerprint");

  if (stored === fingerprint) return false;

  // Model changed (or first run) — clear all embeddings and their vec mirror.
  const invalidate = db.transaction(() => {
    db.prepare("UPDATE chunks SET embedding = NULL").run();
    if (db.vecAvailable) {
      db.prepare("DELETE FROM vec_chunks").run();
    }
    setMeta(db, "embedding_fingerprint", fingerprint);
  });
  invalidate();

  return stored !== null; // true = invalidated old embeddings, false = first run
}
270
+
/**
 * Detect a parser-version change since the last indexation.
 *
 * The version is bumped by hand whenever the chunking algorithm changes in a
 * way that alters what is stored on disk (e.g. adding a title prefix to
 * chunk[0]). On a mismatch with the `parser_fingerprint` kept in `meta`, every
 * document's `content_hash` is set to NULL and the new version is stored.
 * Per the original design note, nulling the hash defeats indexFile's
 * "skip if hash matches" shortcut, so the next startup re-parses every note.
 *
 * @param db      Open connection.
 * @param version Current parser version string.
 * @returns true when a previously recorded version was replaced; false when
 *          the version is unchanged or none was recorded before (first run —
 *          the hash reset and the store still happen in that case).
 */
export function checkParserChanged(db: CoreDb, version: string): boolean {
  const metaKey = "parser_fingerprint";
  const previous = getMeta(db, metaKey);
  const unchanged = previous === version;
  if (unchanged) return false;

  db.prepare("UPDATE documents SET content_hash = NULL").run();
  setMeta(db, metaKey, version);

  const hadPrevious = previous !== null;
  return hadPrevious;
}
291
+
/**
 * Fetch the content hash recorded for a document.
 *
 * @param db      Open connection.
 * @param docPath Document path (primary key of `documents`).
 * @returns The stored hash, or null when the document is unknown or its hash is NULL.
 */
export function getStoredHash(db: CoreDb, docPath: string): string | null {
  const lookup = db.prepare("SELECT content_hash FROM documents WHERE path = ?");
  const hit = lookup.get(docPath) as { content_hash?: string } | null;
  if (hit == null || hit.content_hash == null) return null;
  return hit.content_hash;
}
296
+
/**
 * Remove every `vec_chunks` row that belongs to one document. Does nothing
 * when vec is unavailable on this connection or the document has no chunks.
 *
 * `vec_chunks` is keyed by rowid only (rowid == `chunks.id`) and has no
 * `doc_path` column, so the rowids are resolved through `chunks`. Callers
 * therefore have to invoke this BEFORE deleting the document's `chunks` rows —
 * afterwards the id mapping is gone.
 *
 * @param db      Open connection.
 * @param docPath Document whose vectors should be removed.
 */
function deleteVecChunksByDoc(db: CoreDb, docPath: string): void {
  if (!db.vecAvailable) return;

  const owned = db
    .prepare("SELECT id FROM chunks WHERE doc_path = ?")
    .all(docPath) as Array<{ id: number }>;
  if (owned.length === 0) return;

  const removeVector = db.prepare("DELETE FROM vec_chunks WHERE rowid = ?");
  for (const { id } of owned) {
    removeVector.run(id);
  }
}
314
+
/**
 * Insert or replace one document together with its chunks and outgoing links,
 * all inside a single transaction.
 *
 * Order of work inside the transaction:
 *  1. remove the document's `vec_chunks` rows (needs the old `chunks.id`s);
 *  2. upsert the `documents` row (`tags` / `frontmatter` go through `toJson`);
 *  3. delete the document's old `chunk_fts`, `chunks`, `edges` and
 *     `unresolved_links` rows;
 *  4. insert the new chunks into both `chunk_fts` and `chunks`;
 *  5. insert the new edges (`INSERT OR IGNORE`, so a repeated target is kept once).
 *
 * Note: new chunks are written without embeddings, and this function does not
 * write `unresolved_links` rows — it only clears the old ones.
 *
 * @param db     Open connection.
 * @param row    Document metadata to store.
 * @param chunks Chunk texts with their position in the document.
 * @param links  Resolved outgoing links with a context snippet each.
 */
export function upsertDocument(db: CoreDb, row: IndexedDocumentRow, chunks: { chunkIndex: number; text: string }[], links: { target: string; contextSnippet: string }[]): void {
  const writeAll = db.transaction(() => {
    // Must come first: once the old `chunks` rows are gone, the rowid mapping
    // into vec_chunks cannot be recovered.
    deleteVecChunksByDoc(db, row.path);

    const upsertDoc = db.prepare(
      `INSERT INTO documents (
        path, title, type, status, tags, content_hash, frontmatter, created, updated, indexed_at
      ) VALUES (
        @path, @title, @type, @status, @tags, @contentHash, @frontmatter, @created, @updated, @indexedAt
      )
      ON CONFLICT(path) DO UPDATE SET
        title=excluded.title,
        type=excluded.type,
        status=excluded.status,
        tags=excluded.tags,
        content_hash=excluded.content_hash,
        frontmatter=excluded.frontmatter,
        created=excluded.created,
        updated=excluded.updated,
        indexed_at=excluded.indexed_at`
    );
    upsertDoc.run({
      path: row.path,
      title: row.title,
      type: row.type,
      status: row.status,
      tags: toJson(row.tags),
      contentHash: row.contentHash,
      frontmatter: toJson(row.frontmatter),
      created: row.created,
      updated: row.updated,
      indexedAt: row.indexedAt,
    });

    // Clear everything derived from the previous version of the document.
    const purgeStatements = [
      "DELETE FROM chunk_fts WHERE doc_path = ?",
      "DELETE FROM chunks WHERE doc_path = ?",
      "DELETE FROM edges WHERE source_path = ?",
      "DELETE FROM unresolved_links WHERE source_path = ?",
    ];
    for (const sql of purgeStatements) {
      db.prepare(sql).run(row.path);
    }

    const ftsInsert = db.prepare("INSERT INTO chunk_fts (chunk_text, doc_path, chunk_index) VALUES (?, ?, ?)");
    const chunkInsert = db.prepare("INSERT INTO chunks (doc_path, chunk_index, chunk_text) VALUES (?, ?, ?)");
    for (const { chunkIndex, text } of chunks) {
      ftsInsert.run(text, row.path, chunkIndex);
      chunkInsert.run(row.path, chunkIndex, text);
    }

    const edgeInsert = db.prepare("INSERT OR IGNORE INTO edges (source_path, target_path, context_snippet) VALUES (?, ?, ?)");
    for (const { target, contextSnippet } of links) {
      edgeInsert.run(row.path, target, contextSnippet);
    }
  });

  writeAll();
}
370
+
/**
 * Purge every indexed document whose path is not in `existingPaths`.
 *
 * For each stale document, inside one transaction: its `vec_chunks` rows are
 * removed first (while the owning `chunks` rows still exist), then its
 * `documents` row, FTS rows, outgoing edges, incoming edges, unresolved links
 * and finally its `chunks` rows.
 *
 * @param db            Open connection.
 * @param existingPaths Paths that must be kept.
 * @returns Number of documents removed (0 when nothing was stale).
 */
export function deleteMissingDocuments(db: CoreDb, existingPaths: Set<string>): number {
  const indexed = db.prepare("SELECT path FROM documents").all() as Array<{ path: string }>;

  const stale: string[] = [];
  for (const { path: docPath } of indexed) {
    if (!existingPaths.has(docPath)) stale.push(docPath);
  }
  if (stale.length === 0) return 0;

  const purge = db.transaction(() => {
    // Prepared once, executed per document in exactly this order.
    const cleanup = [
      "DELETE FROM documents WHERE path = ?",
      "DELETE FROM chunk_fts WHERE doc_path = ?",
      "DELETE FROM edges WHERE source_path = ?",
      "DELETE FROM edges WHERE target_path = ?",
      "DELETE FROM unresolved_links WHERE source_path = ?",
      "DELETE FROM chunks WHERE doc_path = ?",
    ].map((sql) => db.prepare(sql));

    for (const docPath of stale) {
      // vec_chunks first — drops the rowids before the chunks rows that own them go away.
      deleteVecChunksByDoc(db, docPath);
      for (const statement of cleanup) {
        statement.run(docPath);
      }
    }
  });
  purge();

  return stale.length;
}
398
+
/**
 * Full-text search over chunk text, returning at most one hit per document.
 *
 * FTS5 ranks individual chunk rows, not documents, so `limit * 3` rows are
 * fetched and then collapsed by path, keeping the row with the lowest `rank`
 * for each document. The survivors are sorted ascending by score and cut to
 * `limit`.
 *
 * @param db     Open connection.
 * @param params `query` is handed to FTS5 `MATCH` unchanged; `limit` is the
 *               maximum number of documents to return.
 * @returns Hits ordered by ascending score, each with a snippet of the chosen chunk.
 */
export function searchDocuments(db: CoreDb, params: { query: string; limit: number }): SearchRow[] {
  const { query, limit } = params;

  const candidates = db
    .prepare(`
      SELECT
        d.path as path,
        d.title as title,
        rank as score,
        snippet(chunk_fts, 0, '[', ']', ' … ', 18) as snippet
      FROM chunk_fts
      JOIN documents d ON d.path = chunk_fts.doc_path
      WHERE chunk_fts MATCH ?
      ORDER BY rank ASC
      LIMIT ?
    `)
    .all(query, limit * 3) as SearchRow[];

  // One entry per document: keep whichever chunk row scored lowest.
  const bestByPath = new Map<string, SearchRow>();
  for (const candidate of candidates) {
    const current = bestByPath.get(candidate.path);
    if (current && !(candidate.score < current.score)) continue;
    bestByPath.set(candidate.path, candidate);
  }

  const ranked = Array.from(bestByPath.values());
  ranked.sort((left, right) => left.score - right.score);
  return ranked.slice(0, limit);
}
426
+
/**
 * Load the stored metadata of one document.
 *
 * `tags` and `frontmatter` are decoded with `fromJson`, falling back to an
 * empty array / empty object when the column does not hold a string. A NULL
 * title is returned as the empty string; the remaining nullable columns are
 * passed through as null.
 *
 * @param db      Open connection.
 * @param docPath Document path (primary key of `documents`).
 * @returns The metadata record, or null when the document is not indexed.
 */
export function getDocumentMeta(db: CoreDb, docPath: string): {
  path: string;
  title: string;
  type: string | null;
  status: string | null;
  tags: string[];
  frontmatter: Record<string, unknown>;
  created: string | null;
  updated: string | null;
} | null {
  const record = db
    .prepare(`SELECT path, title, type, status, tags, frontmatter, created, updated FROM documents WHERE path = ?`)
    .get(docPath) as Record<string, unknown> | null;
  if (!record) return null;

  // Column helpers: nullable text passes through; JSON columns only decode strings.
  const textOrNull = (value: unknown): string | null => (value as string | null) ?? null;
  const jsonSource = (value: unknown): string | null => (typeof value === "string" ? value : null);

  const tags = fromJson<string[]>(jsonSource(record.tags), []);
  const frontmatter = fromJson<Record<string, unknown>>(jsonSource(record.frontmatter), {});

  return {
    path: String(record.path),
    title: String(record.title ?? ""),
    type: textOrNull(record.type),
    status: textOrNull(record.status),
    tags,
    frontmatter,
    created: textOrNull(record.created),
    updated: textOrNull(record.updated),
  };
}
450
+
/**
 * List documents directly linked with `docPath`, in either direction: targets
 * it links to plus sources that link to it, de-duplicated by the SQL `UNION`.
 *
 * @param db      Open connection.
 * @param docPath Document whose neighbours are wanted.
 * @param limit   Maximum number of paths to return (default 5). The query has
 *                no ORDER BY, so which neighbours survive the cut is up to SQLite.
 * @returns Neighbouring document paths.
 */
export function getRelatedPaths(db: CoreDb, docPath: string, limit = 5): string[] {
  const neighbours = db
    .prepare(`
      SELECT target_path as path FROM edges WHERE source_path = ?
      UNION
      SELECT source_path as path FROM edges WHERE target_path = ?
      LIMIT ?
    `)
    .all(docPath, docPath, limit) as Array<{ path: string }>;

  const paths: string[] = [];
  for (const neighbour of neighbours) {
    paths.push(neighbour.path);
  }
  return paths;
}
460
+
/**
 * Return the chunk texts of one document, ordered by `chunk_index`.
 *
 * @param db      Open connection.
 * @param docPath Document whose chunks are wanted.
 * @returns Chunk texts in document order; empty when the document has no chunks.
 */
export function getChunkTexts(db: CoreDb, docPath: string): string[] {
  const select = db.prepare("SELECT chunk_text FROM chunks WHERE doc_path = ? ORDER BY chunk_index");
  const stored = select.all(docPath) as Array<{ chunk_text: string }>;

  const texts: string[] = [];
  for (const { chunk_text } of stored) {
    texts.push(chunk_text);
  }
  return texts;
}
465
+
/**
 * Fetch up to `limit` chunks whose `embedding` column is still NULL.
 *
 * @param db    Open connection.
 * @param limit Maximum number of chunks to return.
 * @returns Chunk id, owning document path and chunk text for each pending chunk.
 */
export function getChunksWithoutEmbeddings(db: CoreDb, limit: number): Array<{ id: number; docPath: string; chunkText: string }> {
  const selectPending = db.prepare(
    "SELECT id, doc_path as docPath, chunk_text as chunkText FROM chunks WHERE embedding IS NULL LIMIT ?"
  );
  const pending = selectPending.all(limit) as Array<{ id: number; docPath: string; chunkText: string }>;
  return pending;
}
471
+
/**
 * Persist freshly computed embeddings, all in one transaction.
 *
 * Each embedding is written to `chunks.embedding` and — when vec is available
 * on this connection — mirrored into `vec_chunks` under rowid == `chunks.id`.
 *
 * The mirror write is an explicit DELETE followed by a plain INSERT instead
 * of `INSERT OR REPLACE`: a rowid may already be occupied (for example by a
 * vector left over from an earlier embedding pass), and delete-then-insert
 * replaces it without depending on the vec0 virtual table honouring
 * `OR REPLACE` conflict resolution. The DELETE matches nothing when the rowid
 * is free.
 *
 * @param db      Open connection.
 * @param updates Chunk ids with their embedding buffers; an empty list is a no-op.
 */
export function storeChunkEmbeddings(db: CoreDb, updates: Array<{ id: number; embedding: Buffer }>): void {
  if (updates.length === 0) return;

  const setEmbedding = db.prepare("UPDATE chunks SET embedding = ? WHERE id = ?");
  const vecStatements = db.vecAvailable
    ? {
        remove: db.prepare("DELETE FROM vec_chunks WHERE rowid = ?"),
        insert: db.prepare("INSERT INTO vec_chunks(rowid, embedding) VALUES (?, ?)"),
      }
    : null;

  const tx = db.transaction(() => {
    for (const { id, embedding } of updates) {
      setEmbedding.run(embedding, id);
      if (vecStatements) {
        vecStatements.remove.run(id);
        vecStatements.insert.run(id, embedding);
      }
    }
  });
  tx();
}
489
+
/**
 * Copy embeddings that exist in `chunks.embedding` but not yet in
 * `vec_chunks` into the vec mirror.
 *
 * Per the original design note this is meant to run once at writer startup
 * when vec is available: databases that predate `vec_chunks` only carry their
 * embeddings in the legacy column. Afterwards the mirror is kept in sync by
 * storeChunkEmbeddings / upsertDocument / deleteMissingDocuments.
 *
 * Safe to call repeatedly: the SELECT only returns chunks with no matching
 * `vec_chunks` rowid, so a fully populated mirror yields no work. Rows are
 * processed in batches of `batchSize`, each batch in its own transaction, so
 * only one batch of embedding buffers is held at a time.
 *
 * @param db        Open connection.
 * @param batchSize Rows per batch (default 200).
 * @returns Number of rows copied; 0 when vec is unavailable on this connection.
 */
export function backfillVecChunks(db: CoreDb, batchSize = 200): number {
  if (!db.vecAvailable) return 0;

  // Only chunks NOT already mirrored; NOT EXISTS filters out rowids that
  // vec_chunks already holds.
  const selectMissing = db.prepare(
    `SELECT c.id, c.embedding
     FROM chunks c
     WHERE c.embedding IS NOT NULL
       AND NOT EXISTS (SELECT 1 FROM vec_chunks v WHERE v.rowid = c.id)
     LIMIT ?`,
  );
  const insert = db.prepare(
    "INSERT OR REPLACE INTO vec_chunks(rowid, embedding) VALUES (?, ?)",
  );

  let copied = 0;
  let batch: Array<{ id: number; embedding: Buffer }>;
  do {
    batch = selectMissing.all(batchSize) as Array<{ id: number; embedding: Buffer }>;
    if (batch.length > 0) {
      const current = batch;
      const writeBatch = db.transaction(() => {
        for (const { id, embedding } of current) insert.run(id, embedding);
      });
      writeBatch();
      copied += current.length;
    }
    // A short (or empty) batch means the SELECT has run dry.
  } while (batch.length > 0 && !(batch.length < batchSize));

  return copied;
}
528
+
/**
 * List the documents that link to `docPath`, ordered by source path.
 *
 * @param db      Open connection.
 * @param docPath Link target to look up.
 * @returns One entry per linking document, with the stored context snippet (may be null).
 */
export function getBacklinks(db: CoreDb, docPath: string): Array<{ path: string; contextSnippet: string | null }> {
  const selectIncoming = db.prepare(
    `SELECT source_path as path, context_snippet as contextSnippet FROM edges WHERE target_path = ? ORDER BY source_path ASC`,
  );
  const incoming = selectIncoming.all(docPath) as Array<{ path: string; contextSnippet: string | null }>;
  return incoming;
}
532
+
/**
 * Tell whether a document with the given path is present in `documents`.
 *
 * @param db      Open connection.
 * @param docPath Document path (primary key of `documents`).
 * @returns true when a row exists, false otherwise.
 */
export function documentExists(db: CoreDb, docPath: string): boolean {
  const probe = db.prepare("SELECT 1 FROM documents WHERE path = ? LIMIT 1");
  const hit = probe.get(docPath);
  return hit !== null && hit !== undefined;
}
536
+
/**
 * Return every row of `unresolved_links`, ordered by source path and then by
 * raw target.
 *
 * Per the original design note these are wikilinks whose target is missing or
 * whose basename is ambiguous across folders — a first-class vault health
 * signal surfaced through vault_map's opt-in `orphan_wikilinks` part and the
 * Index nightly health-check.
 *
 * @param db Open connection.
 * @returns Linking document, raw link target and reason for each unresolved link.
 */
export function getUnresolvedLinks(
  db: CoreDb,
): Array<{ source: string; target: string; reason: string }> {
  const selectAll = db.prepare(
    "SELECT source_path as source, raw_target as target, reason FROM unresolved_links ORDER BY source_path ASC, raw_target ASC",
  );
  const unresolved = selectAll.all() as Array<{ source: string; target: string; reason: string }>;
  return unresolved;
}