@agfpd/iapeer-memory-core 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +32 -0
- package/src/config.ts +257 -0
- package/src/context-render.ts +185 -0
- package/src/db.ts +550 -0
- package/src/embedding.ts +174 -0
- package/src/fm-update.ts +352 -0
- package/src/frontmatter-fill.ts +529 -0
- package/src/graph.ts +427 -0
- package/src/http-client.ts +129 -0
- package/src/human-edit-detect.ts +213 -0
- package/src/index-render.ts +876 -0
- package/src/index.ts +65 -0
- package/src/indexer.ts +323 -0
- package/src/log.ts +27 -0
- package/src/mcp-tools.ts +468 -0
- package/src/memoryd.ts +680 -0
- package/src/migrate-auto-memory.ts +289 -0
- package/src/parser.ts +269 -0
- package/src/permanent-detect.ts +110 -0
- package/src/render-doctrine.ts +113 -0
- package/src/reranker.ts +162 -0
- package/src/search.ts +806 -0
- package/src/smart-hash.ts +85 -0
- package/src/sqlite-loader.ts +151 -0
- package/src/tags-mirror.ts +47 -0
- package/src/taxonomy.ts +385 -0
- package/src/utils.ts +69 -0
- package/tsconfig.json +24 -0
package/src/db.ts
ADDED
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { Database } from "bun:sqlite";
|
|
4
|
+
// sqlite-vec ships its own .d.ts; runtime API is just `load(db)`.
|
|
5
|
+
import * as sqliteVec from "sqlite-vec";
|
|
6
|
+
import type { CoreConfig } from "./config.js";
|
|
7
|
+
import { prepareSqliteRuntime } from "./sqlite-loader.js";
|
|
8
|
+
import { fromJson, toJson } from "./utils.js";
|
|
9
|
+
|
|
10
|
+
/**
 * A `bun:sqlite` connection extended with a capability flag describing
 * whether the `sqlite-vec` mirror table can be used on it.
 */
export type CoreDb = Database & {
  /**
   * Whether the `sqlite-vec` virtual table is usable on this connection.
   *
   * `openDatabase` initialises the flag to false and flips it to true only
   * after the extension loaded and `vec_chunks` was created. While it is
   * false, `vec_chunks` must be treated as absent and `vectorSearch` has to
   * use the brute-force `SELECT embedding FROM chunks` path instead.
   */
  vecAvailable: boolean;
};
|
|
19
|
+
|
|
20
|
+
/**
 * A note's metadata in the shape `upsertDocument` writes to the `documents`
 * table. `tags` and `frontmatter` are serialised with `toJson` on the way in.
 */
export type IndexedDocumentRow = {
  /** Primary key of the `documents` row. */
  path: string;
  title: string;
  type: null | string;
  status: null | string;
  tags: Array<string>;
  /** Written to `content_hash`; read back by `getStoredHash`. */
  contentHash: string;
  frontmatter: Record<string, unknown>;
  created: null | string;
  updated: null | string;
  indexedAt: string;
};
|
|
32
|
+
|
|
33
|
+
/** A single per-document hit as returned by `searchDocuments`. */
export type SearchRow = {
  /** Path of the matching document. */
  path: string;
  /** Title taken from the `documents` row. */
  title: string;
  /** FTS5 `rank` of the document's best chunk; smaller values sort first. */
  score: number;
  /** Excerpt built by FTS5 `snippet()`, with matches wrapped in `[` … `]`. */
  snippet: string;
};
|
|
39
|
+
|
|
40
|
+
/** Optional switches accepted by `openDatabase`. */
export type OpenDatabaseOptions = {
  /**
   * Writer-only switch that lets `openDatabase` rebuild `vec_chunks` when the
   * configured embedding width changed.
   *
   * If a `vec_chunks` table already exists with a dimension different from
   * `config.embedding.dimensions` (the embedder was replaced by one with a
   * different output width — e.g. Qwen3-Embedding-8B@4096 →
   * Qwen3-Embedding-4B@2560), the table is DROPped so the subsequent CREATE
   * recreates it at the new width. `CREATE VIRTUAL TABLE IF NOT EXISTS` on
   * its own would keep the stale width forever; every embedding INSERT would
   * then fail with "Dimension mismatch ... Expected N ... received M" and
   * crash-loop the writer's vault scan. The same swap already invalidates
   * embeddings (`checkEmbeddingModelChanged` nulls `chunks.embedding`), so
   * nothing is lost by dropping the mirror — the re-embed pass refills both
   * tables.
   *
   * The default is false so that read-only MCP frontends (`src/server.ts`)
   * never mutate the table; the single writer daemon owns this migration. A
   * reader that opened during the swap window simply sees an empty or stale
   * `vec_chunks` and degrades to BM25 until the writer has re-embedded.
   */
  migrateVecDimension?: boolean;
};
|
|
60
|
+
|
|
61
|
+
/**
 * Open (creating if necessary) the index database and bring its schema up to
 * date.
 *
 * Steps, in order: create the parent directory, prepare the process-wide
 * sqlite runtime, open the connection in strict mode, lock down file
 * permissions, enable WAL and foreign keys, drop legacy-schema content
 * tables, create the core tables, and finally — when the runtime supports
 * extensions and an embedding config is present — load `sqlite-vec` and
 * create the `vec_chunks` mirror.
 *
 * @param config  Core configuration; `index.dbPath` and `embedding` are read.
 * @param options See `OpenDatabaseOptions`.
 * @returns The open connection, with `vecAvailable` reflecting whether
 *          `vec_chunks` is usable.
 * @throws Rethrows any error raised while applying PRAGMAs or creating the
 *         core schema; the connection is closed first so the handle does not
 *         leak. Failures in the `sqlite-vec` step are NOT thrown — they are
 *         reported on stderr and leave `vecAvailable` false.
 */
export function openDatabase(config: CoreConfig, options: OpenDatabaseOptions = {}): CoreDb {
  const dbPath = config.index.dbPath;
  fs.mkdirSync(path.dirname(dbPath), { recursive: true });

  // Process-wide: swap bun's stripped sqlite for one that supports extension
  // loading (homebrew on macOS, distro libsqlite3 on Linux). Idempotent —
  // safe to call from every openDatabase, the helper caches the decision.
  const runtime = prepareSqliteRuntime();

  // strict: true — bind named params (`@a`, `$a`, `:a`) by key without prefix.
  // Without strict mode bun silently inserts NULL for `VALUES (@a) RUN { a: "x" }`.
  const db = new Database(dbPath, { create: true, strict: true }) as CoreDb;
  db.vecAvailable = false;

  try {
    restrictDbFilePermissions(dbPath);
    db.run("PRAGMA journal_mode = WAL");
    db.run("PRAGMA foreign_keys = ON");
    dropLegacyDiaryTables(db);
    createCoreSchema(db);
  } catch (err) {
    // The caller never receives `db` when setup fails, so nobody else could
    // close it. Release the handle here, then surface the original error.
    try {
      db.close();
    } catch {
      // Ignore: the setup error is the one worth reporting.
    }
    throw err;
  }

  // If the runtime SQLite has no extension support (no homebrew sqlite, or
  // OMIT_LOAD_EXTENSION compiled in), the virtual table is skipped entirely
  // and search falls back to the brute-force path. That keeps the plugin
  // usable on machines that haven't installed a non-stripped libsqlite3.
  if (runtime.available && config.embedding) {
    enableVecChunks(db, config.embedding.dimensions, options.migrateVecDimension === true);
  }

  return db;
}

/**
 * Best-effort `chmod 0600` on the database file and its `-wal` / `-shm`
 * sidecars. The DB stores verbatim chunks of private vault content; on shared
 * systems the default mode (typically 0644) would expose it to other local
 * users.
 */
function restrictDbFilePermissions(dbPath: string): void {
  for (const suffix of ["", "-wal", "-shm"]) {
    try {
      fs.chmodSync(dbPath + suffix, 0o600);
    } catch {
      // Best effort — file may not exist yet (sidecars appear on first write)
      // or the filesystem may not support chmod (FAT32, some network mounts).
    }
  }
}

/**
 * Drop the four content tables when `documents` still carries the legacy
 * `source` / `agent_id` columns. `meta` is intentionally left alone so stored
 * fingerprints survive; the tables are recreated by `createCoreSchema`.
 */
function dropLegacyDiaryTables(db: CoreDb): void {
  const docCols = db.prepare("PRAGMA table_info(documents)").all() as Array<{ name: string }>;
  const hasLegacyDiary = docCols.some((c) => c.name === "source" || c.name === "agent_id");
  if (!hasLegacyDiary) return;
  db.exec(`
    DROP TABLE IF EXISTS documents;
    DROP TABLE IF EXISTS chunk_fts;
    DROP TABLE IF EXISTS chunks;
    DROP TABLE IF EXISTS edges;
  `);
}

/** Create every non-vector table of the index if it does not exist yet. */
function createCoreSchema(db: CoreDb): void {
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      path TEXT PRIMARY KEY,
      title TEXT,
      type TEXT,
      status TEXT,
      tags TEXT,
      content_hash TEXT,
      frontmatter TEXT,
      created TEXT,
      updated TEXT,
      indexed_at TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS chunk_fts USING fts5(
      chunk_text,
      doc_path UNINDEXED,
      chunk_index UNINDEXED,
      tokenize='porter unicode61'
    );

    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      doc_path TEXT NOT NULL,
      chunk_index INTEGER NOT NULL,
      chunk_text TEXT NOT NULL,
      embedding BLOB,
      UNIQUE(doc_path, chunk_index)
    );

    CREATE TABLE IF NOT EXISTS edges (
      source_path TEXT NOT NULL,
      target_path TEXT NOT NULL,
      context_snippet TEXT,
      PRIMARY KEY (source_path, target_path)
    );

    -- Wikilinks that could not be resolved to a real note. Kept first-class
    -- instead of being silently dropped from edges: a missing/ambiguous link
    -- is a vault health signal (surfaced via vault_map orphan_wikilinks +
    -- the Index nightly health-check). reason ∈ 'missing' | 'ambiguous'.
    CREATE TABLE IF NOT EXISTS unresolved_links (
      source_path TEXT NOT NULL,
      raw_target TEXT NOT NULL,
      reason TEXT NOT NULL,
      context_snippet TEXT,
      PRIMARY KEY (source_path, raw_target)
    );

    CREATE TABLE IF NOT EXISTS meta (
      key TEXT PRIMARY KEY,
      value TEXT
    );
  `);
}

/**
 * Load `sqlite-vec` and make sure the `vec_chunks` mirror exists at width
 * `dim`, setting `db.vecAvailable` on success. `vec_chunks.rowid` equals
 * `chunks.id`. Any failure is reported on stderr and leaves `vecAvailable`
 * false, so the degraded BM25 / brute-force path keeps working.
 */
function enableVecChunks(db: CoreDb, dim: number, migrate: boolean): void {
  try {
    sqliteVec.load(db);
    // Writer-only: an existing vec_chunks created at a different dimension
    // must be dropped first — IF NOT EXISTS would otherwise keep the old width
    // and every INSERT would fail. This has to happen after the extension is
    // loaded because DROP TABLE on a vec0 table needs the module resolvable.
    if (migrate && dropVecChunksIfDimensionMismatch(db, dim)) {
      process.stderr.write(
        `[iapeer-memory] vec_chunks dimension changed → float[${dim}]; dropped stale table, ` +
          `re-embed will repopulate it\n`,
      );
    }
    db.exec(
      `CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(embedding float[${dim}] distance_metric=cosine)`,
    );
    db.vecAvailable = true;
  } catch (err) {
    // Best effort — degraded BM25-only path still works. No logger is taken
    // here so the openDatabase signature stays stable for callers/tests.
    process.stderr.write(
      `[iapeer-memory] sqlite-vec load failed: ${String(err)} — vector search falls back to brute-force\n`,
    );
  }
}
|
|
208
|
+
|
|
209
|
+
/**
 * Report the vector width `vec_chunks` was created with.
 *
 * The width is read from the table's CREATE statement stored in
 * `sqlite_master` (the `float[N]` part). That reflected definition is treated
 * as the single source of truth; the width is deliberately not duplicated in
 * `meta`, so it cannot drift from the real column width.
 *
 * @param db Open connection to inspect.
 * @returns The parsed dimension, or null when the table is missing or its
 *          definition contains no `float[N]`.
 */
export function vecChunksDimension(db: Database): number | null {
  const lookup = db.prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='vec_chunks'");
  const found = lookup.get() as { sql?: string } | null;
  const ddl = found?.sql;
  if (!ddl) {
    return null;
  }
  const width = ddl.match(/float\[(\d+)\]/);
  if (!width) {
    return null;
  }
  return Number(width[1]);
}
|
|
224
|
+
|
|
225
|
+
/**
 * Remove `vec_chunks` if it exists at a width other than `dim`.
 *
 * vec0 tables own shadow tables (`vec_chunks_chunks`, `vec_chunks_rowids`, …);
 * dropping the virtual table cascades to them through vec0's xDestroy, so the
 * drop is a complete reset. The caller MUST have run `sqliteVec.load(db)`
 * beforehand because DROP needs the vec0 module to be resolvable.
 *
 * @param db  Connection with sqlite-vec already loaded.
 * @param dim Dimension the table is expected to have.
 * @returns true only when a stale table was dropped; false when the table is
 *          absent, its width cannot be determined, or it already matches.
 */
function dropVecChunksIfDimensionMismatch(db: CoreDb, dim: number): boolean {
  const onDisk = vecChunksDimension(db);
  const nothingToDo = onDisk === null || onDisk === dim;
  if (nothingToDo) {
    return false;
  }
  db.exec("DROP TABLE IF EXISTS vec_chunks");
  return true;
}
|
|
241
|
+
|
|
242
|
+
/**
 * Read one entry of the `meta` key/value table.
 *
 * @returns The stored value, or null when the key is absent or its value is
 *          NULL.
 */
export function getMeta(db: CoreDb, key: string): string | null {
  const lookup = db.prepare("SELECT value FROM meta WHERE key = ?");
  const hit = lookup.get(key) as { value?: string } | null;
  if (hit == null) {
    return null;
  }
  return hit.value ?? null;
}
|
|
246
|
+
|
|
247
|
+
/** Insert a `meta` entry, overwriting the value when the key already exists. */
export function setMeta(db: CoreDb, key: string, value: string): void {
  const upsert = db.prepare(
    "INSERT INTO meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value",
  );
  upsert.run(key, value);
}
|
|
250
|
+
|
|
251
|
+
/**
 * Detect an embedding-model change and invalidate stored embeddings.
 *
 * The fingerprint `model:dimensions` is compared with the value stored under
 * `embedding_fingerprint` in `meta`. When they differ (including the first
 * run, when nothing is stored) every `chunks.embedding` is set to NULL, the
 * matching `vec_chunks` rows are removed, and the new fingerprint is saved —
 * all inside one transaction so the three cannot get out of step.
 *
 * @param db     Open connection.
 * @param config Embedding settings, or null when embeddings are disabled (in
 *               which case nothing happens).
 * @returns true when a previously stored, different fingerprint was replaced
 *          (old embeddings invalidated); false when unchanged, on first run,
 *          or when `config` is null.
 */
export function checkEmbeddingModelChanged(db: CoreDb, config: { model: string; dimensions: number } | null): boolean {
  if (!config) return false;

  const fingerprint = `${config.model}:${config.dimensions}`;
  const stored = getMeta(db, "embedding_fingerprint");

  if (stored === fingerprint) return false;

  // Model changed (or first run) — clear all embeddings.
  const invalidate = db.transaction(() => {
    if (db.vecAvailable) {
      // Keep the KNN mirror in step with `chunks.embedding`. Without this, a
      // swap to a model of the SAME width leaves old-model vectors in
      // vec_chunks: searches keep ranking against them, and the re-embed pass
      // later writes onto rowids that still exist. Rows are removed by rowid
      // (rowid == chunks.id); deleting an id with no vec row is a no-op.
      const ids = db.prepare("SELECT id FROM chunks").all() as Array<{ id: number }>;
      const dropVec = db.prepare("DELETE FROM vec_chunks WHERE rowid = ?");
      for (const { id } of ids) dropVec.run(id);
    }
    db.prepare("UPDATE chunks SET embedding = NULL").run();
    setMeta(db, "embedding_fingerprint", fingerprint);
  });
  invalidate();

  return stored !== null; // true = invalidated old embeddings, false = first run
}
|
|
270
|
+
|
|
271
|
+
/**
 * Detect a parser-fingerprint change and force a full re-parse.
 *
 * The fingerprint is bumped by hand whenever the chunking algorithm changes
 * what ends up on disk — e.g. adding a title prefix to chunk[0]. When the
 * stored value differs from `version`, `content_hash` is set to NULL on every
 * document so that indexFile's "skip if hash matches" short-circuit no longer
 * fires and each file is parsed again on the next startup.
 *
 * @param db      Open connection.
 * @param version Current parser fingerprint.
 * @returns true when a different fingerprint had been stored before (an
 *          existing index was invalidated); false when the fingerprint is
 *          unchanged or this is the first run.
 */
export function checkParserChanged(db: CoreDb, version: string): boolean {
  const previous = getMeta(db, "parser_fingerprint");
  if (previous === version) {
    return false;
  }

  const clearHashes = db.prepare("UPDATE documents SET content_hash = NULL");
  clearHashes.run();
  setMeta(db, "parser_fingerprint", version);

  const hadEarlierFingerprint = previous !== null;
  return hadEarlierFingerprint;
}
|
|
291
|
+
|
|
292
|
+
/**
 * Look up the `content_hash` recorded for a document.
 *
 * @returns The stored hash, or null when the document is unknown or its hash
 *          is NULL.
 */
export function getStoredHash(db: CoreDb, docPath: string): string | null {
  const lookup = db.prepare("SELECT content_hash FROM documents WHERE path = ?");
  const hit = lookup.get(docPath) as { content_hash?: string } | null;
  if (hit == null) {
    return null;
  }
  return hit.content_hash ?? null;
}
|
|
296
|
+
|
|
297
|
+
/**
 * Remove every `vec_chunks` row belonging to one document. Called before a
 * document is re-chunked (upsert) or removed. Does nothing when vec is not
 * available on this connection.
 *
 * `vec_chunks` is keyed by rowid only and has no `doc_path` column, so the
 * rowids are looked up through `chunks`. Because those `chunks.id` values are
 * the deletion key, this must run BEFORE the `chunks` rows themselves are
 * deleted.
 */
function deleteVecChunksByDoc(db: CoreDb, docPath: string): void {
  if (db.vecAvailable) {
    const owned = db.prepare("SELECT id FROM chunks WHERE doc_path = ?").all(docPath) as Array<{ id: number }>;
    if (owned.length > 0) {
      const dropRow = db.prepare("DELETE FROM vec_chunks WHERE rowid = ?");
      owned.forEach(({ id }) => {
        dropRow.run(id);
      });
    }
  }
}
|
|
314
|
+
|
|
315
|
+
/**
 * Write a document and everything derived from it in a single transaction.
 *
 * The `documents` row is inserted or updated, then the document's previous
 * FTS rows, chunks, outgoing edges and unresolved links are deleted and the
 * supplied chunks and links are inserted. Vector rows of the old chunks are
 * removed first.
 *
 * @param row    Document metadata; `tags` and `frontmatter` are stored as JSON.
 * @param chunks Chunk texts with their indexes, written to both `chunk_fts`
 *               and `chunks` (embeddings are left NULL here).
 * @param links  Outgoing links, inserted into `edges`; duplicates of the same
 *               (source, target) pair are ignored.
 */
export function upsertDocument(db: CoreDb, row: IndexedDocumentRow, chunks: { chunkIndex: number; text: string }[], links: { target: string; contextSnippet: string }[]): void {
  const docPath = row.path;

  const writeAll = db.transaction(() => {
    // The old chunks' vec_chunks rows have to go BEFORE the chunks rows are
    // deleted — the rowid mapping lives in `chunks.id` and is unrecoverable
    // afterwards.
    deleteVecChunksByDoc(db, docPath);

    const writeDoc = db.prepare(
      `INSERT INTO documents (
        path, title, type, status, tags, content_hash, frontmatter, created, updated, indexed_at
      ) VALUES (
        @path, @title, @type, @status, @tags, @contentHash, @frontmatter, @created, @updated, @indexedAt
      )
      ON CONFLICT(path) DO UPDATE SET
        title=excluded.title,
        type=excluded.type,
        status=excluded.status,
        tags=excluded.tags,
        content_hash=excluded.content_hash,
        frontmatter=excluded.frontmatter,
        created=excluded.created,
        updated=excluded.updated,
        indexed_at=excluded.indexed_at`
    );
    writeDoc.run({
      path: docPath,
      title: row.title,
      type: row.type,
      status: row.status,
      tags: toJson(row.tags),
      contentHash: row.contentHash,
      frontmatter: toJson(row.frontmatter),
      created: row.created,
      updated: row.updated,
      indexedAt: row.indexedAt,
    });

    // Clear everything previously derived from this document.
    const purgeStatements = [
      "DELETE FROM chunk_fts WHERE doc_path = ?",
      "DELETE FROM chunks WHERE doc_path = ?",
      "DELETE FROM edges WHERE source_path = ?",
      "DELETE FROM unresolved_links WHERE source_path = ?",
    ];
    for (const sql of purgeStatements) {
      db.prepare(sql).run(docPath);
    }

    const addFtsRow = db.prepare("INSERT INTO chunk_fts (chunk_text, doc_path, chunk_index) VALUES (?, ?, ?)");
    const addChunkRow = db.prepare("INSERT INTO chunks (doc_path, chunk_index, chunk_text) VALUES (?, ?, ?)");
    chunks.forEach(({ chunkIndex, text }) => {
      addFtsRow.run(text, docPath, chunkIndex);
      addChunkRow.run(docPath, chunkIndex, text);
    });

    const addEdge = db.prepare("INSERT OR IGNORE INTO edges (source_path, target_path, context_snippet) VALUES (?, ?, ?)");
    links.forEach(({ target, contextSnippet }) => {
      addEdge.run(docPath, target, contextSnippet);
    });
  });

  writeAll();
}
|
|
370
|
+
|
|
371
|
+
/**
 * Purge every indexed document whose path is not in `existingPaths`.
 *
 * For each stale document its vector rows, `documents` row, FTS rows,
 * outgoing and incoming edges, unresolved links and chunks are deleted, all
 * inside one transaction.
 *
 * @param existingPaths Paths that are still present and must be kept.
 * @returns Number of documents removed (0 when nothing was stale).
 */
export function deleteMissingDocuments(db: CoreDb, existingPaths: Set<string>): number {
  const indexed = db.prepare("SELECT path FROM documents").all() as Array<{ path: string }>;
  const stale: string[] = [];
  for (const entry of indexed) {
    if (!existingPaths.has(entry.path)) {
      stale.push(entry.path);
    }
  }
  if (stale.length === 0) {
    return 0;
  }

  const purge = db.transaction(() => {
    // Prepared once, executed in this order for each stale document.
    const steps = [
      db.prepare("DELETE FROM documents WHERE path = ?"),
      db.prepare("DELETE FROM chunk_fts WHERE doc_path = ?"),
      db.prepare("DELETE FROM edges WHERE source_path = ?"),
      db.prepare("DELETE FROM edges WHERE target_path = ?"),
      db.prepare("DELETE FROM unresolved_links WHERE source_path = ?"),
      db.prepare("DELETE FROM chunks WHERE doc_path = ?"),
    ];
    for (const docPath of stale) {
      // vec_chunks goes first: its rowids are resolved through the chunks
      // rows, which are deleted as the last step below.
      deleteVecChunksByDoc(db, docPath);
      for (const step of steps) {
        step.run(docPath);
      }
    }
  });

  purge();
  return stale.length;
}
|
|
398
|
+
|
|
399
|
+
/**
 * Full-text search over chunk text, returning at most one hit per document.
 *
 * FTS5 ranks individual chunks, so three times `limit` rows are fetched and
 * then collapsed by path, keeping each document's best (lowest) rank.
 *
 * @param params.query FTS5 MATCH expression, passed through as-is.
 * @param params.limit Maximum number of documents to return.
 * @returns Hits ordered by ascending score, truncated to `limit`.
 */
export function searchDocuments(db: CoreDb, params: { query: string; limit: number }): SearchRow[] {
  const ftsQuery = db.prepare(`
    SELECT
      d.path as path,
      d.title as title,
      rank as score,
      snippet(chunk_fts, 0, '[', ']', ' … ', 18) as snippet
    FROM chunk_fts
    JOIN documents d ON d.path = chunk_fts.doc_path
    WHERE chunk_fts MATCH ?
    ORDER BY rank ASC
    LIMIT ?
  `);

  // Over-fetch: several chunks of the same document may occupy the top rows.
  const overFetch = params.limit * 3;
  const hits = ftsQuery.all(params.query, overFetch) as SearchRow[];

  const bestByPath = new Map<string, SearchRow>();
  hits.forEach((hit) => {
    const prior = bestByPath.get(hit.path);
    if (prior && !(hit.score < prior.score)) {
      return; // an equal or better chunk of this document is already kept
    }
    bestByPath.set(hit.path, hit);
  });

  const ranked = Array.from(bestByPath.values());
  ranked.sort((left, right) => left.score - right.score);
  return ranked.slice(0, params.limit);
}
|
|
426
|
+
|
|
427
|
+
/**
 * Load a document's metadata from the `documents` table.
 *
 * `tags` and `frontmatter` are decoded with `fromJson`, falling back to an
 * empty array / empty object; a NULL title becomes the empty string.
 *
 * @returns The decoded metadata, or null when no row exists for `docPath`.
 */
export function getDocumentMeta(db: CoreDb, docPath: string): {
  path: string;
  title: string;
  type: string | null;
  status: string | null;
  tags: string[];
  frontmatter: Record<string, unknown>;
  created: string | null;
  updated: string | null;
} | null {
  const lookup = db.prepare(`SELECT path, title, type, status, tags, frontmatter, created, updated FROM documents WHERE path = ?`);
  const record = lookup.get(docPath) as Record<string, unknown> | null;
  if (!record) {
    return null;
  }

  // Nullable TEXT columns: normalise undefined/NULL to null.
  const textOrNull = (value: unknown): string | null => (value as string | null) ?? null;
  // JSON columns: only hand actual strings to the decoder.
  const jsonSource = (value: unknown): string | null => (typeof value === "string" ? value : null);

  return {
    path: String(record.path),
    title: String(record.title ?? ""),
    type: textOrNull(record.type),
    status: textOrNull(record.status),
    tags: fromJson<string[]>(jsonSource(record.tags), []),
    frontmatter: fromJson<Record<string, unknown>>(jsonSource(record.frontmatter), {}),
    created: textOrNull(record.created),
    updated: textOrNull(record.updated),
  };
}
|
|
450
|
+
|
|
451
|
+
/**
 * List documents directly connected to `docPath` through `edges`, in either
 * direction (link targets and link sources), without duplicates.
 *
 * @param limit Maximum number of paths to return (default 5).
 */
export function getRelatedPaths(db: CoreDb, docPath: string, limit = 5): string[] {
  const neighbours = db.prepare(`
    SELECT target_path as path FROM edges WHERE source_path = ?
    UNION
    SELECT source_path as path FROM edges WHERE target_path = ?
    LIMIT ?
  `);
  const found = neighbours.all(docPath, docPath, limit) as Array<{ path: string }>;
  const paths: string[] = [];
  for (const entry of found) {
    paths.push(entry.path);
  }
  return paths;
}
|
|
460
|
+
|
|
461
|
+
/** Return the chunk texts of a document, ordered by `chunk_index`. */
export function getChunkTexts(db: CoreDb, docPath: string): string[] {
  const query = db.prepare("SELECT chunk_text FROM chunks WHERE doc_path = ? ORDER BY chunk_index");
  const found = query.all(docPath) as Array<{ chunk_text: string }>;
  const texts: string[] = [];
  for (const { chunk_text } of found) {
    texts.push(chunk_text);
  }
  return texts;
}
|
|
465
|
+
|
|
466
|
+
/**
 * Fetch up to `limit` chunks whose `embedding` is still NULL, i.e. the ones
 * that have not been embedded yet.
 */
export function getChunksWithoutEmbeddings(db: CoreDb, limit: number): Array<{ id: number; docPath: string; chunkText: string }> {
  const pending = db.prepare(
    "SELECT id, doc_path as docPath, chunk_text as chunkText FROM chunks WHERE embedding IS NULL LIMIT ?"
  );
  const found = pending.all(limit) as Array<{ id: number; docPath: string; chunkText: string }>;
  return found;
}
|
|
471
|
+
|
|
472
|
+
/**
 * Persist freshly computed embeddings.
 *
 * Each embedding is written to `chunks.embedding` and, when vec is available
 * on this connection, mirrored into `vec_chunks` under rowid == `chunks.id`.
 * All updates are applied in a single transaction.
 *
 * @param updates Chunk ids with their embedding buffers. An empty list is a
 *                no-op and opens no transaction.
 */
export function storeChunkEmbeddings(db: CoreDb, updates: Array<{ id: number; embedding: Buffer }>): void {
  if (updates.length === 0) return;

  const setEmbedding = db.prepare("UPDATE chunks SET embedding = ? WHERE id = ?");
  // A rowid may already be present in vec_chunks (for instance a chunk being
  // re-embedded). The stale row is replaced with an explicit DELETE followed
  // by a plain INSERT rather than `INSERT OR REPLACE`: a virtual table only
  // honours conflict clauses if its module implements them, whereas
  // delete-then-insert behaves the same either way. Deleting a rowid that is
  // not present is a no-op.
  const vecDelete = db.vecAvailable ? db.prepare("DELETE FROM vec_chunks WHERE rowid = ?") : null;
  const vecInsert = db.vecAvailable
    ? db.prepare("INSERT INTO vec_chunks(rowid, embedding) VALUES (?, ?)")
    : null;

  const tx = db.transaction(() => {
    for (const { id, embedding } of updates) {
      setEmbedding.run(embedding, id);
      if (vecDelete && vecInsert) {
        vecDelete.run(id);
        vecInsert.run(id, embedding);
      }
    }
  });
  tx();
}
|
|
489
|
+
|
|
490
|
+
/**
 * Copy embeddings from `chunks.embedding` into `vec_chunks` for every chunk
 * that has an embedding but no vector row yet.
 *
 * Intended for writer startup when vec is available: databases created
 * before `vec_chunks` existed hold their embeddings only in the legacy
 * column. Afterwards the mirror is kept current by storeChunkEmbeddings /
 * upsertDocument / deleteMissingDocuments.
 *
 * Idempotent — once every embedded chunk has a vector row the first query
 * returns nothing. Rows are processed in batches, one transaction per batch,
 * to keep the heap bounded.
 *
 * @param batchSize Maximum rows copied per batch (default 200).
 * @returns Number of rows copied; 0 when vec is unavailable.
 */
export function backfillVecChunks(db: CoreDb, batchSize = 200): number {
  if (!db.vecAvailable) {
    return 0;
  }

  // Restrict to chunks with no vec_chunks row, so a fully populated mirror
  // yields an empty result straight away.
  const selectMissing = db.prepare(
    `SELECT c.id, c.embedding
     FROM chunks c
     WHERE c.embedding IS NOT NULL
       AND NOT EXISTS (SELECT 1 FROM vec_chunks v WHERE v.rowid = c.id)
     LIMIT ?`,
  );
  const insert = db.prepare(
    "INSERT OR REPLACE INTO vec_chunks(rowid, embedding) VALUES (?, ?)",
  );

  let copied = 0;
  let batch: Array<{ id: number; embedding: Buffer }>;
  do {
    batch = selectMissing.all(batchSize) as Array<{ id: number; embedding: Buffer }>;
    if (batch.length === 0) {
      break;
    }
    const pending = batch;
    const writeBatch = db.transaction(() => {
      pending.forEach((entry) => {
        insert.run(entry.id, entry.embedding);
      });
    });
    writeBatch();
    copied += pending.length;
    // A short batch means the source is exhausted; skip the extra query.
  } while (batch.length >= batchSize);

  return copied;
}
|
|
528
|
+
|
|
529
|
+
/**
 * List the documents that link to `docPath`, together with the stored context
 * snippet of each link, ordered by source path.
 */
export function getBacklinks(db: CoreDb, docPath: string): Array<{ path: string; contextSnippet: string | null }> {
  const incoming = db.prepare(`SELECT source_path as path, context_snippet as contextSnippet FROM edges WHERE target_path = ? ORDER BY source_path ASC`);
  const found = incoming.all(docPath) as Array<{ path: string; contextSnippet: string | null }>;
  return found;
}
|
|
532
|
+
|
|
533
|
+
/** Tell whether a `documents` row exists for `docPath`. */
export function documentExists(db: CoreDb, docPath: string): boolean {
  const probe = db.prepare("SELECT 1 FROM documents WHERE path = ? LIMIT 1");
  const found = probe.get(docPath);
  return found !== null && found !== undefined;
}
|
|
536
|
+
|
|
537
|
+
/**
 * List all unresolved wikilinks (target missing, or basename ambiguous across
 * folders), ordered by source path and then by raw target.
 *
 * These are kept as a first-class health signal and are surfaced through
 * vault_map's opt-in `orphan_wikilinks` part and the Index nightly
 * health-check.
 */
export function getUnresolvedLinks(
  db: CoreDb,
): Array<{ source: string; target: string; reason: string }> {
  const query = db.prepare(
    "SELECT source_path as source, raw_target as target, reason FROM unresolved_links ORDER BY source_path ASC, raw_target ASC",
  );
  const found = query.all() as Array<{ source: string; target: string; reason: string }>;
  return found;
}
|