context-vault 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +383 -0
- package/bin/cli.js +588 -0
- package/package.json +30 -0
- package/smithery.yaml +10 -0
- package/src/capture/README.md +23 -0
- package/src/capture/file-ops.js +75 -0
- package/src/capture/formatters.js +29 -0
- package/src/capture/index.js +91 -0
- package/src/core/README.md +20 -0
- package/src/core/categories.js +50 -0
- package/src/core/config.js +76 -0
- package/src/core/files.js +114 -0
- package/src/core/frontmatter.js +108 -0
- package/src/core/status.js +105 -0
- package/src/index/README.md +28 -0
- package/src/index/db.js +138 -0
- package/src/index/embed.js +56 -0
- package/src/index/index.js +258 -0
- package/src/retrieve/README.md +19 -0
- package/src/retrieve/index.js +173 -0
- package/src/server/README.md +44 -0
- package/src/server/helpers.js +29 -0
- package/src/server/index.js +82 -0
- package/src/server/tools.js +211 -0
- package/ui/Context.applescript +36 -0
- package/ui/index.html +1377 -0
- package/ui/serve.js +473 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Index Layer
|
|
2
|
+
|
|
3
|
+
The sync layer. Owns the SQLite database as a derived index. Handles both single-entry indexing (write-through from capture) and bulk sync (reindex from disk).
|
|
4
|
+
|
|
5
|
+
## Public API (`index.js`)
|
|
6
|
+
|
|
7
|
+
```js
|
|
8
|
+
indexEntry(ctx, entry) → Promise<void> // Index a single entry after capture
|
|
9
|
+
reindex(ctx, { fullSync }) → Promise<stats> // Bulk sync vault dir ↔ database
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
- `indexEntry` — called immediately after `writeEntry()` in the server coordinator. Inserts the row and generates a vector embedding.
|
|
13
|
+
- `reindex` — walks the vault directory, diffs against DB state, and adds/updates/removes entries. `fullSync: true` enables updates and deletions; `false` is add-only.
|
|
14
|
+
|
|
15
|
+
## Internal
|
|
16
|
+
|
|
17
|
+
| File | Purpose |
|
|
18
|
+
|------|---------|
|
|
19
|
+
| `db.js` | Schema DDL (v5), `initDatabase()`, `prepareStatements()`, `insertVec()`, `deleteVec()` |
|
|
20
|
+
| `embed.js` | HuggingFace `all-MiniLM-L6-v2` embedding via `@huggingface/transformers`. Lazy-loaded singleton. |
|
|
21
|
+
|
|
22
|
+
## Dependency Rule
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
index/ → core/ (only)
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Never import from `capture/`, `retrieve/`, or `server/`.
|
package/src/index/db.js
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* db.js — Database schema, initialization, and prepared statements
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import Database from "better-sqlite3";
|
|
6
|
+
import * as sqliteVec from "sqlite-vec";
|
|
7
|
+
import { unlinkSync } from "node:fs";
|
|
8
|
+
|
|
9
|
+
// ─── Schema DDL (v5 — categories) ───────────────────────────────────────────

/**
 * Complete schema for the derived index database, applied via `db.exec()` on a
 * fresh database (see `initDatabase`, which stamps `user_version = 5` after).
 *
 * Tables:
 *  - `vault`      — one row per captured entry; `file_path` is UNIQUE (one row
 *                   per on-disk file) and `(kind, identity_key)` is UNIQUE when
 *                   `identity_key` is present (entity upsert key).
 *  - `vault_fts`  — external-content FTS5 table over title/body/tags/kind,
 *                   kept in sync with `vault` by the ai/ad/au triggers below
 *                   (external-content tables require explicit 'delete' rows).
 *  - `vault_vec`  — sqlite-vec vec0 virtual table holding one 384-dim float32
 *                   embedding per vault rowid (all-MiniLM-L6-v2 output size).
 *
 * NOTE: this is a SQL string — comments inside the backticks are SQL comments
 * and ship to the database verbatim.
 */
export const SCHEMA_DDL = `
CREATE TABLE IF NOT EXISTS vault (
  id TEXT PRIMARY KEY,
  kind TEXT NOT NULL,
  category TEXT NOT NULL DEFAULT 'knowledge',
  title TEXT,
  body TEXT NOT NULL,
  meta TEXT,
  tags TEXT,
  source TEXT,
  file_path TEXT UNIQUE,
  identity_key TEXT,
  expires_at TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_vault_kind ON vault(kind);
CREATE INDEX IF NOT EXISTS idx_vault_category ON vault(category);
CREATE INDEX IF NOT EXISTS idx_vault_category_created ON vault(category, created_at DESC);
CREATE UNIQUE INDEX IF NOT EXISTS idx_vault_identity ON vault(kind, identity_key) WHERE identity_key IS NOT NULL;

-- Single FTS5 table
CREATE VIRTUAL TABLE IF NOT EXISTS vault_fts USING fts5(
  title, body, tags, kind,
  content='vault', content_rowid='rowid'
);

-- FTS sync triggers
CREATE TRIGGER IF NOT EXISTS vault_ai AFTER INSERT ON vault BEGIN
  INSERT INTO vault_fts(rowid, title, body, tags, kind)
  VALUES (new.rowid, new.title, new.body, new.tags, new.kind);
END;
CREATE TRIGGER IF NOT EXISTS vault_ad AFTER DELETE ON vault BEGIN
  INSERT INTO vault_fts(vault_fts, rowid, title, body, tags, kind)
  VALUES ('delete', old.rowid, old.title, old.body, old.tags, old.kind);
END;
CREATE TRIGGER IF NOT EXISTS vault_au AFTER UPDATE ON vault BEGIN
  INSERT INTO vault_fts(vault_fts, rowid, title, body, tags, kind)
  VALUES ('delete', old.rowid, old.title, old.body, old.tags, old.kind);
  INSERT INTO vault_fts(rowid, title, body, tags, kind)
  VALUES (new.rowid, new.title, new.body, new.tags, new.kind);
END;

-- Single vec table (384-dim float32 for all-MiniLM-L6-v2)
CREATE VIRTUAL TABLE IF NOT EXISTS vault_vec USING vec0(embedding float[384]);
`;
|
|
57
|
+
|
|
58
|
+
// ─── Database Init ───────────────────────────────────────────────────────────

// Current schema version, stamped into SQLite's `user_version` pragma.
const SCHEMA_VERSION = 5;

/**
 * Open a SQLite connection with the standard pragmas and the sqlite-vec
 * extension loaded. Factored out because initDatabase needs this twice
 * (initial open, and re-open after a destructive rebuild).
 *
 * @param {string} dbPath — path to the database file
 * @returns {Database} open connection
 * @throws if the sqlite-vec native module cannot be loaded
 */
function openConnection(dbPath) {
  const db = new Database(dbPath);
  db.pragma("journal_mode = WAL");
  db.pragma("foreign_keys = ON");
  try {
    sqliteVec.load(db);
  } catch (e) {
    console.error(`[context-mcp] Failed to load sqlite-vec native module.`);
    console.error(`[context-mcp] This usually means prebuilt binaries aren't available for your platform.`);
    console.error(`[context-mcp] Try: npm rebuild sqlite-vec`);
    console.error(`[context-mcp] Error: ${e.message}`);
    throw e;
  }
  return db;
}

/**
 * Open (and if necessary create or rebuild) the index database.
 *
 * The DB is a derived index — the markdown vault on disk is the source of
 * truth — so an outdated schema (0 < user_version < 5) is handled by deleting
 * the database files (including WAL/SHM sidecars) and recreating from scratch;
 * a subsequent reindex repopulates it.
 *
 * @param {string} dbPath — path to the database file
 * @returns {Database} connection with schema at SCHEMA_VERSION
 * @throws if sqlite-vec cannot be loaded
 */
export function initDatabase(dbPath) {
  let db = openConnection(dbPath);
  const version = db.pragma("user_version", { simple: true });

  // Enforce fresh-DB-only — old schemas get a full rebuild
  if (version > 0 && version < SCHEMA_VERSION) {
    console.error(`[context-mcp] Schema v${version} is outdated. Rebuilding database...`);
    db.close();
    unlinkSync(dbPath);
    // WAL/SHM sidecars may not exist; ignore ENOENT
    try { unlinkSync(dbPath + "-wal"); } catch {}
    try { unlinkSync(dbPath + "-shm"); } catch {}
    db = openConnection(dbPath);
  }

  // Fresh database (user_version 0, possibly after a rebuild): apply schema.
  if (db.pragma("user_version", { simple: true }) < SCHEMA_VERSION) {
    db.exec(SCHEMA_DDL);
    db.pragma(`user_version = ${SCHEMA_VERSION}`);
  }

  return db;
}
|
|
107
|
+
|
|
108
|
+
// ─── Prepared Statements Factory ─────────────────────────────────────────────

/**
 * Compile the fixed set of statements the index layer reuses.
 *
 * @param {Database} db — open better-sqlite3 connection
 * @returns {Object<string, Statement>} named prepared statements
 */
export function prepareStatements(db) {
  // SQL text kept in one table so the statement set is easy to audit.
  const sql = {
    insertEntry: `INSERT INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
    updateEntry: `UPDATE vault SET title = ?, body = ?, meta = ?, tags = ?, source = ?, category = ?, identity_key = ?, expires_at = ? WHERE file_path = ?`,
    deleteEntry: `DELETE FROM vault WHERE id = ?`,
    getRowid: `SELECT rowid FROM vault WHERE id = ?`,
    getRowidByPath: `SELECT rowid FROM vault WHERE file_path = ?`,
    getByIdentityKey: `SELECT * FROM vault WHERE kind = ? AND identity_key = ?`,
    upsertByIdentityKey: `UPDATE vault SET title = ?, body = ?, meta = ?, tags = ?, source = ?, category = ?, file_path = ?, expires_at = ? WHERE kind = ? AND identity_key = ?`,
    insertVecStmt: `INSERT INTO vault_vec (rowid, embedding) VALUES (?, ?)`,
    deleteVecStmt: `DELETE FROM vault_vec WHERE rowid = ?`,
  };

  const stmts = {};
  for (const [name, text] of Object.entries(sql)) {
    stmts[name] = db.prepare(text);
  }
  return stmts;
}
|
|
123
|
+
|
|
124
|
+
// ─── Vector Helpers (parameterized rowid via cached statements) ──────────────

/**
 * Insert an embedding row keyed by the vault rowid.
 *
 * sqlite-vec requires BigInt for the primary key — better-sqlite3 binds a
 * Number as REAL, but vec0 virtual tables only accept INTEGER rowids.
 *
 * @param {{ insertVecStmt: Statement }} stmts — prepared statements
 * @param {number|bigint} rowid — positive vault rowid
 * @param {Float32Array} embedding — vector to store
 * @throws {Error} if rowid is below 1
 */
export function insertVec(stmts, rowid, embedding) {
  const key = BigInt(rowid);
  if (key < 1n) {
    throw new Error(`Invalid rowid: ${rowid}`);
  }
  stmts.insertVecStmt.run(key, embedding);
}
|
|
133
|
+
|
|
134
|
+
/**
 * Remove the embedding row for a vault rowid (BigInt-keyed; see insertVec).
 *
 * @param {{ deleteVecStmt: Statement }} stmts — prepared statements
 * @param {number|bigint} rowid — positive vault rowid
 * @throws {Error} if rowid is below 1
 */
export function deleteVec(stmts, rowid) {
  const key = BigInt(rowid);
  if (key < 1n) {
    throw new Error(`Invalid rowid: ${rowid}`);
  }
  stmts.deleteVecStmt.run(key);
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embed.js — Text embedding via HuggingFace transformers
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { pipeline } from "@huggingface/transformers";
|
|
6
|
+
|
|
7
|
+
let extractor = null;
|
|
8
|
+
|
|
9
|
+
/**
 * Lazily build and cache the feature-extraction pipeline singleton.
 * The model is fetched from the HuggingFace hub on first use.
 *
 * @returns {Promise<Pipeline>} the cached extractor
 * @throws rethrows pipeline construction failures after logging hints
 */
async function ensurePipeline() {
  if (extractor) return extractor;
  try {
    extractor = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
  } catch (e) {
    console.error(`[context-mcp] Failed to load embedding model: ${e.message}`);
    console.error(`[context-mcp] The model (~80 MB) is downloaded on first run.`);
    console.error(`[context-mcp] Check: network connectivity, disk space, Node.js >=20`);
    throw e;
  }
  return extractor;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Embed a single text into a 384-dim vector (mean-pooled, normalized).
 *
 * @param {string} text — input to embed
 * @returns {Promise<Float32Array>} embedding vector
 * @throws {Error} when the pipeline yields an empty result (and resets the
 *                 cached pipeline so the next call re-initializes — P5)
 */
export async function embed(text) {
  const pipe = await ensurePipeline();
  const output = await pipe([text], { pooling: "mean", normalize: true });
  const data = output?.data;
  if (!data?.length) {
    // P5 health check: drop the cached pipeline and surface the failure.
    extractor = null;
    throw new Error("Embedding pipeline returned empty result");
  }
  return new Float32Array(data);
}
|
|
33
|
+
|
|
34
|
+
/**
 * P4: Batch embedding — embed multiple texts in a single pipeline call.
 * Returns an array of Float32Array embeddings (one per input text).
 *
 * Each returned array owns its own storage. (The previous implementation
 * built views with `new Float32Array(result.data.buffer, i * dim * 4, dim)`,
 * which ignored `result.data.byteOffset`, hard-coded a 4-byte element size,
 * and aliased all embeddings onto one shared buffer — wrong whenever the
 * tensor data is a view or is not Float32-backed.)
 *
 * @param {string[]} texts — inputs to embed; empty array short-circuits
 * @returns {Promise<Float32Array[]>} one embedding per input, in order
 * @throws {Error} on empty pipeline output (resets the cached pipeline — P5)
 *                 or when the flat output length is not divisible by texts.length
 */
export async function embedBatch(texts) {
  if (!texts.length) return [];
  const ext = await ensurePipeline();
  const result = await ext(texts, { pooling: "mean", normalize: true });
  if (!result?.data?.length) {
    extractor = null;
    throw new Error("Embedding pipeline returned empty result");
  }
  const dim = result.data.length / texts.length;
  if (!Number.isInteger(dim) || dim <= 0) {
    throw new Error(`Unexpected embedding dimension: ${result.data.length} / ${texts.length} = ${dim}`);
  }
  // Element-wise copy: independent of the backing buffer's dtype and offset.
  return texts.map((_, i) => {
    const row = new Float32Array(dim);
    const base = i * dim;
    for (let j = 0; j < dim; j++) row[j] = result.data[base + j];
    return row;
  });
}
|
|
52
|
+
|
|
53
|
+
/**
 * P5: Force re-initialization on next embed call.
 * Drops the cached pipeline singleton; `ensurePipeline()` rebuilds it lazily.
 */
export function resetEmbedPipeline() {
  extractor = null;
}
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Index Layer — Public API
|
|
3
|
+
*
|
|
4
|
+
* Owns the database as a derived index. Handles both bulk sync (reindex)
|
|
5
|
+
* and single-entry indexing (indexEntry) for write-through capture.
|
|
6
|
+
*
|
|
7
|
+
* Agent Constraint: Can import ../core. Owns db.js and embed.js.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync, readdirSync, existsSync } from "node:fs";
|
|
11
|
+
import { join, basename } from "node:path";
|
|
12
|
+
import { dirToKind, walkDir, ulid } from "../core/files.js";
|
|
13
|
+
import { categoryFor, CATEGORY_DIRS } from "../core/categories.js";
|
|
14
|
+
import { parseFrontmatter, parseEntryFromMarkdown } from "../core/frontmatter.js";
|
|
15
|
+
import { embedBatch } from "./embed.js";
|
|
16
|
+
|
|
17
|
+
const EXCLUDED_DIRS = new Set(["projects", "_archive"]);
|
|
18
|
+
const EXCLUDED_FILES = new Set(["context.md", "memory.md", "README.md"]);
|
|
19
|
+
|
|
20
|
+
const EMBED_BATCH_SIZE = 32;
|
|
21
|
+
|
|
22
|
+
/**
 * Index a single entry with idempotent upsert behavior.
 * Called immediately after Capture Layer writes the file.
 *
 * Write order: (1) entity upsert by (kind, identity_key) when applicable;
 * (2) otherwise plain INSERT, falling back to UPDATE-by-file_path when the
 * insert trips a UNIQUE constraint (same file re-captured); (3) look up the
 * row's rowid; (4) embed title+body and replace the vec row.
 *
 * @param {{ db, stmts, embed, insertVec, deleteVec }} ctx
 * @param {{ id, kind, category, title, body, meta, tags, source, filePath, createdAt, identity_key, expires_at }} entry
 * @throws {Error} if no rowid can be resolved after the write, or on any
 *                 non-UNIQUE SQLite error
 */
export async function indexEntry(ctx, { id, kind, category, title, body, meta, tags, source, filePath, createdAt, identity_key, expires_at }) {
  const tagsJson = tags ? JSON.stringify(tags) : null;
  const metaJson = meta ? JSON.stringify(meta) : null;
  // Category falls back to the kind→category mapping when not supplied.
  const cat = category || categoryFor(kind);

  let wasUpdate = false;

  // Entity upsert: check by (kind, identity_key) first
  if (cat === "entity" && identity_key) {
    const existing = ctx.stmts.getByIdentityKey.get(kind, identity_key);
    if (existing) {
      // Keeps the existing row's id/created_at; rebinds it to the new file_path.
      ctx.stmts.upsertByIdentityKey.run(
        title || null, body, metaJson, tagsJson, source || "claude-code", cat, filePath, expires_at || null,
        kind, identity_key
      );
      wasUpdate = true;
    }
  }

  if (!wasUpdate) {
    try {
      ctx.stmts.insertEntry.run(id, kind, cat, title || null, body, metaJson, tagsJson, source || "claude-code", filePath, identity_key || null, expires_at || null, createdAt);
    } catch (e) {
      // UNIQUE violation (id, file_path, or identity index) ⇒ the entry
      // already exists: fall back to an in-place update keyed by file_path.
      if (e.message.includes("UNIQUE constraint")) {
        ctx.stmts.updateEntry.run(title || null, body, metaJson, tagsJson, source || "claude-code", cat, identity_key || null, expires_at || null, filePath);
        wasUpdate = true;
      } else {
        throw e;
      }
    }
  }

  // After update, get rowid by file_path (since id might differ); otherwise by id
  const rowidResult = wasUpdate
    ? ctx.stmts.getRowidByPath.get(filePath)
    : ctx.stmts.getRowid.get(id);

  if (!rowidResult || rowidResult.rowid == null) {
    throw new Error(`Could not find rowid for entry: ${wasUpdate ? `file_path=${filePath}` : `id=${id}`}`);
  }

  // Defensive normalization: rowid may arrive as BigInt depending on driver config.
  const rowid = Number(rowidResult.rowid);
  if (!Number.isFinite(rowid) || rowid < 1) {
    throw new Error(`Invalid rowid retrieved: ${rowidResult.rowid} (type: ${typeof rowidResult.rowid})`);
  }
  // Embedding input is title + body concatenated (nulls dropped).
  const embeddingText = [title, body].filter(Boolean).join(" ");
  const embedding = await ctx.embed(embeddingText);

  // Upsert vec: delete old if exists, then insert new
  try { ctx.deleteVec(rowid); } catch { /* no-op if not found */ }
  ctx.insertVec(rowid, embedding);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Bulk reindex: sync vault directory into the database.
 * P2: Wrapped in a transaction for atomicity.
 * P3: Detects title/tag/meta changes, not just body.
 * P4: Batches embedding calls for performance.
 *
 * Disk is the source of truth: rows are added for new files, updated when any
 * mutable field differs, and (fullSync only) removed when the backing file or
 * the whole kind directory is gone.
 *
 * @param {{ db, config, stmts, embed, insertVec, deleteVec }} ctx
 * @param {{ fullSync?: boolean }} opts — fullSync=true adds/updates/deletes; false=add-only
 * @returns {Promise<{added: number, updated: number, removed: number, unchanged: number}>}
 */
export async function reindex(ctx, opts = {}) {
  const { fullSync = true } = opts;
  const stats = { added: 0, updated: 0, removed: 0, unchanged: 0 };

  // Nothing to do when the vault directory doesn't exist yet.
  if (!existsSync(ctx.config.vaultDir)) return stats;

  // Use INSERT OR IGNORE for reindex — handles files with duplicate frontmatter IDs
  const upsertEntry = ctx.db.prepare(
    `INSERT OR IGNORE INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );

  // Auto-discover kind directories, supporting both:
  // - Nested: knowledge/insights/, events/sessions/ (category dirs at top level)
  // - Flat: insights/, decisions/ (legacy — kind dirs at top level)
  const kindEntries = []; // { kind, dir }
  const topDirs = readdirSync(ctx.config.vaultDir, { withFileTypes: true })
    .filter((d) => d.isDirectory() && !EXCLUDED_DIRS.has(d.name) && !d.name.startsWith("_"));

  for (const d of topDirs) {
    if (CATEGORY_DIRS.has(d.name)) {
      // Category directory — look one level deeper for kind directories
      const catDir = join(ctx.config.vaultDir, d.name);
      const subDirs = readdirSync(catDir, { withFileTypes: true })
        .filter((sd) => sd.isDirectory() && !sd.name.startsWith("_"));
      for (const sd of subDirs) {
        kindEntries.push({ kind: dirToKind(sd.name), dir: join(catDir, sd.name) });
      }
    } else {
      // Legacy flat structure — top-level dir is a kind dir
      kindEntries.push({ kind: dirToKind(d.name), dir: join(ctx.config.vaultDir, d.name) });
    }
  }

  // P2: Wrap entire reindex in a transaction.
  // NOTE(review): the transaction stays open across the awaited embedBatch
  // calls below; with better-sqlite3 this blocks other writers for the whole
  // sync — confirm this is acceptable for large vaults.
  ctx.db.exec("BEGIN");
  try {
    // P4: Collect entries needing embedding, then batch-embed
    const pendingEmbeds = []; // { rowid, text }

    for (const { kind, dir } of kindEntries) {
      const category = categoryFor(kind);
      // Skip reserved file names (context.md, memory.md, README.md).
      const mdFiles = walkDir(dir).filter((f) => !EXCLUDED_FILES.has(basename(f.filePath)));

      // P3: Fetch all mutable fields for change detection
      const dbRows = ctx.db.prepare("SELECT id, file_path, body, title, tags, meta FROM vault WHERE kind = ?").all(kind);
      const dbByPath = new Map(dbRows.map((r) => [r.file_path, r]));
      const diskPaths = new Set(mdFiles.map((e) => e.filePath));

      for (const { filePath, relDir } of mdFiles) {
        const existing = dbByPath.get(filePath);

        // In add-only mode, skip files already in DB
        if (!fullSync && existing) {
          stats.unchanged++;
          continue;
        }

        const raw = readFileSync(filePath, "utf-8");
        // Only frontmatter-bearing markdown files are indexable.
        if (!raw.startsWith("---\n")) {
          console.error(`[reindex] skipping (no frontmatter): ${filePath}`);
          continue;
        }
        const { meta: fmMeta, body: rawBody } = parseFrontmatter(raw);
        const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);

        // Extract identity_key and expires_at from frontmatter
        const identity_key = fmMeta.identity_key || null;
        const expires_at = fmMeta.expires_at || null;

        // Derive folder from disk location (source of truth)
        const meta = { ...(parsed.meta || {}) };
        if (relDir) meta.folder = relDir;
        else delete meta.folder;
        const metaJson = Object.keys(meta).length ? JSON.stringify(meta) : null;

        if (!existing) {
          // New file — add to DB (OR IGNORE if ID already exists at another path)
          const id = fmMeta.id || ulid();
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const created = fmMeta.created || new Date().toISOString();

          const result = upsertEntry.run(id, kind, category, parsed.title || null, parsed.body, metaJson, tagsJson, fmMeta.source || "file", filePath, identity_key, expires_at, created);
          if (result.changes > 0) {
            // Inserted — queue its embedding; changes === 0 means OR IGNORE hit.
            const rowid = ctx.stmts.getRowid.get(id).rowid;
            const embeddingText = [parsed.title, parsed.body].filter(Boolean).join(" ");
            pendingEmbeds.push({ rowid, text: embeddingText });
            stats.added++;
          } else {
            stats.unchanged++;
          }
        } else if (fullSync) {
          // P3: Compare all mutable fields, not just body
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const titleChanged = (parsed.title || null) !== (existing.title || null);
          const bodyChanged = existing.body !== parsed.body;
          const tagsChanged = tagsJson !== (existing.tags || null);
          const metaChanged = metaJson !== (existing.meta || null);

          if (bodyChanged || titleChanged || tagsChanged || metaChanged) {
            ctx.stmts.updateEntry.run(parsed.title || null, parsed.body, metaJson, tagsJson, fmMeta.source || "file", category, identity_key, expires_at, filePath);

            // P0: Re-embed if title or body changed (tags/meta don't feed the vector)
            if (bodyChanged || titleChanged) {
              const rowid = ctx.stmts.getRowid.get(existing.id)?.rowid;
              if (rowid) {
                ctx.deleteVec(rowid);
                const embeddingText = [parsed.title, parsed.body].filter(Boolean).join(" ");
                pendingEmbeds.push({ rowid, text: embeddingText });
              }
            }
            stats.updated++;
          } else {
            stats.unchanged++;
          }
        } else {
          stats.unchanged++;
        }
      }

      // Find deleted files (in DB but not on disk) — only in fullSync mode
      if (fullSync) {
        for (const [dbPath, row] of dbByPath) {
          if (!diskPaths.has(dbPath)) {
            const vRowid = ctx.stmts.getRowid.get(row.id)?.rowid;
            if (vRowid) ctx.deleteVec(vRowid);
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    // P4: Batch embed all pending texts
    for (let i = 0; i < pendingEmbeds.length; i += EMBED_BATCH_SIZE) {
      const batch = pendingEmbeds.slice(i, i + EMBED_BATCH_SIZE);
      const embeddings = await embedBatch(batch.map((e) => e.text));
      for (let j = 0; j < batch.length; j++) {
        ctx.insertVec(batch[j].rowid, embeddings[j]);
      }
    }

    // Clean up entries for kinds whose directories no longer exist on disk
    if (fullSync) {
      const indexedKinds = new Set(kindEntries.map((ke) => ke.kind));
      const allDbKinds = ctx.db.prepare("SELECT DISTINCT kind FROM vault").all();
      for (const { kind } of allDbKinds) {
        if (!indexedKinds.has(kind)) {
          const orphaned = ctx.db.prepare("SELECT id, rowid FROM vault WHERE kind = ?").all(kind);
          for (const row of orphaned) {
            try { ctx.deleteVec(row.rowid); } catch {}
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    ctx.db.exec("COMMIT");
  } catch (e) {
    // Any failure rolls back the whole sync — stats are discarded with it.
    ctx.db.exec("ROLLBACK");
    throw e;
  }

  return stats;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Retrieve Layer
|
|
2
|
+
|
|
3
|
+
The read path. All query logic lives here — hybrid search and any future retrieval strategies.
|
|
4
|
+
|
|
5
|
+
## Public API (`index.js`)
|
|
6
|
+
|
|
7
|
+
```js
|
|
8
|
+
hybridSearch(ctx, query, { kindFilter, limit, offset }) → Promise<Array<result>>
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Runs both FTS5 text matching and vector cosine similarity, merges scores with recency weighting, and returns results sorted by combined relevance.
|
|
12
|
+
|
|
13
|
+
## Dependency Rule
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
retrieve/ → core/ (allowed but currently unused)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Read-only access to the database via `ctx.db`. Never imports from `capture/`, `index/`, or `server/`.
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Retrieve Layer — Public API
|
|
3
|
+
*
|
|
4
|
+
* All read-path query logic: hybrid semantic search and any future
|
|
5
|
+
* query patterns (scoped, recency-weighted, etc.).
|
|
6
|
+
*
|
|
7
|
+
* Agent Constraint: Read-only access to DB. Never writes.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const FTS_WEIGHT = 0.4;
|
|
11
|
+
const VEC_WEIGHT = 0.6;
|
|
12
|
+
|
|
13
|
+
/**
 * Strip FTS5 metacharacters from query words and build an AND query.
 * Each surviving word is double-quoted so it is matched literally.
 * Returns null if no valid words remain.
 *
 * @param {string} query — raw user query
 * @returns {string|null} FTS5 MATCH expression, or null
 */
function buildFtsQuery(query) {
  const terms = [];
  for (const token of query.split(/\s+/)) {
    const cleaned = token.replace(/[*"()\-:^~{}]/g, "");
    if (cleaned.length > 0) {
      terms.push(`"${cleaned}"`);
    }
  }
  return terms.length > 0 ? terms.join(" AND ") : null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Category-aware recency decay:
 *   knowledge + entity: no decay (enduring)
 *   event: hyperbolic decay with a 30-day half-point (~0.5 at 30 days)
 *
 * @param {string} createdAt — ISO timestamp of the entry
 * @param {string} category — entry category
 * @returns {number} multiplier in (0, 1]
 */
function recencyBoost(createdAt, category) {
  if (category !== "event") {
    return 1.0;
  }
  const ageMs = Date.now() - new Date(createdAt).getTime();
  const ageDays = ageMs / 86400000;
  return 1 / (1 + ageDays / 30);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Build additional WHERE clauses for category/time filtering.
 * The expiry clause is always appended so expired entries never surface.
 * Returns { clauses: string[], params: any[] }
 *
 * @param {{ categoryFilter?: string|null, since?: string|null, until?: string|null }} filters
 * @returns {{ clauses: string[], params: any[] }}
 */
function buildFilterClauses({ categoryFilter, since, until }) {
  const clauses = [];
  const params = [];
  const add = (clause, param) => {
    clauses.push(clause);
    if (param !== undefined) {
      params.push(param);
    }
  };

  if (categoryFilter) add("e.category = ?", categoryFilter);
  if (since) add("e.created_at >= ?", since);
  if (until) add("e.created_at <= ?", until);
  // Unconditional: hide entries whose expires_at has passed.
  add("(e.expires_at IS NULL OR e.expires_at > datetime('now'))");

  return { clauses, params };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Hybrid search combining FTS5 text matching and vector similarity.
 *
 * Scores are merged additively: normalized FTS rank × FTS_WEIGHT (0.4) plus
 * vector similarity × VEC_WEIGHT (0.6), then multiplied by the category-aware
 * recency boost. Each candidate set is capped (15 FTS rows; 15–30 vec rows)
 * before merging, so pagination via offset operates on the merged cap, not
 * the full corpus.
 *
 * @param {{ db, embed }} ctx
 * @param {string} query
 * @param {{ kindFilter?: string|null, categoryFilter?: string|null, since?: string|null, until?: string|null, limit?: number, offset?: number }} opts
 * @returns {Promise<Array<{id, kind, category, title, body, meta, tags, source, file_path, created_at, score}>>}
 */
export async function hybridSearch(
  ctx,
  query,
  { kindFilter = null, categoryFilter = null, since = null, until = null, limit = 20, offset = 0 } = {}
) {
  // Keyed by entry id so FTS and vector hits for the same entry merge.
  const results = new Map();
  const extraFilters = buildFilterClauses({ categoryFilter, since, until });

  // FTS5 search
  const ftsQuery = buildFtsQuery(query);
  if (ftsQuery) {
    try {
      const whereParts = ["vault_fts MATCH ?"];
      const ftsParams = [ftsQuery];

      if (kindFilter) {
        whereParts.push("e.kind = ?");
        ftsParams.push(kindFilter);
      }
      whereParts.push(...extraFilters.clauses);
      ftsParams.push(...extraFilters.params);

      const ftsSQL = `SELECT e.*, rank FROM vault_fts f JOIN vault e ON f.rowid = e.rowid WHERE ${whereParts.join(" AND ")} ORDER BY rank LIMIT 15`;
      const rows = ctx.db.prepare(ftsSQL).all(...ftsParams);

      // Normalize FTS scores to [0, 1] — FTS5 rank is negative (more negative
      // = better match), so abs() then divide by the batch max.
      const ftsScores = rows.map((r) => Math.abs(r.rank || 0));
      const maxFts = Math.max(...ftsScores, 1);

      for (let i = 0; i < rows.length; i++) {
        // Drop the rank column before exposing the row to callers.
        const { rank: _rank, ...row } = rows[i];
        const normalized = ftsScores[i] / maxFts;
        results.set(row.id, { ...row, score: normalized * FTS_WEIGHT });
      }
    } catch (err) {
      if (err.message?.includes("fts5: syntax error")) {
        // Expected: malformed query, fall through to vector search
      } else {
        console.error(`[retrieve] FTS search error: ${err.message}`);
      }
    }
  }

  // Vector similarity search
  try {
    // Skip embedding entirely when the vec table is empty (fresh vault).
    const vecCount = ctx.db
      .prepare("SELECT COUNT(*) as c FROM vault_vec")
      .get().c;
    if (vecCount > 0) {
      const queryVec = await ctx.embed(query);
      // Over-fetch when a kind filter applies, since filtering happens
      // post-query here (vec0 can't join on vault columns).
      const vecLimit = kindFilter ? 30 : 15;
      const vecRows = ctx.db
        .prepare(
          `SELECT v.rowid, v.distance FROM vault_vec v WHERE embedding MATCH ? ORDER BY distance LIMIT ${vecLimit}`
        )
        .all(queryVec);

      if (vecRows.length) {
        // Batch hydration: single query instead of N+1
        const rowids = vecRows.map((vr) => vr.rowid);
        const placeholders = rowids.map(() => "?").join(",");
        const hydrated = ctx.db
          .prepare(`SELECT rowid, * FROM vault WHERE rowid IN (${placeholders})`)
          .all(...rowids);

        const byRowid = new Map();
        for (const row of hydrated) byRowid.set(row.rowid, row);

        for (const vr of vecRows) {
          const row = byRowid.get(vr.rowid);
          if (!row) continue;
          // Apply the same filters the FTS path applied in SQL.
          if (kindFilter && row.kind !== kindFilter) continue;
          if (categoryFilter && row.category !== categoryFilter) continue;
          if (since && row.created_at < since) continue;
          if (until && row.created_at > until) continue;
          if (row.expires_at && new Date(row.expires_at) <= new Date()) continue;

          const { rowid: _rowid, ...cleanRow } = row;
          // sqlite-vec returns L2 distance [0, 2] for normalized vectors.
          // Convert to similarity [1, 0] with: 1 - distance/2
          const vecScore = Math.max(0, 1 - vr.distance / 2) * VEC_WEIGHT;
          const existing = results.get(cleanRow.id);
          if (existing) {
            // Entry matched both paths — sum the weighted contributions.
            existing.score += vecScore;
          } else {
            results.set(cleanRow.id, { ...cleanRow, score: vecScore });
          }
        }
      }
    }
  } catch (err) {
    if (err.message?.includes("no such table")) {
      // Expected on fresh vaults with no vec table yet
    } else {
      console.error(`[retrieve] Vector search error: ${err.message}`);
    }
  }

  // Apply category-aware recency boost
  for (const [, entry] of results) {
    entry.score *= recencyBoost(entry.created_at, entry.category);
  }

  const sorted = [...results.values()].sort((a, b) => b.score - a.score);
  return sorted.slice(offset, offset + limit);
}
|