@context-vault/core 2.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ /**
2
+ * embed.js — Text embedding via HuggingFace transformers
3
+ *
4
+ * Graceful degradation: if the embedding model fails to load (offline, first run,
5
+ * disk issues), semantic search is disabled but FTS still works.
6
+ */
7
+
8
+ import { join } from "node:path";
9
+ import { homedir } from "node:os";
10
+ import { mkdirSync } from "node:fs";
11
+
12
/**
 * Cached pipeline. Holds either the resolved extractor function or, while the
 * first load is in flight, the load promise itself — so concurrent callers
 * share a single model load instead of racing to start several downloads.
 * Cleared by resetEmbedPipeline() and by embed()'s health check.
 */
let extractor = null;

/** @type {null | true | false} null = unknown, true = working, false = failed */
let embedAvailable = null;

/**
 * Lazily initialize the feature-extraction pipeline.
 *
 * Fix vs. previous version: the load promise is memoized in `extractor`, so
 * N concurrent first calls trigger exactly one `pipeline()` load (previously
 * each caller that arrived before the first load finished kicked off its own
 * ~22MB model initialization). Every caller resolves through the same
 * try/catch, so on failure all of them receive `null` and state is reset.
 *
 * @returns {Promise<Function|null>} the extractor pipeline, or null when
 *   embedding is unavailable (optional dependency missing, offline, etc.)
 */
async function ensurePipeline() {
  if (embedAvailable === false) return null;

  if (!extractor) {
    // Single shared load; `extractor` temporarily holds the promise and the
    // `await` below flattens it for every concurrent caller.
    extractor = (async () => {
      // Dynamic import — @huggingface/transformers is optional (its transitive
      // dep `sharp` can fail to install on some platforms). When missing, the
      // server still works with full-text search only.
      const { pipeline, env } = await import("@huggingface/transformers");

      // Redirect model cache to ~/.context-mcp/models/ so it works when the
      // package is installed globally in a root-owned directory
      // (e.g. /usr/lib/node_modules/).
      const modelCacheDir = join(homedir(), ".context-mcp", "models");
      mkdirSync(modelCacheDir, { recursive: true });
      env.cacheDir = modelCacheDir;

      console.error(
        "[context-vault] Loading embedding model (first run may download ~22MB)...",
      );
      return pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
    })();
  }

  try {
    const pipe = await extractor;
    extractor = pipe; // swap the promise for the resolved instance
    embedAvailable = true;
    return pipe;
  } catch (e) {
    extractor = null;
    embedAvailable = false;
    console.error(
      `[context-vault] Failed to load embedding model: ${e.message}`,
    );
    console.error(
      `[context-vault] Semantic search disabled. Full-text search still works.`,
    );
    return null;
  }
}
50
+
51
/**
 * Embed a single text string.
 *
 * @param {string} text - text to embed
 * @returns {Promise<Float32Array|null>} the embedding vector, or null when the
 *   embedding pipeline is unavailable (FTS-only mode)
 * @throws {Error} when the pipeline yields an empty result; module state is
 *   reset first so the next call re-initializes the pipeline
 */
export async function embed(text) {
  const pipe = await ensurePipeline();
  if (!pipe) {
    return null;
  }

  const output = await pipe([text], { pooling: "mean", normalize: true });
  const data = output?.data;

  // Health check — an empty result means the pipeline is broken. Drop the
  // cached instance so the next embed call starts from scratch.
  if (!data?.length) {
    extractor = null;
    embedAvailable = null;
    throw new Error("Embedding pipeline returned empty result");
  }

  return new Float32Array(data);
}
64
+
65
/**
 * Batch embedding — embed multiple texts in a single pipeline call.
 * Returns an array of Float32Array embeddings (one per input text).
 * Returns array of nulls if embedding is unavailable.
 *
 * @param {string[]} texts
 * @returns {Promise<(Float32Array|null)[]>}
 * @throws {Error} when the pipeline returns an empty result (module state is
 *   reset for re-init) or a result whose length is not divisible by texts.length
 */
export async function embedBatch(texts) {
  if (!texts.length) return [];
  const ext = await ensurePipeline();
  if (!ext) return texts.map(() => null);

  const result = await ext(texts, { pooling: "mean", normalize: true });
  // Health check — force re-init on empty results
  if (!result?.data?.length) {
    extractor = null;
    embedAvailable = null;
    throw new Error("Embedding pipeline returned empty result");
  }

  const dim = result.data.length / texts.length;
  if (!Number.isInteger(dim) || dim <= 0) {
    throw new Error(
      `Unexpected embedding dimension: ${result.data.length} / ${texts.length} = ${dim}`,
    );
  }

  // Copy each row out via subarray() + the Float32Array copy constructor.
  // The previous `new Float32Array(result.data.buffer, i * dim * 4, dim)`
  // assumed the tensor's typed-array view started at byte offset 0 of its
  // ArrayBuffer and was 4 bytes per element — wrong whenever the view has a
  // non-zero byteOffset. subarray() indexes in elements (offset-aware), and
  // the copy also avoids pinning the whole batch buffer via N small views.
  return texts.map(
    (_, i) => new Float32Array(result.data.subarray(i * dim, (i + 1) * dim)),
  );
}
91
+
92
/**
 * Force re-initialization on the next embed call: discard the cached
 * pipeline and clear the availability flag back to "unknown".
 */
export function resetEmbedPipeline() {
  embedAvailable = null;
  extractor = null;
}
97
+
98
/**
 * Check if embedding is currently available.
 *
 * @returns {null|true|false} true = pipeline loaded and working,
 *   false = load failed (semantic search disabled, FTS still works),
 *   null = not yet determined (no load attempt has completed).
 */
export function isEmbedAvailable() {
  return embedAvailable;
}
@@ -0,0 +1,451 @@
1
+ /**
2
+ * Index Layer — Public API
3
+ *
4
+ * Owns the database as a derived index. Handles both bulk sync (reindex)
5
+ * and single-entry indexing (indexEntry) for write-through capture.
6
+ *
7
+ * Agent Constraint: Can import ../core. Owns db.js and embed.js.
8
+ */
9
+
10
+ import { readFileSync, readdirSync, existsSync, unlinkSync } from "node:fs";
11
+ import { join, basename } from "node:path";
12
+ import { dirToKind, walkDir, ulid } from "../core/files.js";
13
+ import { categoryFor, CATEGORY_DIRS } from "../core/categories.js";
14
+ import {
15
+ parseFrontmatter,
16
+ parseEntryFromMarkdown,
17
+ } from "../core/frontmatter.js";
18
+ import { embedBatch } from "./embed.js";
19
+
20
// Top-level vault directories that are never indexed (project scratch space
// and archived entries); dirs starting with "_" are also skipped at scan time.
const EXCLUDED_DIRS = new Set(["projects", "_archive"]);
// Well-known files that live inside kind directories but are not entries.
const EXCLUDED_FILES = new Set(["context.md", "memory.md", "README.md"]);

// Number of texts sent to the embedding pipeline per embedBatch() call
// during reindex Phase 2.
const EMBED_BATCH_SIZE = 32;
24
+
25
/**
 * Index a single entry with idempotent upsert behavior.
 * Called immediately after Capture Layer writes the file.
 *
 * For entities with identity_key: uses upsertByIdentityKey if existing row found.
 * Otherwise inserts (encrypted when ctx.encrypt is present), falling back to an
 * in-place update on a UNIQUE-constraint violation. Finally (re)writes the
 * entry's embedding vector, keyed by the row's SQLite rowid.
 *
 * @param {import('../server/types.js').BaseCtx & Partial<import('../server/types.js').HostedCtxExtensions>} ctx
 * @param {{ id, kind, category, title, body, meta, tags, source, filePath, createdAt, identity_key, expires_at, userId }} entry
 * @throws {Error} when the row cannot be located after insert/update, or when
 *   the retrieved rowid is not a finite positive number
 */
export async function indexEntry(
  ctx,
  {
    id,
    kind,
    category,
    title,
    body,
    meta,
    tags,
    source,
    filePath,
    createdAt,
    identity_key,
    expires_at,
    userId,
  },
) {
  // Serialize structured fields; NULL (not "null"/"{}") when absent.
  const tagsJson = tags ? JSON.stringify(tags) : null;
  const metaJson = meta ? JSON.stringify(meta) : null;
  const cat = category || categoryFor(kind);
  // NOTE(review): `userId || null` coerces "" and 0 to null — presumably
  // userId is a non-empty string or undefined; confirm with hosted callers.
  const userIdVal = userId || null;

  let wasUpdate = false;

  // Entity upsert: check by (kind, identity_key, user_id) first
  if (cat === "entity" && identity_key) {
    const existing = ctx.stmts.getByIdentityKey.get(
      kind,
      identity_key,
      userIdVal,
    );
    if (existing) {
      // Overwrite the existing entity row in place; parameter order must
      // match the prepared statement's placeholders exactly.
      ctx.stmts.upsertByIdentityKey.run(
        title || null,
        body,
        metaJson,
        tagsJson,
        source || "claude-code",
        cat,
        filePath,
        expires_at || null,
        kind,
        identity_key,
        userIdVal,
      );
      wasUpdate = true;
    }
  }

  if (!wasUpdate) {
    // Prepare encryption if ctx.encrypt is available
    let encrypted = null;
    if (ctx.encrypt) {
      encrypted = await ctx.encrypt({ title, body, meta });
    }

    try {
      if (encrypted) {
        // Encrypted insert: store preview in body column for FTS, full content in encrypted columns
        const bodyPreview = body.slice(0, 200);
        ctx.stmts.insertEntryEncrypted.run(
          id,
          userIdVal,
          kind,
          cat,
          title || null,
          bodyPreview,
          metaJson,
          tagsJson,
          source || "claude-code",
          filePath,
          identity_key || null,
          expires_at || null,
          createdAt,
          encrypted.body_encrypted,
          encrypted.title_encrypted,
          encrypted.meta_encrypted,
          encrypted.iv,
        );
      } else {
        // Plaintext insert — body is fully searchable via FTS.
        ctx.stmts.insertEntry.run(
          id,
          userIdVal,
          kind,
          cat,
          title || null,
          body,
          metaJson,
          tagsJson,
          source || "claude-code",
          filePath,
          identity_key || null,
          expires_at || null,
          createdAt,
        );
      }
    } catch (e) {
      // A UNIQUE violation means a row for this entry already exists (same id
      // or file_path) — fall back to updating it in place.
      // NOTE(review): assumes `e.message` is always a string; a non-Error
      // throw from the driver would TypeError here — confirm driver behavior.
      if (e.message.includes("UNIQUE constraint")) {
        ctx.stmts.updateEntry.run(
          title || null,
          body,
          metaJson,
          tagsJson,
          source || "claude-code",
          cat,
          identity_key || null,
          expires_at || null,
          filePath,
        );
        wasUpdate = true;
      } else {
        throw e;
      }
    }
  }

  // After update, get rowid by file_path (since id might differ); otherwise by id
  const rowidResult = wasUpdate
    ? ctx.stmts.getRowidByPath.get(filePath)
    : ctx.stmts.getRowid.get(id);

  // `== null` deliberately matches both null and undefined.
  if (!rowidResult || rowidResult.rowid == null) {
    throw new Error(
      `Could not find rowid for entry: ${wasUpdate ? `file_path=${filePath}` : `id=${id}`}`,
    );
  }

  // Drivers may return rowid as a BigInt or string; normalize and validate.
  const rowid = Number(rowidResult.rowid);
  if (!Number.isFinite(rowid) || rowid < 1) {
    throw new Error(
      `Invalid rowid retrieved: ${rowidResult.rowid} (type: ${typeof rowidResult.rowid})`,
    );
  }

  // Embeddings are always generated from plaintext (before encryption)
  const embeddingText = [title, body].filter(Boolean).join(" ");
  const embedding = await ctx.embed(embeddingText);

  // Upsert vec: delete old if exists, then insert new (skip if embedding unavailable)
  if (embedding) {
    try {
      ctx.deleteVec(rowid);
    } catch {
      /* no-op if not found */
    }
    ctx.insertVec(rowid, embedding);
  }
}
183
+
184
/**
 * Bulk reindex: sync vault directory into the database.
 * P2: Wrapped in a transaction for atomicity.
 * P3: Detects title/tag/meta changes, not just body.
 * P4: Batches embedding calls for performance.
 *
 * Phase 1 (inside BEGIN/COMMIT): add/update/delete rows, collect texts that
 * need (re-)embedding. Phase 2 (after COMMIT): delete stale vectors and batch
 * embed, so slow model work never holds the write lock and cannot roll back
 * committed DB state.
 *
 * @param {import('../server/types.js').BaseCtx} ctx
 * @param {{ fullSync?: boolean }} opts — fullSync=true adds/updates/deletes; false=add-only
 * @returns {Promise<{added: number, updated: number, removed: number, unchanged: number}>}
 */
export async function reindex(ctx, opts = {}) {
  const { fullSync = true } = opts;
  const stats = { added: 0, updated: 0, removed: 0, unchanged: 0 };

  // Nothing to index if the vault directory doesn't exist yet.
  if (!existsSync(ctx.config.vaultDir)) return stats;

  // Use INSERT OR IGNORE for reindex — handles files with duplicate frontmatter IDs
  // user_id is NULL for reindex (always local mode)
  const upsertEntry = ctx.db.prepare(
    `INSERT OR IGNORE INTO vault (id, user_id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at) VALUES (?, NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  );

  // Auto-discover kind directories, supporting both:
  // - Nested: knowledge/insights/, events/sessions/ (category dirs at top level)
  // - Flat: insights/, decisions/ (legacy — kind dirs at top level)
  const kindEntries = []; // { kind, dir }
  const topDirs = readdirSync(ctx.config.vaultDir, {
    withFileTypes: true,
  }).filter(
    (d) =>
      d.isDirectory() && !EXCLUDED_DIRS.has(d.name) && !d.name.startsWith("_"),
  );

  for (const d of topDirs) {
    if (CATEGORY_DIRS.has(d.name)) {
      // Category directory — look one level deeper for kind directories
      const catDir = join(ctx.config.vaultDir, d.name);
      const subDirs = readdirSync(catDir, { withFileTypes: true }).filter(
        (sd) => sd.isDirectory() && !sd.name.startsWith("_"),
      );
      for (const sd of subDirs) {
        kindEntries.push({
          kind: dirToKind(sd.name),
          dir: join(catDir, sd.name),
        });
      }
    } else {
      // Legacy flat structure — top-level dir is a kind dir
      kindEntries.push({
        kind: dirToKind(d.name),
        dir: join(ctx.config.vaultDir, d.name),
      });
    }
  }

  // Phase 1: Sync DB ops in a transaction — FTS is searchable immediately after COMMIT.
  // Phase 2: Async embedding runs post-transaction so it can't hold the write lock
  // or roll back DB state on failure.
  const pendingEmbeds = []; // { rowid, text }
  const staleVecRowids = []; // rowids whose old vectors need deleting before re-embed

  ctx.db.exec("BEGIN");
  try {
    for (const { kind, dir } of kindEntries) {
      const category = categoryFor(kind);
      const mdFiles = walkDir(dir).filter(
        (f) => !EXCLUDED_FILES.has(basename(f.filePath)),
      );

      // P3: Fetch all mutable fields for change detection
      const dbRows = ctx.db
        .prepare(
          "SELECT id, file_path, body, title, tags, meta FROM vault WHERE kind = ?",
        )
        .all(kind);
      const dbByPath = new Map(dbRows.map((r) => [r.file_path, r]));
      const diskPaths = new Set(mdFiles.map((e) => e.filePath));

      for (const { filePath, relDir } of mdFiles) {
        const existing = dbByPath.get(filePath);

        // In add-only mode, skip files already in DB
        if (!fullSync && existing) {
          stats.unchanged++;
          continue;
        }

        const raw = readFileSync(filePath, "utf-8");
        // Entries must start with a YAML frontmatter fence; anything else is
        // skipped (logged, not fatal).
        if (!raw.startsWith("---\n")) {
          console.error(`[reindex] skipping (no frontmatter): ${filePath}`);
          continue;
        }
        const { meta: fmMeta, body: rawBody } = parseFrontmatter(raw);
        const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);

        // Extract identity_key and expires_at from frontmatter
        const identity_key = fmMeta.identity_key || null;
        const expires_at = fmMeta.expires_at || null;

        // Derive folder from disk location (source of truth)
        const meta = { ...(parsed.meta || {}) };
        if (relDir) meta.folder = relDir;
        else delete meta.folder;
        const metaJson = Object.keys(meta).length ? JSON.stringify(meta) : null;

        if (!existing) {
          // New file — add to DB (OR IGNORE if ID already exists at another path)
          const id = fmMeta.id || ulid();
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const created = fmMeta.created || new Date().toISOString();

          const result = upsertEntry.run(
            id,
            kind,
            category,
            parsed.title || null,
            parsed.body,
            metaJson,
            tagsJson,
            fmMeta.source || "file",
            filePath,
            identity_key,
            expires_at,
            created,
          );
          // changes === 0 means OR IGNORE dropped the row (duplicate id):
          // count it as unchanged rather than added.
          if (result.changes > 0) {
            const rowid = ctx.stmts.getRowid.get(id).rowid;
            const embeddingText = [parsed.title, parsed.body]
              .filter(Boolean)
              .join(" ");
            pendingEmbeds.push({ rowid, text: embeddingText });
            stats.added++;
          } else {
            stats.unchanged++;
          }
        } else if (fullSync) {
          // P3: Compare all mutable fields, not just body
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const titleChanged =
            (parsed.title || null) !== (existing.title || null);
          const bodyChanged = existing.body !== parsed.body;
          // NOTE(review): tags/meta comparison is by serialized JSON string,
          // so key order must be stable between capture and reindex paths —
          // otherwise entries are flagged updated spuriously; confirm.
          const tagsChanged = tagsJson !== (existing.tags || null);
          const metaChanged = metaJson !== (existing.meta || null);

          if (bodyChanged || titleChanged || tagsChanged || metaChanged) {
            ctx.stmts.updateEntry.run(
              parsed.title || null,
              parsed.body,
              metaJson,
              tagsJson,
              fmMeta.source || "file",
              category,
              identity_key,
              expires_at,
              filePath,
            );

            // Queue re-embed if title or body changed (vector ops deferred to Phase 2)
            if (bodyChanged || titleChanged) {
              const rowid = ctx.stmts.getRowid.get(existing.id)?.rowid;
              if (rowid) {
                staleVecRowids.push(rowid);
                const embeddingText = [parsed.title, parsed.body]
                  .filter(Boolean)
                  .join(" ");
                pendingEmbeds.push({ rowid, text: embeddingText });
              }
            }
            stats.updated++;
          } else {
            stats.unchanged++;
          }
        } else {
          stats.unchanged++;
        }
      }

      // Find deleted files (in DB but not on disk) — only in fullSync mode
      if (fullSync) {
        for (const [dbPath, row] of dbByPath) {
          if (!diskPaths.has(dbPath)) {
            const vRowid = ctx.stmts.getRowid.get(row.id)?.rowid;
            if (vRowid) {
              // Best-effort vector cleanup — missing vector is not an error.
              try {
                ctx.deleteVec(vRowid);
              } catch {}
            }
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    // Clean up entries for kinds whose directories no longer exist on disk
    if (fullSync) {
      const indexedKinds = new Set(kindEntries.map((ke) => ke.kind));
      const allDbKinds = ctx.db
        .prepare("SELECT DISTINCT kind FROM vault")
        .all();
      for (const { kind } of allDbKinds) {
        if (!indexedKinds.has(kind)) {
          const orphaned = ctx.db
            .prepare("SELECT id, rowid FROM vault WHERE kind = ?")
            .all(kind);
          for (const row of orphaned) {
            try {
              ctx.deleteVec(row.rowid);
            } catch {}
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    // Prune expired entries
    const expired = ctx.db
      .prepare(
        "SELECT id, file_path FROM vault WHERE expires_at IS NOT NULL AND expires_at <= datetime('now')",
      )
      .all();

    for (const row of expired) {
      if (row.file_path) {
        // Best-effort: remove the markdown file too (may already be gone).
        try {
          unlinkSync(row.file_path);
        } catch {}
      }
      const vRowid = ctx.stmts.getRowid.get(row.id)?.rowid;
      if (vRowid) {
        try {
          ctx.deleteVec(vRowid);
        } catch {}
      }
      ctx.stmts.deleteEntry.run(row.id);
      stats.removed++;
    }

    ctx.db.exec("COMMIT");
  } catch (e) {
    // Any Phase-1 failure rolls back the whole sync — DB stays consistent.
    ctx.db.exec("ROLLBACK");
    throw e;
  }

  // Phase 2: Async embedding — runs after COMMIT so FTS is already searchable.
  // Failures here are non-fatal; semantic search catches up on next reindex.

  // Delete stale vectors for updated entries before re-embedding
  for (const rowid of staleVecRowids) {
    try {
      ctx.deleteVec(rowid);
    } catch {}
  }

  // Batch embed all pending texts
  for (let i = 0; i < pendingEmbeds.length; i += EMBED_BATCH_SIZE) {
    const batch = pendingEmbeds.slice(i, i + EMBED_BATCH_SIZE);
    const embeddings = await embedBatch(batch.map((e) => e.text));
    for (let j = 0; j < batch.length; j++) {
      // embedBatch returns nulls when the model is unavailable — skip those.
      if (embeddings[j]) {
        ctx.insertVec(batch[j].rowid, embeddings[j]);
      }
    }
  }

  return stats;
}
package/src/index.js ADDED
@@ -0,0 +1,62 @@
1
/**
 * @context-vault/core — Shared core for context-vault
 *
 * Re-exports all public APIs from capture, index, retrieve, server, and core layers.
 * This barrel is the package entry point; consumers should import from here
 * rather than reaching into layer files directly.
 */

// Core utilities
export {
  categoryFor,
  categoryDirFor,
  CATEGORY_DIRS,
} from "./core/categories.js";
export { parseArgs, resolveConfig } from "./core/config.js";
export {
  ulid,
  slugify,
  kindToDir,
  dirToKind,
  normalizeKind,
  kindToPath,
  safeJoin,
  walkDir,
} from "./core/files.js";
export {
  formatFrontmatter,
  parseFrontmatter,
  extractCustomMeta,
  parseEntryFromMarkdown,
} from "./core/frontmatter.js";
export { gatherVaultStatus } from "./core/status.js";

// Capture layer
export {
  writeEntry,
  updateEntryFile,
  captureAndIndex,
} from "./capture/index.js";
export { writeEntryFile } from "./capture/file-ops.js";
export { formatBody } from "./capture/formatters.js";

// Index layer
export {
  SCHEMA_DDL,
  initDatabase,
  prepareStatements,
  insertVec,
  deleteVec,
} from "./index/db.js";
// NOTE(review): index/embed.js also exports isEmbedAvailable, which is not
// re-exported here — confirm that omission is intentional.
export { embed, embedBatch, resetEmbedPipeline } from "./index/embed.js";
export { indexEntry, reindex } from "./index/index.js";

// Retrieve layer
export { hybridSearch } from "./retrieve/index.js";

// Server tools & helpers
export { registerTools } from "./server/tools.js";
export {
  ok,
  err,
  ensureVaultExists,
  ensureValidKind,
} from "./server/helpers.js";