@context-vault/core 2.17.1 → 3.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/dist/capture.d.ts +21 -0
  2. package/dist/capture.d.ts.map +1 -0
  3. package/dist/capture.js +269 -0
  4. package/dist/capture.js.map +1 -0
  5. package/dist/categories.d.ts +6 -0
  6. package/dist/categories.d.ts.map +1 -0
  7. package/dist/categories.js +50 -0
  8. package/dist/categories.js.map +1 -0
  9. package/dist/config.d.ts +4 -0
  10. package/dist/config.d.ts.map +1 -0
  11. package/dist/config.js +190 -0
  12. package/dist/config.js.map +1 -0
  13. package/dist/constants.d.ts +33 -0
  14. package/dist/constants.d.ts.map +1 -0
  15. package/dist/constants.js +23 -0
  16. package/dist/constants.js.map +1 -0
  17. package/dist/db.d.ts +13 -0
  18. package/dist/db.d.ts.map +1 -0
  19. package/dist/db.js +191 -0
  20. package/dist/db.js.map +1 -0
  21. package/dist/embed.d.ts +5 -0
  22. package/dist/embed.d.ts.map +1 -0
  23. package/dist/embed.js +78 -0
  24. package/dist/embed.js.map +1 -0
  25. package/dist/files.d.ts +13 -0
  26. package/dist/files.d.ts.map +1 -0
  27. package/dist/files.js +66 -0
  28. package/dist/files.js.map +1 -0
  29. package/dist/formatters.d.ts +8 -0
  30. package/dist/formatters.d.ts.map +1 -0
  31. package/dist/formatters.js +18 -0
  32. package/dist/formatters.js.map +1 -0
  33. package/dist/frontmatter.d.ts +12 -0
  34. package/dist/frontmatter.d.ts.map +1 -0
  35. package/dist/frontmatter.js +101 -0
  36. package/dist/frontmatter.js.map +1 -0
  37. package/dist/index.d.ts +10 -0
  38. package/dist/index.d.ts.map +1 -0
  39. package/dist/index.js +297 -0
  40. package/dist/index.js.map +1 -0
  41. package/dist/ingest-url.d.ts +20 -0
  42. package/dist/ingest-url.d.ts.map +1 -0
  43. package/dist/ingest-url.js +113 -0
  44. package/dist/ingest-url.js.map +1 -0
  45. package/dist/main.d.ts +14 -0
  46. package/dist/main.d.ts.map +1 -0
  47. package/dist/main.js +25 -0
  48. package/dist/main.js.map +1 -0
  49. package/dist/search.d.ts +18 -0
  50. package/dist/search.d.ts.map +1 -0
  51. package/dist/search.js +238 -0
  52. package/dist/search.js.map +1 -0
  53. package/dist/types.d.ts +176 -0
  54. package/dist/types.d.ts.map +1 -0
  55. package/dist/types.js +2 -0
  56. package/dist/types.js.map +1 -0
  57. package/package.json +66 -16
  58. package/src/capture.ts +308 -0
  59. package/src/categories.ts +54 -0
  60. package/src/{core/config.js → config.ts} +34 -33
  61. package/src/{constants.js → constants.ts} +6 -3
  62. package/src/db.ts +229 -0
  63. package/src/{index/embed.js → embed.ts} +10 -35
  64. package/src/{core/files.js → files.ts} +15 -20
  65. package/src/{capture/formatters.js → formatters.ts} +13 -11
  66. package/src/{core/frontmatter.js → frontmatter.ts} +26 -33
  67. package/src/index.ts +353 -0
  68. package/src/ingest-url.ts +99 -0
  69. package/src/main.ts +111 -0
  70. package/src/{retrieve/index.js → search.ts} +62 -150
  71. package/src/types.ts +166 -0
  72. package/src/capture/file-ops.js +0 -99
  73. package/src/capture/import-pipeline.js +0 -46
  74. package/src/capture/importers.js +0 -387
  75. package/src/capture/index.js +0 -250
  76. package/src/capture/ingest-url.js +0 -252
  77. package/src/consolidation/index.js +0 -112
  78. package/src/core/categories.js +0 -73
  79. package/src/core/error-log.js +0 -54
  80. package/src/core/linking.js +0 -161
  81. package/src/core/migrate-dirs.js +0 -196
  82. package/src/core/status.js +0 -350
  83. package/src/core/telemetry.js +0 -90
  84. package/src/core/temporal.js +0 -146
  85. package/src/index/db.js +0 -586
  86. package/src/index/index.js +0 -583
  87. package/src/index.js +0 -71
  88. package/src/server/helpers.js +0 -44
  89. package/src/server/tools/clear-context.js +0 -47
  90. package/src/server/tools/context-status.js +0 -182
  91. package/src/server/tools/create-snapshot.js +0 -200
  92. package/src/server/tools/delete-context.js +0 -60
  93. package/src/server/tools/get-context.js +0 -765
  94. package/src/server/tools/ingest-project.js +0 -244
  95. package/src/server/tools/ingest-url.js +0 -88
  96. package/src/server/tools/list-buckets.js +0 -116
  97. package/src/server/tools/list-context.js +0 -163
  98. package/src/server/tools/save-context.js +0 -632
  99. package/src/server/tools/session-start.js +0 -285
  100. package/src/server/tools.js +0 -172
  101. package/src/sync/sync.js +0 -235
package/src/index.ts ADDED
@@ -0,0 +1,353 @@
1
+ import { readFileSync, readdirSync, existsSync, unlinkSync } from "node:fs";
2
+ import { join, basename } from "node:path";
3
+ import { dirToKind, walkDir, ulid } from "./files.js";
4
+ import { categoryFor, defaultTierFor, CATEGORY_DIRS } from "./categories.js";
5
+ import { parseFrontmatter, parseEntryFromMarkdown } from "./frontmatter.js";
6
+ import { embedBatch } from "./embed.js";
7
+ import type { BaseCtx, IndexEntryInput, ReindexStats } from "./types.js";
8
+
9
/** Top-level vault directory names that reindex() filters out before scanning. */
const EXCLUDED_DIRS = new Set<string>(["projects", "_archive"]);

/** Markdown basenames that reindex() drops from the walked file list. */
const EXCLUDED_FILES = new Set<string>([
  "context.md",
  "memory.md",
  "README.md",
]);

/** How many texts reindex() hands to embedBatch() per call. */
const EMBED_BATCH_SIZE = 32;
12
+
13
/**
 * Insert or update one entry in the `vault` table and refresh its vector.
 *
 * Flow:
 *  1. Entries whose `expires_at` is already in the past are ignored.
 *  2. For the "entity" category with an `identity_key`, an existing row found
 *     by (kind, identity_key) is updated in place.
 *  3. Otherwise a row is inserted; a UNIQUE-constraint failure falls back to
 *     an update keyed by `filePath`.
 *  4. Unless the category is "event", the row's vector is replaced with
 *     `precomputedEmbedding` or a freshly computed one.
 *
 * @param ctx Database handle, prepared statements and embedding helpers.
 * @param entry Entry fields to persist.
 * @param precomputedEmbedding Vector to store instead of calling `ctx.embed`.
 *   Passing `null` explicitly skips the vector write; omitting the argument
 *   computes an embedding from title and body.
 * @throws Error when the row's rowid cannot be resolved after the write, or
 *   when the insert fails for any reason other than a UNIQUE constraint.
 */
export async function indexEntry(
  ctx: BaseCtx,
  entry: IndexEntryInput & { supersedes?: string[] | null; related_to?: string[] | null },
  precomputedEmbedding?: Float32Array | null,
): Promise<void> {
  const {
    id, kind, category, title, body, meta, tags, source,
    filePath, createdAt, identity_key, expires_at, source_files, tier,
  } = entry;

  // Nothing to index if the entry has already expired.
  if (expires_at && new Date(expires_at) <= new Date()) return;

  const tagsJson = tags ? JSON.stringify(tags) : null;
  const metaJson = meta ? JSON.stringify(meta) : null;
  const sourceFilesJson = source_files ? JSON.stringify(source_files) : null;
  const cat = category || categoryFor(kind);
  const effectiveTier = tier || defaultTierFor(kind);

  let wasUpdate = false;

  // Entities are deduplicated on (kind, identity_key).
  if (cat === "entity" && identity_key) {
    const existing = ctx.stmts.getByIdentityKey.get(kind, identity_key) as Record<string, unknown> | undefined;
    if (existing) {
      ctx.stmts.upsertByIdentityKey.run(
        title || null, body, metaJson, tagsJson,
        source || "claude-code", cat, filePath,
        expires_at || null, sourceFilesJson,
        kind, identity_key,
      );
      wasUpdate = true;
    }
  }

  if (!wasUpdate) {
    try {
      ctx.stmts.insertEntry.run(
        id, kind, cat, title || null, body, metaJson, tagsJson,
        source || "claude-code", filePath,
        identity_key || null, expires_at || null,
        createdAt, createdAt, sourceFilesJson, effectiveTier,
      );
    } catch (e) {
      // Narrow before reading `.message`: a non-Error throw must be re-thrown
      // untouched instead of triggering a TypeError that hides it.
      const isUniqueViolation = e instanceof Error && e.message.includes("UNIQUE constraint");
      if (!isUniqueViolation) throw e;

      ctx.stmts.updateEntry.run(
        title || null, body, metaJson, tagsJson,
        source || "claude-code", cat,
        identity_key || null, expires_at || null, filePath,
      );

      if (sourceFilesJson !== null && ctx.stmts.updateSourceFiles) {
        // updateSourceFiles is keyed by id, so resolve the id of the row that
        // owns this file path (a single lookup is enough).
        const idRow = ctx.db
          .prepare("SELECT id FROM vault WHERE file_path = ?")
          .get(filePath) as { id: string } | undefined;
        if (idRow) ctx.stmts.updateSourceFiles.run(sourceFilesJson, idRow.id);
      }
      wasUpdate = true;
    }
  }

  // Updated rows are located by file path; freshly inserted rows by id.
  const rowidResult = wasUpdate
    ? (ctx.stmts.getRowidByPath.get(filePath) as { rowid: number } | undefined)
    : (ctx.stmts.getRowid.get(id) as { rowid: number } | undefined);

  if (!rowidResult || rowidResult.rowid == null) {
    throw new Error(
      `Could not find rowid for entry: ${wasUpdate ? `file_path=${filePath}` : `id=${id}`}`,
    );
  }

  const rowid = Number(rowidResult.rowid);
  if (!Number.isFinite(rowid) || rowid < 1) {
    throw new Error(
      `Invalid rowid retrieved: ${rowidResult.rowid} (type: ${typeof rowidResult.rowid})`,
    );
  }

  // Events are stored without a vector.
  if (cat !== "event") {
    const embedding = precomputedEmbedding !== undefined
      ? precomputedEmbedding
      : await ctx.embed([title, body].filter(Boolean).join(" "));

    if (embedding) {
      // Best-effort removal of a previous vector before writing the new one.
      try { ctx.deleteVec(rowid); } catch { /* no-op */ }
      ctx.insertVec(rowid, embedding);
    }
  }
}
106
+
107
/**
 * Delete every vault row whose `expires_at` has passed, together with its
 * backing file (if any) and its vector.
 *
 * Both sides of the expiry comparison go through SQLite's `datetime()`.
 * A raw text comparison against `datetime('now')` is wrong for ISO-8601
 * values such as `2025-06-01T08:00:00.000Z`: the `T` sorts after the space
 * in `2025-06-01 12:00:00`, so such an entry would not be treated as expired
 * until the following day.
 *
 * File and vector removal are best-effort; failures there do not stop the
 * row from being deleted.
 *
 * @param ctx Database handle, prepared statements and vector helpers.
 * @returns Number of expired rows that were selected for removal.
 */
export async function pruneExpired(ctx: BaseCtx): Promise<number> {
  const expired = ctx.db
    .prepare(
      "SELECT id, file_path FROM vault WHERE expires_at IS NOT NULL AND datetime(expires_at) <= datetime('now')",
    )
    .all() as { id: string; file_path: string | null }[];

  for (const row of expired) {
    if (row.file_path) {
      // The file may already be gone; that is not an error here.
      try { unlinkSync(row.file_path); } catch { /* best-effort */ }
    }

    const vRowid = (ctx.stmts.getRowid.get(row.id) as { rowid: number } | undefined)?.rowid;
    if (vRowid) {
      try { ctx.deleteVec(Number(vRowid)); } catch { /* best-effort */ }
    }

    ctx.stmts.deleteEntry.run(row.id);
  }

  return expired.length;
}
127
+
128
/**
 * Synchronise the `vault` table with the markdown files under
 * `ctx.config.vaultDir`.
 *
 * Directory discovery: every top-level directory that is not excluded and
 * does not start with "_" is a kind directory, except directories listed in
 * CATEGORY_DIRS, whose sub-directories are the kind directories.
 *
 * Inside one transaction, per kind:
 *  - files missing from the DB are inserted (INSERT OR IGNORE);
 *  - with `fullSync`, files already in the DB are compared on title, body,
 *    tags, meta and related_to and updated when any differ;
 *  - with `fullSync`, DB rows whose file is no longer on disk are deleted.
 * Then, with `fullSync`, rows of kinds that have no directory at all are
 * deleted, and finally expired rows (and their files) are removed.
 *
 * Embeddings for added or changed non-event rows are computed in batches
 * after the transaction has been committed.
 *
 * @param ctx Database handle, prepared statements, config and vector helpers.
 * @param opts.fullSync When false, files already present in the DB are
 *   counted as unchanged without being read, and nothing is deleted for
 *   missing files. Defaults to true.
 * @returns Counters of added, updated, removed and unchanged entries.
 * @throws Re-throws any error raised inside the transaction after attempting
 *   a rollback.
 */
export async function reindex(
  ctx: BaseCtx,
  opts: { fullSync?: boolean } = {},
): Promise<ReindexStats> {
  const { fullSync = true } = opts;
  const stats: ReindexStats = { added: 0, updated: 0, removed: 0, unchanged: 0 };

  if (!existsSync(ctx.config.vaultDir)) return stats;

  // Prepared once here instead of once per kind directory inside the loop.
  const upsertEntry = ctx.db.prepare(
    `INSERT OR IGNORE INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  );
  const selectRowsByKind = ctx.db.prepare(
    "SELECT id, file_path, body, title, tags, meta, related_to FROM vault WHERE kind = ?",
  );

  const kindEntries: { kind: string; dir: string }[] = [];
  const topDirs = readdirSync(ctx.config.vaultDir, {
    withFileTypes: true,
  }).filter(
    (d) =>
      d.isDirectory() && !EXCLUDED_DIRS.has(d.name) && !d.name.startsWith("_"),
  );

  for (const d of topDirs) {
    if (CATEGORY_DIRS.has(d.name)) {
      // Category directory: each visible sub-directory is one kind.
      const catDir = join(ctx.config.vaultDir, d.name);
      const subDirs = readdirSync(catDir, { withFileTypes: true }).filter(
        (sd) => sd.isDirectory() && !sd.name.startsWith("_"),
      );
      for (const sd of subDirs) {
        kindEntries.push({
          kind: dirToKind(sd.name),
          dir: join(catDir, sd.name),
        });
      }
    } else {
      kindEntries.push({
        kind: dirToKind(d.name),
        dir: join(ctx.config.vaultDir, d.name),
      });
    }
  }

  // Embedding work is queued and performed after COMMIT.
  const pendingEmbeds: { rowid: number; text: string }[] = [];

  ctx.db.exec("BEGIN");
  try {
    for (const { kind, dir } of kindEntries) {
      const category = categoryFor(kind);
      const mdFiles = walkDir(dir).filter(
        (f) => !EXCLUDED_FILES.has(basename(f.filePath)),
      );

      const dbRows = selectRowsByKind.all(kind) as Record<string, unknown>[];
      const dbByPath = new Map(dbRows.map((r) => [r.file_path as string, r]));
      const diskPaths = new Set(mdFiles.map((e) => e.filePath));

      for (const { filePath, relDir } of mdFiles) {
        const existing = dbByPath.get(filePath);

        // Incremental mode: known files are not re-read.
        if (!fullSync && existing) {
          stats.unchanged++;
          continue;
        }

        const raw = readFileSync(filePath, "utf-8");
        if (!raw.startsWith("---\n")) {
          console.error(`[reindex] skipping (no frontmatter): ${filePath}`);
          continue;
        }
        const { meta: fmMeta, body: rawBody } = parseFrontmatter(raw);
        const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);

        const identity_key = (fmMeta.identity_key as string) || null;
        const expires_at = (fmMeta.expires_at as string) || null;
        const related_to = Array.isArray(fmMeta.related_to)
          ? (fmMeta.related_to as string[])
          : null;
        const relatedToJson = related_to?.length
          ? JSON.stringify(related_to)
          : null;

        // `folder` always reflects the file's current relative directory.
        const meta: Record<string, unknown> = { ...(parsed.meta || {}) };
        if (relDir) meta.folder = relDir;
        else delete meta.folder;
        const metaJson = Object.keys(meta).length ? JSON.stringify(meta) : null;

        const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;

        if (!existing) {
          const id = (fmMeta.id as string) || ulid();
          const created = (fmMeta.created as string) || new Date().toISOString();

          const result = upsertEntry.run(
            id, kind, category, parsed.title || null, parsed.body,
            metaJson, tagsJson, (fmMeta.source as string) || "file",
            filePath, identity_key, expires_at,
            created, (fmMeta.updated as string) || created,
          );

          // changes === 0 means INSERT OR IGNORE hit an existing row.
          if ((result as { changes: number }).changes > 0) {
            if (relatedToJson && ctx.stmts.updateRelatedTo) {
              ctx.stmts.updateRelatedTo.run(relatedToJson, id);
            }
            if (category !== "event") {
              const rowidResult = ctx.stmts.getRowid.get(id) as { rowid: number } | undefined;
              if (rowidResult?.rowid) {
                const embeddingText = [parsed.title, parsed.body]
                  .filter(Boolean)
                  .join(" ");
                pendingEmbeds.push({
                  rowid: rowidResult.rowid,
                  text: embeddingText,
                });
              }
            }
            stats.added++;
          } else {
            stats.unchanged++;
          }
        } else {
          // Only reachable with fullSync === true: the incremental case for
          // existing rows already `continue`d above.
          const titleChanged = (parsed.title || null) !== ((existing.title as string) || null);
          const bodyChanged = (existing.body as string) !== parsed.body;
          const tagsChanged = tagsJson !== ((existing.tags as string) || null);
          const metaChanged = metaJson !== ((existing.meta as string) || null);
          const relatedToChanged = relatedToJson !== ((existing.related_to as string) || null);

          if (bodyChanged || titleChanged || tagsChanged || metaChanged || relatedToChanged) {
            ctx.stmts.updateEntry.run(
              parsed.title || null, parsed.body, metaJson, tagsJson,
              (fmMeta.source as string) || "file", category,
              identity_key, expires_at, filePath,
            );
            if (relatedToChanged && ctx.stmts.updateRelatedTo) {
              ctx.stmts.updateRelatedTo.run(relatedToJson, existing.id as string);
            }

            // Only text changes invalidate the stored vector.
            if ((bodyChanged || titleChanged) && category !== "event") {
              const rowid = (ctx.stmts.getRowid.get(existing.id as string) as { rowid: number } | undefined)?.rowid;
              if (rowid) {
                const embeddingText = [parsed.title, parsed.body]
                  .filter(Boolean)
                  .join(" ");
                pendingEmbeds.push({ rowid, text: embeddingText });
              }
            }
            stats.updated++;
          } else {
            stats.unchanged++;
          }
        }
      }

      if (fullSync) {
        // Rows of this kind whose file no longer exists on disk.
        for (const [dbPath, row] of dbByPath) {
          if (!diskPaths.has(dbPath)) {
            const vRowid = (ctx.stmts.getRowid.get(row.id as string) as { rowid: number } | undefined)?.rowid;
            if (vRowid) {
              try { ctx.deleteVec(vRowid); } catch { /* best-effort */ }
            }
            ctx.stmts.deleteEntry.run(row.id as string);
            stats.removed++;
          }
        }
      }
    }

    if (fullSync) {
      // Kinds present in the DB that have no directory on disk at all.
      const indexedKinds = new Set(kindEntries.map((ke) => ke.kind));
      const allDbKinds = ctx.db
        .prepare("SELECT DISTINCT kind FROM vault")
        .all() as { kind: string }[];
      const selectOrphans = ctx.db.prepare("SELECT id, rowid FROM vault WHERE kind = ?");
      for (const { kind } of allDbKinds) {
        if (!indexedKinds.has(kind)) {
          const orphaned = selectOrphans.all(kind) as { id: string; rowid: number }[];
          for (const row of orphaned) {
            try { ctx.deleteVec(row.rowid); } catch { /* best-effort */ }
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    // Both sides are normalised with datetime(): comparing raw ISO-8601 text
    // ("...T08:00:00.000Z") with datetime('now') ("... 12:00:00") orders the
    // "T" after the space and would keep same-day expired rows alive.
    const expired = ctx.db
      .prepare(
        "SELECT id, file_path FROM vault WHERE expires_at IS NOT NULL AND datetime(expires_at) <= datetime('now')",
      )
      .all() as { id: string; file_path: string | null }[];

    for (const row of expired) {
      if (row.file_path) {
        try { unlinkSync(row.file_path); } catch { /* best-effort */ }
      }
      const vRowid = (ctx.stmts.getRowid.get(row.id) as { rowid: number } | undefined)?.rowid;
      if (vRowid) {
        try { ctx.deleteVec(Number(vRowid)); } catch { /* best-effort */ }
      }
      ctx.stmts.deleteEntry.run(row.id);
      stats.removed++;
    }

    ctx.db.exec("COMMIT");
  } catch (e) {
    // A failing ROLLBACK must not replace the error that caused it.
    try { ctx.db.exec("ROLLBACK"); } catch { /* surface the original error */ }
    throw e;
  }

  for (let i = 0; i < pendingEmbeds.length; i += EMBED_BATCH_SIZE) {
    const batch = pendingEmbeds.slice(i, i + EMBED_BATCH_SIZE);
    const embeddings = await embedBatch(batch.map((e) => e.text));
    for (let j = 0; j < batch.length; j++) {
      if (embeddings[j]) {
        try { ctx.deleteVec(batch[j].rowid); } catch { /* best-effort */ }
        ctx.insertVec(batch[j].rowid, embeddings[j]!);
      }
    }
  }

  return stats;
}
package/src/ingest-url.ts ADDED
@@ -0,0 +1,99 @@
1
/**
 * Convert an HTML fragment to a rough Markdown rendering using regular
 * expressions (no DOM parsing).
 *
 * Steps, in order: drop script/style/nav/header/footer/aside containers,
 * convert headings, links, code, bold/italic, list items, line breaks,
 * paragraphs and blockquotes, strip whatever tags remain, decode entities
 * once, and collapse runs of three or more newlines.
 *
 * Details worth knowing:
 *  - Every tag pattern ends the tag name with `\b`, so `<br>` is not taken
 *    for `<b>`, `<img>` for `<i>`, `<link>` for `<li>`, or `<pre>` for `<p>`.
 *  - Code content stays entity-encoded until the single decode pass at the
 *    end. Decoding it earlier would turn `&lt;b&gt;` into a real tag that the
 *    later bold/strip passes then rewrite or delete.
 *  - `href` values may be double- or single-quoted.
 *
 * @param html HTML source text.
 * @returns Markdown text, trimmed.
 */
export function htmlToMarkdown(html: string): string {
  let md = html;

  // Containers that never hold article content are removed with their body.
  const droppedContainers = ["script", "style", "nav", "header", "footer", "aside"];
  for (const tag of droppedContainers) {
    md = md.replace(new RegExp(`<${tag}\\b[\\s\\S]*?<\\/${tag}>`, "gi"), "");
  }

  // <h1>..<h6> become 1..6 leading '#'.
  md = md.replace(
    /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_, level: string, c: string) => `\n${"#".repeat(Number(level))} ${stripTags(c).trim()}\n`,
  );

  // Links with no visible text are dropped entirely.
  md = md.replace(
    /<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
    (_, doubleQuoted: string | undefined, singleQuoted: string | undefined, text: string) => {
      const cleanText = stripTags(text).trim();
      const href = doubleQuoted ?? singleQuoted ?? "";
      return cleanText ? `[${cleanText}](${href})` : "";
    },
  );

  // Code: strip markup (e.g. highlighter spans) but leave entities encoded;
  // they are decoded exactly once by the final pass below.
  md = md.replace(
    /<pre\b[^>]*>([\s\S]*?)<\/pre>/gi,
    (_, c: string) => `\n\`\`\`\n${stripTags(c).trim()}\n\`\`\`\n`,
  );
  md = md.replace(
    /<code\b[^>]*>([\s\S]*?)<\/code>/gi,
    (_, c: string) => `\`${stripTags(c).trim()}\``,
  );

  md = md.replace(
    /<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi,
    (_, __: string, c: string) => `**${stripTags(c).trim()}**`,
  );
  md = md.replace(
    /<(em|i)\b[^>]*>([\s\S]*?)<\/\1>/gi,
    (_, __: string, c: string) => `*${stripTags(c).trim()}*`,
  );
  md = md.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_, c: string) => `- ${stripTags(c).trim()}\n`);
  md = md.replace(/<br\s*\/?>/gi, "\n");
  md = md.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, (_, c: string) => `\n${stripTags(c).trim()}\n`);
  md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (_, c: string) => {
    const quoted = stripTags(c)
      .trim()
      .split("\n")
      .map((l: string) => `> ${l}`)
      .join("\n");
    return `\n${quoted}\n`;
  });

  md = stripTags(md);
  md = decodeEntities(md);
  return md.replace(/\n{3,}/g, "\n\n").trim();
}
35
+
36
/**
 * Remove HTML comments and tags from a string, leaving text content.
 *
 * Comments are removed first and as a whole: a comment such as
 * `<!-- a > b -->` contains a `>`, so the generic tag pattern alone would
 * stop early and leave ` b -->` behind in the text.
 *
 * @param html Text that may contain HTML markup.
 * @returns The input with comments and `<...>` tags removed.
 */
function stripTags(html: string): string {
  return html
    .replace(/<!--[\s\S]*?-->/g, "")
    .replace(/<[^>]+>/g, "");
}
39
+
40
/**
 * Decode the HTML entities this module understands: `&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&apos;`, `&nbsp;` (to a plain space) and decimal / hexadecimal
 * numeric references.
 *
 * Decoding happens in a single pass, so already-decoded output is never
 * re-scanned: `&amp;lt;` yields `&lt;`, not `<`. Numeric references use
 * `String.fromCodePoint`, which handles code points above U+FFFF; references
 * outside the Unicode range and unknown named entities are left untouched.
 *
 * @param text Text containing HTML entities.
 * @returns The text with supported entities replaced.
 */
function decodeEntities(text: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    nbsp: " ",
  };

  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g,
    (match: string, dec?: string, hex?: string, name?: string) => {
      if (name !== undefined) return named[name] ?? match;

      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // fromCodePoint throws a RangeError beyond U+10FFFF; keep such input verbatim.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
      return String.fromCodePoint(codePoint);
    },
  );
}
47
+
48
/**
 * Pull a title and a Markdown body out of an HTML document.
 *
 * The title comes from the first `<title>` element. Tags are stripped before
 * entities are decoded, so an escaped `&lt;div&gt;` in a title survives as
 * literal text instead of being removed as markup. Whitespace runs in the
 * title are collapsed to single spaces.
 *
 * The body is taken from the first `<article>`, else the first `<main>`,
 * else `<body>`, else the whole input, and converted with htmlToMarkdown().
 *
 * @param html Full HTML source.
 * @param _url Source URL; currently unused.
 * @returns The extracted title ("" when no `<title>` is present) and body.
 */
export function extractHtmlContent(html: string, _url: string): { title: string; body: string } {
  const titleMatch = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch
    ? decodeEntities(stripTags(titleMatch[1])).replace(/\s+/g, " ").trim()
    : "";

  // Prefer the most specific content container available.
  const container =
    html.match(/<article\b[^>]*>([\s\S]*?)<\/article>/i) ??
    html.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i) ??
    html.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
  const contentHtml = container ? container[1] : html;

  return { title, body: htmlToMarkdown(contentHtml) };
}
63
+
64
/**
 * Fetch a URL and turn the response into the fields of a vault entry.
 *
 * HTML / XHTML responses are reduced to a title and Markdown body via
 * extractHtmlContent(); any other content type is stored verbatim with the
 * host name as title. Bodies longer than `maxBodyLength` are cut and
 * suffixed with a truncation marker.
 *
 * The abort timer covers both the request and the download of the body, so
 * a server that sends headers and then stalls cannot hang the call.
 *
 * @param url Absolute URL to fetch.
 * @param opts.kind Entry kind (default "reference").
 * @param opts.tags Extra tags; "web-import" is always appended.
 * @param opts.source Entry source (default: the URL's host name).
 * @param opts.maxBodyLength Maximum body length in characters (default 50000).
 * @param opts.timeoutMs Overall timeout in milliseconds (default 15000).
 * @returns Entry fields including `meta` with url, domain, fetched_at and content_type.
 * @throws Error for an unparsable URL, a timeout, a network failure, a
 *   non-2xx status, or when no readable content remains.
 */
export async function ingestUrl(
  url: string,
  opts: { kind?: string; tags?: string[]; source?: string; maxBodyLength?: number; timeoutMs?: number } = {},
): Promise<{ kind: string; title: string; body: string; tags: string[]; meta: Record<string, unknown>; source: string }> {
  const { kind = "reference", tags = [], source, maxBodyLength = 50000, timeoutMs = 15000 } = opts;

  let domain: string;
  try { domain = new URL(url).hostname; } catch { throw new Error(`Invalid URL: ${url}`); }

  const isAbort = (err: unknown): boolean =>
    typeof err === "object" && err !== null && (err as { name?: unknown }).name === "AbortError";

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  let contentType: string;
  let html: string;
  try {
    let response: Response;
    try {
      response = await fetch(url, {
        signal: controller.signal,
        headers: { "User-Agent": "ContextVault/1.0 (+https://github.com/fellanH/context-vault)", Accept: "text/html,application/xhtml+xml,text/plain,*/*" },
      });
    } catch (err) {
      if (isAbort(err)) throw new Error(`Request timed out after ${timeoutMs}ms`);
      throw new Error(`Fetch failed: ${err instanceof Error ? err.message : String(err)}`);
    }

    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    contentType = response.headers.get("content-type") || "";

    try {
      html = await response.text();
    } catch (err) {
      // The timer is still armed here, so an abort means the body stalled.
      if (isAbort(err)) throw new Error(`Request timed out after ${timeoutMs}ms`);
      throw err;
    }
  } finally {
    clearTimeout(timeout);
  }

  // Media types are case-insensitive.
  const mediaType = contentType.toLowerCase();
  let title: string;
  let body: string;
  if (mediaType.includes("text/html") || mediaType.includes("application/xhtml")) {
    const extracted = extractHtmlContent(html, url);
    title = extracted.title;
    body = extracted.body;
  } else {
    title = domain;
    body = html;
  }

  if (body.length > maxBodyLength) body = body.slice(0, maxBodyLength) + "\n\n[Content truncated]";
  if (!body.trim()) throw new Error("No readable content extracted from URL");

  return {
    kind, title: title || domain, body,
    tags: [...tags, "web-import"],
    meta: { url, domain, fetched_at: new Date().toISOString(), content_type: contentType.split(";")[0].trim() || "text/html" },
    source: source || domain,
  };
}
package/src/main.ts ADDED
@@ -0,0 +1,111 @@
1
+ // Types
2
+ export type {
3
+ VaultConfig,
4
+ RecallConfig,
5
+ ConsolidationConfig,
6
+ GrowthThresholds,
7
+ PreparedStatements,
8
+ VaultEntry,
9
+ SearchResult,
10
+ CaptureInput,
11
+ CaptureResult,
12
+ IndexEntryInput,
13
+ ReindexStats,
14
+ BaseCtx,
15
+ SearchOptions,
16
+ } from "./types.js";
17
+
18
+ // Constants
19
+ export {
20
+ APP_URL,
21
+ API_URL,
22
+ MARKETING_URL,
23
+ GITHUB_ISSUES_URL,
24
+ MAX_BODY_LENGTH,
25
+ MAX_TITLE_LENGTH,
26
+ MAX_KIND_LENGTH,
27
+ MAX_TAG_LENGTH,
28
+ MAX_TAGS_COUNT,
29
+ MAX_META_LENGTH,
30
+ MAX_SOURCE_LENGTH,
31
+ MAX_IDENTITY_KEY_LENGTH,
32
+ DEFAULT_GROWTH_THRESHOLDS,
33
+ DEFAULT_LIFECYCLE,
34
+ } from "./constants.js";
35
+
36
+ // Categories
37
+ export {
38
+ categoryFor,
39
+ categoryDirFor,
40
+ defaultTierFor,
41
+ CATEGORY_DIRS,
42
+ KIND_STALENESS_DAYS,
43
+ } from "./categories.js";
44
+
45
+ // Config
46
+ export { parseArgs, resolveConfig } from "./config.js";
47
+
48
+ // Files
49
+ export {
50
+ ulid,
51
+ slugify,
52
+ kindToDir,
53
+ dirToKind,
54
+ normalizeKind,
55
+ kindToPath,
56
+ safeJoin,
57
+ walkDir,
58
+ } from "./files.js";
59
+
60
+ // Frontmatter
61
+ export {
62
+ formatFrontmatter,
63
+ parseFrontmatter,
64
+ extractCustomMeta,
65
+ parseEntryFromMarkdown,
66
+ } from "./frontmatter.js";
67
+
68
+ // Formatters
69
+ export { formatBody } from "./formatters.js";
70
+
71
+ // Database
72
+ export {
73
+ SCHEMA_DDL,
74
+ NativeModuleError,
75
+ initDatabase,
76
+ prepareStatements,
77
+ insertVec,
78
+ deleteVec,
79
+ testConnection,
80
+ } from "./db.js";
81
+
82
+ // Embeddings
83
+ export { embed, embedBatch, resetEmbedPipeline, isEmbedAvailable } from "./embed.js";
84
+
85
+ // Index (reindex + indexEntry)
86
+ export { indexEntry, reindex, pruneExpired } from "./index.js";
87
+
88
+ // Search (retrieve)
89
+ export {
90
+ hybridSearch,
91
+ buildFtsQuery,
92
+ buildFilterClauses,
93
+ recencyBoost,
94
+ recencyDecayScore,
95
+ dotProduct,
96
+ reciprocalRankFusion,
97
+ } from "./search.js";
98
+
99
+ // Capture
100
+ export {
101
+ writeEntry,
102
+ updateEntryFile,
103
+ captureAndIndex,
104
+ } from "./capture.js";
105
+
106
+ // Ingest URL
107
+ export {
108
+ htmlToMarkdown,
109
+ extractHtmlContent,
110
+ ingestUrl,
111
+ } from "./ingest-url.js";