@context-vault/core 2.17.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/dist/capture.d.ts +21 -0
  2. package/dist/capture.d.ts.map +1 -0
  3. package/dist/capture.js +269 -0
  4. package/dist/capture.js.map +1 -0
  5. package/dist/categories.d.ts +6 -0
  6. package/dist/categories.d.ts.map +1 -0
  7. package/dist/categories.js +50 -0
  8. package/dist/categories.js.map +1 -0
  9. package/dist/config.d.ts +4 -0
  10. package/dist/config.d.ts.map +1 -0
  11. package/dist/config.js +190 -0
  12. package/dist/config.js.map +1 -0
  13. package/dist/constants.d.ts +33 -0
  14. package/dist/constants.d.ts.map +1 -0
  15. package/dist/constants.js +23 -0
  16. package/dist/constants.js.map +1 -0
  17. package/dist/db.d.ts +13 -0
  18. package/dist/db.d.ts.map +1 -0
  19. package/dist/db.js +191 -0
  20. package/dist/db.js.map +1 -0
  21. package/dist/embed.d.ts +5 -0
  22. package/dist/embed.d.ts.map +1 -0
  23. package/dist/embed.js +78 -0
  24. package/dist/embed.js.map +1 -0
  25. package/dist/files.d.ts +13 -0
  26. package/dist/files.d.ts.map +1 -0
  27. package/dist/files.js +66 -0
  28. package/dist/files.js.map +1 -0
  29. package/dist/formatters.d.ts +8 -0
  30. package/dist/formatters.d.ts.map +1 -0
  31. package/dist/formatters.js +18 -0
  32. package/dist/formatters.js.map +1 -0
  33. package/dist/frontmatter.d.ts +12 -0
  34. package/dist/frontmatter.d.ts.map +1 -0
  35. package/dist/frontmatter.js +101 -0
  36. package/dist/frontmatter.js.map +1 -0
  37. package/dist/index.d.ts +10 -0
  38. package/dist/index.d.ts.map +1 -0
  39. package/dist/index.js +297 -0
  40. package/dist/index.js.map +1 -0
  41. package/dist/ingest-url.d.ts +20 -0
  42. package/dist/ingest-url.d.ts.map +1 -0
  43. package/dist/ingest-url.js +113 -0
  44. package/dist/ingest-url.js.map +1 -0
  45. package/dist/main.d.ts +14 -0
  46. package/dist/main.d.ts.map +1 -0
  47. package/dist/main.js +25 -0
  48. package/dist/main.js.map +1 -0
  49. package/dist/search.d.ts +18 -0
  50. package/dist/search.d.ts.map +1 -0
  51. package/dist/search.js +238 -0
  52. package/dist/search.js.map +1 -0
  53. package/dist/types.d.ts +176 -0
  54. package/dist/types.d.ts.map +1 -0
  55. package/dist/types.js +2 -0
  56. package/dist/types.js.map +1 -0
  57. package/package.json +66 -17
  58. package/src/capture.ts +308 -0
  59. package/src/categories.ts +54 -0
  60. package/src/{core/config.js → config.ts} +34 -33
  61. package/src/{constants.js → constants.ts} +6 -3
  62. package/src/db.ts +229 -0
  63. package/src/{index/embed.js → embed.ts} +10 -35
  64. package/src/files.ts +80 -0
  65. package/src/{capture/formatters.js → formatters.ts} +13 -11
  66. package/src/{core/frontmatter.js → frontmatter.ts} +27 -33
  67. package/src/index.ts +351 -0
  68. package/src/ingest-url.ts +99 -0
  69. package/src/main.ts +111 -0
  70. package/src/search.ts +285 -0
  71. package/src/types.ts +166 -0
  72. package/src/capture/file-ops.js +0 -97
  73. package/src/capture/import-pipeline.js +0 -46
  74. package/src/capture/importers.js +0 -387
  75. package/src/capture/index.js +0 -236
  76. package/src/capture/ingest-url.js +0 -252
  77. package/src/consolidation/index.js +0 -112
  78. package/src/core/categories.js +0 -72
  79. package/src/core/error-log.js +0 -54
  80. package/src/core/files.js +0 -108
  81. package/src/core/status.js +0 -350
  82. package/src/core/telemetry.js +0 -90
  83. package/src/index/db.js +0 -416
  84. package/src/index/index.js +0 -522
  85. package/src/index.js +0 -66
  86. package/src/retrieve/index.js +0 -500
  87. package/src/server/helpers.js +0 -44
  88. package/src/server/tools/clear-context.js +0 -47
  89. package/src/server/tools/context-status.js +0 -182
  90. package/src/server/tools/create-snapshot.js +0 -231
  91. package/src/server/tools/delete-context.js +0 -60
  92. package/src/server/tools/get-context.js +0 -678
  93. package/src/server/tools/ingest-project.js +0 -244
  94. package/src/server/tools/ingest-url.js +0 -88
  95. package/src/server/tools/list-buckets.js +0 -116
  96. package/src/server/tools/list-context.js +0 -163
  97. package/src/server/tools/save-context.js +0 -609
  98. package/src/server/tools/session-start.js +0 -285
  99. package/src/server/tools/submit-feedback.js +0 -55
  100. package/src/server/tools.js +0 -174
  101. package/src/sync/sync.js +0 -235
package/src/index.ts ADDED
@@ -0,0 +1,351 @@
1
+ import { readFileSync, readdirSync, existsSync, unlinkSync } from "node:fs";
2
+ import { join, basename } from "node:path";
3
+ import { dirToKind, walkDir, ulid } from "./files.js";
4
+ import { categoryFor, defaultTierFor, CATEGORY_DIRS } from "./categories.js";
5
+ import { parseFrontmatter, parseEntryFromMarkdown } from "./frontmatter.js";
6
+ import { embedBatch } from "./embed.js";
7
+ import type { BaseCtx, IndexEntryInput, ReindexStats } from "./types.js";
8
+
9
// Top-level vault directory names that reindex() filters out when it lists kind directories.
+ const EXCLUDED_DIRS = new Set(["projects", "_archive"]);
10
// File basenames that reindex() drops from every directory walk before indexing.
+ const EXCLUDED_FILES = new Set(["context.md", "memory.md", "README.md"]);
11
// Number of texts handed to embedBatch() per call in reindex()'s final embedding loop.
+ const EMBED_BATCH_SIZE = 32;
12
+
13
/**
 * Write a single entry into the `vault` table and refresh its embedding vector.
 *
 * Steps, in order:
 *  1. Return without doing anything when `expires_at` is already in the past.
 *  2. For an `entity` entry carrying an `identity_key`, update the existing row
 *     with the same (kind, identity_key) if `getByIdentityKey` finds one.
 *  3. Otherwise insert a new row; when the insert fails with a message
 *     containing "UNIQUE constraint", update the row owning `filePath` instead.
 *  4. Resolve the row's rowid (by file path after an update, by id after an insert).
 *  5. Unless the category is `event`, embed `title + " " + body` and replace the
 *     vector stored for that rowid.
 *
 * `supersedes` and `related_to` are accepted by the parameter type but are not
 * read by this function.
 *
 * @param ctx   Context providing `stmts`, `db`, `embed`, `insertVec`, `deleteVec`.
 * @param entry Entry to index. An empty `category` / `tier` falls back to
 *              `categoryFor(kind)` / `defaultTierFor(kind)`; an empty `source`
 *              falls back to "claude-code".
 * @throws Re-throws, unchanged, any insert failure that is not a UNIQUE-constraint
 *         violation. Throws `Error` when no valid rowid can be resolved.
 */
export async function indexEntry(
  ctx: BaseCtx,
  entry: IndexEntryInput & { supersedes?: string[] | null; related_to?: string[] | null },
): Promise<void> {
  const {
    id, kind, category, title, body, meta, tags, source,
    filePath, createdAt, identity_key, expires_at, source_files, tier,
  } = entry;

  // Never index something that has already expired.
  if (expires_at && new Date(expires_at) <= new Date()) return;

  const tagsJson = tags ? JSON.stringify(tags) : null;
  const metaJson = meta ? JSON.stringify(meta) : null;
  const sourceFilesJson = source_files ? JSON.stringify(source_files) : null;
  const cat = category || categoryFor(kind);
  const effectiveTier = tier || defaultTierFor(kind);

  let wasUpdate = false;

  // Entities are keyed by (kind, identity_key): update in place when one exists.
  if (cat === "entity" && identity_key) {
    const existing = ctx.stmts.getByIdentityKey.get(kind, identity_key) as Record<string, unknown> | undefined;
    if (existing) {
      ctx.stmts.upsertByIdentityKey.run(
        title || null, body, metaJson, tagsJson,
        source || "claude-code", cat, filePath,
        expires_at || null, sourceFilesJson,
        kind, identity_key,
      );
      wasUpdate = true;
    }
  }

  if (!wasUpdate) {
    try {
      ctx.stmts.insertEntry.run(
        id, kind, cat, title || null, body, metaJson, tagsJson,
        source || "claude-code", filePath,
        identity_key || null, expires_at || null,
        createdAt, createdAt, sourceFilesJson, effectiveTier,
      );
    } catch (e) {
      // Inspect the message defensively: a thrown value without a string
      // `message` must propagate as-is instead of triggering a TypeError here.
      const message = (e as { message?: unknown } | null | undefined)?.message;
      const isUniqueViolation = typeof message === "string" && message.includes("UNIQUE constraint");
      if (!isUniqueViolation) throw e;

      ctx.stmts.updateEntry.run(
        title || null, body, metaJson, tagsJson,
        source || "claude-code", cat,
        identity_key || null, expires_at || null, filePath,
      );
      // updateEntry is not passed source_files; write them separately when the
      // optional statement is available.
      if (sourceFilesJson !== null && ctx.stmts.updateSourceFiles) {
        const entryRow = ctx.stmts.getRowidByPath.get(filePath) as { rowid: number } | undefined;
        if (entryRow) {
          const idRow = ctx.db
            .prepare("SELECT id FROM vault WHERE file_path = ?")
            .get(filePath) as { id: string } | undefined;
          if (idRow) ctx.stmts.updateSourceFiles.run(sourceFilesJson, idRow.id);
        }
      }
      wasUpdate = true;
    }
  }

  // After an update the row may carry a different id than `entry.id`, so look
  // it up by file path; a fresh insert is looked up by id.
  const rowidResult = wasUpdate
    ? (ctx.stmts.getRowidByPath.get(filePath) as { rowid: number } | undefined)
    : (ctx.stmts.getRowid.get(id) as { rowid: number } | undefined);

  if (!rowidResult || rowidResult.rowid == null) {
    throw new Error(
      `Could not find rowid for entry: ${wasUpdate ? `file_path=${filePath}` : `id=${id}`}`,
    );
  }

  const rowid = Number(rowidResult.rowid);
  if (!Number.isFinite(rowid) || rowid < 1) {
    throw new Error(
      `Invalid rowid retrieved: ${rowidResult.rowid} (type: ${typeof rowidResult.rowid})`,
    );
  }

  // Events are stored without an embedding.
  if (cat !== "event") {
    const embeddingText = [title, body].filter(Boolean).join(" ");
    const embedding = await ctx.embed(embeddingText);

    if (embedding) {
      // Drop any previous vector first; a missing one is not an error.
      try { ctx.deleteVec(rowid); } catch { /* no-op */ }
      ctx.insertVec(rowid, embedding);
    }
  }
}
104
+
105
/**
 * Delete every vault row whose `expires_at` has passed, together with its
 * vector and its backing file.
 *
 * `expires_at` is normalised through SQLite's `datetime()` before being compared
 * with `datetime('now')`. Comparing the raw text is wrong for ISO-8601 values
 * written with a `T` separator (e.g. the output of `Date#toISOString()`): the
 * `T` sorts after the space SQLite emits, so such a value would not be seen as
 * expired until the calendar date rolled over. Values `datetime()` cannot parse
 * fall back to the previous plain-text comparison via `COALESCE`.
 *
 * The database row is removed before the file so a failing delete does not
 * leave a row pointing at a file that is already gone.
 *
 * @param ctx Context providing `db`, `stmts.getRowid`, `stmts.deleteEntry`, `deleteVec`.
 * @returns Number of expired rows that were selected for removal.
 */
export async function pruneExpired(ctx: BaseCtx): Promise<number> {
  const expired = ctx.db
    .prepare(
      "SELECT id, file_path FROM vault WHERE expires_at IS NOT NULL AND COALESCE(datetime(expires_at), expires_at) <= datetime('now')",
    )
    .all() as { id: string; file_path: string | null }[];

  for (const { id, file_path } of expired) {
    const vRowid = (ctx.stmts.getRowid.get(id) as { rowid: number } | undefined)?.rowid;
    if (vRowid) {
      // Best effort: the row may never have had a vector.
      try { ctx.deleteVec(Number(vRowid)); } catch { /* no vector to remove */ }
    }
    ctx.stmts.deleteEntry.run(id);

    if (file_path) {
      // Best effort: the file may already have been removed by hand.
      try { unlinkSync(file_path); } catch { /* file already gone */ }
    }
  }

  return expired.length;
}
125
+
126
/**
 * Synchronise the `vault` table with the markdown files under `ctx.config.vaultDir`.
 *
 * Directory discovery: every top-level directory that is not excluded and does
 * not start with `_` is either a category directory (its sub-directories are
 * kinds) or a kind directory itself.
 *
 * Inside one transaction, per kind:
 *  - files not yet in the table are inserted (`INSERT OR IGNORE`);
 *  - with `fullSync`, files already in the table are compared on title, body,
 *    tags, meta and related_to and updated when any differ;
 *  - with `fullSync`, rows whose file no longer exists on disk are removed.
 * Then, with `fullSync`, rows of kinds that have no directory at all are
 * removed, and finally every expired row is removed.
 *
 * After the transaction commits, the files of expired rows are unlinked and the
 * queued texts are embedded in batches. File removal is deferred until after
 * COMMIT so that a rolled-back run never deletes files from disk.
 *
 * The expiry query normalises `expires_at` with SQLite's `datetime()`; raw text
 * comparison misjudges ISO-8601 values that use a `T` separator.
 *
 * @param ctx  Context providing `config.vaultDir`, `db`, `stmts`, `insertVec`, `deleteVec`.
 * @param opts `fullSync` (default `true`): when `false`, files already present
 *             in the table are counted as unchanged without being read.
 * @returns Counters for added, updated, removed and unchanged entries.
 * @throws Re-throws any error raised inside the transaction after rolling back.
 */
export async function reindex(
  ctx: BaseCtx,
  opts: { fullSync?: boolean } = {},
): Promise<ReindexStats> {
  const { fullSync = true } = opts;
  const stats: ReindexStats = { added: 0, updated: 0, removed: 0, unchanged: 0 };

  if (!existsSync(ctx.config.vaultDir)) return stats;

  const upsertEntry = ctx.db.prepare(
    `INSERT OR IGNORE INTO vault (id, kind, category, title, body, meta, tags, source, file_path, identity_key, expires_at, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  );

  // Resolve every directory that holds entries of one kind.
  const kindEntries: { kind: string; dir: string }[] = [];
  const topDirs = readdirSync(ctx.config.vaultDir, {
    withFileTypes: true,
  }).filter(
    (d) =>
      d.isDirectory() && !EXCLUDED_DIRS.has(d.name) && !d.name.startsWith("_"),
  );

  for (const d of topDirs) {
    if (CATEGORY_DIRS.has(d.name)) {
      // Category directory: each non-underscore sub-directory is a kind.
      const catDir = join(ctx.config.vaultDir, d.name);
      const subDirs = readdirSync(catDir, { withFileTypes: true }).filter(
        (sd) => sd.isDirectory() && !sd.name.startsWith("_"),
      );
      for (const sd of subDirs) {
        kindEntries.push({
          kind: dirToKind(sd.name),
          dir: join(catDir, sd.name),
        });
      }
    } else {
      kindEntries.push({
        kind: dirToKind(d.name),
        dir: join(ctx.config.vaultDir, d.name),
      });
    }
  }

  // Embedding is asynchronous, so it is queued here and run after COMMIT.
  const pendingEmbeds: { rowid: number; text: string }[] = [];
  // Files of expired rows; removed from disk only once the transaction commits.
  const expiredFiles: string[] = [];

  ctx.db.exec("BEGIN");
  try {
    for (const { kind, dir } of kindEntries) {
      const category = categoryFor(kind);
      const mdFiles = walkDir(dir).filter(
        (f) => !EXCLUDED_FILES.has(basename(f.filePath)),
      );

      const dbRows = ctx.db
        .prepare(
          "SELECT id, file_path, body, title, tags, meta, related_to FROM vault WHERE kind = ?",
        )
        .all(kind) as Record<string, unknown>[];
      const dbByPath = new Map(dbRows.map((r) => [r.file_path as string, r]));
      const diskPaths = new Set(mdFiles.map((e) => e.filePath));

      for (const { filePath, relDir } of mdFiles) {
        const existing = dbByPath.get(filePath);

        // Incremental mode trusts rows that already exist.
        if (!fullSync && existing) {
          stats.unchanged++;
          continue;
        }

        const raw = readFileSync(filePath, "utf-8");
        if (!raw.startsWith("---\n")) {
          console.error(`[reindex] skipping (no frontmatter): ${filePath}`);
          continue;
        }
        const { meta: fmMeta, body: rawBody } = parseFrontmatter(raw);
        const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);

        const identity_key = (fmMeta.identity_key as string) || null;
        const expires_at = (fmMeta.expires_at as string) || null;
        const related_to = Array.isArray(fmMeta.related_to)
          ? (fmMeta.related_to as string[])
          : null;
        const relatedToJson = related_to?.length
          ? JSON.stringify(related_to)
          : null;

        // `meta.folder` mirrors the file's directory relative to the kind dir.
        const meta: Record<string, unknown> = { ...(parsed.meta || {}) };
        if (relDir) meta.folder = relDir;
        else delete meta.folder;
        const metaJson = Object.keys(meta).length ? JSON.stringify(meta) : null;

        if (!existing) {
          const id = (fmMeta.id as string) || ulid();
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const created = (fmMeta.created as string) || new Date().toISOString();

          const result = upsertEntry.run(
            id, kind, category, parsed.title || null, parsed.body,
            metaJson, tagsJson, (fmMeta.source as string) || "file",
            filePath, identity_key, expires_at,
            created, (fmMeta.updated as string) || created,
          );
          // `changes` is 0 when INSERT OR IGNORE skipped the row.
          if ((result as { changes: number }).changes > 0) {
            if (relatedToJson && ctx.stmts.updateRelatedTo) {
              ctx.stmts.updateRelatedTo.run(relatedToJson, id);
            }
            if (category !== "event") {
              const rowidResult = ctx.stmts.getRowid.get(id) as { rowid: number } | undefined;
              if (rowidResult?.rowid) {
                const embeddingText = [parsed.title, parsed.body]
                  .filter(Boolean)
                  .join(" ");
                pendingEmbeds.push({
                  rowid: rowidResult.rowid,
                  text: embeddingText,
                });
              }
            }
            stats.added++;
          } else {
            stats.unchanged++;
          }
        } else if (fullSync) {
          const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
          const titleChanged = (parsed.title || null) !== ((existing.title as string) || null);
          const bodyChanged = (existing.body as string) !== parsed.body;
          const tagsChanged = tagsJson !== ((existing.tags as string) || null);
          const metaChanged = metaJson !== ((existing.meta as string) || null);
          const relatedToChanged = relatedToJson !== ((existing.related_to as string) || null);

          if (bodyChanged || titleChanged || tagsChanged || metaChanged || relatedToChanged) {
            ctx.stmts.updateEntry.run(
              parsed.title || null, parsed.body, metaJson, tagsJson,
              (fmMeta.source as string) || "file", category,
              identity_key, expires_at, filePath,
            );
            if (relatedToChanged && ctx.stmts.updateRelatedTo) {
              ctx.stmts.updateRelatedTo.run(relatedToJson, existing.id as string);
            }

            // Only title/body feed the embedding, so only they trigger a re-embed.
            if ((bodyChanged || titleChanged) && category !== "event") {
              const rowid = (ctx.stmts.getRowid.get(existing.id as string) as { rowid: number } | undefined)?.rowid;
              if (rowid) {
                const embeddingText = [parsed.title, parsed.body]
                  .filter(Boolean)
                  .join(" ");
                pendingEmbeds.push({ rowid, text: embeddingText });
              }
            }
            stats.updated++;
          } else {
            stats.unchanged++;
          }
        } else {
          stats.unchanged++;
        }
      }

      // Rows of this kind whose file disappeared from disk.
      if (fullSync) {
        for (const [dbPath, row] of dbByPath) {
          if (!diskPaths.has(dbPath)) {
            const vRowid = (ctx.stmts.getRowid.get(row.id as string) as { rowid: number } | undefined)?.rowid;
            if (vRowid) {
              try { ctx.deleteVec(vRowid); } catch { /* no vector to remove */ }
            }
            ctx.stmts.deleteEntry.run(row.id as string);
            stats.removed++;
          }
        }
      }
    }

    // Rows whose kind has no directory on disk any more.
    if (fullSync) {
      const indexedKinds = new Set(kindEntries.map((ke) => ke.kind));
      const allDbKinds = ctx.db
        .prepare("SELECT DISTINCT kind FROM vault")
        .all() as { kind: string }[];
      for (const { kind } of allDbKinds) {
        if (!indexedKinds.has(kind)) {
          const orphaned = ctx.db
            .prepare("SELECT id, rowid FROM vault WHERE kind = ?")
            .all(kind) as { id: string; rowid: number }[];
          for (const row of orphaned) {
            try { ctx.deleteVec(row.rowid); } catch { /* no vector to remove */ }
            ctx.stmts.deleteEntry.run(row.id);
            stats.removed++;
          }
        }
      }
    }

    // Expired rows. datetime() normalises ISO-8601 text ("...T...Z") so it
    // compares correctly with datetime('now'); unparseable values keep the
    // previous plain-text comparison through COALESCE.
    const expired = ctx.db
      .prepare(
        "SELECT id, file_path FROM vault WHERE expires_at IS NOT NULL AND COALESCE(datetime(expires_at), expires_at) <= datetime('now')",
      )
      .all() as { id: string; file_path: string | null }[];

    for (const row of expired) {
      if (row.file_path) expiredFiles.push(row.file_path);
      const vRowid = (ctx.stmts.getRowid.get(row.id) as { rowid: number } | undefined)?.rowid;
      if (vRowid) {
        try { ctx.deleteVec(Number(vRowid)); } catch { /* no vector to remove */ }
      }
      ctx.stmts.deleteEntry.run(row.id);
      stats.removed++;
    }

    ctx.db.exec("COMMIT");
  } catch (e) {
    // A failing ROLLBACK must not hide the error that caused it.
    try { ctx.db.exec("ROLLBACK"); } catch { /* keep the original error */ }
    throw e;
  }

  // The transaction is durable now; removing the files is best effort.
  for (const file of expiredFiles) {
    try { unlinkSync(file); } catch { /* file already gone */ }
  }

  for (let i = 0; i < pendingEmbeds.length; i += EMBED_BATCH_SIZE) {
    const batch = pendingEmbeds.slice(i, i + EMBED_BATCH_SIZE);
    const embeddings = await embedBatch(batch.map((e) => e.text));
    for (let j = 0; j < batch.length; j++) {
      if (embeddings[j]) {
        try { ctx.deleteVec(batch[j].rowid); } catch { /* no previous vector */ }
        ctx.insertVec(batch[j].rowid, embeddings[j]!);
      }
    }
  }

  return stats;
}
package/src/ingest-url.ts ADDED
@@ -0,0 +1,99 @@
1
/**
 * Convert an HTML fragment into an approximate Markdown rendering using a fixed
 * sequence of regex passes.
 *
 * Pass order: drop script/style/nav/header/footer/aside elements; headings;
 * links; fenced and inline code; bold; italic; list items; line breaks;
 * paragraphs; blockquotes; strip any remaining tags; decode entities; collapse
 * runs of three or more newlines and trim.
 *
 * Details that matter:
 *  - Tag names are matched exactly (`<b>` never matches `<br>` or `<body>`,
 *    `<i>` never matches `<img>`, `<p>` never matches `<path>`, and so on).
 *  - Entities inside code are decoded only by the final pass, after tag
 *    stripping, so escaped markup in code samples survives as text.
 *  - `href` may be double- or single-quoted; anchors without an `href` or
 *    without visible text lose their tags and keep only their text.
 *
 * @param html HTML source to convert.
 * @returns Markdown text with surrounding whitespace trimmed.
 */
export function htmlToMarkdown(html: string): string {
  // Matches <name ...>content</name> where the tag name is exactly one of
  // `names` (the lookahead rejects longer names sharing the same prefix).
  // Capture 1 is the tag name, capture 2 the inner HTML.
  const element = (names: string): RegExp =>
    new RegExp(`<(${names})(?=[\\s/>])[^>]*>([\\s\\S]*?)<\\/\\1>`, "gi");
  const innerText = (raw: string): string => stripTags(raw).trim();

  let md = html;

  // Non-content regions are removed wholesale.
  for (const tag of ["script", "style", "nav", "header", "footer", "aside"]) {
    md = md.replace(new RegExp(`<${tag}[\\s\\S]*?<\\/${tag}>`, "gi"), "");
  }

  for (let level = 1; level <= 6; level++) {
    md = md.replace(
      element(`h${level}`),
      (_m: string, _tag: string, c: string) => `\n${"#".repeat(level)} ${innerText(c)}\n`,
    );
  }

  // href must be a standalone attribute (preceded by whitespace) of an <a> tag.
  md = md.replace(
    /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
    (_m: string, doubleQuoted: string | undefined, singleQuoted: string | undefined, text: string) => {
      const label = innerText(text);
      return label ? `[${label}](${doubleQuoted ?? singleQuoted ?? ""})` : "";
    },
  );

  // Code content is left entity-encoded here; the final decode handles it once,
  // after tag stripping, so `&lt;div&gt;` ends up as literal `<div>`.
  md = md.replace(
    /<pre(?=[\s>])[^>]*>\s*<code(?=[\s>])[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi,
    (_m: string, c: string) => `\n\`\`\`\n${c.trim()}\n\`\`\`\n`,
  );
  md = md.replace(
    /<pre(?=[\s>])[^>]*>([\s\S]*?)<\/pre>/gi,
    (_m: string, c: string) => `\n\`\`\`\n${innerText(c)}\n\`\`\`\n`,
  );
  md = md.replace(
    /<code(?=[\s>])[^>]*>([\s\S]*?)<\/code>/gi,
    (_m: string, c: string) => `\`${c.trim()}\``,
  );

  // Empty emphasis is dropped rather than emitted as "****" / "**".
  md = md.replace(element("strong|b"), (_m: string, _tag: string, c: string) => {
    const text = innerText(c);
    return text ? `**${text}**` : "";
  });
  md = md.replace(element("em|i"), (_m: string, _tag: string, c: string) => {
    const text = innerText(c);
    return text ? `*${text}*` : "";
  });

  md = md.replace(element("li"), (_m: string, _tag: string, c: string) => `- ${innerText(c)}\n`);
  md = md.replace(/<br\s*\/?>/gi, "\n");
  md = md.replace(element("p"), (_m: string, _tag: string, c: string) => `\n${innerText(c)}\n`);
  md = md.replace(element("blockquote"), (_m: string, _tag: string, c: string) => {
    const quoted = innerText(c)
      .split("\n")
      .map((line: string) => `> ${line}`)
      .join("\n");
    return `\n${quoted}\n`;
  });

  // Strip whatever markup is left, then decode entities exactly once.
  md = decodeEntities(stripTags(md));
  return md.replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Remove HTML comments and every `<...>` tag from `html`, keeping the text.
 * Comments are removed first because a `>` inside a comment would otherwise
 * end the tag match early and leave the rest of the comment in the output.
 */
function stripTags(html: string): string {
  return html.replace(/<!--[\s\S]*?-->/g, "").replace(/<[^>]+>/g, "");
}

/**
 * Decode the entities `&amp; &lt; &gt; &quot; &nbsp;` plus decimal (`&#39;`)
 * and hexadecimal (`&#x27;`) character references.
 *
 * Decoding happens in a single pass, so the output of one replacement is never
 * decoded again (`&amp;lt;` becomes `&lt;`, not `<`). `&nbsp;` becomes a plain
 * space. Numeric references outside the Unicode range are left untouched.
 */
function decodeEntities(text: string): string {
  const named: Record<string, string> = { amp: "&", lt: "<", gt: ">", quot: '"', nbsp: " " };
  return text.replace(
    /&(?:(amp|lt|gt|quot|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
    (match: string, name: string | undefined, dec: string | undefined, hex: string | undefined) => {
      if (name !== undefined) return named[name] ?? match;
      const code = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // fromCodePoint handles astral characters and throws on out-of-range input.
      return Number.isInteger(code) && code >= 0 && code <= 0x10ffff
        ? String.fromCodePoint(code)
        : match;
    },
  );
}
47
+
48
/**
 * Pull a title and a Markdown body out of a full HTML document.
 *
 * The title comes from the first `<title>` element: tags are stripped first and
 * entities decoded afterwards, so escaped markup in the title (for example
 * `&lt;div&gt;`) is kept as text instead of being removed as a tag. Runs of
 * whitespace are collapsed so the title is always a single line.
 *
 * The body is the Markdown conversion of the first match among `<article>`,
 * `<main>` and `<body>` (in that order of preference), or of the whole input
 * when none of them is present.
 *
 * @param html Raw HTML document.
 * @param _url Source URL; currently unused.
 * @returns `title` ("" when there is no `<title>`) and the Markdown `body`.
 */
export function extractHtmlContent(html: string, _url: string): { title: string; body: string } {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch
    ? decodeEntities(stripTags(titleMatch[1])).replace(/\s+/g, " ").trim()
    : "";

  // Prefer the most specific content container available.
  const container =
    html.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ??
    html.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ??
    html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  const contentHtml = container ? container[1] : html;

  return { title, body: htmlToMarkdown(contentHtml) };
}
63
+
64
/**
 * Fetch a URL and turn the response into the fields of a vault entry.
 *
 * HTML and XHTML responses go through `extractHtmlContent`; every other content
 * type is stored verbatim with the hostname as title. The content type is
 * matched case-insensitively and recorded in lower case.
 *
 * The timeout covers the whole download, including reading the body: the timer
 * is only cleared once `response.text()` has settled, so a server that sends
 * headers and then stalls still produces a timeout error.
 *
 * @param url  Absolute URL to fetch.
 * @param opts `kind` (default "reference"), extra `tags`, `source` (default:
 *             the hostname), `maxBodyLength` (default 50000 characters; longer
 *             bodies are cut and suffixed with "[Content truncated]") and
 *             `timeoutMs` (default 15000).
 * @returns Entry fields. `tags` always contains "web-import" exactly once unless
 *          the caller passed it several times; `meta` holds `url`, `domain`,
 *          `fetched_at` and `content_type`.
 * @throws Error when the URL is invalid, the request times out or fails, the
 *         status is not OK, or no readable content is left.
 */
export async function ingestUrl(
  url: string,
  opts: { kind?: string; tags?: string[]; source?: string; maxBodyLength?: number; timeoutMs?: number } = {},
): Promise<{ kind: string; title: string; body: string; tags: string[]; meta: Record<string, unknown>; source: string }> {
  const { kind = "reference", tags = [], source, maxBodyLength = 50000, timeoutMs = 15000 } = opts;

  let domain: string;
  try {
    domain = new URL(url).hostname;
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }

  // Runs the request and reads the body under one shared timeout.
  const download = async (): Promise<{ contentType: string; raw: string }> => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    const isTimeout = (err: unknown): boolean =>
      controller.signal.aborted || (err instanceof Error && err.name === "AbortError");
    const timeoutError = (): Error => new Error(`Request timed out after ${timeoutMs}ms`);

    try {
      let response: Response;
      try {
        response = await fetch(url, {
          signal: controller.signal,
          headers: { "User-Agent": "ContextVault/1.0 (+https://github.com/fellanH/context-vault)", Accept: "text/html,application/xhtml+xml,text/plain,*/*" },
        });
      } catch (err) {
        if (isTimeout(err)) throw timeoutError();
        throw new Error(`Fetch failed: ${err instanceof Error ? err.message : String(err)}`);
      }
      if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);

      // Media types are case-insensitive; normalise before matching.
      const contentType = (response.headers.get("content-type") || "").toLowerCase();
      try {
        return { contentType, raw: await response.text() };
      } catch (err) {
        if (isTimeout(err)) throw timeoutError();
        throw err;
      }
    } finally {
      clearTimeout(timer);
    }
  };

  const { contentType, raw } = await download();

  let title = domain;
  let body = raw;
  if (contentType.includes("text/html") || contentType.includes("application/xhtml")) {
    const extracted = extractHtmlContent(raw, url);
    title = extracted.title;
    body = extracted.body;
  }

  if (body.length > maxBodyLength) body = body.slice(0, maxBodyLength) + "\n\n[Content truncated]";
  if (!body.trim()) throw new Error("No readable content extracted from URL");

  return {
    kind,
    title: title || domain,
    body,
    // Do not add a second "web-import" when the caller already supplied one.
    tags: tags.includes("web-import") ? [...tags] : [...tags, "web-import"],
    meta: {
      url,
      domain,
      fetched_at: new Date().toISOString(),
      content_type: contentType.split(";")[0].trim() || "text/html",
    },
    source: source || domain,
  };
}
package/src/main.ts ADDED
@@ -0,0 +1,111 @@
1
+ // Types
2
+ export type {
3
+ VaultConfig,
4
+ RecallConfig,
5
+ ConsolidationConfig,
6
+ GrowthThresholds,
7
+ PreparedStatements,
8
+ VaultEntry,
9
+ SearchResult,
10
+ CaptureInput,
11
+ CaptureResult,
12
+ IndexEntryInput,
13
+ ReindexStats,
14
+ BaseCtx,
15
+ SearchOptions,
16
+ } from "./types.js";
17
+
18
+ // Constants
19
+ export {
20
+ APP_URL,
21
+ API_URL,
22
+ MARKETING_URL,
23
+ GITHUB_ISSUES_URL,
24
+ MAX_BODY_LENGTH,
25
+ MAX_TITLE_LENGTH,
26
+ MAX_KIND_LENGTH,
27
+ MAX_TAG_LENGTH,
28
+ MAX_TAGS_COUNT,
29
+ MAX_META_LENGTH,
30
+ MAX_SOURCE_LENGTH,
31
+ MAX_IDENTITY_KEY_LENGTH,
32
+ DEFAULT_GROWTH_THRESHOLDS,
33
+ DEFAULT_LIFECYCLE,
34
+ } from "./constants.js";
35
+
36
+ // Categories
37
+ export {
38
+ categoryFor,
39
+ categoryDirFor,
40
+ defaultTierFor,
41
+ CATEGORY_DIRS,
42
+ KIND_STALENESS_DAYS,
43
+ } from "./categories.js";
44
+
45
+ // Config
46
+ export { parseArgs, resolveConfig } from "./config.js";
47
+
48
+ // Files
49
+ export {
50
+ ulid,
51
+ slugify,
52
+ kindToDir,
53
+ dirToKind,
54
+ normalizeKind,
55
+ kindToPath,
56
+ safeJoin,
57
+ walkDir,
58
+ } from "./files.js";
59
+
60
+ // Frontmatter
61
+ export {
62
+ formatFrontmatter,
63
+ parseFrontmatter,
64
+ extractCustomMeta,
65
+ parseEntryFromMarkdown,
66
+ } from "./frontmatter.js";
67
+
68
+ // Formatters
69
+ export { formatBody } from "./formatters.js";
70
+
71
+ // Database
72
+ export {
73
+ SCHEMA_DDL,
74
+ NativeModuleError,
75
+ initDatabase,
76
+ prepareStatements,
77
+ insertVec,
78
+ deleteVec,
79
+ testConnection,
80
+ } from "./db.js";
81
+
82
+ // Embeddings
83
+ export { embed, embedBatch, resetEmbedPipeline, isEmbedAvailable } from "./embed.js";
84
+
85
+ // Index (reindex + indexEntry)
86
+ export { indexEntry, reindex, pruneExpired } from "./index.js";
87
+
88
+ // Search (retrieve)
89
+ export {
90
+ hybridSearch,
91
+ buildFtsQuery,
92
+ buildFilterClauses,
93
+ recencyBoost,
94
+ recencyDecayScore,
95
+ dotProduct,
96
+ reciprocalRankFusion,
97
+ } from "./search.js";
98
+
99
+ // Capture
100
+ export {
101
+ writeEntry,
102
+ updateEntryFile,
103
+ captureAndIndex,
104
+ } from "./capture.js";
105
+
106
+ // Ingest URL
107
+ export {
108
+ htmlToMarkdown,
109
+ extractHtmlContent,
110
+ ingestUrl,
111
+ } from "./ingest-url.js";