context-vault 2.4.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@context-vault/core",
3
- "version": "2.4.2",
3
+ "version": "2.6.0",
4
4
  "type": "module",
5
5
  "description": "Shared core: capture, index, retrieve, tools, and utilities for context-vault",
6
6
  "main": "src/index.js",
@@ -19,7 +19,11 @@
19
19
  "./core/config": "./src/core/config.js",
20
20
  "./core/files": "./src/core/files.js",
21
21
  "./core/frontmatter": "./src/core/frontmatter.js",
22
- "./core/status": "./src/core/status.js"
22
+ "./core/status": "./src/core/status.js",
23
+ "./capture/importers": "./src/capture/importers.js",
24
+ "./capture/import-pipeline": "./src/capture/import-pipeline.js",
25
+ "./capture/ingest-url": "./src/capture/ingest-url.js",
26
+ "./sync": "./src/sync/sync.js"
23
27
  },
24
28
  "files": [
25
29
  "src/"
@@ -31,8 +35,8 @@
31
35
  "author": "Felix Hellstrom",
32
36
  "repository": {
33
37
  "type": "git",
34
- "url": "git+https://github.com/fellanH/context-mcp.git",
38
+ "url": "git+https://github.com/fellanH/context-vault.git",
35
39
  "directory": "packages/core"
36
40
  },
37
- "homepage": "https://github.com/fellanH/context-mcp"
41
+ "homepage": "https://github.com/fellanH/context-vault"
38
42
  }
@@ -10,7 +10,7 @@ import { formatFrontmatter } from "../core/frontmatter.js";
10
10
  import { slugify, kindToPath } from "../core/files.js";
11
11
  import { formatBody } from "./formatters.js";
12
12
 
13
- function safeFolderPath(vaultDir, kind, folder) {
13
+ export function safeFolderPath(vaultDir, kind, folder) {
14
14
  const base = resolve(vaultDir, kindToPath(kind));
15
15
  if (!folder) return base;
16
16
  const resolved = resolve(base, folder);
@@ -0,0 +1,85 @@
1
+ /**
2
+ * import-pipeline.js — Batch import orchestrator
3
+ *
4
+ * Processes an array of EntryData through captureAndIndex(),
5
+ * reporting progress and collecting results.
6
+ */
7
+
8
+ import { captureAndIndex } from "./index.js";
9
+ import { indexEntry } from "../index/index.js";
10
+
11
+ /**
12
+ * @typedef {object} EntryData
13
+ * @property {string} kind
14
+ * @property {string} [title]
15
+ * @property {string} body
16
+ * @property {string[]} [tags]
17
+ * @property {object} [meta]
18
+ * @property {string} [source]
19
+ * @property {string} [identity_key]
20
+ * @property {string} [expires_at]
21
+ */
22
+
23
+ /**
24
+ * @typedef {object} ImportResult
25
+ * @property {number} imported
26
+ * @property {number} failed
27
+ * @property {Array<{ index: number, title?: string, error: string }>} errors
28
+ */
29
+
30
/**
 * Import an array of entries into the vault.
 *
 * Each entry is pushed through captureAndIndex(); failures are recorded and
 * the batch continues rather than aborting. Entries with a missing or
 * whitespace-only body are counted as failures with error "Empty body".
 *
 * @param {object} ctx — Vault context (db, config, stmts, embed, insertVec, deleteVec)
 * @param {EntryData[]} entries
 * @param {{ onProgress?: (current: number, total: number) => void, source?: string }} [opts]
 * @returns {Promise<ImportResult>}
 */
export async function importEntries(ctx, entries, opts = {}) {
  const { onProgress, source } = opts;
  const total = entries.length;
  const result = { imported: 0, failed: 0, errors: [] };

  let position = 0;
  for (const entry of entries) {
    const index = position;
    position += 1;

    onProgress?.(index + 1, total);

    try {
      // Reject entries whose body is missing or whitespace-only.
      if (!entry.body?.trim()) {
        result.failed += 1;
        result.errors.push({ index, title: entry.title, error: "Empty body" });
        continue;
      }

      await captureAndIndex(
        ctx,
        {
          kind: entry.kind || "insight",
          title: entry.title || null,
          body: entry.body,
          meta: entry.meta,
          tags: entry.tags,
          source: entry.source || source || "import",
          identity_key: entry.identity_key,
          expires_at: entry.expires_at,
          userId: ctx.userId || null,
        },
        indexEntry
      );
      result.imported += 1;
    } catch (err) {
      result.failed += 1;
      result.errors.push({
        index,
        title: entry.title || null,
        error: err.message,
      });
    }
  }

  return result;
}
@@ -0,0 +1,360 @@
1
+ /**
2
+ * importers.js — Format detection + parsers for bulk import
3
+ *
4
+ * Detects and parses markdown, CSV/TSV, JSON, and plain text files into
5
+ * the EntryData shape that captureAndIndex() accepts.
6
+ *
7
+ * No external dependencies — CSV parsed with split + quote handling,
8
+ * markdown uses existing parseFrontmatter().
9
+ */
10
+
11
+ import { readdirSync, readFileSync, statSync } from "node:fs";
12
+ import { join, extname, basename } from "node:path";
13
+ import { parseFrontmatter, parseEntryFromMarkdown } from "../core/frontmatter.js";
14
+ import { dirToKind } from "../core/files.js";
15
+
16
+ // ─── Format Detection ────────────────────────────────────────────────────────
17
+
18
/**
 * Detect the format of a file by extension and content heuristics.
 *
 * Extension takes precedence; when it is ambiguous (e.g. ".txt" or none),
 * the content is sniffed for a YAML frontmatter fence or JSON delimiters.
 *
 * @param {string} filePath
 * @param {string} [content]
 * @returns {"markdown"|"csv"|"tsv"|"json"|"text"}
 */
export function detectFormat(filePath, content) {
  const ext = extname(filePath).toLowerCase();

  if (ext === ".md" || ext === ".markdown") return "markdown";
  if (ext === ".csv") return "csv";
  if (ext === ".tsv") return "tsv";
  if (ext === ".json" || ext === ".jsonl") return "json";

  // Content-based heuristics if extension is ambiguous
  if (content) {
    const trimmed = content.trimStart();
    // Accept both LF and CRLF after the frontmatter fence — the previous
    // startsWith("---\n") check missed files saved with Windows line endings.
    if (/^---\r?\n/.test(trimmed)) return "markdown";
    if (trimmed.startsWith("[") || trimmed.startsWith("{")) return "json";
  }

  return "text";
}
41
+
42
+ // ─── CSV Parsing Helpers ─────────────────────────────────────────────────────
43
+
44
/**
 * Parse a single CSV line respecting quoted fields.
 *
 * Handles double-quote escaping ("" inside a quoted field yields one literal
 * quote). Every field is trimmed of surrounding whitespace.
 *
 * @param {string} line
 * @param {string} delimiter
 * @returns {string[]}
 */
function parseCsvLine(line, delimiter) {
  const out = [];
  let buf = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];

    if (quoted) {
      if (ch !== '"') {
        buf += ch;
      } else if (line[pos + 1] === '"') {
        // Escaped quote ("") inside a quoted field.
        buf += '"';
        pos += 1;
      } else {
        quoted = false;
      }
    } else if (ch === '"') {
      quoted = true;
    } else if (ch === delimiter) {
      out.push(buf.trim());
      buf = "";
    } else {
      buf += ch;
    }

    pos += 1;
  }

  out.push(buf.trim());
  return out;
}
80
+
81
// ─── Recognized CSV columns ─────────────────────────────────────────────────

// Column headers that parseCsv() maps directly onto EntryData fields.
// Any other non-empty column value ends up in the entry's `meta` object.
const KNOWN_COLUMNS = new Set([
  "kind", "title", "body", "tags", "source",
  "identity_key", "expires_at",
]);
87
+
88
+ // ─── Parsers ─────────────────────────────────────────────────────────────────
89
+
90
/**
 * Parse a markdown file into EntryData.
 * Reuses parseFrontmatter + parseEntryFromMarkdown from core.
 *
 * @param {string} content
 * @param {{ kind?: string, source?: string }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseMarkdown(content, opts = {}) {
  const frontmatter = parseFrontmatter(content);
  const fmMeta = frontmatter.meta;
  const rawBody = frontmatter.body;

  // Frontmatter wins over the caller-supplied kind; default to "insight".
  const kind = fmMeta.kind || opts.kind || "insight";
  const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);

  const entry = {
    kind,
    title: parsed.title || fmMeta.title || null,
    body: parsed.body || rawBody,
    tags: Array.isArray(fmMeta.tags) ? fmMeta.tags : undefined,
    meta: parsed.meta || undefined,
    source: fmMeta.source || opts.source || "import",
    identity_key: fmMeta.identity_key || undefined,
    expires_at: fmMeta.expires_at || undefined,
  };

  return [entry];
}
116
+
117
/**
 * Parse a CSV or TSV file into EntryData[].
 * Header row required. Recognized columns map directly; unknown → meta.
 * Tags column is comma-separated within field.
 *
 * @param {string} content
 * @param {string} delimiter - "," for CSV, "\t" for TSV
 * @param {{ kind?: string, source?: string }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseCsv(content, delimiter, opts = {}) {
  const rows = content.split(/\r?\n/).filter((line) => line.trim());
  if (rows.length < 2) return [];

  const headers = parseCsvLine(rows[0], delimiter).map((h) => h.toLowerCase().trim());
  const results = [];

  for (const row of rows.slice(1)) {
    const values = parseCsvLine(row, delimiter);
    if (!values.some(Boolean)) continue; // skip empty rows

    const entry = {
      kind: opts.kind || "insight",
      body: "",
      source: opts.source || "csv-import",
    };
    const meta = {};

    headers.forEach((col, idx) => {
      const val = values[idx] || "";
      if (!val) return; // empty cells never override defaults

      switch (col) {
        case "kind":
          entry.kind = val;
          break;
        case "title":
          entry.title = val;
          break;
        case "body":
          entry.body = val;
          break;
        case "tags":
          entry.tags = val.split(",").map((t) => t.trim()).filter(Boolean);
          break;
        case "source":
          entry.source = val;
          break;
        case "identity_key":
          entry.identity_key = val;
          break;
        case "expires_at":
          entry.expires_at = val;
          break;
        default:
          // Anything outside the recognized schema lands in meta.
          if (!KNOWN_COLUMNS.has(col)) meta[col] = val;
      }
    });

    if (!entry.body) continue; // skip rows with no body
    if (Object.keys(meta).length > 0) entry.meta = meta;
    results.push(entry);
  }

  return results;
}
175
+
176
/**
 * Parse a JSON file into EntryData[].
 * Supports: array-of-entries, {entries:[...]}, ChatGPT export format, and
 * JSON Lines (one JSON object per line — the format detectFormat() routes
 * here for ".jsonl" files, which a single JSON.parse cannot handle).
 *
 * @param {string} content
 * @param {{ kind?: string, source?: string }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseJson(content, opts = {}) {
  let data;
  try {
    data = JSON.parse(content);
  } catch {
    // Not a single JSON document — fall back to JSON Lines: parse each
    // non-empty line independently, skipping malformed ones. If nothing
    // parses, return [] exactly as before.
    data = [];
    for (const line of content.split(/\r?\n/)) {
      if (!line.trim()) continue;
      try {
        data.push(JSON.parse(line));
      } catch {
        // skip malformed line
      }
    }
    if (data.length === 0) return [];
  }

  // Detect format
  let rawEntries;

  if (Array.isArray(data)) {
    // Array-of-entries OR ChatGPT export format.
    // Optional chaining guards against null/sparse first elements, which
    // previously threw a TypeError (e.g. input "[null]").
    if (data.length > 0 && data[0]?.mapping && data[0]?.create_time !== undefined) {
      return parseChatGptExport(data, opts);
    }
    rawEntries = data;
  } else if (data && Array.isArray(data.entries)) {
    rawEntries = data.entries;
  } else {
    // Single entry object
    rawEntries = [data];
  }

  return rawEntries
    .filter((e) => e && typeof e === "object" && e.body)
    .map((e) => ({
      kind: e.kind || opts.kind || "insight",
      title: e.title || null,
      body: e.body,
      tags: Array.isArray(e.tags) ? e.tags : undefined,
      meta: e.meta && typeof e.meta === "object" ? e.meta : undefined,
      source: e.source || opts.source || "json-import",
      identity_key: e.identity_key || undefined,
      expires_at: e.expires_at || undefined,
    }));
}
221
+
222
/**
 * Parse ChatGPT export format (array of conversations with mapping + create_time).
 * Only assistant-authored messages are kept; conversations without a title,
 * a mapping, or any assistant text are skipped.
 */
function parseChatGptExport(conversations, opts = {}) {
  const results = [];

  for (const conv of conversations) {
    if (!conv.title || !conv.mapping) continue;

    // Collect assistant-authored message text, in mapping order.
    const assistantParts = [];
    for (const node of Object.values(conv.mapping)) {
      const msg = node.message;
      if (msg?.author?.role !== "assistant") continue;
      if (!msg.content?.parts?.length) continue;
      const text = msg.content.parts.join("\n");
      if (text) assistantParts.push(text);
    }

    if (assistantParts.length === 0) continue;

    const created = conv.create_time
      ? new Date(conv.create_time * 1000).toISOString()
      : undefined;

    results.push({
      kind: opts.kind || "conversation",
      title: conv.title,
      body: assistantParts.join("\n\n---\n\n"),
      tags: ["chatgpt-import"],
      meta: { conversation_id: conv.id, created_at_original: created },
      source: opts.source || "chatgpt-export",
    });
  }

  return results;
}
256
+
257
/**
 * Parse a plain text file into a single EntryData.
 *
 * @param {string} content
 * @param {string} filePath
 * @param {{ kind?: string, source?: string }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseText(content, filePath, opts = {}) {
  const body = content.trim();
  if (!body) return [];

  // Derive a human-readable title from the file name:
  // "my-notes_file.txt" -> "My Notes File".
  const stem = basename(filePath, extname(filePath));
  const title = stem
    .replace(/[-_]/g, " ")
    .replace(/\b\w/g, (letter) => letter.toUpperCase());

  const entry = {
    kind: opts.kind || "insight",
    title,
    body,
    source: opts.source || "text-import",
  };

  return [entry];
}
279
+
280
/**
 * Parse a single file (auto-detect format).
 *
 * @param {string} filePath
 * @param {string} content
 * @param {{ kind?: string, source?: string }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseFile(filePath, content, opts = {}) {
  // One lazy parser per detected format; unknown formats yield [].
  const parsers = {
    markdown: () => parseMarkdown(content, opts),
    csv: () => parseCsv(content, ",", opts),
    tsv: () => parseCsv(content, "\t", opts),
    json: () => parseJson(content, opts),
    text: () => parseText(content, filePath, opts),
  };

  const parse = parsers[detectFormat(filePath, content)];
  return parse ? parse() : [];
}
306
+
307
/**
 * Recursively parse a directory of files.
 * Walks subdirectories, filters by extension, infers kind from directory name.
 * Hidden and underscore-prefixed names are skipped; unreadable directories
 * and files are silently ignored.
 *
 * @param {string} dirPath
 * @param {{ kind?: string, source?: string, extensions?: string[] }} [opts]
 * @returns {import("./import-pipeline.js").EntryData[]}
 */
export function parseDirectory(dirPath, opts = {}) {
  const allowedExts = opts.extensions || [".md", ".markdown", ".csv", ".tsv", ".json", ".txt"];
  const collected = [];

  const visit = (dir, inheritedKind) => {
    let dirents;
    try {
      dirents = readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // unreadable directory — skip
    }

    for (const dirent of dirents) {
      // Hidden/underscore-prefixed names are never imported.
      if (/^[._]/.test(dirent.name)) continue;

      const entryPath = join(dir, dirent.name);

      if (dirent.isDirectory()) {
        // If dirToKind() maps the name to something different, that becomes
        // the kind for the subtree; otherwise the inherited kind carries on.
        const mapped = dirToKind(dirent.name);
        visit(entryPath, mapped !== dirent.name ? mapped : inheritedKind);
        continue;
      }

      if (!dirent.isFile()) continue;
      if (!allowedExts.includes(extname(dirent.name).toLowerCase())) continue;

      try {
        const text = readFileSync(entryPath, "utf-8");
        const fileOpts = { ...opts };
        if (inheritedKind && !fileOpts.kind) fileOpts.kind = inheritedKind;
        collected.push(...parseFile(entryPath, text, fileOpts));
      } catch {
        // Skip unreadable files
      }
    }
  };

  // Kick off the walk with the caller-supplied kind (if any) as the seed.
  visit(dirPath, opts.kind || undefined);

  return collected;
}
@@ -0,0 +1,216 @@
1
+ /**
2
+ * ingest-url.js — URL fetch + HTML → markdown conversion
3
+ *
4
+ * Fetches a URL, extracts readable content, converts to markdown,
5
+ * and returns an EntryData object ready for captureAndIndex().
6
+ *
7
+ * Uses Node built-in fetch() (Node 18+). No external dependencies.
8
+ */
9
+
10
+ // ─── HTML → Markdown ─────────────────────────────────────────────────────────
11
+
12
/**
 * Convert HTML to simplified markdown.
 * Strips scripts/styles, converts headings/links/lists/code.
 *
 * @param {string} html
 * @returns {string}
 */
export function htmlToMarkdown(html) {
  // Ordered pipeline of [pattern, replacement] pairs. Order matters:
  // structural containers are removed first, block elements are converted
  // next, and only then are the remaining tags stripped.
  const pipeline = [
    // Remove scripts, styles, nav, header, footer, aside
    [/<script[\s\S]*?<\/script>/gi, ""],
    [/<style[\s\S]*?<\/style>/gi, ""],
    [/<nav[\s\S]*?<\/nav>/gi, ""],
    [/<header[\s\S]*?<\/header>/gi, ""],
    [/<footer[\s\S]*?<\/footer>/gi, ""],
    [/<aside[\s\S]*?<\/aside>/gi, ""],

    // Headings h1–h6
    [/<h1[^>]*>([\s\S]*?)<\/h1>/gi, (_, c) => `\n# ${stripTags(c).trim()}\n`],
    [/<h2[^>]*>([\s\S]*?)<\/h2>/gi, (_, c) => `\n## ${stripTags(c).trim()}\n`],
    [/<h3[^>]*>([\s\S]*?)<\/h3>/gi, (_, c) => `\n### ${stripTags(c).trim()}\n`],
    [/<h4[^>]*>([\s\S]*?)<\/h4>/gi, (_, c) => `\n#### ${stripTags(c).trim()}\n`],
    [/<h5[^>]*>([\s\S]*?)<\/h5>/gi, (_, c) => `\n##### ${stripTags(c).trim()}\n`],
    [/<h6[^>]*>([\s\S]*?)<\/h6>/gi, (_, c) => `\n###### ${stripTags(c).trim()}\n`],

    // Links — anchors whose text collapses to nothing are dropped entirely
    [/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_, href, text) => {
      const cleanText = stripTags(text).trim();
      return cleanText ? `[${cleanText}](${href})` : "";
    }],

    // Code blocks (pre+code first, then bare pre), then inline code
    [/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (_, c) => `\n\`\`\`\n${decodeEntities(c).trim()}\n\`\`\`\n`],
    [/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_, c) => `\n\`\`\`\n${decodeEntities(stripTags(c)).trim()}\n\`\`\`\n`],
    [/<code[^>]*>([\s\S]*?)<\/code>/gi, (_, c) => `\`${decodeEntities(c).trim()}\``],

    // Emphasis
    [/<(strong|b)[^>]*>([\s\S]*?)<\/\1>/gi, (_, __, c) => `**${stripTags(c).trim()}**`],
    [/<(em|i)[^>]*>([\s\S]*?)<\/\1>/gi, (_, __, c) => `*${stripTags(c).trim()}*`],

    // List items, line breaks, paragraphs, blockquotes
    [/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, c) => `- ${stripTags(c).trim()}\n`],
    [/<br\s*\/?>/gi, "\n"],
    [/<p[^>]*>([\s\S]*?)<\/p>/gi, (_, c) => `\n${stripTags(c).trim()}\n`],
    [/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_, c) =>
      "\n" + stripTags(c).trim().split("\n").map((l) => `> ${l}`).join("\n") + "\n"],
  ];

  let md = html;
  for (const [pattern, replacement] of pipeline) {
    md = md.replace(pattern, replacement);
  }

  // Strip any remaining tags, decode entities, collapse blank-line runs.
  md = decodeEntities(stripTags(md));
  return md.replace(/\n{3,}/g, "\n\n").trim();
}
76
+
77
// Drop every HTML tag, keeping only the text between them.
function stripTags(html) {
  return html.split(/<[^>]+>/).join("");
}
80
+
81
// Decode the HTML entities this converter handles.
// Done in a single pass so doubly-escaped text such as "&amp;lt;" decodes to
// "&lt;" rather than all the way to "<" (the previous chain of sequential
// .replace() calls had that double-decoding bug). Numeric references use
// String.fromCodePoint so astral characters (e.g. &#128512;, emoji) decode
// correctly — String.fromCharCode mangled anything above U+FFFF. Code points
// beyond U+10FFFF are left as-is instead of throwing.
function decodeEntities(text) {
  const NAMED = {
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#39;": "'",
    "&nbsp;": " ",
  };

  return text.replace(
    /&(?:amp|lt|gt|quot|nbsp|#39);|&#\d+;|&#[xX][0-9a-fA-F]+;/g,
    (entity) => {
      if (entity in NAMED) return NAMED[entity];
      const hex = entity[2] === "x" || entity[2] === "X";
      const cp = hex
        ? parseInt(entity.slice(3, -1), 16)
        : parseInt(entity.slice(2, -1), 10);
      return cp <= 0x10ffff ? String.fromCodePoint(cp) : entity;
    }
  );
}
92
+
93
+ // ─── HTML Content Extraction ─────────────────────────────────────────────────
94
+
95
/**
 * Extract the main readable content from an HTML page.
 * Prefers <article> or <main>, falls back to <body>.
 *
 * @param {string} html
 * @param {string} url — accepted for signature compatibility; not currently used
 * @returns {{ title: string, body: string }}
 */
export function extractHtmlContent(html, url) {
  // Page title from the <title> tag, if present.
  const titleTag = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleTag ? stripTags(decodeEntities(titleTag[1])).trim() : "";

  // Pick the main content region: <article>, then <main>, then <body>,
  // then the whole document as a last resort.
  const pickRegion = () => {
    const article = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
    if (article) return article[1];

    const main = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
    if (main) return main[1];

    const bodyTag = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    return bodyTag ? bodyTag[1] : html;
  };

  return { title, body: htmlToMarkdown(pickRegion()) };
}
128
+
129
+ // ─── URL Ingestion ───────────────────────────────────────────────────────────
130
+
131
/**
 * Fetch a URL, extract readable content, and return an EntryData object.
 *
 * The timeout now bounds the entire transfer: previously the abort timer was
 * cleared in `finally` as soon as response headers arrived, so a slow body
 * download could hang indefinitely. Reading the body is now inside the timed
 * region. All error messages are unchanged.
 *
 * @param {string} url
 * @param {{ kind?: string, tags?: string[], source?: string, maxBodyLength?: number, timeoutMs?: number }} [opts]
 * @returns {Promise<{ kind: string, title: string, body: string, tags: string[], meta: object, source: string }>}
 * @throws {Error} on invalid URL, timeout, network failure, non-2xx status,
 *   or when no readable content could be extracted.
 */
export async function ingestUrl(url, opts = {}) {
  const {
    kind = "reference",
    tags = [],
    source,
    maxBodyLength = 50000,
    timeoutMs = 15000,
  } = opts;

  let domain;
  try {
    domain = new URL(url).hostname;
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  // Translate low-level fetch/abort failures into the errors callers expect.
  const asIngestError = (err) =>
    err.name === "AbortError"
      ? new Error(`Request timed out after ${timeoutMs}ms`)
      : new Error(`Fetch failed: ${err.message}`);

  let response;
  let payload;
  try {
    try {
      response = await fetch(url, {
        signal: controller.signal,
        headers: {
          "User-Agent": "ContextVault/1.0 (+https://github.com/fellanH/context-vault)",
          Accept: "text/html,application/xhtml+xml,text/plain,*/*",
        },
      });
    } catch (err) {
      throw asIngestError(err);
    }

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // Keep the body download inside the timed region so timeoutMs bounds
    // the whole request, not just time-to-headers.
    try {
      payload = await response.text();
    } catch (err) {
      throw asIngestError(err);
    }
  } finally {
    clearTimeout(timeout);
  }

  const contentType = response.headers.get("content-type") || "";

  let title;
  let body;
  if (contentType.includes("text/html") || contentType.includes("application/xhtml")) {
    ({ title, body } = extractHtmlContent(payload, url));
  } else {
    // Plain text or other — use as-is
    title = domain;
    body = payload;
  }

  // Truncate if too long
  if (body.length > maxBodyLength) {
    body = body.slice(0, maxBodyLength) + "\n\n[Content truncated]";
  }

  if (!body.trim()) {
    throw new Error("No readable content extracted from URL");
  }

  return {
    kind,
    title: title || domain,
    body,
    tags: [...tags, "web-import"],
    meta: {
      url,
      domain,
      fetched_at: new Date().toISOString(),
      content_type: contentType.split(";")[0].trim() || "text/html",
    },
    source: source || domain,
  };
}