context-vault 2.4.2 → 2.6.0
- package/README.md +3 -3
- package/app-dist/assets/index-DjXoWapE.css +1 -0
- package/app-dist/assets/index-R4n9Qz4U.js +380 -0
- package/app-dist/index.html +16 -0
- package/bin/cli.js +534 -36
- package/node_modules/@context-vault/core/package.json +8 -4
- package/node_modules/@context-vault/core/src/capture/file-ops.js +1 -1
- package/node_modules/@context-vault/core/src/capture/import-pipeline.js +85 -0
- package/node_modules/@context-vault/core/src/capture/importers.js +360 -0
- package/node_modules/@context-vault/core/src/capture/ingest-url.js +216 -0
- package/node_modules/@context-vault/core/src/core/config.js +13 -3
- package/node_modules/@context-vault/core/src/index/db.js +18 -4
- package/node_modules/@context-vault/core/src/retrieve/index.js +12 -7
- package/node_modules/@context-vault/core/src/server/tools.js +149 -15
- package/node_modules/@context-vault/core/src/sync/sync.js +230 -0
- package/package.json +6 -5
- package/scripts/local-server.js +204 -3
- package/scripts/prepack.js +13 -1
```diff
--- a/package/node_modules/@context-vault/core/package.json
+++ b/package/node_modules/@context-vault/core/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@context-vault/core",
-  "version": "2.
+  "version": "2.6.0",
   "type": "module",
   "description": "Shared core: capture, index, retrieve, tools, and utilities for context-vault",
   "main": "src/index.js",
@@ -19,7 +19,11 @@
     "./core/config": "./src/core/config.js",
     "./core/files": "./src/core/files.js",
     "./core/frontmatter": "./src/core/frontmatter.js",
-    "./core/status": "./src/core/status.js"
+    "./core/status": "./src/core/status.js",
+    "./capture/importers": "./src/capture/importers.js",
+    "./capture/import-pipeline": "./src/capture/import-pipeline.js",
+    "./capture/ingest-url": "./src/capture/ingest-url.js",
+    "./sync": "./src/sync/sync.js"
   },
   "files": [
     "src/"
@@ -31,8 +35,8 @@
   "author": "Felix Hellstrom",
   "repository": {
     "type": "git",
-    "url": "git+https://github.com/fellanH/context-
+    "url": "git+https://github.com/fellanH/context-vault.git",
     "directory": "packages/core"
   },
-  "homepage": "https://github.com/fellanH/context-
+  "homepage": "https://github.com/fellanH/context-vault"
 }
```
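The four new `exports` subpaths expose the import machinery added in this release. A minimal sketch of what a consumer could now write, assuming a resolver that honors the package `exports` map (Node's ESM loader or any modern bundler):

```js
// Each specifier maps through the exports table added above.
import { parseFile, parseDirectory } from "@context-vault/core/capture/importers";
import { importEntries } from "@context-vault/core/capture/import-pipeline";
import { ingestUrl } from "@context-vault/core/capture/ingest-url";
// sync.js is new in this diff but its exports are not shown here,
// so a namespace import is the safe illustration.
import * as sync from "@context-vault/core/sync";
```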
```diff
--- a/package/node_modules/@context-vault/core/src/capture/file-ops.js
+++ b/package/node_modules/@context-vault/core/src/capture/file-ops.js
@@ -10,7 +10,7 @@ import { formatFrontmatter } from "../core/frontmatter.js";
 import { slugify, kindToPath } from "../core/files.js";
 import { formatBody } from "./formatters.js";
 
-function safeFolderPath(vaultDir, kind, folder) {
+export function safeFolderPath(vaultDir, kind, folder) {
   const base = resolve(vaultDir, kindToPath(kind));
   if (!folder) return base;
   const resolved = resolve(base, folder);
```
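Only the `export` keyword changes here; the body is untouched. The visible lines resolve the requested folder against the kind's base directory, but the actual containment check falls outside the hunk. A hypothetical standalone sketch of the traversal-guard pattern these lines suggest (the real logic after `const resolved = ...` is not shown in this diff):

```js
import { resolve, sep } from "node:path";

// Hypothetical sketch only; mirrors the visible prologue of safeFolderPath.
// The real check past this point is outside the hunk above.
function safeFolderPathSketch(base, folder) {
  if (!folder) return base;
  const resolved = resolve(base, folder);
  // Reject inputs like "../../etc" that resolve outside the base directory.
  if (resolved !== base && !resolved.startsWith(base + sep)) {
    throw new Error(`Folder escapes base directory: ${folder}`);
  }
  return resolved;
}
```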
```diff
--- /dev/null
+++ b/package/node_modules/@context-vault/core/src/capture/import-pipeline.js
@@ -0,0 +1,85 @@
+/**
+ * import-pipeline.js — Batch import orchestrator
+ *
+ * Processes an array of EntryData through captureAndIndex(),
+ * reporting progress and collecting results.
+ */
+
+import { captureAndIndex } from "./index.js";
+import { indexEntry } from "../index/index.js";
+
+/**
+ * @typedef {object} EntryData
+ * @property {string} kind
+ * @property {string} [title]
+ * @property {string} body
+ * @property {string[]} [tags]
+ * @property {object} [meta]
+ * @property {string} [source]
+ * @property {string} [identity_key]
+ * @property {string} [expires_at]
+ */
+
+/**
+ * @typedef {object} ImportResult
+ * @property {number} imported
+ * @property {number} failed
+ * @property {Array<{ index: number, title?: string, error: string }>} errors
+ */
+
+/**
+ * Import an array of entries into the vault.
+ *
+ * @param {object} ctx — Vault context (db, config, stmts, embed, insertVec, deleteVec)
+ * @param {EntryData[]} entries
+ * @param {{ onProgress?: (current: number, total: number) => void, source?: string }} [opts]
+ * @returns {Promise<ImportResult>}
+ */
+export async function importEntries(ctx, entries, opts = {}) {
+  const { onProgress, source } = opts;
+  let imported = 0;
+  let failed = 0;
+  const errors = [];
+
+  for (let i = 0; i < entries.length; i++) {
+    const entry = entries[i];
+
+    if (onProgress) {
+      onProgress(i + 1, entries.length);
+    }
+
+    try {
+      if (!entry.body?.trim()) {
+        failed++;
+        errors.push({ index: i, title: entry.title, error: "Empty body" });
+        continue;
+      }
+
+      await captureAndIndex(
+        ctx,
+        {
+          kind: entry.kind || "insight",
+          title: entry.title || null,
+          body: entry.body,
+          meta: entry.meta,
+          tags: entry.tags,
+          source: entry.source || source || "import",
+          identity_key: entry.identity_key,
+          expires_at: entry.expires_at,
+          userId: ctx.userId || null,
+        },
+        indexEntry
+      );
+      imported++;
+    } catch (err) {
+      failed++;
+      errors.push({
+        index: i,
+        title: entry.title || null,
+        error: err.message,
+      });
+    }
+  }
+
+  return { imported, failed, errors };
+}
```
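A sketch of driving this pipeline, assuming a `ctx` already constructed elsewhere with the shape the JSDoc above describes (db, config, stmts, embed, insertVec, deleteVec):

```js
import { importEntries } from "@context-vault/core/capture/import-pipeline";

// `ctx` construction is vault-specific and assumed here.
const result = await importEntries(ctx, entries, {
  source: "bulk-import",
  onProgress: (current, total) => process.stderr.write(`\r${current}/${total}`),
});

console.log(`imported: ${result.imported}, failed: ${result.failed}`);
for (const e of result.errors) console.error(`#${e.index} ${e.title ?? ""}: ${e.error}`);
```

Note that per-entry failures are collected into `errors` rather than thrown, so one bad record never aborts the batch.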
```diff
--- /dev/null
+++ b/package/node_modules/@context-vault/core/src/capture/importers.js
@@ -0,0 +1,360 @@
+/**
+ * importers.js — Format detection + parsers for bulk import
+ *
+ * Detects and parses markdown, CSV/TSV, JSON, and plain text files into
+ * the EntryData shape that captureAndIndex() accepts.
+ *
+ * No external dependencies — CSV parsed with split + quote handling,
+ * markdown uses existing parseFrontmatter().
+ */
+
+import { readdirSync, readFileSync, statSync } from "node:fs";
+import { join, extname, basename } from "node:path";
+import { parseFrontmatter, parseEntryFromMarkdown } from "../core/frontmatter.js";
+import { dirToKind } from "../core/files.js";
+
+// ─── Format Detection ────────────────────────────────────────────────────────
+
+/**
+ * Detect the format of a file by extension and content heuristics.
+ * @param {string} filePath
+ * @param {string} [content]
+ * @returns {"markdown"|"csv"|"tsv"|"json"|"text"}
+ */
+export function detectFormat(filePath, content) {
+  const ext = extname(filePath).toLowerCase();
+
+  if (ext === ".md" || ext === ".markdown") return "markdown";
+  if (ext === ".csv") return "csv";
+  if (ext === ".tsv") return "tsv";
+  if (ext === ".json" || ext === ".jsonl") return "json";
+
+  // Content-based heuristics if extension is ambiguous
+  if (content) {
+    const trimmed = content.trimStart();
+    if (trimmed.startsWith("---\n")) return "markdown";
+    if (trimmed.startsWith("[") || trimmed.startsWith("{")) return "json";
+  }
+
+  return "text";
+}
+
+// ─── CSV Parsing Helpers ─────────────────────────────────────────────────────
+
+/**
+ * Parse a CSV line respecting quoted fields.
+ * @param {string} line
+ * @param {string} delimiter
+ * @returns {string[]}
+ */
+function parseCsvLine(line, delimiter) {
+  const fields = [];
+  let current = "";
+  let inQuotes = false;
+
+  for (let i = 0; i < line.length; i++) {
+    const ch = line[i];
+    if (inQuotes) {
+      if (ch === '"') {
+        if (i + 1 < line.length && line[i + 1] === '"') {
+          current += '"';
+          i++;
+        } else {
+          inQuotes = false;
+        }
+      } else {
+        current += ch;
+      }
+    } else if (ch === '"') {
+      inQuotes = true;
+    } else if (ch === delimiter) {
+      fields.push(current.trim());
+      current = "";
+    } else {
+      current += ch;
+    }
+  }
+  fields.push(current.trim());
+  return fields;
+}
```
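Two worked examples of the behavior above (`parseCsvLine` is module-private, so this is illustration rather than usage):

```js
// Quoted fields keep embedded delimiters; doubled quotes become literal quotes.
parseCsvLine('a,"b, c","He said ""hi"""', ",");
// => ["a", "b, c", "He said \"hi\""]

// For an ambiguous extension, the content heuristic decides.
detectFormat("notes.txt", '{"entries": []}');
// => "json"
```

The parsers themselves continue below.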
```diff
+
+// ─── Recognized CSV columns ─────────────────────────────────────────────────
+
+const KNOWN_COLUMNS = new Set([
+  "kind", "title", "body", "tags", "source",
+  "identity_key", "expires_at",
+]);
+
+// ─── Parsers ─────────────────────────────────────────────────────────────────
+
+/**
+ * Parse a markdown file into EntryData.
+ * Reuses parseFrontmatter + parseEntryFromMarkdown from core.
+ *
+ * @param {string} content
+ * @param {{ kind?: string, source?: string }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseMarkdown(content, opts = {}) {
+  const { meta: fmMeta, body: rawBody } = parseFrontmatter(content);
+
+  // Derive kind from frontmatter or option
+  const kind = fmMeta.kind || opts.kind || "insight";
+  const parsed = parseEntryFromMarkdown(kind, rawBody, fmMeta);
+
+  return [{
+    kind,
+    title: parsed.title || fmMeta.title || null,
+    body: parsed.body || rawBody,
+    tags: Array.isArray(fmMeta.tags) ? fmMeta.tags : undefined,
+    meta: parsed.meta || undefined,
+    source: fmMeta.source || opts.source || "import",
+    identity_key: fmMeta.identity_key || undefined,
+    expires_at: fmMeta.expires_at || undefined,
+  }];
+}
+
+/**
+ * Parse a CSV or TSV file into EntryData[].
+ * Header row required. Recognized columns map directly; unknown → meta.
+ * Tags column is comma-separated within field.
+ *
+ * @param {string} content
+ * @param {string} delimiter - "," for CSV, "\t" for TSV
+ * @param {{ kind?: string, source?: string }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseCsv(content, delimiter, opts = {}) {
+  const lines = content.split(/\r?\n/).filter((l) => l.trim());
+  if (lines.length < 2) return [];
+
+  const headers = parseCsvLine(lines[0], delimiter).map((h) => h.toLowerCase().trim());
+  const entries = [];
+
+  for (let i = 1; i < lines.length; i++) {
+    const values = parseCsvLine(lines[i], delimiter);
+    if (values.every((v) => !v)) continue; // skip empty rows
+
+    const entry = {
+      kind: opts.kind || "insight",
+      body: "",
+      source: opts.source || "csv-import",
+    };
+    const meta = {};
+
+    for (let j = 0; j < headers.length; j++) {
+      const col = headers[j];
+      const val = values[j] || "";
+
+      if (col === "kind" && val) {
+        entry.kind = val;
+      } else if (col === "title" && val) {
+        entry.title = val;
+      } else if (col === "body" && val) {
+        entry.body = val;
+      } else if (col === "tags" && val) {
+        entry.tags = val.split(",").map((t) => t.trim()).filter(Boolean);
+      } else if (col === "source" && val) {
+        entry.source = val;
+      } else if (col === "identity_key" && val) {
+        entry.identity_key = val;
+      } else if (col === "expires_at" && val) {
+        entry.expires_at = val;
+      } else if (val && !KNOWN_COLUMNS.has(col)) {
+        meta[col] = val;
+      }
+    }
+
+    if (!entry.body) continue; // skip rows with no body
+    if (Object.keys(meta).length) entry.meta = meta;
+    entries.push(entry);
+  }
+
+  return entries;
+}
+
+/**
+ * Parse a JSON file into EntryData[].
+ * Supports: array-of-entries, {entries:[...]}, or ChatGPT export format.
+ *
+ * @param {string} content
+ * @param {{ kind?: string, source?: string }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseJson(content, opts = {}) {
+  let data;
+  try {
+    data = JSON.parse(content);
+  } catch {
+    return [];
+  }
+
+  // Detect format
+  let rawEntries;
+
+  if (Array.isArray(data)) {
+    // Array-of-entries OR ChatGPT export format
+    if (data.length > 0 && data[0].mapping && data[0].create_time !== undefined) {
+      return parseChatGptExport(data, opts);
+    }
+    rawEntries = data;
+  } else if (data && Array.isArray(data.entries)) {
+    rawEntries = data.entries;
+  } else {
+    // Single entry object
+    rawEntries = [data];
+  }
+
+  return rawEntries
+    .filter((e) => e && typeof e === "object" && e.body)
+    .map((e) => ({
+      kind: e.kind || opts.kind || "insight",
+      title: e.title || null,
+      body: e.body,
+      tags: Array.isArray(e.tags) ? e.tags : undefined,
+      meta: e.meta && typeof e.meta === "object" ? e.meta : undefined,
+      source: e.source || opts.source || "json-import",
+      identity_key: e.identity_key || undefined,
+      expires_at: e.expires_at || undefined,
+    }));
+}
+
+/**
+ * Parse ChatGPT export format (array of conversations with mapping + create_time).
+ */
+function parseChatGptExport(conversations, opts = {}) {
+  const entries = [];
+
+  for (const conv of conversations) {
+    if (!conv.title || !conv.mapping) continue;
+
+    // Extract all assistant messages from the mapping
+    const messages = Object.values(conv.mapping)
+      .filter((m) => m.message?.author?.role === "assistant" && m.message.content?.parts?.length)
+      .map((m) => m.message.content.parts.join("\n"))
+      .filter(Boolean);
+
+    if (!messages.length) continue;
+
+    const body = messages.join("\n\n---\n\n");
+    const created = conv.create_time
+      ? new Date(conv.create_time * 1000).toISOString()
+      : undefined;
+
+    entries.push({
+      kind: opts.kind || "conversation",
+      title: conv.title,
+      body,
+      tags: ["chatgpt-import"],
+      meta: { conversation_id: conv.id, created_at_original: created },
+      source: opts.source || "chatgpt-export",
+    });
+  }
+
+  return entries;
+}
+
+/**
+ * Parse a plain text file into a single EntryData.
+ *
+ * @param {string} content
+ * @param {string} filePath
+ * @param {{ kind?: string, source?: string }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseText(content, filePath, opts = {}) {
+  const trimmed = content.trim();
+  if (!trimmed) return [];
+
+  const name = basename(filePath, extname(filePath));
+  const title = name.replace(/[-_]/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
+
+  return [{
+    kind: opts.kind || "insight",
+    title,
+    body: trimmed,
+    source: opts.source || "text-import",
+  }];
+}
+
+/**
+ * Parse a single file (auto-detect format).
+ *
+ * @param {string} filePath
+ * @param {string} content
+ * @param {{ kind?: string, source?: string }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseFile(filePath, content, opts = {}) {
+  const format = detectFormat(filePath, content);
+
+  switch (format) {
+    case "markdown":
+      return parseMarkdown(content, opts);
+    case "csv":
+      return parseCsv(content, ",", opts);
+    case "tsv":
+      return parseCsv(content, "\t", opts);
+    case "json":
+      return parseJson(content, opts);
+    case "text":
+      return parseText(content, filePath, opts);
+    default:
+      return [];
+  }
+}
+
+/**
+ * Recursively parse a directory of files.
+ * Walks subdirectories, filters by extension, infers kind from directory name.
+ *
+ * @param {string} dirPath
+ * @param {{ kind?: string, source?: string, extensions?: string[] }} [opts]
+ * @returns {import("./import-pipeline.js").EntryData[]}
+ */
+export function parseDirectory(dirPath, opts = {}) {
+  const extensions = opts.extensions || [".md", ".markdown", ".csv", ".tsv", ".json", ".txt"];
+  const entries = [];
+
+  function walk(dir, inferredKind) {
+    let items;
+    try {
+      items = readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return;
+    }
+
+    for (const item of items) {
+      if (item.name.startsWith(".") || item.name.startsWith("_")) continue;
+
+      const fullPath = join(dir, item.name);
+
+      if (item.isDirectory()) {
+        // Try to infer kind from directory name
+        const kind = dirToKind(item.name) !== item.name
+          ? dirToKind(item.name)
+          : inferredKind;
+        walk(fullPath, kind);
+      } else if (item.isFile()) {
+        const ext = extname(item.name).toLowerCase();
+        if (!extensions.includes(ext)) continue;
+
+        try {
+          const content = readFileSync(fullPath, "utf-8");
+          const fileOpts = { ...opts };
+          if (inferredKind && !fileOpts.kind) fileOpts.kind = inferredKind;
+          const parsed = parseFile(fullPath, content, fileOpts);
+          entries.push(...parsed);
+        } catch {
+          // Skip unreadable files
+        }
+      }
+    }
+  }
+
+  // Infer kind from the top-level directory name
+  const topKind = opts.kind || undefined;
+  walk(dirPath, topKind);
+
+  return entries;
+}
```
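Taken together with the pipeline module, a directory migration would look roughly like this (the path is illustrative, and a prepared `ctx` is again assumed):

```js
import { parseDirectory } from "@context-vault/core/capture/importers";
import { importEntries } from "@context-vault/core/capture/import-pipeline";

// Unknown CSV columns land in entry.meta: a header row like
// "title,body,tags,project" yields meta: { project: ... } per row.
const entries = parseDirectory("./old-notes", { extensions: [".md", ".csv"] });
const { imported, failed } = await importEntries(ctx, entries, { source: "migration" });
```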
```diff
--- /dev/null
+++ b/package/node_modules/@context-vault/core/src/capture/ingest-url.js
@@ -0,0 +1,216 @@
+/**
+ * ingest-url.js — URL fetch + HTML → markdown conversion
+ *
+ * Fetches a URL, extracts readable content, converts to markdown,
+ * and returns an EntryData object ready for captureAndIndex().
+ *
+ * Uses Node built-in fetch() (Node 18+). No external dependencies.
+ */
+
+// ─── HTML → Markdown ─────────────────────────────────────────────────────────
+
+/**
+ * Convert HTML to simplified markdown.
+ * Strips scripts/styles, converts headings/links/lists/code.
+ *
+ * @param {string} html
+ * @returns {string}
+ */
+export function htmlToMarkdown(html) {
+  let md = html;
+
+  // Remove scripts, styles, nav, header, footer, aside
+  md = md.replace(/<script[\s\S]*?<\/script>/gi, "");
+  md = md.replace(/<style[\s\S]*?<\/style>/gi, "");
+  md = md.replace(/<nav[\s\S]*?<\/nav>/gi, "");
+  md = md.replace(/<header[\s\S]*?<\/header>/gi, "");
+  md = md.replace(/<footer[\s\S]*?<\/footer>/gi, "");
+  md = md.replace(/<aside[\s\S]*?<\/aside>/gi, "");
+
+  // Convert headings
+  md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, (_, c) => `\n# ${stripTags(c).trim()}\n`);
+  md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, (_, c) => `\n## ${stripTags(c).trim()}\n`);
+  md = md.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, (_, c) => `\n### ${stripTags(c).trim()}\n`);
+  md = md.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, (_, c) => `\n#### ${stripTags(c).trim()}\n`);
+  md = md.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, (_, c) => `\n##### ${stripTags(c).trim()}\n`);
+  md = md.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, (_, c) => `\n###### ${stripTags(c).trim()}\n`);
+
+  // Convert links
+  md = md.replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_, href, text) => {
+    const cleanText = stripTags(text).trim();
+    return cleanText ? `[${cleanText}](${href})` : "";
+  });
+
+  // Convert code blocks
+  md = md.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (_, c) => `\n\`\`\`\n${decodeEntities(c).trim()}\n\`\`\`\n`);
+  md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_, c) => `\n\`\`\`\n${decodeEntities(stripTags(c)).trim()}\n\`\`\`\n`);
+
+  // Convert inline code
+  md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (_, c) => `\`${decodeEntities(c).trim()}\``);
+
+  // Convert strong/em
+  md = md.replace(/<(strong|b)[^>]*>([\s\S]*?)<\/\1>/gi, (_, __, c) => `**${stripTags(c).trim()}**`);
+  md = md.replace(/<(em|i)[^>]*>([\s\S]*?)<\/\1>/gi, (_, __, c) => `*${stripTags(c).trim()}*`);
+
+  // Convert list items
+  md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, c) => `- ${stripTags(c).trim()}\n`);
+
+  // Convert paragraphs and line breaks
+  md = md.replace(/<br\s*\/?>/gi, "\n");
+  md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, (_, c) => `\n${stripTags(c).trim()}\n`);
+  md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_, c) => {
+    return "\n" + stripTags(c).trim().split("\n").map((l) => `> ${l}`).join("\n") + "\n";
+  });
+
+  // Remove remaining HTML tags
+  md = stripTags(md);
+
+  // Decode HTML entities
+  md = decodeEntities(md);
+
+  // Clean up whitespace
+  md = md.replace(/\n{3,}/g, "\n\n").trim();
+
+  return md;
+}
+
+function stripTags(html) {
+  return html.replace(/<[^>]+>/g, "");
+}
+
+function decodeEntities(text) {
+  return text
+    .replace(/&amp;/g, "&")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&nbsp;/g, " ")
+    .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n, 10)))
+    .replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCharCode(parseInt(n, 16)));
+}
```
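A quick trace of the conversion order above (headings first, then inline markup, then paragraph handling, then entity decoding):

```js
import { htmlToMarkdown } from "@context-vault/core/capture/ingest-url";

htmlToMarkdown("<h2>Title</h2><p>Hello <strong>world</strong> &amp; more</p>");
// => "## Title\n\nHello **world** & more"
```

The remainder of ingest-url.js, content extraction and the fetch wrapper, continues below.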
```diff
+
+// ─── HTML Content Extraction ─────────────────────────────────────────────────
+
+/**
+ * Extract the main readable content from an HTML page.
+ * Prefers <article> or <main>, falls back to <body>.
+ *
+ * @param {string} html
+ * @param {string} url
+ * @returns {{ title: string, body: string }}
+ */
+export function extractHtmlContent(html, url) {
+  // Extract <title>
+  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
+  const title = titleMatch ? stripTags(decodeEntities(titleMatch[1])).trim() : "";
+
+  // Try to extract main content area
+  let contentHtml = "";
+
+  const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
+  const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
+
+  if (articleMatch) {
+    contentHtml = articleMatch[1];
+  } else if (mainMatch) {
+    contentHtml = mainMatch[1];
+  } else {
+    // Fall back to <body>
+    const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
+    contentHtml = bodyMatch ? bodyMatch[1] : html;
+  }
+
+  const body = htmlToMarkdown(contentHtml);
+
+  return { title, body };
+}
+
+// ─── URL Ingestion ───────────────────────────────────────────────────────────
+
+/**
+ * Fetch a URL, extract readable content, and return an EntryData object.
+ *
+ * @param {string} url
+ * @param {{ kind?: string, tags?: string[], source?: string, maxBodyLength?: number, timeoutMs?: number }} [opts]
+ * @returns {Promise<{ kind: string, title: string, body: string, tags: string[], meta: object, source: string }>}
+ */
+export async function ingestUrl(url, opts = {}) {
+  const {
+    kind = "reference",
+    tags = [],
+    source,
+    maxBodyLength = 50000,
+    timeoutMs = 15000,
+  } = opts;
+
+  let domain;
+  try {
+    domain = new URL(url).hostname;
+  } catch {
+    throw new Error(`Invalid URL: ${url}`);
+  }
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  let response;
+  try {
+    response = await fetch(url, {
+      signal: controller.signal,
+      headers: {
+        "User-Agent": "ContextVault/1.0 (+https://github.com/fellanH/context-vault)",
+        Accept: "text/html,application/xhtml+xml,text/plain,*/*",
+      },
+    });
+  } catch (err) {
+    if (err.name === "AbortError") {
+      throw new Error(`Request timed out after ${timeoutMs}ms`);
+    }
+    throw new Error(`Fetch failed: ${err.message}`);
+  } finally {
+    clearTimeout(timeout);
+  }
+
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+  }
+
+  const contentType = response.headers.get("content-type") || "";
+  const html = await response.text();
+
+  let title, body;
+
+  if (contentType.includes("text/html") || contentType.includes("application/xhtml")) {
+    const extracted = extractHtmlContent(html, url);
+    title = extracted.title;
+    body = extracted.body;
+  } else {
+    // Plain text or other — use as-is
+    title = domain;
+    body = html;
+  }
+
+  // Truncate if too long
+  if (body.length > maxBodyLength) {
+    body = body.slice(0, maxBodyLength) + "\n\n[Content truncated]";
+  }
+
+  if (!body.trim()) {
+    throw new Error("No readable content extracted from URL");
+  }
+
+  return {
+    kind,
+    title: title || domain,
+    body,
+    tags: [...tags, "web-import"],
+    meta: {
+      url,
+      domain,
+      fetched_at: new Date().toISOString(),
+      content_type: contentType.split(";")[0].trim() || "text/html",
+    },
+    source: source || domain,
+  };
+}
```
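End to end, URL capture composes with the pipeline the same way file imports do. A sketch, with the URL illustrative and a prepared `ctx` assumed as before:

```js
import { ingestUrl } from "@context-vault/core/capture/ingest-url";
import { importEntries } from "@context-vault/core/capture/import-pipeline";

const entry = await ingestUrl("https://example.com/article", {
  kind: "reference",
  tags: ["reading-list"],
  timeoutMs: 10_000,
});
// entry.meta records url, domain, fetched_at, and content_type,
// and "web-import" is appended to the tags automatically.
await importEntries(ctx, [entry], { source: entry.source });
```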