ex-brain 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -1
- package/src/ai/ax-pipeline.ts +114 -0
- package/src/ai/compiler.ts +118 -113
- package/src/ai/entity-link.ts +96 -78
- package/src/ai/timeline-extractor.ts +110 -99
- package/src/commands/compile-cmd.ts +1 -1
- package/src/commands/entity-links.ts +105 -0
- package/src/commands/import-cmd.ts +464 -0
- package/src/commands/index.ts +30 -2194
- package/src/commands/misc-cmds.ts +190 -0
- package/src/commands/misc-commands.ts +252 -0
- package/src/commands/put-cmd.ts +525 -0
- package/src/commands/query-cmd.ts +486 -0
- package/src/commands/shared.ts +109 -0
- package/src/commands/timeline-cmd.ts +159 -0
- package/src/config/index.ts +53 -0
- package/src/config/init.ts +50 -0
- package/src/config/paths.ts +21 -0
- package/src/config/schema.ts +121 -0
- package/src/config/settings.ts +168 -0
- package/src/db/client.ts +1 -1
- package/src/markdown/document-loader.ts +514 -0
- package/src/mcp/server.ts +148 -0
- package/src/repositories/brain-repo.ts +43 -1
- package/src/settings.ts +27 -282
- /package/src/{config.ts → slug-utils.ts} +0 -0
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
import { readFile, readdir, stat } from "node:fs/promises";
|
|
2
|
+
import { basename, extname, join, resolve } from "node:path";
|
|
3
|
+
|
|
4
|
+
/** Document categories the ingestion code distinguishes between. */
export type DocumentKind =
  "text" | "markdown" | "pdf" | "docx" | "doc" | "html" | "json" | "unknown";
|
|
14
|
+
|
|
15
|
+
/** Result of loading a document: the extracted text plus provenance details. */
export interface LoadedDocument {
  /** Plain text extracted from the document, as a UTF-8 decoded string. */
  text: string;

  /** Name of the source file or download, without any parent path. */
  fileName: string;

  /** Kind the document was handled as. */
  kind: DocumentKind;

  /** Absolute path for local files, or the original URL for downloads. */
  source: string;

  /** Whether the document was downloaded (`"url"`) or read from disk (`"file"`). */
  sourceType: "url" | "file";

  /** MIME type detected from response headers or file extension. */
  mimeType?: string;

  /** Size in bytes of the raw source (downloaded body or file contents). */
  bytes: number;

  /** Parser-specific extras, e.g. PDF page count or mammoth warnings. */
  metadata: Record<string, unknown>;
}
|
|
33
|
+
|
|
34
|
+
/** Optional knobs accepted by `loadDocument`; every field has a default. */
export interface LoadDocumentOptions {
  /** Kind to use instead of the one detected from the file name / content type. */
  forceKind?: DocumentKind;

  /** Abort timeout for URL fetches, in milliseconds. Default: 30 000 (30s). */
  fetchTimeoutMs?: number;

  /**
   * Largest raw source size accepted, in bytes. Applied to remote downloads
   * and to local files alike. Default: 50 MB.
   */
  maxBytes?: number;

  /** `User-Agent` header value sent with URL fetches. */
  userAgent?: string;
}
|
|
44
|
+
|
|
45
|
+
/** Fetch timeout used when the caller gives none: 30 seconds, in milliseconds. */
const DEFAULT_TIMEOUT = 30 * 1000;
/** Size limit used when the caller gives none: 50 MB, in bytes. */
const DEFAULT_MAX_BYTES = 50 * 1024 ** 2;
/** `User-Agent` value used when the caller gives none. */
const DEFAULT_UA = "ebrain-ingest/1 (+https://github.com/ebrain)";
|
|
48
|
+
|
|
49
|
+
/**
 * Report whether `input` names a remote resource, i.e. starts with an
 * `http://` or `https://` scheme (case-insensitive, surrounding whitespace
 * ignored). Everything else, including `file://` URLs and plain paths, is
 * reported as not remote.
 */
export function isRemoteUrl(input: string): boolean {
  const candidate = input.trim();
  const remoteScheme = /^https?:\/\//i;
  return remoteScheme.test(candidate);
}
|
|
56
|
+
|
|
57
|
+
/** Media types that map to a document kind by exact match. */
const KIND_BY_MEDIA_TYPE: ReadonlyMap<string, DocumentKind> = new Map<string, DocumentKind>([
  ["application/pdf", "pdf"],
  ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"],
  ["application/vnd.ms-word.document.macroenabled.12", "docx"],
  ["application/msword", "doc"],
  ["text/markdown", "markdown"],
  ["text/x-markdown", "markdown"],
  ["text/html", "html"],
  ["application/xhtml+xml", "html"],
  ["application/json", "json"],
]);

/** File extensions (lower-case, no dot) that map to a document kind. */
const KIND_BY_EXTENSION: ReadonlyMap<string, DocumentKind> = new Map<string, DocumentKind>([
  ["pdf", "pdf"],
  ["docx", "docx"],
  ["doc", "doc"],
  ["md", "markdown"],
  ["markdown", "markdown"],
  ["mdx", "markdown"],
  ["htm", "html"],
  ["html", "html"],
  ["xhtml", "html"],
  ["json", "json"],
  ["txt", "text"],
  ["text", "text"],
  ["log", "text"],
  ["csv", "text"],
  ["tsv", "text"],
  ["yaml", "text"],
  ["yml", "text"],
  ["ini", "text"],
  ["rst", "text"],
  ["org", "text"],
]);

/**
 * Classify a document from its content type and/or file name.
 *
 * The content type (parameters such as `; charset=…` are ignored) is consulted
 * first; if it gives no answer the file extension decides.
 *
 * @returns The matching kind, or `"unknown"` when neither input is recognised.
 */
export function detectKind(opts: { fileName?: string; contentType?: string }): DocumentKind {
  const mediaType = (opts.contentType ?? "").toLowerCase().split(";")[0]?.trim();
  if (mediaType) {
    const exact = KIND_BY_MEDIA_TYPE.get(mediaType);
    if (exact !== undefined) return exact;
    // Structured-syntax suffix (e.g. application/ld+json) counts as JSON.
    if (mediaType.endsWith("+json")) return "json";
    if (mediaType.startsWith("text/")) return "text";
  }

  const dotted = extname(opts.fileName ?? "").toLowerCase();
  const extension = dotted.startsWith(".") ? dotted.slice(1) : dotted;
  return KIND_BY_EXTENSION.get(extension) ?? "unknown";
}
|
|
116
|
+
|
|
117
|
+
/** Raw outcome of downloading a URL, before any text extraction. */
interface FetchedResource {
  /** The response body. */
  bytes: Buffer;
  /** File name inferred for the download. */
  fileName: string;
  /** Value of the `Content-Type` response header, when one was sent. */
  contentType?: string;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Read a response body incrementally, refusing to buffer more than `maxBytes`.
 *
 * @throws Error when the body exceeds `maxBytes` or the transfer fails/aborts.
 */
async function readBodyWithLimit(
  resp: Response,
  url: string,
  maxBytes: number,
): Promise<Buffer> {
  const stream = resp.body;
  if (!stream) {
    // No readable stream exposed: fall back to the one-shot API.
    const whole = await resp.arrayBuffer();
    if (whole.byteLength > maxBytes) {
      throw new Error(
        `remote document too large: ${whole.byteLength} bytes (limit ${maxBytes})`,
      );
    }
    return Buffer.from(whole);
  }

  const reader = stream.getReader();
  const readChunk = async () => {
    try {
      return await reader.read();
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`failed to fetch ${url}: ${reason}`);
    }
  };

  const chunks: Uint8Array[] = [];
  let received = 0;
  for (;;) {
    const step = await readChunk();
    if (step.done) break;
    received += step.value.byteLength;
    if (received > maxBytes) {
      // Stop the download instead of buffering the rest of an oversized body.
      void reader.cancel().catch(() => undefined);
      throw new Error(
        `remote document too large: more than ${maxBytes} bytes (limit ${maxBytes})`,
      );
    }
    chunks.push(step.value);
  }
  return Buffer.concat(chunks, received);
}

/**
 * Download `url` and return its raw bytes, an inferred file name and the
 * response's `Content-Type` header.
 *
 * The timeout stays armed until the body has been fully read, so a server
 * that sends headers and then stalls cannot hang the caller. The size limit
 * is enforced while streaming, so a missing or wrong `Content-Length` cannot
 * make the process buffer more than `maxBytes`.
 *
 * @param url  Remote URL to download (redirects are followed).
 * @param opts Timeout in ms, maximum body size in bytes, and User-Agent value.
 * @returns The body bytes, inferred file name and content type (if any).
 * @throws Error if the request fails or times out, the status is not 2xx, or
 *         the body is larger than `opts.maxBytes`.
 */
async function fetchUrl(
  url: string,
  opts: Required<Pick<LoadDocumentOptions, "fetchTimeoutMs" | "maxBytes" | "userAgent">>,
): Promise<FetchedResource> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), opts.fetchTimeoutMs);
  try {
    let resp: Response;
    try {
      resp = await fetch(url, {
        headers: { "User-Agent": opts.userAgent, Accept: "*/*" },
        redirect: "follow",
        signal: controller.signal,
      });
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`failed to fetch ${url}: ${reason}`);
    }
    if (!resp.ok) {
      throw new Error(`fetch ${url} returned HTTP ${resp.status} ${resp.statusText}`);
    }

    // Cheap early rejection when the server announces the size up front.
    // A missing or unparsable header yields 0/NaN and skips this check.
    const contentLength = Number(resp.headers.get("content-length") ?? "0");
    if (contentLength && contentLength > opts.maxBytes) {
      throw new Error(
        `remote document too large: ${contentLength} bytes (limit ${opts.maxBytes})`,
      );
    }

    const bytes = await readBodyWithLimit(resp, url, opts.maxBytes);
    const contentType = resp.headers.get("content-type") ?? undefined;
    const fileName = inferFileNameFromUrl(url, resp, contentType);
    return { bytes, fileName, contentType };
  } catch (err) {
    // Abort whatever is still in flight so the connection is released.
    controller.abort();
    throw err;
  } finally {
    clearTimeout(timer);
  }
}
|
|
161
|
+
|
|
162
|
+
/**
 * Work out a file name for a downloaded resource.
 *
 * Sources, in order of preference:
 *   1. the `Content-Disposition` header (`filename*=UTF-8''…` or `filename=…`),
 *   2. the last path segment of the final URL, if it ends in an extension,
 *   3. `<host>.<ext>` synthesised from the host name and the content type
 *      (just `<host>` when the content type maps to no extension).
 *
 * Names taken from the header or URL are reduced to their last path component,
 * so a server cannot hand back something like `../../x` or `C:\dir\x`. A
 * candidate that ends up empty is skipped in favour of the next source.
 *
 * @param url         URL that was requested; used when `resp.url` is empty.
 * @param resp        The response whose headers and final URL are inspected.
 * @param contentType Raw `Content-Type` header value, if any.
 * @returns A file name without any directory part.
 */
function inferFileNameFromUrl(
  url: string,
  resp: Response,
  contentType?: string,
): string {
  // Percent-decode when possible; malformed escapes leave the text untouched.
  const decodeOrKeep = (raw: string): string => {
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  };
  // Keep only what follows the last `/` or `\`; "." and ".." are not names.
  const leafName = (candidate: string): string => {
    const leaf = (candidate.split(/[\\/]/).pop() ?? "").trim();
    return leaf === "." || leaf === ".." ? "" : leaf;
  };

  // 1. Content-Disposition wins if it carries a usable filename.
  const dispo = resp.headers.get("content-disposition");
  if (dispo) {
    const m =
      /filename\*=UTF-8''([^;]+)/i.exec(dispo) ??
      /filename="?([^";]+)"?/i.exec(dispo);
    if (m && m[1]) {
      const fromHeader = leafName(decodeOrKeep(m[1]));
      if (fromHeader) return fromHeader;
    }
  }

  // 2. Last path segment of the *final* URL (after redirects).
  const finalUrl = resp.url || url;
  let pathname = "";
  try {
    pathname = new URL(finalUrl).pathname;
  } catch {
    pathname = finalUrl;
  }
  const last = pathname.split("/").filter(Boolean).pop() ?? "";
  if (last && /\.[a-z0-9]{1,8}$/i.test(last)) {
    // Decoding may reveal an encoded separator (%2F), hence leafName again.
    const fromPath = leafName(decodeOrKeep(last));
    if (fromPath) return fromPath;
  }

  // 3. Synthesise from host + content-type extension.
  let host = "remote";
  try {
    host = new URL(finalUrl).hostname.replace(/^www\./, "");
  } catch {
    // ignore: keep the generic placeholder
  }
  const ext = mimeToExt(contentType);
  return ext ? `${host}.${ext}` : host;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Pick a file extension (without the dot) for a `Content-Type` value.
 * Parameters after `;` are ignored and matching is case-insensitive.
 *
 * @returns The extension, or `undefined` when the type is absent or unmapped.
 */
function mimeToExt(contentType?: string): string | undefined {
  const mediaType = (contentType ?? "").toLowerCase().split(";")[0]?.trim();
  if (!mediaType) return undefined;

  switch (mediaType) {
    case "application/pdf":
      return "pdf";
    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
      return "docx";
    case "application/msword":
      return "doc";
    case "text/markdown":
    case "text/x-markdown":
      return "md";
    case "text/html":
    case "application/xhtml+xml":
      return "html";
    case "application/json":
      return "json";
  }

  if (mediaType.endsWith("+json")) return "json";
  return mediaType.startsWith("text/") ? "txt" : undefined;
}
|
|
224
|
+
|
|
225
|
+
/**
 * Guess a document kind from the first four bytes of its content.
 *
 * Recognised prefixes:
 *   - `%PDF`            → `"pdf"`
 *   - `PK\x03\x04`      → `"docx"` (any ZIP container starts this way; docx is assumed)
 *   - `\xD0\xCF\x11\xE0` → `"doc"` (legacy OLE container, also used by .xls)
 *
 * @returns The guessed kind, or `undefined` when fewer than four bytes are
 *          available or no prefix matches.
 */
function detectKindFromMagic(bytes: Buffer): DocumentKind | undefined {
  if (bytes.length < 4) return undefined;

  const signatures: ReadonlyArray<readonly [DocumentKind, readonly number[]]> = [
    ["pdf", [0x25, 0x50, 0x44, 0x46]],
    ["docx", [0x50, 0x4b, 0x03, 0x04]],
    ["doc", [0xd0, 0xcf, 0x11, 0xe0]],
  ];
  for (const [kind, prefix] of signatures) {
    if (prefix.every((expected, offset) => bytes[offset] === expected)) {
      return kind;
    }
  }
  return undefined;
}
|
|
258
|
+
|
|
259
|
+
/**
 * Convert an HTML document or fragment to plain text.
 *
 * Steps: drop `<script>`/`<style>` blocks and comments, turn block-level
 * closing tags and `<br>` into newlines, remove every remaining tag, decode
 * character references, then tidy whitespace (trailing blanks before a
 * newline removed, runs of 3+ newlines collapsed to two, result trimmed).
 *
 * Character references are decoded in a single pass, so text that is itself
 * an escaped reference (an ampersand entity followed by `lt;`) is decoded
 * once only. Unknown named references and numeric references outside the
 * Unicode range are left exactly as written instead of throwing.
 *
 * @param html Markup to convert.
 * @returns The extracted text.
 */
export function htmlToPlainText(html: string): string {
  // Remove <script>/<style> blocks and comments entirely.
  let s = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");
  s = s.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "");
  s = s.replace(/<!--[\s\S]*?-->/g, "");
  // Convert block-level tags to newlines so paragraphs survive.
  s = s.replace(/<\/(p|div|section|article|li|h[1-6]|tr|table|br)\s*>/gi, "\n");
  s = s.replace(/<br\s*\/?>/gi, "\n");
  // Drop remaining tags.
  s = s.replace(/<[^>]+>/g, "");

  // Named references handled here, keyed by bare name (no `&` / `;`) so the
  // table stays intact even if this source is passed through an HTML renderer.
  const named = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    [">".length === 1 ? "gt" : "gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);
  s = s.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (whole: string, ref: string) => {
    if (ref.charAt(0) !== "#") {
      // Entity names are case-sensitive; anything unrecognised is kept verbatim.
      return named.get(ref) ?? whole;
    }
    const marker = ref.charAt(1);
    const codePoint =
      marker === "x" || marker === "X"
        ? Number.parseInt(ref.slice(2), 16)
        : Number.parseInt(ref.slice(1), 10);
    // String.fromCodePoint throws RangeError above U+10FFFF.
    const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
    return valid ? String.fromCodePoint(codePoint) : whole;
  });

  return s.replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
287
|
+
|
|
288
|
+
/**
 * Extract the text of a PDF with `unpdf` (loaded lazily on first use).
 *
 * @param bytes Raw PDF file contents.
 * @returns The trimmed text of all pages merged together, plus metadata
 *          holding the page count and the parser name.
 */
async function extractPdf(
  bytes: Buffer,
): Promise<{ text: string; metadata: Record<string, unknown> }> {
  const { extractText: readText, getDocumentProxy: openPdf } = await import("unpdf");
  const document = await openPdf(new Uint8Array(bytes));
  const extracted = await readText(document, { mergePages: true });
  const metadata = { pageCount: extracted.totalPages, parser: "unpdf" };
  return { text: extracted.text.trim(), metadata };
}
|
|
300
|
+
|
|
301
|
+
/**
 * Extract the raw text of a `.docx` file with `mammoth` (loaded lazily).
 *
 * @param bytes Raw `.docx` file contents.
 * @returns The trimmed text plus metadata: the parser name, mammoth's warning
 *          messages, and its error messages (`errors` is present only when
 *          there is at least one).
 */
async function extractDocx(
  bytes: Buffer,
): Promise<{ text: string; metadata: Record<string, unknown> }> {
  type MammothApi = typeof import("mammoth");
  const loaded = await import("mammoth");
  // CJS-from-ESM: depending on the toolchain the API sits on `default` or on
  // the namespace object itself.
  const api: MammothApi =
    (loaded as unknown as { default?: MammothApi }).default ??
    (loaded as unknown as MammothApi);

  const { value, messages } = await api.extractRawText({ buffer: bytes });

  const warnings: string[] = [];
  const errors: string[] = [];
  for (const note of messages) {
    if (note.type === "warning") {
      warnings.push(note.message);
    } else if (note.type === "error") {
      errors.push(note.message);
    }
  }

  const metadata: Record<string, unknown> = { parser: "mammoth", warnings };
  if (errors.length > 0) {
    metadata.errors = errors;
  }
  return { text: value.trim(), metadata };
}
|
|
325
|
+
|
|
326
|
+
/**
 * Load a document from a local path, a `file://` URL or a remote http(s) URL
 * and extract its text.
 *
 * Handling by kind:
 *   - PDF (`.pdf`, `application/pdf`) → text via `unpdf`
 *   - Word `.docx` → text via `mammoth`
 *   - HTML → tags stripped; JSON → re-indented when it parses, raw otherwise
 *   - Markdown / plain text → decoded as UTF-8
 *   - legacy `.doc` (OLE) → rejected with a conversion hint
 *   - unknown → decoded as UTF-8 only when the bytes look like text
 *
 * The kind comes from `opts.forceKind` when given, otherwise from the content
 * type / file extension. When nothing was forced and detection says
 * `"unknown"` or `"text"`, the leading magic bytes may upgrade the kind to
 * pdf/docx/doc (covers servers that send `application/octet-stream`). A kind
 * forced by the caller (other than `"unknown"`) is never overridden.
 *
 * @param input Local file path, `file://` URL, or http(s) URL.
 * @param opts  Optional overrides; see `LoadDocumentOptions`.
 * @returns The extracted text together with provenance details.
 * @throws Error when the source cannot be fetched or read, exceeds
 *         `maxBytes`, is in an unsupported format, or yields no text.
 */
export async function loadDocument(
  input: string,
  opts: LoadDocumentOptions = {},
): Promise<LoadedDocument> {
  const fetchTimeoutMs = opts.fetchTimeoutMs ?? DEFAULT_TIMEOUT;
  const maxBytes = opts.maxBytes ?? DEFAULT_MAX_BYTES;
  const userAgent = opts.userAgent ?? DEFAULT_UA;

  let bytes: Buffer;
  let fileName: string;
  let source: string;
  let sourceType: "url" | "file";
  let mimeType: string | undefined;

  if (isRemoteUrl(input)) {
    const fetched = await fetchUrl(input, {
      fetchTimeoutMs,
      maxBytes,
      userAgent,
    });
    bytes = fetched.bytes;
    fileName = fetched.fileName;
    source = input;
    sourceType = "url";
    mimeType = fetched.contentType;
  } else {
    // `file://` URLs are local files, but `resolve()` would treat the URL
    // text as a relative path, so convert it to a filesystem path first.
    let localPath = input;
    const trimmed = input.trim();
    if (/^file:\/\//i.test(trimmed)) {
      const { fileURLToPath } = await import("node:url");
      try {
        localPath = fileURLToPath(trimmed);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`invalid file URL ${input}: ${reason}`);
      }
    }
    const fullPath = resolve(localPath);
    const st = await stat(fullPath).catch(() => null);
    if (!st || !st.isFile()) {
      throw new Error(`file not found: ${input}`);
    }
    if (st.size > maxBytes) {
      throw new Error(
        `local document too large: ${st.size} bytes (limit ${maxBytes})`,
      );
    }
    bytes = await readFile(fullPath);
    fileName = basename(fullPath);
    source = fullPath;
    sourceType = "file";
  }

  let kind: DocumentKind =
    opts.forceKind ?? detectKind({ fileName, contentType: mimeType });
  // Magic-based fallback: covers servers that send `application/octet-stream`
  // (or a generic text type) for PDFs and ZIP-based docx files. Skipped when
  // the caller forced a concrete kind, which must win over sniffing.
  const kindWasForced = opts.forceKind !== undefined && opts.forceKind !== "unknown";
  if (!kindWasForced && (kind === "unknown" || kind === "text")) {
    const magic = detectKindFromMagic(bytes);
    if (magic === "pdf" || magic === "docx" || magic === "doc") {
      kind = magic;
    }
  }

  let text = "";
  let metadata: Record<string, unknown> = {};

  switch (kind) {
    case "pdf": {
      const out = await extractPdf(bytes);
      text = out.text;
      metadata = out.metadata;
      break;
    }
    case "docx": {
      const out = await extractDocx(bytes);
      text = out.text;
      metadata = out.metadata;
      break;
    }
    case "doc":
      throw new Error(
        `legacy .doc (OLE) format is not supported — convert to .docx or PDF first (e.g. via libreoffice --convert-to docx ${fileName})`,
      );
    case "html": {
      text = htmlToPlainText(bytes.toString("utf8"));
      metadata = { parser: "html-strip" };
      break;
    }
    case "json": {
      const raw = bytes.toString("utf8");
      try {
        // Normalise formatting; invalid JSON is kept verbatim rather than rejected.
        text = JSON.stringify(JSON.parse(raw), null, 2);
      } catch {
        text = raw;
      }
      metadata = { parser: "json" };
      break;
    }
    case "markdown":
    case "text":
      text = bytes.toString("utf8");
      metadata = { parser: "utf8" };
      break;
    case "unknown":
    default:
      // Last-ditch: if the bytes look like text, decode; otherwise reject.
      if (looksLikeText(bytes)) {
        text = bytes.toString("utf8");
        metadata = { parser: "utf8-fallback" };
      } else {
        throw new Error(
          `unsupported document format for ${fileName}` +
            (mimeType ? ` (content-type: ${mimeType})` : "") +
            ` — supported: pdf, docx, html, json, txt, md`,
        );
      }
      break;
  }

  if (!text.trim()) {
    throw new Error(
      `no text extracted from ${fileName} (kind=${kind}); document may be image-only or empty`,
    );
  }

  return {
    text,
    fileName,
    kind,
    source,
    sourceType,
    mimeType,
    bytes: bytes.length,
    metadata,
  };
}
|
|
462
|
+
|
|
463
|
+
/**
 * Heuristic text check over the first 512 bytes of a buffer.
 *
 * A byte counts as text-like when it is tab, LF, CR, printable ASCII
 * (0x20–0x7E), or any byte ≥ 0x80 (taken to be part of a multi-byte UTF-8
 * sequence). The buffer passes when at least 95% of the sampled bytes are
 * text-like; an empty buffer never passes.
 */
function looksLikeText(bytes: Buffer): boolean {
  const head = bytes.subarray(0, Math.min(bytes.length, 512));
  const sampled = head.length;
  if (sampled === 0) return false;

  const isTextByte = (b: number): boolean =>
    b === 0x09 ||
    b === 0x0a ||
    b === 0x0d ||
    (b >= 0x20 && b <= 0x7e) ||
    b >= 0x80;

  const textLike = head.reduce((count, b) => (isTextByte(b) ? count + 1 : count), 0);
  return textLike / sampled >= 0.95;
}
|
|
487
|
+
|
|
488
|
+
/** Extensions (lower-case, no dot) that directory ingestion picks up. */
const DOCUMENT_EXTENSIONS = new Set(["pdf", "docx"]);

/**
 * Walk `dir` recursively and gather every `.pdf` / `.docx` file beneath it
 * (extension match is case-insensitive).
 *
 * @param dir Directory to scan; resolved to an absolute path first.
 * @returns Absolute paths of the matching files, sorted.
 */
export async function collectDocumentFiles(dir: string): Promise<string[]> {
  const found: string[] = [];

  const isDocument = (name: string): boolean => {
    const extension = extname(name).toLowerCase().replace(/^\./, "");
    return DOCUMENT_EXTENSIONS.has(extension);
  };

  const visit = async (folder: string): Promise<void> => {
    for (const entry of await readdir(folder, { withFileTypes: true })) {
      const child = join(folder, entry.name);
      if (entry.isDirectory()) {
        await visit(child);
        continue;
      }
      if (entry.isFile() && isDocument(entry.name)) {
        found.push(child);
      }
    }
  };

  await visit(resolve(dir));
  found.sort();
  return found;
}
|
package/src/mcp/server.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { z } from "zod";
|
|
|
4
4
|
import { BrainDb } from "../db/client";
|
|
5
5
|
import { BrainRepository } from "../repositories/brain-repo";
|
|
6
6
|
import { loadSettings } from "../settings";
|
|
7
|
+
import { loadDocument, type DocumentKind } from "../markdown/document-loader";
|
|
7
8
|
|
|
8
9
|
// ============================================================================
|
|
9
10
|
// Error Handling Utilities
|
|
@@ -130,6 +131,7 @@ export const TOOL_MANIFEST = [
|
|
|
130
131
|
"brain_put",
|
|
131
132
|
"brain_delete",
|
|
132
133
|
"brain_ingest",
|
|
134
|
+
"brain_ingest_document",
|
|
133
135
|
"brain_link",
|
|
134
136
|
"brain_backlinks",
|
|
135
137
|
"brain_timeline",
|
|
@@ -289,6 +291,152 @@ export async function startMcpServer(dbPath: string): Promise<void> {
|
|
|
289
291
|
withErrorHandling("brain_ingest", brainIngestHandler),
|
|
290
292
|
);
|
|
291
293
|
|
|
294
|
+
// -- brain_ingest_document: load a local file or http(s) URL via loadDocument,
// store the extracted text as a page, then record a timeline entry and a raw
// provenance record. The last two steps are best-effort: their failures are
// ignored so that the ingest itself still succeeds.
const brainIngestDocumentHandler = async (args: {
  source: string;
  slug?: string;
  type?: string;
  format?: DocumentKind;
  max_bytes?: number;
  timeout_ms?: number;
}) => {
  const loaded = await loadDocument(args.source, {
    forceKind: args.format,
    maxBytes: args.max_bytes,
    fetchTimeoutMs: args.timeout_ms,
  });

  // Default slug: file name without its extension, lower-cased, with every run
  // of characters outside a-z / 0-9 / U+4E00–U+9FFF replaced by a single dash,
  // outer dashes removed, capped at 80 characters.
  const sanitizedStem = loaded.fileName
    .replace(/\.[^.]+$/, "")
    .toLowerCase()
    .replace(/[^a-z0-9\u4e00-\u9fff]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, 80);
  const slugBase = sanitizedStem || "document";
  const finalSlug = args.slug ?? `ingest/${slugBase}`;
  const finalType = args.type ?? loaded.kind;

  const page = await repo.putPage({
    slug: finalSlug,
    type: finalType,
    title: loaded.fileName,
    compiledTruth: loaded.text,
    timeline: "",
    frontmatter: {
      sourceFile: loaded.source,
      sourceType: loaded.sourceType,
      sourceKind: loaded.kind,
      sourceMimeType: loaded.mimeType,
      sourceBytes: loaded.bytes,
      sourceFileName: loaded.fileName,
      ...loaded.metadata,
    },
  });

  // Best-effort: timeline entry dated today (UTC date part of the ISO string).
  try {
    const today = new Date().toISOString().slice(0, 10);
    const detail = loaded.sourceType === "url" ? `Source URL: ${loaded.source}` : "";
    await repo.timelineAdd({
      pageSlug: finalSlug,
      date: today,
      source: finalType,
      summary: `Ingested ${loaded.kind} ${loaded.fileName}`,
      detail,
    });
  } catch {
    /* non-fatal */
  }

  // Best-effort: raw provenance record keyed by the source type.
  try {
    await repo.writeRaw(finalSlug, loaded.sourceType, {
      fileName: loaded.fileName,
      sourceRef: loaded.source,
      kind: loaded.kind,
      mimeType: loaded.mimeType,
      bytes: loaded.bytes,
      metadata: loaded.metadata,
      ingestedAt: new Date().toISOString(),
    });
  } catch {
    /* non-fatal */
  }

  const summary = {
    ok: true,
    slug: finalSlug,
    kind: loaded.kind,
    sourceType: loaded.sourceType,
    sourceRef: loaded.source,
    fileName: loaded.fileName,
    mimeType: loaded.mimeType,
    bytes: loaded.bytes,
    contentLength: loaded.text.length,
    page: { slug: page.slug, updatedAt: page.updatedAt },
    metadata: loaded.metadata,
  };
  return {
    content: [
      {
        type: "text",
        text: JSON.stringify(summary, null, 2),
      },
    ],
  };
};
|
|
390
|
+
|
|
391
|
+
// Input schema for brain_ingest_document; only `source` is required.
const brainIngestDocumentInputSchema = z.object({
  source: z.string().describe("Local file path or http(s) URL to ingest."),
  slug: z
    .string()
    .optional()
    .describe("Optional explicit page slug. Defaults to 'ingest/<sanitized-filename>'."),
  type: z
    .string()
    .optional()
    .describe("Optional page type override (defaults to detected kind)."),
  format: z
    .enum(["text", "markdown", "pdf", "docx", "doc", "html", "json", "unknown"])
    .optional()
    .describe("Force a specific document kind, bypassing auto-detection."),
  max_bytes: z
    .number()
    .int()
    .positive()
    .optional()
    .describe("Maximum bytes accepted from URL/file. Default 50MB."),
  timeout_ms: z
    .number()
    .int()
    .positive()
    .optional()
    .describe("Network fetch timeout for URLs in ms. Default 30000."),
});

// Register the tool with the shared error-handling wrapper around its handler.
server.registerTool(
  "brain_ingest_document",
  {
    description:
      "Ingest a document (PDF, Word .docx, HTML, JSON, plain text, markdown) from a local file path or http(s) URL. Extracts text content automatically based on file extension or HTTP content-type.",
    inputSchema: brainIngestDocumentInputSchema,
  },
  withErrorHandling("brain_ingest_document", brainIngestDocumentHandler),
);
|
|
439
|
+
|
|
292
440
|
// ---------------------------------------------------------------------------
|
|
293
441
|
// Link Tools
|
|
294
442
|
// ---------------------------------------------------------------------------
|