ex-brain 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/db/client.ts CHANGED
@@ -107,7 +107,11 @@ export class BrainDb {
107
107
  return new Promise((resolve) => setTimeout(resolve, ms));
108
108
  }
109
109
 
110
- static async connect(dbPath: string, settings?: ResolvedSettings): Promise<BrainDb> {
110
+ static async connect(
111
+ dbPath: string,
112
+ settings?: ResolvedSettings,
113
+ options?: { skipCollection?: boolean },
114
+ ): Promise<BrainDb> {
111
115
  try {
112
116
  const client = settings?.remote
113
117
  ? await BrainDb.openRemoteClient(settings.remote)
@@ -122,6 +126,15 @@ export class BrainDb {
122
126
  await client.execute(sql);
123
127
  }
124
128
 
129
+ // Skip collection creation for init (embedding config may not be ready)
130
+ if (options?.skipCollection) {
131
+ const db = new BrainDb(dbPath, client, null as unknown as Collection);
132
+ db._isConnected = true;
133
+ db._lastConnectedAt = new Date();
134
+ console.error("\x1b[32m[DB] Connected successfully\x1b[0m");
135
+ return db;
136
+ }
137
+
125
138
  const pagesCollection = await client.getOrCreateCollection({
126
139
  name: PAGES_COLLECTION,
127
140
  embeddingFunction: createBrainEmbeddingFunction(settings?.embed),
@@ -0,0 +1,486 @@
1
+ import { readFile, stat } from "node:fs/promises";
2
+ import { basename, extname, resolve } from "node:path";
3
+
4
+ /** Document kinds the loader recognises. "doc" is detected only so loadDocument can reject it with a conversion hint; "unknown" is resolved by magic-byte sniffing or a text heuristic. */
5
+ export type DocumentKind =
6
+ | "text"
7
+ | "markdown"
8
+ | "pdf"
9
+ | "docx"
10
+ | "doc"
11
+ | "html"
12
+ | "json"
13
+ | "unknown";
14
+
15
+ export interface LoadedDocument {
16
+ /** Extracted plain-text content (utf-8). */
17
+ text: string;
18
+ /** Original file/URL name without parent path. */
19
+ fileName: string;
20
+ /** Detected (or caller-forced) document kind. */
21
+ kind: DocumentKind;
22
+ /** Source descriptor: an absolute path for files or the original URL. */
23
+ source: string;
24
+ /** Where the source came from. */
25
+ sourceType: "url" | "file";
26
+ /** Raw Content-Type response header for URL sources; left undefined for local files. */
27
+ mimeType?: string;
28
+ /** Byte size of the *raw* source (downloaded bytes or file size). */
29
+ bytes: number;
30
+ /** Extra metadata: page count for PDF, mammoth warnings, etc. */
31
+ metadata: Record<string, unknown>;
32
+ }
33
+
34
+ export interface LoadDocumentOptions {
35
+ /** Override automatic kind detection. */
36
+ forceKind?: DocumentKind;
37
+ /** Network fetch timeout (ms). Default: 30s. */
38
+ fetchTimeoutMs?: number;
39
+ /** Maximum bytes accepted from a remote URL or a local file. Default: 50 MB. */
40
+ maxBytes?: number;
41
+ /** Custom User-Agent header for URL fetches. Defaults to the ebrain-ingest agent string. */
42
+ userAgent?: string;
43
+ }
44
+
45
+ const DEFAULT_TIMEOUT = 30_000;
46
+ const DEFAULT_MAX_BYTES = 50 * 1024 * 1024;
47
+ const DEFAULT_UA = "ebrain-ingest/1 (+https://github.com/ebrain)";
48
+
49
/**
 * Report whether `input` names a remote resource that has to be downloaded.
 *
 * Only an `http://` or `https://` prefix (case-insensitive, checked after
 * trimming surrounding whitespace) counts as remote. Everything else,
 * including `file://` URLs, is handled as a local path.
 */
export function isRemoteUrl(input: string): boolean {
  const candidate = input.trim().toLowerCase();
  return candidate.startsWith("http://") || candidate.startsWith("https://");
}
56
+
57
/**
 * Classify a document from its content-type and/or file name.
 *
 * A recognised content-type (parameters such as `; charset=…` are ignored)
 * always wins; the file extension is consulted only when the content-type is
 * absent or not recognised. Returns `"unknown"` when neither gives a match.
 */
export function detectKind(opts: {
  fileName?: string;
  contentType?: string;
}): DocumentKind {
  const mime = (opts.contentType ?? "").toLowerCase().split(";")[0]?.trim();
  if (mime) {
    switch (mime) {
      case "application/pdf":
        return "pdf";
      case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
      case "application/vnd.ms-word.document.macroenabled.12":
        return "docx";
      case "application/msword":
        return "doc";
      case "text/markdown":
      case "text/x-markdown":
        return "markdown";
      case "text/html":
      case "application/xhtml+xml":
        return "html";
      case "application/json":
        return "json";
      default:
        break;
    }
    // Structured-syntax suffix (e.g. application/ld+json) before the generic text/* bucket.
    if (mime.endsWith("+json")) return "json";
    if (mime.startsWith("text/")) return "text";
  }

  // A Map (not an object literal) so extensions such as "constructor" cannot
  // hit inherited properties.
  const kindByExtension = new Map<string, DocumentKind>([
    ["pdf", "pdf"],
    ["docx", "docx"],
    ["doc", "doc"],
    ["md", "markdown"],
    ["markdown", "markdown"],
    ["mdx", "markdown"],
    ["htm", "html"],
    ["html", "html"],
    ["xhtml", "html"],
    ["json", "json"],
    ["txt", "text"],
    ["text", "text"],
    ["log", "text"],
    ["csv", "text"],
    ["tsv", "text"],
    ["yaml", "text"],
    ["yml", "text"],
    ["ini", "text"],
    ["rst", "text"],
    ["org", "text"],
  ]);
  const suffix = extname(opts.fileName ?? "")
    .toLowerCase()
    .replace(/^\./, "");
  return kindByExtension.get(suffix) ?? "unknown";
}
116
+
117
/** Downloaded bytes of a remote resource plus the naming hints taken from the response. */
interface FetchedResource {
  /** Complete response body. */
  bytes: Buffer;
  /** File name inferred from the response headers / final URL. */
  fileName: string;
  /** Raw `Content-Type` response header, when the server sent one. */
  contentType?: string;
}

/**
 * Read a response body chunk by chunk, refusing to hold more than `maxBytes`.
 *
 * Streaming means an oversized response without a `Content-Length` header is
 * rejected as soon as the limit is crossed instead of after it has been fully
 * buffered in memory.
 *
 * @throws Error when the body exceeds `maxBytes` or the stream fails/aborts.
 */
async function readBodyWithLimit(
  resp: Response,
  url: string,
  maxBytes: number,
): Promise<Buffer> {
  if (!resp.body) {
    // No stream exposed by the runtime: fall back to buffering the body.
    const ab = await resp.arrayBuffer();
    if (ab.byteLength > maxBytes) {
      throw new Error(
        `remote document too large: ${ab.byteLength} bytes (limit ${maxBytes})`,
      );
    }
    return Buffer.from(ab);
  }

  const reader = resp.body.getReader();
  const chunks: Uint8Array[] = [];
  let received = 0;
  for (;;) {
    const step = await reader.read().catch((err: unknown) => {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`failed to fetch ${url}: ${reason}`);
    });
    if (step.done) break;
    const chunk: Uint8Array = step.value;
    received += chunk.byteLength;
    if (received > maxBytes) {
      // Stop the download; a failure to cancel must not mask the size error.
      await reader.cancel().catch(() => undefined);
      throw new Error(
        `remote document too large: received ${received}+ bytes (limit ${maxBytes})`,
      );
    }
    chunks.push(chunk);
  }
  return Buffer.concat(chunks, received);
}

/**
 * Download `url` (following redirects) and return its body and naming hints.
 *
 * The `fetchTimeoutMs` budget covers the whole transfer — connection, headers
 * and body — and `maxBytes` is enforced both against the advertised
 * `Content-Length` and against the bytes actually received.
 *
 * @param url  http(s) URL to download.
 * @param opts Fully-resolved timeout, size limit and User-Agent.
 * @returns The downloaded bytes, an inferred file name and the raw content-type.
 * @throws Error on network failure or timeout, a non-2xx status, or an
 *         oversized body.
 */
async function fetchUrl(
  url: string,
  opts: Required<Pick<LoadDocumentOptions, "fetchTimeoutMs" | "maxBytes" | "userAgent">>,
): Promise<FetchedResource> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), opts.fetchTimeoutMs);
  try {
    let resp: Response;
    try {
      resp = await fetch(url, {
        headers: { "User-Agent": opts.userAgent, Accept: "*/*" },
        redirect: "follow",
        signal: controller.signal,
      });
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`failed to fetch ${url}: ${reason}`);
    }
    if (!resp.ok) {
      throw new Error(`fetch ${url} returned HTTP ${resp.status} ${resp.statusText}`);
    }
    // Cheap early rejection when the server advertises the size up front.
    const contentLength = Number(resp.headers.get("content-length") ?? "0");
    if (contentLength && contentLength > opts.maxBytes) {
      throw new Error(
        `remote document too large: ${contentLength} bytes (limit ${opts.maxBytes})`,
      );
    }
    const bytes = await readBodyWithLimit(resp, url, opts.maxBytes);
    const contentType = resp.headers.get("content-type") ?? undefined;
    const fileName = inferFileNameFromUrl(url, resp, contentType);
    return { bytes, fileName, contentType };
  } finally {
    // The timer must stay armed until the body is fully read, so it is only
    // cleared here. Aborting releases the connection when we bail out early
    // and is a no-op once the transfer has completed.
    clearTimeout(timer);
    controller.abort();
  }
}
161
+
162
/**
 * Work out a display file name for a downloaded resource.
 *
 * Candidates, in priority order:
 *   1. the `Content-Disposition` filename (`filename*=UTF-8''…` preferred),
 *   2. the last path segment of the final (post-redirect) URL when it ends in
 *      something that looks like an extension,
 *   3. `<host>.<ext>` synthesised from the host name and the content-type.
 *
 * Server-supplied names are untrusted: they are reduced to their final path
 * component (so `../../x.pdf` yields `x.pdf`) and trimmed, and a candidate
 * that ends up empty is skipped in favour of the next one.
 *
 * @param url         URL that was requested.
 * @param resp        Response whose headers and final URL are inspected.
 * @param contentType Raw content-type header, used only for step 3.
 */
function inferFileNameFromUrl(
  url: string,
  resp: Response,
  contentType?: string,
): string {
  // Percent-decode (tolerating malformed escapes), then keep only the leaf name.
  const toLeafName = (raw: string): string => {
    let decoded = raw;
    try {
      decoded = decodeURIComponent(raw);
    } catch {
      // Malformed escape sequence: fall back to the raw text.
    }
    const leaf = decoded.split(/[\\/]/).pop()?.trim() ?? "";
    return leaf === "." || leaf === ".." ? "" : leaf;
  };

  // 1. Content-Disposition wins if it carries a usable filename.
  const dispo = resp.headers.get("content-disposition");
  if (dispo) {
    const m =
      /filename\*=UTF-8''([^;]+)/i.exec(dispo) ??
      /filename="?([^";]+)"?/i.exec(dispo);
    const named = m?.[1] ? toLeafName(m[1]) : "";
    if (named) return named;
  }

  // 2. Last path segment of the *final* URL (after redirects).
  const finalUrl = resp.url || url;
  let pathname = "";
  try {
    pathname = new URL(finalUrl).pathname;
  } catch {
    pathname = finalUrl;
  }
  const last = pathname.split("/").filter(Boolean).pop() ?? "";
  if (last && /\.[a-z0-9]{1,8}$/i.test(last)) {
    const named = toLeafName(last);
    if (named) return named;
  }

  // 3. Synthesise from host + content-type extension.
  let host = "remote";
  try {
    host = new URL(finalUrl).hostname.replace(/^www\./, "");
  } catch {
    // Unparseable URL: keep the generic placeholder.
  }
  const ext = mimeToExt(contentType);
  return ext ? `${host}.${ext}` : host;
}
207
+
208
/**
 * Translate a content-type header into a file extension (without the dot).
 *
 * Parameters after `;` are ignored and matching is case-insensitive. Returns
 * `undefined` for a missing header or a type with no known extension.
 */
function mimeToExt(contentType?: string): string | undefined {
  const mime = (contentType ?? "").toLowerCase().split(";")[0]?.trim();
  if (!mime) return undefined;

  switch (mime) {
    case "application/pdf":
      return "pdf";
    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
      return "docx";
    case "application/msword":
      return "doc";
    case "text/markdown":
    case "text/x-markdown":
      return "md";
    case "text/html":
    case "application/xhtml+xml":
      return "html";
    case "application/json":
      return "json";
    default:
      break;
  }
  // Pattern-based fallbacks: any "+json" structured type, then any other text/*.
  if (mime.endsWith("+json")) return "json";
  return mime.startsWith("text/") ? "txt" : undefined;
}
224
+
225
/**
 * Sniff the document kind from the first four bytes of the payload.
 *
 * Recognised signatures:
 *   - `25 50 44 46` ("%PDF")         → "pdf"
 *   - `50 4B 03 04` (ZIP local file) → "docx" (ZIP is also used by xlsx etc.;
 *     docx is assumed here)
 *   - `D0 CF 11 E0` (legacy OLE)     → "doc"
 *
 * Returns `undefined` for shorter buffers or any other prefix.
 */
function detectKindFromMagic(bytes: Buffer): DocumentKind | undefined {
  if (bytes.length < 4) return undefined;

  // Compare the 4-byte prefix as one big-endian unsigned integer.
  switch (bytes.readUInt32BE(0)) {
    case 0x25504446:
      return "pdf";
    case 0x504b0304:
      return "docx";
    case 0xd0cf11e0:
      return "doc";
    default:
      return undefined;
  }
}
258
+
259
/** Named character references decoded by `htmlToPlainText` (names are case-sensitive). */
const HTML_NAMED_ENTITIES = new Map<string, string>([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Strip HTML markup and collapse whitespace into plain text.
 *
 * Steps: drop `<script>`/`<style>` blocks and comments, turn block-level
 * closing tags and `<br>` into newlines, remove all remaining tags, decode
 * character references, then tidy trailing spaces and runs of blank lines.
 *
 * Character references are decoded in a single pass so already-decoded text is
 * never decoded again (`&amp;lt;` yields the literal text `&lt;`, not `<`).
 * Unknown names and numeric references outside the Unicode range are left
 * untouched rather than throwing.
 *
 * @param html Raw HTML source.
 * @returns Plain text with paragraphs separated by newlines.
 */
export function htmlToPlainText(html: string): string {
  // Remove <script>/<style> blocks and comments entirely.
  let s = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");
  s = s.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "");
  s = s.replace(/<!--[\s\S]*?-->/g, "");
  // Convert block-level tags to newlines so paragraphs survive.
  s = s.replace(/<\/(p|div|section|article|li|h[1-6]|tr|table|br)\s*>/gi, "\n");
  s = s.replace(/<br\s*\/?>/gi, "\n");
  // Drop remaining tags.
  s = s.replace(/<[^>]+>/g, "");
  // Decode named, decimal and hexadecimal character references in one pass.
  s = s.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z]+);/gi,
    (entity: string, body: string) => {
      if (!body.startsWith("#")) {
        return HTML_NAMED_ENTITIES.get(body) ?? entity;
      }
      const isHex = body.charAt(1).toLowerCase() === "x";
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number(body.slice(1));
      // String.fromCodePoint throws RangeError above U+10FFFF.
      return Number.isInteger(codePoint) && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
  return s.replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
}
287
+
288
/**
 * Extract the text of a PDF using the lazily imported `unpdf` package.
 *
 * @param bytes Raw PDF file contents.
 * @returns The trimmed text of all pages merged together, plus metadata
 *          recording the page count and the parser used.
 */
async function extractPdf(
  bytes: Buffer,
): Promise<{ text: string; metadata: Record<string, unknown> }> {
  // Imported on demand so the parser is only loaded when a PDF is ingested.
  const { getDocumentProxy: openPdf, extractText: readText } = await import("unpdf");
  const pdfDocument = await openPdf(new Uint8Array(bytes));
  const extracted = await readText(pdfDocument, { mergePages: true });
  const metadata: Record<string, unknown> = {
    pageCount: extracted.totalPages,
    parser: "unpdf",
  };
  return { text: extracted.text.trim(), metadata };
}
300
+
301
/**
 * Extract the raw text of a `.docx` file using the lazily imported `mammoth`
 * package.
 *
 * @param bytes Raw .docx file contents.
 * @returns The trimmed text plus metadata: the parser name, every warning
 *          message reported by mammoth, and an `errors` list that is present
 *          only when mammoth reported at least one error.
 */
async function extractDocx(
  bytes: Buffer,
): Promise<{ text: string; metadata: Record<string, unknown> }> {
  const loaded = await import("mammoth");
  // CJS-from-ESM interop: some toolchains expose the API on `default`.
  const viaDefault = (loaded as unknown as { default?: typeof import("mammoth") }).default;
  const mammoth: typeof import("mammoth") =
    viaDefault ?? (loaded as unknown as typeof import("mammoth"));

  const result = await mammoth.extractRawText({ buffer: bytes });

  // Split the diagnostics by severity in one pass, keeping their order.
  const warnings: string[] = [];
  const errors: string[] = [];
  for (const note of result.messages) {
    if (note.type === "warning") {
      warnings.push(note.message);
    } else if (note.type === "error") {
      errors.push(note.message);
    }
  }

  const metadata: Record<string, unknown> = { parser: "mammoth", warnings };
  if (errors.length > 0) {
    metadata.errors = errors;
  }
  return { text: result.value.trim(), metadata };
}
325
+
326
/**
 * Load and extract text content from a local file path or remote URL.
 *
 * Supported kinds:
 *   - PDF (`.pdf`, `application/pdf`) → text via `unpdf`
 *   - Word `.docx` → text via `mammoth`
 *   - HTML / Markdown / JSON / plain text → utf-8 decoded (HTML stripped,
 *     JSON re-indented when it parses)
 *
 * Kind resolution: `opts.forceKind` when given, otherwise content-type / file
 * extension. Magic-byte sniffing may upgrade an auto-detected "unknown" or
 * "text" result to pdf/docx/doc; it never overrides a kind the caller forced
 * (forcing "unknown" still asks for sniffing plus the text heuristic).
 *
 * @param input Local path or http(s) URL.
 * @param opts  Optional kind override, size limit, timeout and User-Agent.
 * @returns Extracted text together with provenance and parser metadata.
 * @throws Error when the source is missing, not a regular file, too large or
 *         unreachable; when the format is unsupported (including legacy
 *         `.doc`); or when no text could be extracted.
 */
export async function loadDocument(
  input: string,
  opts: LoadDocumentOptions = {},
): Promise<LoadedDocument> {
  const fetchTimeoutMs = opts.fetchTimeoutMs ?? DEFAULT_TIMEOUT;
  const maxBytes = opts.maxBytes ?? DEFAULT_MAX_BYTES;
  const userAgent = opts.userAgent ?? DEFAULT_UA;

  let bytes: Buffer;
  let fileName: string;
  let source: string;
  let sourceType: "url" | "file";
  let mimeType: string | undefined;

  if (isRemoteUrl(input)) {
    const fetched = await fetchUrl(input, {
      fetchTimeoutMs,
      maxBytes,
      userAgent,
    });
    bytes = fetched.bytes;
    fileName = fetched.fileName;
    source = input;
    sourceType = "url";
    mimeType = fetched.contentType;
  } else {
    const fullPath = resolve(input);
    // Any stat failure is reported as "not found".
    const st = await stat(fullPath).catch(() => null);
    if (!st || !st.isFile()) {
      throw new Error(`file not found: ${input}`);
    }
    if (st.size > maxBytes) {
      throw new Error(
        `local document too large: ${st.size} bytes (limit ${maxBytes})`,
      );
    }
    bytes = await readFile(fullPath);
    fileName = basename(fullPath);
    source = fullPath;
    sourceType = "file";
  }

  let kind: DocumentKind =
    opts.forceKind ?? detectKind({ fileName, contentType: mimeType });
  // Magic-based fallback / override: covers servers that send
  // `application/octet-stream` (or a generic text type) for PDFs and ZIP-based
  // docx files. An explicit caller choice other than "unknown" is respected.
  const callerForcedKind =
    opts.forceKind !== undefined && opts.forceKind !== "unknown";
  if (!callerForcedKind && (kind === "unknown" || kind === "text")) {
    kind = detectKindFromMagic(bytes) ?? kind;
  }

  let text = "";
  let metadata: Record<string, unknown> = {};

  switch (kind) {
    case "pdf": {
      const out = await extractPdf(bytes);
      text = out.text;
      metadata = out.metadata;
      break;
    }
    case "docx": {
      const out = await extractDocx(bytes);
      text = out.text;
      metadata = out.metadata;
      break;
    }
    case "doc":
      throw new Error(
        `legacy .doc (OLE) format is not supported — convert to .docx or PDF first (e.g. via libreoffice --convert-to docx ${fileName})`,
      );
    case "html": {
      text = htmlToPlainText(bytes.toString("utf8"));
      metadata = { parser: "html-strip" };
      break;
    }
    case "json": {
      const raw = bytes.toString("utf8");
      try {
        // Re-indent valid JSON; anything unparseable is kept verbatim.
        text = JSON.stringify(JSON.parse(raw), null, 2);
      } catch {
        text = raw;
      }
      metadata = { parser: "json" };
      break;
    }
    case "markdown":
    case "text":
      text = bytes.toString("utf8");
      metadata = { parser: "utf8" };
      break;
    case "unknown":
    default:
      // Last-ditch: if the bytes look like text, decode; otherwise reject.
      if (looksLikeText(bytes)) {
        text = bytes.toString("utf8");
        metadata = { parser: "utf8-fallback" };
      } else {
        throw new Error(
          `unsupported document format for ${fileName}` +
            (mimeType ? ` (content-type: ${mimeType})` : "") +
            ` — supported: pdf, docx, html, json, txt, md`,
        );
      }
      break;
  }

  if (!text.trim()) {
    throw new Error(
      `no text extracted from ${fileName} (kind=${kind}); document may be image-only or empty`,
    );
  }

  return {
    text,
    fileName,
    kind,
    source,
    sourceType,
    mimeType,
    bytes: bytes.length,
    metadata,
  };
}
462
+
463
/**
 * Heuristic text check over the first 512 bytes of a buffer.
 *
 * A byte counts as text-like when it is TAB, LF, CR, printable ASCII
 * (0x20–0x7E) or any value >= 0x80 (treated as part of a multi-byte UTF-8
 * sequence). The buffer is considered text when at least 95% of the sampled
 * bytes are text-like; an empty buffer is never text.
 */
function looksLikeText(bytes: Buffer): boolean {
  const sampleSize = Math.min(bytes.length, 512);
  if (sampleSize === 0) return false;

  const isTextLike = (byte: number): boolean =>
    byte === 0x09 ||
    byte === 0x0a ||
    byte === 0x0d ||
    (byte >= 0x20 && byte <= 0x7e) ||
    byte >= 0x80;

  let hits = 0;
  for (let i = 0; i < sampleSize; i += 1) {
    if (isTextLike(bytes.readUInt8(i))) hits += 1;
  }
  return hits / sampleSize >= 0.95;
}
package/src/mcp/server.ts CHANGED
@@ -4,6 +4,7 @@ import { z } from "zod";
4
4
  import { BrainDb } from "../db/client";
5
5
  import { BrainRepository } from "../repositories/brain-repo";
6
6
  import { loadSettings } from "../settings";
7
+ import { loadDocument, type DocumentKind } from "../markdown/document-loader";
7
8
 
8
9
  // ============================================================================
9
10
  // Error Handling Utilities
@@ -130,6 +131,7 @@ export const TOOL_MANIFEST = [
130
131
  "brain_put",
131
132
  "brain_delete",
132
133
  "brain_ingest",
134
+ "brain_ingest_document",
133
135
  "brain_link",
134
136
  "brain_backlinks",
135
137
  "brain_timeline",
@@ -289,6 +291,152 @@ export async function startMcpServer(dbPath: string): Promise<void> {
289
291
  withErrorHandling("brain_ingest", brainIngestHandler),
290
292
  );
291
293
 
294
// -- brain_ingest_document: ingest a PDF/Word/HTML/text file or http(s) URL
/**
 * Tool handler: load a document, store its text as a page, then record a
 * timeline entry and a raw provenance record.
 *
 * Storing the page is the only step that may fail the call. The timeline
 * entry and raw record are best-effort, but their failures are logged instead
 * of being silently discarded so partial ingests can be diagnosed.
 *
 * The slug defaults to `ingest/<sanitized-filename>`; a blank explicit slug is
 * treated as absent. The page type defaults to the detected document kind.
 */
const brainIngestDocumentHandler = async ({
  source,
  slug,
  type,
  format,
  max_bytes,
  timeout_ms,
}: {
  source: string;
  slug?: string;
  type?: string;
  format?: DocumentKind;
  max_bytes?: number;
  timeout_ms?: number;
}) => {
  const loaded = await loadDocument(source, {
    forceKind: format,
    maxBytes: max_bytes,
    fetchTimeoutMs: timeout_ms,
  });
  // Derive a slug from the file name: drop the extension, lowercase, squash
  // anything outside [a-z0-9] and CJK ideographs into "-", cap the length.
  const slugBase =
    loaded.fileName
      .replace(/\.[^.]+$/, "")
      .toLowerCase()
      .replace(/[^a-z0-9\u4e00-\u9fff]+/g, "-")
      .replace(/^-+|-+$/g, "")
      .slice(0, 80) || "document";
  const finalSlug =
    slug !== undefined && slug.trim() !== "" ? slug : `ingest/${slugBase}`;
  const finalType = type ?? loaded.kind;
  const page = await repo.putPage({
    slug: finalSlug,
    type: finalType,
    title: loaded.fileName,
    compiledTruth: loaded.text,
    timeline: "",
    frontmatter: {
      sourceFile: loaded.source,
      sourceType: loaded.sourceType,
      sourceKind: loaded.kind,
      sourceMimeType: loaded.mimeType,
      sourceBytes: loaded.bytes,
      sourceFileName: loaded.fileName,
      ...loaded.metadata,
    },
  });
  try {
    await repo.timelineAdd({
      pageSlug: finalSlug,
      date: new Date().toISOString().slice(0, 10),
      source: finalType,
      summary: `Ingested ${loaded.kind} ${loaded.fileName}`,
      detail:
        loaded.sourceType === "url" ? `Source URL: ${loaded.source}` : "",
    });
  } catch (err) {
    // Non-fatal: the page itself is already stored.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(
      `[MCP] brain_ingest_document: timeline entry failed for ${finalSlug}: ${reason}`,
    );
  }
  try {
    await repo.writeRaw(finalSlug, loaded.sourceType, {
      fileName: loaded.fileName,
      sourceRef: loaded.source,
      kind: loaded.kind,
      mimeType: loaded.mimeType,
      bytes: loaded.bytes,
      metadata: loaded.metadata,
      ingestedAt: new Date().toISOString(),
    });
  } catch (err) {
    // Non-fatal: the page itself is already stored.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(
      `[MCP] brain_ingest_document: raw record failed for ${finalSlug}: ${reason}`,
    );
  }
  return {
    content: [
      {
        type: "text",
        text: JSON.stringify(
          {
            ok: true,
            slug: finalSlug,
            kind: loaded.kind,
            sourceType: loaded.sourceType,
            sourceRef: loaded.source,
            fileName: loaded.fileName,
            mimeType: loaded.mimeType,
            bytes: loaded.bytes,
            contentLength: loaded.text.length,
            page: { slug: page.slug, updatedAt: page.updatedAt },
            metadata: loaded.metadata,
          },
          null,
          2,
        ),
      },
    ],
  };
};

// Register the tool; the schema mirrors the handler's argument object.
server.registerTool(
  "brain_ingest_document",
  {
    description:
      "Ingest a document (PDF, Word .docx, HTML, JSON, plain text, markdown) from a local file path or http(s) URL. Extracts text content automatically based on file extension or HTTP content-type.",
    inputSchema: z.object({
      source: z
        .string()
        .describe("Local file path or http(s) URL to ingest."),
      slug: z
        .string()
        .optional()
        .describe(
          "Optional explicit page slug. Defaults to 'ingest/<sanitized-filename>'.",
        ),
      type: z
        .string()
        .optional()
        .describe("Optional page type override (defaults to detected kind)."),
      format: z
        .enum([
          "text",
          "markdown",
          "pdf",
          "docx",
          "doc",
          "html",
          "json",
          "unknown",
        ])
        .optional()
        .describe("Force a specific document kind, bypassing auto-detection."),
      max_bytes: z
        .number()
        .int()
        .positive()
        .optional()
        .describe("Maximum bytes accepted from URL/file. Default 50MB."),
      timeout_ms: z
        .number()
        .int()
        .positive()
        .optional()
        .describe("Network fetch timeout for URLs in ms. Default 30000."),
    }),
  },
  withErrorHandling("brain_ingest_document", brainIngestDocumentHandler),
);
439
+
292
440
  // ---------------------------------------------------------------------------
293
441
  // Link Tools
294
442
  // ---------------------------------------------------------------------------
@@ -347,8 +347,13 @@ export class BrainRepository {
347
347
  metadatas: [meta],
348
348
  });
349
349
  } catch (error) {
350
+ const msg = error instanceof Error ? error.message : String(error);
351
+ // Dimension mismatch means the collection was created with a different
352
+ // embedding model. This is non-critical — pages still work, just no search.
353
+ if (msg.includes("Dimension mismatch")) {
354
+ return;
355
+ }
350
356
  const dbError = wrapDbError(error, "syncPageToSearch", { slug });
351
- logDbError(dbError);
352
357
  // Don't throw - sync failure shouldn't break the main flow
353
358
  console.warn(`[BrainRepo] syncPageToSearch failed for ${slug}: ${dbError.message}`);
354
359
  }
@@ -384,8 +389,11 @@ export class BrainRepository {
384
389
  metadatas: metas,
385
390
  });
386
391
  } catch (error) {
392
+ const msg = error instanceof Error ? error.message : String(error);
393
+ if (msg.includes("Dimension mismatch")) {
394
+ return;
395
+ }
387
396
  const dbError = wrapDbError(error, "syncPagesToSearch", { count: slugs.length });
388
- logDbError(dbError);
389
397
  // Don't throw - sync failure shouldn't break the main flow
390
398
  console.warn(`[BrainRepo] syncPagesToSearch failed: ${dbError.message}`);
391
399
  }