ex-brain 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
- import { basename, resolve } from "node:path";
1
+ import { basename, extname, resolve } from "node:path";
2
2
  import { readFileSync } from "node:fs";
3
+ import { createHash } from "node:crypto";
3
4
  import { Command } from "commander";
4
5
  import { DEFAULT_DB_NAME, inferTypeFromSlug, slugToTitle, normalizeLongSlug, slugify } from "../config";
5
6
  import { BrainDb } from "../db/client";
@@ -13,6 +14,7 @@ import {
13
14
  slugToPath,
14
15
  writeTextFile,
15
16
  } from "../markdown/io";
17
+ import { loadDocument, isRemoteUrl, type DocumentKind } from "../markdown/document-loader";
16
18
  import {
17
19
  extractTimelineLines,
18
20
  extractWikiStyleLinks,
@@ -52,6 +54,14 @@ function isDryRun(opts: Record<string, unknown>): boolean {
52
54
  return Boolean(opts.dryRun);
53
55
  }
54
56
 
57
+ /**
58
+ * Return the first 16 hex characters (64 bits) of the SHA-256 digest of `text`, hashed as UTF-8.
59
+ * Change detector only: `put` stores it as `_contentHash` in page frontmatter and skips the write when it matches.
60
+ */
61
+ function contentHash(text: string): string {
62
+ return createHash("sha256").update(text, "utf8").digest("hex").slice(0, 16);
63
+ }
64
+
55
65
  // Simple progress output to stderr (won't interfere with --json stdout).
56
66
  // e.g. "[3/42] import docs/api"
57
67
  function progress(label: string, current: number, total: number, json: boolean): void {
@@ -94,12 +104,12 @@ async function applyEntityLinks(
94
104
  }
95
105
  return { created: 0, linked: 0 };
96
106
  }
97
-
107
+
98
108
  // Filter by confidence
99
109
  const confidenceThreshold = settings.extraction.confidenceThreshold;
100
110
  const highConfidence = relations.filter((r) => r.confidence >= confidenceThreshold);
101
111
  const ignoredCount = relations.length - highConfidence.length;
102
-
112
+
103
113
  if (highConfidence.length === 0) {
104
114
  if (!json) {
105
115
  if (relations.length > 0) {
@@ -119,7 +129,7 @@ async function applyEntityLinks(
119
129
  // 1. Resolve entity slugs (disambiguation)
120
130
  const fromCandidate = entityToSlug(r.from.name, r.from.type);
121
131
  const toCandidate = entityToSlug(r.to.name, r.to.type);
122
-
132
+
123
133
  const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
124
134
  const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
125
135
 
@@ -144,7 +154,7 @@ async function applyEntityLinks(
144
154
  const duration = formatDuration(Date.now() - startTime);
145
155
  const entityNames = [...new Set(highConfidence.flatMap((r) => [r.from.name, r.to.name]))];
146
156
  spinner.succeed(`Extracted ${entityNames.length} entities: ${entityNames.join(", ")}`);
147
-
157
+
148
158
  // Print detailed info
149
159
  subItem(`${created} entity pages created`);
150
160
  subItem(`${linked} links added`);
@@ -225,23 +235,46 @@ Examples:
225
235
 
226
236
  // -- page CRUD ------------------------------------------------------------
227
237
 
238
+ // -- put ------------------------------------------------------------------
239
+ // Auto-detects file type: markdown goes through parsePageMarkdown,
240
+ // other formats (pdf, docx, html, txt, json) go through loadDocument.
241
+
242
/** Non-markdown extensions that should use the document ingestion path. */
const DOC_EXTENSIONS = new Set([
  "pdf", "docx", "doc", "html", "htm", "json", "txt", "text",
]);

/**
 * Whether a `--file` value should be treated as a document (not markdown).
 *
 * @param filePath - Local path or http(s) URL exactly as given on the command line.
 * @param forceKind - Optional `--format` override. Any value other than
 *   "markdown" forces the document path regardless of the extension.
 * @returns `true` when a non-markdown kind is forced or the (lower-cased)
 *   extension is listed in DOC_EXTENSIONS; `false` otherwise.
 */
function isDocumentFile(filePath: string, forceKind?: string): boolean {
  if (forceKind && forceKind !== "markdown") return true;

  // For http(s) URLs the extension must be read from the path part only:
  // extname("https://host/a.pdf?token=x") would otherwise yield ".pdf?token=x"
  // and the URL would be misrouted to the markdown branch.
  // Local paths are left untouched because "?" and "#" are legal file-name characters.
  const isHttpUrl = /^https?:\/\//i.test(filePath);
  const target = isHttpUrl ? filePath.replace(/[?#].*$/, "") : filePath;

  const ext = extname(target).toLowerCase().replace(/^\./, "");
  return DOC_EXTENSIONS.has(ext);
}
253
+
228
254
  addDryRun(
229
255
  program
230
256
  .command("put")
231
257
  .argument("[slug]", "page slug (optional; auto-generated if omitted)")
232
- .option("--file <path>", "read markdown from file")
258
+ .option("--file <path>", "read content from file (markdown, pdf, docx, html, txt, json)")
233
259
  .option("--stdin", "read markdown from stdin", false)
234
- .option("--type <type>", "page type")
235
- .option("--title <title>", "page title")
260
+ .option("--type <type>", "page type override")
261
+ .option("--title <title>", "page title override")
262
+ .option("--format <kind>", "force document kind (pdf|docx|html|json|markdown|text) — only needed for --file with non-md files when auto-detect fails")
263
+ .option("--max-bytes <number>", "max bytes for URL/file ingest", "52428800")
264
+ .option("--timeout <ms>", "fetch timeout for URLs in ms", "30000")
236
265
  .description(
237
- "create or update a page (idempotent; upserts by slug). If slug is omitted, it is auto-generated from file name, title, or timestamp.",
266
+ "create or update a page (idempotent; upserts by slug). Auto-detects file type: markdown is parsed normally, PDF/DOCX/HTML/TXT/JSON are extracted and ingested.",
238
267
  )
239
268
  .addHelpText(
240
269
  "after",
241
270
  `
242
271
  Examples:
243
- ebrain put --file api.md # auto-generate slug from file name
272
+ ebrain put --file api.md # markdown parsePageMarkdown
244
273
  ebrain put docs/api --file api.md # explicit slug
274
+ ebrain put --file report.pdf # pdf → auto-extract text
275
+ ebrain put docs/report --file report.pdf # explicit slug for pdf
276
+ ebrain put --file article.docx # docx → auto-extract text
277
+ ebrain put --file https://example.com/a.pdf # URL → download + extract
245
278
  cat note.md | ebrain put --stdin # auto-generate slug from title/timestamp
246
279
  ebrain put --title "My Note" --stdin # auto-generate slug from title
247
280
  ebrain put people/john --type person --title "John Doe"
@@ -256,9 +289,173 @@ Examples:
256
289
  stdin?: boolean;
257
290
  type?: string;
258
291
  title?: string;
292
+ format?: string;
293
+ maxBytes?: string;
294
+ timeout?: string;
259
295
  dryRun?: boolean;
260
296
  },
261
297
  ) => {
298
+ // ── Branch 1: document file (pdf/docx/html/txt/json or URL) ──
299
+ const forceKind = opts.format as DocumentKind | undefined;
300
+ if (opts.file && isDocumentFile(opts.file, opts.format)) {
301
+ const loaded = await loadDocument(opts.file, {
302
+ forceKind,
303
+ fetchTimeoutMs: opts.timeout ? Number(opts.timeout) : undefined,
304
+ maxBytes: opts.maxBytes ? Number(opts.maxBytes) : undefined,
305
+ });
306
+ const content = loaded.text;
307
+ const fileName = loaded.fileName;
308
+ const kind = loaded.kind;
309
+ const sourceRef = loaded.source;
310
+ const sourceType = loaded.sourceType;
311
+ const mimeType = loaded.mimeType;
312
+ const bytes = loaded.bytes;
313
+ const metadata = loaded.metadata;
314
+
315
+ let finalSlug = slug;
316
+ if (!finalSlug) {
317
+ const nameNoExt = fileName.replace(/\.[^.]+$/, "");
318
+ const slugBase = normalizeLongSlug(slugify(nameNoExt));
319
+ finalSlug = `ingest/${slugBase}`;
320
+ }
321
+
322
+ const type = opts.type ?? kind;
323
+ const title =
324
+ opts.title ??
325
+ String(slugToTitle(finalSlug));
326
+ const hash = contentHash(content);
327
+ const frontmatter: Record<string, unknown> = {
328
+ sourceFile: sourceRef,
329
+ sourceType,
330
+ sourceKind: kind,
331
+ sourceMimeType: mimeType,
332
+ sourceBytes: bytes,
333
+ sourceFileName: fileName,
334
+ _contentHash: hash,
335
+ ...metadata,
336
+ };
337
+
338
+ if (isDryRun(opts)) {
339
+ print(program, {
340
+ dryRun: true,
341
+ action: "put",
342
+ slug: finalSlug,
343
+ type,
344
+ title,
345
+ kind,
346
+ sourceType,
347
+ sourceRef,
348
+ mimeType,
349
+ bytes,
350
+ contentLength: content.length,
351
+ contentHash: hash,
352
+ metadata,
353
+ });
354
+ return;
355
+ }
356
+
357
+ await withRepo(program, async (repo) => {
358
+ const jsonOut = isJson(program);
359
+ const spinner = createSpinner();
360
+ const startTime = Date.now();
361
+
362
+ // Check if content has already been ingested (idempotency)
363
+ const existingPage = await repo.getPage(finalSlug);
364
+ const existingHash = existingPage?.frontmatter._contentHash as string | undefined;
365
+
366
+ if (existingHash === hash) {
367
+ if (!jsonOut) {
368
+ header(`Put: ${fileName}`);
369
+ success(`Content unchanged — skipped (hash: ${hash})`);
370
+ }
371
+ print(program, {
372
+ ok: true,
373
+ action: "put",
374
+ slug: finalSlug,
375
+ unchanged: true,
376
+ contentHash: hash,
377
+ });
378
+ return;
379
+ }
380
+
381
+ if (!jsonOut) {
382
+ header(`Put: ${fileName}`);
383
+ keyValue("Kind", kind);
384
+ keyValue("Source", sourceRef);
385
+ if (mimeType) keyValue("Content-Type", mimeType);
386
+ keyValue("Bytes", String(bytes));
387
+ if (existingPage) {
388
+ keyValue("Previous hash", existingHash ?? "none");
389
+ keyValue("New hash", hash);
390
+ }
391
+ spinner.start(`Creating page from ${kind}...`);
392
+ }
393
+
394
+ await repo.putPage({
395
+ slug: finalSlug,
396
+ type,
397
+ title,
398
+ compiledTruth: content,
399
+ timeline: "",
400
+ frontmatter,
401
+ });
402
+
403
+ if (!jsonOut) {
404
+ spinner.succeed(`Page created: ${finalSlug}`);
405
+ keyValue("Type", type);
406
+ keyValue("Content length", `${content.length} chars`);
407
+ }
408
+
409
+ // ── Side-effect operations (only on new/changed content) ──
410
+ await repo.timelineAdd({
411
+ pageSlug: finalSlug,
412
+ date: new Date().toISOString().slice(0, 10),
413
+ source: type,
414
+ summary: `Ingested ${kind} ${fileName}`,
415
+ detail: sourceType === "url" ? `Source URL: ${sourceRef}` : "",
416
+ });
417
+
418
+ try {
419
+ await repo.writeRaw(finalSlug, sourceType, {
420
+ fileName,
421
+ sourceRef,
422
+ kind,
423
+ mimeType,
424
+ bytes,
425
+ metadata,
426
+ ingestedAt: new Date().toISOString(),
427
+ });
428
+ } catch (err) {
429
+ if (!jsonOut) {
430
+ warning(
431
+ `failed to record raw_data: ${err instanceof Error ? err.message : String(err)}`,
432
+ );
433
+ }
434
+ }
435
+
436
+ await applyEntityLinks(repo, finalSlug, content, jsonOut);
437
+
438
+ if (!jsonOut) {
439
+ const duration = formatDuration(Date.now() - startTime);
440
+ success(`Operation completed in ${duration}`);
441
+ }
442
+
443
+ print(program, {
444
+ ok: true,
445
+ action: "put",
446
+ slug: finalSlug,
447
+ kind,
448
+ sourceType,
449
+ sourceRef,
450
+ bytes,
451
+ contentLength: content.length,
452
+ contentHash: hash,
453
+ });
454
+ });
455
+ return;
456
+ }
457
+
458
+ // ── Branch 2: markdown (stdin or .md file) ──
262
459
  const input = await resolveInput(opts.file, opts.stdin ?? false);
263
460
  if (!input.trim()) {
264
461
  throw new Error(
@@ -266,7 +463,7 @@ Examples:
266
463
  );
267
464
  }
268
465
  const parsed = parsePageMarkdown(input);
269
-
466
+
270
467
  // Auto-generate slug if not provided
271
468
  let finalSlug = slug;
272
469
  if (!finalSlug) {
@@ -284,7 +481,7 @@ Examples:
284
481
  finalSlug = `notes/${timestamp}`;
285
482
  }
286
483
  }
287
-
484
+
288
485
  const type =
289
486
  opts.type ??
290
487
  String(parsed.frontmatter.type ?? inferTypeFromSlug(finalSlug));
@@ -292,6 +489,10 @@ Examples:
292
489
  opts.title ??
293
490
  String(parsed.frontmatter.title ?? slugToTitle(finalSlug));
294
491
 
492
+ // Compute content hash and embed in frontmatter for idempotency
493
+ const hash = contentHash(parsed.compiledTruth);
494
+ parsed.frontmatter._contentHash = hash;
495
+
295
496
  if (isDryRun(opts)) {
296
497
  print(program, {
297
498
  dryRun: true,
@@ -300,6 +501,7 @@ Examples:
300
501
  type,
301
502
  title,
302
503
  contentLength: parsed.compiledTruth.length,
504
+ contentHash: hash,
303
505
  hasTimeline: !!parsed.timeline,
304
506
  frontmatterKeys: Object.keys(parsed.frontmatter),
305
507
  });
@@ -310,12 +512,35 @@ Examples:
310
512
  const jsonOut = isJson(program);
311
513
  const spinner = createSpinner();
312
514
  const startTime = Date.now();
313
-
515
+
516
+ // Check if content is unchanged (idempotency)
517
+ const existingPage = await repo.getPage(finalSlug);
518
+ const existingHash = existingPage?.frontmatter._contentHash as string | undefined;
519
+
520
+ if (existingHash === hash) {
521
+ if (!jsonOut) {
522
+ header(`Put: ${finalSlug}`);
523
+ success(`Content unchanged — skipped (hash: ${hash})`);
524
+ }
525
+ print(program, {
526
+ ok: true,
527
+ action: "put",
528
+ slug: finalSlug,
529
+ unchanged: true,
530
+ contentHash: hash,
531
+ });
532
+ return;
533
+ }
534
+
314
535
  if (!jsonOut) {
315
536
  header(`Put: ${finalSlug}`);
537
+ if (existingPage) {
538
+ keyValue("Previous hash", existingHash ?? "none");
539
+ keyValue("New hash", hash);
540
+ }
316
541
  spinner.start(`Creating/updating page...`);
317
542
  }
318
-
543
+
319
544
  const page = await repo.putPage({
320
545
  slug: finalSlug,
321
546
  type,
@@ -324,27 +549,32 @@ Examples:
324
549
  timeline: parsed.timeline,
325
550
  frontmatter: parsed.frontmatter,
326
551
  });
327
-
552
+
328
553
  if (!jsonOut) {
329
554
  spinner.succeed(`Page saved: ${page.slug}`);
330
555
  keyValue("Title", title);
331
556
  keyValue("Type", type);
332
557
  keyValue("Content length", `${parsed.compiledTruth.length} chars`);
333
558
  }
334
-
559
+
335
560
  await applyEntityLinks(
336
561
  repo,
337
562
  finalSlug,
338
563
  parsed.compiledTruth,
339
564
  jsonOut,
340
565
  );
341
-
566
+
342
567
  if (!jsonOut) {
343
568
  const duration = formatDuration(Date.now() - startTime);
344
569
  success(`Operation completed in ${duration}`);
345
570
  }
346
-
347
- print(program, { ok: true, slug: page.slug, updatedAt: page.updatedAt });
571
+
572
+ print(program, {
573
+ ok: true,
574
+ slug: page.slug,
575
+ updatedAt: page.updatedAt,
576
+ contentHash: hash,
577
+ });
348
578
  });
349
579
  },
350
580
  );
@@ -415,18 +645,18 @@ Examples:
415
645
  await withRepo(program, async (repo) => {
416
646
  const jsonOut = isJson(program);
417
647
  const spinner = createSpinner();
418
-
648
+
419
649
  if (!jsonOut) {
420
650
  header(`Delete: ${slug}`);
421
651
  spinner.start(`Deleting page and related data...`);
422
652
  }
423
-
653
+
424
654
  await repo.deletePage(slug);
425
-
655
+
426
656
  if (!jsonOut) {
427
657
  spinner.succeed(`Page deleted: ${slug}`);
428
658
  }
429
-
659
+
430
660
  print(program, { ok: true, action: "delete", slug });
431
661
  });
432
662
  });
@@ -522,7 +752,7 @@ Examples:
522
752
  await withRepo(program, async (repo) => {
523
753
  const limit = Number(opts.limit ?? 10);
524
754
  const hits = await repo.query(question, limit);
525
-
755
+
526
756
  // If --llm flag, generate answer based on multi-layer context
527
757
  if (opts.llm) {
528
758
  const settings = await loadSettings();
@@ -530,20 +760,20 @@ Examples:
530
760
  print(program, { error: "LLM not configured. Set llm.baseURL in settings." });
531
761
  return;
532
762
  }
533
-
763
+
534
764
  const progress = createProgress();
535
765
  progress.start("Searching knowledge base...");
536
-
766
+
537
767
  const contextLimit = Number(opts.contextLimit ?? 5);
538
768
  const topHits = hits.slice(0, contextLimit);
539
-
769
+
540
770
  if (topHits.length === 0) {
541
771
  progress.stop();
542
772
  process.stderr.write("No relevant pages found.\n");
543
773
  print(program, { answer: "No relevant information found in the knowledge base.", sources: [] });
544
774
  return;
545
775
  }
546
-
776
+
547
777
  // Collect multi-layer context (primary + raw data + linked pages scored by relevance)
548
778
  // ~100KB char budget ≈ 25K tokens, safe for most models
549
779
  const MAX_CONTEXT_CHARS = 100_000;
@@ -553,33 +783,33 @@ Examples:
553
783
  progress.update(`Loading ${stage}...`);
554
784
  });
555
785
  const ctxDuration = formatDuration(Date.now() - ctxStart);
556
-
786
+
557
787
  if (sections.length === 0) {
558
788
  progress.stop();
559
789
  process.stderr.write("No content could be loaded.\n");
560
790
  print(program, { answer: "Failed to load page content.", sources: [] });
561
791
  return;
562
792
  }
563
-
793
+
564
794
  progress.succeed(`Loaded ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s) (${ctxDuration})`);
565
795
  const startTime = Date.now();
566
-
796
+
567
797
  const { answer, ok } = await generateAnswerWithStream(question, sections, stats, settings.llm);
568
-
798
+
569
799
  if (!ok) {
570
800
  // If streaming failed, answer contains the error message
571
801
  console.log(answer);
572
802
  return;
573
803
  }
574
-
804
+
575
805
  const duration = formatDuration(Date.now() - startTime);
576
-
806
+
577
807
  // Show sources breakdown
578
808
  console.log("\n---\n**Sources:**\n");
579
809
  for (let i = 0; i < sections.length; i++) {
580
810
  const s = sections[i];
581
811
  const icon = s.type === 'primary' ? '📄' : s.type === 'raw_data' ? '📎' : '🔗';
582
- console.log(`${icon} ${i + 1}. [[${s.slug}|${s.title}]] ${s.label} (${(s.content.length / 1024).toFixed(1)}KB)`);
812
+ console.log(`${icon} ${i + 1}. [[${s.slug}|${s.title}]] - ${s.label} (${(s.content.length / 1024).toFixed(1)}KB)`);
583
813
  }
584
814
  console.log(`\n*Context: ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s)*`);
585
815
  } else {
@@ -763,11 +993,11 @@ Examples:
763
993
  throw new Error(`page not found: ${slug}`);
764
994
  }
765
995
  const settings = await loadSettings();
766
-
996
+
767
997
  const progress = createProgress();
768
998
  progress.start(`Extracting timeline from ${slug}...`);
769
999
  const startTime = Date.now();
770
-
1000
+
771
1001
  const result = await repo.extractAndAddTimeline(
772
1002
  slug,
773
1003
  page.compiledTruth,
@@ -775,16 +1005,16 @@ Examples:
775
1005
  opts.defaultDate ?? new Date().toISOString().slice(0, 10),
776
1006
  settings.llm,
777
1007
  );
778
-
1008
+
779
1009
  const duration = formatDuration(Date.now() - startTime);
780
-
1010
+
781
1011
  if (result.entries.length > 0) {
782
1012
  progress.succeed(`${result.entries.length} events extracted (${duration})`);
783
1013
  } else {
784
1014
  progress.stop();
785
1015
  process.stderr.write(`No events found (${duration})\n`);
786
1016
  }
787
-
1017
+
788
1018
  print(program, {
789
1019
  ok: true,
790
1020
  action: "timeline-extract",
@@ -947,7 +1177,7 @@ Examples:
947
1177
  data = JSON.parse(opts.data);
948
1178
  } else if (opts.stdin) {
949
1179
  const raw = await readMaybeStdin();
950
- if (!raw?.trim()) throw new Error("empty stdin pipe JSON");
1180
+ if (!raw?.trim()) throw new Error("empty stdin - pipe JSON");
951
1181
  data = JSON.parse(raw);
952
1182
  } else {
953
1183
  throw new Error("provide --data <json> or --stdin");
@@ -996,7 +1226,7 @@ Examples:
996
1226
  await withRepo(program, async (repo) => {
997
1227
  const root = resolve(dir);
998
1228
  const files = await collectMarkdownFiles(root);
999
-
1229
+
1000
1230
  if (isDryRun(opts)) {
1001
1231
  print(program, {
1002
1232
  dryRun: true,
@@ -1012,16 +1242,16 @@ Examples:
1012
1242
  const settings = await loadSettings();
1013
1243
  const spinner = createSpinner();
1014
1244
  const startTime = Date.now();
1015
-
1245
+
1016
1246
  if (!jsonOut) {
1017
1247
  header(`Import: ${root}`);
1018
1248
  }
1019
-
1249
+
1020
1250
  // Phase 1: Parse all files and collect data
1021
1251
  if (!jsonOut) {
1022
1252
  spinner.start(`Scanning ${files.length} files...`);
1023
1253
  }
1024
-
1254
+
1025
1255
  const fileData: Array<{
1026
1256
  file: string;
1027
1257
  slug: string;
@@ -1031,7 +1261,7 @@ Examples:
1031
1261
  timelineEntries: ReturnType<typeof extractTimelineLines>;
1032
1262
  tags: string[];
1033
1263
  }> = [];
1034
-
1264
+
1035
1265
  for (const file of files) {
1036
1266
  const rawSlug = pathToSlug(file, root);
1037
1267
  const slug = normalizeLongSlug(rawSlug);
@@ -1044,19 +1274,19 @@ Examples:
1044
1274
  : [];
1045
1275
  fileData.push({ file, slug, parsed, content, wikiLinks, timelineEntries, tags });
1046
1276
  }
1047
-
1277
+
1048
1278
  if (!jsonOut) {
1049
1279
  spinner.succeed(`Found ${files.length} markdown files`);
1050
1280
  }
1051
-
1281
+
1052
1282
  // Phase 2: Write all pages first (skip embed for performance)
1053
1283
  if (!jsonOut) {
1054
1284
  spinner.start(`Writing ${fileData.length} pages to database...`);
1055
1285
  }
1056
-
1286
+
1057
1287
  const allSlugs: string[] = [];
1058
1288
  const writeErrors: string[] = [];
1059
-
1289
+
1060
1290
  for (let i = 0; i < fileData.length; i++) {
1061
1291
  const { slug, parsed } = fileData[i]!;
1062
1292
  if (!jsonOut && i % 20 === 0) {
@@ -1076,7 +1306,7 @@ Examples:
1076
1306
  writeErrors.push(`${slug}: ${err instanceof Error ? err.message : String(err)}`);
1077
1307
  }
1078
1308
  }
1079
-
1309
+
1080
1310
  if (!jsonOut) {
1081
1311
  spinner.succeed(`Wrote ${allSlugs.length} pages to database`);
1082
1312
  if (writeErrors.length > 0) {
@@ -1089,16 +1319,16 @@ Examples:
1089
1319
  }
1090
1320
  }
1091
1321
  }
1092
-
1322
+
1093
1323
  // Phase 3: Parallel entity extraction (main optimization)
1094
1324
  const BATCH_SIZE = 10;
1095
1325
  const entityResults = new Map<string, Awaited<ReturnType<typeof extractRelations>>>();
1096
-
1326
+
1097
1327
  if (settings.llm.baseURL) {
1098
1328
  if (!jsonOut) {
1099
1329
  spinner.start(`Extracting entities with LLM...`);
1100
1330
  }
1101
-
1331
+
1102
1332
  for (let i = 0; i < fileData.length; i += BATCH_SIZE) {
1103
1333
  const batch = fileData.slice(i, i + BATCH_SIZE);
1104
1334
  if (!jsonOut) {
@@ -1113,7 +1343,7 @@ Examples:
1113
1343
  entityResults.set(slug, relations);
1114
1344
  }
1115
1345
  }
1116
-
1346
+
1117
1347
  if (!jsonOut) {
1118
1348
  spinner.succeed(`Entity extraction complete`);
1119
1349
  }
@@ -1122,17 +1352,17 @@ Examples:
1122
1352
  warning(`LLM not configured, skipping entity extraction`);
1123
1353
  }
1124
1354
  }
1125
-
1355
+
1126
1356
  // Phase 4: Write links, tags, timeline, and entity pages
1127
1357
  if (!jsonOut) {
1128
1358
  spinner.start(`Creating links, tags, and timeline entries...`);
1129
1359
  }
1130
-
1360
+
1131
1361
  let linkCount = 0;
1132
1362
  let timelineCount = 0;
1133
1363
  let entityCount = 0;
1134
1364
  let tagCount = 0;
1135
-
1365
+
1136
1366
  // Collect timeline entries for batch insert
1137
1367
  const allTimelineEntries: Array<{
1138
1368
  pageSlug: string;
@@ -1141,14 +1371,14 @@ Examples:
1141
1371
  summary: string;
1142
1372
  detail: string;
1143
1373
  }> = [];
1144
-
1374
+
1145
1375
  for (const { slug, wikiLinks, timelineEntries, tags, content } of fileData) {
1146
1376
  // Wiki links
1147
1377
  for (const link of wikiLinks) {
1148
1378
  await repo.link(slug, link, "import");
1149
1379
  linkCount++;
1150
1380
  }
1151
-
1381
+
1152
1382
  // Collect timeline entries for batch insert
1153
1383
  for (const entry of timelineEntries) {
1154
1384
  allTimelineEntries.push({
@@ -1160,13 +1390,13 @@ Examples:
1160
1390
  });
1161
1391
  timelineCount++;
1162
1392
  }
1163
-
1393
+
1164
1394
  // Tags
1165
1395
  for (const tag of tags) {
1166
1396
  await repo.tag(slug, tag);
1167
1397
  tagCount++;
1168
1398
  }
1169
-
1399
+
1170
1400
  // Entity links from parallel extraction
1171
1401
  const relations = entityResults.get(slug);
1172
1402
  if (relations && relations.length > 0) {
@@ -1176,12 +1406,12 @@ Examples:
1176
1406
  const toCandidate = entityToSlug(r.to.name, r.to.type);
1177
1407
  const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
1178
1408
  const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
1179
-
1409
+
1180
1410
  const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, slug);
1181
1411
  const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, slug);
1182
1412
  if (c1) entityCount++;
1183
1413
  if (c2) entityCount++;
1184
-
1414
+
1185
1415
  await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
1186
1416
  await repo.link(slug, fromSlug, `Mentions ${r.from.name}`);
1187
1417
  await repo.link(slug, toSlug, `Mentions ${r.to.name}`);
@@ -1189,16 +1419,16 @@ Examples:
1189
1419
  }
1190
1420
  }
1191
1421
  }
1192
-
1422
+
1193
1423
  // Batch insert all timeline entries
1194
1424
  if (allTimelineEntries.length > 0) {
1195
1425
  await repo.timelineAddBatch(allTimelineEntries);
1196
1426
  }
1197
-
1427
+
1198
1428
  if (!jsonOut) {
1199
1429
  spinner.succeed(`Created links, tags, and timeline`);
1200
1430
  }
1201
-
1431
+
1202
1432
  // Phase 5: Batch sync all pages to search index
1203
1433
  if (opts.skipIndex) {
1204
1434
  if (!jsonOut) {
@@ -1209,14 +1439,14 @@ Examples:
1209
1439
  spinner.start(`Indexing ${allSlugs.length} pages for search...`);
1210
1440
  }
1211
1441
  await repo.embedAll();
1212
-
1442
+
1213
1443
  if (!jsonOut) {
1214
1444
  spinner.succeed(`Search indexing complete`);
1215
1445
  }
1216
1446
  }
1217
-
1447
+
1218
1448
  const duration = formatDuration(Date.now() - startTime);
1219
-
1449
+
1220
1450
  if (!jsonOut) {
1221
1451
  // Print summary
1222
1452
  header("Import Summary");
@@ -1227,12 +1457,12 @@ Examples:
1227
1457
  keyValue("Timeline entries", String(timelineCount));
1228
1458
  keyValue("Tags added", String(tagCount));
1229
1459
  keyValue("Duration", duration);
1230
-
1460
+
1231
1461
  if (writeErrors.length > 0) {
1232
1462
  warning(`${writeErrors.length} pages had errors`);
1233
1463
  }
1234
1464
  }
1235
-
1465
+
1236
1466
  print(program, {
1237
1467
  ok: true,
1238
1468
  importedFiles: files.length,
@@ -1280,116 +1510,6 @@ Examples:
1280
1510
  });
1281
1511
  });
1282
1512
 
1283
- // -- ingest ---------------------------------------------------------------
1284
-
1285
- addDryRun(
1286
- program
1287
- .command("ingest")
1288
- .argument("[file]", "file path to ingest (omit for stdin)")
1289
- .option("--type <type>", "source type", "doc")
1290
- .option("--stdin", "read from stdin", false)
1291
- .description("ingest a file as a new page (under ingest/<name>)")
1292
- .addHelpText(
1293
- "after",
1294
- `
1295
- Examples:
1296
- ebrain ingest report.pdf --type pdf
1297
- cat article.md | ebrain ingest --stdin --type article
1298
- ebrain ingest report.pdf --type pdf --dry-run
1299
- `,
1300
- ),
1301
- ).action(
1302
- async (
1303
- file: string | undefined,
1304
- opts: { type?: string; stdin?: boolean; dryRun?: boolean },
1305
- ) => {
1306
- let content: string;
1307
- let fileName: string;
1308
-
1309
- if (file) {
1310
- const fullPath = resolve(file);
1311
- if (!(await fileExists(fullPath))) {
1312
- throw new Error(`file not found: ${file}`);
1313
- }
1314
- content = await readTextFile(fullPath);
1315
- fileName = basename(fullPath);
1316
- } else if (opts.stdin) {
1317
- const raw = await readMaybeStdin();
1318
- if (!raw?.trim()) throw new Error("empty stdin — pipe content");
1319
- content = raw;
1320
- fileName = "stdin";
1321
- } else {
1322
- throw new Error("provide <file> or --stdin");
1323
- }
1324
-
1325
- const slug = `ingest/${fileName.replace(/\.[^.]+$/, "")}`;
1326
- const type = opts.type ?? "doc";
1327
-
1328
- if (isDryRun(opts)) {
1329
- print(program, {
1330
- dryRun: true,
1331
- action: "ingest",
1332
- slug,
1333
- type,
1334
- contentLength: content.length,
1335
- });
1336
- return;
1337
- }
1338
-
1339
- await withRepo(program, async (repo) => {
1340
- const jsonOut = isJson(program);
1341
- const spinner = createSpinner();
1342
- const startTime = Date.now();
1343
-
1344
- if (!jsonOut) {
1345
- header(`Ingest: ${fileName}`);
1346
- spinner.start(`Creating page from file...`);
1347
- }
1348
-
1349
- await repo.putPage({
1350
- slug,
1351
- type,
1352
- title: slugToTitle(slug),
1353
- compiledTruth: content,
1354
- timeline: "",
1355
- frontmatter: {
1356
- sourceFile: resolve(fileName),
1357
- sourceType: type,
1358
- },
1359
- });
1360
-
1361
- if (!jsonOut) {
1362
- spinner.succeed(`Page created: ${slug}`);
1363
- keyValue("Source file", fileName);
1364
- keyValue("Type", type);
1365
- keyValue("Content length", `${content.length} chars`);
1366
- }
1367
-
1368
- await repo.timelineAdd({
1369
- pageSlug: slug,
1370
- date: new Date().toISOString().slice(0, 10),
1371
- source: type,
1372
- summary: `Ingested file ${fileName}`,
1373
- detail: "",
1374
- });
1375
-
1376
- await applyEntityLinks(
1377
- repo,
1378
- slug,
1379
- content,
1380
- jsonOut,
1381
- );
1382
-
1383
- if (!jsonOut) {
1384
- const duration = formatDuration(Date.now() - startTime);
1385
- success(`Ingestion completed in ${duration}`);
1386
- }
1387
-
1388
- print(program, { ok: true, action: "ingest", slug });
1389
- });
1390
- },
1391
- );
1392
-
1393
1513
  // -- embed ----------------------------------------------------------------
1394
1514
 
1395
1515
  addDryRun(
@@ -1429,26 +1549,26 @@ Examples:
1429
1549
  const jsonOut = isJson(program);
1430
1550
  const spinner = createSpinner();
1431
1551
  const startTime = Date.now();
1432
-
1552
+
1433
1553
  if (!jsonOut) {
1434
1554
  header("Embed All Pages");
1435
1555
  spinner.start(`Loading pages...`);
1436
1556
  }
1437
-
1557
+
1438
1558
  const pages = await repo.listPages({ limit: 100000 });
1439
-
1559
+
1440
1560
  if (!jsonOut) {
1441
1561
  spinner.update(`Embedding ${pages.length} pages...`);
1442
1562
  }
1443
-
1563
+
1444
1564
  const count = await repo.embedAll();
1445
-
1565
+
1446
1566
  if (!jsonOut) {
1447
1567
  const duration = formatDuration(Date.now() - startTime);
1448
1568
  spinner.succeed(`Embedded ${count} pages`);
1449
1569
  keyValue("Duration", duration);
1450
1570
  }
1451
-
1571
+
1452
1572
  print(program, { embedded: count, mode: "all" });
1453
1573
  });
1454
1574
  return;
@@ -1463,18 +1583,18 @@ Examples:
1463
1583
  await withRepo(program, async (repo) => {
1464
1584
  const jsonOut = isJson(program);
1465
1585
  const spinner = createSpinner();
1466
-
1586
+
1467
1587
  if (!jsonOut) {
1468
1588
  header(`Embed: ${slug}`);
1469
1589
  spinner.start(`Generating embedding for page...`);
1470
1590
  }
1471
-
1591
+
1472
1592
  await repo.syncPageToSearch(slug);
1473
-
1593
+
1474
1594
  if (!jsonOut) {
1475
1595
  spinner.succeed(`Page embedded: ${slug}`);
1476
1596
  }
1477
-
1597
+
1478
1598
  print(program, { embedded: 1, slug });
1479
1599
  });
1480
1600
  },
@@ -1527,7 +1647,7 @@ Examples:
1527
1647
  }
1528
1648
  dbInitialized = true;
1529
1649
  } else {
1530
- // Try to create it without collection embedding config may not be ready
1650
+ // Try to create it without collection - embedding config may not be ready
1531
1651
  try {
1532
1652
  const db = await BrainDb.connect(dbPath, settings, { skipCollection: true });
1533
1653
  await db.close();
@@ -1601,7 +1721,7 @@ Examples:
1601
1721
  await withRepo(program, async (repo) => {
1602
1722
  const jsonOut = isJson(program);
1603
1723
  const stats = await repo.stats();
1604
-
1724
+
1605
1725
  if (!jsonOut) {
1606
1726
  header("Knowledge Base Statistics");
1607
1727
  keyValue("Pages", String(stats.pages));
@@ -1610,7 +1730,7 @@ Examples:
1610
1730
  keyValue("Timeline entries", String(stats.timelineEntries));
1611
1731
  keyValue("Raw data rows", String(stats.rawRows));
1612
1732
  }
1613
-
1733
+
1614
1734
  print(program, stats);
1615
1735
  });
1616
1736
  });
@@ -1671,7 +1791,7 @@ async function withRepo(
1671
1791
  const db = await BrainDb.connect(dbPath, settings);
1672
1792
  const repo = new BrainRepository(db);
1673
1793
  await callback(repo);
1674
-
1794
+
1675
1795
  // Gracefully close database
1676
1796
  // Note: seekdb SDK's InternalEmbeddedClient.close() is empty in embedded mode
1677
1797
  // Data may not flush properly. Use remote seekdb server for reliability.
@@ -1680,10 +1800,10 @@ async function withRepo(
1680
1800
  } catch (e) {
1681
1801
  // Close may fail due to seekdb native bug
1682
1802
  }
1683
-
1803
+
1684
1804
  // Give seekdb extra time after close
1685
1805
  await new Promise((r) => setTimeout(r, 500));
1686
-
1806
+
1687
1807
  // CLI: force exit to bypass seekdb native cleanup segfault
1688
1808
  process.exit(0);
1689
1809
  }
@@ -1726,7 +1846,7 @@ function normalizeLinkSlug(path: string): string {
1726
1846
  }
1727
1847
 
1728
1848
  // ---------------------------------------------------------------------------
1729
- // LLM Answer Generation Multi-layer Context Collection
1849
+ // LLM Answer Generation - Multi-layer Context Collection
1730
1850
  // ---------------------------------------------------------------------------
1731
1851
 
1732
1852
  /** A single section of context for the LLM prompt. */
@@ -1741,12 +1861,12 @@ interface ContextSection {
1741
1861
 
1742
1862
  /**
1743
1863
  * Collect multi-layer context for LLM answer generation.
1744
- *
1864
+ *
1745
1865
  * Layers (in priority order):
1746
1866
  * 1. Primary: compiledTruth + timeline of each hit page
1747
1867
  * 2. Raw data: original documents stored via raw.set
1748
1868
  * 3. Linked pages: compiledTruth of pages linked to/from hit pages
1749
- *
1869
+ *
1750
1870
  * Budget is enforced via total character limit.
1751
1871
  */
1752
1872
  async function collectContextForLLM(
@@ -1845,8 +1965,8 @@ async function collectContextForLLM(
1845
1965
  }
1846
1966
  }
1847
1967
 
1848
- // Layer 3: Linked pages score using cached data + keyword matching
1849
- // No second repo.query() call needed reuse hits scores + keyword fallback
1968
+ // Layer 3: Linked pages - score using cached data + keyword matching
1969
+ // No second repo.query() call needed - reuse hits scores + keyword fallback
1850
1970
  onProgress?.('linked pages');
1851
1971
  const allLinkedSlugs = new Set<string>();
1852
1972
  for (const hit of hits) {
@@ -1952,7 +2072,7 @@ async function collectContextForLLM(
1952
2072
  function computeKeywordRelevance(text: string, question: string): number {
1953
2073
  const STOP_CHARS = new Set('的是了在和我有你就这不人都说上个大国为到以们年会生地要主中子自实家小对多能好可很所把当');
1954
2074
  const questionChars = [...question]
1955
- .filter(c => !/\s|[,,。!?、;::""''()()【】\[\]{}<>\/\\|~`@#$%^&*+=_-]/.test(c) && !STOP_CHARS.has(c));
2075
+ .filter(c => !/\s|[,,。!?、;::""''()()【】\[\]{}<>\/\\|~`@#$%^&*+=_-]/.test(c) && !STOP_CHARS.has(c));
1956
2076
  if (questionChars.length === 0) return 0;
1957
2077
 
1958
2078
  const uniqueChars = new Set(questionChars);
@@ -2003,7 +2123,7 @@ async function generateAnswerWithStream(
2003
2123
  contextParts.push(`## ${header}\n`);
2004
2124
  for (const s of group) {
2005
2125
  sectionIndex++;
2006
- contextParts.push(`### [${sectionIndex}] ${s.title} ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
2126
+ contextParts.push(`### [${sectionIndex}] ${s.title} - ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
2007
2127
  }
2008
2128
  contextParts.push('');
2009
2129
  }
@@ -2014,7 +2134,7 @@ async function generateAnswerWithStream(
2014
2134
 
2015
2135
  const context = contextParts.join('\n');
2016
2136
 
2017
- const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
2137
+ const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
2018
2138
 
2019
2139
  ## 问题
2020
2140
  ${question}
@@ -2024,13 +2144,13 @@ ${question}
2024
2144
  ${context}
2025
2145
 
2026
2146
  ## 回答要求
2027
- - 仅基于提供的知识库内容回答,不要编造信息
2028
- - 如果知识库中没有相关信息,请明确说明
2147
+ - 仅基于提供的知识库内容回答,不要编造信息
2148
+ - 如果知识库中没有相关信息,请明确说明
2029
2149
  - 引用来源时使用 [[slug|标题]] 的格式
2030
2150
  - 使用清晰的 markdown 格式
2031
- - 如果涉及时间线信息,请在回答中体现
2151
+ - 如果涉及时间线信息,请在回答中体现
2032
2152
  - 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
2033
- - 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
2153
+ - 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
2034
2154
 
2035
2155
  ## 回答`;
2036
2156
 
@@ -2045,10 +2165,10 @@ ${context}
2045
2165
 
2046
2166
  try {
2047
2167
  const url = llm.baseURL.endsWith("/") ? llm.baseURL + "chat/completions" : llm.baseURL + "/chat/completions";
2048
-
2168
+
2049
2169
  // Show thinking indicator while waiting for first token
2050
2170
  process.stderr.write(`\x1b[35m💭\x1b[0m \x1b[2mConnecting to ${llm.model}...\x1b[0m\n`);
2051
-
2171
+
2052
2172
  const resp = await fetch(
2053
2173
  url,
2054
2174
  {
@@ -2063,7 +2183,7 @@ ${context}
2063
2183
  messages: [
2064
2184
  {
2065
2185
  role: "system",
2066
- content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
2186
+ content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
2067
2187
  },
2068
2188
  { role: "user", content: prompt },
2069
2189
  ],
@@ -2172,7 +2292,7 @@ async function generateAnswerWithContext(
2172
2292
  contextParts.push(`## ${header}\n`);
2173
2293
  for (const s of group) {
2174
2294
  sectionIndex++;
2175
- contextParts.push(`### [${sectionIndex}] ${s.title} ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
2295
+ contextParts.push(`### [${sectionIndex}] ${s.title} - ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
2176
2296
  }
2177
2297
  contextParts.push('');
2178
2298
  }
@@ -2183,7 +2303,7 @@ async function generateAnswerWithContext(
2183
2303
 
2184
2304
  const context = contextParts.join('\n');
2185
2305
 
2186
- const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
2306
+ const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
2187
2307
 
2188
2308
  ## 问题
2189
2309
  ${question}
@@ -2193,13 +2313,13 @@ ${question}
2193
2313
  ${context}
2194
2314
 
2195
2315
  ## 回答要求
2196
- - 仅基于提供的知识库内容回答,不要编造信息
2197
- - 如果知识库中没有相关信息,请明确说明
2316
+ - 仅基于提供的知识库内容回答,不要编造信息
2317
+ - 如果知识库中没有相关信息,请明确说明
2198
2318
  - 引用来源时使用 [[slug|标题]] 的格式
2199
2319
  - 使用清晰的 markdown 格式
2200
- - 如果涉及时间线信息,请在回答中体现
2320
+ - 如果涉及时间线信息,请在回答中体现
2201
2321
  - 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
2202
- - 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
2322
+ - 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
2203
2323
 
2204
2324
  ## 回答`;
2205
2325
 
@@ -2217,7 +2337,7 @@ ${context}
2217
2337
  messages: [
2218
2338
  {
2219
2339
  role: "system",
2220
- content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
2340
+ content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
2221
2341
  },
2222
2342
  { role: "user", content: prompt },
2223
2343
  ],