ex-brain 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,10 +20,23 @@ import {
20
20
  } from "../markdown/parser";
21
21
  import { BrainRepository } from "../repositories/brain-repo";
22
22
  import { loadSettings, SETTINGS_PATH, DEFAULT_DB_PATH, type ResolvedLLM } from "../settings";
23
- import { extractRelations, entityToSlug, EntityType } from "../ai/entity-link";
23
+ import { extractRelations, entityToSlug, type EntityType } from "../ai/entity-link";
24
24
  import { registerCompileCommands } from "./compile-cmd";
25
25
  import { registerGraphCommand } from "./graph-cmd";
26
26
  import { createProgress, formatDuration } from "../utils/progress";
27
+ import {
28
+ success,
29
+ error as cliError,
30
+ warning,
31
+ info,
32
+ step,
33
+ subItem,
34
+ keyValue,
35
+ header,
36
+ createSpinner,
37
+ formatCount,
38
+ type ProgressSpinner,
39
+ } from "../utils/cli-output";
27
40
 
28
41
  // ---------------------------------------------------------------------------
29
42
  // Helpers
@@ -59,32 +72,46 @@ async function applyEntityLinks(
59
72
  const settings = await loadSettings();
60
73
  if (!settings.llm.baseURL) {
61
74
  if (!json) {
62
- process.stderr.write(`[entity-link] LLM not configured, skipping for ${sourceSlug}\n`);
75
+ warning(`LLM not configured, skipping entity extraction for ${sourceSlug}`);
63
76
  }
64
77
  return { created: 0, linked: 0 };
65
78
  }
66
79
 
67
- const progress = createProgress();
80
+ const spinner = createSpinner();
68
81
  if (!json) {
69
- progress.start(`Extracting entities from ${sourceSlug}`);
82
+ spinner.start(`Extracting entities from ${sourceSlug}...`);
70
83
  }
71
84
 
72
85
  const startTime = Date.now();
73
- const relations = await extractRelations(content, settings.llm);
86
+ let relations;
87
+ try {
88
+ relations = await extractRelations(content, settings.llm);
89
+ } catch (err) {
90
+ if (!json) {
91
+ spinner.fail(`Entity extraction failed: ${err instanceof Error ? err.message : String(err)}`);
92
+ }
93
+ return { created: 0, linked: 0 };
94
+ }
74
95
 
75
96
  // Filter by confidence
76
- const highConfidence = relations.filter((r) => r.confidence >= 0.6);
97
+ const confidenceThreshold = settings.extraction.confidenceThreshold;
98
+ const highConfidence = relations.filter((r) => r.confidence >= confidenceThreshold);
77
99
  const ignoredCount = relations.length - highConfidence.length;
78
100
 
79
101
  if (highConfidence.length === 0) {
80
102
  if (!json) {
81
- progress.fail(`No high-confidence entities found`);
103
+ if (relations.length > 0) {
104
+ spinner.warn(`Found ${relations.length} entities but all below confidence threshold (${confidenceThreshold})`);
105
+ } else {
106
+ spinner.warn(`No entities found in content`);
107
+ }
82
108
  }
83
109
  return { created: 0, linked: 0 };
84
110
  }
85
111
 
86
112
  let created = 0;
87
113
  let linked = 0;
114
+ const details: string[] = [];
88
115
 
89
116
  for (const r of highConfidence) {
90
117
  // 1. Resolve entity slugs (disambiguation)
@@ -97,8 +124,8 @@ async function applyEntityLinks(
97
124
  // 2. Ensure entity pages exist
98
125
  const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, sourceSlug);
99
126
  const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, sourceSlug);
100
- if (c1) created += 1;
101
- if (c2) created += 1;
127
+ if (c1) { created += 1; details.push(`Created: ${r.from.name} (${r.from.type})`); }
128
+ if (c2) { created += 1; details.push(`Created: ${r.to.name} (${r.to.type})`); }
102
129
 
103
130
  // 3. Link between entities (context includes relation type)
104
131
  await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
@@ -113,8 +140,16 @@ async function applyEntityLinks(
113
140
 
114
141
  if (!json) {
115
142
  const duration = formatDuration(Date.now() - startTime);
116
- const entityNames = highConfidence.flatMap((r) => [r.from.name, r.to.name]);
117
- progress.succeed(`${[...new Set(entityNames)].join(", ")} (${created} created, ${linked} links, ${duration})`);
143
+ const entityNames = [...new Set(highConfidence.flatMap((r) => [r.from.name, r.to.name]))];
144
+ spinner.succeed(`Extracted ${entityNames.length} entities: ${entityNames.join(", ")}`);
145
+
146
+ // Print detailed info
147
+ subItem(`${created} entity pages created`);
148
+ subItem(`${linked} links added`);
149
+ if (ignoredCount > 0) {
150
+ subItem(`${ignoredCount} low-confidence relations ignored`);
151
+ }
152
+ subItem(`Completed in ${duration}`);
118
153
  }
119
154
 
120
155
  return { created, linked };
@@ -267,6 +302,15 @@ Examples:
267
302
  }
268
303
 
269
304
  await withRepo(program, async (repo) => {
305
+ const jsonOut = isJson(program);
306
+ const spinner = createSpinner();
307
+ const startTime = Date.now();
308
+
309
+ if (!jsonOut) {
310
+ header(`Put: ${finalSlug}`);
311
+ spinner.start(`Creating/updating page...`);
312
+ }
313
+
270
314
  const page = await repo.putPage({
271
315
  slug: finalSlug,
272
316
  type,
@@ -275,12 +319,26 @@ Examples:
275
319
  timeline: parsed.timeline,
276
320
  frontmatter: parsed.frontmatter,
277
321
  });
322
+
323
+ if (!jsonOut) {
324
+ spinner.succeed(`Page saved: ${page.slug}`);
325
+ keyValue("Title", title);
326
+ keyValue("Type", type);
327
+ keyValue("Content length", `${parsed.compiledTruth.length} chars`);
328
+ }
329
+
278
330
  await applyEntityLinks(
279
331
  repo,
280
332
  finalSlug,
281
333
  parsed.compiledTruth,
282
- isJson(program),
334
+ jsonOut,
283
335
  );
336
+
337
+ if (!jsonOut) {
338
+ const duration = formatDuration(Date.now() - startTime);
339
+ success(`Operation completed in ${duration}`);
340
+ }
341
+
284
342
  print(program, { ok: true, slug: page.slug, updatedAt: page.updatedAt });
285
343
  });
286
344
  },
@@ -350,7 +408,20 @@ Examples:
350
408
  return;
351
409
  }
352
410
  await withRepo(program, async (repo) => {
411
+ const jsonOut = isJson(program);
412
+ const spinner = createSpinner();
413
+
414
+ if (!jsonOut) {
415
+ header(`Delete: ${slug}`);
416
+ spinner.start(`Deleting page and related data...`);
417
+ }
418
+
353
419
  await repo.deletePage(slug);
420
+
421
+ if (!jsonOut) {
422
+ spinner.succeed(`Page deleted: ${slug}`);
423
+ }
424
+
354
425
  print(program, { ok: true, action: "delete", slug });
355
426
  });
356
427
  });
@@ -447,7 +518,7 @@ Examples:
447
518
  const limit = Number(opts.limit ?? 10);
448
519
  const hits = await repo.query(question, limit);
449
520
 
450
- // If --llm flag, generate answer based on context
521
+ // If --llm flag, generate answer based on multi-layer context
451
522
  if (opts.llm) {
452
523
  const settings = await loadSettings();
453
524
  if (!settings.llm.baseURL) {
@@ -458,35 +529,48 @@ Examples:
458
529
  const progress = createProgress();
459
530
  progress.start("Searching knowledge base...");
460
531
 
461
- // Use excerpts from hits as context (avoids extra DB queries that cause segfault)
462
532
  const contextLimit = Number(opts.contextLimit ?? 5);
463
533
  const topHits = hits.slice(0, contextLimit);
464
534
 
465
- // Build context from search results
466
- const contextPages = topHits.map(hit => ({
467
- slug: hit.slug,
468
- title: hit.title,
469
- excerpt: hit.excerpt || "",
470
- }));
535
+ if (topHits.length === 0) {
536
+ progress.stop();
537
+ process.stderr.write("No relevant pages found.\n");
538
+ print(program, { answer: "No relevant information found in the knowledge base.", sources: [] });
539
+ return;
540
+ }
541
+
542
+ // Collect multi-layer context (primary + raw data + linked pages scored by relevance)
543
+ progress.update(`Loading pages, raw documents, and linked content...`);
544
+ // ~100KB char budget ≈ 25K tokens, safe for most models
545
+ const MAX_CONTEXT_CHARS = 100_000;
546
+ const { sections, totalChars, stats } = await collectContextForLLM(repo, topHits, question, MAX_CONTEXT_CHARS);
547
+
548
+ if (sections.length === 0) {
549
+ progress.stop();
550
+ process.stderr.write("No content could be loaded.\n");
551
+ print(program, { answer: "Failed to load page content.", sources: [] });
552
+ return;
553
+ }
471
554
 
472
- progress.update("Generating answer...");
555
+ progress.update(`Generating answer from ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s)...`);
473
556
  const startTime = Date.now();
474
557
 
475
- const answer = await generateAnswerFromExcerpts(question, contextPages, settings.llm);
558
+ const answer = await generateAnswerWithContext(question, sections, stats, settings.llm);
476
559
 
477
560
  const duration = formatDuration(Date.now() - startTime);
478
- progress.succeed(`Answer generated (${duration})`);
561
+ progress.succeed(`Answer generated (${duration}, context: ${(totalChars / 1024).toFixed(1)}KB)`);
479
562
 
480
- // Output markdown
563
+ // Output answer as markdown
481
564
  console.log("\n" + answer);
482
565
 
483
- // Show sources
484
- if (contextPages.length > 0) {
485
- console.log("\n---\n**Sources:**\n");
486
- contextPages.forEach((p, i) => {
487
- console.log(`${i + 1}. [[${p.slug}|${p.title}]]`);
488
- });
566
+ // Show sources breakdown
567
+ console.log("\n---\n**Sources:**\n");
568
+ for (let i = 0; i < sections.length; i++) {
569
+ const s = sections[i];
570
+ const icon = s.type === 'primary' ? '📄' : s.type === 'raw_data' ? '📎' : '🔗';
571
+ console.log(`${icon} ${i + 1}. [[${s.slug}|${s.title}]] — ${s.label} (${(s.content.length / 1024).toFixed(1)}KB)`);
489
572
  }
573
+ console.log(`\n*Context: ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s)*`);
490
574
  } else {
491
575
  print(program, hits);
492
576
  }
@@ -887,18 +971,21 @@ Examples:
887
971
  .command("import")
888
972
  .argument("<dir>", "directory of markdown files")
889
973
  .description("import a directory of markdown files")
974
+ .option("--skip-index", "skip vector indexing (useful if seekdb crashes)")
890
975
  .addHelpText(
891
976
  "after",
892
977
  `
893
978
  Examples:
894
979
  ebrain import ./docs
895
980
  ebrain import ./docs --dry-run
981
+ ebrain import ./docs --skip-index # skip vector indexing
896
982
  `,
897
983
  ),
898
- ).action(async (dir: string, opts: { dryRun?: boolean }) => {
984
+ ).action(async (dir: string, opts: { dryRun?: boolean; skipIndex?: boolean }) => {
899
985
  await withRepo(program, async (repo) => {
900
986
  const root = resolve(dir);
901
987
  const files = await collectMarkdownFiles(root);
988
+
902
989
  if (isDryRun(opts)) {
903
990
  print(program, {
904
991
  dryRun: true,
@@ -912,11 +999,18 @@ Examples:
912
999
 
913
1000
  const jsonOut = isJson(program);
914
1001
  const settings = await loadSettings();
915
- const progress = createProgress();
1002
+ const spinner = createSpinner();
916
1003
  const startTime = Date.now();
917
1004
 
1005
+ if (!jsonOut) {
1006
+ header(`Import: ${root}`);
1007
+ }
1008
+
918
1009
  // Phase 1: Parse all files and collect data
919
- progress.start(`Scanning ${files.length} files...`);
1010
+ if (!jsonOut) {
1011
+ spinner.start(`Scanning ${files.length} files...`);
1012
+ }
1013
+
920
1014
  const fileData: Array<{
921
1015
  file: string;
922
1016
  slug: string;
@@ -940,33 +1034,64 @@ Examples:
940
1034
  fileData.push({ file, slug, parsed, content, wikiLinks, timelineEntries, tags });
941
1035
  }
942
1036
 
943
- // Phase 2: Write all pages first
944
- progress.update(`Writing ${fileData.length} pages...`);
1037
+ if (!jsonOut) {
1038
+ spinner.succeed(`Found ${files.length} markdown files`);
1039
+ }
1040
+
1041
+ // Phase 2: Write all pages first (skip embed for performance)
1042
+ if (!jsonOut) {
1043
+ spinner.start(`Writing ${fileData.length} pages to database...`);
1044
+ }
1045
+
1046
+ const allSlugs: string[] = [];
1047
+ const writeErrors: string[] = [];
1048
+
945
1049
  for (let i = 0; i < fileData.length; i++) {
946
1050
  const { slug, parsed } = fileData[i]!;
947
- if (!jsonOut && i % 10 === 0) {
948
- progress.update(`Writing pages... ${i + 1}/${fileData.length}`);
1051
+ if (!jsonOut && i % 20 === 0) {
1052
+ spinner.update(`Writing pages... ${i + 1}/${fileData.length}`);
1053
+ }
1054
+ try {
1055
+ await repo.putPage({
1056
+ slug,
1057
+ type: String(parsed.frontmatter.type ?? inferTypeFromSlug(slug)),
1058
+ title: String(parsed.frontmatter.title ?? slugToTitle(slug)),
1059
+ compiledTruth: parsed.compiledTruth,
1060
+ timeline: parsed.timeline,
1061
+ frontmatter: parsed.frontmatter,
1062
+ }, true); // skipEmbed: true for performance
1063
+ allSlugs.push(slug);
1064
+ } catch (err) {
1065
+ writeErrors.push(`${slug}: ${err instanceof Error ? err.message : String(err)}`);
1066
+ }
1067
+ }
1068
+
1069
+ if (!jsonOut) {
1070
+ spinner.succeed(`Wrote ${allSlugs.length} pages to database`);
1071
+ if (writeErrors.length > 0) {
1072
+ warning(`${writeErrors.length} pages failed to write`);
1073
+ for (const e of writeErrors.slice(0, 3)) {
1074
+ subItem(e);
1075
+ }
1076
+ if (writeErrors.length > 3) {
1077
+ subItem(`... and ${writeErrors.length - 3} more`);
1078
+ }
949
1079
  }
950
- await repo.putPage({
951
- slug,
952
- type: String(parsed.frontmatter.type ?? inferTypeFromSlug(slug)),
953
- title: String(parsed.frontmatter.title ?? slugToTitle(slug)),
954
- compiledTruth: parsed.compiledTruth,
955
- timeline: parsed.timeline,
956
- frontmatter: parsed.frontmatter,
957
- });
958
1080
  }
959
1081
 
960
1082
  // Phase 3: Parallel entity extraction (main optimization)
961
- progress.update("Extracting entities...");
962
1083
  const BATCH_SIZE = 10;
963
1084
  const entityResults = new Map<string, Awaited<ReturnType<typeof extractRelations>>>();
964
1085
 
965
1086
  if (settings.llm.baseURL) {
1087
+ if (!jsonOut) {
1088
+ spinner.start(`Extracting entities with LLM...`);
1089
+ }
1090
+
966
1091
  for (let i = 0; i < fileData.length; i += BATCH_SIZE) {
967
1092
  const batch = fileData.slice(i, i + BATCH_SIZE).filter(d => d.tags.length === 0);
968
1093
  if (!jsonOut) {
969
- progress.update(`Extracting entities... ${Math.min(i + BATCH_SIZE, fileData.length)}/${fileData.length}`);
1094
+ spinner.update(`Extracting entities... ${Math.min(i + BATCH_SIZE, fileData.length)}/${fileData.length}`);
970
1095
  }
971
1096
  const batchPromises = batch.map(async ({ slug, content }) => {
972
1097
  const relations = await extractRelations(content, settings.llm);
@@ -977,13 +1102,34 @@ Examples:
977
1102
  entityResults.set(slug, relations);
978
1103
  }
979
1104
  }
1105
+
1106
+ if (!jsonOut) {
1107
+ spinner.succeed(`Entity extraction complete`);
1108
+ }
1109
+ } else {
1110
+ if (!jsonOut) {
1111
+ warning(`LLM not configured, skipping entity extraction`);
1112
+ }
980
1113
  }
981
1114
 
982
1115
  // Phase 4: Write links, tags, timeline, and entity pages
983
- progress.update("Creating links and timeline...");
1116
+ if (!jsonOut) {
1117
+ spinner.start(`Creating links, tags, and timeline entries...`);
1118
+ }
1119
+
984
1120
  let linkCount = 0;
985
1121
  let timelineCount = 0;
986
1122
  let entityCount = 0;
1123
+ let tagCount = 0;
1124
+
1125
+ // Collect timeline entries for batch insert
1126
+ const allTimelineEntries: Array<{
1127
+ pageSlug: string;
1128
+ date: string;
1129
+ source: string;
1130
+ summary: string;
1131
+ detail: string;
1132
+ }> = [];
987
1133
 
988
1134
  for (const { slug, wikiLinks, timelineEntries, tags, content } of fileData) {
989
1135
  // Wiki links
@@ -992,9 +1138,9 @@ Examples:
992
1138
  linkCount++;
993
1139
  }
994
1140
 
995
- // Timeline entries
1141
+ // Collect timeline entries for batch insert
996
1142
  for (const entry of timelineEntries) {
997
- await repo.timelineAdd({
1143
+ allTimelineEntries.push({
998
1144
  pageSlug: slug,
999
1145
  date: entry.date,
1000
1146
  source: entry.source,
@@ -1007,6 +1153,7 @@ Examples:
1007
1153
  // Tags
1008
1154
  for (const tag of tags) {
1009
1155
  await repo.tag(slug, tag);
1156
+ tagCount++;
1010
1157
  }
1011
1158
 
1012
1159
  // Entity links from parallel extraction
@@ -1032,12 +1179,53 @@ Examples:
1032
1179
  }
1033
1180
  }
1034
1181
 
1182
+ // Batch insert all timeline entries
1183
+ if (allTimelineEntries.length > 0) {
1184
+ await repo.timelineAddBatch(allTimelineEntries);
1185
+ }
1186
+
1187
+ if (!jsonOut) {
1188
+ spinner.succeed(`Created links, tags, and timeline`);
1189
+ }
1190
+
1191
+ // Phase 5: Batch sync all pages to search index
1192
+ if (opts.skipIndex) {
1193
+ if (!jsonOut) {
1194
+ info(`Skipping vector indexing (--skip-index)`);
1195
+ }
1196
+ } else {
1197
+ if (!jsonOut) {
1198
+ spinner.start(`Indexing ${allSlugs.length} pages for search...`);
1199
+ }
1200
+ await repo.embedAll();
1201
+
1202
+ if (!jsonOut) {
1203
+ spinner.succeed(`Search indexing complete`);
1204
+ }
1205
+ }
1206
+
1035
1207
  const duration = formatDuration(Date.now() - startTime);
1036
- progress.succeed(`${files.length} files imported, ${entityCount} entities, ${linkCount} links (${duration})`);
1208
+
1209
+ if (!jsonOut) {
1210
+ // Print summary
1211
+ header("Import Summary");
1212
+ keyValue("Files imported", String(files.length));
1213
+ keyValue("Pages created", String(allSlugs.length));
1214
+ keyValue("Entities extracted", String(entityCount));
1215
+ keyValue("Links created", String(linkCount));
1216
+ keyValue("Timeline entries", String(timelineCount));
1217
+ keyValue("Tags added", String(tagCount));
1218
+ keyValue("Duration", duration);
1219
+
1220
+ if (writeErrors.length > 0) {
1221
+ warning(`${writeErrors.length} pages had errors`);
1222
+ }
1223
+ }
1037
1224
 
1038
1225
  print(program, {
1226
+ ok: true,
1039
1227
  importedFiles: files.length,
1040
- pages: fileData.length,
1228
+ pages: allSlugs.length,
1041
1229
  links: linkCount,
1042
1230
  timelineEntries: timelineCount,
1043
1231
  entities: entityCount,
@@ -1138,6 +1326,15 @@ Examples:
1138
1326
  }
1139
1327
 
1140
1328
  await withRepo(program, async (repo) => {
1329
+ const jsonOut = isJson(program);
1330
+ const spinner = createSpinner();
1331
+ const startTime = Date.now();
1332
+
1333
+ if (!jsonOut) {
1334
+ header(`Ingest: ${fileName}`);
1335
+ spinner.start(`Creating page from file...`);
1336
+ }
1337
+
1141
1338
  await repo.putPage({
1142
1339
  slug,
1143
1340
  type,
@@ -1149,6 +1346,14 @@ Examples:
1149
1346
  sourceType: type,
1150
1347
  },
1151
1348
  });
1349
+
1350
+ if (!jsonOut) {
1351
+ spinner.succeed(`Page created: ${slug}`);
1352
+ keyValue("Source file", fileName);
1353
+ keyValue("Type", type);
1354
+ keyValue("Content length", `${content.length} chars`);
1355
+ }
1356
+
1152
1357
  await repo.timelineAdd({
1153
1358
  pageSlug: slug,
1154
1359
  date: new Date().toISOString().slice(0, 10),
@@ -1156,12 +1361,19 @@ Examples:
1156
1361
  summary: `Ingested file ${fileName}`,
1157
1362
  detail: "",
1158
1363
  });
1364
+
1159
1365
  await applyEntityLinks(
1160
1366
  repo,
1161
1367
  slug,
1162
1368
  content,
1163
- isJson(program),
1369
+ jsonOut,
1164
1370
  );
1371
+
1372
+ if (!jsonOut) {
1373
+ const duration = formatDuration(Date.now() - startTime);
1374
+ success(`Ingestion completed in ${duration}`);
1375
+ }
1376
+
1165
1377
  print(program, { ok: true, action: "ingest", slug });
1166
1378
  });
1167
1379
  },
@@ -1204,13 +1416,28 @@ Examples:
1204
1416
  }
1205
1417
  await withRepo(program, async (repo) => {
1206
1418
  const jsonOut = isJson(program);
1419
+ const spinner = createSpinner();
1420
+ const startTime = Date.now();
1421
+
1422
+ if (!jsonOut) {
1423
+ header("Embed All Pages");
1424
+ spinner.start(`Loading pages...`);
1425
+ }
1426
+
1207
1427
  const pages = await repo.listPages({ limit: 100000 });
1208
- let count = 0;
1209
- for (const page of pages) {
1210
- count += 1;
1211
- progress("embed " + page.slug, count, pages.length, jsonOut);
1212
- await repo.syncPageToSearch(page.slug);
1428
+
1429
+ if (!jsonOut) {
1430
+ spinner.update(`Embedding ${pages.length} pages...`);
1431
+ }
1432
+
1433
+ const count = await repo.embedAll();
1434
+
1435
+ if (!jsonOut) {
1436
+ const duration = formatDuration(Date.now() - startTime);
1437
+ spinner.succeed(`Embedded ${count} pages`);
1438
+ keyValue("Duration", duration);
1213
1439
  }
1440
+
1214
1441
  print(program, { embedded: count, mode: "all" });
1215
1442
  });
1216
1443
  return;
@@ -1223,7 +1450,20 @@ Examples:
1223
1450
  return;
1224
1451
  }
1225
1452
  await withRepo(program, async (repo) => {
1453
+ const jsonOut = isJson(program);
1454
+ const spinner = createSpinner();
1455
+
1456
+ if (!jsonOut) {
1457
+ header(`Embed: ${slug}`);
1458
+ spinner.start(`Generating embedding for page...`);
1459
+ }
1460
+
1226
1461
  await repo.syncPageToSearch(slug);
1462
+
1463
+ if (!jsonOut) {
1464
+ spinner.succeed(`Page embedded: ${slug}`);
1465
+ }
1466
+
1227
1467
  print(program, { embedded: 1, slug });
1228
1468
  });
1229
1469
  },
@@ -1243,10 +1483,15 @@ Examples:
1243
1483
  )
1244
1484
  .action(async () => {
1245
1485
  await withRepo(program, async () => {
1486
+ const settings = await loadSettings();
1487
+ const dbPath = program.opts().db ?? settings.dbPath;
1488
+
1489
+ success(`Database initialized`);
1490
+ keyValue("Path", dbPath);
1491
+
1246
1492
  print(program, {
1247
1493
  ok: true,
1248
- dbPath:
1249
- program.opts().db ?? (await loadSettings()).dbPath,
1494
+ dbPath,
1250
1495
  });
1251
1496
  });
1252
1497
  });
@@ -1264,7 +1509,19 @@ Examples:
1264
1509
  )
1265
1510
  .action(async () => {
1266
1511
  await withRepo(program, async (repo) => {
1267
- print(program, await repo.stats());
1512
+ const jsonOut = isJson(program);
1513
+ const stats = await repo.stats();
1514
+
1515
+ if (!jsonOut) {
1516
+ header("Knowledge Base Statistics");
1517
+ keyValue("Pages", String(stats.pages));
1518
+ keyValue("Links", String(stats.links));
1519
+ keyValue("Tags", String(stats.tags));
1520
+ keyValue("Timeline entries", String(stats.timelineEntries));
1521
+ keyValue("Raw data rows", String(stats.rawRows));
1522
+ }
1523
+
1524
+ print(program, stats);
1268
1525
  });
1269
1526
  });
1270
1527
 
@@ -1324,7 +1581,20 @@ async function withRepo(
1324
1581
  const db = await BrainDb.connect(dbPath, settings);
1325
1582
  const repo = new BrainRepository(db);
1326
1583
  await callback(repo);
1327
- // CLI 短生命周期应用:强制退出绕过 seekdb native 模块的 cleanup bug
1584
+
1585
+ // Gracefully close database
1586
+ // Note: seekdb SDK's InternalEmbeddedClient.close() is empty in embedded mode
1587
+ // Data may not flush properly. Use remote seekdb server for reliability.
1588
+ try {
1589
+ await db.close();
1590
+ } catch (e) {
1591
+ // Close may fail due to seekdb native bug
1592
+ }
1593
+
1594
+ // Give seekdb extra time after close
1595
+ await new Promise((r) => setTimeout(r, 500));
1596
+
1597
+ // CLI: force exit to bypass seekdb native cleanup segfault
1328
1598
  process.exit(0);
1329
1599
  }
1330
1600
 
@@ -1366,18 +1636,249 @@ function normalizeLinkSlug(path: string): string {
1366
1636
  }
1367
1637
 
1368
1638
  // ---------------------------------------------------------------------------
1369
- // LLM Answer Generation
1639
+ // LLM Answer Generation — Multi-layer Context Collection
1370
1640
  // ---------------------------------------------------------------------------
1371
1641
 
1372
- interface ContextPage {
1642
/**
 * One labeled chunk of knowledge-base text that is placed into the LLM prompt
 * and listed in the "Sources" output of the query command.
 */
interface ContextSection {
  /**
   * Which context layer produced this section: a search-hit page body
   * (`primary`), a stored raw document (`raw_data`), or a page reached
   * through a link (`linked`).
   */
  type: "primary" | "raw_data" | "linked";
  /** Slug of the page this section belongs to. */
  slug: string;
  /** Title of the page this section belongs to. */
  title: string;
  /** The text handed to the LLM; may end with a truncation marker. */
  content: string;
  /** Human-readable label like "原始文档 (crm)" or "关联页面: projects/alpha". */
  label: string;
}
1651
+
1652
/**
 * Collect multi-layer context for LLM answer generation.
 *
 * Layers, in priority order:
 *   1. Primary: compiledTruth + timeline of each hit page.
 *   2. Raw data: original documents returned by `repo.readRaw` for each hit.
 *   3. Linked pages: compiledTruth of pages linked to/from the hit pages,
 *      kept only when they score as relevant to the question (semantic score
 *      from `repo.query`, with a keyword-overlap fallback).
 *
 * The character budget is enforced centrally: the combined length of all
 * returned sections never exceeds `maxChars`. A section that does not fit is
 * truncated (with a trailing marker) or dropped, and the number of characters
 * left out is accumulated in `stats.skippedChars`. Stats counters only count
 * sections that were actually added.
 *
 * Repository calls are deliberately issued one at a time.
 *
 * @param repo     Repository used to load pages, raw rows, links and search results.
 * @param hits     Search hits whose pages form the primary layer.
 * @param question The user's question, used to score linked pages.
 * @param maxChars Upper bound on the total characters across all sections.
 * @returns The collected sections, their combined length, and per-layer counts.
 */
async function collectContextForLLM(
  repo: BrainRepository,
  hits: Array<{ slug: string; title: string; score: number }>,
  question: string,
  maxChars: number,
): Promise<{ sections: ContextSection[]; totalChars: number; stats: ContextStats }> {
  const TRUNCATION_MARKER = '\n...[truncated]';
  // Linked pages scoring below this are treated as irrelevant.
  const MIN_LINKED_SCORE = 0.02;
  // Linked pages scoring above this also contribute their raw documents.
  const RAW_DATA_LINKED_SCORE = 0.1;
  // Raw documents of linked pages shorter than this are not worth including.
  const MIN_LINKED_RAW_CHARS = 100;

  const sections: ContextSection[] = [];
  let totalChars = 0;
  const stats: ContextStats = {
    primaryPages: 0,
    rawDocs: 0,
    linkedPages: 0,
    skippedChars: 0,
  };

  // Keys of sections already added, so the same page/label is never repeated.
  const seenKeys = new Set<string>();

  /**
   * Add a section if it is new and fits (possibly truncated) in the remaining
   * budget. Returns true only when the section was actually added.
   */
  function addSection(section: ContextSection): boolean {
    const key = `${section.type}:${section.slug}:${section.label}`;
    if (seenKeys.has(key)) {
      return false;
    }

    let content = section.content;
    if (content.length === 0) {
      return false;
    }

    const budget = maxChars - totalChars;
    if (content.length > budget) {
      // Reserve room for the marker so the total stays within maxChars.
      const keep = budget - TRUNCATION_MARKER.length;
      if (keep <= 0) {
        // Nothing meaningful fits any more; record the loss and skip.
        stats.skippedChars += content.length;
        return false;
      }
      stats.skippedChars += content.length - keep;
      content = content.slice(0, keep) + TRUNCATION_MARKER;
    }

    sections.push({ ...section, content });
    totalChars += content.length;
    seenKeys.add(key);
    return true;
  }

  // Layer 1: Primary pages (compiledTruth + timeline)
  for (const hit of hits) {
    const page = await repo.getPage(hit.slug);
    if (!page) continue;

    const parts: string[] = [];
    const body = page.compiledTruth?.trim();
    if (body) {
      parts.push(body);
    }
    const tl = page.timeline?.trim();
    if (tl) {
      parts.push(`## 时间线\n${tl}`);
    }
    if (parts.length === 0) continue;

    const added = addSection({
      type: 'primary',
      slug: page.slug,
      title: page.title,
      content: parts.join('\n\n'),
      label: `页面正文`,
    });
    if (added) stats.primaryPages++;
  }

  // Layer 2: Raw data (original documents)
  for (const hit of hits) {
    if (totalChars >= maxChars) break;
    try {
      const rawRows = await repo.readRaw(hit.slug) as Array<{ source: string; data: unknown; fetchedAt?: string }>;
      for (const row of rawRows) {
        let rawContent = '';
        if (typeof row.data === 'string') {
          rawContent = row.data;
        } else if (typeof row.data === 'object' && row.data !== null) {
          // JSON.stringify may return undefined (e.g. toJSON returning undefined).
          rawContent = JSON.stringify(row.data, null, 2) ?? '';
        }
        if (!rawContent.trim()) continue;

        const added = addSection({
          type: 'raw_data',
          slug: hit.slug,
          title: hit.title,
          content: rawContent,
          label: `原始文档 (${row.source})`,
        });
        if (added) stats.rawDocs++;
      }
    } catch {
      // Raw data fetch failure is non-fatal
    }
  }

  // Layer 3: Linked pages, scored against the question so that only pages
  // relevant to what the user asked are included.
  const linkedSlugs = new Set<string>();
  for (const hit of hits) {
    try {
      const outLinks = await repo.outgoingLinks(hit.slug);
      for (const link of outLinks) linkedSlugs.add(link.slug);
    } catch { /* link lookup is best-effort */ }
    try {
      const backlinkSlugs = await repo.backlinks(hit.slug);
      for (const s of backlinkSlugs) linkedSlugs.add(s);
    } catch { /* link lookup is best-effort */ }
  }
  // Hit pages are already covered by the primary layer; do not repeat them
  // as "linked" sections when hits link to each other.
  for (const hit of hits) linkedSlugs.delete(hit.slug);

  if (linkedSlugs.size === 0) {
    return { sections, totalChars, stats };
  }

  // Query a wide set of pages, then intersect with the linked slugs.
  const broadLimit = Math.min(200, Math.max(50, linkedSlugs.size));
  const broadResults = await repo.query(question, broadLimit);
  const semanticScores = new Map<string, number>();
  for (const result of broadResults) {
    semanticScores.set(result.slug, result.score);
  }

  // Pages loaded for keyword scoring are reused when their content is added.
  const pageCache = new Map<string, NonNullable<Awaited<ReturnType<typeof repo.getPage>>>>();

  const relevantLinked: Array<{ slug: string; score: number }> = [];
  for (const slug of linkedSlugs) {
    let score = semanticScores.get(slug);
    if (score === undefined) {
      // Keyword-based fallback for linked pages absent from the semantic results.
      score = 0;
      try {
        const page = await repo.getPage(slug);
        if (page) {
          pageCache.set(slug, page);
          const text = `${page.title} ${page.compiledTruth ?? ''}`.slice(0, 2000);
          score = computeKeywordRelevance(text, question);
        }
      } catch { /* unscored pages keep score 0 */ }
    }
    if (score >= MIN_LINKED_SCORE) {
      relevantLinked.push({ slug, score });
    }
  }
  relevantLinked.sort((a, b) => b.score - a.score);

  for (const linked of relevantLinked) {
    if (totalChars >= maxChars) break;

    const linkedPage = pageCache.get(linked.slug) ?? (await repo.getPage(linked.slug));
    if (!linkedPage) continue;
    const linkedBody = linkedPage.compiledTruth?.trim();
    if (!linkedBody) continue;

    const addedPage = addSection({
      type: 'linked',
      slug: linkedPage.slug,
      title: linkedPage.title,
      content: linkedBody,
      label: `关联页面: ${linkedPage.slug} (相关度: ${(linked.score * 100).toFixed(1)}%)`,
    });
    if (addedPage) stats.linkedPages++;

    // Also fetch raw data for highly relevant linked pages
    if (linked.score > RAW_DATA_LINKED_SCORE && totalChars < maxChars) {
      try {
        const rawRows = await repo.readRaw(linked.slug) as Array<{ source: string; data: unknown }>;
        for (const row of rawRows) {
          const rawContent = typeof row.data === 'string'
            ? row.data
            : (JSON.stringify(row.data) ?? '');
          if (rawContent.trim().length <= MIN_LINKED_RAW_CHARS) continue;

          const addedRaw = addSection({
            type: 'raw_data',
            slug: linked.slug,
            title: linkedPage.title,
            content: rawContent,
            label: `原始文档 (关联: ${row.source})`,
          });
          if (addedRaw) stats.rawDocs++;
        }
      } catch { /* raw data for linked pages is best-effort */ }
    }
  }

  return { sections, totalChars, stats };
}
1847
+
1848
/**
 * Simple keyword-based relevance scoring (fallback for pages without
 * embedding scores).
 *
 * The question is lower-cased and reduced to its set of unique meaningful
 * characters: whitespace, punctuation, symbols and a fixed list of very
 * common Chinese characters are ignored. The score is the fraction of those
 * characters that occur in `text` (compared case-insensitively).
 *
 * @param text     Text of the candidate page (title + body excerpt).
 * @param question The user's question.
 * @returns A value in [0, 1]; 0 when the question has no meaningful characters.
 */
function computeKeywordRelevance(text: string, question: string): number {
  const STOP_CHARS = new Set('的是了在和我有你就这不人都说上个大国为到以们年会生地要主中子自实家小对多能好可很所把当');
  // Whitespace plus every Unicode punctuation/symbol character. This covers
  // ASCII marks such as "?", "!" and "." as well as full-width ones.
  const NOISE = /[\s\p{P}\p{S}]/u;

  // Lower-case before de-duplicating so "A" and "a" count as one keyword,
  // consistent with the case-insensitive matching below.
  const keywords = new Set<string>();
  for (const char of question.toLowerCase()) {
    if (!NOISE.test(char) && !STOP_CHARS.has(char)) {
      keywords.add(char);
    }
  }
  if (keywords.size === 0) return 0;

  const haystack = text.toLowerCase();
  let matched = 0;
  for (const char of keywords) {
    if (haystack.includes(char)) matched++;
  }
  return matched / keywords.size;
}
1377
1867
 
1378
- async function generateAnswerFromExcerpts(
1868
/**
 * Counters describing the context gathered for one LLM answer; shown to the
 * user in progress messages and in the sources footer.
 */
interface ContextStats {
  /** Count of primary (search-hit) page sections. */
  primaryPages: number;
  /** Count of raw-document sections. */
  rawDocs: number;
  /** Count of linked-page sections. */
  linkedPages: number;
  /** Running tally kept by the collector when content is truncated to fit the character budget. */
  skippedChars: number;
}
1874
+
1875
+ /**
1876
+ * Build LLM prompt from collected context sections and generate answer.
1877
+ */
1878
+ async function generateAnswerWithContext(
1379
1879
  question: string,
1380
- pages: ContextPage[],
1880
+ sections: ContextSection[],
1881
+ stats: ContextStats,
1381
1882
  llm: ResolvedLLM,
1382
1883
  ): Promise<string> {
1383
1884
  const apiKey = llm.apiKey || process.env[llm.apiKeyEnv] || "";
@@ -1385,29 +1886,54 @@ async function generateAnswerFromExcerpts(
1385
1886
  return "Error: LLM API key not configured.";
1386
1887
  }
1387
1888
 
1388
- // Build context from page excerpts
1389
- const context = pages
1390
- .map((p, i) => {
1391
- return `## Source ${i + 1}: ${p.title}\n**Slug:** ${p.slug}\n\n${p.excerpt}`;
1392
- })
1393
- .join("\n\n---\n\n");
1889
+ if (sections.length === 0) {
1890
+ return "知识库中没有找到相关内容。";
1891
+ }
1394
1892
 
1395
- const prompt = `You are answering a question based on the provided knowledge base context.
1893
+ // Build context sections with clear labels
1894
+ const contextParts: string[] = [];
1895
+ let sectionIndex = 0;
1896
+
1897
+ // Group by type for cleaner output
1898
+ const primarySections = sections.filter(s => s.type === 'primary');
1899
+ const rawSections = sections.filter(s => s.type === 'raw_data');
1900
+ const linkedSections = sections.filter(s => s.type === 'linked');
1901
+
1902
+ function renderSections(group: ContextSection[], header: string) {
1903
+ if (group.length === 0) return;
1904
+ contextParts.push(`## ${header}\n`);
1905
+ for (const s of group) {
1906
+ sectionIndex++;
1907
+ contextParts.push(`### [${sectionIndex}] ${s.title} — ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
1908
+ }
1909
+ contextParts.push('');
1910
+ }
1911
+
1912
+ renderSections(primarySections, '页面正文');
1913
+ renderSections(rawSections, '原始文档');
1914
+ renderSections(linkedSections, '关联页面');
1915
+
1916
+ const context = contextParts.join('\n');
1396
1917
 
1397
- ## Question
1918
+ const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
1919
+
1920
+ ## 问题
1398
1921
  ${question}
1399
1922
 
1400
- ## Context from Knowledge Base
1401
- ${context || "(No relevant pages found)"}
1923
+ ## 知识库内容
1924
+
1925
+ ${context}
1402
1926
 
1403
- ## Instructions
1404
- - Answer the question based ONLY on the provided context
1405
- - If the context doesn't contain enough information, say so
1406
- - Cite sources using markdown links like [Title](slug) when referencing specific information
1407
- - Format your answer in clean markdown
1408
- - Be concise but comprehensive
1927
+ ## 回答要求
1928
+ - 仅基于提供的知识库内容回答,不要编造信息
1929
+ - 如果知识库中没有相关信息,请明确说明
1930
+ - 引用来源时使用 [[slug|标题]] 的格式
1931
+ - 使用清晰的 markdown 格式
1932
+ - 如果涉及时间线信息,请在回答中体现
1933
+ - 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
1934
+ - 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
1409
1935
 
1410
- ## Answer`;
1936
+ ## 回答`;
1411
1937
 
1412
1938
  try {
1413
1939
  const resp = await fetch(
@@ -1423,12 +1949,12 @@ ${context || "(No relevant pages found)"}
1423
1949
  messages: [
1424
1950
  {
1425
1951
  role: "system",
1426
- content: "You are a helpful assistant that answers questions based on a knowledge base. Always cite your sources.",
1952
+ content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
1427
1953
  },
1428
1954
  { role: "user", content: prompt },
1429
1955
  ],
1430
1956
  temperature: 0.3,
1431
- max_tokens: 2048,
1957
+ max_tokens: 4096,
1432
1958
  }),
1433
1959
  },
1434
1960
  );