ex-brain 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -1
- package/src/commands/index.ts +340 -220
- package/src/markdown/document-loader.ts +486 -0
- package/src/mcp/server.ts +148 -0
package/src/commands/index.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { basename, resolve } from "node:path";
|
|
1
|
+
import { basename, extname, resolve } from "node:path";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
|
+
import { createHash } from "node:crypto";
|
|
3
4
|
import { Command } from "commander";
|
|
4
5
|
import { DEFAULT_DB_NAME, inferTypeFromSlug, slugToTitle, normalizeLongSlug, slugify } from "../config";
|
|
5
6
|
import { BrainDb } from "../db/client";
|
|
@@ -13,6 +14,7 @@ import {
|
|
|
13
14
|
slugToPath,
|
|
14
15
|
writeTextFile,
|
|
15
16
|
} from "../markdown/io";
|
|
17
|
+
import { loadDocument, isRemoteUrl, type DocumentKind } from "../markdown/document-loader";
|
|
16
18
|
import {
|
|
17
19
|
extractTimelineLines,
|
|
18
20
|
extractWikiStyleLinks,
|
|
@@ -52,6 +54,14 @@ function isDryRun(opts: Record<string, unknown>): boolean {
|
|
|
52
54
|
return Boolean(opts.dryRun);
|
|
53
55
|
}
|
|
54
56
|
|
|
57
|
+
/**
 * Compute a short SHA-256 hex digest of a string.
 *
 * Used to detect unchanged content on repeated `put` calls: the digest is
 * stored in page frontmatter under `_contentHash` and compared with the digest
 * of the incoming content before anything is written.
 *
 * @param text - Text to hash; it is encoded as UTF-8 before hashing.
 * @param length - Number of leading hex characters to keep (1–64). Defaults to
 *   16 so digests stay comparable with hashes that were stored previously.
 * @returns The first `length` characters of the lowercase hex SHA-256 digest.
 * @throws RangeError if `length` is not an integer between 1 and 64.
 */
function contentHash(text: string, length = 16): string {
  // A SHA-256 hex digest is exactly 64 characters; reject anything that would
  // silently produce an empty or oddly-sliced hash (e.g. negative lengths).
  if (!Number.isInteger(length) || length < 1 || length > 64) {
    throw new RangeError(
      `contentHash length must be an integer between 1 and 64, got ${length}`,
    );
  }
  return createHash("sha256").update(text, "utf8").digest("hex").slice(0, length);
}
|
|
64
|
+
|
|
55
65
|
// Simple progress output to stderr (won't interfere with --json stdout).
|
|
56
66
|
// e.g. "[3/42] import docs/api"
|
|
57
67
|
function progress(label: string, current: number, total: number, json: boolean): void {
|
|
@@ -94,12 +104,12 @@ async function applyEntityLinks(
|
|
|
94
104
|
}
|
|
95
105
|
return { created: 0, linked: 0 };
|
|
96
106
|
}
|
|
97
|
-
|
|
107
|
+
|
|
98
108
|
// Filter by confidence
|
|
99
109
|
const confidenceThreshold = settings.extraction.confidenceThreshold;
|
|
100
110
|
const highConfidence = relations.filter((r) => r.confidence >= confidenceThreshold);
|
|
101
111
|
const ignoredCount = relations.length - highConfidence.length;
|
|
102
|
-
|
|
112
|
+
|
|
103
113
|
if (highConfidence.length === 0) {
|
|
104
114
|
if (!json) {
|
|
105
115
|
if (relations.length > 0) {
|
|
@@ -119,7 +129,7 @@ async function applyEntityLinks(
|
|
|
119
129
|
// 1. Resolve entity slugs (disambiguation)
|
|
120
130
|
const fromCandidate = entityToSlug(r.from.name, r.from.type);
|
|
121
131
|
const toCandidate = entityToSlug(r.to.name, r.to.type);
|
|
122
|
-
|
|
132
|
+
|
|
123
133
|
const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
|
|
124
134
|
const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
|
|
125
135
|
|
|
@@ -144,7 +154,7 @@ async function applyEntityLinks(
|
|
|
144
154
|
const duration = formatDuration(Date.now() - startTime);
|
|
145
155
|
const entityNames = [...new Set(highConfidence.flatMap((r) => [r.from.name, r.to.name]))];
|
|
146
156
|
spinner.succeed(`Extracted ${entityNames.length} entities: ${entityNames.join(", ")}`);
|
|
147
|
-
|
|
157
|
+
|
|
148
158
|
// Print detailed info
|
|
149
159
|
subItem(`${created} entity pages created`);
|
|
150
160
|
subItem(`${linked} links added`);
|
|
@@ -225,23 +235,46 @@ Examples:
|
|
|
225
235
|
|
|
226
236
|
// -- page CRUD ------------------------------------------------------------
|
|
227
237
|
|
|
238
|
+
// -- put ------------------------------------------------------------------
|
|
239
|
+
// Auto-detects file type: markdown goes through parsePageMarkdown,
|
|
240
|
+
// other formats (pdf, docx, html, txt, json) go through loadDocument.
|
|
241
|
+
|
|
242
|
+
/** Non-markdown extensions that should use the document ingestion path. */
const DOC_EXTENSIONS = new Set([
  "pdf", "docx", "doc", "html", "htm", "json", "txt", "text",
]);

/**
 * Whether a `--file` value should be treated as a document (not markdown).
 *
 * @param filePath - Local path or http(s) URL passed to `--file`.
 * @param forceKind - Optional `--format` override. Any value other than
 *   `"markdown"` forces the document path regardless of the extension.
 * @returns `true` when the input should go through the document loader,
 *   `false` when it should be handled as markdown.
 */
function isDocumentFile(filePath: string, forceKind?: string): boolean {
  if (forceKind && forceKind !== "markdown") return true;

  // For remote URLs, drop the query string and fragment before looking at the
  // extension; otherwise "https://host/a.pdf?token=1" yields ".pdf?token=1" and
  // is not recognised. Local paths are left untouched because "?" and "#" are
  // legal filename characters there.
  const isRemote = /^https?:\/\//i.test(filePath);
  const pathOnly = isRemote ? filePath.replace(/[?#].*$/, "") : filePath;

  const ext = extname(pathOnly).toLowerCase().replace(/^\./, "");
  return DOC_EXTENSIONS.has(ext);
}
|
|
253
|
+
|
|
228
254
|
addDryRun(
|
|
229
255
|
program
|
|
230
256
|
.command("put")
|
|
231
257
|
.argument("[slug]", "page slug (optional; auto-generated if omitted)")
|
|
232
|
-
.option("--file <path>", "read
|
|
258
|
+
.option("--file <path>", "read content from file (markdown, pdf, docx, html, txt, json)")
|
|
233
259
|
.option("--stdin", "read markdown from stdin", false)
|
|
234
|
-
.option("--type <type>", "page type")
|
|
235
|
-
.option("--title <title>", "page title")
|
|
260
|
+
.option("--type <type>", "page type override")
|
|
261
|
+
.option("--title <title>", "page title override")
|
|
262
|
+
.option("--format <kind>", "force document kind (pdf|docx|html|json|markdown|text) — only needed for --file with non-md files when auto-detect fails")
|
|
263
|
+
.option("--max-bytes <number>", "max bytes for URL/file ingest", "52428800")
|
|
264
|
+
.option("--timeout <ms>", "fetch timeout for URLs in ms", "30000")
|
|
236
265
|
.description(
|
|
237
|
-
"create or update a page (idempotent; upserts by slug).
|
|
266
|
+
"create or update a page (idempotent; upserts by slug). Auto-detects file type: markdown is parsed normally, PDF/DOCX/HTML/TXT/JSON are extracted and ingested.",
|
|
238
267
|
)
|
|
239
268
|
.addHelpText(
|
|
240
269
|
"after",
|
|
241
270
|
`
|
|
242
271
|
Examples:
|
|
243
|
-
ebrain put --file api.md #
|
|
272
|
+
ebrain put --file api.md # markdown → parsePageMarkdown
|
|
244
273
|
ebrain put docs/api --file api.md # explicit slug
|
|
274
|
+
ebrain put --file report.pdf # pdf → auto-extract text
|
|
275
|
+
ebrain put docs/report --file report.pdf # explicit slug for pdf
|
|
276
|
+
ebrain put --file article.docx # docx → auto-extract text
|
|
277
|
+
ebrain put --file https://example.com/a.pdf # URL → download + extract
|
|
245
278
|
cat note.md | ebrain put --stdin # auto-generate slug from title/timestamp
|
|
246
279
|
ebrain put --title "My Note" --stdin # auto-generate slug from title
|
|
247
280
|
ebrain put people/john --type person --title "John Doe"
|
|
@@ -256,9 +289,173 @@ Examples:
|
|
|
256
289
|
stdin?: boolean;
|
|
257
290
|
type?: string;
|
|
258
291
|
title?: string;
|
|
292
|
+
format?: string;
|
|
293
|
+
maxBytes?: string;
|
|
294
|
+
timeout?: string;
|
|
259
295
|
dryRun?: boolean;
|
|
260
296
|
},
|
|
261
297
|
) => {
|
|
298
|
+
// ── Branch 1: document file (pdf/docx/html/txt/json or URL) ──
|
|
299
|
+
const forceKind = opts.format as DocumentKind | undefined;
|
|
300
|
+
if (opts.file && isDocumentFile(opts.file, opts.format)) {
|
|
301
|
+
const loaded = await loadDocument(opts.file, {
|
|
302
|
+
forceKind,
|
|
303
|
+
fetchTimeoutMs: opts.timeout ? Number(opts.timeout) : undefined,
|
|
304
|
+
maxBytes: opts.maxBytes ? Number(opts.maxBytes) : undefined,
|
|
305
|
+
});
|
|
306
|
+
const content = loaded.text;
|
|
307
|
+
const fileName = loaded.fileName;
|
|
308
|
+
const kind = loaded.kind;
|
|
309
|
+
const sourceRef = loaded.source;
|
|
310
|
+
const sourceType = loaded.sourceType;
|
|
311
|
+
const mimeType = loaded.mimeType;
|
|
312
|
+
const bytes = loaded.bytes;
|
|
313
|
+
const metadata = loaded.metadata;
|
|
314
|
+
|
|
315
|
+
let finalSlug = slug;
|
|
316
|
+
if (!finalSlug) {
|
|
317
|
+
const nameNoExt = fileName.replace(/\.[^.]+$/, "");
|
|
318
|
+
const slugBase = normalizeLongSlug(slugify(nameNoExt));
|
|
319
|
+
finalSlug = `ingest/${slugBase}`;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
const type = opts.type ?? kind;
|
|
323
|
+
const title =
|
|
324
|
+
opts.title ??
|
|
325
|
+
String(slugToTitle(finalSlug));
|
|
326
|
+
const hash = contentHash(content);
|
|
327
|
+
const frontmatter: Record<string, unknown> = {
|
|
328
|
+
sourceFile: sourceRef,
|
|
329
|
+
sourceType,
|
|
330
|
+
sourceKind: kind,
|
|
331
|
+
sourceMimeType: mimeType,
|
|
332
|
+
sourceBytes: bytes,
|
|
333
|
+
sourceFileName: fileName,
|
|
334
|
+
_contentHash: hash,
|
|
335
|
+
...metadata,
|
|
336
|
+
};
|
|
337
|
+
|
|
338
|
+
if (isDryRun(opts)) {
|
|
339
|
+
print(program, {
|
|
340
|
+
dryRun: true,
|
|
341
|
+
action: "put",
|
|
342
|
+
slug: finalSlug,
|
|
343
|
+
type,
|
|
344
|
+
title,
|
|
345
|
+
kind,
|
|
346
|
+
sourceType,
|
|
347
|
+
sourceRef,
|
|
348
|
+
mimeType,
|
|
349
|
+
bytes,
|
|
350
|
+
contentLength: content.length,
|
|
351
|
+
contentHash: hash,
|
|
352
|
+
metadata,
|
|
353
|
+
});
|
|
354
|
+
return;
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
await withRepo(program, async (repo) => {
|
|
358
|
+
const jsonOut = isJson(program);
|
|
359
|
+
const spinner = createSpinner();
|
|
360
|
+
const startTime = Date.now();
|
|
361
|
+
|
|
362
|
+
// Check if content has already been ingested (idempotency)
|
|
363
|
+
const existingPage = await repo.getPage(finalSlug);
|
|
364
|
+
const existingHash = existingPage?.frontmatter._contentHash as string | undefined;
|
|
365
|
+
|
|
366
|
+
if (existingHash === hash) {
|
|
367
|
+
if (!jsonOut) {
|
|
368
|
+
header(`Put: ${fileName}`);
|
|
369
|
+
success(`Content unchanged — skipped (hash: ${hash})`);
|
|
370
|
+
}
|
|
371
|
+
print(program, {
|
|
372
|
+
ok: true,
|
|
373
|
+
action: "put",
|
|
374
|
+
slug: finalSlug,
|
|
375
|
+
unchanged: true,
|
|
376
|
+
contentHash: hash,
|
|
377
|
+
});
|
|
378
|
+
return;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
if (!jsonOut) {
|
|
382
|
+
header(`Put: ${fileName}`);
|
|
383
|
+
keyValue("Kind", kind);
|
|
384
|
+
keyValue("Source", sourceRef);
|
|
385
|
+
if (mimeType) keyValue("Content-Type", mimeType);
|
|
386
|
+
keyValue("Bytes", String(bytes));
|
|
387
|
+
if (existingPage) {
|
|
388
|
+
keyValue("Previous hash", existingHash ?? "none");
|
|
389
|
+
keyValue("New hash", hash);
|
|
390
|
+
}
|
|
391
|
+
spinner.start(`Creating page from ${kind}...`);
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
await repo.putPage({
|
|
395
|
+
slug: finalSlug,
|
|
396
|
+
type,
|
|
397
|
+
title,
|
|
398
|
+
compiledTruth: content,
|
|
399
|
+
timeline: "",
|
|
400
|
+
frontmatter,
|
|
401
|
+
});
|
|
402
|
+
|
|
403
|
+
if (!jsonOut) {
|
|
404
|
+
spinner.succeed(`Page created: ${finalSlug}`);
|
|
405
|
+
keyValue("Type", type);
|
|
406
|
+
keyValue("Content length", `${content.length} chars`);
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// ── Side-effect operations (only on new/changed content) ──
|
|
410
|
+
await repo.timelineAdd({
|
|
411
|
+
pageSlug: finalSlug,
|
|
412
|
+
date: new Date().toISOString().slice(0, 10),
|
|
413
|
+
source: type,
|
|
414
|
+
summary: `Ingested ${kind} ${fileName}`,
|
|
415
|
+
detail: sourceType === "url" ? `Source URL: ${sourceRef}` : "",
|
|
416
|
+
});
|
|
417
|
+
|
|
418
|
+
try {
|
|
419
|
+
await repo.writeRaw(finalSlug, sourceType, {
|
|
420
|
+
fileName,
|
|
421
|
+
sourceRef,
|
|
422
|
+
kind,
|
|
423
|
+
mimeType,
|
|
424
|
+
bytes,
|
|
425
|
+
metadata,
|
|
426
|
+
ingestedAt: new Date().toISOString(),
|
|
427
|
+
});
|
|
428
|
+
} catch (err) {
|
|
429
|
+
if (!jsonOut) {
|
|
430
|
+
warning(
|
|
431
|
+
`failed to record raw_data: ${err instanceof Error ? err.message : String(err)}`,
|
|
432
|
+
);
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
await applyEntityLinks(repo, finalSlug, content, jsonOut);
|
|
437
|
+
|
|
438
|
+
if (!jsonOut) {
|
|
439
|
+
const duration = formatDuration(Date.now() - startTime);
|
|
440
|
+
success(`Operation completed in ${duration}`);
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
print(program, {
|
|
444
|
+
ok: true,
|
|
445
|
+
action: "put",
|
|
446
|
+
slug: finalSlug,
|
|
447
|
+
kind,
|
|
448
|
+
sourceType,
|
|
449
|
+
sourceRef,
|
|
450
|
+
bytes,
|
|
451
|
+
contentLength: content.length,
|
|
452
|
+
contentHash: hash,
|
|
453
|
+
});
|
|
454
|
+
});
|
|
455
|
+
return;
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
// ── Branch 2: markdown (stdin or .md file) ──
|
|
262
459
|
const input = await resolveInput(opts.file, opts.stdin ?? false);
|
|
263
460
|
if (!input.trim()) {
|
|
264
461
|
throw new Error(
|
|
@@ -266,7 +463,7 @@ Examples:
|
|
|
266
463
|
);
|
|
267
464
|
}
|
|
268
465
|
const parsed = parsePageMarkdown(input);
|
|
269
|
-
|
|
466
|
+
|
|
270
467
|
// Auto-generate slug if not provided
|
|
271
468
|
let finalSlug = slug;
|
|
272
469
|
if (!finalSlug) {
|
|
@@ -284,7 +481,7 @@ Examples:
|
|
|
284
481
|
finalSlug = `notes/${timestamp}`;
|
|
285
482
|
}
|
|
286
483
|
}
|
|
287
|
-
|
|
484
|
+
|
|
288
485
|
const type =
|
|
289
486
|
opts.type ??
|
|
290
487
|
String(parsed.frontmatter.type ?? inferTypeFromSlug(finalSlug));
|
|
@@ -292,6 +489,10 @@ Examples:
|
|
|
292
489
|
opts.title ??
|
|
293
490
|
String(parsed.frontmatter.title ?? slugToTitle(finalSlug));
|
|
294
491
|
|
|
492
|
+
// Compute content hash and embed in frontmatter for idempotency
|
|
493
|
+
const hash = contentHash(parsed.compiledTruth);
|
|
494
|
+
parsed.frontmatter._contentHash = hash;
|
|
495
|
+
|
|
295
496
|
if (isDryRun(opts)) {
|
|
296
497
|
print(program, {
|
|
297
498
|
dryRun: true,
|
|
@@ -300,6 +501,7 @@ Examples:
|
|
|
300
501
|
type,
|
|
301
502
|
title,
|
|
302
503
|
contentLength: parsed.compiledTruth.length,
|
|
504
|
+
contentHash: hash,
|
|
303
505
|
hasTimeline: !!parsed.timeline,
|
|
304
506
|
frontmatterKeys: Object.keys(parsed.frontmatter),
|
|
305
507
|
});
|
|
@@ -310,12 +512,35 @@ Examples:
|
|
|
310
512
|
const jsonOut = isJson(program);
|
|
311
513
|
const spinner = createSpinner();
|
|
312
514
|
const startTime = Date.now();
|
|
313
|
-
|
|
515
|
+
|
|
516
|
+
// Check if content is unchanged (idempotency)
|
|
517
|
+
const existingPage = await repo.getPage(finalSlug);
|
|
518
|
+
const existingHash = existingPage?.frontmatter._contentHash as string | undefined;
|
|
519
|
+
|
|
520
|
+
if (existingHash === hash) {
|
|
521
|
+
if (!jsonOut) {
|
|
522
|
+
header(`Put: ${finalSlug}`);
|
|
523
|
+
success(`Content unchanged — skipped (hash: ${hash})`);
|
|
524
|
+
}
|
|
525
|
+
print(program, {
|
|
526
|
+
ok: true,
|
|
527
|
+
action: "put",
|
|
528
|
+
slug: finalSlug,
|
|
529
|
+
unchanged: true,
|
|
530
|
+
contentHash: hash,
|
|
531
|
+
});
|
|
532
|
+
return;
|
|
533
|
+
}
|
|
534
|
+
|
|
314
535
|
if (!jsonOut) {
|
|
315
536
|
header(`Put: ${finalSlug}`);
|
|
537
|
+
if (existingPage) {
|
|
538
|
+
keyValue("Previous hash", existingHash ?? "none");
|
|
539
|
+
keyValue("New hash", hash);
|
|
540
|
+
}
|
|
316
541
|
spinner.start(`Creating/updating page...`);
|
|
317
542
|
}
|
|
318
|
-
|
|
543
|
+
|
|
319
544
|
const page = await repo.putPage({
|
|
320
545
|
slug: finalSlug,
|
|
321
546
|
type,
|
|
@@ -324,27 +549,32 @@ Examples:
|
|
|
324
549
|
timeline: parsed.timeline,
|
|
325
550
|
frontmatter: parsed.frontmatter,
|
|
326
551
|
});
|
|
327
|
-
|
|
552
|
+
|
|
328
553
|
if (!jsonOut) {
|
|
329
554
|
spinner.succeed(`Page saved: ${page.slug}`);
|
|
330
555
|
keyValue("Title", title);
|
|
331
556
|
keyValue("Type", type);
|
|
332
557
|
keyValue("Content length", `${parsed.compiledTruth.length} chars`);
|
|
333
558
|
}
|
|
334
|
-
|
|
559
|
+
|
|
335
560
|
await applyEntityLinks(
|
|
336
561
|
repo,
|
|
337
562
|
finalSlug,
|
|
338
563
|
parsed.compiledTruth,
|
|
339
564
|
jsonOut,
|
|
340
565
|
);
|
|
341
|
-
|
|
566
|
+
|
|
342
567
|
if (!jsonOut) {
|
|
343
568
|
const duration = formatDuration(Date.now() - startTime);
|
|
344
569
|
success(`Operation completed in ${duration}`);
|
|
345
570
|
}
|
|
346
|
-
|
|
347
|
-
print(program, {
|
|
571
|
+
|
|
572
|
+
print(program, {
|
|
573
|
+
ok: true,
|
|
574
|
+
slug: page.slug,
|
|
575
|
+
updatedAt: page.updatedAt,
|
|
576
|
+
contentHash: hash,
|
|
577
|
+
});
|
|
348
578
|
});
|
|
349
579
|
},
|
|
350
580
|
);
|
|
@@ -415,18 +645,18 @@ Examples:
|
|
|
415
645
|
await withRepo(program, async (repo) => {
|
|
416
646
|
const jsonOut = isJson(program);
|
|
417
647
|
const spinner = createSpinner();
|
|
418
|
-
|
|
648
|
+
|
|
419
649
|
if (!jsonOut) {
|
|
420
650
|
header(`Delete: ${slug}`);
|
|
421
651
|
spinner.start(`Deleting page and related data...`);
|
|
422
652
|
}
|
|
423
|
-
|
|
653
|
+
|
|
424
654
|
await repo.deletePage(slug);
|
|
425
|
-
|
|
655
|
+
|
|
426
656
|
if (!jsonOut) {
|
|
427
657
|
spinner.succeed(`Page deleted: ${slug}`);
|
|
428
658
|
}
|
|
429
|
-
|
|
659
|
+
|
|
430
660
|
print(program, { ok: true, action: "delete", slug });
|
|
431
661
|
});
|
|
432
662
|
});
|
|
@@ -522,7 +752,7 @@ Examples:
|
|
|
522
752
|
await withRepo(program, async (repo) => {
|
|
523
753
|
const limit = Number(opts.limit ?? 10);
|
|
524
754
|
const hits = await repo.query(question, limit);
|
|
525
|
-
|
|
755
|
+
|
|
526
756
|
// If --llm flag, generate answer based on multi-layer context
|
|
527
757
|
if (opts.llm) {
|
|
528
758
|
const settings = await loadSettings();
|
|
@@ -530,20 +760,20 @@ Examples:
|
|
|
530
760
|
print(program, { error: "LLM not configured. Set llm.baseURL in settings." });
|
|
531
761
|
return;
|
|
532
762
|
}
|
|
533
|
-
|
|
763
|
+
|
|
534
764
|
const progress = createProgress();
|
|
535
765
|
progress.start("Searching knowledge base...");
|
|
536
|
-
|
|
766
|
+
|
|
537
767
|
const contextLimit = Number(opts.contextLimit ?? 5);
|
|
538
768
|
const topHits = hits.slice(0, contextLimit);
|
|
539
|
-
|
|
769
|
+
|
|
540
770
|
if (topHits.length === 0) {
|
|
541
771
|
progress.stop();
|
|
542
772
|
process.stderr.write("No relevant pages found.\n");
|
|
543
773
|
print(program, { answer: "No relevant information found in the knowledge base.", sources: [] });
|
|
544
774
|
return;
|
|
545
775
|
}
|
|
546
|
-
|
|
776
|
+
|
|
547
777
|
// Collect multi-layer context (primary + raw data + linked pages scored by relevance)
|
|
548
778
|
// ~100KB char budget ≈ 25K tokens, safe for most models
|
|
549
779
|
const MAX_CONTEXT_CHARS = 100_000;
|
|
@@ -553,33 +783,33 @@ Examples:
|
|
|
553
783
|
progress.update(`Loading ${stage}...`);
|
|
554
784
|
});
|
|
555
785
|
const ctxDuration = formatDuration(Date.now() - ctxStart);
|
|
556
|
-
|
|
786
|
+
|
|
557
787
|
if (sections.length === 0) {
|
|
558
788
|
progress.stop();
|
|
559
789
|
process.stderr.write("No content could be loaded.\n");
|
|
560
790
|
print(program, { answer: "Failed to load page content.", sources: [] });
|
|
561
791
|
return;
|
|
562
792
|
}
|
|
563
|
-
|
|
793
|
+
|
|
564
794
|
progress.succeed(`Loaded ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s) (${ctxDuration})`);
|
|
565
795
|
const startTime = Date.now();
|
|
566
|
-
|
|
796
|
+
|
|
567
797
|
const { answer, ok } = await generateAnswerWithStream(question, sections, stats, settings.llm);
|
|
568
|
-
|
|
798
|
+
|
|
569
799
|
if (!ok) {
|
|
570
800
|
// If streaming failed, answer contains the error message
|
|
571
801
|
console.log(answer);
|
|
572
802
|
return;
|
|
573
803
|
}
|
|
574
|
-
|
|
804
|
+
|
|
575
805
|
const duration = formatDuration(Date.now() - startTime);
|
|
576
|
-
|
|
806
|
+
|
|
577
807
|
// Show sources breakdown
|
|
578
808
|
console.log("\n---\n**Sources:**\n");
|
|
579
809
|
for (let i = 0; i < sections.length; i++) {
|
|
580
810
|
const s = sections[i];
|
|
581
811
|
const icon = s.type === 'primary' ? '📄' : s.type === 'raw_data' ? '📎' : '🔗';
|
|
582
|
-
console.log(`${icon} ${i + 1}. [[${s.slug}|${s.title}]]
|
|
812
|
+
console.log(`${icon} ${i + 1}. [[${s.slug}|${s.title}]] - ${s.label} (${(s.content.length / 1024).toFixed(1)}KB)`);
|
|
583
813
|
}
|
|
584
814
|
console.log(`\n*Context: ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s)*`);
|
|
585
815
|
} else {
|
|
@@ -763,11 +993,11 @@ Examples:
|
|
|
763
993
|
throw new Error(`page not found: ${slug}`);
|
|
764
994
|
}
|
|
765
995
|
const settings = await loadSettings();
|
|
766
|
-
|
|
996
|
+
|
|
767
997
|
const progress = createProgress();
|
|
768
998
|
progress.start(`Extracting timeline from ${slug}...`);
|
|
769
999
|
const startTime = Date.now();
|
|
770
|
-
|
|
1000
|
+
|
|
771
1001
|
const result = await repo.extractAndAddTimeline(
|
|
772
1002
|
slug,
|
|
773
1003
|
page.compiledTruth,
|
|
@@ -775,16 +1005,16 @@ Examples:
|
|
|
775
1005
|
opts.defaultDate ?? new Date().toISOString().slice(0, 10),
|
|
776
1006
|
settings.llm,
|
|
777
1007
|
);
|
|
778
|
-
|
|
1008
|
+
|
|
779
1009
|
const duration = formatDuration(Date.now() - startTime);
|
|
780
|
-
|
|
1010
|
+
|
|
781
1011
|
if (result.entries.length > 0) {
|
|
782
1012
|
progress.succeed(`${result.entries.length} events extracted (${duration})`);
|
|
783
1013
|
} else {
|
|
784
1014
|
progress.stop();
|
|
785
1015
|
process.stderr.write(`No events found (${duration})\n`);
|
|
786
1016
|
}
|
|
787
|
-
|
|
1017
|
+
|
|
788
1018
|
print(program, {
|
|
789
1019
|
ok: true,
|
|
790
1020
|
action: "timeline-extract",
|
|
@@ -947,7 +1177,7 @@ Examples:
|
|
|
947
1177
|
data = JSON.parse(opts.data);
|
|
948
1178
|
} else if (opts.stdin) {
|
|
949
1179
|
const raw = await readMaybeStdin();
|
|
950
|
-
if (!raw?.trim()) throw new Error("empty stdin
|
|
1180
|
+
if (!raw?.trim()) throw new Error("empty stdin - pipe JSON");
|
|
951
1181
|
data = JSON.parse(raw);
|
|
952
1182
|
} else {
|
|
953
1183
|
throw new Error("provide --data <json> or --stdin");
|
|
@@ -996,7 +1226,7 @@ Examples:
|
|
|
996
1226
|
await withRepo(program, async (repo) => {
|
|
997
1227
|
const root = resolve(dir);
|
|
998
1228
|
const files = await collectMarkdownFiles(root);
|
|
999
|
-
|
|
1229
|
+
|
|
1000
1230
|
if (isDryRun(opts)) {
|
|
1001
1231
|
print(program, {
|
|
1002
1232
|
dryRun: true,
|
|
@@ -1012,16 +1242,16 @@ Examples:
|
|
|
1012
1242
|
const settings = await loadSettings();
|
|
1013
1243
|
const spinner = createSpinner();
|
|
1014
1244
|
const startTime = Date.now();
|
|
1015
|
-
|
|
1245
|
+
|
|
1016
1246
|
if (!jsonOut) {
|
|
1017
1247
|
header(`Import: ${root}`);
|
|
1018
1248
|
}
|
|
1019
|
-
|
|
1249
|
+
|
|
1020
1250
|
// Phase 1: Parse all files and collect data
|
|
1021
1251
|
if (!jsonOut) {
|
|
1022
1252
|
spinner.start(`Scanning ${files.length} files...`);
|
|
1023
1253
|
}
|
|
1024
|
-
|
|
1254
|
+
|
|
1025
1255
|
const fileData: Array<{
|
|
1026
1256
|
file: string;
|
|
1027
1257
|
slug: string;
|
|
@@ -1031,7 +1261,7 @@ Examples:
|
|
|
1031
1261
|
timelineEntries: ReturnType<typeof extractTimelineLines>;
|
|
1032
1262
|
tags: string[];
|
|
1033
1263
|
}> = [];
|
|
1034
|
-
|
|
1264
|
+
|
|
1035
1265
|
for (const file of files) {
|
|
1036
1266
|
const rawSlug = pathToSlug(file, root);
|
|
1037
1267
|
const slug = normalizeLongSlug(rawSlug);
|
|
@@ -1044,19 +1274,19 @@ Examples:
|
|
|
1044
1274
|
: [];
|
|
1045
1275
|
fileData.push({ file, slug, parsed, content, wikiLinks, timelineEntries, tags });
|
|
1046
1276
|
}
|
|
1047
|
-
|
|
1277
|
+
|
|
1048
1278
|
if (!jsonOut) {
|
|
1049
1279
|
spinner.succeed(`Found ${files.length} markdown files`);
|
|
1050
1280
|
}
|
|
1051
|
-
|
|
1281
|
+
|
|
1052
1282
|
// Phase 2: Write all pages first (skip embed for performance)
|
|
1053
1283
|
if (!jsonOut) {
|
|
1054
1284
|
spinner.start(`Writing ${fileData.length} pages to database...`);
|
|
1055
1285
|
}
|
|
1056
|
-
|
|
1286
|
+
|
|
1057
1287
|
const allSlugs: string[] = [];
|
|
1058
1288
|
const writeErrors: string[] = [];
|
|
1059
|
-
|
|
1289
|
+
|
|
1060
1290
|
for (let i = 0; i < fileData.length; i++) {
|
|
1061
1291
|
const { slug, parsed } = fileData[i]!;
|
|
1062
1292
|
if (!jsonOut && i % 20 === 0) {
|
|
@@ -1076,7 +1306,7 @@ Examples:
|
|
|
1076
1306
|
writeErrors.push(`${slug}: ${err instanceof Error ? err.message : String(err)}`);
|
|
1077
1307
|
}
|
|
1078
1308
|
}
|
|
1079
|
-
|
|
1309
|
+
|
|
1080
1310
|
if (!jsonOut) {
|
|
1081
1311
|
spinner.succeed(`Wrote ${allSlugs.length} pages to database`);
|
|
1082
1312
|
if (writeErrors.length > 0) {
|
|
@@ -1089,16 +1319,16 @@ Examples:
|
|
|
1089
1319
|
}
|
|
1090
1320
|
}
|
|
1091
1321
|
}
|
|
1092
|
-
|
|
1322
|
+
|
|
1093
1323
|
// Phase 3: Parallel entity extraction (main optimization)
|
|
1094
1324
|
const BATCH_SIZE = 10;
|
|
1095
1325
|
const entityResults = new Map<string, Awaited<ReturnType<typeof extractRelations>>>();
|
|
1096
|
-
|
|
1326
|
+
|
|
1097
1327
|
if (settings.llm.baseURL) {
|
|
1098
1328
|
if (!jsonOut) {
|
|
1099
1329
|
spinner.start(`Extracting entities with LLM...`);
|
|
1100
1330
|
}
|
|
1101
|
-
|
|
1331
|
+
|
|
1102
1332
|
for (let i = 0; i < fileData.length; i += BATCH_SIZE) {
|
|
1103
1333
|
const batch = fileData.slice(i, i + BATCH_SIZE);
|
|
1104
1334
|
if (!jsonOut) {
|
|
@@ -1113,7 +1343,7 @@ Examples:
|
|
|
1113
1343
|
entityResults.set(slug, relations);
|
|
1114
1344
|
}
|
|
1115
1345
|
}
|
|
1116
|
-
|
|
1346
|
+
|
|
1117
1347
|
if (!jsonOut) {
|
|
1118
1348
|
spinner.succeed(`Entity extraction complete`);
|
|
1119
1349
|
}
|
|
@@ -1122,17 +1352,17 @@ Examples:
|
|
|
1122
1352
|
warning(`LLM not configured, skipping entity extraction`);
|
|
1123
1353
|
}
|
|
1124
1354
|
}
|
|
1125
|
-
|
|
1355
|
+
|
|
1126
1356
|
// Phase 4: Write links, tags, timeline, and entity pages
|
|
1127
1357
|
if (!jsonOut) {
|
|
1128
1358
|
spinner.start(`Creating links, tags, and timeline entries...`);
|
|
1129
1359
|
}
|
|
1130
|
-
|
|
1360
|
+
|
|
1131
1361
|
let linkCount = 0;
|
|
1132
1362
|
let timelineCount = 0;
|
|
1133
1363
|
let entityCount = 0;
|
|
1134
1364
|
let tagCount = 0;
|
|
1135
|
-
|
|
1365
|
+
|
|
1136
1366
|
// Collect timeline entries for batch insert
|
|
1137
1367
|
const allTimelineEntries: Array<{
|
|
1138
1368
|
pageSlug: string;
|
|
@@ -1141,14 +1371,14 @@ Examples:
|
|
|
1141
1371
|
summary: string;
|
|
1142
1372
|
detail: string;
|
|
1143
1373
|
}> = [];
|
|
1144
|
-
|
|
1374
|
+
|
|
1145
1375
|
for (const { slug, wikiLinks, timelineEntries, tags, content } of fileData) {
|
|
1146
1376
|
// Wiki links
|
|
1147
1377
|
for (const link of wikiLinks) {
|
|
1148
1378
|
await repo.link(slug, link, "import");
|
|
1149
1379
|
linkCount++;
|
|
1150
1380
|
}
|
|
1151
|
-
|
|
1381
|
+
|
|
1152
1382
|
// Collect timeline entries for batch insert
|
|
1153
1383
|
for (const entry of timelineEntries) {
|
|
1154
1384
|
allTimelineEntries.push({
|
|
@@ -1160,13 +1390,13 @@ Examples:
|
|
|
1160
1390
|
});
|
|
1161
1391
|
timelineCount++;
|
|
1162
1392
|
}
|
|
1163
|
-
|
|
1393
|
+
|
|
1164
1394
|
// Tags
|
|
1165
1395
|
for (const tag of tags) {
|
|
1166
1396
|
await repo.tag(slug, tag);
|
|
1167
1397
|
tagCount++;
|
|
1168
1398
|
}
|
|
1169
|
-
|
|
1399
|
+
|
|
1170
1400
|
// Entity links from parallel extraction
|
|
1171
1401
|
const relations = entityResults.get(slug);
|
|
1172
1402
|
if (relations && relations.length > 0) {
|
|
@@ -1176,12 +1406,12 @@ Examples:
|
|
|
1176
1406
|
const toCandidate = entityToSlug(r.to.name, r.to.type);
|
|
1177
1407
|
const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
|
|
1178
1408
|
const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
|
|
1179
|
-
|
|
1409
|
+
|
|
1180
1410
|
const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, slug);
|
|
1181
1411
|
const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, slug);
|
|
1182
1412
|
if (c1) entityCount++;
|
|
1183
1413
|
if (c2) entityCount++;
|
|
1184
|
-
|
|
1414
|
+
|
|
1185
1415
|
await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
|
|
1186
1416
|
await repo.link(slug, fromSlug, `Mentions ${r.from.name}`);
|
|
1187
1417
|
await repo.link(slug, toSlug, `Mentions ${r.to.name}`);
|
|
@@ -1189,16 +1419,16 @@ Examples:
|
|
|
1189
1419
|
}
|
|
1190
1420
|
}
|
|
1191
1421
|
}
|
|
1192
|
-
|
|
1422
|
+
|
|
1193
1423
|
// Batch insert all timeline entries
|
|
1194
1424
|
if (allTimelineEntries.length > 0) {
|
|
1195
1425
|
await repo.timelineAddBatch(allTimelineEntries);
|
|
1196
1426
|
}
|
|
1197
|
-
|
|
1427
|
+
|
|
1198
1428
|
if (!jsonOut) {
|
|
1199
1429
|
spinner.succeed(`Created links, tags, and timeline`);
|
|
1200
1430
|
}
|
|
1201
|
-
|
|
1431
|
+
|
|
1202
1432
|
// Phase 5: Batch sync all pages to search index
|
|
1203
1433
|
if (opts.skipIndex) {
|
|
1204
1434
|
if (!jsonOut) {
|
|
@@ -1209,14 +1439,14 @@ Examples:
|
|
|
1209
1439
|
spinner.start(`Indexing ${allSlugs.length} pages for search...`);
|
|
1210
1440
|
}
|
|
1211
1441
|
await repo.embedAll();
|
|
1212
|
-
|
|
1442
|
+
|
|
1213
1443
|
if (!jsonOut) {
|
|
1214
1444
|
spinner.succeed(`Search indexing complete`);
|
|
1215
1445
|
}
|
|
1216
1446
|
}
|
|
1217
|
-
|
|
1447
|
+
|
|
1218
1448
|
const duration = formatDuration(Date.now() - startTime);
|
|
1219
|
-
|
|
1449
|
+
|
|
1220
1450
|
if (!jsonOut) {
|
|
1221
1451
|
// Print summary
|
|
1222
1452
|
header("Import Summary");
|
|
@@ -1227,12 +1457,12 @@ Examples:
|
|
|
1227
1457
|
keyValue("Timeline entries", String(timelineCount));
|
|
1228
1458
|
keyValue("Tags added", String(tagCount));
|
|
1229
1459
|
keyValue("Duration", duration);
|
|
1230
|
-
|
|
1460
|
+
|
|
1231
1461
|
if (writeErrors.length > 0) {
|
|
1232
1462
|
warning(`${writeErrors.length} pages had errors`);
|
|
1233
1463
|
}
|
|
1234
1464
|
}
|
|
1235
|
-
|
|
1465
|
+
|
|
1236
1466
|
print(program, {
|
|
1237
1467
|
ok: true,
|
|
1238
1468
|
importedFiles: files.length,
|
|
@@ -1280,116 +1510,6 @@ Examples:
|
|
|
1280
1510
|
});
|
|
1281
1511
|
});
|
|
1282
1512
|
|
|
1283
|
-
// -- ingest ---------------------------------------------------------------
|
|
1284
|
-
|
|
1285
|
-
addDryRun(
|
|
1286
|
-
program
|
|
1287
|
-
.command("ingest")
|
|
1288
|
-
.argument("[file]", "file path to ingest (omit for stdin)")
|
|
1289
|
-
.option("--type <type>", "source type", "doc")
|
|
1290
|
-
.option("--stdin", "read from stdin", false)
|
|
1291
|
-
.description("ingest a file as a new page (under ingest/<name>)")
|
|
1292
|
-
.addHelpText(
|
|
1293
|
-
"after",
|
|
1294
|
-
`
|
|
1295
|
-
Examples:
|
|
1296
|
-
ebrain ingest report.pdf --type pdf
|
|
1297
|
-
cat article.md | ebrain ingest --stdin --type article
|
|
1298
|
-
ebrain ingest report.pdf --type pdf --dry-run
|
|
1299
|
-
`,
|
|
1300
|
-
),
|
|
1301
|
-
).action(
|
|
1302
|
-
async (
|
|
1303
|
-
file: string | undefined,
|
|
1304
|
-
opts: { type?: string; stdin?: boolean; dryRun?: boolean },
|
|
1305
|
-
) => {
|
|
1306
|
-
let content: string;
|
|
1307
|
-
let fileName: string;
|
|
1308
|
-
|
|
1309
|
-
if (file) {
|
|
1310
|
-
const fullPath = resolve(file);
|
|
1311
|
-
if (!(await fileExists(fullPath))) {
|
|
1312
|
-
throw new Error(`file not found: ${file}`);
|
|
1313
|
-
}
|
|
1314
|
-
content = await readTextFile(fullPath);
|
|
1315
|
-
fileName = basename(fullPath);
|
|
1316
|
-
} else if (opts.stdin) {
|
|
1317
|
-
const raw = await readMaybeStdin();
|
|
1318
|
-
if (!raw?.trim()) throw new Error("empty stdin — pipe content");
|
|
1319
|
-
content = raw;
|
|
1320
|
-
fileName = "stdin";
|
|
1321
|
-
} else {
|
|
1322
|
-
throw new Error("provide <file> or --stdin");
|
|
1323
|
-
}
|
|
1324
|
-
|
|
1325
|
-
const slug = `ingest/${fileName.replace(/\.[^.]+$/, "")}`;
|
|
1326
|
-
const type = opts.type ?? "doc";
|
|
1327
|
-
|
|
1328
|
-
if (isDryRun(opts)) {
|
|
1329
|
-
print(program, {
|
|
1330
|
-
dryRun: true,
|
|
1331
|
-
action: "ingest",
|
|
1332
|
-
slug,
|
|
1333
|
-
type,
|
|
1334
|
-
contentLength: content.length,
|
|
1335
|
-
});
|
|
1336
|
-
return;
|
|
1337
|
-
}
|
|
1338
|
-
|
|
1339
|
-
await withRepo(program, async (repo) => {
|
|
1340
|
-
const jsonOut = isJson(program);
|
|
1341
|
-
const spinner = createSpinner();
|
|
1342
|
-
const startTime = Date.now();
|
|
1343
|
-
|
|
1344
|
-
if (!jsonOut) {
|
|
1345
|
-
header(`Ingest: ${fileName}`);
|
|
1346
|
-
spinner.start(`Creating page from file...`);
|
|
1347
|
-
}
|
|
1348
|
-
|
|
1349
|
-
await repo.putPage({
|
|
1350
|
-
slug,
|
|
1351
|
-
type,
|
|
1352
|
-
title: slugToTitle(slug),
|
|
1353
|
-
compiledTruth: content,
|
|
1354
|
-
timeline: "",
|
|
1355
|
-
frontmatter: {
|
|
1356
|
-
sourceFile: resolve(fileName),
|
|
1357
|
-
sourceType: type,
|
|
1358
|
-
},
|
|
1359
|
-
});
|
|
1360
|
-
|
|
1361
|
-
if (!jsonOut) {
|
|
1362
|
-
spinner.succeed(`Page created: ${slug}`);
|
|
1363
|
-
keyValue("Source file", fileName);
|
|
1364
|
-
keyValue("Type", type);
|
|
1365
|
-
keyValue("Content length", `${content.length} chars`);
|
|
1366
|
-
}
|
|
1367
|
-
|
|
1368
|
-
await repo.timelineAdd({
|
|
1369
|
-
pageSlug: slug,
|
|
1370
|
-
date: new Date().toISOString().slice(0, 10),
|
|
1371
|
-
source: type,
|
|
1372
|
-
summary: `Ingested file ${fileName}`,
|
|
1373
|
-
detail: "",
|
|
1374
|
-
});
|
|
1375
|
-
|
|
1376
|
-
await applyEntityLinks(
|
|
1377
|
-
repo,
|
|
1378
|
-
slug,
|
|
1379
|
-
content,
|
|
1380
|
-
jsonOut,
|
|
1381
|
-
);
|
|
1382
|
-
|
|
1383
|
-
if (!jsonOut) {
|
|
1384
|
-
const duration = formatDuration(Date.now() - startTime);
|
|
1385
|
-
success(`Ingestion completed in ${duration}`);
|
|
1386
|
-
}
|
|
1387
|
-
|
|
1388
|
-
print(program, { ok: true, action: "ingest", slug });
|
|
1389
|
-
});
|
|
1390
|
-
},
|
|
1391
|
-
);
|
|
1392
|
-
|
|
1393
1513
|
// -- embed ----------------------------------------------------------------
|
|
1394
1514
|
|
|
1395
1515
|
addDryRun(
|
|
@@ -1429,26 +1549,26 @@ Examples:
|
|
|
1429
1549
|
const jsonOut = isJson(program);
|
|
1430
1550
|
const spinner = createSpinner();
|
|
1431
1551
|
const startTime = Date.now();
|
|
1432
|
-
|
|
1552
|
+
|
|
1433
1553
|
if (!jsonOut) {
|
|
1434
1554
|
header("Embed All Pages");
|
|
1435
1555
|
spinner.start(`Loading pages...`);
|
|
1436
1556
|
}
|
|
1437
|
-
|
|
1557
|
+
|
|
1438
1558
|
const pages = await repo.listPages({ limit: 100000 });
|
|
1439
|
-
|
|
1559
|
+
|
|
1440
1560
|
if (!jsonOut) {
|
|
1441
1561
|
spinner.update(`Embedding ${pages.length} pages...`);
|
|
1442
1562
|
}
|
|
1443
|
-
|
|
1563
|
+
|
|
1444
1564
|
const count = await repo.embedAll();
|
|
1445
|
-
|
|
1565
|
+
|
|
1446
1566
|
if (!jsonOut) {
|
|
1447
1567
|
const duration = formatDuration(Date.now() - startTime);
|
|
1448
1568
|
spinner.succeed(`Embedded ${count} pages`);
|
|
1449
1569
|
keyValue("Duration", duration);
|
|
1450
1570
|
}
|
|
1451
|
-
|
|
1571
|
+
|
|
1452
1572
|
print(program, { embedded: count, mode: "all" });
|
|
1453
1573
|
});
|
|
1454
1574
|
return;
|
|
@@ -1463,18 +1583,18 @@ Examples:
|
|
|
1463
1583
|
await withRepo(program, async (repo) => {
|
|
1464
1584
|
const jsonOut = isJson(program);
|
|
1465
1585
|
const spinner = createSpinner();
|
|
1466
|
-
|
|
1586
|
+
|
|
1467
1587
|
if (!jsonOut) {
|
|
1468
1588
|
header(`Embed: ${slug}`);
|
|
1469
1589
|
spinner.start(`Generating embedding for page...`);
|
|
1470
1590
|
}
|
|
1471
|
-
|
|
1591
|
+
|
|
1472
1592
|
await repo.syncPageToSearch(slug);
|
|
1473
|
-
|
|
1593
|
+
|
|
1474
1594
|
if (!jsonOut) {
|
|
1475
1595
|
spinner.succeed(`Page embedded: ${slug}`);
|
|
1476
1596
|
}
|
|
1477
|
-
|
|
1597
|
+
|
|
1478
1598
|
print(program, { embedded: 1, slug });
|
|
1479
1599
|
});
|
|
1480
1600
|
},
|
|
@@ -1527,7 +1647,7 @@ Examples:
|
|
|
1527
1647
|
}
|
|
1528
1648
|
dbInitialized = true;
|
|
1529
1649
|
} else {
|
|
1530
|
-
// Try to create it without collection
|
|
1650
|
+
// Try to create it without collection - embedding config may not be ready
|
|
1531
1651
|
try {
|
|
1532
1652
|
const db = await BrainDb.connect(dbPath, settings, { skipCollection: true });
|
|
1533
1653
|
await db.close();
|
|
@@ -1601,7 +1721,7 @@ Examples:
|
|
|
1601
1721
|
await withRepo(program, async (repo) => {
|
|
1602
1722
|
const jsonOut = isJson(program);
|
|
1603
1723
|
const stats = await repo.stats();
|
|
1604
|
-
|
|
1724
|
+
|
|
1605
1725
|
if (!jsonOut) {
|
|
1606
1726
|
header("Knowledge Base Statistics");
|
|
1607
1727
|
keyValue("Pages", String(stats.pages));
|
|
@@ -1610,7 +1730,7 @@ Examples:
|
|
|
1610
1730
|
keyValue("Timeline entries", String(stats.timelineEntries));
|
|
1611
1731
|
keyValue("Raw data rows", String(stats.rawRows));
|
|
1612
1732
|
}
|
|
1613
|
-
|
|
1733
|
+
|
|
1614
1734
|
print(program, stats);
|
|
1615
1735
|
});
|
|
1616
1736
|
});
|
|
@@ -1671,7 +1791,7 @@ async function withRepo(
|
|
|
1671
1791
|
const db = await BrainDb.connect(dbPath, settings);
|
|
1672
1792
|
const repo = new BrainRepository(db);
|
|
1673
1793
|
await callback(repo);
|
|
1674
|
-
|
|
1794
|
+
|
|
1675
1795
|
// Gracefully close database
|
|
1676
1796
|
// Note: seekdb SDK's InternalEmbeddedClient.close() is empty in embedded mode
|
|
1677
1797
|
// Data may not flush properly. Use remote seekdb server for reliability.
|
|
@@ -1680,10 +1800,10 @@ async function withRepo(
|
|
|
1680
1800
|
} catch (e) {
|
|
1681
1801
|
// Close may fail due to seekdb native bug
|
|
1682
1802
|
}
|
|
1683
|
-
|
|
1803
|
+
|
|
1684
1804
|
// Give seekdb extra time after close
|
|
1685
1805
|
await new Promise((r) => setTimeout(r, 500));
|
|
1686
|
-
|
|
1806
|
+
|
|
1687
1807
|
// CLI: force exit to bypass seekdb native cleanup segfault
|
|
1688
1808
|
process.exit(0);
|
|
1689
1809
|
}
|
|
@@ -1726,7 +1846,7 @@ function normalizeLinkSlug(path: string): string {
|
|
|
1726
1846
|
}
|
|
1727
1847
|
|
|
1728
1848
|
// ---------------------------------------------------------------------------
|
|
1729
|
-
// LLM Answer Generation
|
|
1849
|
+
// LLM Answer Generation - Multi-layer Context Collection
|
|
1730
1850
|
// ---------------------------------------------------------------------------
|
|
1731
1851
|
|
|
1732
1852
|
/** A single section of context for the LLM prompt. */
|
|
@@ -1741,12 +1861,12 @@ interface ContextSection {
|
|
|
1741
1861
|
|
|
1742
1862
|
/**
|
|
1743
1863
|
* Collect multi-layer context for LLM answer generation.
|
|
1744
|
-
*
|
|
1864
|
+
*
|
|
1745
1865
|
* Layers (in priority order):
|
|
1746
1866
|
* 1. Primary: compiledTruth + timeline of each hit page
|
|
1747
1867
|
* 2. Raw data: original documents stored via raw.set
|
|
1748
1868
|
* 3. Linked pages: compiledTruth of pages linked to/from hit pages
|
|
1749
|
-
*
|
|
1869
|
+
*
|
|
1750
1870
|
* Budget is enforced via total character limit.
|
|
1751
1871
|
*/
|
|
1752
1872
|
async function collectContextForLLM(
|
|
@@ -1845,8 +1965,8 @@ async function collectContextForLLM(
|
|
|
1845
1965
|
}
|
|
1846
1966
|
}
|
|
1847
1967
|
|
|
1848
|
-
// Layer 3: Linked pages
|
|
1849
|
-
// No second repo.query() call needed
|
|
1968
|
+
// Layer 3: Linked pages - score using cached data + keyword matching
|
|
1969
|
+
// No second repo.query() call needed - reuse hits scores + keyword fallback
|
|
1850
1970
|
onProgress?.('linked pages');
|
|
1851
1971
|
const allLinkedSlugs = new Set<string>();
|
|
1852
1972
|
for (const hit of hits) {
|
|
@@ -1952,7 +2072,7 @@ async function collectContextForLLM(
|
|
|
1952
2072
|
function computeKeywordRelevance(text: string, question: string): number {
|
|
1953
2073
|
const STOP_CHARS = new Set('的是了在和我有你就这不人都说上个大国为到以们年会生地要主中子自实家小对多能好可很所把当');
|
|
1954
2074
|
const questionChars = [...question]
|
|
1955
|
-
.filter(c => !/\s|[
|
|
2075
|
+
.filter(c => !/\s|[,,。!?、;::""''()()【】\[\]{}<>\/\\|~`@#$%^&*+=_-]/.test(c) && !STOP_CHARS.has(c));
|
|
1956
2076
|
if (questionChars.length === 0) return 0;
|
|
1957
2077
|
|
|
1958
2078
|
const uniqueChars = new Set(questionChars);
|
|
@@ -2003,7 +2123,7 @@ async function generateAnswerWithStream(
|
|
|
2003
2123
|
contextParts.push(`## ${header}\n`);
|
|
2004
2124
|
for (const s of group) {
|
|
2005
2125
|
sectionIndex++;
|
|
2006
|
-
contextParts.push(`### [${sectionIndex}] ${s.title}
|
|
2126
|
+
contextParts.push(`### [${sectionIndex}] ${s.title} - ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
|
|
2007
2127
|
}
|
|
2008
2128
|
contextParts.push('');
|
|
2009
2129
|
}
|
|
@@ -2014,7 +2134,7 @@ async function generateAnswerWithStream(
|
|
|
2014
2134
|
|
|
2015
2135
|
const context = contextParts.join('\n');
|
|
2016
2136
|
|
|
2017
|
-
const prompt =
|
|
2137
|
+
const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
|
|
2018
2138
|
|
|
2019
2139
|
## 问题
|
|
2020
2140
|
${question}
|
|
@@ -2024,13 +2144,13 @@ ${question}
|
|
|
2024
2144
|
${context}
|
|
2025
2145
|
|
|
2026
2146
|
## 回答要求
|
|
2027
|
-
-
|
|
2028
|
-
-
|
|
2147
|
+
- 仅基于提供的知识库内容回答,不要编造信息
|
|
2148
|
+
- 如果知识库中没有相关信息,请明确说明
|
|
2029
2149
|
- 引用来源时使用 [[slug|标题]] 的格式
|
|
2030
2150
|
- 使用清晰的 markdown 格式
|
|
2031
|
-
-
|
|
2151
|
+
- 如果涉及时间线信息,请在回答中体现
|
|
2032
2152
|
- 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
|
|
2033
|
-
-
|
|
2153
|
+
- 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
|
|
2034
2154
|
|
|
2035
2155
|
## 回答`;
|
|
2036
2156
|
|
|
@@ -2045,10 +2165,10 @@ ${context}
|
|
|
2045
2165
|
|
|
2046
2166
|
try {
|
|
2047
2167
|
const url = llm.baseURL.endsWith("/") ? llm.baseURL + "chat/completions" : llm.baseURL + "/chat/completions";
|
|
2048
|
-
|
|
2168
|
+
|
|
2049
2169
|
// Show thinking indicator while waiting for first token
|
|
2050
2170
|
process.stderr.write(`\x1b[35m💭\x1b[0m \x1b[2mConnecting to ${llm.model}...\x1b[0m\n`);
|
|
2051
|
-
|
|
2171
|
+
|
|
2052
2172
|
const resp = await fetch(
|
|
2053
2173
|
url,
|
|
2054
2174
|
{
|
|
@@ -2063,7 +2183,7 @@ ${context}
|
|
|
2063
2183
|
messages: [
|
|
2064
2184
|
{
|
|
2065
2185
|
role: "system",
|
|
2066
|
-
content: "
|
|
2186
|
+
content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
|
|
2067
2187
|
},
|
|
2068
2188
|
{ role: "user", content: prompt },
|
|
2069
2189
|
],
|
|
@@ -2172,7 +2292,7 @@ async function generateAnswerWithContext(
|
|
|
2172
2292
|
contextParts.push(`## ${header}\n`);
|
|
2173
2293
|
for (const s of group) {
|
|
2174
2294
|
sectionIndex++;
|
|
2175
|
-
contextParts.push(`### [${sectionIndex}] ${s.title}
|
|
2295
|
+
contextParts.push(`### [${sectionIndex}] ${s.title} - ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
|
|
2176
2296
|
}
|
|
2177
2297
|
contextParts.push('');
|
|
2178
2298
|
}
|
|
@@ -2183,7 +2303,7 @@ async function generateAnswerWithContext(
|
|
|
2183
2303
|
|
|
2184
2304
|
const context = contextParts.join('\n');
|
|
2185
2305
|
|
|
2186
|
-
const prompt =
|
|
2306
|
+
const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。
|
|
2187
2307
|
|
|
2188
2308
|
## 问题
|
|
2189
2309
|
${question}
|
|
@@ -2193,13 +2313,13 @@ ${question}
|
|
|
2193
2313
|
${context}
|
|
2194
2314
|
|
|
2195
2315
|
## 回答要求
|
|
2196
|
-
-
|
|
2197
|
-
-
|
|
2316
|
+
- 仅基于提供的知识库内容回答,不要编造信息
|
|
2317
|
+
- 如果知识库中没有相关信息,请明确说明
|
|
2198
2318
|
- 引用来源时使用 [[slug|标题]] 的格式
|
|
2199
2319
|
- 使用清晰的 markdown 格式
|
|
2200
|
-
-
|
|
2320
|
+
- 如果涉及时间线信息,请在回答中体现
|
|
2201
2321
|
- 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
|
|
2202
|
-
-
|
|
2322
|
+
- 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)
|
|
2203
2323
|
|
|
2204
2324
|
## 回答`;
|
|
2205
2325
|
|
|
@@ -2217,7 +2337,7 @@ ${context}
|
|
|
2217
2337
|
messages: [
|
|
2218
2338
|
{
|
|
2219
2339
|
role: "system",
|
|
2220
|
-
content: "
|
|
2340
|
+
content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
|
|
2221
2341
|
},
|
|
2222
2342
|
{ role: "user", content: prompt },
|
|
2223
2343
|
],
|