ex-brain 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,486 @@
1
+ import { Command } from "commander";
2
+ import { BrainRepository } from "../repositories/brain-repo";
3
+ import { loadSettings, type ResolvedLLM } from "../settings";
4
+ import { withRepo, isJson, print } from "./shared";
5
+ import { createProgress, formatDuration } from "../utils/progress";
6
+
7
+ // ---------------------------------------------------------------------------
8
+ // Context collection for LLM answers
9
+ // ---------------------------------------------------------------------------
10
+
11
/**
 * Origin of a context section: the matched page itself, a raw source
 * document attached to a page, or a page reached through a link.
 */
type ContextSectionKind = "primary" | "raw_data" | "linked";

/** One chunk of knowledge-base text that is handed to the LLM as context. */
interface ContextSection {
  /** Which retrieval layer produced this section. */
  type: ContextSectionKind;
  /** Slug of the page the content belongs to. */
  slug: string;
  /** Title of that page, used in prompt headings and the sources list. */
  title: string;
  /** The text itself (possibly truncated to fit the context budget). */
  content: string;
  /** Short human-readable description of where the content came from. */
  label: string;
}
18
+
19
/** Counters describing what went into (or was left out of) the LLM context. */
interface ContextStats {
  /** Number of matched pages whose own content was included. */
  primaryPages: number;
  /** Number of raw source documents included. */
  rawDocs: number;
  /** Number of linked pages included. */
  linkedPages: number;
  /** Characters dropped because of the context size budget. */
  skippedChars: number;
}
25
+
26
/**
 * Gather the text that will be sent to the LLM for a question.
 *
 * Content is collected in three layers, in priority order:
 *   1. the matched pages themselves (compiled truth + timeline),
 *   2. raw source documents attached to the matched pages,
 *   3. pages linked to/from the matched pages, ranked by keyword overlap
 *      with the question, plus their raw documents when the overlap is high.
 *
 * The very first section is always kept whole so the answer never runs on an
 * empty context; every later section is trimmed to the remaining budget or
 * dropped when nothing meaningful fits.
 *
 * @param repo       Repository used to read pages, raw rows and links.
 * @param hits       Search hits to build the context from (highest priority first).
 * @param question   The user question, used to rank linked pages.
 * @param maxChars   Character budget for the whole context.
 * @param onProgress Optional callback invoked with the name of each layer as it starts.
 * @returns The collected sections, their combined length and inclusion statistics.
 */
async function collectContextForLLM(
  repo: BrainRepository,
  hits: Array<{ slug: string; title: string; score: number }>,
  question: string,
  maxChars: number,
  onProgress?: (stage: string) => void,
): Promise<{ sections: ContextSection[]; totalChars: number; stats: ContextStats }> {
  type Page = NonNullable<Awaited<ReturnType<typeof repo.getPage>>>;

  const TRUNCATION_MARKER = '\n...[truncated]';
  const MIN_LINKED_SCORE = 0.02;
  const LINKED_RAW_MIN_SCORE = 0.1;
  const LINKED_RAW_MIN_CHARS = 100;
  const SCORING_PREFIX_CHARS = 2000;

  const sections: ContextSection[] = [];
  let totalChars = 0;
  const stats: ContextStats = {
    primaryPages: 0,
    rawDocs: 0,
    linkedPages: 0,
    skippedChars: 0,
  };
  const seenKeys = new Set<string>();

  /**
   * Trim `content` to the remaining budget and record what was dropped.
   * Returns an empty string when not even the truncation marker fits, so the
   * slice index can never go negative.
   */
  function fitToBudget(content: string): string {
    const budget = maxChars - totalChars;
    if (content.length <= budget) return content;
    const keep = budget - TRUNCATION_MARKER.length;
    if (keep <= 0) {
      stats.skippedChars += content.length;
      return '';
    }
    stats.skippedChars += content.length - keep;
    return content.slice(0, keep) + TRUNCATION_MARKER;
  }

  /** Append a section unless it is a duplicate or nothing of it fits; returns whether it was added. */
  function addSection(section: ContextSection): boolean {
    const key = `${section.type}:${section.slug}:${section.label}`;
    if (seenKeys.has(key)) return false;

    // The first section is deliberately exempt from truncation.
    const content = sections.length > 0 ? fitToBudget(section.content) : section.content;
    if (content.length === 0) return false;

    // Store a copy so the caller's object is never mutated.
    sections.push({ ...section, content });
    totalChars += content.length;
    seenKeys.add(key);
    return true;
  }

  const pageCache = new Map<string, Page>();

  // Layer 1: Primary pages
  onProgress?.('page content');
  for (const hit of hits) {
    const page = await repo.getPage(hit.slug);
    if (!page) continue;
    pageCache.set(hit.slug, page);

    const parts: string[] = [];
    const truth = page.compiledTruth?.trim();
    if (truth) parts.push(truth);
    const timeline = page.timeline?.trim();
    if (timeline) parts.push(`## 时间线\n${timeline}`);
    if (parts.length === 0) continue;

    const added = addSection({
      type: 'primary',
      slug: page.slug,
      title: page.title,
      content: parts.join('\n\n'),
      label: `页面正文`,
    });
    if (added) stats.primaryPages++;
  }

  // Layer 2: Raw data
  onProgress?.('raw documents');
  for (const hit of hits) {
    if (totalChars >= maxChars) break; // nothing more can fit; skip the reads
    try {
      const rawRows = await repo.readRaw(hit.slug) as Array<{ source: string; data: unknown; fetchedAt?: string }>;
      for (const row of rawRows) {
        let rawContent = '';
        if (typeof row.data === 'string') {
          rawContent = row.data;
        } else if (typeof row.data === 'object' && row.data !== null) {
          rawContent = JSON.stringify(row.data, null, 2);
        }
        if (!rawContent.trim()) continue;

        const added = addSection({
          type: 'raw_data',
          slug: hit.slug,
          title: hit.title,
          content: rawContent,
          label: `原始文档 (${row.source})`,
        });
        if (added) stats.rawDocs++;
      }
    } catch { /* raw documents are best-effort; a failed read is non-fatal */ }
  }

  // Layer 3: Linked pages
  onProgress?.('linked pages');
  const hitSlugs = new Set(hits.map(h => h.slug));
  const allLinkedSlugs = new Set<string>();
  for (const hit of hits) {
    try {
      const outLinks = await repo.outgoingLinks(hit.slug);
      for (const link of outLinks) allLinkedSlugs.add(link.slug);
    } catch { /* links are best-effort */ }
    try {
      const backlinkSlugs = await repo.backlinks(hit.slug);
      for (const slug of backlinkSlugs) allLinkedSlugs.add(slug);
    } catch { /* links are best-effort */ }
  }

  const candidates: Array<{ slug: string; page: Page; body: string; score: number }> = [];
  for (const slug of allLinkedSlugs) {
    // A hit is handled by layer 1 (compiled truth + timeline); adding it again
    // as a linked page would only duplicate text and burn budget.
    if (hitSlugs.has(slug)) continue;

    let page = pageCache.get(slug);
    if (!page) {
      const fetched = await repo.getPage(slug);
      if (!fetched) continue;
      pageCache.set(slug, fetched);
      page = fetched;
    }

    const body = page.compiledTruth?.trim();
    if (!body) continue;

    const scoringText = `${page.title} ${page.compiledTruth ?? ''}`.slice(0, SCORING_PREFIX_CHARS);
    const score = computeKeywordRelevance(scoringText, question);
    if (score >= MIN_LINKED_SCORE) candidates.push({ slug, page, body, score });
  }
  candidates.sort((a, b) => b.score - a.score);

  for (const { slug, page, body, score } of candidates) {
    if (totalChars >= maxChars) break;

    // Linked pages are always trimmed to the budget, even if they would be the first section.
    const content = fitToBudget(body);
    const added = content.length > 0 && addSection({
      type: 'linked',
      slug: page.slug,
      title: page.title,
      content,
      label: `关联页面: ${page.slug} (相关度: ${(score * 100).toFixed(1)}%)`,
    });
    if (!added) continue;
    stats.linkedPages++;

    // Only strongly related pages get their raw documents pulled in as well.
    if (score <= LINKED_RAW_MIN_SCORE) continue;
    try {
      const rawRows = await repo.readRaw(slug) as Array<{ source: string; data: unknown }>;
      for (const row of rawRows) {
        if (totalChars >= maxChars) break;
        // JSON.stringify returns undefined for undefined/function input; treat that as empty.
        const raw = typeof row.data === 'string' ? row.data : (JSON.stringify(row.data) ?? '');
        if (raw.trim().length <= LINKED_RAW_MIN_CHARS) continue;

        const rawContent = fitToBudget(raw);
        const rawAdded = rawContent.length > 0 && addSection({
          type: 'raw_data',
          slug,
          title: page.title,
          content: rawContent,
          label: `原始文档 (关联: ${row.source})`,
        });
        if (rawAdded) stats.rawDocs++;
      }
    } catch { /* raw documents are best-effort */ }
  }

  return { sections, totalChars, stats };
}
209
+
210
/** Very common single Chinese characters that carry no topical signal. */
const KEYWORD_STOP_CHARS: ReadonlySet<string> = new Set(
  '的是了在和我有你就这不人都说上个大国为到以们年会生地要主中子自实家小对多能好可很所把当',
);

/** Whitespace and punctuation that are ignored when extracting question characters. */
const KEYWORD_IGNORED_CHAR = /\s|[,,。!?、;::""''()()【】\[\]{}<>\/\\|~`@#$%^&*+=_-]/;

/**
 * Cheap character-overlap relevance score between a text and a question.
 *
 * The question is reduced to its distinct, case-folded characters (ignoring
 * whitespace, punctuation and stop characters); the score is the fraction of
 * those characters that occur anywhere in `text`, compared case-insensitively.
 *
 * Characters are case-folded *before* de-duplication so that "A" and "a" in
 * the question count as one character rather than two.
 *
 * @param text     Candidate text to score.
 * @param question The user question.
 * @returns A value in [0, 1]; 0 when the question has no significant characters.
 */
function computeKeywordRelevance(text: string, question: string): number {
  const wanted = new Set<string>();
  for (const ch of question) {
    if (KEYWORD_IGNORED_CHAR.test(ch) || KEYWORD_STOP_CHARS.has(ch)) continue;
    wanted.add(ch.toLowerCase());
  }
  if (wanted.size === 0) return 0;

  const haystack = text.toLowerCase();
  let matched = 0;
  for (const ch of wanted) {
    if (haystack.includes(ch)) matched++;
  }
  return matched / wanted.size;
}
224
+
225
/**
 * Ask the configured chat-completions endpoint to answer `question` from the
 * collected context, streaming the answer to stdout as it arrives.
 *
 * Status messages go to stderr; answer text goes to stdout. Failures are
 * reported through the return value (`ok: false` with an `Error: ...` message)
 * rather than by throwing.
 *
 * @param question The user question.
 * @param sections Context sections; they are grouped by type in the prompt and
 *                 numbered consecutively across groups.
 * @param stats    Context statistics. Currently unused; kept so the signature
 *                 stays stable for callers.
 * @param llm      Resolved LLM settings (base URL, model, API key or env var name).
 * @returns The full answer text and whether the call succeeded.
 */
async function generateAnswerWithStream(
  question: string,
  sections: ContextSection[],
  stats: ContextStats,
  llm: ResolvedLLM,
): Promise<{ answer: string; ok: boolean }> {
  const apiKey = llm.apiKey || process.env[llm.apiKeyEnv] || "";
  if (!apiKey) {
    return { answer: "Error: LLM API key not configured.", ok: false };
  }

  if (sections.length === 0) {
    return { answer: "知识库中没有找到相关内容。", ok: true };
  }

  const contextParts: string[] = [];
  let sectionIndex = 0;

  function renderSections(group: ContextSection[], header: string): void {
    if (group.length === 0) return;
    contextParts.push(`## ${header}\n`);
    for (const s of group) {
      sectionIndex++;
      contextParts.push(`### [${sectionIndex}] ${s.title} - ${s.label}\n**Slug:** ${s.slug}\n\n${s.content}\n`);
    }
    contextParts.push('');
  }

  renderSections(sections.filter(s => s.type === 'primary'), '页面正文');
  renderSections(sections.filter(s => s.type === 'raw_data'), '原始文档');
  renderSections(sections.filter(s => s.type === 'linked'), '关联页面');

  const context = contextParts.join('\n');

  const prompt = `你是一个知识库助手,请根据提供的知识库内容回答问题。

## 问题
${question}

## 知识库内容

${context}

## 回答要求
- 仅基于提供的知识库内容回答,不要编造信息
- 如果知识库中没有相关信息,请明确说明
- 引用来源时使用 [[slug|标题]] 的格式
- 使用清晰的 markdown 格式
- 如果涉及时间线信息,请在回答中体现
- 区分哪些信息来自「页面正文」、哪些来自「原始文档」、哪些来自「关联页面」
- 语言与提问保持一致(中文提问用中文回答,英文提问用英文回答)

## 回答`;

  try {
    const url = llm.baseURL.endsWith("/") ? llm.baseURL + "chat/completions" : llm.baseURL + "/chat/completions";

    process.stderr.write(`\x1b[35m💭\x1b[0m \x1b[2mConnecting to ${llm.model}...\x1b[0m\n`);

    const resp = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: llm.model,
        stream: true,
        messages: [
          {
            role: "system",
            content: "你是一个专业的知识库助手,基于提供的知识库内容准确回答问题。引用来源时使用 [[slug|标题]] 格式。回答要条理清晰,区分信息来源。",
          },
          { role: "user", content: prompt },
        ],
        temperature: 0.3,
        max_tokens: 4096,
        // The "thinking disabled" switch is sent both top-level and under
        // extra_body, exactly as before, just without the duplicated keys.
        thinking: { type: "disabled" },
        extra_body: { thinking: { type: "disabled" } },
      }),
      // NOTE(review): this signal presumably stays attached while the body is
      // streamed, so answers taking longer than 30s may be cut off — confirm.
      signal: AbortSignal.timeout(30_000),
    });

    if (!resp.ok) {
      const text = await resp.text();
      process.stderr.write("\r\x1b[K");
      return { answer: `Error: LLM API failed (${resp.status}): ${text.slice(0, 200)}`, ok: false };
    }

    if (!resp.body) {
      process.stderr.write("\r\x1b[K");
      return { answer: "Error: No response body from LLM API.", ok: false };
    }

    process.stderr.write("\r\x1b[K");
    process.stderr.write(`\x1b[32m✦\x1b[0m \x1b[2mStreaming response...\x1b[0m\n`);

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let fullAnswer = "";
    let buffer = "";

    /** Handle one SSE line: echo and accumulate any content delta it carries. */
    const handleLine = (line: string): void => {
      const trimmed = line.trim();
      if (!trimmed.startsWith("data:")) return;
      const payload = trimmed.slice(5).trim();
      if (!payload || payload === "[DONE]") return;
      try {
        const json = JSON.parse(payload) as { choices?: Array<{ delta?: { content?: unknown } }> };
        const content = json.choices?.[0]?.delta?.content;
        if (typeof content === "string" && content) {
          process.stdout.write(content);
          fullAnswer += content;
        }
      } catch { /* skip malformed SSE */ }
    };

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // The last element is an incomplete line (or ""); keep it for the next chunk.
      buffer = lines.pop() ?? "";
      for (const line of lines) handleLine(line);
    }

    // Flush bytes still held by the decoder and process a final line that was
    // not newline-terminated; otherwise the last delta would be lost.
    buffer += decoder.decode();
    if (buffer) handleLine(buffer);

    process.stdout.write("\n");

    return { answer: fullAnswer || "(No answer generated)", ok: true };
  } catch (error) {
    const msg = error instanceof Error ? error.message : String(error);
    return { answer: `Error: ${msg}`, ok: false };
  }
}
374
+
375
+ // ---------------------------------------------------------------------------
376
+ // Query command
377
+ // ---------------------------------------------------------------------------
378
+
379
/**
 * Parse a numeric CLI option, falling back to `fallback` when the value is
 * missing, not a number, or smaller than 1. Fractions are rounded down.
 */
function parseCountOption(raw: string | undefined, fallback: number): number {
  const parsed = Number(raw);
  return raw !== undefined && Number.isFinite(parsed) && parsed >= 1 ? Math.floor(parsed) : fallback;
}

/**
 * Register the `query` (semantic search, optionally answered by an LLM) and
 * `search` (full-text search) commands on the given commander program.
 *
 * @param program The root commander program the commands are attached to.
 */
export function registerQueryCommand(program: Command): void {
  // Order in which section types are grouped and numbered in the LLM prompt.
  const SOURCE_ORDER = ['primary', 'raw_data', 'linked'] as const;
  const SOURCE_ICONS = { primary: '📄', raw_data: '📎', linked: '🔗' } as const;

  program
    .command("query")
    .argument("<question>", "natural language question")
    .option("--limit <number>", "max results", "10")
    .option("--llm", "use LLM to answer based on retrieved context", false)
    .option("--context-limit <number>", "max pages to use as context", "5")
    .description("semantic / vector search")
    .addHelpText(
      "after",
      `
Examples:
ebrain query "What projects did we ship in Q4?"
ebrain query "Who leads the ML team?" --limit 5
ebrain query "What are the key findings?" --llm
`,
    )
    .action(async (question: string, opts: { limit?: string; llm?: boolean; contextLimit?: string }) => {
      await withRepo(program, async (repo) => {
        const limit = parseCountOption(opts.limit, 10);
        const hits = await repo.query(question, limit);

        if (!opts.llm) {
          print(program, hits);
          return;
        }

        const settings = await loadSettings();
        if (!settings.llm.baseURL) {
          print(program, { error: "LLM not configured. Set llm.baseURL in settings." });
          return;
        }

        const progress = createProgress();
        progress.start("Searching knowledge base...");

        const contextLimit = parseCountOption(opts.contextLimit, 5);
        const topHits = hits.slice(0, contextLimit);

        if (topHits.length === 0) {
          progress.stop();
          process.stderr.write("No relevant pages found.\n");
          print(program, { answer: "No relevant information found in the knowledge base.", sources: [] });
          return;
        }

        const MAX_CONTEXT_CHARS = 100_000;
        const ctxStart = Date.now();
        progress.update(`Loading page content...`);
        const { sections, stats } = await collectContextForLLM(repo, topHits, question, MAX_CONTEXT_CHARS, (stage) => {
          progress.update(`Loading ${stage}...`);
        });
        const ctxDuration = formatDuration(Date.now() - ctxStart);

        if (sections.length === 0) {
          progress.stop();
          process.stderr.write("No content could be loaded.\n");
          print(program, { answer: "Failed to load page content.", sources: [] });
          return;
        }

        progress.succeed(`Loaded ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s) (${ctxDuration})`);

        const { answer, ok } = await generateAnswerWithStream(question, sections, stats, settings.llm);

        if (!ok) {
          // On failure `answer` carries the error message.
          console.log(answer);
          return;
        }

        // List sources in the same grouped order the prompt numbers them, so
        // the printed index matches any "[n]" the model refers to.
        const orderedSources = SOURCE_ORDER.flatMap((kind) => sections.filter((s) => s.type === kind));

        console.log("\n---\n**Sources:**\n");
        orderedSources.forEach((s, i) => {
          console.log(`${SOURCE_ICONS[s.type]} ${i + 1}. [[${s.slug}|${s.title}]] - ${s.label} (${(s.content.length / 1024).toFixed(1)}KB)`);
        });
        console.log(`\n*Context: ${stats.primaryPages} page(s), ${stats.rawDocs} raw doc(s), ${stats.linkedPages} linked page(s)*`);
      });
    });

  // -- search ---------------------------------------------------------------
  program
    .command("search")
    .argument("<query>", "full-text search query")
    .option("--type <type>", "filter by page type")
    .option("--limit <number>", "max results", "10")
    .description("full-text / hybrid search")
    .addHelpText(
      "after",
      `
Examples:
ebrain search "machine learning"
ebrain search "quarterly revenue" --type deal --limit 5
`,
    )
    .action(async (query: string, opts: { type?: string; limit?: string }) => {
      await withRepo(program, async (repo) => {
        const hits = await repo.search(
          query,
          parseCountOption(opts.limit, 10),
          opts.type,
        );
        print(program, hits);
      });
    });
}
@@ -0,0 +1,109 @@
1
+ import { createHash } from "node:crypto";
2
+ import { Command } from "commander";
3
+ import { BrainDb } from "../db/client";
4
+ import { BrainRepository } from "../repositories/brain-repo";
5
+ import { loadSettings } from "../settings";
6
+
7
+ // ---------------------------------------------------------------------------
8
+ // Dry-run helpers
9
+ // ---------------------------------------------------------------------------
10
+
11
/**
 * Attach the shared `--dry-run` flag (default off) to a command.
 *
 * @param cmd Command to extend.
 * @returns Whatever `cmd.option(...)` returns, so calls can be chained.
 */
export function addDryRun(cmd: Command): Command {
  const flag = "--dry-run";
  const help = "preview changes without executing";
  return cmd.option(flag, help, false);
}
14
+
15
/**
 * Tell whether the parsed options request a dry run.
 *
 * @param opts Parsed command options.
 * @returns `true` when `opts.dryRun` is truthy.
 */
export function isDryRun(opts: Record<string, unknown>): boolean {
  const requested = opts.dryRun;
  return !!requested;
}
18
+
19
+ // ---------------------------------------------------------------------------
20
+ // Content hashing
21
+ // ---------------------------------------------------------------------------
22
+
23
/**
 * Short content fingerprint: the first 16 hex characters (8 bytes) of the
 * SHA-256 digest of the UTF-8 encoded text.
 * Used for detecting duplicate document ingestion.
 *
 * @param text Text to fingerprint.
 * @returns A 16-character lowercase hex string.
 */
export function contentHash(text: string): string {
  const digest = createHash("sha256").update(text, "utf8").digest();
  // 8 bytes render as 16 hex characters.
  return digest.subarray(0, 8).toString("hex");
}
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // Output helpers
33
+ // ---------------------------------------------------------------------------
34
+
35
/**
 * Tell whether JSON output was requested on the program.
 *
 * @param program Commander program whose parsed options are inspected.
 * @returns `true` when the `json` option is truthy.
 */
export function isJson(program: Command): boolean {
  const { json } = program.opts();
  return Boolean(json);
}
38
+
39
/**
 * Render a payload for terminal output.
 *
 * Arrays become a "- item" bullet list (strings verbatim, everything else as
 * compact JSON); any other value is pretty-printed as JSON with 2-space indent.
 *
 * @param payload Value to render.
 * @returns The rendered text.
 */
function formatHuman(payload: unknown): string {
  if (!Array.isArray(payload)) {
    return JSON.stringify(payload, null, 2);
  }

  const bullets: string[] = [];
  for (const entry of payload) {
    const rendered = typeof entry === "string" ? entry : JSON.stringify(entry);
    bullets.push(`- ${rendered}`);
  }
  return bullets.join("\n");
}
51
+
52
/**
 * Write a payload to stdout in the format the user asked for.
 *
 * With the JSON flag set, the payload is always pretty-printed JSON. Otherwise
 * strings are printed verbatim and everything else goes through the
 * human-readable formatter.
 *
 * @param program Commander program, consulted for the JSON flag.
 * @param payload Value to print.
 */
export function print(program: Command, payload: unknown): void {
  let output: string;
  if (isJson(program)) {
    output = JSON.stringify(payload, null, 2);
  } else if (typeof payload === "string") {
    output = payload;
  } else {
    output = formatHuman(payload);
  }
  console.log(output);
}
63
+
64
+ // ---------------------------------------------------------------------------
65
+ // Database session helper
66
+ // ---------------------------------------------------------------------------
67
+
68
/**
 * Connect to the database, hand a repository to `callback`, then terminate
 * the process.
 *
 * The connection is intentionally never closed: seekdb's embedded native
 * close() segfaults. seekdb is an embedded database and flushes data to its
 * WAL on each transaction; in remote mode the server handles cleanup, and in
 * embedded mode exiting the process is safe and avoids the native crash.
 *
 * @param program  Commander program; its `db` option overrides the configured path.
 * @param callback Work to run against the repository.
 */
export async function withRepo(
  program: Command,
  callback: (repo: BrainRepository) => Promise<void>,
): Promise<void> {
  const settings = await loadSettings();
  const dbPath = program.opts().db ?? settings.dbPath;
  const db = await BrainDb.connect(dbPath, settings);
  await callback(new BrainRepository(db));

  // Wait for pending output to be written (stdout first, then stderr) so
  // results are visible before the process goes away.
  const drain = (stream: typeof process.stdout): Promise<void> =>
    new Promise<void>((resolve) => {
      stream.write("", () => resolve());
    });
  await drain(process.stdout);
  await drain(process.stderr);

  // Exit code 139 (segfault) from seekdb's atexit hooks in the native
  // embedded server is unavoidable without an upstream fix.
  process.exit(0);
}
98
+
99
+ // ---------------------------------------------------------------------------
100
+ // Link slug normalization (used by import/wiki-link processing)
101
+ // ---------------------------------------------------------------------------
102
+
103
/**
 * Turn a link path into a slug: backslashes become forward slashes, every
 * leading "./" or "../" segment is removed, and a trailing ".md" is dropped.
 *
 * All leading relative segments are stripped, not just the first one, so
 * "../../notes/a.md" yields "notes/a". Segments in the middle of the path
 * are left untouched.
 *
 * @param path Link target as written in the source document.
 * @returns The normalized slug.
 */
export function normalizeLinkSlug(path: string): string {
  return path
    .replace(/\\/g, "/")
    .replace(/^(?:\.\.?\/)+/, "")
    .replace(/\.md$/, "");
}