ex-brain 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import { dirname, extname, resolve } from "node:path";
2
+ import { Command } from "commander";
3
+ import { stat } from "node:fs/promises";
4
+ import { inferTypeFromSlug, slugToTitle, normalizeLongSlug, slugify } from "../slug-utils";
5
+ import { loadDocument, collectDocumentFiles, detectKind, type DocumentKind } from "../markdown/document-loader";
6
+ import { collectMarkdownFiles, pathToSlug, readTextFile } from "../markdown/io";
7
+ import { parsePageMarkdown, extractWikiStyleLinks, extractTimelineLines } from "../markdown/parser";
8
+ import { extractRelations, entityToSlug, type EntityType, type RelationType, type EntityRef } from "../ai/entity-link";
9
+ import { loadSettings } from "../settings";
10
+ import { BrainRepository } from "../repositories/brain-repo";
11
+ import { addDryRun, isDryRun, contentHash, withRepo, isJson, print, normalizeLinkSlug } from "./shared";
12
+ import { success, warning, subItem, keyValue, header, createSpinner } from "../utils/cli-output";
13
+ import { formatDuration } from "../utils/progress";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Helpers
17
+ // ---------------------------------------------------------------------------
18
+
19
/** Lower-case file extensions (without the leading dot) treated as non-markdown documents. */
const DOC_EXTENSIONS = new Set<string>([
  "pdf",
  "docx",
  "doc",
  "html",
  "htm",
  "json",
  "txt",
  "text",
]);

/**
 * Decide whether a path should be handled as a non-markdown document.
 *
 * @param filePath - Path whose extension is inspected (case-insensitively).
 * @param forceKind - Optional explicit kind. Any non-empty value other than
 *   `"markdown"` short-circuits to `true` without looking at the extension.
 * @returns `true` when the kind is forced to something other than markdown, or
 *   when the extension is listed in `DOC_EXTENSIONS`.
 */
function isDocumentFile(filePath: string, forceKind?: string): boolean {
  const forcedToNonMarkdown = Boolean(forceKind) && forceKind !== "markdown";
  if (forcedToNonMarkdown) {
    return true;
  }

  // `extname` yields either "" or a string starting with "."; drop that dot.
  const dotted = extname(filePath).toLowerCase();
  const bare = dotted.startsWith(".") ? dotted.slice(1) : dotted;
  return DOC_EXTENSIONS.has(bare);
}
28
+
29
/**
 * Expand CLI path arguments into the markdown files they refer to.
 *
 * Every argument is resolved to an absolute path and stat'ed:
 * - a directory is expanded through `collectMarkdownFiles`, and the directory
 *   itself is recorded as the `root` of each file found;
 * - a regular file is accepted only when its extension is `.md`
 *   (case-insensitive), with its parent directory as `root`;
 * - anything else is skipped.
 *
 * A file reachable through several arguments (for example `*.md .`, or a
 * directory given twice) is returned only once, keeping the `root` of the first
 * argument that produced it, so callers never process the same file twice.
 *
 * @param paths - Files and/or directories, relative or absolute.
 * @returns The collected entries, sorted by `file` using `localeCompare`.
 * @throws Whatever `stat` rejects with, e.g. when a path does not exist.
 */
async function collectMarkdownFilesFromPaths(paths: string[]): Promise<Array<{ file: string; root: string }>> {
  const results: Array<{ file: string; root: string }> = [];
  const seen = new Set<string>();

  // Record a file unless an earlier argument already contributed it.
  const add = (file: string, root: string): void => {
    if (seen.has(file)) return;
    seen.add(file);
    results.push({ file, root });
  };

  for (const p of paths) {
    const rp = resolve(p);
    const s = await stat(rp);
    if (s.isDirectory()) {
      const mdFiles = await collectMarkdownFiles(rp);
      for (const f of mdFiles) add(f, rp);
    } else if (s.isFile() && extname(rp).toLowerCase() === ".md") {
      add(rp, dirname(rp));
    }
  }

  return results.sort((a, b) => a.file.localeCompare(b.file));
}
43
+
44
/**
 * Expand CLI path arguments into the non-markdown document files they refer to.
 *
 * Every argument is resolved to an absolute path and stat'ed:
 * - a directory is expanded through `collectDocumentFiles`, and the directory
 *   itself is recorded as the `root` of each file found;
 * - a regular file is accepted only when `isDocumentFile` recognises it, with
 *   its parent directory as `root`;
 * - anything else is skipped.
 *
 * A file reachable through several arguments (for example `*.pdf .`, or a
 * directory given twice) is returned only once, keeping the `root` of the first
 * argument that produced it, so callers never process the same file twice.
 *
 * @param paths - Files and/or directories, relative or absolute.
 * @returns The collected entries, sorted by `file` using `localeCompare`.
 * @throws Whatever `stat` rejects with, e.g. when a path does not exist.
 */
async function collectDocumentFilesFromPaths(paths: string[]): Promise<Array<{ file: string; root: string }>> {
  const results: Array<{ file: string; root: string }> = [];
  const seen = new Set<string>();

  // Record a file unless an earlier argument already contributed it.
  const add = (file: string, root: string): void => {
    if (seen.has(file)) return;
    seen.add(file);
    results.push({ file, root });
  };

  for (const p of paths) {
    const rp = resolve(p);
    const s = await stat(rp);
    if (s.isDirectory()) {
      const docFiles = await collectDocumentFiles(rp);
      for (const f of docFiles) add(f, rp);
    } else if (s.isFile() && isDocumentFile(rp)) {
      add(rp, dirname(rp));
    }
  }

  return results.sort((a, b) => a.file.localeCompare(b.file));
}
58
+
59
/**
 * A single relation between two entities, in the shape consumed by the import
 * command's entity-linking phase.
 */
interface EntityRelation {
  /** Discriminator; always the literal `"relation"`. */
  type: "relation";

  /** Kind of relationship; rendered in brackets at the start of the link label. */
  relation: RelationType;
  /** Entity on the source side of the relation. */
  from: EntityRef;
  /** Entity on the target side of the relation. */
  to: EntityRef;

  /** Free-text context; appended to the link label and passed to the entity page. */
  context: string;
  /** Score compared against the import command's 0.6 threshold. */
  confidence: number;
}
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Import command
70
+ // ---------------------------------------------------------------------------
71
+
72
/**
 * Register the `import` sub-command on `program`.
 *
 * The command takes one or more files/directories and runs these phases:
 *
 * 1. Parse every markdown file (frontmatter, wiki links, timeline lines, tags).
 * 2. (1.5) Load every non-markdown document through `loadDocument`.
 * 3. Write one page per file with `repo.putPage`.
 * 4. When `settings.llm.baseURL` is set, extract entity relations with the LLM,
 *    `BATCH_SIZE` pages at a time.
 * 5. Create links, tags, timeline entries and entity pages.
 * 6. Unless `--skip-index` is given, call `repo.embedAll()`.
 *
 * Per-file failures while loading documents, writing pages, or extracting
 * entities are collected in `writeErrors` and never abort the run. They are
 * shown in the human summary and returned as `errors` in the printed result.
 *
 * With `--dry-run` nothing is written: the command only reports the resolved
 * paths plus the markdown (`filesFound`, `slugs`) and document
 * (`docFilesFound`, `docSlugs`) files it would import.
 *
 * @param program - The commander program the sub-command is attached to.
 */
export function registerImportCommand(program: Command): void {
  /** Relations scoring below this confidence are ignored when linking entities. */
  const MIN_RELATION_CONFIDENCE = 0.6;
  /** Number of pages whose entity extraction is awaited together. */
  const BATCH_SIZE = 10;

  /** Render any thrown value as a short message. */
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  addDryRun(
    program
      .command("import")
      .argument("<paths...>", "directories or files (markdown, PDF, DOCX) to import")
      .description("import markdown, PDF, and DOCX files — accepts directories (recursive) and/or individual files")
      .option("--skip-index", "skip vector indexing (useful if seekdb crashes)")
      .addHelpText(
        "after",
        `
Examples:
ebrain import ./docs # import a directory
ebrain import *.docx # import matching files (shell glob)
ebrain import report.pdf notes.md ./docs # mix of files and directories
ebrain import ./docs --dry-run
ebrain import ./docs --skip-index # skip vector indexing
`,
      ),
  ).action(async (paths: string[], opts: { dryRun?: boolean; skipIndex?: boolean }) => {
    await withRepo(program, async (repo) => {
      const mdEntries = await collectMarkdownFilesFromPaths(paths);
      const files = mdEntries.map((e) => e.file);

      if (isDryRun(opts)) {
        // The real import also ingests documents, so preview them as well.
        const dryRunDocEntries = await collectDocumentFilesFromPaths(paths);
        print(program, {
          dryRun: true,
          action: "import",
          paths: paths.map((p) => resolve(p)),
          filesFound: files.length,
          slugs: mdEntries.map((e) => pathToSlug(e.file, e.root)),
          docFilesFound: dryRunDocEntries.length,
          docSlugs: dryRunDocEntries.map((e) => pathToSlug(e.file, e.root)),
        });
        return;
      }

      const jsonOut = isJson(program);
      const settings = await loadSettings();
      const spinner = createSpinner();
      const startTime = Date.now();

      if (!jsonOut) {
        header(`Import: ${paths.map((p) => resolve(p)).join(", ")}`);
      }

      // Phase 1: Parse all markdown files and collect their data
      if (!jsonOut) {
        spinner.start(`Scanning ${files.length} files...`);
      }

      const fileData: Array<{
        file: string;
        slug: string;
        parsed: ReturnType<typeof parsePageMarkdown>;
        content: string;
        wikiLinks: string[];
        timelineEntries: ReturnType<typeof extractTimelineLines>;
        tags: string[];
      }> = [];

      for (const { file, root } of mdEntries) {
        const slug = normalizeLongSlug(pathToSlug(file, root));
        const content = await readTextFile(file);
        const parsed = parsePageMarkdown(content);
        const wikiLinks = extractWikiStyleLinks(content).map(normalizeLinkSlug);
        const timelineEntries = extractTimelineLines(parsed.timeline);
        // Only string tags are kept; a non-array `tags` value yields no tags.
        const tags = Array.isArray(parsed.frontmatter.tags)
          ? parsed.frontmatter.tags.filter((t): t is string => typeof t === "string")
          : [];
        fileData.push({ file, slug, parsed, content, wikiLinks, timelineEntries, tags });
      }

      if (!jsonOut) {
        spinner.succeed(`Found ${files.length} markdown files`);
      }

      // Phase 1.5: Scan for and load non-markdown documents
      // Collects every per-file failure from this phase onwards.
      const writeErrors: string[] = [];

      if (!jsonOut) {
        spinner.start("Scanning for PDF/DOCX files...");
      }
      const docEntries = await collectDocumentFilesFromPaths(paths);
      const docFilePaths = docEntries.map((e) => e.file);

      const docFileData: Array<{
        file: string;
        slug: string;
        content: string;
        kind: DocumentKind;
        fileName: string;
        sourceRef: string;
        sourceType: "file" | "url";
        mimeType: string | undefined;
        bytes: number;
        metadata: Record<string, unknown>;
      }> = [];

      for (const [i, { file, root }] of docEntries.entries()) {
        if (!jsonOut) {
          spinner.update(`Extracting documents... ${i + 1}/${docFilePaths.length}`);
        }
        try {
          const loaded = await loadDocument(file, { forceKind: detectKind({ fileName: file }) });
          const slug = normalizeLongSlug(pathToSlug(file, root));
          docFileData.push({
            file,
            slug,
            content: loaded.text,
            kind: loaded.kind,
            fileName: loaded.fileName,
            sourceRef: loaded.source,
            sourceType: loaded.sourceType,
            mimeType: loaded.mimeType,
            bytes: loaded.bytes,
            metadata: loaded.metadata,
          });
        } catch (err) {
          writeErrors.push(`${file}: ${describeError(err)}`);
        }
      }

      if (!jsonOut) {
        spinner.succeed(`Found ${docFilePaths.length} PDF/DOCX files`);
        if (writeErrors.length > 0) {
          warning(`${writeErrors.length} files failed to extract`);
        }
      }

      // Phase 2: Write all pages first (skip embed for performance)
      const totalPages = fileData.length + docFileData.length;
      if (!jsonOut) {
        spinner.start(`Writing ${totalPages} pages to database...`);
      }

      // Slugs of the pages that were written successfully.
      const allSlugs: string[] = [];

      for (const [i, { slug, parsed }] of fileData.entries()) {
        // Throttle spinner updates for markdown pages to every 20th file.
        if (!jsonOut && i % 20 === 0) {
          spinner.update(`Writing pages... ${i + 1}/${totalPages}`);
        }
        try {
          await repo.putPage({
            slug,
            type: String(parsed.frontmatter.type ?? inferTypeFromSlug(slug)),
            title: String(parsed.frontmatter.title ?? slugToTitle(slug)),
            compiledTruth: parsed.compiledTruth,
            timeline: parsed.timeline,
            frontmatter: parsed.frontmatter,
          }, true);
          allSlugs.push(slug);
        } catch (err) {
          writeErrors.push(`${slug}: ${describeError(err)}`);
        }
      }

      for (const [i, doc] of docFileData.entries()) {
        if (!jsonOut) {
          spinner.update(`Writing pages... ${fileData.length + i + 1}/${totalPages}`);
        }
        try {
          // Extractor metadata is spread last, so it wins on key clashes.
          const frontmatter: Record<string, unknown> = {
            sourceFile: doc.sourceRef,
            sourceType: doc.sourceType,
            sourceKind: doc.kind,
            sourceMimeType: doc.mimeType,
            sourceBytes: doc.bytes,
            sourceFileName: doc.fileName,
            _contentHash: contentHash(doc.content),
            ...doc.metadata,
          };
          await repo.putPage({
            slug: doc.slug,
            type: doc.kind,
            title: String(slugToTitle(doc.slug)),
            compiledTruth: doc.content,
            timeline: "",
            frontmatter,
          }, true);
          allSlugs.push(doc.slug);
        } catch (err) {
          writeErrors.push(`${doc.slug}: ${describeError(err)}`);
        }
      }

      if (!jsonOut) {
        spinner.succeed(`Wrote ${allSlugs.length} pages to database`);
        if (writeErrors.length > 0) {
          warning(`${writeErrors.length} pages failed to write`);
          for (const e of writeErrors.slice(0, 3)) {
            subItem(e);
          }
          if (writeErrors.length > 3) {
            subItem(`... and ${writeErrors.length - 3} more`);
          }
        }
      }

      // Phase 3: Parallel entity extraction
      const entityResults = new Map<string, EntityRelation[]>();

      if (settings.llm.baseURL) {
        if (!jsonOut) {
          spinner.start(`Extracting entities with LLM...`);
        }

        const allPages: Array<{ slug: string; content: string }> = [
          ...fileData.map(({ slug, content }) => ({ slug, content })),
          ...docFileData.map(({ slug, content }) => ({ slug, content })),
        ];
        let extractionFailures = 0;

        for (let i = 0; i < allPages.length; i += BATCH_SIZE) {
          const batch = allPages.slice(i, i + BATCH_SIZE);
          if (!jsonOut) {
            spinner.update(`Extracting entities... ${Math.min(i + BATCH_SIZE, allPages.length)}/${allPages.length}`);
          }
          // allSettled: one failed extraction must not abort an import whose
          // pages are already written; it is recorded like any other error.
          const outcomes = await Promise.allSettled(
            batch.map(async ({ content }) => extractRelations(content, settings.llm)),
          );
          outcomes.forEach((outcome, j) => {
            const { slug } = batch[j]!;
            if (outcome.status === "fulfilled") {
              entityResults.set(slug, outcome.value);
            } else {
              extractionFailures++;
              writeErrors.push(`${slug}: entity extraction failed: ${describeError(outcome.reason)}`);
            }
          });
        }

        if (!jsonOut) {
          spinner.succeed(`Entity extraction complete`);
          if (extractionFailures > 0) {
            warning(`${extractionFailures} pages failed entity extraction`);
          }
        }
      } else {
        if (!jsonOut) {
          warning(`LLM not configured, skipping entity extraction`);
        }
      }

      // Phase 4: Write links, tags, timeline, and entity pages
      if (!jsonOut) {
        spinner.start(`Creating links, tags, and timeline entries...`);
      }

      let linkCount = 0;
      let timelineCount = 0;
      let entityCount = 0;
      let tagCount = 0;

      const allTimelineEntries: Array<{
        pageSlug: string;
        date: string;
        source: string;
        summary: string;
        detail: string;
      }> = [];

      /**
       * Turn the relations extracted for `pageSlug` into entity pages and links:
       * both entities are resolved to slugs, ensured as pages, linked to each
       * other, and linked from the page that mentions them.
       */
      const linkRelationsFor = async (pageSlug: string): Promise<void> => {
        const relations = entityResults.get(pageSlug);
        if (!relations || relations.length === 0) return;

        const highConfidence = relations.filter((r) => r.confidence >= MIN_RELATION_CONFIDENCE);
        for (const r of highConfidence) {
          const fromCandidate = entityToSlug(r.from.name, r.from.type);
          const toCandidate = entityToSlug(r.to.name, r.to.type);
          const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
          const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);

          // A truthy result is counted as one extracted entity
          // (presumably "page was newly created" — confirm in the repository).
          const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, pageSlug);
          const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, pageSlug);
          if (c1) entityCount++;
          if (c2) entityCount++;

          await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
          await repo.link(pageSlug, fromSlug, `Mentions ${r.from.name}`);
          await repo.link(pageSlug, toSlug, `Mentions ${r.to.name}`);
          linkCount += 3;
        }
      };

      for (const { slug, wikiLinks, timelineEntries, tags } of fileData) {
        for (const link of wikiLinks) {
          await repo.link(slug, link, "import");
          linkCount++;
        }

        for (const entry of timelineEntries) {
          allTimelineEntries.push({
            pageSlug: slug,
            date: entry.date,
            source: entry.source,
            summary: entry.summary,
            detail: "",
          });
          timelineCount++;
        }

        for (const tag of tags) {
          await repo.tag(slug, tag);
          tagCount++;
        }

        await linkRelationsFor(slug);
      }

      for (const { slug } of docFileData) {
        await linkRelationsFor(slug);
      }

      // Every ingested document gets one timeline entry dated today (UTC date of the ISO string).
      const today = new Date().toISOString().slice(0, 10);
      for (const { slug, kind, fileName } of docFileData) {
        allTimelineEntries.push({
          pageSlug: slug,
          date: today,
          source: "import",
          summary: `Ingested ${kind}: ${fileName}`,
          detail: "",
        });
        timelineCount++;
      }

      if (allTimelineEntries.length > 0) {
        await repo.timelineAddBatch(allTimelineEntries);
      }

      if (!jsonOut) {
        spinner.succeed(`Created links, tags, and timeline`);
      }

      // Phase 5: Batch sync all pages to search index
      if (opts.skipIndex) {
        if (!jsonOut) {
          success(`Skipping vector indexing (--skip-index)`);
        }
      } else {
        if (!jsonOut) {
          spinner.start(`Indexing ${allSlugs.length} pages for search...`);
        }
        await repo.embedAll();

        if (!jsonOut) {
          spinner.succeed(`Search indexing complete`);
        }
      }

      const duration = formatDuration(Date.now() - startTime);

      if (!jsonOut) {
        header("Import Summary");
        keyValue("Markdown files", String(files.length));
        keyValue("PDF/DOCX files", String(docFilePaths.length));
        keyValue("Pages created", String(allSlugs.length));
        keyValue("Entities extracted", String(entityCount));
        keyValue("Links created", String(linkCount));
        keyValue("Timeline entries", String(timelineCount));
        keyValue("Tags added", String(tagCount));
        keyValue("Duration", duration);

        if (writeErrors.length > 0) {
          warning(`${writeErrors.length} files had errors`);
        }
      }

      // `tags` and `errors` are reported so that JSON consumers see the same
      // information as the human-readable summary above.
      print(program, {
        ok: true,
        markdownFiles: files.length,
        docFiles: docFilePaths.length,
        pages: allSlugs.length,
        links: linkCount,
        timelineEntries: timelineCount,
        entities: entityCount,
        tags: tagCount,
        errors: writeErrors,
      });
    });
  });
}