ex-brain 0.2.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/ai/ax-pipeline.ts +114 -0
- package/src/ai/compiler.ts +118 -113
- package/src/ai/entity-link.ts +96 -78
- package/src/ai/timeline-extractor.ts +110 -99
- package/src/commands/compile-cmd.ts +1 -1
- package/src/commands/entity-links.ts +105 -0
- package/src/commands/import-cmd.ts +219 -0
- package/src/commands/import-put.ts +180 -0
- package/src/commands/index.ts +30 -2314
- package/src/commands/misc-cmds.ts +190 -0
- package/src/commands/misc-commands.ts +252 -0
- package/src/commands/put-cmd.ts +525 -0
- package/src/commands/query-cmd.ts +486 -0
- package/src/commands/shared.ts +109 -0
- package/src/commands/timeline-cmd.ts +159 -0
- package/src/config/index.ts +53 -0
- package/src/config/init.ts +50 -0
- package/src/config/paths.ts +21 -0
- package/src/config/schema.ts +121 -0
- package/src/config/settings.ts +168 -0
- package/src/db/client.ts +1 -1
- package/src/markdown/document-loader.ts +30 -2
- package/src/repositories/brain-repo.ts +43 -1
- package/src/settings.ts +27 -282
- package/src/{config.ts → slug-utils.ts} +0 -0
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Timeline Extraction —
|
|
2
|
+
* Timeline Extraction — AIPipeline version.
|
|
3
3
|
*
|
|
4
|
-
* Uses
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Uses AIPipeline for LLM call lifecycle (createAxAI → forward → parse → transform → fallback).
|
|
5
|
+
*
|
|
6
|
+
* Public API unchanged — drop-in replacement for callers.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import { ax, f } from "@ax-llm/ax";
|
|
9
|
+
import { f } from "@ax-llm/ax";
|
|
10
10
|
import type { ResolvedLLM } from "../settings";
|
|
11
11
|
import type { TimelineEntry } from "../types";
|
|
12
|
+
import { AIPipeline, parseJsonArray } from "./ax-pipeline";
|
|
12
13
|
import { createAxAI } from "./ax-adapter";
|
|
13
14
|
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
@@ -29,7 +30,7 @@ export interface TimelineExtractionResult {
|
|
|
29
30
|
}
|
|
30
31
|
|
|
31
32
|
// ---------------------------------------------------------------------------
|
|
32
|
-
//
|
|
33
|
+
// Timeline pipeline configuration
|
|
33
34
|
// ---------------------------------------------------------------------------
|
|
34
35
|
|
|
35
36
|
const timelineSig = f()
|
|
@@ -40,99 +41,6 @@ const timelineSig = f()
|
|
|
40
41
|
))
|
|
41
42
|
.build();
|
|
42
43
|
|
|
43
|
-
const timelineGen = ax(timelineSig);
|
|
44
|
-
|
|
45
|
-
// ---------------------------------------------------------------------------
|
|
46
|
-
// Public API
|
|
47
|
-
// ---------------------------------------------------------------------------
|
|
48
|
-
|
|
49
|
-
export async function extractTimelineEvents(
|
|
50
|
-
input: TimelineExtractionInput,
|
|
51
|
-
llm: ResolvedLLM,
|
|
52
|
-
): Promise<TimelineExtractionResult> {
|
|
53
|
-
if (!input.content.trim()) {
|
|
54
|
-
return { entries: [], success: false, confidence: 0.3 };
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
const aiClient = createAxAI(llm);
|
|
58
|
-
if (!aiClient) {
|
|
59
|
-
return fallbackExtract(input);
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
try {
|
|
63
|
-
const result = await timelineGen.forward(aiClient, {
|
|
64
|
-
textContent: input.content.slice(0, 4000),
|
|
65
|
-
infoDate: input.defaultDate,
|
|
66
|
-
});
|
|
67
|
-
|
|
68
|
-
const rawEvents = parseEvents(result.events);
|
|
69
|
-
const entries: TimelineEntry[] = [];
|
|
70
|
-
for (const e of rawEvents) {
|
|
71
|
-
const date = normalizeDate(String(e.date ?? ""), input.defaultDate);
|
|
72
|
-
if (!date) continue;
|
|
73
|
-
|
|
74
|
-
entries.push({
|
|
75
|
-
pageSlug: input.pageSlug,
|
|
76
|
-
date,
|
|
77
|
-
source: input.source,
|
|
78
|
-
summary: String(e.summary ?? "").slice(0, 120),
|
|
79
|
-
detail: String(e.detail ?? ""),
|
|
80
|
-
importance: Math.max(1, Math.min(5, Math.round(Number(e.importance ?? 3)))),
|
|
81
|
-
});
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
entries.sort((a, b) => b.date.localeCompare(a.date));
|
|
85
|
-
|
|
86
|
-
return {
|
|
87
|
-
entries: entries.slice(0, 5),
|
|
88
|
-
success: entries.length > 0,
|
|
89
|
-
confidence: entries.length > 0 ? 0.85 : 0.3,
|
|
90
|
-
};
|
|
91
|
-
} catch (error) {
|
|
92
|
-
const msg = error instanceof Error ? error.message : String(error);
|
|
93
|
-
console.warn(`[ebrain] Timeline extraction failed: ${msg}`);
|
|
94
|
-
return fallbackExtract(input);
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
export async function extractTimelineFromRelation(
|
|
99
|
-
relation: { from: string; to: string; relationType: string; context: string },
|
|
100
|
-
defaultDate: string,
|
|
101
|
-
pageSlug: string,
|
|
102
|
-
llm: ResolvedLLM,
|
|
103
|
-
): Promise<TimelineEntry | null> {
|
|
104
|
-
const significantTypes = ["invested_in", "acquired", "founder_of", "leader_of", "works_at"];
|
|
105
|
-
if (!significantTypes.includes(relation.relationType)) return null;
|
|
106
|
-
|
|
107
|
-
const aiClient = createAxAI(llm);
|
|
108
|
-
if (!aiClient) return null;
|
|
109
|
-
|
|
110
|
-
try {
|
|
111
|
-
const content = `${relation.from} → ${relation.to} (${relation.relationType}): ${relation.context}`;
|
|
112
|
-
const result = await timelineGen.forward(aiClient, {
|
|
113
|
-
textContent: content,
|
|
114
|
-
infoDate: defaultDate,
|
|
115
|
-
});
|
|
116
|
-
|
|
117
|
-
const rawEvents = parseEvents(result.events);
|
|
118
|
-
for (const e of rawEvents) {
|
|
119
|
-
const date = normalizeDate(String(e.date ?? ""), defaultDate);
|
|
120
|
-
if (!date) continue;
|
|
121
|
-
return {
|
|
122
|
-
pageSlug,
|
|
123
|
-
date,
|
|
124
|
-
source: "extracted",
|
|
125
|
-
summary: String(e.summary ?? "").slice(0, 120),
|
|
126
|
-
detail: String(e.detail ?? ""),
|
|
127
|
-
importance: Math.max(1, Math.min(5, Math.round(Number(e.importance ?? 3)))),
|
|
128
|
-
};
|
|
129
|
-
}
|
|
130
|
-
return null;
|
|
131
|
-
} catch {
|
|
132
|
-
return null;
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
|
|
136
44
|
interface RawEvent {
|
|
137
45
|
date?: string;
|
|
138
46
|
summary?: string;
|
|
@@ -158,6 +66,20 @@ function parseEvents(raw: unknown): RawEvent[] {
|
|
|
158
66
|
return [];
|
|
159
67
|
}
|
|
160
68
|
|
|
69
|
+
const timelinePipeline = new AIPipeline<
|
|
70
|
+
{ textContent: string; infoDate: string },
|
|
71
|
+
RawEvent[],
|
|
72
|
+
RawEvent[]
|
|
73
|
+
>({
|
|
74
|
+
signature: timelineSig,
|
|
75
|
+
mapInput: (input) => input,
|
|
76
|
+
extractOutput: (raw) => raw.events,
|
|
77
|
+
parseRaw: parseEvents,
|
|
78
|
+
transform: (raw) => raw,
|
|
79
|
+
fallback: () => [],
|
|
80
|
+
label: "Timeline extraction",
|
|
81
|
+
});
|
|
82
|
+
|
|
161
83
|
// ---------------------------------------------------------------------------
|
|
162
84
|
// Date Normalization (preserved from original implementation)
|
|
163
85
|
// ---------------------------------------------------------------------------
|
|
@@ -244,3 +166,92 @@ function fallbackExtract(input: TimelineExtractionInput): TimelineExtractionResu
|
|
|
244
166
|
const uniqueEntries = Array.from(seen.values());
|
|
245
167
|
return { entries: uniqueEntries, success: uniqueEntries.length > 0, confidence: 0.4 };
|
|
246
168
|
}
|
|
169
|
+
|
|
170
|
+
// ---------------------------------------------------------------------------
|
|
171
|
+
// Public API (unchanged)
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
export async function extractTimelineEvents(
|
|
175
|
+
input: TimelineExtractionInput,
|
|
176
|
+
llm: ResolvedLLM,
|
|
177
|
+
): Promise<TimelineExtractionResult> {
|
|
178
|
+
if (!input.content.trim()) {
|
|
179
|
+
return { entries: [], success: false, confidence: 0.3 };
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
const aiClient = createAxAI(llm);
|
|
183
|
+
if (!aiClient) {
|
|
184
|
+
return fallbackExtract(input);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
try {
|
|
188
|
+
const rawEvents = await timelinePipeline.run(
|
|
189
|
+
{ textContent: input.content.slice(0, 4000), infoDate: input.defaultDate },
|
|
190
|
+
llm,
|
|
191
|
+
);
|
|
192
|
+
|
|
193
|
+
const entries: TimelineEntry[] = [];
|
|
194
|
+
for (const e of rawEvents) {
|
|
195
|
+
const date = normalizeDate(String(e.date ?? ""), input.defaultDate);
|
|
196
|
+
if (!date) continue;
|
|
197
|
+
|
|
198
|
+
entries.push({
|
|
199
|
+
pageSlug: input.pageSlug,
|
|
200
|
+
date,
|
|
201
|
+
source: input.source,
|
|
202
|
+
summary: String(e.summary ?? "").slice(0, 120),
|
|
203
|
+
detail: String(e.detail ?? ""),
|
|
204
|
+
importance: Math.max(1, Math.min(5, Math.round(Number(e.importance ?? 3)))),
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
entries.sort((a, b) => b.date.localeCompare(a.date));
|
|
209
|
+
|
|
210
|
+
return {
|
|
211
|
+
entries: entries.slice(0, 5),
|
|
212
|
+
success: entries.length > 0,
|
|
213
|
+
confidence: entries.length > 0 ? 0.85 : 0.3,
|
|
214
|
+
};
|
|
215
|
+
} catch (error) {
|
|
216
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
217
|
+
console.warn(`[ebrain] Timeline extraction failed: ${msg}`);
|
|
218
|
+
return fallbackExtract(input);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
export async function extractTimelineFromRelation(
|
|
223
|
+
relation: { from: string; to: string; relationType: string; context: string },
|
|
224
|
+
defaultDate: string,
|
|
225
|
+
pageSlug: string,
|
|
226
|
+
llm: ResolvedLLM,
|
|
227
|
+
): Promise<TimelineEntry | null> {
|
|
228
|
+
const significantTypes = ["invested_in", "acquired", "founder_of", "leader_of", "works_at"];
|
|
229
|
+
if (!significantTypes.includes(relation.relationType)) return null;
|
|
230
|
+
|
|
231
|
+
const aiClient = createAxAI(llm);
|
|
232
|
+
if (!aiClient) return null;
|
|
233
|
+
|
|
234
|
+
try {
|
|
235
|
+
const content = `${relation.from} → ${relation.to} (${relation.relationType}): ${relation.context}`;
|
|
236
|
+
const rawEvents = await timelinePipeline.run(
|
|
237
|
+
{ textContent: content, infoDate: defaultDate },
|
|
238
|
+
llm,
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
for (const e of rawEvents) {
|
|
242
|
+
const date = normalizeDate(String(e.date ?? ""), defaultDate);
|
|
243
|
+
if (!date) continue;
|
|
244
|
+
return {
|
|
245
|
+
pageSlug,
|
|
246
|
+
date,
|
|
247
|
+
source: "extracted",
|
|
248
|
+
summary: String(e.summary ?? "").slice(0, 120),
|
|
249
|
+
detail: String(e.detail ?? ""),
|
|
250
|
+
importance: Math.max(1, Math.min(5, Math.round(Number(e.importance ?? 3)))),
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
return null;
|
|
254
|
+
} catch {
|
|
255
|
+
return null;
|
|
256
|
+
}
|
|
257
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Command } from "commander";
|
|
2
2
|
import { basename } from "node:path";
|
|
3
|
-
import { normalizeLongSlug, slugify } from "../config";
|
|
3
|
+
import { normalizeLongSlug, slugify } from "../slug-utils";
|
|
4
4
|
import { readMaybeStdin, readTextFile } from "../markdown/io";
|
|
5
5
|
import { loadSettings } from "../settings";
|
|
6
6
|
import { BrainRepository } from "../repositories/brain-repo";
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { BrainRepository } from "../repositories/brain-repo";
|
|
2
|
+
import { loadSettings } from "../settings";
|
|
3
|
+
import { extractRelations, entityToSlug } from "../ai/entity-link";
|
|
4
|
+
import { warning, subItem, createSpinner } from "../utils/cli-output";
|
|
5
|
+
import { formatDuration } from "../utils/progress";
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Extract entities and create entity pages + links.
|
|
9
|
+
* Non-blocking: failures produce warnings, not errors.
|
|
10
|
+
*
|
|
11
|
+
* This is a **real seam** — called by both `put` (markdown + document branches)
|
|
12
|
+
* and `import` (markdown + docx branches). Two adapters = real seam.
|
|
13
|
+
*/
|
|
14
|
+
export async function applyEntityLinks(
|
|
15
|
+
repo: BrainRepository,
|
|
16
|
+
sourceSlug: string,
|
|
17
|
+
content: string,
|
|
18
|
+
json: boolean,
|
|
19
|
+
): Promise<{ created: number; linked: number }> {
|
|
20
|
+
if (!content.trim()) return { created: 0, linked: 0 };
|
|
21
|
+
|
|
22
|
+
const settings = await loadSettings();
|
|
23
|
+
if (!settings.llm.baseURL) {
|
|
24
|
+
if (!json) {
|
|
25
|
+
warning(`LLM not configured, skipping entity extraction for ${sourceSlug}`);
|
|
26
|
+
}
|
|
27
|
+
return { created: 0, linked: 0 };
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const spinner = createSpinner();
|
|
31
|
+
if (!json) {
|
|
32
|
+
spinner.start(`Extracting entities from ${sourceSlug}...`);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
const startTime = Date.now();
|
|
36
|
+
let relations;
|
|
37
|
+
try {
|
|
38
|
+
relations = await extractRelations(content, settings.llm);
|
|
39
|
+
} catch (err) {
|
|
40
|
+
if (!json) {
|
|
41
|
+
spinner.fail(`Entity extraction failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
42
|
+
}
|
|
43
|
+
return { created: 0, linked: 0 };
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Filter by confidence
|
|
47
|
+
const confidenceThreshold = settings.extraction.confidenceThreshold;
|
|
48
|
+
const highConfidence = relations.filter((r) => r.confidence >= confidenceThreshold);
|
|
49
|
+
const ignoredCount = relations.length - highConfidence.length;
|
|
50
|
+
|
|
51
|
+
if (highConfidence.length === 0) {
|
|
52
|
+
if (!json) {
|
|
53
|
+
if (relations.length > 0) {
|
|
54
|
+
spinner.warn(`Found ${relations.length} entities but all below confidence threshold (${confidenceThreshold})`);
|
|
55
|
+
} else {
|
|
56
|
+
spinner.warn(`No entities found in content`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return { created: 0, linked: 0 };
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
let created = 0;
|
|
63
|
+
let linked = 0;
|
|
64
|
+
|
|
65
|
+
for (const r of highConfidence) {
|
|
66
|
+
// 1. Resolve entity slugs (disambiguation)
|
|
67
|
+
const fromCandidate = entityToSlug(r.from.name, r.from.type);
|
|
68
|
+
const toCandidate = entityToSlug(r.to.name, r.to.type);
|
|
69
|
+
|
|
70
|
+
const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
|
|
71
|
+
const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
|
|
72
|
+
|
|
73
|
+
// 2. Ensure entity pages exist
|
|
74
|
+
const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, sourceSlug);
|
|
75
|
+
const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, sourceSlug);
|
|
76
|
+
if (c1) created += 1;
|
|
77
|
+
if (c2) created += 1;
|
|
78
|
+
|
|
79
|
+
// 3. Link between entities (context includes relation type)
|
|
80
|
+
await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
|
|
81
|
+
linked += 1;
|
|
82
|
+
|
|
83
|
+
// 4. Link from source document to entities (for backlinks tracing)
|
|
84
|
+
await repo.link(sourceSlug, fromSlug, `Mentions ${r.from.name}`);
|
|
85
|
+
linked += 1;
|
|
86
|
+
await repo.link(sourceSlug, toSlug, `Mentions ${r.to.name}`);
|
|
87
|
+
linked += 1;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
if (!json) {
|
|
91
|
+
const duration = formatDuration(Date.now() - startTime);
|
|
92
|
+
const entityNames = [...new Set(highConfidence.flatMap((r) => [r.from.name, r.to.name]))];
|
|
93
|
+
spinner.succeed(`Extracted ${entityNames.length} entities: ${entityNames.join(", ")}`);
|
|
94
|
+
|
|
95
|
+
// Print detailed info
|
|
96
|
+
subItem(`${created} entity pages created`);
|
|
97
|
+
subItem(`${linked} links added`);
|
|
98
|
+
if (ignoredCount > 0) {
|
|
99
|
+
subItem(`${ignoredCount} low-confidence relations ignored`);
|
|
100
|
+
}
|
|
101
|
+
subItem(`Completed in ${duration}`);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
return { created, linked };
|
|
105
|
+
}
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import { dirname, extname, resolve } from "node:path";
|
|
2
|
+
import { Command } from "commander";
|
|
3
|
+
import { stat } from "node:fs/promises";
|
|
4
|
+
import { collectDocumentFiles, detectKind, type DocumentKind } from "../markdown/document-loader";
|
|
5
|
+
import { collectMarkdownFiles, pathToSlug } from "../markdown/io";
|
|
6
|
+
import { BrainRepository } from "../repositories/brain-repo";
|
|
7
|
+
import { addDryRun, isDryRun, withRepo, isJson, print, normalizeLinkSlug } from "./shared";
|
|
8
|
+
import { putFile } from "./import-put";
|
|
9
|
+
import { success, warning, subItem, header, keyValue, createSpinner } from "../utils/cli-output";
|
|
10
|
+
import { formatDuration } from "../utils/progress";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Helpers
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
const DELAY_MS = 600;
|
|
17
|
+
|
|
18
|
+
const DOC_EXTENSIONS = new Set([
|
|
19
|
+
"pdf", "docx", "doc", "html", "htm", "json", "txt", "text",
|
|
20
|
+
]);
|
|
21
|
+
|
|
22
|
+
function isDocumentFile(filePath: string, forceKind?: string): boolean {
|
|
23
|
+
if (forceKind && forceKind !== "markdown") return true;
|
|
24
|
+
const ext = extname(filePath).toLowerCase().replace(/^\./, "");
|
|
25
|
+
return DOC_EXTENSIONS.has(ext);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
async function collectMarkdownFilesFromPaths(paths: string[]): Promise<Array<{ file: string; root: string }>> {
|
|
29
|
+
const results: Array<{ file: string; root: string }> = [];
|
|
30
|
+
for (const p of paths) {
|
|
31
|
+
const rp = resolve(p);
|
|
32
|
+
const s = await stat(rp);
|
|
33
|
+
if (s.isDirectory()) {
|
|
34
|
+
const mdFiles = await collectMarkdownFiles(rp);
|
|
35
|
+
for (const f of mdFiles) results.push({ file: f, root: dirname(rp) });
|
|
36
|
+
} else if (s.isFile() && extname(rp).toLowerCase() === ".md") {
|
|
37
|
+
results.push({ file: rp, root: dirname(rp) });
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return results.sort((a, b) => a.file.localeCompare(b.file));
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
async function collectDocumentFilesFromPaths(paths: string[]): Promise<Array<{ file: string; root: string }>> {
|
|
44
|
+
const results: Array<{ file: string; root: string }> = [];
|
|
45
|
+
for (const p of paths) {
|
|
46
|
+
const rp = resolve(p);
|
|
47
|
+
const s = await stat(rp);
|
|
48
|
+
if (s.isDirectory()) {
|
|
49
|
+
const docFiles = await collectDocumentFiles(rp);
|
|
50
|
+
for (const f of docFiles) results.push({ file: f, root: dirname(rp) });
|
|
51
|
+
} else if (s.isFile() && isDocumentFile(rp)) {
|
|
52
|
+
results.push({ file: rp, root: dirname(rp) });
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
return results.sort((a, b) => a.file.localeCompare(b.file));
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function sleep(ms: number): Promise<void> {
|
|
59
|
+
return new Promise((r) => setTimeout(r, ms));
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Import command — collect valid files, then serially put each with 600ms gap
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
export function registerImportCommand(program: Command): void {
|
|
67
|
+
addDryRun(
|
|
68
|
+
program
|
|
69
|
+
.command("import")
|
|
70
|
+
.argument("<paths...>", "directories or files (markdown, PDF, DOCX) to import")
|
|
71
|
+
.description("import markdown, PDF, and DOCX files — accepts directories (recursive) and/or individual files")
|
|
72
|
+
.option("--skip-index", "skip vector indexing (useful if seekdb crashes)")
|
|
73
|
+
.option("--skip-entity", "skip entity extraction")
|
|
74
|
+
.addHelpText(
|
|
75
|
+
"after",
|
|
76
|
+
`
|
|
77
|
+
Examples:
|
|
78
|
+
ebrain import ./docs # import a directory
|
|
79
|
+
ebrain import *.docx # import matching files (shell glob)
|
|
80
|
+
ebrain import report.pdf notes.md ./docs # mix of files and directories
|
|
81
|
+
ebrain import ./docs --dry-run
|
|
82
|
+
ebrain import ./docs --skip-index # skip vector indexing
|
|
83
|
+
ebrain import ./docs --skip-entity # skip entity extraction
|
|
84
|
+
`,
|
|
85
|
+
),
|
|
86
|
+
).action(async (paths: string[], opts: { dryRun?: boolean; skipIndex?: boolean; skipEntity?: boolean }) => {
|
|
87
|
+
await withRepo(program, async (repo) => {
|
|
88
|
+
const jsonOut = isJson(program);
|
|
89
|
+
const startTime = Date.now();
|
|
90
|
+
const spinner = createSpinner();
|
|
91
|
+
|
|
92
|
+
// Phase 1: Collect all valid files
|
|
93
|
+
const mdEntries = await collectMarkdownFilesFromPaths(paths);
|
|
94
|
+
const docEntries = await collectDocumentFilesFromPaths(paths);
|
|
95
|
+
const totalFiles = mdEntries.length + docEntries.length;
|
|
96
|
+
|
|
97
|
+
if (totalFiles === 0) {
|
|
98
|
+
if (!jsonOut) {
|
|
99
|
+
header("Import");
|
|
100
|
+
warning("No files found");
|
|
101
|
+
}
|
|
102
|
+
print(program, { ok: true, markdownFiles: 0, docFiles: 0, pages: 0, duration: "0ms" });
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (isDryRun(opts)) {
|
|
107
|
+
print(program, {
|
|
108
|
+
dryRun: true,
|
|
109
|
+
action: "import",
|
|
110
|
+
paths: paths.map((p) => resolve(p)),
|
|
111
|
+
filesFound: totalFiles,
|
|
112
|
+
slugs: [
|
|
113
|
+
...mdEntries.map((e) => pathToSlug(e.file, e.root)),
|
|
114
|
+
...docEntries.map((e) => pathToSlug(e.file, e.root)),
|
|
115
|
+
],
|
|
116
|
+
});
|
|
117
|
+
return;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
if (!jsonOut) {
|
|
121
|
+
header(`Import: ${paths.map((p) => resolve(p)).join(", ")}`);
|
|
122
|
+
spinner.start(`Found ${totalFiles} files (${mdEntries.length} markdown, ${docEntries.length} documents)`);
|
|
123
|
+
spinner.succeed(`Found ${totalFiles} files`);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Phase 2: Serially put each file with 600ms delay
|
|
127
|
+
const allSlugs: string[] = [];
|
|
128
|
+
const writeErrors: string[] = [];
|
|
129
|
+
let createdCount = 0;
|
|
130
|
+
let skippedCount = 0;
|
|
131
|
+
|
|
132
|
+
for (let i = 0; i < totalFiles; i++) {
|
|
133
|
+
const isMd = i < mdEntries.length;
|
|
134
|
+
const entry = isMd ? mdEntries[i]! : docEntries[i - mdEntries.length]!;
|
|
135
|
+
const file = entry.file;
|
|
136
|
+
|
|
137
|
+
if (!jsonOut) {
|
|
138
|
+
spinner.start(`[${i + 1}/${totalFiles}] ${file}`);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
try {
|
|
142
|
+
const result = await putFile({
|
|
143
|
+
repo,
|
|
144
|
+
filePath: file,
|
|
145
|
+
embed: false, // defer to embedAll at the end
|
|
146
|
+
entityLinks: !opts.skipEntity,
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
allSlugs.push(result.slug);
|
|
150
|
+
if (result.unchanged) {
|
|
151
|
+
skippedCount++;
|
|
152
|
+
if (!jsonOut) {
|
|
153
|
+
spinner.warn(`[${i + 1}/${totalFiles}] unchanged — skipped: ${result.slug}`);
|
|
154
|
+
}
|
|
155
|
+
} else {
|
|
156
|
+
createdCount++;
|
|
157
|
+
if (!jsonOut) {
|
|
158
|
+
spinner.succeed(`[${i + 1}/${totalFiles}] ${result.slug} (${result.contentLength} chars)`);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
} catch (err) {
|
|
162
|
+
writeErrors.push(`${file}: ${err instanceof Error ? err.message : String(err)}`);
|
|
163
|
+
if (!jsonOut) {
|
|
164
|
+
spinner.fail(`[${i + 1}/${totalFiles}] error: ${err instanceof Error ? err.message : String(err)}`);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// 600ms delay between files
|
|
169
|
+
if (i < totalFiles - 1) {
|
|
170
|
+
await sleep(DELAY_MS);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Phase 3: Search indexing
|
|
175
|
+
if (opts.skipIndex) {
|
|
176
|
+
if (!jsonOut) {
|
|
177
|
+
success(`Skipping vector indexing (--skip-index)`);
|
|
178
|
+
}
|
|
179
|
+
} else if (allSlugs.length > 0) {
|
|
180
|
+
if (!jsonOut) {
|
|
181
|
+
spinner.start(`Indexing ${allSlugs.length} pages for search...`);
|
|
182
|
+
}
|
|
183
|
+
await repo.embedAll();
|
|
184
|
+
if (!jsonOut) {
|
|
185
|
+
spinner.succeed(`Search indexing complete`);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
const duration = formatDuration(Date.now() - startTime);
|
|
190
|
+
|
|
191
|
+
if (!jsonOut) {
|
|
192
|
+
header("Import Summary");
|
|
193
|
+
keyValue("Total files", String(totalFiles));
|
|
194
|
+
keyValue("Pages created", String(createdCount));
|
|
195
|
+
keyValue("Pages skipped (unchanged)", String(skippedCount));
|
|
196
|
+
keyValue("Duration", duration);
|
|
197
|
+
if (writeErrors.length > 0) {
|
|
198
|
+
warning(`${writeErrors.length} errors`);
|
|
199
|
+
for (const e of writeErrors.slice(0, 3)) {
|
|
200
|
+
subItem(e);
|
|
201
|
+
}
|
|
202
|
+
if (writeErrors.length > 3) {
|
|
203
|
+
subItem(`... and ${writeErrors.length - 3} more`);
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
print(program, {
|
|
209
|
+
ok: true,
|
|
210
|
+
totalFiles,
|
|
211
|
+
created: createdCount,
|
|
212
|
+
skipped: skippedCount,
|
|
213
|
+
errors: writeErrors.length,
|
|
214
|
+
pages: allSlugs.length,
|
|
215
|
+
duration,
|
|
216
|
+
});
|
|
217
|
+
});
|
|
218
|
+
});
|
|
219
|
+
}
|