ex-brain 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/commands/import-cmd.ts +87 -332
- package/src/commands/import-put.ts +180 -0
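package/package.json
CHANGED
(the +1 -1 hunk for this file is not included in this capture)
package/src/commands/import-cmd.ts
CHANGED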
|
@@ -1,21 +1,20 @@
|
|
|
1
1
|
import { dirname, extname, resolve } from "node:path";
|
|
2
2
|
import { Command } from "commander";
|
|
3
3
|
import { stat } from "node:fs/promises";
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import { collectMarkdownFiles, pathToSlug, readTextFile } from "../markdown/io";
|
|
7
|
-
import { parsePageMarkdown, extractWikiStyleLinks, extractTimelineLines } from "../markdown/parser";
|
|
8
|
-
import { extractRelations, entityToSlug, type EntityType, type RelationType, type EntityRef } from "../ai/entity-link";
|
|
9
|
-
import { loadSettings } from "../settings";
|
|
4
|
+
import { collectDocumentFiles, detectKind, type DocumentKind } from "../markdown/document-loader";
|
|
5
|
+
import { collectMarkdownFiles, pathToSlug } from "../markdown/io";
|
|
10
6
|
import { BrainRepository } from "../repositories/brain-repo";
|
|
11
|
-
import { addDryRun, isDryRun,
|
|
12
|
-
import {
|
|
7
|
+
import { addDryRun, isDryRun, withRepo, isJson, print, normalizeLinkSlug } from "./shared";
|
|
8
|
+
import { putFile } from "./import-put";
|
|
9
|
+
import { success, warning, subItem, header, keyValue, createSpinner } from "../utils/cli-output";
|
|
13
10
|
import { formatDuration } from "../utils/progress";
|
|
14
11
|
|
|
15
12
|
// ---------------------------------------------------------------------------
|
|
16
13
|
// Helpers
|
|
17
14
|
// ---------------------------------------------------------------------------
|
|
18
15
|
|
|
16
|
+
const DELAY_MS = 600;
|
|
17
|
+
|
|
19
18
|
const DOC_EXTENSIONS = new Set([
|
|
20
19
|
"pdf", "docx", "doc", "html", "htm", "json", "txt", "text",
|
|
21
20
|
]);
|
|
@@ -33,7 +32,7 @@ async function collectMarkdownFilesFromPaths(paths: string[]): Promise<Array<{ f
|
|
|
33
32
|
const s = await stat(rp);
|
|
34
33
|
if (s.isDirectory()) {
|
|
35
34
|
const mdFiles = await collectMarkdownFiles(rp);
|
|
36
|
-
for (const f of mdFiles) results.push({ file: f, root: rp });
|
|
35
|
+
for (const f of mdFiles) results.push({ file: f, root: dirname(rp) });
|
|
37
36
|
} else if (s.isFile() && extname(rp).toLowerCase() === ".md") {
|
|
38
37
|
results.push({ file: rp, root: dirname(rp) });
|
|
39
38
|
}
|
|
@@ -48,7 +47,7 @@ async function collectDocumentFilesFromPaths(paths: string[]): Promise<Array<{ f
|
|
|
48
47
|
const s = await stat(rp);
|
|
49
48
|
if (s.isDirectory()) {
|
|
50
49
|
const docFiles = await collectDocumentFiles(rp);
|
|
51
|
-
for (const f of docFiles) results.push({ file: f, root: rp });
|
|
50
|
+
for (const f of docFiles) results.push({ file: f, root: dirname(rp) });
|
|
52
51
|
} else if (s.isFile() && isDocumentFile(rp)) {
|
|
53
52
|
results.push({ file: rp, root: dirname(rp) });
|
|
54
53
|
}
|
|
@@ -56,17 +55,12 @@ async function collectDocumentFilesFromPaths(paths: string[]): Promise<Array<{ f
|
|
|
56
55
|
return results.sort((a, b) => a.file.localeCompare(b.file));
|
|
57
56
|
}
|
|
58
57
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
from: EntityRef;
|
|
62
|
-
to: EntityRef;
|
|
63
|
-
relation: RelationType;
|
|
64
|
-
context: string;
|
|
65
|
-
confidence: number;
|
|
58
|
+
function sleep(ms: number): Promise<void> {
|
|
59
|
+
return new Promise((r) => setTimeout(r, ms));
|
|
66
60
|
}
|
|
67
61
|
|
|
68
62
|
// ---------------------------------------------------------------------------
|
|
69
|
-
// Import command
|
|
63
|
+
// Import command — collect valid files, then serially put each with 600ms gap
|
|
70
64
|
// ---------------------------------------------------------------------------
|
|
71
65
|
|
|
72
66
|
export function registerImportCommand(program: Command): void {
|
|
@@ -76,6 +70,7 @@ export function registerImportCommand(program: Command): void {
|
|
|
76
70
|
.argument("<paths...>", "directories or files (markdown, PDF, DOCX) to import")
|
|
77
71
|
.description("import markdown, PDF, and DOCX files — accepts directories (recursive) and/or individual files")
|
|
78
72
|
.option("--skip-index", "skip vector indexing (useful if seekdb crashes)")
|
|
73
|
+
.option("--skip-entity", "skip entity extraction")
|
|
79
74
|
.addHelpText(
|
|
80
75
|
"after",
|
|
81
76
|
`
|
|
@@ -85,348 +80,107 @@ Examples:
|
|
|
85
80
|
ebrain import report.pdf notes.md ./docs # mix of files and directories
|
|
86
81
|
ebrain import ./docs --dry-run
|
|
87
82
|
ebrain import ./docs --skip-index # skip vector indexing
|
|
83
|
+
ebrain import ./docs --skip-entity # skip entity extraction
|
|
88
84
|
`,
|
|
89
85
|
),
|
|
90
|
-
).action(async (paths: string[], opts: { dryRun?: boolean; skipIndex?: boolean }) => {
|
|
86
|
+
).action(async (paths: string[], opts: { dryRun?: boolean; skipIndex?: boolean; skipEntity?: boolean }) => {
|
|
91
87
|
await withRepo(program, async (repo) => {
|
|
88
|
+
const jsonOut = isJson(program);
|
|
89
|
+
const startTime = Date.now();
|
|
90
|
+
const spinner = createSpinner();
|
|
91
|
+
|
|
92
|
+
// Phase 1: Collect all valid files
|
|
92
93
|
const mdEntries = await collectMarkdownFilesFromPaths(paths);
|
|
93
|
-
const
|
|
94
|
+
const docEntries = await collectDocumentFilesFromPaths(paths);
|
|
95
|
+
const totalFiles = mdEntries.length + docEntries.length;
|
|
96
|
+
|
|
97
|
+
if (totalFiles === 0) {
|
|
98
|
+
if (!jsonOut) {
|
|
99
|
+
header("Import");
|
|
100
|
+
warning("No files found");
|
|
101
|
+
}
|
|
102
|
+
print(program, { ok: true, markdownFiles: 0, docFiles: 0, pages: 0, duration: "0ms" });
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
94
105
|
|
|
95
106
|
if (isDryRun(opts)) {
|
|
96
107
|
print(program, {
|
|
97
108
|
dryRun: true,
|
|
98
109
|
action: "import",
|
|
99
110
|
paths: paths.map((p) => resolve(p)),
|
|
100
|
-
filesFound:
|
|
101
|
-
slugs:
|
|
111
|
+
filesFound: totalFiles,
|
|
112
|
+
slugs: [
|
|
113
|
+
...mdEntries.map((e) => pathToSlug(e.file, e.root)),
|
|
114
|
+
...docEntries.map((e) => pathToSlug(e.file, e.root)),
|
|
115
|
+
],
|
|
102
116
|
});
|
|
103
117
|
return;
|
|
104
118
|
}
|
|
105
119
|
|
|
106
|
-
const jsonOut = isJson(program);
|
|
107
|
-
const settings = await loadSettings();
|
|
108
|
-
const spinner = createSpinner();
|
|
109
|
-
const startTime = Date.now();
|
|
110
|
-
|
|
111
120
|
if (!jsonOut) {
|
|
112
121
|
header(`Import: ${paths.map((p) => resolve(p)).join(", ")}`);
|
|
122
|
+
spinner.start(`Found ${totalFiles} files (${mdEntries.length} markdown, ${docEntries.length} documents)`);
|
|
123
|
+
spinner.succeed(`Found ${totalFiles} files`);
|
|
113
124
|
}
|
|
114
125
|
|
|
115
|
-
// Phase
|
|
116
|
-
|
|
117
|
-
spinner.start(`Scanning ${files.length} files...`);
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
const fileData: Array<{
|
|
121
|
-
file: string;
|
|
122
|
-
slug: string;
|
|
123
|
-
parsed: ReturnType<typeof parsePageMarkdown>;
|
|
124
|
-
content: string;
|
|
125
|
-
wikiLinks: string[];
|
|
126
|
-
timelineEntries: ReturnType<typeof extractTimelineLines>;
|
|
127
|
-
tags: string[];
|
|
128
|
-
}> = [];
|
|
129
|
-
|
|
130
|
-
for (let i = 0; i < mdEntries.length; i++) {
|
|
131
|
-
const { file, root } = mdEntries[i]!;
|
|
132
|
-
const rawSlug = pathToSlug(file, root);
|
|
133
|
-
const slug = normalizeLongSlug(rawSlug);
|
|
134
|
-
const content = await readTextFile(file);
|
|
135
|
-
const parsed = parsePageMarkdown(content);
|
|
136
|
-
const wikiLinks = extractWikiStyleLinks(content).map(normalizeLinkSlug);
|
|
137
|
-
const timelineEntries = extractTimelineLines(parsed.timeline);
|
|
138
|
-
const tags = Array.isArray(parsed.frontmatter.tags)
|
|
139
|
-
? parsed.frontmatter.tags.filter((t): t is string => typeof t === "string")
|
|
140
|
-
: [];
|
|
141
|
-
fileData.push({ file, slug, parsed, content, wikiLinks, timelineEntries, tags });
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
if (!jsonOut) {
|
|
145
|
-
spinner.succeed(`Found ${files.length} markdown files`);
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
// Phase 1.5: Scan for docx/pdf files
|
|
126
|
+
// Phase 2: Serially put each file with 600ms delay
|
|
127
|
+
const allSlugs: string[] = [];
|
|
149
128
|
const writeErrors: string[] = [];
|
|
129
|
+
let createdCount = 0;
|
|
130
|
+
let skippedCount = 0;
|
|
150
131
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
const docFilePaths = docEntries.map((e) => e.file);
|
|
156
|
-
|
|
157
|
-
const docFileData: Array<{
|
|
158
|
-
file: string;
|
|
159
|
-
slug: string;
|
|
160
|
-
content: string;
|
|
161
|
-
kind: DocumentKind;
|
|
162
|
-
fileName: string;
|
|
163
|
-
sourceRef: string;
|
|
164
|
-
sourceType: "file" | "url";
|
|
165
|
-
mimeType: string | undefined;
|
|
166
|
-
bytes: number;
|
|
167
|
-
metadata: Record<string, unknown>;
|
|
168
|
-
}> = [];
|
|
132
|
+
for (let i = 0; i < totalFiles; i++) {
|
|
133
|
+
const isMd = i < mdEntries.length;
|
|
134
|
+
const entry = isMd ? mdEntries[i]! : docEntries[i - mdEntries.length]!;
|
|
135
|
+
const file = entry.file;
|
|
169
136
|
|
|
170
|
-
for (let i = 0; i < docFilePaths.length; i++) {
|
|
171
|
-
const file = docFilePaths[i]!;
|
|
172
|
-
const root = docEntries[i]!.root;
|
|
173
137
|
if (!jsonOut) {
|
|
174
|
-
spinner.
|
|
175
|
-
}
|
|
176
|
-
try {
|
|
177
|
-
const loaded = await loadDocument(file, { forceKind: detectKind({ fileName: file }) });
|
|
178
|
-
const rawSlug = pathToSlug(file, root);
|
|
179
|
-
const slug = normalizeLongSlug(rawSlug);
|
|
180
|
-
docFileData.push({
|
|
181
|
-
file,
|
|
182
|
-
slug,
|
|
183
|
-
content: loaded.text,
|
|
184
|
-
kind: loaded.kind,
|
|
185
|
-
fileName: loaded.fileName,
|
|
186
|
-
sourceRef: loaded.source,
|
|
187
|
-
sourceType: loaded.sourceType,
|
|
188
|
-
mimeType: loaded.mimeType,
|
|
189
|
-
bytes: loaded.bytes,
|
|
190
|
-
metadata: loaded.metadata,
|
|
191
|
-
});
|
|
192
|
-
} catch (err) {
|
|
193
|
-
writeErrors.push(`${file}: ${err instanceof Error ? err.message : String(err)}`);
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
if (!jsonOut) {
|
|
198
|
-
spinner.succeed(`Found ${docFilePaths.length} PDF/DOCX files`);
|
|
199
|
-
if (writeErrors.length > 0) {
|
|
200
|
-
warning(`${writeErrors.length} files failed to extract`);
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// Phase 2: Write all pages first (skip embed for performance)
|
|
205
|
-
if (!jsonOut) {
|
|
206
|
-
spinner.start(`Writing ${fileData.length + docFileData.length} pages to database...`);
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
const allSlugs: string[] = [];
|
|
210
|
-
|
|
211
|
-
for (let i = 0; i < fileData.length; i++) {
|
|
212
|
-
const { slug, parsed } = fileData[i]!;
|
|
213
|
-
if (!jsonOut && i % 20 === 0) {
|
|
214
|
-
spinner.update(`Writing pages... ${i + 1}/${fileData.length + docFileData.length}`);
|
|
215
|
-
}
|
|
216
|
-
try {
|
|
217
|
-
await repo.putPage({
|
|
218
|
-
slug,
|
|
219
|
-
type: String(parsed.frontmatter.type ?? inferTypeFromSlug(slug)),
|
|
220
|
-
title: String(parsed.frontmatter.title ?? slugToTitle(slug)),
|
|
221
|
-
compiledTruth: parsed.compiledTruth,
|
|
222
|
-
timeline: parsed.timeline,
|
|
223
|
-
frontmatter: parsed.frontmatter,
|
|
224
|
-
}, true);
|
|
225
|
-
allSlugs.push(slug);
|
|
226
|
-
} catch (err) {
|
|
227
|
-
writeErrors.push(`${slug}: ${err instanceof Error ? err.message : String(err)}`);
|
|
138
|
+
spinner.start(`[${i + 1}/${totalFiles}] ${file}`);
|
|
228
139
|
}
|
|
229
|
-
}
|
|
230
140
|
|
|
231
|
-
for (let i = 0; i < docFileData.length; i++) {
|
|
232
|
-
const { slug, content, kind, sourceRef, sourceType, mimeType, bytes, metadata, fileName } = docFileData[i]!;
|
|
233
|
-
if (!jsonOut) {
|
|
234
|
-
spinner.update(`Writing pages... ${fileData.length + i + 1}/${fileData.length + docFileData.length}`);
|
|
235
|
-
}
|
|
236
141
|
try {
|
|
237
|
-
const
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
sourceKind: kind,
|
|
244
|
-
sourceMimeType: mimeType,
|
|
245
|
-
sourceBytes: bytes,
|
|
246
|
-
sourceFileName: fileName,
|
|
247
|
-
_contentHash: hash,
|
|
248
|
-
...metadata,
|
|
249
|
-
};
|
|
250
|
-
await repo.putPage({
|
|
251
|
-
slug,
|
|
252
|
-
type,
|
|
253
|
-
title,
|
|
254
|
-
compiledTruth: content,
|
|
255
|
-
timeline: "",
|
|
256
|
-
frontmatter,
|
|
257
|
-
}, true);
|
|
258
|
-
allSlugs.push(slug);
|
|
259
|
-
} catch (err) {
|
|
260
|
-
writeErrors.push(`${slug}: ${err instanceof Error ? err.message : String(err)}`);
|
|
261
|
-
}
|
|
262
|
-
}
|
|
142
|
+
const result = await putFile({
|
|
143
|
+
repo,
|
|
144
|
+
filePath: file,
|
|
145
|
+
embed: false, // defer to embedAll at the end
|
|
146
|
+
entityLinks: !opts.skipEntity,
|
|
147
|
+
});
|
|
263
148
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
|
|
149
|
+
allSlugs.push(result.slug);
|
|
150
|
+
if (result.unchanged) {
|
|
151
|
+
skippedCount++;
|
|
152
|
+
if (!jsonOut) {
|
|
153
|
+
spinner.warn(`[${i + 1}/${totalFiles}] unchanged — skipped: ${result.slug}`);
|
|
154
|
+
}
|
|
155
|
+
} else {
|
|
156
|
+
createdCount++;
|
|
157
|
+
if (!jsonOut) {
|
|
158
|
+
spinner.succeed(`[${i + 1}/${totalFiles}] ${result.slug} (${result.contentLength} chars)`);
|
|
159
|
+
}
|
|
273
160
|
}
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
// Phase 3: Parallel entity extraction
|
|
278
|
-
const BATCH_SIZE = 10;
|
|
279
|
-
const entityResults = new Map<string, EntityRelation[]>();
|
|
280
|
-
|
|
281
|
-
if (settings.llm.baseURL) {
|
|
282
|
-
if (!jsonOut) {
|
|
283
|
-
spinner.start(`Extracting entities with LLM...`);
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
const allPages: Array<{ slug: string; content: string }> = [
|
|
287
|
-
...fileData.map(({ slug, content }) => ({ slug, content })),
|
|
288
|
-
...docFileData.map(({ slug, content }) => ({ slug, content })),
|
|
289
|
-
];
|
|
290
|
-
|
|
291
|
-
for (let i = 0; i < allPages.length; i += BATCH_SIZE) {
|
|
292
|
-
const batch = allPages.slice(i, i + BATCH_SIZE);
|
|
161
|
+
} catch (err) {
|
|
162
|
+
writeErrors.push(`${file}: ${err instanceof Error ? err.message : String(err)}`);
|
|
293
163
|
if (!jsonOut) {
|
|
294
|
-
spinner.
|
|
295
|
-
}
|
|
296
|
-
const batchPromises = batch.map(async ({ slug, content }) => {
|
|
297
|
-
const relations = await extractRelations(content, settings.llm);
|
|
298
|
-
return { slug, relations };
|
|
299
|
-
});
|
|
300
|
-
const results = await Promise.all(batchPromises);
|
|
301
|
-
for (const { slug, relations } of results) {
|
|
302
|
-
entityResults.set(slug, relations);
|
|
164
|
+
spinner.fail(`[${i + 1}/${totalFiles}] error: ${err instanceof Error ? err.message : String(err)}`);
|
|
303
165
|
}
|
|
304
166
|
}
|
|
305
167
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
} else {
|
|
310
|
-
if (!jsonOut) {
|
|
311
|
-
warning(`LLM not configured, skipping entity extraction`);
|
|
168
|
+
// 600ms delay between files
|
|
169
|
+
if (i < totalFiles - 1) {
|
|
170
|
+
await sleep(DELAY_MS);
|
|
312
171
|
}
|
|
313
172
|
}
|
|
314
173
|
|
|
315
|
-
// Phase
|
|
316
|
-
if (!jsonOut) {
|
|
317
|
-
spinner.start(`Creating links, tags, and timeline entries...`);
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
let linkCount = 0;
|
|
321
|
-
let timelineCount = 0;
|
|
322
|
-
let entityCount = 0;
|
|
323
|
-
let tagCount = 0;
|
|
324
|
-
|
|
325
|
-
const allTimelineEntries: Array<{
|
|
326
|
-
pageSlug: string;
|
|
327
|
-
date: string;
|
|
328
|
-
source: string;
|
|
329
|
-
summary: string;
|
|
330
|
-
detail: string;
|
|
331
|
-
}> = [];
|
|
332
|
-
|
|
333
|
-
for (const { slug, wikiLinks, timelineEntries, tags } of fileData) {
|
|
334
|
-
for (const link of wikiLinks) {
|
|
335
|
-
await repo.link(slug, link, "import");
|
|
336
|
-
linkCount++;
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
for (const entry of timelineEntries) {
|
|
340
|
-
allTimelineEntries.push({
|
|
341
|
-
pageSlug: slug,
|
|
342
|
-
date: entry.date,
|
|
343
|
-
source: entry.source,
|
|
344
|
-
summary: entry.summary,
|
|
345
|
-
detail: "",
|
|
346
|
-
});
|
|
347
|
-
timelineCount++;
|
|
348
|
-
}
|
|
349
|
-
|
|
350
|
-
for (const tag of tags) {
|
|
351
|
-
await repo.tag(slug, tag);
|
|
352
|
-
tagCount++;
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
const relations = entityResults.get(slug);
|
|
356
|
-
if (relations && relations.length > 0) {
|
|
357
|
-
const highConfidence = relations.filter(r => r.confidence >= 0.6);
|
|
358
|
-
for (const r of highConfidence) {
|
|
359
|
-
const fromCandidate = entityToSlug(r.from.name, r.from.type);
|
|
360
|
-
const toCandidate = entityToSlug(r.to.name, r.to.type);
|
|
361
|
-
const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
|
|
362
|
-
const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
|
|
363
|
-
|
|
364
|
-
const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, slug);
|
|
365
|
-
const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, slug);
|
|
366
|
-
if (c1) entityCount++;
|
|
367
|
-
if (c2) entityCount++;
|
|
368
|
-
|
|
369
|
-
await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
|
|
370
|
-
await repo.link(slug, fromSlug, `Mentions ${r.from.name}`);
|
|
371
|
-
await repo.link(slug, toSlug, `Mentions ${r.to.name}`);
|
|
372
|
-
linkCount += 3;
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
for (const { slug } of docFileData) {
|
|
378
|
-
const relations = entityResults.get(slug);
|
|
379
|
-
if (relations && relations.length > 0) {
|
|
380
|
-
const highConfidence = relations.filter(r => r.confidence >= 0.6);
|
|
381
|
-
for (const r of highConfidence) {
|
|
382
|
-
const fromCandidate = entityToSlug(r.from.name, r.from.type);
|
|
383
|
-
const toCandidate = entityToSlug(r.to.name, r.to.type);
|
|
384
|
-
const fromSlug = await repo.findSimilarSlug(fromCandidate, r.from.name);
|
|
385
|
-
const toSlug = await repo.findSimilarSlug(toCandidate, r.to.name);
|
|
386
|
-
|
|
387
|
-
const c1 = await repo.ensureEntityPage(fromSlug, r.from.type, r.from.name, r.relation, r.context, slug);
|
|
388
|
-
const c2 = await repo.ensureEntityPage(toSlug, r.to.type, r.to.name, r.relation, r.context, slug);
|
|
389
|
-
if (c1) entityCount++;
|
|
390
|
-
if (c2) entityCount++;
|
|
391
|
-
|
|
392
|
-
await repo.link(fromSlug, toSlug, `[${r.relation}] ${r.context}`);
|
|
393
|
-
await repo.link(slug, fromSlug, `Mentions ${r.from.name}`);
|
|
394
|
-
await repo.link(slug, toSlug, `Mentions ${r.to.name}`);
|
|
395
|
-
linkCount += 3;
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
for (const { slug, kind, fileName } of docFileData) {
|
|
401
|
-
allTimelineEntries.push({
|
|
402
|
-
pageSlug: slug,
|
|
403
|
-
date: new Date().toISOString().slice(0, 10),
|
|
404
|
-
source: "import",
|
|
405
|
-
summary: `Ingested ${kind}: ${fileName}`,
|
|
406
|
-
detail: "",
|
|
407
|
-
});
|
|
408
|
-
timelineCount++;
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
if (allTimelineEntries.length > 0) {
|
|
412
|
-
await repo.timelineAddBatch(allTimelineEntries);
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
if (!jsonOut) {
|
|
416
|
-
spinner.succeed(`Created links, tags, and timeline`);
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
// Phase 5: Batch sync all pages to search index
|
|
174
|
+
// Phase 3: Search indexing
|
|
420
175
|
if (opts.skipIndex) {
|
|
421
176
|
if (!jsonOut) {
|
|
422
177
|
success(`Skipping vector indexing (--skip-index)`);
|
|
423
178
|
}
|
|
424
|
-
} else {
|
|
179
|
+
} else if (allSlugs.length > 0) {
|
|
425
180
|
if (!jsonOut) {
|
|
426
181
|
spinner.start(`Indexing ${allSlugs.length} pages for search...`);
|
|
427
182
|
}
|
|
428
183
|
await repo.embedAll();
|
|
429
|
-
|
|
430
184
|
if (!jsonOut) {
|
|
431
185
|
spinner.succeed(`Search indexing complete`);
|
|
432
186
|
}
|
|
@@ -436,28 +190,29 @@ Examples:
|
|
|
436
190
|
|
|
437
191
|
if (!jsonOut) {
|
|
438
192
|
header("Import Summary");
|
|
439
|
-
keyValue("
|
|
440
|
-
keyValue("
|
|
441
|
-
keyValue("Pages
|
|
442
|
-
keyValue("Entities extracted", String(entityCount));
|
|
443
|
-
keyValue("Links created", String(linkCount));
|
|
444
|
-
keyValue("Timeline entries", String(timelineCount));
|
|
445
|
-
keyValue("Tags added", String(tagCount));
|
|
193
|
+
keyValue("Total files", String(totalFiles));
|
|
194
|
+
keyValue("Pages created", String(createdCount));
|
|
195
|
+
keyValue("Pages skipped (unchanged)", String(skippedCount));
|
|
446
196
|
keyValue("Duration", duration);
|
|
447
|
-
|
|
448
197
|
if (writeErrors.length > 0) {
|
|
449
|
-
warning(`${writeErrors.length}
|
|
198
|
+
warning(`${writeErrors.length} errors`);
|
|
199
|
+
for (const e of writeErrors.slice(0, 3)) {
|
|
200
|
+
subItem(e);
|
|
201
|
+
}
|
|
202
|
+
if (writeErrors.length > 3) {
|
|
203
|
+
subItem(`... and ${writeErrors.length - 3} more`);
|
|
204
|
+
}
|
|
450
205
|
}
|
|
451
206
|
}
|
|
452
207
|
|
|
453
208
|
print(program, {
|
|
454
209
|
ok: true,
|
|
455
|
-
|
|
456
|
-
|
|
210
|
+
totalFiles,
|
|
211
|
+
created: createdCount,
|
|
212
|
+
skipped: skippedCount,
|
|
213
|
+
errors: writeErrors.length,
|
|
457
214
|
pages: allSlugs.length,
|
|
458
|
-
|
|
459
|
-
timelineEntries: timelineCount,
|
|
460
|
-
entities: entityCount,
|
|
215
|
+
duration,
|
|
461
216
|
});
|
|
462
217
|
});
|
|
463
218
|
});
|
|
package/src/commands/import-put.ts
ADDED
@@ -0,0 +1,180 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared single-file put logic used by both `ebrain put --file` and
|
|
3
|
+
* `ebrain import`. Import calls this function serially with a 600 ms
|
|
4
|
+
* delay between files; `put` calls it once per invocation.
|
|
5
|
+
*/
|
|
6
|
+
import { basename, dirname, extname, resolve } from "node:path";
|
|
7
|
+
import { loadDocument, detectKind, type DocumentKind } from "../markdown/document-loader";
|
|
8
|
+
import { pathToSlug, readTextFile } from "../markdown/io";
|
|
9
|
+
import { parsePageMarkdown } from "../markdown/parser";
|
|
10
|
+
import { BrainRepository } from "../repositories/brain-repo";
|
|
11
|
+
import { contentHash } from "./shared";
|
|
12
|
+
import { applyEntityLinks } from "./entity-links";
|
|
13
|
+
import { inferTypeFromSlug, normalizeLongSlug, slugify, slugToTitle } from "../slug-utils";
|
|
14
|
+
|
|
15
|
+
/* ------------------------------------------------------------------ */
|
|
16
|
+
/* Types */
|
|
17
|
+
/* ------------------------------------------------------------------ */
|
|
18
|
+
|
|
19
|
+
export interface PutFileResult {
|
|
20
|
+
/** Final slug of the page */
|
|
21
|
+
slug: string;
|
|
22
|
+
/** Content length in characters */
|
|
23
|
+
contentLength: number;
|
|
24
|
+
/** Content hash (first 16 chars of SHA-256) */
|
|
25
|
+
contentHash: string;
|
|
26
|
+
/** Whether the page was unchanged and skipped */
|
|
27
|
+
unchanged: boolean;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export interface PutFileOptions {
|
|
31
|
+
repo: BrainRepository;
|
|
32
|
+
/** Absolute path to the file */
|
|
33
|
+
filePath: string;
|
|
34
|
+
/** Explicit slug override */
|
|
35
|
+
slug?: string;
|
|
36
|
+
/** Type override (e.g. "person", "note") */
|
|
37
|
+
type?: string;
|
|
38
|
+
/** Title override */
|
|
39
|
+
title?: string;
|
|
40
|
+
/** Force document kind (only for non-md files) */
|
|
41
|
+
format?: DocumentKind;
|
|
42
|
+
/** Maximum bytes for file ingest (default 50 MB) */
|
|
43
|
+
maxBytes?: number;
|
|
44
|
+
/** Fetch timeout for URLs in ms (default 30 000) */
|
|
45
|
+
timeout?: number;
|
|
46
|
+
/** Whether to run entity extraction (default true) */
|
|
47
|
+
entityLinks?: boolean;
|
|
48
|
+
/** Whether to embed in search index (default true) */
|
|
49
|
+
embed?: boolean;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/* ------------------------------------------------------------------ */
|
|
53
|
+
/* Helpers */
|
|
54
|
+
/* ------------------------------------------------------------------ */
|
|
55
|
+
|
|
56
|
+
const DOC_EXTENSIONS = new Set([
|
|
57
|
+
"pdf", "docx", "doc", "html", "htm", "json", "txt", "text",
|
|
58
|
+
]);
|
|
59
|
+
|
|
60
|
+
function isDocumentFile(filePath: string, forceKind?: string): boolean {
|
|
61
|
+
if (forceKind && forceKind !== "markdown") return true;
|
|
62
|
+
const ext = extname(filePath).toLowerCase().replace(/^\./, "");
|
|
63
|
+
return DOC_EXTENSIONS.has(ext);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/* ------------------------------------------------------------------ */
|
|
67
|
+
/* Core: put a single file */
|
|
68
|
+
/* ------------------------------------------------------------------ */
|
|
69
|
+
|
|
70
|
+
export async function putFile(opts: PutFileOptions): Promise<PutFileResult> {
|
|
71
|
+
const {
|
|
72
|
+
repo,
|
|
73
|
+
filePath,
|
|
74
|
+
type: typeOverride,
|
|
75
|
+
title: titleOverride,
|
|
76
|
+
format,
|
|
77
|
+
maxBytes,
|
|
78
|
+
timeout,
|
|
79
|
+
entityLinks = true,
|
|
80
|
+
embed = true,
|
|
81
|
+
} = opts;
|
|
82
|
+
|
|
83
|
+
const isDoc = isDocumentFile(filePath, format);
|
|
84
|
+
|
|
85
|
+
// ── Branch 1: document file (pdf/docx/html/txt/json) ──
|
|
86
|
+
if (isDoc) {
|
|
87
|
+
const loaded = await loadDocument(filePath, {
|
|
88
|
+
forceKind: format,
|
|
89
|
+
fetchTimeoutMs: timeout,
|
|
90
|
+
maxBytes,
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
const { text: content, kind, fileName, source: sourceRef, sourceType, mimeType, bytes, metadata } = loaded;
|
|
94
|
+
let finalSlug = opts.slug;
|
|
95
|
+
if (!finalSlug) {
|
|
96
|
+
const nameNoExt = fileName.replace(/\.[^.]+$/, "");
|
|
97
|
+
finalSlug = `ingest/${normalizeLongSlug(slugify(nameNoExt))}`;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const type = typeOverride ?? kind;
|
|
101
|
+
const title = titleOverride ?? String(slugToTitle(finalSlug));
|
|
102
|
+
const hash = contentHash(content);
|
|
103
|
+
|
|
104
|
+
// Idempotency check
|
|
105
|
+
const existingPage = await repo.getPage(finalSlug);
|
|
106
|
+
const existingHash = (existingPage?.frontmatter?._contentHash) as string | undefined;
|
|
107
|
+
if (existingHash === hash) {
|
|
108
|
+
await repo.syncTagsFromFrontmatter(finalSlug, {
|
|
109
|
+
_contentHash: hash,
|
|
110
|
+
sourceFile: sourceRef,
|
|
111
|
+
sourceType,
|
|
112
|
+
sourceKind: kind,
|
|
113
|
+
sourceMimeType: mimeType,
|
|
114
|
+
sourceBytes: bytes,
|
|
115
|
+
sourceFileName: fileName,
|
|
116
|
+
...metadata,
|
|
117
|
+
});
|
|
118
|
+
return { slug: finalSlug, contentLength: content.length, contentHash: hash, unchanged: true };
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const frontmatter: Record<string, unknown> = {
|
|
122
|
+
sourceFile: sourceRef,
|
|
123
|
+
sourceType,
|
|
124
|
+
sourceKind: kind,
|
|
125
|
+
sourceMimeType: mimeType,
|
|
126
|
+
sourceBytes: bytes,
|
|
127
|
+
sourceFileName: fileName,
|
|
128
|
+
_contentHash: hash,
|
|
129
|
+
...metadata,
|
|
130
|
+
};
|
|
131
|
+
|
|
132
|
+
await repo.putPage({ slug: finalSlug, type, title, compiledTruth: content, timeline: "", frontmatter }, embed);
|
|
133
|
+
|
|
134
|
+
if (entityLinks) {
|
|
135
|
+
await applyEntityLinks(repo, finalSlug, content, true);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
return { slug: finalSlug, contentLength: content.length, contentHash: hash, unchanged: false };
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// ── Branch 2: markdown ──
|
|
142
|
+
const content = await readTextFile(filePath);
|
|
143
|
+
const parsed = parsePageMarkdown(content);
|
|
144
|
+
|
|
145
|
+
let finalSlug = opts.slug;
|
|
146
|
+
if (!finalSlug) {
|
|
147
|
+
finalSlug = normalizeLongSlug(slugify(basename(filePath).replace(/\.md$/i, "")));
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
const type = typeOverride ?? String(parsed.frontmatter.type ?? inferTypeFromSlug(finalSlug));
|
|
151
|
+
const title = titleOverride ?? String(parsed.frontmatter.title ?? slugToTitle(finalSlug));
|
|
152
|
+
const hash = contentHash(parsed.compiledTruth);
|
|
153
|
+
|
|
154
|
+
// Idempotency check
|
|
155
|
+
const existingPage = await repo.getPage(finalSlug);
|
|
156
|
+
const existingHash = (existingPage?.frontmatter?._contentHash) as string | undefined;
|
|
157
|
+
if (existingHash === hash) {
|
|
158
|
+
await repo.syncTagsFromFrontmatter(finalSlug, parsed.frontmatter);
|
|
159
|
+
return { slug: finalSlug, contentLength: parsed.compiledTruth.length, contentHash: hash, unchanged: true };
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
parsed.frontmatter._contentHash = hash;
|
|
163
|
+
|
|
164
|
+
await repo.putPage({
|
|
165
|
+
slug: finalSlug,
|
|
166
|
+
type,
|
|
167
|
+
title,
|
|
168
|
+
compiledTruth: parsed.compiledTruth,
|
|
169
|
+
timeline: parsed.timeline,
|
|
170
|
+
frontmatter: parsed.frontmatter,
|
|
171
|
+
}, embed);
|
|
172
|
+
|
|
173
|
+
await repo.syncTagsFromFrontmatter(finalSlug, parsed.frontmatter);
|
|
174
|
+
|
|
175
|
+
if (entityLinks) {
|
|
176
|
+
await applyEntityLinks(repo, finalSlug, parsed.compiledTruth, true);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
return { slug: finalSlug, contentLength: parsed.compiledTruth.length, contentHash: hash, unchanged: false };
|
|
180
|
+
}
|