@fs/mycroft 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -0
- package/dist/batch-embedder-6IIWAZPW.js +14 -0
- package/dist/batch-embedder-6IIWAZPW.js.map +1 -0
- package/dist/batch-embedder-7DGZAQKL.js +14 -0
- package/dist/batch-embedder-7DGZAQKL.js.map +1 -0
- package/dist/batch-embedder-IZDBS3IL.js +13 -0
- package/dist/batch-embedder-IZDBS3IL.js.map +1 -0
- package/dist/batch-embedder-LYCZDYI4.js +15 -0
- package/dist/batch-embedder-LYCZDYI4.js.map +1 -0
- package/dist/batch-embedder-RHKD2OJD.js +14 -0
- package/dist/batch-embedder-RHKD2OJD.js.map +1 -0
- package/dist/batch-embedder-VQZUI7R6.js +14 -0
- package/dist/batch-embedder-VQZUI7R6.js.map +1 -0
- package/dist/batch-embedder-ZJZLNLOK.js +14 -0
- package/dist/batch-embedder-ZJZLNLOK.js.map +1 -0
- package/dist/batch-summarizer-7MCT4HJB.js +14 -0
- package/dist/batch-summarizer-7MCT4HJB.js.map +1 -0
- package/dist/batch-summarizer-BMIBVFAE.js +14 -0
- package/dist/batch-summarizer-BMIBVFAE.js.map +1 -0
- package/dist/chunk-35EO53CC.js +8058 -0
- package/dist/chunk-35EO53CC.js.map +1 -0
- package/dist/chunk-57ZGGKEF.js +8060 -0
- package/dist/chunk-57ZGGKEF.js.map +1 -0
- package/dist/chunk-6DLQHHCC.js +249 -0
- package/dist/chunk-6DLQHHCC.js.map +1 -0
- package/dist/chunk-7CO4PMU5.js +92 -0
- package/dist/chunk-7CO4PMU5.js.map +1 -0
- package/dist/chunk-7DUQNGEK.js +253 -0
- package/dist/chunk-7DUQNGEK.js.map +1 -0
- package/dist/chunk-7IPX4MKA.js +4637 -0
- package/dist/chunk-7IPX4MKA.js.map +1 -0
- package/dist/chunk-7NLMBXXY.js +6438 -0
- package/dist/chunk-7NLMBXXY.js.map +1 -0
- package/dist/chunk-BR2PM6D3.js +11047 -0
- package/dist/chunk-BR2PM6D3.js.map +1 -0
- package/dist/chunk-KGG7WEYE.js +162 -0
- package/dist/chunk-KGG7WEYE.js.map +1 -0
- package/dist/chunk-QRDUQX63.js +256 -0
- package/dist/chunk-QRDUQX63.js.map +1 -0
- package/dist/chunk-R3FOJK5A.js +2088 -0
- package/dist/chunk-R3FOJK5A.js.map +1 -0
- package/dist/chunk-XXO66RCF.js +94 -0
- package/dist/chunk-XXO66RCF.js.map +1 -0
- package/dist/cli.js +638 -179
- package/dist/cli.js.map +1 -1
- package/dist/fileFromPath-FLANAQWT.js +128 -0
- package/dist/fileFromPath-FLANAQWT.js.map +1 -0
- package/dist/main-36PRDAPE.js +1857 -0
- package/dist/main-36PRDAPE.js.map +1 -0
- package/dist/main-B3QJZGLU.js +1859 -0
- package/dist/main-B3QJZGLU.js.map +1 -0
- package/package.json +7 -1
package/dist/cli.js
CHANGED
|
@@ -1,154 +1,43 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
submitBatchEmbeddings
|
|
4
|
+
} from "./chunk-XXO66RCF.js";
|
|
5
|
+
import {
|
|
6
|
+
submitBatchSummaries
|
|
7
|
+
} from "./chunk-7DUQNGEK.js";
|
|
8
|
+
import {
|
|
9
|
+
CHUNK_OVERLAP,
|
|
10
|
+
CHUNK_SIZE,
|
|
11
|
+
SEPARATORS,
|
|
12
|
+
SUMMARY_CONCURRENCY,
|
|
13
|
+
SUMMARY_MAX_TOKENS,
|
|
14
|
+
SUMMARY_TARGET_WORDS,
|
|
15
|
+
configPath,
|
|
16
|
+
ensureConfigDirs,
|
|
17
|
+
ensureDataDirs,
|
|
18
|
+
getModels,
|
|
19
|
+
handleSigint,
|
|
20
|
+
isAskEnabled,
|
|
21
|
+
isInteractive,
|
|
22
|
+
loadConfig,
|
|
23
|
+
logInfo,
|
|
24
|
+
logWarn,
|
|
25
|
+
printError,
|
|
26
|
+
requireOpenAIKey,
|
|
27
|
+
resolvePaths,
|
|
28
|
+
setConfigOverrides,
|
|
29
|
+
stdout
|
|
30
|
+
} from "./chunk-KGG7WEYE.js";
|
|
2
31
|
|
|
3
32
|
// src/cli.ts
|
|
4
33
|
import { Command } from "commander";
|
|
5
|
-
|
|
6
|
-
// src/config.ts
|
|
7
|
-
import { mkdir, readFile } from "fs/promises";
|
|
8
|
-
import { homedir } from "os";
|
|
9
|
-
import { dirname, join, resolve } from "path";
|
|
10
|
-
var DEFAULT_CONFIG = {
|
|
11
|
-
dataDir: "~/.local/share/mycroft",
|
|
12
|
-
askEnabled: true,
|
|
13
|
-
models: {
|
|
14
|
-
embedding: "text-embedding-3-small",
|
|
15
|
-
summary: "gpt-5-nano",
|
|
16
|
-
chat: "gpt-5.1"
|
|
17
|
-
}
|
|
18
|
-
};
|
|
19
|
-
var expandHome = (input) => {
|
|
20
|
-
if (!input.startsWith("~")) return input;
|
|
21
|
-
return join(homedir(), input.slice(1));
|
|
22
|
-
};
|
|
23
|
-
var resolvePath = (input) => resolve(expandHome(input));
|
|
24
|
-
var getConfigPath = () => {
|
|
25
|
-
const override = process.env.MYCROFT_CONFIG;
|
|
26
|
-
if (override) return resolvePath(override);
|
|
27
|
-
return resolvePath("~/.config/mycroft/config.json");
|
|
28
|
-
};
|
|
29
|
-
var normalizeModels = (models) => ({
|
|
30
|
-
embedding: models?.embedding || DEFAULT_CONFIG.models.embedding,
|
|
31
|
-
summary: models?.summary || DEFAULT_CONFIG.models.summary,
|
|
32
|
-
chat: models?.chat || DEFAULT_CONFIG.models.chat
|
|
33
|
-
});
|
|
34
|
-
var overrides = {};
|
|
35
|
-
var setConfigOverrides = (next) => {
|
|
36
|
-
overrides = { ...overrides, ...next };
|
|
37
|
-
};
|
|
38
|
-
var normalizeConfig = (input) => {
|
|
39
|
-
const dataDirEnv = process.env.MYCROFT_DATA_DIR;
|
|
40
|
-
const dataDir = overrides.dataDir || dataDirEnv || input?.dataDir || DEFAULT_CONFIG.dataDir;
|
|
41
|
-
return {
|
|
42
|
-
dataDir,
|
|
43
|
-
askEnabled: input?.askEnabled ?? DEFAULT_CONFIG.askEnabled,
|
|
44
|
-
models: normalizeModels(input?.models)
|
|
45
|
-
};
|
|
46
|
-
};
|
|
47
|
-
var readConfigFile = async (path) => {
|
|
48
|
-
try {
|
|
49
|
-
const contents = await readFile(path, "utf-8");
|
|
50
|
-
return JSON.parse(contents);
|
|
51
|
-
} catch {
|
|
52
|
-
return null;
|
|
53
|
-
}
|
|
54
|
-
};
|
|
55
|
-
var loadConfig = async () => {
|
|
56
|
-
const configPath2 = getConfigPath();
|
|
57
|
-
const data = await readConfigFile(configPath2);
|
|
58
|
-
const normalized = normalizeConfig(data);
|
|
59
|
-
return {
|
|
60
|
-
...normalized,
|
|
61
|
-
dataDir: resolvePath(normalized.dataDir)
|
|
62
|
-
};
|
|
63
|
-
};
|
|
64
|
-
var ensureConfigDirs = async (configPath2) => {
|
|
65
|
-
const path = configPath2 || getConfigPath();
|
|
66
|
-
await mkdir(dirname(path), { recursive: true });
|
|
67
|
-
};
|
|
68
|
-
var configPath = () => getConfigPath();
|
|
69
|
-
|
|
70
|
-
// src/commands/io.ts
|
|
71
|
-
import chalk from "chalk";
|
|
72
|
-
var isTTY = () => Boolean(process.stdout.isTTY);
|
|
73
|
-
var isInteractive = () => Boolean(process.stdin.isTTY && process.stdout.isTTY);
|
|
74
|
-
var formatError = (text) => isTTY() ? chalk.red(text) : text;
|
|
75
|
-
var formatWarn = (text) => isTTY() ? chalk.yellow(text) : text;
|
|
76
|
-
var stdout = (message) => {
|
|
77
|
-
process.stdout.write(message.endsWith("\n") ? message : `${message}
|
|
78
|
-
`);
|
|
79
|
-
};
|
|
80
|
-
var stderr = (message) => {
|
|
81
|
-
process.stderr.write(message.endsWith("\n") ? message : `${message}
|
|
82
|
-
`);
|
|
83
|
-
};
|
|
84
|
-
var printError = (message) => {
|
|
85
|
-
stderr(formatError(`Error: ${message}`));
|
|
86
|
-
};
|
|
87
|
-
var logInfo = (message) => {
|
|
88
|
-
stderr(message);
|
|
89
|
-
};
|
|
90
|
-
var logWarn = (message) => {
|
|
91
|
-
stderr(formatWarn(message));
|
|
92
|
-
};
|
|
93
|
-
var handleSigint = (onCancel) => {
|
|
94
|
-
const handler = () => {
|
|
95
|
-
if (onCancel) onCancel();
|
|
96
|
-
stderr("\nCancelled.");
|
|
97
|
-
process.exit(130);
|
|
98
|
-
};
|
|
99
|
-
process.once("SIGINT", handler);
|
|
100
|
-
return () => process.off("SIGINT", handler);
|
|
101
|
-
};
|
|
102
|
-
|
|
103
|
-
// src/cli.ts
|
|
104
34
|
import { readFile as readFile2 } from "fs/promises";
|
|
105
|
-
import { dirname
|
|
35
|
+
import { dirname, resolve } from "path";
|
|
106
36
|
import { fileURLToPath } from "url";
|
|
107
37
|
|
|
108
38
|
// src/services/epub-parser.ts
|
|
109
39
|
import { initEpubFile } from "@lingo-reader/epub-parser";
|
|
110
40
|
import { basename } from "path";
|
|
111
|
-
|
|
112
|
-
// src/services/constants.ts
|
|
113
|
-
import { mkdir as mkdir2 } from "fs/promises";
|
|
114
|
-
var CHUNK_SIZE = 1e3;
|
|
115
|
-
var CHUNK_OVERLAP = 100;
|
|
116
|
-
var SEPARATORS = ["\n\n", "\n", ". ", " ", ""];
|
|
117
|
-
var SUMMARY_MAX_TOKENS = 3e4;
|
|
118
|
-
var SUMMARY_CONCURRENCY = 3;
|
|
119
|
-
var SUMMARY_TARGET_WORDS = 250;
|
|
120
|
-
var resolvePaths = async () => {
|
|
121
|
-
const config = await loadConfig();
|
|
122
|
-
const dataDir = config.dataDir;
|
|
123
|
-
return {
|
|
124
|
-
dataDir,
|
|
125
|
-
booksDir: `${dataDir}/books`,
|
|
126
|
-
vectorsDir: `${dataDir}/vectors`,
|
|
127
|
-
dbPath: `${dataDir}/metadata.db`
|
|
128
|
-
};
|
|
129
|
-
};
|
|
130
|
-
var ensureDataDirs = async () => {
|
|
131
|
-
const paths = await resolvePaths();
|
|
132
|
-
await mkdir2(paths.dataDir, { recursive: true });
|
|
133
|
-
await mkdir2(paths.booksDir, { recursive: true });
|
|
134
|
-
await mkdir2(paths.vectorsDir, { recursive: true });
|
|
135
|
-
return paths;
|
|
136
|
-
};
|
|
137
|
-
var getModels = async () => {
|
|
138
|
-
const config = await loadConfig();
|
|
139
|
-
return config.models;
|
|
140
|
-
};
|
|
141
|
-
var isAskEnabled = async () => {
|
|
142
|
-
const config = await loadConfig();
|
|
143
|
-
return config.askEnabled;
|
|
144
|
-
};
|
|
145
|
-
var requireOpenAIKey = () => {
|
|
146
|
-
if (!process.env.OPENAI_API_KEY) {
|
|
147
|
-
throw new Error("OPENAI_API_KEY is not set. Export it to use embeddings and chat.");
|
|
148
|
-
}
|
|
149
|
-
};
|
|
150
|
-
|
|
151
|
-
// src/services/epub-parser.ts
|
|
152
41
|
var detectNarrativeBoundaries = (chapterTitles) => {
|
|
153
42
|
const frontMatterPattern = /^(about|contents|table of contents|dedication|preface|foreword|title|half.?title|copyright|epigraph|frontispiece|map)/i;
|
|
154
43
|
const backMatterPattern = /^(acknowledgment|afterword|appendix|glossary|index|bibliography|about the author|also by|praise|copyright page|notes|bonus|preview|excerpt|major characters|locations)/i;
|
|
@@ -264,7 +153,7 @@ var parseEpub = async (epubPath, resourceSaveDir) => {
|
|
|
264
153
|
|
|
265
154
|
// src/services/ingest.ts
|
|
266
155
|
import { randomUUID } from "crypto";
|
|
267
|
-
import { mkdir
|
|
156
|
+
import { mkdir, unlink, copyFile, readFile, writeFile } from "fs/promises";
|
|
268
157
|
|
|
269
158
|
// src/services/chunker.ts
|
|
270
159
|
var splitRecursive = (text, separators) => {
|
|
@@ -338,7 +227,7 @@ import { embedMany } from "ai";
|
|
|
338
227
|
import { openai } from "@ai-sdk/openai";
|
|
339
228
|
var MAX_TOKENS_PER_BATCH = 25e4;
|
|
340
229
|
var CHARS_PER_TOKEN = 4;
|
|
341
|
-
var embedChunks = async (chunks) => {
|
|
230
|
+
var embedChunks = async (chunks, options) => {
|
|
342
231
|
if (chunks.length === 0) return [];
|
|
343
232
|
const batches = [];
|
|
344
233
|
let currentBatch = [];
|
|
@@ -367,10 +256,23 @@ var embedChunks = async (chunks) => {
|
|
|
367
256
|
model: openai.embeddingModel(models.embedding),
|
|
368
257
|
values: batch.map((chunk) => chunk.content)
|
|
369
258
|
});
|
|
259
|
+
const embeddedBatch = [];
|
|
370
260
|
for (let j = 0; j < batch.length; j++) {
|
|
371
|
-
|
|
261
|
+
const embeddedChunk = {
|
|
372
262
|
...batch[j],
|
|
373
263
|
vector: embeddings[j] ?? []
|
|
264
|
+
};
|
|
265
|
+
embeddedBatch.push(embeddedChunk);
|
|
266
|
+
allEmbedded.push({
|
|
267
|
+
...embeddedChunk
|
|
268
|
+
});
|
|
269
|
+
}
|
|
270
|
+
if (options?.onBatch) {
|
|
271
|
+
await options.onBatch(embeddedBatch, {
|
|
272
|
+
batchIndex: i + 1,
|
|
273
|
+
batchCount: batches.length,
|
|
274
|
+
completed: allEmbedded.length,
|
|
275
|
+
total: chunks.length
|
|
374
276
|
});
|
|
375
277
|
}
|
|
376
278
|
}
|
|
@@ -485,8 +387,7 @@ var summarizeSection = async (text, title, sectionNum) => {
|
|
|
485
387
|
model: openai2(models.summary),
|
|
486
388
|
prompt: `Summarize this section from chapter "${title}" (Part ${sectionNum}). Focus on key events, characters, and revelations. Keep it concise (100-150 words):
|
|
487
389
|
|
|
488
|
-
${text}
|
|
489
|
-
temperature: 0.3
|
|
390
|
+
${text}`
|
|
490
391
|
});
|
|
491
392
|
return summary;
|
|
492
393
|
};
|
|
@@ -495,8 +396,7 @@ var generateStructuredSummary = async (content, title, chapterIndex) => {
|
|
|
495
396
|
const models = await getModels();
|
|
496
397
|
const { text } = await generateText({
|
|
497
398
|
model: openai2(models.summary),
|
|
498
|
-
prompt: SUMMARY_PROMPT(title, chapterIndex + 1, content)
|
|
499
|
-
temperature: 0.3
|
|
399
|
+
prompt: SUMMARY_PROMPT(title, chapterIndex + 1, content)
|
|
500
400
|
});
|
|
501
401
|
let jsonText = text.trim();
|
|
502
402
|
if (jsonText.startsWith("```json")) {
|
|
@@ -550,7 +450,9 @@ var summarizeChapter = async (chapter, chapterIndex) => {
|
|
|
550
450
|
};
|
|
551
451
|
var summarizeAllChapters = async (chapters) => {
|
|
552
452
|
const summaries = [];
|
|
553
|
-
logInfo(
|
|
453
|
+
logInfo(
|
|
454
|
+
`[Summarizer] Starting summarization of ${chapters.length} chapters (concurrency: ${SUMMARY_CONCURRENCY})`
|
|
455
|
+
);
|
|
554
456
|
for (let i = 0; i < chapters.length; i += SUMMARY_CONCURRENCY) {
|
|
555
457
|
const batch = chapters.slice(i, i + SUMMARY_CONCURRENCY);
|
|
556
458
|
const batchPromises = batch.map((chapter, batchIndex) => summarizeChapter(chapter, i + batchIndex));
|
|
@@ -623,6 +525,14 @@ var createDb = async () => {
|
|
|
623
525
|
ensureColumn("summaries", "summaries TEXT");
|
|
624
526
|
ensureColumn("narrative_start_index", "narrative_start_index INTEGER DEFAULT 0");
|
|
625
527
|
ensureColumn("narrative_end_index", "narrative_end_index INTEGER");
|
|
528
|
+
ensureColumn("batch_id", "batch_id TEXT");
|
|
529
|
+
ensureColumn("batch_file_id", "batch_file_id TEXT");
|
|
530
|
+
ensureColumn("batch_chunks", "batch_chunks TEXT");
|
|
531
|
+
ensureColumn("ingest_state", "ingest_state TEXT");
|
|
532
|
+
ensureColumn("ingest_resume_path", "ingest_resume_path TEXT");
|
|
533
|
+
ensureColumn("summary_batch_id", "summary_batch_id TEXT");
|
|
534
|
+
ensureColumn("summary_batch_file_id", "summary_batch_file_id TEXT");
|
|
535
|
+
ensureColumn("summary_batch_chapters", "summary_batch_chapters TEXT");
|
|
626
536
|
return db;
|
|
627
537
|
};
|
|
628
538
|
|
|
@@ -639,7 +549,13 @@ var mapRow = (row) => ({
|
|
|
639
549
|
chapters: row.chapters ? JSON.parse(row.chapters) : [],
|
|
640
550
|
progressChapter: row.progress_chapter ?? null,
|
|
641
551
|
narrativeStartIndex: row.narrative_start_index ?? null,
|
|
642
|
-
narrativeEndIndex: row.narrative_end_index ?? null
|
|
552
|
+
narrativeEndIndex: row.narrative_end_index ?? null,
|
|
553
|
+
batchId: row.batch_id ?? null,
|
|
554
|
+
batchFileId: row.batch_file_id ?? null,
|
|
555
|
+
ingestState: row.ingest_state ?? null,
|
|
556
|
+
ingestResumePath: row.ingest_resume_path ?? null,
|
|
557
|
+
summaryBatchId: row.summary_batch_id ?? null,
|
|
558
|
+
summaryBatchFileId: row.summary_batch_file_id ?? null
|
|
643
559
|
});
|
|
644
560
|
var dbPromise = null;
|
|
645
561
|
var getDb = async () => {
|
|
@@ -715,6 +631,38 @@ var updateBook = async (id, updates) => {
|
|
|
715
631
|
fields.push("narrative_end_index = @narrativeEndIndex");
|
|
716
632
|
params.narrativeEndIndex = updates.narrativeEndIndex;
|
|
717
633
|
}
|
|
634
|
+
if (updates.batchId !== void 0) {
|
|
635
|
+
fields.push("batch_id = @batchId");
|
|
636
|
+
params.batchId = updates.batchId;
|
|
637
|
+
}
|
|
638
|
+
if (updates.batchFileId !== void 0) {
|
|
639
|
+
fields.push("batch_file_id = @batchFileId");
|
|
640
|
+
params.batchFileId = updates.batchFileId;
|
|
641
|
+
}
|
|
642
|
+
if (updates.batchChunks !== void 0) {
|
|
643
|
+
fields.push("batch_chunks = @batchChunks");
|
|
644
|
+
params.batchChunks = updates.batchChunks;
|
|
645
|
+
}
|
|
646
|
+
if (updates.ingestState !== void 0) {
|
|
647
|
+
fields.push("ingest_state = @ingestState");
|
|
648
|
+
params.ingestState = updates.ingestState;
|
|
649
|
+
}
|
|
650
|
+
if (updates.ingestResumePath !== void 0) {
|
|
651
|
+
fields.push("ingest_resume_path = @ingestResumePath");
|
|
652
|
+
params.ingestResumePath = updates.ingestResumePath;
|
|
653
|
+
}
|
|
654
|
+
if (updates.summaryBatchId !== void 0) {
|
|
655
|
+
fields.push("summary_batch_id = @summaryBatchId");
|
|
656
|
+
params.summaryBatchId = updates.summaryBatchId;
|
|
657
|
+
}
|
|
658
|
+
if (updates.summaryBatchFileId !== void 0) {
|
|
659
|
+
fields.push("summary_batch_file_id = @summaryBatchFileId");
|
|
660
|
+
params.summaryBatchFileId = updates.summaryBatchFileId;
|
|
661
|
+
}
|
|
662
|
+
if (updates.summaryBatchChapters !== void 0) {
|
|
663
|
+
fields.push("summary_batch_chapters = @summaryBatchChapters");
|
|
664
|
+
params.summaryBatchChapters = updates.summaryBatchChapters;
|
|
665
|
+
}
|
|
718
666
|
if (fields.length === 0) return;
|
|
719
667
|
const db = await getDb();
|
|
720
668
|
db.prepare(`UPDATE books SET ${fields.join(", ")} WHERE id = @id`).run(params);
|
|
@@ -729,6 +677,16 @@ var getBook = async (id) => {
|
|
|
729
677
|
const row = db.prepare("SELECT * FROM books WHERE id = ?").get(id);
|
|
730
678
|
return row ? mapRow(row) : null;
|
|
731
679
|
};
|
|
680
|
+
var getBookBatchChunks = async (id) => {
|
|
681
|
+
const db = await getDb();
|
|
682
|
+
const row = db.prepare("SELECT batch_chunks FROM books WHERE id = ?").get(id);
|
|
683
|
+
return row?.batch_chunks ?? null;
|
|
684
|
+
};
|
|
685
|
+
var getBookSummaryBatchChapters = async (id) => {
|
|
686
|
+
const db = await getDb();
|
|
687
|
+
const row = db.prepare("SELECT summary_batch_chapters FROM books WHERE id = ?").get(id);
|
|
688
|
+
return row?.summary_batch_chapters ?? null;
|
|
689
|
+
};
|
|
732
690
|
var deleteBook = async (id) => {
|
|
733
691
|
const db = await getDb();
|
|
734
692
|
db.prepare("DELETE FROM chat_messages WHERE session_id IN (SELECT id FROM chat_sessions WHERE book_id = ?)").run(id);
|
|
@@ -822,6 +780,32 @@ var getChatMessages = async (sessionId, limit) => {
|
|
|
822
780
|
};
|
|
823
781
|
|
|
824
782
|
// src/services/ingest.ts
|
|
783
|
+
var resumePathForBook = async (bookId) => {
|
|
784
|
+
const paths = await ensureDataDirs();
|
|
785
|
+
return `${paths.ingestDir}/${bookId}.json`;
|
|
786
|
+
};
|
|
787
|
+
var loadResumeState = async (bookId, resumePath) => {
|
|
788
|
+
const raw = await readFile(resumePath, "utf-8");
|
|
789
|
+
const parsed = JSON.parse(raw);
|
|
790
|
+
if (!Array.isArray(parsed.chunks) || typeof parsed.resumeIndex !== "number") {
|
|
791
|
+
throw new Error(`Invalid resume state for book ${bookId}. Re-ingest to start over.`);
|
|
792
|
+
}
|
|
793
|
+
return parsed;
|
|
794
|
+
};
|
|
795
|
+
var persistResumeState = async (bookId, state) => {
|
|
796
|
+
const resumePath = await resumePathForBook(bookId);
|
|
797
|
+
await writeFile(resumePath, JSON.stringify(state));
|
|
798
|
+
await updateBook(bookId, {
|
|
799
|
+
ingestState: "pending",
|
|
800
|
+
ingestResumePath: resumePath
|
|
801
|
+
});
|
|
802
|
+
return resumePath;
|
|
803
|
+
};
|
|
804
|
+
var finalizeResumeState = async (bookId, resumePath) => {
|
|
805
|
+
const path = resumePath || await resumePathForBook(bookId);
|
|
806
|
+
await unlink(path).catch(() => void 0);
|
|
807
|
+
await updateBook(bookId, { ingestState: null, ingestResumePath: null });
|
|
808
|
+
};
|
|
825
809
|
var formatDuration = (ms) => {
|
|
826
810
|
const seconds = Math.round(ms / 100) / 10;
|
|
827
811
|
return `${seconds}s`;
|
|
@@ -831,8 +815,9 @@ var ingestEpub = async (filePath, selectedChapterIndices, options) => {
|
|
|
831
815
|
const paths = await ensureDataDirs();
|
|
832
816
|
const fileName = `${bookId}.epub`;
|
|
833
817
|
const bookPath = `${paths.booksDir}/${fileName}`;
|
|
818
|
+
let resumePath = null;
|
|
834
819
|
logInfo(`[Ingest] Starting ingestion for book ${bookId}`);
|
|
835
|
-
await
|
|
820
|
+
await mkdir(paths.booksDir, { recursive: true });
|
|
836
821
|
await copyFile(filePath, bookPath);
|
|
837
822
|
logInfo(`[Ingest] EPUB file saved to ${bookPath}`);
|
|
838
823
|
const parseStart = Date.now();
|
|
@@ -858,7 +843,7 @@ var ingestEpub = async (filePath, selectedChapterIndices, options) => {
|
|
|
858
843
|
);
|
|
859
844
|
logInfo(`[Ingest] Processing ${chaptersToProcess.length} selected chapters (indices: ${selectedIndices.join(", ")})`);
|
|
860
845
|
let adjustedSummaries = [];
|
|
861
|
-
if (options?.summarize !== false) {
|
|
846
|
+
if (options?.summarize !== false && !options?.batch) {
|
|
862
847
|
logInfo(`[Ingest] Generating summaries for ${chaptersToProcess.length} chapters...`);
|
|
863
848
|
const summarizeStart = Date.now();
|
|
864
849
|
const summaries = await summarizeAllChapters(chaptersToProcess);
|
|
@@ -886,23 +871,251 @@ var ingestEpub = async (filePath, selectedChapterIndices, options) => {
|
|
|
886
871
|
);
|
|
887
872
|
const chunks = chunkChapters(bookId, chunksToProcess).filter((chunk) => chunk.content.length > 0);
|
|
888
873
|
logInfo(`[Ingest] Created ${chunks.length} chunks from selected chapters`);
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
874
|
+
if (options?.batch) {
|
|
875
|
+
if (options?.summarize !== false) {
|
|
876
|
+
logInfo(`[Ingest] Submitting ${chaptersToProcess.length} chapters for batch summarization`);
|
|
877
|
+
const { batchId: summaryBatchId, inputFileId: summaryFileId, metadata } = await submitBatchSummaries(chaptersToProcess);
|
|
878
|
+
await updateBook(bookId, {
|
|
879
|
+
summaryBatchId,
|
|
880
|
+
summaryBatchFileId: summaryFileId,
|
|
881
|
+
summaryBatchChapters: JSON.stringify({ chapters: chaptersToProcess, metadata, selectedIndices, textChunks: chunks })
|
|
882
|
+
});
|
|
883
|
+
logInfo(`[Ingest] Summary batch submitted (${summaryBatchId}). Use "mycroft book ingest status ${bookId.slice(0, 8)}" or "mycroft book ingest resume ${bookId.slice(0, 8)}".`);
|
|
884
|
+
} else {
|
|
885
|
+
logInfo(`[Ingest] Submitting ${chunks.length} chunks to OpenAI Batch API`);
|
|
886
|
+
const { batchId, inputFileId } = await submitBatchEmbeddings(chunks);
|
|
887
|
+
await updateBook(bookId, {
|
|
888
|
+
batchId,
|
|
889
|
+
batchFileId: inputFileId,
|
|
890
|
+
batchChunks: JSON.stringify(chunks)
|
|
891
|
+
});
|
|
892
|
+
logInfo(`[Ingest] Batch submitted (${batchId}). Use "mycroft book ingest status ${bookId.slice(0, 8)}" or "mycroft book ingest resume ${bookId.slice(0, 8)}".`);
|
|
893
|
+
}
|
|
894
|
+
} else {
|
|
895
|
+
const allChunks = [...chunks, ...adjustedSummaries];
|
|
896
|
+
const embedStart = Date.now();
|
|
897
|
+
resumePath = await persistResumeState(bookId, { chunks: allChunks, resumeIndex: 0 });
|
|
898
|
+
const embedded = await embedChunks(allChunks, {
|
|
899
|
+
onBatch: async (embeddedBatch, progress) => {
|
|
900
|
+
await addChunksToIndex(bookId, embeddedBatch);
|
|
901
|
+
await updateBook(bookId, { chunkCount: progress.completed });
|
|
902
|
+
if (!resumePath) return;
|
|
903
|
+
await writeFile(
|
|
904
|
+
resumePath,
|
|
905
|
+
JSON.stringify({ chunks: allChunks, resumeIndex: progress.completed })
|
|
906
|
+
);
|
|
907
|
+
}
|
|
908
|
+
});
|
|
909
|
+
logInfo(`[Ingest] Embedded ${embedded.length} total chunks (${formatDuration(Date.now() - embedStart)})`);
|
|
910
|
+
await updateBook(bookId, { chunkCount: embedded.length, indexedAt: Date.now() });
|
|
911
|
+
logInfo(`[Ingest] Updated book record with chunk count: ${embedded.length}`);
|
|
912
|
+
await finalizeResumeState(bookId, resumePath);
|
|
913
|
+
}
|
|
897
914
|
} catch (error) {
|
|
898
915
|
logWarn(`[Ingest] Error during chunking/embedding: ${error instanceof Error ? error.message : String(error)}`);
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
916
|
+
if (resumePath) {
|
|
917
|
+
logWarn(`[Ingest] Partial progress saved. Use "mycroft book ingest status ${bookId.slice(0, 8)}" or "mycroft book ingest resume ${bookId.slice(0, 8)}".`);
|
|
918
|
+
return { id: bookId, status: "interrupted" };
|
|
919
|
+
} else {
|
|
920
|
+
await deleteBookIndex(bookId);
|
|
921
|
+
await unlink(bookPath).catch(() => void 0);
|
|
922
|
+
await deleteBook(bookId).catch(() => void 0);
|
|
923
|
+
}
|
|
902
924
|
throw error;
|
|
903
925
|
}
|
|
904
926
|
logInfo(`[Ingest] Ingestion complete for ${bookId}`);
|
|
905
|
-
return { id: bookId };
|
|
927
|
+
return { id: bookId, status: "completed" };
|
|
928
|
+
};
|
|
929
|
+
var resumeIngest = async (bookId, storedChunks, batchId, batchFileId) => {
|
|
930
|
+
const { checkBatchStatus, downloadBatchResults, cleanupBatchFiles } = await import("./batch-embedder-ZJZLNLOK.js");
|
|
931
|
+
logInfo(`[Resume] Checking embedding batch ${batchId} for book ${bookId}`);
|
|
932
|
+
const status = await checkBatchStatus(batchId);
|
|
933
|
+
logInfo(`[Resume] Batch status: ${status.status} (completed: ${status.completed}/${status.total})`);
|
|
934
|
+
if (["validating", "in_progress", "finalizing"].includes(status.status)) {
|
|
935
|
+
return { status: status.status, completed: status.completed, total: status.total };
|
|
936
|
+
}
|
|
937
|
+
if (status.status === "failed" || status.status === "expired" || status.status === "cancelled") {
|
|
938
|
+
logWarn(`[Resume] Batch ${batchId} ended with status "${status.status}". Re-submitting...`);
|
|
939
|
+
await cleanupBatchFiles(batchFileId, status.outputFileId);
|
|
940
|
+
const { submitBatchEmbeddings: submitBatchEmbeddings2 } = await import("./batch-embedder-ZJZLNLOK.js");
|
|
941
|
+
const { batchId: newBatchId, inputFileId: newFileId } = await submitBatchEmbeddings2(storedChunks);
|
|
942
|
+
await updateBook(bookId, { batchId: newBatchId, batchFileId: newFileId });
|
|
943
|
+
logInfo(`[Resume] New batch submitted (${newBatchId}). Run resume again later.`);
|
|
944
|
+
return { status: "resubmitted", batchId: newBatchId };
|
|
945
|
+
}
|
|
946
|
+
if (status.status !== "completed") {
|
|
947
|
+
throw new Error(`Unexpected batch status: ${status.status}`);
|
|
948
|
+
}
|
|
949
|
+
if (!status.outputFileId) {
|
|
950
|
+
logWarn(`[Resume] Batch ${batchId} completed but produced no output (${status.failed}/${status.total} failed). Re-submitting...`);
|
|
951
|
+
await cleanupBatchFiles(batchFileId, null);
|
|
952
|
+
const { submitBatchEmbeddings: submitBatchEmbeddings2 } = await import("./batch-embedder-ZJZLNLOK.js");
|
|
953
|
+
const { batchId: newBatchId, inputFileId: newFileId } = await submitBatchEmbeddings2(storedChunks);
|
|
954
|
+
await updateBook(bookId, { batchId: newBatchId, batchFileId: newFileId });
|
|
955
|
+
logInfo(`[Resume] New batch submitted (${newBatchId}). Run resume again later.`);
|
|
956
|
+
return { status: "resubmitted", batchId: newBatchId };
|
|
957
|
+
}
|
|
958
|
+
const embedded = await downloadBatchResults(status.outputFileId, storedChunks);
|
|
959
|
+
await addChunksToIndex(bookId, embedded);
|
|
960
|
+
logInfo(`[Resume] Added ${embedded.length} chunks to vector index`);
|
|
961
|
+
await updateBook(bookId, {
|
|
962
|
+
chunkCount: embedded.length,
|
|
963
|
+
indexedAt: Date.now(),
|
|
964
|
+
batchId: null,
|
|
965
|
+
batchFileId: null,
|
|
966
|
+
batchChunks: null
|
|
967
|
+
});
|
|
968
|
+
logInfo(`[Resume] Book ${bookId} indexing complete`);
|
|
969
|
+
await cleanupBatchFiles(batchFileId, status.outputFileId);
|
|
970
|
+
return { status: "completed" };
|
|
971
|
+
};
|
|
972
|
+
var resumeSummaryBatch = async (bookId, summaryBatchId, summaryBatchFileId, storedData) => {
|
|
973
|
+
const { checkBatchStatus, cleanupBatchFiles } = await import("./batch-embedder-ZJZLNLOK.js");
|
|
974
|
+
const { downloadBatchSummaryResults, submitMergePass, downloadMergeResults } = await import("./batch-summarizer-BMIBVFAE.js");
|
|
975
|
+
logInfo(`[Resume] Checking summary batch ${summaryBatchId} for book ${bookId}`);
|
|
976
|
+
const status = await checkBatchStatus(summaryBatchId);
|
|
977
|
+
logInfo(`[Resume] Summary batch status: ${status.status} (completed: ${status.completed}/${status.total})`);
|
|
978
|
+
if (["validating", "in_progress", "finalizing"].includes(status.status)) {
|
|
979
|
+
return { status: status.status, completed: status.completed, total: status.total, phase: "summary" };
|
|
980
|
+
}
|
|
981
|
+
if (status.status === "failed" || status.status === "expired" || status.status === "cancelled") {
|
|
982
|
+
logWarn(`[Resume] Summary batch ${summaryBatchId} ended with status "${status.status}". Re-submitting...`);
|
|
983
|
+
await cleanupBatchFiles(summaryBatchFileId, status.outputFileId);
|
|
984
|
+
const { submitBatchSummaries: submitBatchSummaries2 } = await import("./batch-summarizer-BMIBVFAE.js");
|
|
985
|
+
const { batchId: newBatchId, inputFileId: newFileId, metadata: newMetadata } = await submitBatchSummaries2(storedData.chapters);
|
|
986
|
+
await updateBook(bookId, {
|
|
987
|
+
summaryBatchId: newBatchId,
|
|
988
|
+
summaryBatchFileId: newFileId,
|
|
989
|
+
summaryBatchChapters: JSON.stringify({ ...storedData, metadata: newMetadata })
|
|
990
|
+
});
|
|
991
|
+
logInfo(`[Resume] New summary batch submitted (${newBatchId}).`);
|
|
992
|
+
return { status: "resubmitted", batchId: newBatchId, phase: "summary" };
|
|
993
|
+
}
|
|
994
|
+
if (status.status !== "completed") {
|
|
995
|
+
throw new Error(`Unexpected summary batch status: ${status.status}`);
|
|
996
|
+
}
|
|
997
|
+
if (!status.outputFileId) {
|
|
998
|
+
logWarn(`[Resume] Summary batch ${summaryBatchId} completed but produced no output (${status.failed}/${status.total} failed). Re-submitting...`);
|
|
999
|
+
await cleanupBatchFiles(summaryBatchFileId, null);
|
|
1000
|
+
const { submitBatchSummaries: submitBatchSummaries2 } = await import("./batch-summarizer-BMIBVFAE.js");
|
|
1001
|
+
const { batchId: newBatchId, inputFileId: newFileId, metadata: newMetadata } = await submitBatchSummaries2(storedData.chapters);
|
|
1002
|
+
await updateBook(bookId, {
|
|
1003
|
+
summaryBatchId: newBatchId,
|
|
1004
|
+
summaryBatchFileId: newFileId,
|
|
1005
|
+
summaryBatchChapters: JSON.stringify({ ...storedData, metadata: newMetadata })
|
|
1006
|
+
});
|
|
1007
|
+
logInfo(`[Resume] New summary batch submitted (${newBatchId}).`);
|
|
1008
|
+
return { status: "resubmitted", batchId: newBatchId, phase: "summary" };
|
|
1009
|
+
}
|
|
1010
|
+
let { summaries, needsMergePass } = await downloadBatchSummaryResults(
|
|
1011
|
+
status.outputFileId,
|
|
1012
|
+
storedData.chapters,
|
|
1013
|
+
storedData.metadata
|
|
1014
|
+
);
|
|
1015
|
+
await cleanupBatchFiles(summaryBatchFileId, status.outputFileId);
|
|
1016
|
+
if (needsMergePass.length > 0) {
|
|
1017
|
+
logInfo(`[Resume] ${needsMergePass.length} chapters need merge pass, submitting merge batch...`);
|
|
1018
|
+
const mergeResult = await submitMergePass(needsMergePass);
|
|
1019
|
+
await updateBook(bookId, {
|
|
1020
|
+
summaryBatchId: mergeResult.batchId,
|
|
1021
|
+
summaryBatchFileId: mergeResult.inputFileId,
|
|
1022
|
+
summaryBatchChapters: JSON.stringify({
|
|
1023
|
+
...storedData,
|
|
1024
|
+
metadata: mergeResult.metadata,
|
|
1025
|
+
completedSummaries: summaries,
|
|
1026
|
+
isMergePass: true
|
|
1027
|
+
})
|
|
1028
|
+
});
|
|
1029
|
+
return { status: "merge_submitted", batchId: mergeResult.batchId, phase: "summary" };
|
|
1030
|
+
}
|
|
1031
|
+
return await finalizeSummariesAndSubmitEmbeddings(bookId, summaries, storedData);
|
|
1032
|
+
};
|
|
1033
|
+
/**
 * Resume the "merge pass" of a batched summary run for a book.
 *
 * Polls the merge batch. While it is still running, progress is returned to the caller.
 * Once it has completed, the merged summaries are downloaded, appended to the summaries
 * stored from the earlier pass (`storedData.completedSummaries`), and handed to
 * finalizeSummariesAndSubmitEmbeddings.
 *
 * @param {string} bookId - Book whose ingest is being resumed.
 * @param {string} summaryBatchId - Id of the merge batch to poll.
 * @param {string} summaryBatchFileId - Forwarded to cleanupBatchFiles together with the
 *   batch's output file id.
 * @param {object} storedData - Parsed stored batch payload. `metadata` (entries carrying
 *   `chapterIndex` and `title`) and `completedSummaries` are read here; the whole object
 *   is forwarded to finalization.
 * @returns {Promise<object>} `{ status, completed, total, phase: "summary" }` while the
 *   batch is validating / in progress / finalizing; otherwise whatever
 *   finalizeSummariesAndSubmitEmbeddings returns.
 * @throws {Error} When the batch ended in any state other than "completed", or completed
 *   without an output file.
 */
var resumeMergeBatch = async (bookId, summaryBatchId, summaryBatchFileId, storedData) => {
  const { checkBatchStatus, cleanupBatchFiles } = await import("./batch-embedder-ZJZLNLOK.js");
  const { downloadMergeResults } = await import("./batch-summarizer-BMIBVFAE.js");
  logInfo(`[Resume] Checking merge batch ${summaryBatchId} for book ${bookId}`);
  const status = await checkBatchStatus(summaryBatchId);
  logInfo(`[Resume] Merge batch status: ${status.status} (completed: ${status.completed}/${status.total})`);
  if (["validating", "in_progress", "finalizing"].includes(status.status)) {
    return { status: status.status, completed: status.completed, total: status.total, phase: "summary" };
  }
  if (status.status !== "completed") {
    throw new Error(`Unexpected merge batch status: ${status.status}`);
  }
  if (!status.outputFileId) {
    throw new Error(`Merge batch completed but produced no output (${status.failed}/${status.total} failed). Re-ingest to start over.`);
  }
  const mergedSummaries = await downloadMergeResults(
    status.outputFileId,
    storedData.metadata.map((m) => ({ chapterIndex: m.chapterIndex, title: m.title }))
  );
  const allSummaries = [...storedData.completedSummaries || [], ...mergedSummaries];
  const result = await finalizeSummariesAndSubmitEmbeddings(bookId, allSummaries, storedData);
  // Clean up only after finalization has succeeded. Until then the book record still
  // points at this merge batch, so a retry of resume must be able to download the
  // output again. NOTE(review): assumes cleanupBatchFiles removes the remote files.
  await cleanupBatchFiles(summaryBatchFileId, status.outputFileId);
  return result;
};
|
|
1056
|
+
/**
 * Persist finished chapter summaries and start the embedding batch.
 *
 * Steps, in order:
 *  1. Remap each summary's `chapterIndex` through `storedData.selectedIndices`
 *     (falling back to the summary's own index when there is no entry).
 *  2. Store the remapped summaries on the book as JSON.
 *  3. Build one "summary" chunk per summary and append them to `storedData.textChunks`.
 *  4. Submit all chunks for batch embedding.
 *  5. Clear the summary-batch fields on the book and record the embedding batch instead.
 *
 * @param {string} bookId
 * @param {Array<object>} summaries - Summary records; `chapterIndex`, `chapterTitle` and
 *   `fullSummary` are read.
 * @param {object} storedData - Must provide `selectedIndices` and `textChunks`.
 * @returns {Promise<{status: "embeddings_submitted", batchId: *, phase: "embedding"}>}
 */
var finalizeSummariesAndSubmitEmbeddings = async (bookId, summaries, storedData) => {
  const { submitBatchEmbeddings: submitBatchEmbeddings2 } = await import("./batch-embedder-ZJZLNLOK.js");

  const remapChapterIndex = (summary) => ({
    ...summary,
    chapterIndex: storedData.selectedIndices[summary.chapterIndex] ?? summary.chapterIndex
  });
  const summaryRecords = summaries.map(remapChapterIndex);
  await updateBook(bookId, { summaries: JSON.stringify(summaryRecords) });

  // chunkIndex -1 is used for every summary chunk.
  const toSummaryChunk = (record) => ({
    id: `${bookId}-summary-${record.chapterIndex}`,
    bookId,
    chapterIndex: record.chapterIndex,
    chapterTitle: record.chapterTitle,
    chunkIndex: -1,
    content: record.fullSummary,
    type: "summary"
  });
  const summaryChunks = summaryRecords.map(toSummaryChunk);
  logInfo(`[Resume] Created ${summaryChunks.length} summary chunks from ${summaries.length} summaries`);

  const allChunks = [...storedData.textChunks, ...summaryChunks];
  logInfo(`[Resume] Submitting ${allChunks.length} chunks for batch embedding`);
  const submission = await submitBatchEmbeddings2(allChunks);
  const { batchId, inputFileId } = submission;

  await updateBook(bookId, {
    summaryBatchId: null,
    summaryBatchFileId: null,
    summaryBatchChapters: null,
    batchId,
    batchFileId: inputFileId,
    batchChunks: JSON.stringify(allChunks)
  });
  logInfo(`[Resume] Embedding batch submitted (${batchId}). Run resume again when batch completes.`);
  return { status: "embeddings_submitted", batchId, phase: "embedding" };
};
|
|
1089
|
+
/**
 * Continue a non-batch ingest from its saved resume state.
 *
 * Embedding restarts at the larger of the saved `resumeIndex` and the chunk count already
 * recorded for the book. After every embedded batch the chunks are added to the index,
 * the book's `chunkCount` is updated, and the resume file is rewritten with the new index.
 *
 * @param {string} bookId
 * @param {string} resumePath - Path of the resume-state file.
 * @param {number} currentChunkCount - Chunk count currently recorded for the book.
 * @returns {Promise<{status: "completed", chunkCount: number}>}
 * @throws {Error} When the resume state has no chunks left to embed (the state is
 *   finalized before throwing).
 */
var resumeLocalIngest = async (bookId, resumePath, currentChunkCount) => {
  const state = await loadResumeState(bookId, resumePath);
  const total = state.chunks.length;
  const startIndex = Math.max(state.resumeIndex, currentChunkCount);

  if (startIndex >= total) {
    await finalizeResumeState(bookId, resumePath);
    throw new Error(`Resume state already completed for book ${bookId}.`);
  }

  logInfo(`[Resume] Resuming local embeddings at chunk ${startIndex + 1}/${total}`);
  const embedStart = Date.now();

  // Checkpoint after each batch so a later interruption can pick up from here.
  const checkpoint = async (embeddedBatch, progress) => {
    const done = startIndex + progress.completed;
    await addChunksToIndex(bookId, embeddedBatch);
    await updateBook(bookId, { chunkCount: done });
    const snapshot = JSON.stringify({ chunks: state.chunks, resumeIndex: done });
    await writeFile(resumePath, snapshot);
  };

  const pending = state.chunks.slice(startIndex);
  const embeddedRemaining = await embedChunks(pending, { onBatch: checkpoint });
  const elapsed = formatDuration(Date.now() - embedStart);
  logInfo(`[Resume] Embedded ${embeddedRemaining.length} remaining chunks (${elapsed})`);

  const finalCount = startIndex + embeddedRemaining.length;
  await updateBook(bookId, { chunkCount: finalCount, indexedAt: Date.now() });
  await finalizeResumeState(bookId, resumePath);
  return { status: "completed", chunkCount: finalCount };
};
|
|
907
1120
|
|
|
908
1121
|
// src/commands/ingest.ts
|
|
@@ -983,17 +1196,51 @@ var ingestCommand = async (filePath, options) => {
|
|
|
983
1196
|
);
|
|
984
1197
|
}
|
|
985
1198
|
}
|
|
986
|
-
const result = await ingestEpub(filePath, selectedChapterIndices, { summarize: options.summarize ?? false });
|
|
987
|
-
|
|
1199
|
+
const result = await ingestEpub(filePath, selectedChapterIndices, { summarize: options.summarize ?? false, batch: options.batch ?? false });
|
|
1200
|
+
const shortId = result.id.slice(0, 8);
|
|
1201
|
+
if (result.status === "interrupted") {
|
|
1202
|
+
stdout(`
|
|
1203
|
+
Ingest interrupted.`);
|
|
1204
|
+
stdout(` mycroft book ingest status ${shortId} # check progress`);
|
|
1205
|
+
stdout(` mycroft book ingest resume ${shortId} # continue ingestion`);
|
|
1206
|
+
return;
|
|
1207
|
+
}
|
|
1208
|
+
if (options.batch) {
|
|
1209
|
+
const batchType = options.summarize ? "Summary batch" : "Embedding batch";
|
|
1210
|
+
stdout(`
|
|
1211
|
+
${batchType} submitted. Book registered as ${result.id}`);
|
|
1212
|
+
stdout(` mycroft book ingest status ${shortId} # check batch progress`);
|
|
1213
|
+
stdout(` mycroft book ingest resume ${shortId} # continue when batch finishes`);
|
|
1214
|
+
} else {
|
|
1215
|
+
stdout(`
|
|
988
1216
|
Done. Book indexed as ${result.id}`);
|
|
1217
|
+
}
|
|
989
1218
|
};
|
|
990
1219
|
|
|
991
1220
|
// src/commands/book/ingest.ts
|
|
992
1221
|
/**
 * Register the `ingest` command on the given parent command.
 *
 * @param {object} program2 - Parent command (commander-style chainable API).
 * @returns {object} The value returned by the command chain, so callers can attach
 *   subcommands (e.g. `resume`, `status`) to it.
 */
var registerBookIngest = (program2) => {
  const helpText = `
EXAMPLES
mycroft book ingest ./book.epub
mycroft book ingest ./book.epub --summary
mycroft book ingest ./book.epub --batch --summary
mycroft book ingest status 8f2c1a4b
mycroft book ingest resume 8f2c1a4b

NOTES
--batch submits work to the OpenAI Batch API and returns immediately.
When combined with --summary, summaries are batched first, then embeddings.
Use "mycroft book ingest status <id>" to check progress.
Use "mycroft book ingest resume <id>" to continue when a batch completes.
Non-batch ingests can also be resumed if interrupted.
`;

  // The CLI flag is --summary; ingestCommand expects a boolean `summarize`.
  const runIngest = async (path, options) => {
    const summarize = !!options.summary;
    await ingestCommand(path, { manual: options.manual, summarize, batch: options.batch });
  };

  return program2
    .command("ingest")
    .description("Ingest an EPUB file")
    .argument("<path>", "Path to the EPUB file")
    .option("--manual", "Interactive chapter selection")
    .option("--summary", "Enable AI chapter summaries")
    .option("--batch", "Use OpenAI Batch API for embeddings and summaries (50% cost savings, up to 24h)")
    .addHelpText("after", helpText)
    .action(runIngest);
};
|
|
998
1245
|
|
|
999
1246
|
// src/commands/list.ts
|
|
@@ -1016,7 +1263,7 @@ var listCommand = async () => {
|
|
|
1016
1263
|
const author = book.author || "-";
|
|
1017
1264
|
const chunks = String(book.chunkCount ?? 0);
|
|
1018
1265
|
const indexed = formatDate(book.indexedAt);
|
|
1019
|
-
const status = book.indexedAt ? "[indexed]" : "[pending]";
|
|
1266
|
+
const status = book.indexedAt ? "[indexed]" : book.batchId ? "[batch pending]" : book.ingestState === "pending" ? "[resume pending]" : "[pending]";
|
|
1020
1267
|
stdout(`${shortId} | ${title} | ${author} | ${chunks} | ${indexed} | ${status}`);
|
|
1021
1268
|
}
|
|
1022
1269
|
};
|
|
@@ -1059,6 +1306,7 @@ var showCommand = async (id) => {
|
|
|
1059
1306
|
stdout(`Indexed: ${book.indexedAt ? new Date(book.indexedAt).toISOString() : "-"}`);
|
|
1060
1307
|
stdout(`Narrative range: ${book.narrativeStartIndex ?? 0} to ${book.narrativeEndIndex ?? book.chapters.length - 1}`);
|
|
1061
1308
|
stdout(`Progress chapter: ${book.progressChapter ?? "-"}`);
|
|
1309
|
+
stdout(`Ingest status: ${book.ingestState ?? "-"}`);
|
|
1062
1310
|
stdout("\nChapters:");
|
|
1063
1311
|
book.chapters.forEach((title, index) => {
|
|
1064
1312
|
const marker = index === book.narrativeStartIndex ? "[start]" : index === book.narrativeEndIndex ? "[end]" : "";
|
|
@@ -1251,6 +1499,216 @@ var registerBookDelete = (program2) => {
|
|
|
1251
1499
|
});
|
|
1252
1500
|
};
|
|
1253
1501
|
|
|
1502
|
+
// src/commands/resume.ts
|
|
1503
|
+
/**
 * `mycroft book ingest resume <id>`: continue a pending ingest.
 *
 * The phase is picked from the book record, in this order:
 *  - `summaryBatchId` set: resume the summary batch (or its merge pass when the stored
 *    payload has `isMergePass`);
 *  - `batchId` set: resume the embedding batch;
 *  - otherwise: resume a local ingest, which requires `ingestResumePath` and
 *    `ingestState === "pending"`.
 * A book that already has `indexedAt` is reported as indexed and nothing else happens.
 *
 * @param {string} id - Book id or prefix.
 * @returns {Promise<void>}
 * @throws {Error} When the book cannot be found, stored batch data is missing, or the
 *   book has no resumable ingest.
 */
var resumeCommand = async (id) => {
  requireOpenAIKey();
  await ensureDataDirs();

  const resolvedId = await resolveBookId(id);
  if (!resolvedId) {
    throw new Error(`Book not found: ${id}`);
  }
  const book = await getBook(resolvedId);
  if (!book) {
    throw new Error(`Book not found: ${id}`);
  }
  if (book.indexedAt) {
    stdout(`Book "${book.title}" is already indexed (${book.chunkCount} chunks).`);
    return;
  }

  const shortId = resolvedId.slice(0, 8);
  // Prints the follow-up "status" and "resume" command hints with their trailing notes.
  const printNextSteps = (statusNote, resumeNote) => {
    stdout(` mycroft book ingest status ${shortId} # ${statusNote}`);
    stdout(` mycroft book ingest resume ${shortId} # ${resumeNote}`);
  };

  if (book.summaryBatchId) {
    const rawData = await getBookSummaryBatchChapters(resolvedId);
    if (!rawData) {
      throw new Error(`No stored summary batch data for book "${book.title}". Re-ingest with "mycroft book ingest --batch --summary".`);
    }
    const storedData = JSON.parse(rawData);
    const resumeBatch = storedData.isMergePass ? resumeMergeBatch : resumeSummaryBatch;
    const outcome = await resumeBatch(
      resolvedId,
      book.summaryBatchId,
      book.summaryBatchFileId ?? book.summaryBatchId,
      storedData
    );
    switch (outcome.status) {
      case "embeddings_submitted":
        stdout(`\nSummaries complete. Embedding batch submitted (${outcome.batchId}).`);
        printNextSteps("check embedding batch progress", "complete ingestion once batch finishes");
        break;
      case "merge_submitted":
        stdout(`\nSection summaries complete. Merge batch submitted (${outcome.batchId}).`);
        printNextSteps("check merge batch progress", "continue when batch finishes");
        break;
      case "resubmitted":
        stdout(`\nSummary batch failed and was re-submitted (${outcome.batchId}).`);
        printNextSteps("check batch progress", "continue when batch finishes");
        break;
      default:
        stdout(`\nSummary batch still in progress (${outcome.status}: ${outcome.completed}/${outcome.total}).`);
        printNextSteps("check batch progress", "retry when batch finishes");
    }
    return;
  }

  if (book.batchId) {
    const rawChunks = await getBookBatchChunks(resolvedId);
    if (!rawChunks) {
      throw new Error(`No stored chunks found for book "${book.title}". Re-ingest with "mycroft book ingest --batch".`);
    }
    const chunks = JSON.parse(rawChunks);
    const outcome = await resumeIngest(resolvedId, chunks, book.batchId, book.batchFileId ?? book.batchId);
    switch (outcome.status) {
      case "completed":
        stdout(`\nDone. Book "${book.title}" indexed as ${book.id}`);
        break;
      case "resubmitted":
        stdout(`\nBatch failed and was re-submitted (${outcome.batchId}).`);
        printNextSteps("check batch progress", "complete ingestion once batch finishes");
        break;
      default:
        stdout(`\nBatch still in progress (${outcome.status}: ${outcome.completed}/${outcome.total}).`);
        printNextSteps("check batch progress", "retry when batch finishes");
    }
    return;
  }

  const hasLocalResume = Boolean(book.ingestResumePath) && book.ingestState === "pending";
  if (!hasLocalResume) {
    throw new Error(`Book "${book.title}" has no resumable ingest. Re-ingest to start one.`);
  }
  const result = await resumeLocalIngest(resolvedId, book.ingestResumePath, book.chunkCount ?? 0);
  if (result.status === "completed") {
    stdout(`\nDone. Book "${book.title}" indexed as ${book.id}`);
  }
};
|
|
1586
|
+
|
|
1587
|
+
// src/commands/book/resume.ts
|
|
1588
|
+
/**
 * Register the `resume` subcommand.
 *
 * @param {object} program2 - Parent command; used to create an `ingest` command when
 *   `ingest` is not supplied.
 * @param {object} [ingest] - Existing `ingest` command to attach `resume` to.
 */
var registerBookResume = (program2, ingest) => {
  let target = ingest;
  if (target == null) {
    target = program2.command("ingest");
  }

  const helpText = `
EXAMPLES
mycroft book ingest resume 8f2c1a4b

NOTES
Resumes either batch or non-batch ingests if interrupted.
`;
  const runResume = async (id) => {
    await resumeCommand(id);
  };

  target
    .command("resume")
    .description("Resume a pending ingestion")
    .argument("<id>", "Book id or prefix")
    .addHelpText("after", helpText)
    .action(runResume);
};
|
|
1603
|
+
|
|
1604
|
+
// src/commands/status.ts
|
|
1605
|
+
/**
 * `mycroft book ingest status <id>`: print where a book's ingest currently stands.
 *
 * Always prints the book title and id, then one of:
 *  - completed (book has `indexedAt`);
 *  - summary batch status (book has `summaryBatchId`), queried live;
 *  - embedding batch status (book has `batchId`), queried live;
 *  - interrupted local ingest (`ingestResumePath` set and `ingestState === "pending"`);
 *  - "no active ingestion".
 * The OpenAI key is only required for the two batch branches.
 *
 * @param {string} id - Book id or prefix.
 * @returns {Promise<void>}
 * @throws {Error} When the book cannot be found.
 */
var statusCommand = async (id) => {
  await ensureDataDirs();
  const resolvedId = await resolveBookId(id);
  if (!resolvedId) {
    throw new Error(`Book not found: ${id}`);
  }
  const book = await getBook(resolvedId);
  if (!book) {
    throw new Error(`Book not found: ${id}`);
  }
  const shortId = resolvedId.slice(0, 8);
  const hint = (subcommand, note) => stdout(` mycroft book ingest ${subcommand} ${shortId} # ${note}`);

  // Shared reporting for the summary and embedding batches; only the wording differs.
  const reportBatch = async (batchId, kind, readyNote) => {
    const title = `${kind[0].toUpperCase()}${kind.slice(1)}`;
    requireOpenAIKey();
    const { checkBatchStatus } = await import("./batch-embedder-ZJZLNLOK.js");
    const status = await checkBatchStatus(batchId);
    stdout(`\nStatus: ${kind} batch ${status.status}`);
    stdout(`Batch: ${batchId}`);
    stdout(`Progress: ${status.completed}/${status.total} requests${status.failed > 0 ? ` (${status.failed} failed)` : ""}`);
    if (status.status === "completed") {
      if (status.failed > 0 && status.completed === 0) {
        stdout(`\nAll requests failed. Run resume to re-submit.`);
      } else {
        stdout(`\n${title} batch is ready.`);
      }
      hint("resume", readyNote);
    } else if (["failed", "expired", "cancelled"].includes(status.status)) {
      stdout(`\n${title} batch ended with "${status.status}".`);
      hint("resume", `re-submit ${kind} batch`);
    } else {
      stdout(`\n${title} batch still processing.`);
      hint("status", "check again later");
      hint("resume", "resume when ready");
    }
  };

  stdout(`Book: ${book.title}`);
  stdout(`ID: ${book.id}`);
  if (book.indexedAt) {
    stdout(`\nStatus: completed`);
    stdout(`Chunks: ${book.chunkCount}`);
    stdout(`Indexed: ${new Date(book.indexedAt).toLocaleString()}`);
    return;
  }
  if (book.summaryBatchId) {
    await reportBatch(book.summaryBatchId, "summary", "process summaries and submit embedding batch");
    return;
  }
  if (book.batchId) {
    await reportBatch(book.batchId, "embedding", "complete indexing");
    return;
  }
  if (book.ingestResumePath && book.ingestState === "pending") {
    stdout(`\nStatus: interrupted`);
    // An ingest interrupted early may have no chunk count recorded yet; show 0 rather
    // than "undefined", matching the `?? 0` fallback used by the resume and list commands.
    stdout(`Chunks completed: ${book.chunkCount ?? 0}`);
    hint("resume", "continue ingestion");
    return;
  }
  stdout(`\nStatus: no active ingestion`);
};
|
|
1693
|
+
|
|
1694
|
+
// src/commands/book/status.ts
|
|
1695
|
+
/**
 * Register the `status` subcommand.
 *
 * @param {object} program2 - Parent command; used to create an `ingest` command when
 *   `ingest` is not supplied.
 * @param {object} [ingest] - Existing `ingest` command to attach `status` to.
 */
var registerBookStatus = (program2, ingest) => {
  let target = ingest;
  if (target == null) {
    target = program2.command("ingest");
  }

  const helpText = `
EXAMPLES
mycroft book ingest status 8f2c1a4b

NOTES
For batch ingests, queries the OpenAI API for live progress.
For local ingests, shows how many chunks have been completed.
`;
  const runStatus = async (id) => {
    await statusCommand(id);
  };

  target
    .command("status")
    .description("Check ingestion status for a book")
    .argument("<id>", "Book id or prefix")
    .addHelpText("after", helpText)
    .action(runStatus);
};
|
|
1711
|
+
|
|
1254
1712
|
// src/commands/config.ts
|
|
1255
1713
|
var configCommand = async () => {
|
|
1256
1714
|
const path = configPath();
|
|
@@ -1265,7 +1723,7 @@ var registerConfigPath = (program2) => {
|
|
|
1265
1723
|
};
|
|
1266
1724
|
|
|
1267
1725
|
// src/commands/init-config.ts
|
|
1268
|
-
import { mkdir as
|
|
1726
|
+
import { mkdir as mkdir2, writeFile as writeFile2, access as access2 } from "fs/promises";
|
|
1269
1727
|
var initConfigCommand = async () => {
|
|
1270
1728
|
const path = configPath();
|
|
1271
1729
|
await ensureConfigDirs(path);
|
|
@@ -1281,8 +1739,8 @@ var initConfigCommand = async () => {
|
|
|
1281
1739
|
askEnabled: resolved.askEnabled,
|
|
1282
1740
|
models: resolved.models
|
|
1283
1741
|
};
|
|
1284
|
-
await
|
|
1285
|
-
await
|
|
1742
|
+
await writeFile2(path, JSON.stringify(template, null, 2), "utf-8");
|
|
1743
|
+
await mkdir2(resolved.dataDir, { recursive: true });
|
|
1286
1744
|
stdout(`Created config at ${path}`);
|
|
1287
1745
|
};
|
|
1288
1746
|
|
|
@@ -1311,7 +1769,7 @@ var registerConfigResolve = (program2) => {
|
|
|
1311
1769
|
};
|
|
1312
1770
|
|
|
1313
1771
|
// src/commands/onboard.ts
|
|
1314
|
-
import { writeFile as
|
|
1772
|
+
import { writeFile as writeFile3 } from "fs/promises";
|
|
1315
1773
|
// True when the answer means "accept the default": an empty string or "-y" in any letter case.
var isDefault = (input) => {
  if (input === "") {
    return true;
  }
  return input.toLowerCase() === "-y";
};
|
|
1316
1774
|
var parseBoolean = (input, fallback) => {
|
|
1317
1775
|
if (isDefault(input)) return fallback;
|
|
@@ -1339,7 +1797,7 @@ var onboardCommand = async () => {
|
|
|
1339
1797
|
const chatInput = await prompt(`Chat model [${defaults.models.chat}]: `);
|
|
1340
1798
|
const chat = isDefault(chatInput) ? defaults.models.chat : chatInput;
|
|
1341
1799
|
await ensureConfigDirs(path);
|
|
1342
|
-
await
|
|
1800
|
+
await writeFile3(
|
|
1343
1801
|
path,
|
|
1344
1802
|
JSON.stringify(
|
|
1345
1803
|
{
|
|
@@ -1394,8 +1852,7 @@ var summarizeMessages = async (messages) => {
|
|
|
1394
1852
|
model: openai5(models.summary),
|
|
1395
1853
|
prompt: `Summarize this conversation so far in ~${SUMMARY_TARGET_WORDS2} words. Focus on facts, decisions, and unresolved questions.
|
|
1396
1854
|
|
|
1397
|
-
${transcript}
|
|
1398
|
-
temperature: 0.3
|
|
1855
|
+
${transcript}`
|
|
1399
1856
|
});
|
|
1400
1857
|
return text.trim();
|
|
1401
1858
|
};
|
|
@@ -1661,8 +2118,8 @@ var registerChatCommands = (program2) => {
|
|
|
1661
2118
|
// src/cli.ts
|
|
1662
2119
|
var resolveVersion = async () => {
|
|
1663
2120
|
try {
|
|
1664
|
-
const currentDir =
|
|
1665
|
-
const pkgPath =
|
|
2121
|
+
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
2122
|
+
const pkgPath = resolve(currentDir, "../package.json");
|
|
1666
2123
|
const raw = await readFile2(pkgPath, "utf-8");
|
|
1667
2124
|
return JSON.parse(raw).version || "0.1.0";
|
|
1668
2125
|
} catch {
|
|
@@ -1680,12 +2137,14 @@ var configureProgram = async () => {
|
|
|
1680
2137
|
};
|
|
1681
2138
|
var registerCommands = () => {
|
|
1682
2139
|
const book = program.command("book").description("Manage books and queries");
|
|
1683
|
-
registerBookIngest(book);
|
|
2140
|
+
const ingest = registerBookIngest(book);
|
|
1684
2141
|
registerBookList(book);
|
|
1685
2142
|
registerBookShow(book);
|
|
1686
2143
|
registerBookAsk(book);
|
|
1687
2144
|
registerBookSearch(book);
|
|
1688
2145
|
registerBookDelete(book);
|
|
2146
|
+
registerBookResume(book, ingest);
|
|
2147
|
+
registerBookStatus(book, ingest);
|
|
1689
2148
|
const config = program.command("config").description("Manage configuration");
|
|
1690
2149
|
registerConfigPath(config);
|
|
1691
2150
|
registerConfigInit(config);
|