@fs/mycroft 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -0
- package/dist/batch-embedder-6IIWAZPW.js +14 -0
- package/dist/batch-embedder-6IIWAZPW.js.map +1 -0
- package/dist/batch-embedder-7DGZAQKL.js +14 -0
- package/dist/batch-embedder-7DGZAQKL.js.map +1 -0
- package/dist/batch-embedder-IZDBS3IL.js +13 -0
- package/dist/batch-embedder-IZDBS3IL.js.map +1 -0
- package/dist/batch-embedder-LYCZDYI4.js +15 -0
- package/dist/batch-embedder-LYCZDYI4.js.map +1 -0
- package/dist/batch-embedder-RHKD2OJD.js +14 -0
- package/dist/batch-embedder-RHKD2OJD.js.map +1 -0
- package/dist/batch-embedder-VQZUI7R6.js +14 -0
- package/dist/batch-embedder-VQZUI7R6.js.map +1 -0
- package/dist/batch-embedder-ZJZLNLOK.js +14 -0
- package/dist/batch-embedder-ZJZLNLOK.js.map +1 -0
- package/dist/batch-summarizer-7MCT4HJB.js +14 -0
- package/dist/batch-summarizer-7MCT4HJB.js.map +1 -0
- package/dist/batch-summarizer-BMIBVFAE.js +14 -0
- package/dist/batch-summarizer-BMIBVFAE.js.map +1 -0
- package/dist/chunk-35EO53CC.js +8058 -0
- package/dist/chunk-35EO53CC.js.map +1 -0
- package/dist/chunk-57ZGGKEF.js +8060 -0
- package/dist/chunk-57ZGGKEF.js.map +1 -0
- package/dist/chunk-6DLQHHCC.js +249 -0
- package/dist/chunk-6DLQHHCC.js.map +1 -0
- package/dist/chunk-7CO4PMU5.js +92 -0
- package/dist/chunk-7CO4PMU5.js.map +1 -0
- package/dist/chunk-7DUQNGEK.js +253 -0
- package/dist/chunk-7DUQNGEK.js.map +1 -0
- package/dist/chunk-7IPX4MKA.js +4637 -0
- package/dist/chunk-7IPX4MKA.js.map +1 -0
- package/dist/chunk-7NLMBXXY.js +6438 -0
- package/dist/chunk-7NLMBXXY.js.map +1 -0
- package/dist/chunk-BR2PM6D3.js +11047 -0
- package/dist/chunk-BR2PM6D3.js.map +1 -0
- package/dist/chunk-KGG7WEYE.js +162 -0
- package/dist/chunk-KGG7WEYE.js.map +1 -0
- package/dist/chunk-QRDUQX63.js +256 -0
- package/dist/chunk-QRDUQX63.js.map +1 -0
- package/dist/chunk-R3FOJK5A.js +2088 -0
- package/dist/chunk-R3FOJK5A.js.map +1 -0
- package/dist/chunk-XXO66RCF.js +94 -0
- package/dist/chunk-XXO66RCF.js.map +1 -0
- package/dist/cli.js +638 -179
- package/dist/cli.js.map +1 -1
- package/dist/fileFromPath-FLANAQWT.js +128 -0
- package/dist/fileFromPath-FLANAQWT.js.map +1 -0
- package/dist/main-36PRDAPE.js +1857 -0
- package/dist/main-36PRDAPE.js.map +1 -0
- package/dist/main-B3QJZGLU.js +1859 -0
- package/dist/main-B3QJZGLU.js.map +1 -0
- package/package.json +7 -1
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
// src/services/batch-embedder.ts
|
|
2
|
+
import OpenAI from "openai";
|
|
3
|
+
|
|
4
|
+
// src/services/constants.ts
|
|
5
|
+
import { mkdir as mkdir2 } from "fs/promises";
|
|
6
|
+
|
|
7
|
+
// src/config.ts
|
|
8
|
+
import { mkdir, readFile } from "fs/promises";
|
|
9
|
+
import { homedir } from "os";
|
|
10
|
+
import { dirname, join, resolve } from "path";
|
|
11
|
+
/**
 * Built-in configuration defaults, used for every setting that runtime
 * overrides, the environment, and the config file do not supply.
 *
 * Frozen (including the nested `models` object) so that an accidental write
 * cannot silently change the defaults seen by every later config load.
 */
var DEFAULT_CONFIG = Object.freeze({
  // May start with "~"; it is expanded and resolved when the config is loaded.
  dataDir: "~/.local/share/mycroft",
  askEnabled: true,
  models: Object.freeze({
    embedding: "text-embedding-3-small",
    summary: "gpt-5-nano",
    chat: "gpt-5.1",
  }),
});
|
|
20
|
+
/**
 * Expand a leading "~" to the current user's home directory.
 *
 * Only a bare "~" or a "~" immediately followed by a path separator is
 * expanded. Inputs such as "~other/dir" or a file literally named "~backup"
 * are returned unchanged instead of being rewritten under the current user's
 * home directory.
 *
 * @param {string} input - Path that may start with "~".
 * @returns {string} The path with the home directory substituted, or `input`
 *   unchanged when it has no expandable "~" prefix.
 */
var expandHome = (input) => {
  const isHomePrefixed =
    input === "~" || input.startsWith("~/") || input.startsWith("~\\");
  if (!isHomePrefixed) return input;
  // Drop only the "~"; join() normalizes the separator that follows it.
  return join(homedir(), input.slice(1));
};
|
|
24
|
+
/**
 * Turn a possibly "~"-prefixed or relative path into an absolute one.
 *
 * @param {string} input - Path to resolve.
 * @returns {string} Absolute path produced by `resolve()` after home expansion.
 */
var resolvePath = (input) => {
  const expanded = expandHome(input);
  return resolve(expanded);
};
|
|
25
|
+
/**
 * Locate the config file.
 *
 * A non-empty MYCROFT_CONFIG environment variable wins; otherwise the default
 * location under ~/.config is used. Either way the result is resolved to an
 * absolute path.
 *
 * @returns {string} Absolute path of the config file.
 */
var getConfigPath = () => {
  const fromEnv = process.env.MYCROFT_CONFIG;
  const chosen = fromEnv ? fromEnv : "~/.config/mycroft/config.json";
  return resolvePath(chosen);
};
|
|
30
|
+
/**
 * Fill in model names, falling back to the built-in default for every entry
 * that is missing or falsy (so an empty string also selects the default).
 *
 * @param {object|null|undefined} models - Partial model settings, if any.
 * @returns {{embedding: string, summary: string, chat: string}} Complete model settings.
 */
var normalizeModels = (models) => {
  const fallback = DEFAULT_CONFIG.models;
  const embedding = models?.embedding || fallback.embedding;
  const summary = models?.summary || fallback.summary;
  const chat = models?.chat || fallback.chat;
  return { embedding, summary, chat };
};
|
|
35
|
+
// Runtime overrides; `normalizeConfig` consults these before the environment
// and the config file.
var overrides = {};

/**
 * Merge `next` into the current overrides. Keys present in `next` replace
 * earlier values; keys it does not mention are kept.
 *
 * @param {object} next - Override values to apply.
 * @returns {void}
 */
var setConfigOverrides = (next) => {
  const merged = { ...overrides, ...next };
  overrides = merged;
};
|
|
39
|
+
/**
 * Build a complete config object from the (possibly missing) file contents.
 *
 * `dataDir` is taken from the first non-empty source in this order: runtime
 * overrides, the MYCROFT_DATA_DIR environment variable, the config file, the
 * built-in default. `askEnabled` falls back to the default only when the file
 * value is null or undefined.
 *
 * @param {object|null} input - Parsed config file contents, or null.
 * @returns {{dataDir: string, askEnabled: boolean, models: object}} Normalized config.
 */
var normalizeConfig = (input) => {
  const fromOverride = overrides.dataDir;
  const fromEnv = process.env.MYCROFT_DATA_DIR;
  const fromFile = input?.dataDir;
  const dataDir = fromOverride || fromEnv || fromFile || DEFAULT_CONFIG.dataDir;
  const askEnabled = input?.askEnabled ?? DEFAULT_CONFIG.askEnabled;
  const models = normalizeModels(input?.models);
  return { dataDir, askEnabled, models };
};
|
|
48
|
+
/**
 * Read and parse the JSON config file at `path`.
 *
 * Best-effort: this never throws. A missing file is the ordinary "no config
 * yet" case and yields `null` silently. Any other failure (unreadable file,
 * malformed JSON) also yields `null`, but is reported on stderr so that a
 * broken config file is not ignored without notice.
 *
 * @param {string} path - Location of the config file.
 * @returns {Promise<*>} The parsed JSON value, or `null` when unavailable.
 */
var readConfigFile = async (path) => {
  try {
    const contents = await readFile(path, "utf-8");
    return JSON.parse(contents);
  } catch (err) {
    // ENOENT just means the user has not created a config file.
    if (err?.code !== "ENOENT") {
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(`Warning: ignoring config file ${path}: ${reason}\n`);
    }
    return null;
  }
};
|
|
56
|
+
/**
 * Load the effective configuration: read the config file, normalize it
 * against overrides, environment and defaults, then make `dataDir` absolute.
 *
 * @returns {Promise<object>} Normalized config with an absolute `dataDir`.
 */
var loadConfig = async () => {
  const fileContents = await readConfigFile(getConfigPath());
  const config = normalizeConfig(fileContents);
  const dataDir = resolvePath(config.dataDir);
  return { ...config, dataDir };
};
|
|
65
|
+
/**
 * Create the directory that holds the config file, including any missing
 * parents. The file itself is not created.
 *
 * @param {string} [configPath2] - Config file path; when omitted or empty the
 *   default config location is used.
 * @returns {Promise<void>}
 */
var ensureConfigDirs = async (configPath2) => {
  const target = configPath2 ? configPath2 : getConfigPath();
  const parent = dirname(target);
  await mkdir(parent, { recursive: true });
};
|
|
69
|
+
/**
 * Exported accessor for the resolved config file path.
 *
 * @returns {string} Same value as `getConfigPath()`.
 */
var configPath = () => {
  return getConfigPath();
};
|
|
70
|
+
|
|
71
|
+
// src/commands/io.ts
|
|
72
|
+
import chalk from "chalk";
|
|
73
|
+
/** @returns {boolean} Whether standard output is attached to a terminal. */
var isTTY = () => !!process.stdout.isTTY;
|
|
74
|
+
/** @returns {boolean} True only when both stdin and stdout are terminals. */
var isInteractive = () => {
  if (!process.stdin.isTTY) return false;
  return !!process.stdout.isTTY;
};
|
|
75
|
+
/**
 * Colour `text` red when stdout is a terminal; otherwise return it untouched.
 *
 * @param {string} text
 * @returns {string}
 */
var formatError = (text) => {
  if (!isTTY()) return text;
  return chalk.red(text);
};
|
|
76
|
+
/**
 * Colour `text` yellow when stdout is a terminal; otherwise return it untouched.
 *
 * @param {string} text
 * @returns {string}
 */
var formatWarn = (text) => {
  if (!isTTY()) return text;
  return chalk.yellow(text);
};
|
|
77
|
+
/**
 * Write `message` to standard output as one line, adding a trailing newline
 * only when the message does not already end with one.
 *
 * @param {string} message
 * @returns {void}
 */
var stdout = (message) => {
  const line = message.endsWith("\n") ? message : `${message}\n`;
  process.stdout.write(line);
};
|
|
81
|
+
/**
 * Write `message` to standard error as one line, adding a trailing newline
 * only when the message does not already end with one.
 *
 * @param {string} message
 * @returns {void}
 */
var stderr = (message) => {
  const line = message.endsWith("\n") ? message : `${message}\n`;
  process.stderr.write(line);
};
|
|
85
|
+
/**
 * Print `message` to stderr prefixed with "Error: ", using the error
 * formatting (red on a terminal).
 *
 * @param {string} message
 * @returns {void}
 */
var printError = (message) => {
  const text = formatError(`Error: ${message}`);
  stderr(text);
};
|
|
88
|
+
/**
 * Emit an informational message. It goes to stderr, not stdout.
 *
 * @param {string} message
 * @returns {void}
 */
var logInfo = (message) => void stderr(message);
|
|
91
|
+
/**
 * Emit a warning on stderr using the warning formatting (yellow on a terminal).
 *
 * @param {string} message
 * @returns {void}
 */
var logWarn = (message) => {
  const text = formatWarn(message);
  stderr(text);
};
|
|
94
|
+
/**
 * Install a one-shot SIGINT handler. When it fires it runs `onCancel` (if
 * given), prints "Cancelled." on stderr and exits the process with code 130.
 *
 * @param {() => void} [onCancel] - Optional callback run before exiting.
 * @returns {() => NodeJS.Process} Function that removes the handler again.
 */
var handleSigint = (onCancel) => {
  const onSigint = () => {
    if (onCancel) {
      onCancel();
    }
    stderr("\nCancelled.");
    process.exit(130);
  };
  process.once("SIGINT", onSigint);
  const unsubscribe = () => process.off("SIGINT", onSigint);
  return unsubscribe;
};
|
|
103
|
+
|
|
104
|
+
// src/services/constants.ts
|
|
105
|
+
// Text-chunking settings. NOTE(review): units (characters vs. tokens) are not
// visible in this file; confirm against the splitter that consumes them.
var CHUNK_SIZE = 1000;
var CHUNK_OVERLAP = 100;
// Candidate split points, listed from paragraph break down to the empty string.
var SEPARATORS = ["\n\n", "\n", ". ", " ", ""];

// Summarization settings.
var SUMMARY_MAX_TOKENS = 30000;
var SUMMARY_CONCURRENCY = 3;
var SUMMARY_TARGET_WORDS = 250;
|
|
111
|
+
/**
 * Derive the on-disk locations used by the application from the configured
 * data directory. Nothing is created here.
 *
 * @returns {Promise<{dataDir: string, booksDir: string, vectorsDir: string, ingestDir: string, dbPath: string}>}
 */
var resolvePaths = async () => {
  const { dataDir } = await loadConfig();
  const inDataDir = (name) => `${dataDir}/${name}`;
  return {
    dataDir,
    booksDir: inDataDir("books"),
    vectorsDir: inDataDir("vectors"),
    ingestDir: inDataDir("ingest"),
    dbPath: inDataDir("metadata.db"),
  };
};
|
|
122
|
+
/**
 * Make sure the data directory and its books, vectors and ingest
 * subdirectories exist, creating them one after another as needed.
 *
 * @returns {Promise<object>} The resolved paths, as returned by `resolvePaths()`.
 */
var ensureDataDirs = async () => {
  const paths = await resolvePaths();
  const directories = [paths.dataDir, paths.booksDir, paths.vectorsDir, paths.ingestDir];
  for (const directory of directories) {
    await mkdir2(directory, { recursive: true });
  }
  return paths;
};
|
|
130
|
+
/**
 * @returns {Promise<{embedding: string, summary: string, chat: string}>}
 *   The model names from the loaded configuration.
 */
var getModels = async () => {
  const { models } = await loadConfig();
  return models;
};
|
|
134
|
+
/**
 * @returns {Promise<boolean>} The `askEnabled` flag from the loaded configuration.
 */
var isAskEnabled = async () => {
  const { askEnabled } = await loadConfig();
  return askEnabled;
};
|
|
138
|
+
/**
 * Guard that fails fast when no OpenAI API key is available.
 *
 * @returns {void}
 * @throws {Error} When the OPENAI_API_KEY environment variable is unset or empty.
 */
var requireOpenAIKey = () => {
  const apiKey = process.env.OPENAI_API_KEY;
  if (apiKey) return;
  throw new Error("OPENAI_API_KEY is not set. Export it to use embeddings and chat.");
};
|
|
143
|
+
|
|
144
|
+
// src/services/batch-embedder.ts
|
|
145
|
+
/**
 * Serialize chunks as a JSONL batch input: one `/v1/embeddings` POST request
 * per line. Each request's `custom_id` is the chunk's index in `chunks`, which
 * is how results are matched back to chunks later.
 *
 * @param {Array<{content: string}>} chunks - Chunks whose `content` is embedded.
 * @param {string} model - Embedding model name placed in every request body.
 * @returns {string} Newline-joined JSON lines (empty string for no chunks).
 */
var buildJsonl = (chunks, model) => {
  const toRequestLine = (chunk, i) =>
    JSON.stringify({
      custom_id: String(i),
      method: "POST",
      url: "/v1/embeddings",
      body: { model, input: chunk.content },
    });
  return chunks.map(toRequestLine).join("\n");
};
|
|
153
|
+
/**
 * Upload the chunks as a JSONL batch input file and create an embeddings
 * batch job for it.
 *
 * If creating the batch fails after the upload succeeded, the uploaded input
 * file is deleted (best-effort) before the error is rethrown, so a failed
 * submission does not leave an orphaned file behind.
 *
 * @param {Array<{content: string}>} chunks - Chunks to embed.
 * @returns {Promise<{batchId: string, inputFileId: string}>} Ids of the created
 *   batch and of its uploaded input file.
 * @throws Whatever the OpenAI client throws for a failed upload or batch creation.
 */
var submitBatchEmbeddings = async (chunks) => {
  const models = await getModels();
  const client = new OpenAI();
  logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);
  const jsonl = buildJsonl(chunks, models.embedding);
  // File accepts string parts directly; no intermediate Blob is needed.
  const file = await client.files.create({
    file: new File([jsonl], "embeddings.jsonl", { type: "application/jsonl" }),
    purpose: "batch",
  });
  logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);
  let batch;
  try {
    batch = await client.batches.create({
      input_file_id: file.id,
      endpoint: "/v1/embeddings",
      completion_window: "24h",
    });
  } catch (err) {
    // Cleanup failure must not mask the original error.
    await client.files.del(file.id).catch(() => void 0);
    throw err;
  }
  logInfo(`[BatchEmbedder] Created batch ${batch.id} \u2014 status: ${batch.status}`);
  return { batchId: batch.id, inputFileId: file.id };
};
|
|
172
|
+
/**
 * Fetch a batch job and summarize its progress.
 *
 * @param {string} batchId - Id of the batch to look up.
 * @returns {Promise<{status: string, completed: number, total: number, outputFileId: (string|null)}>}
 *   Counts default to 0 and `outputFileId` to null when the API omits them.
 */
var checkBatchStatus = async (batchId) => {
  const openai = new OpenAI();
  const {
    status,
    request_counts: counts,
    output_file_id: outputFileId,
  } = await openai.batches.retrieve(batchId);
  return {
    status,
    completed: counts?.completed ?? 0,
    total: counts?.total ?? 0,
    outputFileId: outputFileId ?? null,
  };
};
|
|
182
|
+
/**
 * Download a finished batch's output file and attach each returned embedding
 * to its chunk.
 *
 * Results are matched to chunks through `custom_id`, which holds the chunk's
 * index. A chunk whose request failed, or that has no result line, gets an
 * empty `vector`; each failure and the total number of missing vectors are
 * logged as warnings.
 *
 * Blank lines in the output are skipped, so an empty body or stray blank
 * lines no longer make `JSON.parse` throw.
 *
 * @param {string} outputFileId - Id of the batch output file.
 * @param {Array<object>} chunks - The chunks originally submitted, in the same order.
 * @returns {Promise<Array<object>>} Copies of `chunks`, each with a `vector` array.
 * @throws {SyntaxError} When a non-blank output line is not valid JSON.
 */
var downloadBatchResults = async (outputFileId, chunks) => {
  const client = new OpenAI();
  logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);
  const response = await client.files.content(outputFileId);
  const text = await response.text();
  const lines = text.split("\n").filter((line) => line.trim() !== "");
  const vectors = new Map();
  for (const line of lines) {
    const result = JSON.parse(line);
    const idx = Number(result.custom_id);
    if (result.response?.status_code === 200) {
      const embedding = result.response.body?.data?.[0]?.embedding;
      if (embedding) {
        vectors.set(idx, embedding);
      }
    } else {
      logWarn(
        `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`
      );
    }
  }
  const embedded = chunks.map((chunk, i) => ({
    ...chunk,
    vector: vectors.get(i) ?? [],
  }));
  const missing = embedded.filter((e) => e.vector.length === 0).length;
  if (missing > 0) {
    logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);
  }
  logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);
  return embedded;
};
|
|
214
|
+
/**
 * Delete the batch input file and, when given, the output file. Deletion is
 * best-effort: a failed delete is ignored, and the files are removed one
 * after another, input first.
 *
 * @param {string} inputFileId - Id of the uploaded batch input file.
 * @param {string|null} [outputFileId] - Id of the batch output file, if any.
 * @returns {Promise<void>}
 */
var cleanupBatchFiles = async (inputFileId, outputFileId) => {
  const client = new OpenAI();
  const fileIds = outputFileId ? [inputFileId, outputFileId] : [inputFileId];
  for (const fileId of fileIds) {
    await client.files.del(fileId).catch(() => undefined);
  }
};
|
|
221
|
+
|
|
222
|
+
export {
|
|
223
|
+
setConfigOverrides,
|
|
224
|
+
loadConfig,
|
|
225
|
+
ensureConfigDirs,
|
|
226
|
+
configPath,
|
|
227
|
+
isInteractive,
|
|
228
|
+
stdout,
|
|
229
|
+
printError,
|
|
230
|
+
logInfo,
|
|
231
|
+
logWarn,
|
|
232
|
+
handleSigint,
|
|
233
|
+
CHUNK_SIZE,
|
|
234
|
+
CHUNK_OVERLAP,
|
|
235
|
+
SEPARATORS,
|
|
236
|
+
SUMMARY_MAX_TOKENS,
|
|
237
|
+
SUMMARY_CONCURRENCY,
|
|
238
|
+
SUMMARY_TARGET_WORDS,
|
|
239
|
+
resolvePaths,
|
|
240
|
+
ensureDataDirs,
|
|
241
|
+
getModels,
|
|
242
|
+
isAskEnabled,
|
|
243
|
+
requireOpenAIKey,
|
|
244
|
+
submitBatchEmbeddings,
|
|
245
|
+
checkBatchStatus,
|
|
246
|
+
downloadBatchResults,
|
|
247
|
+
cleanupBatchFiles
|
|
248
|
+
};
|
|
249
|
+
//# sourceMappingURL=chunk-6DLQHHCC.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/services/batch-embedder.ts","../src/services/constants.ts","../src/config.ts","../src/commands/io.ts"],"sourcesContent":["import OpenAI from \"openai\";\nimport type { BookChunk } from \"../shared/types.js\";\nimport type { EmbeddedChunk } from \"./embedder.js\";\nimport { getModels, logInfo, logWarn } from \"./constants.js\";\n\ntype BatchRequestLine = {\n custom_id: string;\n method: \"POST\";\n url: \"/v1/embeddings\";\n body: { model: string; input: string };\n};\n\nconst buildJsonl = (chunks: BookChunk[], model: string): string =>\n chunks\n .map(\n (chunk, i): BatchRequestLine => ({\n custom_id: String(i),\n method: \"POST\",\n url: \"/v1/embeddings\",\n body: { model, input: chunk.content },\n })\n )\n .map((line) => JSON.stringify(line))\n .join(\"\\n\");\n\nexport type BatchSubmitResult = {\n batchId: string;\n inputFileId: string;\n};\n\nexport const submitBatchEmbeddings = async (chunks: BookChunk[]): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);\n\n const jsonl = buildJsonl(chunks, models.embedding);\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"embeddings.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/embeddings\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchEmbedder] Created batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id };\n};\n\nexport type BatchStatus = {\n status: string;\n completed: number;\n total: number;\n outputFileId: string | null;\n};\n\nexport const checkBatchStatus = async (batchId: string): Promise<BatchStatus> => {\n const client = new 
OpenAI();\n const batch = await client.batches.retrieve(batchId);\n return {\n status: batch.status,\n completed: batch.request_counts?.completed ?? 0,\n total: batch.request_counts?.total ?? 0,\n outputFileId: batch.output_file_id ?? null,\n };\n};\n\nexport const downloadBatchResults = async (\n outputFileId: string,\n chunks: BookChunk[],\n): Promise<EmbeddedChunk[]> => {\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const vectors = new Map<number, number[]>();\n for (const line of lines) {\n const result = JSON.parse(line);\n const idx = Number(result.custom_id);\n if (result.response?.status_code === 200) {\n const embedding = result.response.body?.data?.[0]?.embedding;\n if (embedding) {\n vectors.set(idx, embedding);\n }\n } else {\n logWarn(\n `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`\n );\n }\n }\n\n const embedded: EmbeddedChunk[] = chunks.map((chunk, i) => ({\n ...chunk,\n vector: vectors.get(i) ?? 
[],\n }));\n\n const missing = embedded.filter((e) => e.vector.length === 0).length;\n if (missing > 0) {\n logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);\n }\n\n logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);\n return embedded;\n};\n\nexport const cleanupBatchFiles = async (inputFileId: string, outputFileId?: string | null) => {\n const client = new OpenAI();\n await client.files.del(inputFileId).catch(() => undefined);\n if (outputFileId) {\n await client.files.del(outputFileId).catch(() => undefined);\n }\n};\n","import { mkdir } from \"node:fs/promises\";\nimport { loadConfig } from \"../config.js\";\nimport { logInfo, logWarn } from \"../commands/io.js\";\n\nexport const CHUNK_SIZE: number = 1000;\nexport const CHUNK_OVERLAP: number = 100;\nexport const SEPARATORS = [\"\\n\\n\", \"\\n\", \". \", \" \", \"\"] as const;\n\nexport const SUMMARY_MAX_TOKENS = 30000;\nexport const SUMMARY_CONCURRENCY = 3;\nexport const SUMMARY_TARGET_WORDS = 250;\n\nexport type ResolvedPaths = {\n dataDir: string;\n booksDir: string;\n vectorsDir: string;\n ingestDir: string;\n dbPath: string;\n};\n\nexport const resolvePaths = async (): Promise<ResolvedPaths> => {\n const config = await loadConfig();\n const dataDir = config.dataDir;\n return {\n dataDir,\n booksDir: `${dataDir}/books`,\n vectorsDir: `${dataDir}/vectors`,\n ingestDir: `${dataDir}/ingest`,\n dbPath: `${dataDir}/metadata.db`,\n };\n};\n\nexport const ensureDataDirs = async () => {\n const paths = await resolvePaths();\n await mkdir(paths.dataDir, { recursive: true });\n await mkdir(paths.booksDir, { recursive: true });\n await mkdir(paths.vectorsDir, { recursive: true });\n await mkdir(paths.ingestDir, { recursive: true });\n return paths;\n};\n\nexport const getModels = async () => {\n const config = await loadConfig();\n return config.models;\n};\n\nexport const isAskEnabled = async () => {\n const config = await loadConfig();\n 
return config.askEnabled;\n};\n\nexport const requireOpenAIKey = () => {\n if (!process.env.OPENAI_API_KEY) {\n throw new Error(\"OPENAI_API_KEY is not set. Export it to use embeddings and chat.\");\n }\n};\n\nexport { logInfo, logWarn };\n","import { mkdir, readFile } from \"node:fs/promises\";\nimport { homedir } from \"node:os\";\nimport { dirname, join, resolve } from \"node:path\";\n\nexport type ConfigModels = {\n embedding: string;\n summary: string;\n chat: string;\n};\n\nexport type AppConfig = {\n dataDir: string;\n askEnabled: boolean;\n models: ConfigModels;\n};\n\nconst DEFAULT_CONFIG: AppConfig = {\n dataDir: \"~/.local/share/mycroft\",\n askEnabled: true,\n models: {\n embedding: \"text-embedding-3-small\",\n summary: \"gpt-5-nano\",\n chat: \"gpt-5.1\",\n },\n};\n\nconst expandHome = (input: string): string => {\n if (!input.startsWith(\"~\")) return input;\n return join(homedir(), input.slice(1));\n};\n\nconst resolvePath = (input: string): string => resolve(expandHome(input));\n\nconst getConfigPath = (): string => {\n const override = process.env.MYCROFT_CONFIG;\n if (override) return resolvePath(override);\n return resolvePath(\"~/.config/mycroft/config.json\");\n};\n\nconst normalizeModels = (models?: Partial<ConfigModels>): ConfigModels => ({\n embedding: models?.embedding || DEFAULT_CONFIG.models.embedding,\n summary: models?.summary || DEFAULT_CONFIG.models.summary,\n chat: models?.chat || DEFAULT_CONFIG.models.chat,\n});\n\ntype ConfigOverrides = {\n dataDir?: string;\n};\n\nlet overrides: ConfigOverrides = {};\n\nexport const setConfigOverrides = (next: ConfigOverrides) => {\n overrides = { ...overrides, ...next };\n};\n\nconst normalizeConfig = (input: Partial<AppConfig> | null): AppConfig => {\n const dataDirEnv = process.env.MYCROFT_DATA_DIR;\n const dataDir = overrides.dataDir || dataDirEnv || input?.dataDir || DEFAULT_CONFIG.dataDir;\n return {\n dataDir,\n askEnabled: input?.askEnabled ?? 
DEFAULT_CONFIG.askEnabled,\n models: normalizeModels(input?.models),\n };\n};\n\nconst readConfigFile = async (path: string): Promise<Partial<AppConfig> | null> => {\n try {\n const contents = await readFile(path, \"utf-8\");\n return JSON.parse(contents) as Partial<AppConfig>;\n } catch {\n return null;\n }\n};\n\nexport const loadConfig = async (): Promise<AppConfig> => {\n const configPath = getConfigPath();\n const data = await readConfigFile(configPath);\n const normalized = normalizeConfig(data);\n return {\n ...normalized,\n dataDir: resolvePath(normalized.dataDir),\n };\n};\n\nexport const ensureConfigDirs = async (configPath?: string) => {\n const path = configPath || getConfigPath();\n await mkdir(dirname(path), { recursive: true });\n};\n\nexport const configPath = () => getConfigPath();\n","import chalk from \"chalk\";\n\nconst isTTY = () => Boolean(process.stdout.isTTY);\nexport const isInteractive = () => Boolean(process.stdin.isTTY && process.stdout.isTTY);\n\nexport const formatDim = (text: string) => (isTTY() ? chalk.dim(text) : text);\nexport const formatError = (text: string) => (isTTY() ? chalk.red(text) : text);\nexport const formatBold = (text: string) => (isTTY() ? chalk.bold(text) : text);\nexport const formatWarn = (text: string) => (isTTY() ? chalk.yellow(text) : text);\n\nexport const stdout = (message: string) => {\n process.stdout.write(message.endsWith(\"\\n\") ? message : `${message}\\n`);\n};\n\nexport const stderr = (message: string) => {\n process.stderr.write(message.endsWith(\"\\n\") ? 
message : `${message}\\n`);\n};\n\nexport const printError = (message: string) => {\n stderr(formatError(`Error: ${message}`));\n};\n\nexport const logInfo = (message: string) => {\n stderr(message);\n};\n\nexport const logWarn = (message: string) => {\n stderr(formatWarn(message));\n};\n\nexport const handleSigint = (onCancel?: () => void) => {\n const handler = () => {\n if (onCancel) onCancel();\n stderr(\"\\nCancelled.\");\n process.exit(130);\n };\n process.once(\"SIGINT\", handler);\n return () => process.off(\"SIGINT\", handler);\n};\n"],"mappings":";AAAA,OAAO,YAAY;;;ACAnB,SAAS,SAAAA,cAAa;;;ACAtB,SAAS,OAAO,gBAAgB;AAChC,SAAS,eAAe;AACxB,SAAS,SAAS,MAAM,eAAe;AAcvC,IAAM,iBAA4B;AAAA,EAChC,SAAS;AAAA,EACT,YAAY;AAAA,EACZ,QAAQ;AAAA,IACN,WAAW;AAAA,IACX,SAAS;AAAA,IACT,MAAM;AAAA,EACR;AACF;AAEA,IAAM,aAAa,CAAC,UAA0B;AAC5C,MAAI,CAAC,MAAM,WAAW,GAAG,EAAG,QAAO;AACnC,SAAO,KAAK,QAAQ,GAAG,MAAM,MAAM,CAAC,CAAC;AACvC;AAEA,IAAM,cAAc,CAAC,UAA0B,QAAQ,WAAW,KAAK,CAAC;AAExE,IAAM,gBAAgB,MAAc;AAClC,QAAM,WAAW,QAAQ,IAAI;AAC7B,MAAI,SAAU,QAAO,YAAY,QAAQ;AACzC,SAAO,YAAY,+BAA+B;AACpD;AAEA,IAAM,kBAAkB,CAAC,YAAkD;AAAA,EACzE,WAAW,QAAQ,aAAa,eAAe,OAAO;AAAA,EACtD,SAAS,QAAQ,WAAW,eAAe,OAAO;AAAA,EAClD,MAAM,QAAQ,QAAQ,eAAe,OAAO;AAC9C;AAMA,IAAI,YAA6B,CAAC;AAE3B,IAAM,qBAAqB,CAAC,SAA0B;AAC3D,cAAY,EAAE,GAAG,WAAW,GAAG,KAAK;AACtC;AAEA,IAAM,kBAAkB,CAAC,UAAgD;AACvE,QAAM,aAAa,QAAQ,IAAI;AAC/B,QAAM,UAAU,UAAU,WAAW,cAAc,OAAO,WAAW,eAAe;AACpF,SAAO;AAAA,IACL;AAAA,IACA,YAAY,OAAO,cAAc,eAAe;AAAA,IAChD,QAAQ,gBAAgB,OAAO,MAAM;AAAA,EACvC;AACF;AAEA,IAAM,iBAAiB,OAAO,SAAqD;AACjF,MAAI;AACF,UAAM,WAAW,MAAM,SAAS,MAAM,OAAO;AAC7C,WAAO,KAAK,MAAM,QAAQ;AAAA,EAC5B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEO,IAAM,aAAa,YAAgC;AACxD,QAAMC,cAAa,cAAc;AACjC,QAAM,OAAO,MAAM,eAAeA,WAAU;AAC5C,QAAM,aAAa,gBAAgB,IAAI;AACvC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,YAAY,WAAW,OAAO;AAAA,EACzC;AACF;AAEO,IAAM,mBAAmB,OAAOA,gBAAwB;AAC7D,QAAM,OAAOA,eAAc,cAAc;AACzC,QAAM,MAAM,QAAQ,IAAI,GAAG,EAAE,WAAW,KAAK,CAAC;AAChD;AAEO,IAAM,aAAa,MAAM,cAAc;;;ACzF9C,OAAO,WAAW;AAElB,IAAM,QAAQ,MAAM,QAA
Q,QAAQ,OAAO,KAAK;AACzC,IAAM,gBAAgB,MAAM,QAAQ,QAAQ,MAAM,SAAS,QAAQ,OAAO,KAAK;AAG/E,IAAM,cAAc,CAAC,SAAkB,MAAM,IAAI,MAAM,IAAI,IAAI,IAAI;AAEnE,IAAM,aAAa,CAAC,SAAkB,MAAM,IAAI,MAAM,OAAO,IAAI,IAAI;AAErE,IAAM,SAAS,CAAC,YAAoB;AACzC,UAAQ,OAAO,MAAM,QAAQ,SAAS,IAAI,IAAI,UAAU,GAAG,OAAO;AAAA,CAAI;AACxE;AAEO,IAAM,SAAS,CAAC,YAAoB;AACzC,UAAQ,OAAO,MAAM,QAAQ,SAAS,IAAI,IAAI,UAAU,GAAG,OAAO;AAAA,CAAI;AACxE;AAEO,IAAM,aAAa,CAAC,YAAoB;AAC7C,SAAO,YAAY,UAAU,OAAO,EAAE,CAAC;AACzC;AAEO,IAAM,UAAU,CAAC,YAAoB;AAC1C,SAAO,OAAO;AAChB;AAEO,IAAM,UAAU,CAAC,YAAoB;AAC1C,SAAO,WAAW,OAAO,CAAC;AAC5B;AAEO,IAAM,eAAe,CAAC,aAA0B;AACrD,QAAM,UAAU,MAAM;AACpB,QAAI,SAAU,UAAS;AACvB,WAAO,cAAc;AACrB,YAAQ,KAAK,GAAG;AAAA,EAClB;AACA,UAAQ,KAAK,UAAU,OAAO;AAC9B,SAAO,MAAM,QAAQ,IAAI,UAAU,OAAO;AAC5C;;;AFlCO,IAAM,aAAqB;AAC3B,IAAM,gBAAwB;AAC9B,IAAM,aAAa,CAAC,QAAQ,MAAM,MAAM,KAAK,EAAE;AAE/C,IAAM,qBAAqB;AAC3B,IAAM,sBAAsB;AAC5B,IAAM,uBAAuB;AAU7B,IAAM,eAAe,YAAoC;AAC9D,QAAM,SAAS,MAAM,WAAW;AAChC,QAAM,UAAU,OAAO;AACvB,SAAO;AAAA,IACL;AAAA,IACA,UAAU,GAAG,OAAO;AAAA,IACpB,YAAY,GAAG,OAAO;AAAA,IACtB,WAAW,GAAG,OAAO;AAAA,IACrB,QAAQ,GAAG,OAAO;AAAA,EACpB;AACF;AAEO,IAAM,iBAAiB,YAAY;AACxC,QAAM,QAAQ,MAAM,aAAa;AACjC,QAAMC,OAAM,MAAM,SAAS,EAAE,WAAW,KAAK,CAAC;AAC9C,QAAMA,OAAM,MAAM,UAAU,EAAE,WAAW,KAAK,CAAC;AAC/C,QAAMA,OAAM,MAAM,YAAY,EAAE,WAAW,KAAK,CAAC;AACjD,QAAMA,OAAM,MAAM,WAAW,EAAE,WAAW,KAAK,CAAC;AAChD,SAAO;AACT;AAEO,IAAM,YAAY,YAAY;AACnC,QAAM,SAAS,MAAM,WAAW;AAChC,SAAO,OAAO;AAChB;AAEO,IAAM,eAAe,YAAY;AACtC,QAAM,SAAS,MAAM,WAAW;AAChC,SAAO,OAAO;AAChB;AAEO,IAAM,mBAAmB,MAAM;AACpC,MAAI,CAAC,QAAQ,IAAI,gBAAgB;AAC/B,UAAM,IAAI,MAAM,kEAAkE;AAAA,EACpF;AACF;;;AD3CA,IAAM,aAAa,CAAC,QAAqB,UACvC,OACG;AAAA,EACC,CAAC,OAAO,OAAyB;AAAA,IAC/B,WAAW,OAAO,CAAC;AAAA,IACnB,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,MAAM,EAAE,OAAO,OAAO,MAAM,QAAQ;AAAA,EACtC;AACF,EACC,IAAI,CAAC,SAAS,KAAK,UAAU,IAAI,CAAC,EAClC,KAAK,IAAI;AAOP,IAAM,wBAAwB,OAAO,WAAoD;AAC9F,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,+CAA+C,OAAO,MAAM,SAAS;AAE7E,QAAM,QAAQ,WAAW,QAAQ,OAAO,SAAS;AACjD,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG
,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IACxE,SAAS;AAAA,EACX,CAAC;AACD,UAAQ,uCAAuC,KAAK,EAAE,EAAE;AAExD,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,iCAAiC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAE7E,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,GAAG;AACnD;AASO,IAAM,mBAAmB,OAAO,YAA0C;AAC/E,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,QAAQ,MAAM,OAAO,QAAQ,SAAS,OAAO;AACnD,SAAO;AAAA,IACL,QAAQ,MAAM;AAAA,IACd,WAAW,MAAM,gBAAgB,aAAa;AAAA,IAC9C,OAAO,MAAM,gBAAgB,SAAS;AAAA,IACtC,cAAc,MAAM,kBAAkB;AAAA,EACxC;AACF;AAEO,IAAM,uBAAuB,OAClC,cACA,WAC6B;AAC7B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,4CAA4C,YAAY,EAAE;AAClE,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,UAAU,oBAAI,IAAsB;AAC1C,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,MAAM,IAAI;AAC9B,UAAM,MAAM,OAAO,OAAO,SAAS;AACnC,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC,YAAM,YAAY,OAAO,SAAS,MAAM,OAAO,CAAC,GAAG;AACnD,UAAI,WAAW;AACb,gBAAQ,IAAI,KAAK,SAAS;AAAA,MAC5B;AAAA,IACF,OAAO;AACL;AAAA,QACE,2BAA2B,GAAG,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC;AAAA,MACxG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WAA4B,OAAO,IAAI,CAAC,OAAO,OAAO;AAAA,IAC1D,GAAG;AAAA,IACH,QAAQ,QAAQ,IAAI,CAAC,KAAK,CAAC;AAAA,EAC7B,EAAE;AAEF,QAAM,UAAU,SAAS,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,CAAC,EAAE;AAC9D,MAAI,UAAU,GAAG;AACf,YAAQ,mBAAmB,OAAO,qDAAqD;AAAA,EACzF;AAEA,UAAQ,0CAA0C,SAAS,MAAM,uBAAuB;AACxF,SAAO;AACT;AAEO,IAAM,oBAAoB,OAAO,aAAqB,iBAAiC;AAC5F,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,OAAO,MAAM,IAAI,WAAW,EAAE,MAAM,MAAM,MAAS;AACzD,MAAI,cAAc;AAChB,UAAM,OAAO,MAAM,IAAI,YAAY,EAAE,MAAM,MAAM,MAAS;AAAA,EAC5D;AACF;","names":["mkdir","configPath","mkdir"]}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getModels,
|
|
3
|
+
logInfo,
|
|
4
|
+
logWarn
|
|
5
|
+
} from "./chunk-KGG7WEYE.js";
|
|
6
|
+
|
|
7
|
+
// src/services/batch-embedder.ts
|
|
8
|
+
import OpenAI from "openai";
|
|
9
|
+
/**
 * Serialize chunks as a JSONL batch input: one `/v1/embeddings` POST request
 * per line. Each request's `custom_id` is the chunk's index in `chunks`, which
 * is how results are matched back to chunks later.
 *
 * @param {Array<{content: string}>} chunks - Chunks whose `content` is embedded.
 * @param {string} model - Embedding model name placed in every request body.
 * @returns {string} Newline-joined JSON lines (empty string for no chunks).
 */
var buildJsonl = (chunks, model) => {
  const toRequestLine = (chunk, i) =>
    JSON.stringify({
      custom_id: String(i),
      method: "POST",
      url: "/v1/embeddings",
      body: { model, input: chunk.content },
    });
  return chunks.map(toRequestLine).join("\n");
};
|
|
17
|
+
/**
 * Upload the chunks as a JSONL batch input file and create an embeddings
 * batch job for it.
 *
 * If creating the batch fails after the upload succeeded, the uploaded input
 * file is deleted (best-effort) before the error is rethrown, so a failed
 * submission does not leave an orphaned file behind.
 *
 * @param {Array<{content: string}>} chunks - Chunks to embed.
 * @returns {Promise<{batchId: string, inputFileId: string}>} Ids of the created
 *   batch and of its uploaded input file.
 * @throws Whatever the OpenAI client throws for a failed upload or batch creation.
 */
var submitBatchEmbeddings = async (chunks) => {
  const models = await getModels();
  const client = new OpenAI();
  logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);
  const jsonl = buildJsonl(chunks, models.embedding);
  // File accepts string parts directly; no intermediate Blob is needed.
  const file = await client.files.create({
    file: new File([jsonl], "embeddings.jsonl", { type: "application/jsonl" }),
    purpose: "batch",
  });
  logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);
  let batch;
  try {
    batch = await client.batches.create({
      input_file_id: file.id,
      endpoint: "/v1/embeddings",
      completion_window: "24h",
    });
  } catch (err) {
    // Cleanup failure must not mask the original error.
    await client.files.del(file.id).catch(() => void 0);
    throw err;
  }
  logInfo(`[BatchEmbedder] Created batch ${batch.id} \u2014 status: ${batch.status}`);
  return { batchId: batch.id, inputFileId: file.id };
};
|
|
36
|
+
/**
 * Fetch a batch job and summarize its progress.
 *
 * @param {string} batchId - Id of the batch to look up.
 * @returns {Promise<{status: string, completed: number, total: number, outputFileId: (string|null)}>}
 *   Counts default to 0 and `outputFileId` to null when the API omits them.
 */
var checkBatchStatus = async (batchId) => {
  const openai = new OpenAI();
  const {
    status,
    request_counts: counts,
    output_file_id: outputFileId,
  } = await openai.batches.retrieve(batchId);
  return {
    status,
    completed: counts?.completed ?? 0,
    total: counts?.total ?? 0,
    outputFileId: outputFileId ?? null,
  };
};
|
|
46
|
+
/**
 * Download a finished batch's output file and attach each returned embedding
 * to its chunk.
 *
 * Results are matched to chunks through `custom_id`, which holds the chunk's
 * index. A chunk whose request failed, or that has no result line, gets an
 * empty `vector`; each failure and the total number of missing vectors are
 * logged as warnings.
 *
 * Blank lines in the output are skipped, so an empty body or stray blank
 * lines no longer make `JSON.parse` throw.
 *
 * @param {string} outputFileId - Id of the batch output file.
 * @param {Array<object>} chunks - The chunks originally submitted, in the same order.
 * @returns {Promise<Array<object>>} Copies of `chunks`, each with a `vector` array.
 * @throws {SyntaxError} When a non-blank output line is not valid JSON.
 */
var downloadBatchResults = async (outputFileId, chunks) => {
  const client = new OpenAI();
  logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);
  const response = await client.files.content(outputFileId);
  const text = await response.text();
  const lines = text.split("\n").filter((line) => line.trim() !== "");
  const vectors = new Map();
  for (const line of lines) {
    const result = JSON.parse(line);
    const idx = Number(result.custom_id);
    if (result.response?.status_code === 200) {
      const embedding = result.response.body?.data?.[0]?.embedding;
      if (embedding) {
        vectors.set(idx, embedding);
      }
    } else {
      logWarn(
        `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`
      );
    }
  }
  const embedded = chunks.map((chunk, i) => ({
    ...chunk,
    vector: vectors.get(i) ?? [],
  }));
  const missing = embedded.filter((e) => e.vector.length === 0).length;
  if (missing > 0) {
    logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);
  }
  logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);
  return embedded;
};
|
|
78
|
+
/**
 * Delete the batch input file and, when given, the output file. Deletion is
 * best-effort: a failed delete is ignored, and the files are removed one
 * after another, input first.
 *
 * @param {string} inputFileId - Id of the uploaded batch input file.
 * @param {string|null} [outputFileId] - Id of the batch output file, if any.
 * @returns {Promise<void>}
 */
var cleanupBatchFiles = async (inputFileId, outputFileId) => {
  const client = new OpenAI();
  const fileIds = outputFileId ? [inputFileId, outputFileId] : [inputFileId];
  for (const fileId of fileIds) {
    await client.files.del(fileId).catch(() => undefined);
  }
};
|
|
85
|
+
|
|
86
|
+
export {
|
|
87
|
+
submitBatchEmbeddings,
|
|
88
|
+
checkBatchStatus,
|
|
89
|
+
downloadBatchResults,
|
|
90
|
+
cleanupBatchFiles
|
|
91
|
+
};
|
|
92
|
+
//# sourceMappingURL=chunk-7CO4PMU5.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/services/batch-embedder.ts"],"sourcesContent":["import OpenAI from \"openai\";\nimport type { BookChunk } from \"../shared/types.js\";\nimport type { EmbeddedChunk } from \"./embedder.js\";\nimport { getModels, logInfo, logWarn } from \"./constants.js\";\n\ntype BatchRequestLine = {\n custom_id: string;\n method: \"POST\";\n url: \"/v1/embeddings\";\n body: { model: string; input: string };\n};\n\nconst buildJsonl = (chunks: BookChunk[], model: string): string =>\n chunks\n .map(\n (chunk, i): BatchRequestLine => ({\n custom_id: String(i),\n method: \"POST\",\n url: \"/v1/embeddings\",\n body: { model, input: chunk.content },\n })\n )\n .map((line) => JSON.stringify(line))\n .join(\"\\n\");\n\nexport type BatchSubmitResult = {\n batchId: string;\n inputFileId: string;\n};\n\nexport const submitBatchEmbeddings = async (chunks: BookChunk[]): Promise<BatchSubmitResult> => {\n const models = await getModels();\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Preparing batch request for ${chunks.length} chunks`);\n\n const jsonl = buildJsonl(chunks, models.embedding);\n const blob = new Blob([jsonl], { type: \"application/jsonl\" });\n const file = await client.files.create({\n file: new File([blob], \"embeddings.jsonl\", { type: \"application/jsonl\" }),\n purpose: \"batch\",\n });\n logInfo(`[BatchEmbedder] Uploaded input file ${file.id}`);\n\n const batch = await client.batches.create({\n input_file_id: file.id,\n endpoint: \"/v1/embeddings\",\n completion_window: \"24h\",\n });\n logInfo(`[BatchEmbedder] Created batch ${batch.id} — status: ${batch.status}`);\n\n return { batchId: batch.id, inputFileId: file.id };\n};\n\nexport type BatchStatus = {\n status: string;\n completed: number;\n total: number;\n outputFileId: string | null;\n};\n\nexport const checkBatchStatus = async (batchId: string): Promise<BatchStatus> => {\n const client = new OpenAI();\n const batch = await client.batches.retrieve(batchId);\n return {\n 
status: batch.status,\n completed: batch.request_counts?.completed ?? 0,\n total: batch.request_counts?.total ?? 0,\n outputFileId: batch.output_file_id ?? null,\n };\n};\n\nexport const downloadBatchResults = async (\n outputFileId: string,\n chunks: BookChunk[],\n): Promise<EmbeddedChunk[]> => {\n const client = new OpenAI();\n\n logInfo(`[BatchEmbedder] Downloading results from ${outputFileId}`);\n const response = await client.files.content(outputFileId);\n const text = await response.text();\n const lines = text.trim().split(\"\\n\");\n\n const vectors = new Map<number, number[]>();\n for (const line of lines) {\n const result = JSON.parse(line);\n const idx = Number(result.custom_id);\n if (result.response?.status_code === 200) {\n const embedding = result.response.body?.data?.[0]?.embedding;\n if (embedding) {\n vectors.set(idx, embedding);\n }\n } else {\n logWarn(\n `[BatchEmbedder] Request ${idx} failed: ${JSON.stringify(result.response?.body?.error ?? result.error)}`\n );\n }\n }\n\n const embedded: EmbeddedChunk[] = chunks.map((chunk, i) => ({\n ...chunk,\n vector: vectors.get(i) ?? 
[],\n }));\n\n const missing = embedded.filter((e) => e.vector.length === 0).length;\n if (missing > 0) {\n logWarn(`[BatchEmbedder] ${missing} chunk(s) have empty embeddings due to batch errors`);\n }\n\n logInfo(`[BatchEmbedder] Successfully processed ${embedded.length} chunks via batch API`);\n return embedded;\n};\n\nexport const cleanupBatchFiles = async (inputFileId: string, outputFileId?: string | null) => {\n const client = new OpenAI();\n await client.files.del(inputFileId).catch(() => undefined);\n if (outputFileId) {\n await client.files.del(outputFileId).catch(() => undefined);\n }\n};\n"],"mappings":";;;;;;;AAAA,OAAO,YAAY;AAYnB,IAAM,aAAa,CAAC,QAAqB,UACvC,OACG;AAAA,EACC,CAAC,OAAO,OAAyB;AAAA,IAC/B,WAAW,OAAO,CAAC;AAAA,IACnB,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,MAAM,EAAE,OAAO,OAAO,MAAM,QAAQ;AAAA,EACtC;AACF,EACC,IAAI,CAAC,SAAS,KAAK,UAAU,IAAI,CAAC,EAClC,KAAK,IAAI;AAOP,IAAM,wBAAwB,OAAO,WAAoD;AAC9F,QAAM,SAAS,MAAM,UAAU;AAC/B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,+CAA+C,OAAO,MAAM,SAAS;AAE7E,QAAM,QAAQ,WAAW,QAAQ,OAAO,SAAS;AACjD,QAAM,OAAO,IAAI,KAAK,CAAC,KAAK,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAC5D,QAAM,OAAO,MAAM,OAAO,MAAM,OAAO;AAAA,IACrC,MAAM,IAAI,KAAK,CAAC,IAAI,GAAG,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAAA,IACxE,SAAS;AAAA,EACX,CAAC;AACD,UAAQ,uCAAuC,KAAK,EAAE,EAAE;AAExD,QAAM,QAAQ,MAAM,OAAO,QAAQ,OAAO;AAAA,IACxC,eAAe,KAAK;AAAA,IACpB,UAAU;AAAA,IACV,mBAAmB;AAAA,EACrB,CAAC;AACD,UAAQ,iCAAiC,MAAM,EAAE,mBAAc,MAAM,MAAM,EAAE;AAE7E,SAAO,EAAE,SAAS,MAAM,IAAI,aAAa,KAAK,GAAG;AACnD;AASO,IAAM,mBAAmB,OAAO,YAA0C;AAC/E,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,QAAQ,MAAM,OAAO,QAAQ,SAAS,OAAO;AACnD,SAAO;AAAA,IACL,QAAQ,MAAM;AAAA,IACd,WAAW,MAAM,gBAAgB,aAAa;AAAA,IAC9C,OAAO,MAAM,gBAAgB,SAAS;AAAA,IACtC,cAAc,MAAM,kBAAkB;AAAA,EACxC;AACF;AAEO,IAAM,uBAAuB,OAClC,cACA,WAC6B;AAC7B,QAAM,SAAS,IAAI,OAAO;AAE1B,UAAQ,4CAA4C,YAAY,EAAE;AAClE,QAAM,WAAW,MAAM,OAAO,MAAM,QAAQ,YAAY;AACxD,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,QAAM,QAAQ,KAAK,KAAK,EAAE,MAAM,IAAI;AAEpC,QAAM,UAAU,oBAAI,IAAsB;AAC1C,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,MAAM,IAAI;AAC9B,UAAM,MAAM,OAAO,
OAAO,SAAS;AACnC,QAAI,OAAO,UAAU,gBAAgB,KAAK;AACxC,YAAM,YAAY,OAAO,SAAS,MAAM,OAAO,CAAC,GAAG;AACnD,UAAI,WAAW;AACb,gBAAQ,IAAI,KAAK,SAAS;AAAA,MAC5B;AAAA,IACF,OAAO;AACL;AAAA,QACE,2BAA2B,GAAG,YAAY,KAAK,UAAU,OAAO,UAAU,MAAM,SAAS,OAAO,KAAK,CAAC;AAAA,MACxG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WAA4B,OAAO,IAAI,CAAC,OAAO,OAAO;AAAA,IAC1D,GAAG;AAAA,IACH,QAAQ,QAAQ,IAAI,CAAC,KAAK,CAAC;AAAA,EAC7B,EAAE;AAEF,QAAM,UAAU,SAAS,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,CAAC,EAAE;AAC9D,MAAI,UAAU,GAAG;AACf,YAAQ,mBAAmB,OAAO,qDAAqD;AAAA,EACzF;AAEA,UAAQ,0CAA0C,SAAS,MAAM,uBAAuB;AACxF,SAAO;AACT;AAEO,IAAM,oBAAoB,OAAO,aAAqB,iBAAiC;AAC5F,QAAM,SAAS,IAAI,OAAO;AAC1B,QAAM,OAAO,MAAM,IAAI,WAAW,EAAE,MAAM,MAAM,MAAS;AACzD,MAAI,cAAc;AAChB,UAAM,OAAO,MAAM,IAAI,YAAY,EAAE,MAAM,MAAM,MAAS;AAAA,EAC5D;AACF;","names":[]}
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import {
|
|
2
|
+
SUMMARY_MAX_TOKENS,
|
|
3
|
+
SUMMARY_TARGET_WORDS,
|
|
4
|
+
getModels,
|
|
5
|
+
logInfo,
|
|
6
|
+
logWarn
|
|
7
|
+
} from "./chunk-KGG7WEYE.js";
|
|
8
|
+
|
|
9
|
+
// src/services/batch-summarizer.ts
|
|
10
|
+
import OpenAI from "openai";
|
|
11
|
+
// Heuristic ratio used for token estimates: four characters count as one token.
var CHARS_PER_TOKEN = 4;

/**
 * Estimate the token count of a piece of text from its length alone.
 *
 * @param {string} text - Text to measure; only `text.length` is read.
 * @returns {number} `text.length / CHARS_PER_TOKEN`, rounded up.
 */
var estimateTokens = (text) => {
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
};
|
|
13
|
+
/**
 * Build the user prompt that asks the model for a structured chapter summary.
 *
 * The prompt embeds the chapter title, number and text, then requests a JSON
 * object with `characters`, `events`, `setting` and `revelations` keys and a
 * target length of SUMMARY_TARGET_WORDS words.
 *
 * @param {string} title - Chapter title, interpolated as-is.
 * @param {number} chapterNum - Chapter number shown to the model.
 * @param {string} content - Text placed between the `---` delimiters.
 * @returns {string} The complete prompt text.
 */
var SUMMARY_PROMPT = (title, chapterNum, content) => {
  const prompt = `You are analyzing a chapter from a book (fiction or nonfiction). Extract key information to help readers understand the chapter's content.

Chapter Title: ${title}
Chapter Number: ${chapterNum}

---
${content}
---

Extract the following information and respond ONLY with valid JSON (no markdown, no code blocks):

{
"characters": ["Name - brief description (role, traits, first appearance)", ...],
"events": "What happens in this chapter? (2-3 sentences)",
"setting": "Where does this chapter take place?",
"revelations": "Any important information revealed? (secrets, backstory, foreshadowing)"
}

Keep the total response around ${SUMMARY_TARGET_WORDS} words.`;
  return prompt;
};
|
|
32
|
+
/**
 * Cut text into roughly equal character slices so that each slice's estimated
 * token count fits within `maxTokens`.
 *
 * Text that already fits is returned as a single-element array. Otherwise the
 * number of slices is the estimated token count divided by `maxTokens`
 * (rounded up); every slice has the same character width except the last,
 * which also takes the remainder. Cuts are made purely by character offset.
 *
 * @param {string} text - Text to split.
 * @param {number} maxTokens - Upper bound on estimated tokens per slice.
 * @returns {string[]} Slices in original order; joined, they equal `text`.
 */
var splitIntoSections = (text, maxTokens) => {
  const tokenEstimate = estimateTokens(text);
  if (tokenEstimate <= maxTokens) {
    return [text];
  }

  const sectionTotal = Math.ceil(tokenEstimate / maxTokens);
  const sliceWidth = Math.floor(text.length / sectionTotal);
  const pieces = [];
  let offset = 0;
  for (let remaining = sectionTotal; remaining > 0; remaining--) {
    // The final slice runs to the end of the text to absorb the rounding remainder.
    const stop = remaining === 1 ? text.length : offset + sliceWidth;
    pieces.push(text.slice(offset, stop));
    offset += sliceWidth;
  }
  return pieces;
};
|
|
45
|
+
/**
 * Turn chapters into batch request lines (JSONL) plus per-chapter bookkeeping.
 *
 * A chapter whose estimated token count is within SUMMARY_MAX_TOKENS yields one
 * request with custom id `summary-<chapterIndex>` that uses SUMMARY_PROMPT.
 * A larger chapter is split with splitIntoSections and yields one request per
 * section with custom id `section-<chapterIndex>-<sectionIndex>`; its metadata
 * entry is flagged `needsTwoPass` and records the number of sections.
 *
 * @param {Array<{title: string, content: string}>} chapters - Chapters in order.
 * @param {string} model - Model name placed in every request body.
 * @returns {{jsonl: string, metadata: Array<{chapterIndex: number, title: string, needsTwoPass: boolean, sectionCount: number}>}}
 *   Newline-joined request lines and one metadata entry per chapter.
 */
var buildJsonl = (chapters, model) => {
  // Serialise one chat-completion request carrying a single user message.
  const toRequestLine = (customId, prompt) =>
    JSON.stringify({
      custom_id: customId,
      method: "POST",
      url: "/v1/chat/completions",
      body: {
        model,
        messages: [{ role: "user", content: prompt }]
      }
    });

  const requestLines = [];
  const chapterMeta = [];

  for (const [chapterIndex, chapter] of chapters.entries()) {
    const { title } = chapter;
    const fitsInOneRequest = estimateTokens(chapter.content) <= SUMMARY_MAX_TOKENS;

    if (fitsInOneRequest) {
      const prompt = SUMMARY_PROMPT(title, chapterIndex + 1, chapter.content);
      requestLines.push(toRequestLine(`summary-${chapterIndex}`, prompt));
      chapterMeta.push({ chapterIndex, title, needsTwoPass: false, sectionCount: 1 });
      continue;
    }

    const parts = splitIntoSections(chapter.content, SUMMARY_MAX_TOKENS);
    parts.forEach((part, partIndex) => {
      const prompt = `Summarize this section from chapter "${title}" (Part ${partIndex + 1}). Focus on key events, characters, and revelations. Keep it concise (100-150 words):\n\n${part}`;
      requestLines.push(toRequestLine(`section-${chapterIndex}-${partIndex}`, prompt));
    });
    chapterMeta.push({ chapterIndex, title, needsTwoPass: true, sectionCount: parts.length });
  }

  return { jsonl: requestLines.join("\n"), metadata: chapterMeta };
};
|
|
87
|
+
/**
 * Upload the summary requests for a set of chapters and start a batch job.
 *
 * The requests are produced by buildJsonl using the `summary` model returned
 * by getModels, uploaded as `summaries.jsonl` with purpose `batch`, and then
 * submitted against `/v1/chat/completions` with a 24h completion window.
 *
 * @param {Array<{title: string, content: string}>} chapters - Chapters to summarise.
 * @returns {Promise<{batchId: string, inputFileId: string, metadata: Array<object>}>}
 *   Ids of the created batch and uploaded file, plus buildJsonl's metadata.
 */
var submitBatchSummaries = async (chapters) => {
  const models = await getModels();
  const openai = new OpenAI();
  logInfo(`[BatchSummarizer] Preparing batch request for ${chapters.length} chapters`);

  const prepared = buildJsonl(chapters, models.summary);
  const payload = new Blob([prepared.jsonl], { type: "application/jsonl" });
  const upload = new File([payload], "summaries.jsonl", { type: "application/jsonl" });
  const inputFile = await openai.files.create({ file: upload, purpose: "batch" });
  logInfo(`[BatchSummarizer] Uploaded input file ${inputFile.id}`);

  const job = await openai.batches.create({
    input_file_id: inputFile.id,
    endpoint: "/v1/chat/completions",
    completion_window: "24h"
  });
  logInfo(`[BatchSummarizer] Created batch ${job.id} \u2014 status: ${job.status}`);

  return { batchId: job.id, inputFileId: inputFile.id, metadata: prepared.metadata };
};
|
|
106
|
+
/**
 * Strip a surrounding Markdown code fence from a model reply.
 *
 * Handles an optional alphabetic language tag after the opening fence and a
 * missing closing fence, so the payload is never truncated.
 *
 * @param {string} text - Raw model reply.
 * @returns {string} The trimmed reply without its fence markers.
 */
var stripCodeFence = (text) => {
  let body = text.trim();
  if (!body.startsWith("```")) {
    return body;
  }
  // Drop the opening fence and any language tag (```json, ```JSON, ...).
  body = body.slice(3).replace(/^[a-zA-Z]*/, "");
  // Only remove a closing fence when one is really there; blindly cutting the
  // last three characters would corrupt the JSON of an unterminated reply.
  if (body.endsWith("```")) {
    body = body.slice(0, -3);
  }
  return body.trim();
};

/**
 * Parse a model reply into a structured chapter summary.
 *
 * The reply is expected to be a JSON object with `characters`, `events`,
 * `setting` and `revelations` keys, optionally wrapped in a Markdown code
 * fence. `characters` is normalised to an array (a single value is wrapped,
 * a missing value becomes an empty list) so one malformed field does not
 * discard the whole summary.
 *
 * @param {string} text - Raw model reply.
 * @param {number} chapterIndex - Zero-based chapter index; shown one-based in `fullSummary`.
 * @param {string} title - Chapter title.
 * @returns {{chapterIndex: number, chapterTitle: string, characters: Array, events: *, setting: *, revelations: *, fullSummary: string} | null}
 *   The summary, or `null` (after a warning is logged) when the reply is not a JSON object.
 */
var parseStructuredSummary = (text, chapterIndex, title) => {
  try {
    const parsed = JSON.parse(stripCodeFence(text));
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new TypeError("expected a JSON object");
    }

    let characters;
    if (Array.isArray(parsed.characters)) {
      characters = parsed.characters;
    } else if (parsed.characters == null) {
      characters = [];
    } else {
      characters = [parsed.characters];
    }

    const fullSummary = `Chapter ${chapterIndex + 1}: ${title}

Characters: ${characters.join(", ")}

Events: ${parsed.events}

Setting: ${parsed.setting}

Revelations: ${parsed.revelations}`;
    return {
      chapterIndex,
      chapterTitle: title,
      characters,
      events: parsed.events,
      setting: parsed.setting,
      revelations: parsed.revelations,
      fullSummary
    };
  } catch (error) {
    logWarn(`[BatchSummarizer] Failed to parse summary JSON for "${title}": ${error instanceof Error ? error.message : String(error)}`);
    return null;
  }
};
|
|
138
|
+
/**
 * Download a summary batch's output file and sort its results into finished
 * summaries and chapters that still need a merge pass.
 *
 * Each non-blank output line is parsed as JSON on its own: a blank or
 * unparseable line is logged and skipped, so an empty output file or a single
 * bad line cannot abort the whole download. Successful responses (status 200
 * with message content) are indexed by `custom_id`. For every metadata entry:
 *   - single-pass chapters are parsed with parseStructuredSummary;
 *   - two-pass chapters are queued for merging only when every section
 *     result is present.
 *
 * @param {string} outputFileId - Id of the batch output file.
 * @param {Array} chapters - Unused here; kept for interface compatibility.
 * @param {Array<{chapterIndex: number, title: string, needsTwoPass: boolean, sectionCount: number}>} metadata
 *   Per-chapter bookkeeping produced when the batch was submitted.
 * @returns {Promise<{summaries: Array<object>, needsMergePass: Array<{chapterIndex: number, title: string, sectionSummaries: string[]}>}>}
 */
var downloadBatchSummaryResults = async (outputFileId, chapters, metadata) => {
  const client = new OpenAI();
  logInfo(`[BatchSummarizer] Downloading results from ${outputFileId}`);
  const response = await client.files.content(outputFileId);
  const text = await response.text();

  const results = new Map();
  // Tolerate CRLF line endings and blank lines (including a fully empty file).
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line === "") continue;

    let result;
    try {
      result = JSON.parse(line);
    } catch (error) {
      logWarn(`[BatchSummarizer] Skipping unparseable result line: ${error instanceof Error ? error.message : String(error)}`);
      continue;
    }

    if (result?.response?.status_code === 200) {
      const content = result.response.body?.choices?.[0]?.message?.content;
      if (content) {
        results.set(result.custom_id, content);
      }
    } else {
      logWarn(`[BatchSummarizer] Request ${result?.custom_id} failed: ${JSON.stringify(result?.response?.body?.error ?? result?.error)}`);
    }
  }

  const summaries = [];
  const needsMergePass = [];
  for (const meta of metadata) {
    if (!meta.needsTwoPass) {
      const content = results.get(`summary-${meta.chapterIndex}`);
      if (content) {
        const summary = parseStructuredSummary(content, meta.chapterIndex, meta.title);
        if (summary) summaries.push(summary);
      } else {
        // Mirror the two-pass branch: make a dropped chapter visible in the logs.
        logWarn(`[BatchSummarizer] Missing summary result for chapter ${meta.chapterIndex + 1} "${meta.title}"`);
      }
      continue;
    }

    const sectionSummaries = [];
    let allPresent = true;
    for (let s = 0; s < meta.sectionCount; s++) {
      const content = results.get(`section-${meta.chapterIndex}-${s}`);
      if (content) {
        sectionSummaries.push(content);
      } else {
        allPresent = false;
      }
    }
    if (allPresent && sectionSummaries.length > 0) {
      needsMergePass.push({ chapterIndex: meta.chapterIndex, title: meta.title, sectionSummaries });
    } else {
      logWarn(`[BatchSummarizer] Missing section results for chapter ${meta.chapterIndex + 1} "${meta.title}"`);
    }
  }

  logInfo(`[BatchSummarizer] Parsed ${summaries.length} direct summaries, ${needsMergePass.length} chapters need merge pass`);
  return { summaries, needsMergePass };
};
|
|
186
|
+
/**
 * Submit the second-pass batch that merges section summaries into one
 * structured summary per chapter.
 *
 * For each chapter the section summaries are joined with blank lines and fed
 * to SUMMARY_PROMPT; the request's custom id is `summary-<chapterIndex>`. The
 * requests are uploaded as `summaries-merge.jsonl` and submitted against
 * `/v1/chat/completions` with a 24h completion window.
 *
 * @param {Array<{chapterIndex: number, title: string, sectionSummaries: string[]}>} mergeChapters
 *   Chapters whose section summaries are all available.
 * @returns {Promise<{batchId: string, inputFileId: string, metadata: Array<{chapterIndex: number, title: string, needsTwoPass: boolean, sectionCount: number}>}>}
 */
var submitMergePass = async (mergeChapters) => {
  const models = await getModels();
  const openai = new OpenAI();

  const requestLines = mergeChapters.map((chapter) => {
    const mergedSections = chapter.sectionSummaries.join("\n\n");
    const prompt = SUMMARY_PROMPT(chapter.title, chapter.chapterIndex + 1, mergedSections);
    return JSON.stringify({
      custom_id: `summary-${chapter.chapterIndex}`,
      method: "POST",
      url: "/v1/chat/completions",
      body: {
        model: models.summary,
        messages: [{ role: "user", content: prompt }]
      }
    });
  });
  // After merging, every chapter is a plain single-request summary.
  const metadata = mergeChapters.map((chapter) => ({
    chapterIndex: chapter.chapterIndex,
    title: chapter.title,
    needsTwoPass: false,
    sectionCount: 1
  }));

  const payload = new Blob([requestLines.join("\n")], { type: "application/jsonl" });
  const upload = new File([payload], "summaries-merge.jsonl", { type: "application/jsonl" });
  const inputFile = await openai.files.create({ file: upload, purpose: "batch" });
  logInfo(`[BatchSummarizer] Uploaded merge input file ${inputFile.id} (${mergeChapters.length} chapters)`);

  const job = await openai.batches.create({
    input_file_id: inputFile.id,
    endpoint: "/v1/chat/completions",
    completion_window: "24h"
  });
  logInfo(`[BatchSummarizer] Created merge batch ${job.id} \u2014 status: ${job.status}`);

  return { batchId: job.id, inputFileId: inputFile.id, metadata };
};
|
|
220
|
+
/**
 * Download the merge-pass batch output and parse it into structured summaries.
 *
 * Each non-blank output line is parsed as JSON on its own: a blank or
 * unparseable line is logged and skipped, so an empty output file or a single
 * bad line cannot abort the whole download. A result is used only when its
 * `custom_id` has the exact form `summary-<digits>` and that index belongs to
 * one of `mergeChapters`; this avoids an empty or foreign id being coerced to
 * chapter 0.
 *
 * @param {string} outputFileId - Id of the merge batch's output file.
 * @param {Array<{chapterIndex: number, title: string}>} mergeChapters
 *   Chapters that were submitted for merging.
 * @returns {Promise<Array<object>>} Summaries from parseStructuredSummary, in output-file order.
 */
var downloadMergeResults = async (outputFileId, mergeChapters) => {
  const client = new OpenAI();
  logInfo(`[BatchSummarizer] Downloading merge results from ${outputFileId}`);
  const response = await client.files.content(outputFileId);
  const text = await response.text();

  // Index chapters once instead of scanning the list for every result line.
  // The first entry wins on duplicates, matching a first-match lookup.
  const chaptersByIndex = new Map();
  for (const chapter of mergeChapters) {
    if (!chaptersByIndex.has(chapter.chapterIndex)) {
      chaptersByIndex.set(chapter.chapterIndex, chapter);
    }
  }

  const summaries = [];
  // Tolerate CRLF line endings and blank lines (including a fully empty file).
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line === "") continue;

    let result;
    try {
      result = JSON.parse(line);
    } catch (error) {
      logWarn(`[BatchSummarizer] Skipping unparseable merge result line: ${error instanceof Error ? error.message : String(error)}`);
      continue;
    }

    if (result?.response?.status_code !== 200) {
      logWarn(`[BatchSummarizer] Merge request ${result?.custom_id} failed: ${JSON.stringify(result?.response?.body?.error ?? result?.error)}`);
      continue;
    }

    const content = result.response.body?.choices?.[0]?.message?.content;
    if (!content) continue;

    const idMatch = /^summary-(\d+)$/.exec(String(result.custom_id));
    if (!idMatch) {
      logWarn(`[BatchSummarizer] Ignoring merge result with unexpected id ${result.custom_id}`);
      continue;
    }
    const idx = Number(idMatch[1]);
    const meta = chaptersByIndex.get(idx);
    if (meta) {
      const summary = parseStructuredSummary(content, idx, meta.title);
      if (summary) summaries.push(summary);
    }
  }

  logInfo(`[BatchSummarizer] Parsed ${summaries.length} merged summaries`);
  return summaries;
};
|
|
246
|
+
|
|
247
|
+
export {
|
|
248
|
+
submitBatchSummaries,
|
|
249
|
+
downloadBatchSummaryResults,
|
|
250
|
+
submitMergePass,
|
|
251
|
+
downloadMergeResults
|
|
252
|
+
};
|
|
253
|
+
//# sourceMappingURL=chunk-7DUQNGEK.js.map
|