@robin7331/papyrus-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -0
- package/README.md +234 -0
- package/assets/.gitkeep +0 -0
- package/assets/header.jpeg +0 -0
- package/assets/header.png +0 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +366 -0
- package/dist/cliHelpers.d.ts +22 -0
- package/dist/cliHelpers.js +75 -0
- package/dist/openaiPdfToMarkdown.d.ts +22 -0
- package/dist/openaiPdfToMarkdown.js +144 -0
- package/package.json +33 -0
- package/src/cli.ts +507 -0
- package/src/cliHelpers.ts +116 -0
- package/src/openaiPdfToMarkdown.ts +203 -0
- package/test/cliHelpers.test.ts +136 -0
- package/tsconfig.json +17 -0
package/src/cli.ts
ADDED
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import "dotenv/config";
|
|
4
|
+
import { mkdir, readFile, readdir, stat, writeFile } from "node:fs/promises";
|
|
5
|
+
import { dirname, join, relative, resolve } from "node:path";
|
|
6
|
+
import { Command } from "commander";
|
|
7
|
+
import {
|
|
8
|
+
convertPdf,
|
|
9
|
+
type ConvertUsage
|
|
10
|
+
} from "./openaiPdfToMarkdown.js";
|
|
11
|
+
import {
|
|
12
|
+
defaultOutputPath,
|
|
13
|
+
formatDurationMs,
|
|
14
|
+
isPdfPath,
|
|
15
|
+
looksLikeFileOutput,
|
|
16
|
+
parseConcurrency,
|
|
17
|
+
parseFormat,
|
|
18
|
+
parseMode,
|
|
19
|
+
resolveFolderOutputPath,
|
|
20
|
+
truncate,
|
|
21
|
+
type CliOptions,
|
|
22
|
+
validateOptionCombination
|
|
23
|
+
} from "./cliHelpers.js";
|
|
24
|
+
|
|
25
|
+
/**
 * Command-line definition for the `papyrus` executable.
 *
 * The single positional argument is resolved to an absolute path and handed to
 * either the single-file or the folder pipeline depending on what it points
 * at. Token usage and elapsed time are always printed; any failure sets a
 * non-zero exit code instead of throwing out of the action handler.
 */
const program = new Command();

program
  .name("papyrus")
  .description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
  .argument("<input>", "Path to input PDF file or folder")
  .option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
  .option("-m, --model <model>", "OpenAI model to use", "gpt-4o-mini")
  .option(
    "--concurrency <n>",
    "Max parallel workers for folder input (default: 10)",
    parseConcurrency
  )
  .option("-y, --yes", "Skip confirmation prompt in folder mode")
  .option("--mode <mode>", "Conversion mode: auto or prompt", parseMode, "auto")
  .option("--format <format>", "Output format override: md or txt", parseFormat)
  .option(
    "--instructions <text>",
    "Additional conversion instructions for auto mode"
  )
  .option("--prompt <text>", "Custom prompt text for prompt mode")
  .option("--prompt-file <path>", "Path to file containing prompt text for prompt mode")
  .action(async (input: string, options: CliOptions) => {
    const inputPath = resolve(input);
    const startedAt = Date.now();

    try {
      validateOptionCombination(options);

      const promptText = await resolvePromptText(options);
      const inputKind = await detectInputKind(inputPath);
      let usageTotals: ConvertUsage = emptyUsage();

      if (inputKind === "file") {
        usageTotals = await processSingleFile(inputPath, options, promptText);
      } else {
        const summary = await processFolder(inputPath, options, promptText);
        usageTotals = summary.usage;
        // A user-cancelled run is not an error; a run with failed files is.
        if (!summary.cancelled && summary.failed > 0) {
          process.exitCode = 1;
        }
      }

      printUsageTotals(usageTotals);
      // formatDurationMs renders "<seconds with two decimals>s", the same
      // text that was previously assembled inline here.
      console.log(`Duration: ${formatDurationMs(Date.now() - startedAt)}`);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Conversion failed: ${message}`);
      console.error(`Duration: ${formatDurationMs(Date.now() - startedAt)}`);
      process.exitCode = 1;
    }
  });

// The action handler catches its own errors, but parseAsync can still reject
// (for example if something throws while reporting). Attach a handler so the
// promise is never left floating and the exit code reflects the failure.
program.parseAsync(process.argv).catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Unexpected error: ${message}`);
  process.exitCode = 1;
});
|
|
79
|
+
|
|
80
|
+
/**
 * Converts one PDF and writes the converted content next to it, or to
 * `options.output` when that is given.
 *
 * @param inputPath Path of the PDF to convert.
 * @param options Parsed CLI options (model, mode, format, instructions, output).
 * @param promptText Custom prompt text forwarded to the converter, if any.
 * @returns The usage figures reported by the conversion.
 * @throws Error when `inputPath` does not have a `.pdf` extension.
 */
async function processSingleFile(
  inputPath: string,
  options: CliOptions,
  promptText?: string
): Promise<ConvertUsage> {
  if (!isPdfPath(inputPath)) {
    throw new Error("Input file must have a .pdf extension.");
  }

  const { model, mode, format, instructions, output } = options;
  const converted = await convertPdf({
    inputPath,
    model,
    mode,
    format,
    instructions,
    promptText
  });

  // The default path is only derived when no explicit --output was given.
  const destination = resolve(output ?? defaultOutputPath(inputPath, converted.format));
  const destinationDir = dirname(destination);

  await mkdir(destinationDir, { recursive: true });
  await writeFile(destination, converted.content, "utf8");
  console.log(`Output (${converted.format}) written to: ${destination}`);

  return converted.usage;
}
|
|
104
|
+
|
|
105
|
+
/** Outcome of one folder-mode run, as returned by `processFolder`. */
type FolderSummary = {
  /** Number of PDF files discovered under the input directory. */
  total: number;
  /** Files converted and written without error. */
  succeeded: number;
  /** Files whose conversion or write threw. */
  failed: number;
  /** True when the user declined the confirmation prompt. */
  cancelled: boolean;
  /** Usage figures summed over the successful conversions. */
  usage: ConvertUsage;
};
|
|
112
|
+
|
|
113
|
+
/**
 * Converts every PDF found under `inputDir`, several at a time.
 *
 * Flow: reject a file-like `--output`, collect the PDFs, ask for confirmation
 * (unless `--yes`), then run the conversions through `runWithConcurrency`.
 * On a TTY progress is drawn by `AsciiWorkerDashboard`; otherwise one line is
 * logged per finished file. A per-file failure is recorded and does not stop
 * the remaining files.
 *
 * @param inputDir Directory that is searched for PDF files.
 * @param options Parsed CLI options.
 * @param promptText Custom prompt text forwarded to the converter, if any.
 * @returns Counts, the cancelled flag and the summed usage for the run.
 * @throws Error when `--output` looks like a file path or no PDF is found.
 */
async function processFolder(
  inputDir: string,
  options: CliOptions,
  promptText?: string
): Promise<FolderSummary> {
  const { output, model, mode, format, instructions } = options;

  if (output && looksLikeFileOutput(output)) {
    throw new Error(
      "In folder mode, --output must be a directory path (not a .md/.txt file path)."
    );
  }

  const pdfFiles = await collectPdfFiles(inputDir);
  const total = pdfFiles.length;
  if (total === 0) {
    throw new Error(`No PDF files found in directory: ${inputDir}`);
  }

  const concurrency = options.concurrency ?? 10;
  const confirmed = await confirmFolderProcessing(total, concurrency, Boolean(options.yes));
  if (!confirmed) {
    console.log("Cancelled. No files were processed.");
    return { total, succeeded: 0, failed: 0, cancelled: true, usage: emptyUsage() };
  }

  const outputRoot = output ? resolve(output) : undefined;
  const usage = emptyUsage();
  const failures: Array<{ file: string; message: string }> = [];
  let succeeded = 0;
  let failed = 0;
  let completed = 0;

  console.log(`Found ${total} PDF file(s). Using concurrency: ${concurrency}`);

  // The live dashboard is only used when stdout is a terminal.
  const dashboard = process.stdout.isTTY
    ? new AsciiWorkerDashboard(total, Math.min(concurrency, total))
    : null;
  dashboard?.setSummary(completed, failed);

  // Converts a single file and reports the outcome on the given worker lane.
  const convertOne = async (filePath: string, _index: number, workerId: number): Promise<void> => {
    const displayName = relative(inputDir, filePath);
    const startedAt = Date.now();
    const elapsed = (): string => formatDurationMs(Date.now() - startedAt);

    dashboard?.setWorkerRunning(workerId, displayName);

    try {
      const converted = await convertPdf({
        inputPath: filePath,
        model,
        mode,
        format,
        instructions,
        promptText
      });

      const destination = resolveFolderOutputPath(filePath, inputDir, outputRoot, converted.format);
      await mkdir(dirname(destination), { recursive: true });
      await writeFile(destination, converted.content, "utf8");
      succeeded += 1;
      mergeUsage(usage, converted.usage);

      if (dashboard) {
        dashboard.setWorkerDone(workerId, displayName, `${converted.format} in ${elapsed()}`);
      } else {
        console.log(
          `[worker-${workerId + 1}] Done ${displayName} -> ${destination} (${converted.format}, ${elapsed()})`
        );
      }
    } catch (error) {
      failed += 1;
      const message = error instanceof Error ? error.message : String(error);
      failures.push({ file: displayName, message });

      if (dashboard) {
        dashboard.setWorkerFailed(workerId, displayName, `${truncate(message, 42)} (${elapsed()})`);
      } else {
        console.error(`[worker-${workerId + 1}] Failed ${displayName}: ${message} (${elapsed()})`);
      }
    } finally {
      completed += 1;
      dashboard?.setSummary(completed, failed);
    }
  };

  try {
    await runWithConcurrency(pdfFiles, concurrency, convertOne);
  } finally {
    // Always stop the dashboard so its timer is cleared and the cursor restored.
    dashboard?.stop();
  }

  console.log(
    `Summary: total=${total}, succeeded=${succeeded}, failed=${failed}`
  );
  if (failures.length > 0) {
    console.error("Failures:");
    for (const { file, message } of failures) {
      console.error(`- ${file}: ${message}`);
    }
  }

  return { total, succeeded, failed, cancelled: false, usage };
}
|
|
223
|
+
|
|
224
|
+
/**
 * Determines the custom prompt text for prompt mode.
 *
 * Outside prompt mode the result is always `undefined`. In prompt mode an
 * inline `--prompt` wins over `--prompt-file`; either source is trimmed and
 * must be non-empty.
 *
 * @param options Parsed CLI options.
 * @returns The trimmed prompt, or `undefined` when none applies.
 * @throws Error when the chosen prompt source contains only whitespace.
 */
async function resolvePromptText(options: CliOptions): Promise<string | undefined> {
  const { mode, prompt, promptFile } = options;

  if (mode !== "prompt") {
    return undefined;
  }

  if (prompt) {
    const inlinePrompt = prompt.trim();
    if (inlinePrompt.length === 0) {
      throw new Error("--prompt cannot be empty.");
    }
    return inlinePrompt;
  }

  if (promptFile) {
    const fileContents = await readFile(resolve(promptFile), "utf8");
    const filePrompt = fileContents.trim();
    if (filePrompt.length === 0) {
      throw new Error("--prompt-file must contain non-empty text.");
    }
    return filePrompt;
  }

  return undefined;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Classifies the input path as a regular file or a directory.
 *
 * @param inputPath Path to inspect with `stat`.
 * @returns `"file"` or `"directory"`.
 * @throws Error when the path is neither; errors from `stat` itself (for
 * example a missing path) propagate unchanged.
 */
async function detectInputKind(inputPath: string): Promise<"file" | "directory"> {
  const info = await stat(inputPath);
  const kind = info.isFile() ? "file" : info.isDirectory() ? "directory" : undefined;

  if (kind === undefined) {
    throw new Error("Input path must be a PDF file or directory.");
  }

  return kind;
}
|
|
263
|
+
|
|
264
|
+
/**
 * Gathers the paths of all PDF files below `rootDir`.
 *
 * @param rootDir Directory where the recursive walk starts.
 * @returns The paths in the order `walkDirectory` visits them.
 */
async function collectPdfFiles(rootDir: string): Promise<string[]> {
  const pdfPaths: string[] = [];
  // walkDirectory appends to the array it is given.
  await walkDirectory(rootDir, pdfPaths);
  return pdfPaths;
}
|
|
269
|
+
|
|
270
|
+
/**
 * Recursively visits `currentDir` and appends every PDF file path to
 * `collected`.
 *
 * Entries are visited in case-insensitive name order ("en" locale), symbolic
 * links are skipped entirely, and sub-directories are walked depth-first at
 * their position in that order.
 *
 * @param currentDir Directory to read.
 * @param collected Output array; matching paths are pushed onto it.
 */
async function walkDirectory(currentDir: string, collected: string[]): Promise<void> {
  const byName = (left: { name: string }, right: { name: string }): number =>
    left.name.localeCompare(right.name, "en", { sensitivity: "base" });

  const dirents = await readdir(currentDir, { withFileTypes: true });
  dirents.sort(byName);

  for (const dirent of dirents) {
    if (dirent.isSymbolicLink()) {
      continue;
    }

    const entryPath = join(currentDir, dirent.name);

    if (dirent.isDirectory()) {
      await walkDirectory(entryPath, collected);
    } else if (dirent.isFile() && isPdfPath(dirent.name)) {
      collected.push(entryPath);
    }
  }
}
|
|
290
|
+
|
|
291
|
+
/**
 * Runs `worker` once for every item, with at most `concurrency` invocations
 * in flight at a time.
 *
 * A fixed set of lanes is started; each lane repeatedly claims the next
 * unprocessed index until none is left. The lane number is passed to the
 * worker as `workerId` (0-based) so callers can attribute progress to it.
 *
 * @param items Items to process; processed in index order of claiming.
 * @param concurrency Desired number of parallel lanes. Values below 1 or NaN
 * are treated as 1 so that items are never silently skipped.
 * @param worker Callback invoked with the item, its index and the lane id.
 * @returns A promise that resolves once every lane has drained the queue and
 * rejects with the first worker error.
 */
async function runWithConcurrency<T>(
  items: T[],
  concurrency: number,
  worker: (item: T, index: number, workerId: number) => Promise<void>
): Promise<void> {
  if (items.length === 0) {
    return;
  }

  // Previously a concurrency of 0, a negative number or NaN produced zero
  // lanes, so the call resolved without touching a single item.
  const requested = Number.isNaN(concurrency) ? 1 : Math.max(1, Math.floor(concurrency));
  const laneCount = Math.min(requested, items.length);
  let nextIndex = 0;

  const runLane = async (workerId: number): Promise<void> => {
    // Claiming an index is synchronous, so two lanes never get the same one.
    while (nextIndex < items.length) {
      const currentIndex = nextIndex;
      nextIndex += 1;
      await worker(items[currentIndex], currentIndex, workerId);
    }
  };

  await Promise.all(Array.from({ length: laneCount }, (_, workerId) => runLane(workerId)));
}
|
|
314
|
+
|
|
315
|
+
/** Glyphs cycled through, in order, to animate a running worker lane. */
const SPINNER_FRAMES = "-\\|/".split("");
|
|
316
|
+
|
|
317
|
+
/** Display state of one worker row in the progress dashboard. */
type WorkerLane = {
  /** What the lane is currently showing. */
  state: "idle" | "running" | "done" | "failed";
  /** File most recently assigned to the lane, if any. */
  file?: string;
  /** Status text rendered after the file name, if any. */
  message?: string;
  /** Index into the spinner frames used while the lane is running. */
  spinnerFrame: number;
};
|
|
323
|
+
|
|
324
|
+
/**
 * In-place terminal dashboard showing one progress header plus one row per
 * worker lane.
 *
 * The constructor hides the cursor, draws the first frame and starts a 100 ms
 * timer that advances the spinners; `stop()` clears the timer, draws a final
 * frame and shows the cursor again. Each redraw moves the cursor up by the
 * number of lines written last time and overwrites them.
 *
 * NOTE(review): the cursor is only restored by `stop()`, so a process that is
 * terminated before `stop()` runs leaves the cursor hidden.
 */
class AsciiWorkerDashboard {
  private readonly lanes: WorkerLane[];
  private readonly total: number;
  private readonly workerCount: number;
  private readonly spinnerTimer: NodeJS.Timeout;
  private completed = 0;
  private failed = 0;
  private renderedLineCount = 0;

  /**
   * @param total Total number of files, shown in the progress header.
   * @param workerCount Number of lanes (rows) to display.
   */
  constructor(total: number, workerCount: number) {
    this.total = total;
    this.workerCount = workerCount;
    this.lanes = Array.from({ length: workerCount }, () => ({
      state: "idle",
      spinnerFrame: 0
    }));

    process.stdout.write("\x1b[?25l"); // hide the cursor while animating
    this.render();
    this.spinnerTimer = setInterval(() => {
      this.tickSpinners();
      this.render();
    }, 100);
  }

  /** Updates the header counters and redraws. */
  setSummary(completed: number, failed: number): void {
    this.completed = completed;
    this.failed = failed;
    this.render();
  }

  /** Marks a lane as working on `file`. Unknown lane ids are ignored. */
  setWorkerRunning(workerId: number, file: string): void {
    this.updateLane(workerId, "running", file, "processing");
  }

  /** Marks a lane as finished with `file`. Unknown lane ids are ignored. */
  setWorkerDone(workerId: number, file: string, message: string): void {
    this.updateLane(workerId, "done", file, message);
  }

  /** Marks a lane as failed on `file`. Unknown lane ids are ignored. */
  setWorkerFailed(workerId: number, file: string, message: string): void {
    this.updateLane(workerId, "failed", file, message);
  }

  /** Stops the animation, draws a final frame and shows the cursor again. */
  stop(): void {
    clearInterval(this.spinnerTimer);
    this.render();
    process.stdout.write("\x1b[?25h");
  }

  /** Shared implementation of the three lane setters. */
  private updateLane(
    workerId: number,
    state: WorkerLane["state"],
    file: string,
    message: string
  ): void {
    const lane = this.lanes[workerId];
    if (!lane) {
      return;
    }

    lane.state = state;
    lane.file = file;
    lane.message = message;
    this.render();
  }

  /** Overwrites the previously drawn frame with the current state. */
  private render(): void {
    const lines = this.composeLines();
    if (this.renderedLineCount > 0) {
      // Move the cursor to the start of the first line of the last frame.
      process.stdout.write(`\x1b[${this.renderedLineCount}F`);
    }

    // A line wider than the terminal wraps onto extra rows, which the
    // cursor-up above does not account for and which corrupts later frames.
    // Clip to one column less than the width so no row ever wraps.
    const columns = process.stdout.columns;
    const maxWidth = typeof columns === "number" && columns > 1 ? columns - 1 : undefined;

    for (const line of lines) {
      const visible = maxWidth === undefined ? line : truncate(line, maxWidth);
      process.stdout.write(`\x1b[2K${visible}\n`);
    }

    this.renderedLineCount = lines.length;
  }

  /** Builds the header line followed by one line per lane. */
  private composeLines(): string[] {
    const active = this.lanes.filter((lane) => lane.state === "running").length;
    const header = `Progress: ${this.completed}/${this.total} complete | active ${active}/${this.workerCount} | failed ${this.failed}`;

    const rows = this.lanes.map((lane, index) => {
      const label = `worker-${String(index + 1).padStart(2, "0")}`;
      const icon = this.renderIcon(lane);
      const file = truncate(lane.file ?? "idle", 64);
      const message = lane.message ? ` | ${lane.message}` : "";
      return `${icon} ${label} | ${file}${message}`;
    });

    return [header, ...rows];
  }

  /** Advances the spinner of every running lane by one frame. */
  private tickSpinners(): void {
    for (const lane of this.lanes) {
      if (lane.state === "running") {
        lane.spinnerFrame = (lane.spinnerFrame + 1) % SPINNER_FRAMES.length;
      }
    }
  }

  /** Returns the leading marker for a lane: spinner, "OK", "!!" or "..". */
  private renderIcon(lane: WorkerLane): string {
    switch (lane.state) {
      case "running":
        return SPINNER_FRAMES[lane.spinnerFrame];
      case "done":
        return "OK";
      case "failed":
        return "!!";
      default:
        return "..";
    }
  }
}
|
|
454
|
+
|
|
455
|
+
/**
 * Asks the user whether the folder run should start.
 *
 * @param totalFiles Number of PDFs that would be processed (shown in the prompt).
 * @param concurrency Worker count that would be used (shown in the prompt).
 * @param skipPrompt When true, answers "yes" without prompting.
 * @returns True for an empty answer, "y" or "yes" (case-insensitive); false otherwise.
 * @throws Error when a prompt is needed but stdin or stdout is not a TTY.
 */
async function confirmFolderProcessing(
  totalFiles: number,
  concurrency: number,
  skipPrompt: boolean
): Promise<boolean> {
  if (skipPrompt) {
    return true;
  }

  const interactive = process.stdin.isTTY && process.stdout.isTTY;
  if (!interactive) {
    throw new Error(
      "Folder mode requires an interactive terminal confirmation. Use --yes to skip the prompt."
    );
  }

  // Loaded lazily: readline is only needed when a prompt is actually shown.
  const readline = await import("node:readline/promises");
  const prompt = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  try {
    const reply = await prompt.question(
      `Process ${totalFiles} PDF file(s) with concurrency ${concurrency}? [Y/n] `
    );
    const normalized = reply.trim().toLowerCase();
    return ["", "y", "yes"].includes(normalized);
  } finally {
    prompt.close();
  }
}
|
|
486
|
+
|
|
487
|
+
/**
 * Creates a usage record with every counter at zero.
 *
 * @returns A fresh object on each call, safe to mutate.
 */
function emptyUsage(): ConvertUsage {
  const zeroed: ConvertUsage = { requests: 0, inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  return zeroed;
}
|
|
495
|
+
|
|
496
|
+
/**
 * Adds each counter of `delta` onto the matching counter of `target`.
 *
 * @param target Usage record that is mutated in place.
 * @param delta Usage record to add; left unchanged.
 */
function mergeUsage(target: ConvertUsage, delta: ConvertUsage): void {
  const counters = ["requests", "inputTokens", "outputTokens", "totalTokens"] as const;
  for (const counter of counters) {
    target[counter] += delta[counter];
  }
}
|
|
502
|
+
|
|
503
|
+
/**
 * Logs the usage counters as a single line on stdout.
 *
 * @param usage Usage record to report.
 */
function printUsageTotals(usage: ConvertUsage): void {
  const { inputTokens, outputTokens, totalTokens, requests } = usage;
  console.log(
    `Token usage: input=${inputTokens}, output=${outputTokens}, total=${totalTokens}, requests=${requests}`
  );
}
|
|
package/src/cliHelpers.ts
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { InvalidArgumentError } from "commander";
|
|
2
|
+
import { basename, dirname, extname, join, relative } from "node:path";
|
|
3
|
+
import { type ConversionMode, type OutputFormat } from "./openaiPdfToMarkdown.js";
|
|
4
|
+
|
|
5
|
+
/** Options object handed to the CLI action after argument parsing. */
export type CliOptions = {
  /** Output file (single input) or output directory (folder input). */
  output?: string;
  /** Model name passed to the converter. */
  model: string;
  /** Number of parallel workers in folder mode. */
  concurrency?: number;
  /** When set, the folder-mode confirmation prompt is skipped. */
  yes?: boolean;
  /** Conversion mode selected with `--mode`. */
  mode: ConversionMode;
  /** Output format override selected with `--format`. */
  format?: OutputFormat;
  /** Additional instructions; only accepted outside prompt mode. */
  instructions?: string;
  /** Inline prompt text; only accepted in prompt mode. */
  prompt?: string;
  /** Path of a file holding the prompt text; only accepted in prompt mode. */
  promptFile?: string;
};
|
|
16
|
+
|
|
17
|
+
/**
 * Option parser for `--mode`.
 *
 * @param value Raw command-line value.
 * @returns The value unchanged when it is `"auto"` or `"prompt"`.
 * @throws InvalidArgumentError for any other value.
 */
export function parseMode(value: string): ConversionMode {
  switch (value) {
    case "auto":
    case "prompt":
      return value;
    default:
      throw new InvalidArgumentError("Mode must be either 'auto' or 'prompt'.");
  }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Option parser for `--format`.
 *
 * @param value Raw command-line value.
 * @returns The value unchanged when it is `"md"` or `"txt"`.
 * @throws InvalidArgumentError for any other value.
 */
export function parseFormat(value: string): OutputFormat {
  switch (value) {
    case "md":
    case "txt":
      return value;
    default:
      throw new InvalidArgumentError("Format must be either 'md' or 'txt'.");
  }
}
|
|
32
|
+
|
|
33
|
+
/**
 * Option parser for `--concurrency`.
 *
 * @param value Raw command-line value, converted with `Number(...)`.
 * @returns The parsed integer.
 * @throws InvalidArgumentError unless the value is an integer from 1 to 100.
 */
export function parseConcurrency(value: string): number {
  const workers = Number(value);
  const withinRange = Number.isInteger(workers) && workers >= 1 && workers <= 100;

  if (!withinRange) {
    throw new InvalidArgumentError("Concurrency must be an integer between 1 and 100.");
  }

  return workers;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Rejects option combinations that contradict the selected mode.
 *
 * Prompt mode needs exactly one of `--prompt` / `--prompt-file` and forbids
 * `--instructions`. Any other mode forbids both prompt options.
 *
 * @param options Parsed CLI options.
 * @throws Error describing the first violated rule.
 */
export function validateOptionCombination(options: CliOptions): void {
  const hasPrompt = Boolean(options.prompt);
  const hasPromptFile = Boolean(options.promptFile);

  if (options.mode !== "prompt") {
    if (hasPrompt || hasPromptFile) {
      throw new Error("--prompt and --prompt-file are only supported in prompt mode.");
    }
    return;
  }

  // Equal flags mean either none or both sources were supplied.
  if (hasPrompt === hasPromptFile) {
    throw new Error("Prompt mode requires exactly one of --prompt or --prompt-file.");
  }

  if (options.instructions) {
    throw new Error("--instructions is only supported in auto mode.");
  }
}
|
|
60
|
+
|
|
61
|
+
/**
 * Derives the output path used when `--output` is not given.
 *
 * A trailing `.pdf` extension (any letter case) is swapped for the format's
 * extension; any other input simply gets the extension appended.
 *
 * @param inputPath Path of the source file.
 * @param format Output format; `"md"` maps to `.md`, anything else to `.txt`.
 * @returns The derived output path.
 */
export function defaultOutputPath(inputPath: string, format: OutputFormat): string {
  const inputExt = extname(inputPath);
  const outputExt = format === "md" ? ".md" : ".txt";

  const stem =
    inputExt.toLowerCase() === ".pdf"
      ? inputPath.slice(0, inputPath.length - inputExt.length)
      : inputPath;

  return `${stem}${outputExt}`;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Computes where the output for one file of a folder run is written.
 *
 * Without an output root the result is `defaultOutputPath(inputPath, format)`.
 * With one, the file's path relative to `inputRoot` is recreated below
 * `outputRoot` and its extension replaced by the format name.
 *
 * @param inputPath Path of the source file.
 * @param inputRoot Folder the run was started on.
 * @param outputRoot Target directory, or `undefined` to write beside the input.
 * @param format Output format, used verbatim as the new extension.
 * @returns The output file path.
 */
export function resolveFolderOutputPath(
  inputPath: string,
  inputRoot: string,
  outputRoot: string | undefined,
  format: OutputFormat
): string {
  if (!outputRoot) {
    return defaultOutputPath(inputPath, format);
  }

  const fromRoot = relative(inputRoot, inputPath);
  const subDir = dirname(fromRoot);
  const outputName = [basename(fromRoot, extname(fromRoot)), format].join(".");

  // Files directly inside the input root have "." as their relative directory.
  return subDir === "."
    ? join(outputRoot, outputName)
    : join(outputRoot, subDir, outputName);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Tells whether a path ends in a `.pdf` extension, ignoring letter case.
 *
 * @param inputPath Path or bare file name to check.
 */
export function isPdfPath(inputPath: string): boolean {
  const extension = extname(inputPath);
  return extension.toLowerCase() === ".pdf";
}
|
|
96
|
+
|
|
97
|
+
/**
 * Tells whether an output path carries a `.md` or `.txt` extension (any
 * letter case) and therefore reads like a file rather than a directory.
 *
 * @param outputPath Value passed to `--output`.
 */
export function looksLikeFileOutput(outputPath: string): boolean {
  const fileExtensions = [".md", ".txt"];
  return fileExtensions.includes(extname(outputPath).toLowerCase());
}
|
|
101
|
+
|
|
102
|
+
/**
 * Shortens `value` so that it is at most `maxLength` UTF-16 code units long.
 *
 * Strings that already fit are returned unchanged. Longer strings are cut and
 * end in `"..."`; when `maxLength` is 3 or less there is no room for the
 * ellipsis and the string is simply cut.
 *
 * @param value Text to shorten.
 * @param maxLength Maximum length of the result. Zero, negative and NaN
 * limits yield an empty string.
 * @returns The original or shortened text.
 */
export function truncate(value: string, maxLength: number): string {
  if (value.length <= maxLength) {
    return value;
  }

  // Negative limits used to reach slice(0, -n), which returns a string
  // longer than the limit; NaN used to produce a bare "...".
  if (!(maxLength > 0)) {
    return "";
  }

  const withEllipsis = maxLength > 3;
  let head = value.slice(0, withEllipsis ? maxLength - 3 : maxLength);

  // Do not leave half of a surrogate pair at the cut: a lone high surrogate
  // is not a valid character and renders as garbage.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  return withEllipsis ? `${head}...` : head;
}
|
|
113
|
+
|
|
114
|
+
/**
 * Formats a millisecond duration as seconds with two decimals and an `s`
 * suffix, for example `1234` becomes `"1.23s"`.
 *
 * @param durationMs Duration in milliseconds.
 */
export function formatDurationMs(durationMs: number): string {
  const seconds = durationMs / 1000;
  return seconds.toFixed(2) + "s";
}
|