@robin7331/papyrus-cli 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -1
- package/dist/cli.js +74 -11
- package/dist/cliHelpers.d.ts +3 -0
- package/dist/cliHelpers.js +6 -0
- package/dist/openaiPdfToMarkdown.d.ts +8 -0
- package/dist/openaiPdfToMarkdown.js +74 -19
- package/package.json +1 -2
- package/src/cli.ts +93 -12
- package/src/cliHelpers.ts +9 -0
- package/src/openaiPdfToMarkdown.ts +93 -23
- package/test/cliHelpers.test.ts +11 -0
package/README.md
CHANGED
|
@@ -27,6 +27,9 @@ papyrus --help
|
|
|
27
27
|
# Show installed CLI version
|
|
28
28
|
papyrus --version
|
|
29
29
|
|
|
30
|
+
# List available models for the current API key
|
|
31
|
+
papyrus --models
|
|
32
|
+
|
|
30
33
|
# Single file (default behavior; if no API key is found, Papyrus prompts you to paste one)
|
|
31
34
|
papyrus ./path/to/input.pdf
|
|
32
35
|
|
|
@@ -88,9 +91,10 @@ papyrus config clear
|
|
|
88
91
|
|
|
89
92
|
## Arguments Reference
|
|
90
93
|
|
|
91
|
-
### `<input>`
|
|
94
|
+
### `[input]`
|
|
92
95
|
|
|
93
96
|
Path to a single PDF file or a folder containing PDFs (processed recursively).
|
|
97
|
+
Required unless you use `--models`.
|
|
94
98
|
|
|
95
99
|
Example:
|
|
96
100
|
|
|
@@ -165,6 +169,7 @@ papyrus ./docs/invoice.pdf --prompt-file ./my-prompt.txt
|
|
|
165
169
|
### `-m, --model <model>`
|
|
166
170
|
|
|
167
171
|
OpenAI model name used for conversion. Default is `gpt-4o-mini`.
|
|
172
|
+
If the selected model is not available, Papyrus prints the available model IDs before exiting.
|
|
168
173
|
|
|
169
174
|
Example:
|
|
170
175
|
|
|
@@ -172,6 +177,16 @@ Example:
|
|
|
172
177
|
papyrus ./docs/invoice.pdf --model gpt-4.1-mini
|
|
173
178
|
```
|
|
174
179
|
|
|
180
|
+
### `--models`
|
|
181
|
+
|
|
182
|
+
Lists the available OpenAI model IDs for the current API key and exits.
|
|
183
|
+
|
|
184
|
+
Example:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
papyrus --models
|
|
188
|
+
```
|
|
189
|
+
|
|
175
190
|
### `--concurrency <n>`
|
|
176
191
|
|
|
177
192
|
Maximum parallel workers for folder input. Must be an integer between `1` and `100`. Default is `10`.
|
package/dist/cli.js
CHANGED
|
@@ -5,8 +5,8 @@ import { mkdir, readFile, readdir, stat, writeFile } from "node:fs/promises";
|
|
|
5
5
|
import { dirname, join, relative, resolve } from "node:path";
|
|
6
6
|
import { Command } from "commander";
|
|
7
7
|
import { clearStoredApiKey, getConfigFilePath, getStoredApiKey, maskApiKey, setStoredApiKey } from "./config.js";
|
|
8
|
-
import { convertPdf } from "./openaiPdfToMarkdown.js";
|
|
9
|
-
import { defaultOutputPath, formatDurationMs, isPdfPath, looksLikeFileOutput, parseConcurrency, parseFormat, resolveFolderOutputPath, truncate, validateOptionCombination } from "./cliHelpers.js";
|
|
8
|
+
import { assertModelAvailable, convertPdf, listAvailableModels, UnknownModelError } from "./openaiPdfToMarkdown.js";
|
|
9
|
+
import { defaultOutputPath, formatDurationMs, getSpinnerFrame, isPdfPath, looksLikeFileOutput, parseConcurrency, parseFormat, resolveFolderOutputPath, truncate, validateOptionCombination } from "./cliHelpers.js";
|
|
10
10
|
const program = new Command();
|
|
11
11
|
const configFilePath = getConfigFilePath();
|
|
12
12
|
const OPENAI_API_KEYS_URL = "https://platform.openai.com/settings/organization/api-keys";
|
|
@@ -15,9 +15,10 @@ program
|
|
|
15
15
|
.name("papyrus")
|
|
16
16
|
.version(cliVersion, "-v, --version", "display version number")
|
|
17
17
|
.description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
|
|
18
|
-
.argument("<input>", "Path to input PDF file or folder")
|
|
18
|
+
.argument("[input]", "Path to input PDF file or folder")
|
|
19
19
|
.option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
|
|
20
20
|
.option("-m, --model <model>", "OpenAI model to use", "gpt-4o-mini")
|
|
21
|
+
.option("--models", "List available OpenAI models for the current API key and exit")
|
|
21
22
|
.option("--concurrency <n>", "Max parallel workers for folder input (default: 10)", parseConcurrency)
|
|
22
23
|
.option("-y, --yes", "Skip confirmation prompt in folder mode")
|
|
23
24
|
.option("--format <format>", "Output file extension override (for example: md, txt, csv, json)", parseFormat)
|
|
@@ -25,13 +26,26 @@ program
|
|
|
25
26
|
.option("--prompt <text>", "Custom prompt text (enables prompt mode)")
|
|
26
27
|
.option("--prompt-file <path>", "Path to file containing prompt text (enables prompt mode)")
|
|
27
28
|
.action(async (input, options) => {
|
|
28
|
-
const inputPath = resolve(input);
|
|
29
29
|
const startedAt = Date.now();
|
|
30
30
|
try {
|
|
31
|
+
if (options.models) {
|
|
32
|
+
await ensureApiKey();
|
|
33
|
+
printAvailableModels(await listAvailableModels());
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
if (!input) {
|
|
37
|
+
throw new Error('Input path is required unless "--models" is used.');
|
|
38
|
+
}
|
|
39
|
+
const inputPath = resolve(input);
|
|
31
40
|
validateOptionCombination(options);
|
|
32
41
|
const promptText = await resolvePromptText(options);
|
|
33
42
|
const conversionMode = resolveConversionMode(promptText);
|
|
34
43
|
const inputKind = await detectInputKind(inputPath);
|
|
44
|
+
if (inputKind === "file" && !isPdfPath(inputPath)) {
|
|
45
|
+
throw new Error("Input file must have a .pdf extension.");
|
|
46
|
+
}
|
|
47
|
+
await ensureApiKey();
|
|
48
|
+
await assertModelAvailable(options.model);
|
|
35
49
|
let usageTotals = emptyUsage();
|
|
36
50
|
if (inputKind === "file") {
|
|
37
51
|
usageTotals = await processSingleFile(inputPath, options, conversionMode, promptText);
|
|
@@ -47,6 +61,9 @@ program
|
|
|
47
61
|
console.log(`Duration: ${((Date.now() - startedAt) / 1000).toFixed(2)}s`);
|
|
48
62
|
}
|
|
49
63
|
catch (error) {
|
|
64
|
+
if (error instanceof UnknownModelError) {
|
|
65
|
+
printAvailableModels(error.availableModels);
|
|
66
|
+
}
|
|
50
67
|
const message = error instanceof Error ? error.message : String(error);
|
|
51
68
|
console.error(`Conversion failed: ${message}`);
|
|
52
69
|
console.error(`Duration: ${((Date.now() - startedAt) / 1000).toFixed(2)}s`);
|
|
@@ -113,10 +130,6 @@ program.parseAsync(process.argv).catch((error) => {
|
|
|
113
130
|
process.exitCode = 1;
|
|
114
131
|
});
|
|
115
132
|
async function processSingleFile(inputPath, options, mode, promptText) {
|
|
116
|
-
if (!isPdfPath(inputPath)) {
|
|
117
|
-
throw new Error("Input file must have a .pdf extension.");
|
|
118
|
-
}
|
|
119
|
-
await ensureApiKey();
|
|
120
133
|
const startedAt = Date.now();
|
|
121
134
|
const displayInput = relative(process.cwd(), inputPath) || inputPath;
|
|
122
135
|
const workerDashboard = process.stdout.isTTY
|
|
@@ -179,7 +192,6 @@ async function processFolder(inputDir, options, mode, promptText) {
|
|
|
179
192
|
console.log("Cancelled. No files were processed.");
|
|
180
193
|
return { total: files.length, succeeded: 0, failed: 0, cancelled: true, usage: emptyUsage() };
|
|
181
194
|
}
|
|
182
|
-
await ensureApiKey();
|
|
183
195
|
const outputRoot = options.output ? resolve(options.output) : undefined;
|
|
184
196
|
let succeeded = 0;
|
|
185
197
|
let failed = 0;
|
|
@@ -434,17 +446,20 @@ async function runWithConcurrency(items, concurrency, worker) {
|
|
|
434
446
|
await Promise.all(workers);
|
|
435
447
|
}
|
|
436
448
|
class AsciiWorkerDashboard {
|
|
449
|
+
static spinnerIntervalMs = 80;
|
|
437
450
|
lanes;
|
|
438
451
|
total;
|
|
439
452
|
workerCount;
|
|
440
453
|
completed = 0;
|
|
441
454
|
failed = 0;
|
|
442
455
|
renderedLineCount = 0;
|
|
456
|
+
spinnerTimer;
|
|
443
457
|
constructor(total, workerCount) {
|
|
444
458
|
this.total = total;
|
|
445
459
|
this.workerCount = workerCount;
|
|
446
460
|
this.lanes = Array.from({ length: workerCount }, () => ({
|
|
447
|
-
state: "idle"
|
|
461
|
+
state: "idle",
|
|
462
|
+
spinnerFrame: 0
|
|
448
463
|
}));
|
|
449
464
|
process.stdout.write("\x1b[?25l");
|
|
450
465
|
this.render();
|
|
@@ -462,6 +477,8 @@ class AsciiWorkerDashboard {
|
|
|
462
477
|
lane.state = "running";
|
|
463
478
|
lane.file = file;
|
|
464
479
|
lane.message = "processing...";
|
|
480
|
+
lane.spinnerFrame = 0;
|
|
481
|
+
this.syncSpinnerTimer();
|
|
465
482
|
this.render();
|
|
466
483
|
}
|
|
467
484
|
setWorkerDone(workerId, file, message) {
|
|
@@ -472,6 +489,8 @@ class AsciiWorkerDashboard {
|
|
|
472
489
|
lane.state = "done";
|
|
473
490
|
lane.file = file;
|
|
474
491
|
lane.message = message;
|
|
492
|
+
lane.spinnerFrame = 0;
|
|
493
|
+
this.syncSpinnerTimer();
|
|
475
494
|
this.render();
|
|
476
495
|
}
|
|
477
496
|
setWorkerFailed(workerId, file, message) {
|
|
@@ -482,9 +501,12 @@ class AsciiWorkerDashboard {
|
|
|
482
501
|
lane.state = "failed";
|
|
483
502
|
lane.file = file;
|
|
484
503
|
lane.message = message;
|
|
504
|
+
lane.spinnerFrame = 0;
|
|
505
|
+
this.syncSpinnerTimer();
|
|
485
506
|
this.render();
|
|
486
507
|
}
|
|
487
508
|
stop() {
|
|
509
|
+
this.clearSpinnerTimer();
|
|
488
510
|
this.render();
|
|
489
511
|
process.stdout.write("\x1b[?25h");
|
|
490
512
|
}
|
|
@@ -515,7 +537,7 @@ class AsciiWorkerDashboard {
|
|
|
515
537
|
}
|
|
516
538
|
renderIcon(lane) {
|
|
517
539
|
if (lane.state === "running") {
|
|
518
|
-
return
|
|
540
|
+
return `${getSpinnerFrame(lane.spinnerFrame)} `;
|
|
519
541
|
}
|
|
520
542
|
if (lane.state === "done") {
|
|
521
543
|
return "OK";
|
|
@@ -525,6 +547,41 @@ class AsciiWorkerDashboard {
|
|
|
525
547
|
}
|
|
526
548
|
return "..";
|
|
527
549
|
}
|
|
550
|
+
syncSpinnerTimer() {
|
|
551
|
+
if (this.lanes.some((lane) => lane.state === "running")) {
|
|
552
|
+
this.ensureSpinnerTimer();
|
|
553
|
+
return;
|
|
554
|
+
}
|
|
555
|
+
this.clearSpinnerTimer();
|
|
556
|
+
}
|
|
557
|
+
ensureSpinnerTimer() {
|
|
558
|
+
if (this.spinnerTimer) {
|
|
559
|
+
return;
|
|
560
|
+
}
|
|
561
|
+
this.spinnerTimer = setInterval(() => {
|
|
562
|
+
let hasRunningLane = false;
|
|
563
|
+
for (const lane of this.lanes) {
|
|
564
|
+
if (lane.state !== "running") {
|
|
565
|
+
continue;
|
|
566
|
+
}
|
|
567
|
+
lane.spinnerFrame += 1;
|
|
568
|
+
hasRunningLane = true;
|
|
569
|
+
}
|
|
570
|
+
if (!hasRunningLane) {
|
|
571
|
+
this.clearSpinnerTimer();
|
|
572
|
+
return;
|
|
573
|
+
}
|
|
574
|
+
this.render();
|
|
575
|
+
}, AsciiWorkerDashboard.spinnerIntervalMs);
|
|
576
|
+
this.spinnerTimer.unref?.();
|
|
577
|
+
}
|
|
578
|
+
clearSpinnerTimer() {
|
|
579
|
+
if (!this.spinnerTimer) {
|
|
580
|
+
return;
|
|
581
|
+
}
|
|
582
|
+
clearInterval(this.spinnerTimer);
|
|
583
|
+
this.spinnerTimer = undefined;
|
|
584
|
+
}
|
|
528
585
|
}
|
|
529
586
|
async function confirmFolderProcessing(totalFiles, concurrency, skipPrompt) {
|
|
530
587
|
if (skipPrompt) {
|
|
@@ -569,6 +626,12 @@ function mergeUsage(target, delta) {
|
|
|
569
626
|
function printUsageTotals(usage) {
|
|
570
627
|
console.log(`Token usage: input=${usage.inputTokens}, output=${usage.outputTokens}, total=${usage.totalTokens}, requests=${usage.requests}`);
|
|
571
628
|
}
|
|
629
|
+
function printAvailableModels(models) {
|
|
630
|
+
console.log(`Available models (${models.length}):`);
|
|
631
|
+
for (const model of models) {
|
|
632
|
+
console.log(model);
|
|
633
|
+
}
|
|
634
|
+
}
|
|
572
635
|
function getCliVersion() {
|
|
573
636
|
try {
|
|
574
637
|
const packageJsonPath = new URL("../package.json", import.meta.url);
|
package/dist/cliHelpers.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export type CliOptions = {
|
|
2
2
|
output?: string;
|
|
3
3
|
model: string;
|
|
4
|
+
models?: boolean;
|
|
4
5
|
concurrency?: number;
|
|
5
6
|
yes?: boolean;
|
|
6
7
|
format?: string;
|
|
@@ -16,4 +17,6 @@ export declare function resolveFolderOutputPath(inputPath: string, inputRoot: st
|
|
|
16
17
|
export declare function isPdfPath(inputPath: string): boolean;
|
|
17
18
|
export declare function looksLikeFileOutput(outputPath: string): boolean;
|
|
18
19
|
export declare function truncate(value: string, maxLength: number): string;
|
|
20
|
+
export declare const ASCII_SPINNER_FRAMES: string[];
|
|
21
|
+
export declare function getSpinnerFrame(frameIndex: number): string;
|
|
19
22
|
export declare function formatDurationMs(durationMs: number): string;
|
package/dist/cliHelpers.js
CHANGED
|
@@ -63,6 +63,12 @@ export function truncate(value, maxLength) {
|
|
|
63
63
|
}
|
|
64
64
|
return `${value.slice(0, maxLength - 3)}...`;
|
|
65
65
|
}
|
|
66
|
+
export const ASCII_SPINNER_FRAMES = ["|", "/", "-", "\\"];
|
|
67
|
+
export function getSpinnerFrame(frameIndex) {
|
|
68
|
+
const normalizedIndex = ((Math.trunc(frameIndex) % ASCII_SPINNER_FRAMES.length) + ASCII_SPINNER_FRAMES.length)
|
|
69
|
+
% ASCII_SPINNER_FRAMES.length;
|
|
70
|
+
return ASCII_SPINNER_FRAMES[normalizedIndex] ?? ASCII_SPINNER_FRAMES[0];
|
|
71
|
+
}
|
|
66
72
|
export function formatDurationMs(durationMs) {
|
|
67
73
|
return `${(durationMs / 1000).toFixed(2)}s`;
|
|
68
74
|
}
|
package/dist/openaiPdfToMarkdown.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
1
2
|
export type ConvertOptions = {
|
|
2
3
|
inputPath: string;
|
|
3
4
|
model: string;
|
|
@@ -19,4 +20,11 @@ export type ConvertUsage = {
|
|
|
19
20
|
outputTokens: number;
|
|
20
21
|
totalTokens: number;
|
|
21
22
|
};
|
|
23
|
+
export declare class UnknownModelError extends Error {
|
|
24
|
+
readonly model: string;
|
|
25
|
+
readonly availableModels: string[];
|
|
26
|
+
constructor(model: string, availableModels: string[]);
|
|
27
|
+
}
|
|
22
28
|
export declare function convertPdf(options: ConvertOptions): Promise<ConvertResult>;
|
|
29
|
+
export declare function assertModelAvailable(model: string): Promise<void>;
|
|
30
|
+
export declare function listAvailableModels(client?: OpenAI): Promise<string[]>;
|
package/dist/openaiPdfToMarkdown.js
CHANGED
|
@@ -4,6 +4,16 @@ import { resolve } from "node:path";
|
|
|
4
4
|
import { Agent, run } from "@openai/agents";
|
|
5
5
|
import OpenAI from "openai";
|
|
6
6
|
import { z } from "zod";
|
|
7
|
+
export class UnknownModelError extends Error {
|
|
8
|
+
model;
|
|
9
|
+
availableModels;
|
|
10
|
+
constructor(model, availableModels) {
|
|
11
|
+
super(`Model "${model}" is not available for this API key.`);
|
|
12
|
+
this.name = "UnknownModelError";
|
|
13
|
+
this.model = model;
|
|
14
|
+
this.availableModels = availableModels;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
7
17
|
const AUTO_RESPONSE_SCHEMA = z.object({
|
|
8
18
|
format: z.enum(["md", "txt"]),
|
|
9
19
|
content: z.string().min(1)
|
|
@@ -14,11 +24,7 @@ const RATE_LIMIT_MAX_DELAY_MS = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_MAX_DELA
|
|
|
14
24
|
export async function convertPdf(options) {
|
|
15
25
|
const inputPath = resolve(options.inputPath);
|
|
16
26
|
await access(inputPath);
|
|
17
|
-
const apiKey = process.env.OPENAI_API_KEY;
|
|
18
|
-
if (!apiKey) {
|
|
19
|
-
throw new Error("OPENAI_API_KEY is not set.");
|
|
20
|
-
}
|
|
21
|
-
const client = new OpenAI({ apiKey });
|
|
27
|
+
const client = createOpenAiClient();
|
|
22
28
|
const uploaded = await withRateLimitRetry("file upload", () => client.files.create({
|
|
23
29
|
file: createReadStream(inputPath),
|
|
24
30
|
purpose: "user_data"
|
|
@@ -29,21 +35,30 @@ export async function convertPdf(options) {
|
|
|
29
35
|
model: options.model
|
|
30
36
|
});
|
|
31
37
|
const promptText = buildPromptText(options);
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
38
|
+
let result;
|
|
39
|
+
try {
|
|
40
|
+
result = await withRateLimitRetry("model run", () => run(agent, [
|
|
41
|
+
{
|
|
42
|
+
role: "user",
|
|
43
|
+
content: [
|
|
44
|
+
{
|
|
45
|
+
type: "input_text",
|
|
46
|
+
text: promptText
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
type: "input_file",
|
|
50
|
+
file: { id: uploaded.id }
|
|
51
|
+
}
|
|
52
|
+
]
|
|
53
|
+
}
|
|
54
|
+
]));
|
|
55
|
+
}
|
|
56
|
+
catch (error) {
|
|
57
|
+
if (isUnknownModelError(error, options.model)) {
|
|
58
|
+
throw new UnknownModelError(options.model, await listAvailableModels(client));
|
|
45
59
|
}
|
|
46
|
-
|
|
60
|
+
throw error;
|
|
61
|
+
}
|
|
47
62
|
const rawOutput = (result.finalOutput ?? "").trim();
|
|
48
63
|
if (!rawOutput) {
|
|
49
64
|
throw new Error("No content returned by the API.");
|
|
@@ -59,6 +74,32 @@ export async function convertPdf(options) {
|
|
|
59
74
|
}
|
|
60
75
|
return { format: "txt", content: rawOutput, usage };
|
|
61
76
|
}
|
|
77
|
+
export async function assertModelAvailable(model) {
|
|
78
|
+
const client = createOpenAiClient();
|
|
79
|
+
try {
|
|
80
|
+
await client.models.retrieve(model);
|
|
81
|
+
}
|
|
82
|
+
catch (error) {
|
|
83
|
+
if (!isUnknownModelError(error, model)) {
|
|
84
|
+
throw error;
|
|
85
|
+
}
|
|
86
|
+
throw new UnknownModelError(model, await listAvailableModels(client));
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
export async function listAvailableModels(client = createOpenAiClient()) {
|
|
90
|
+
const modelIds = [];
|
|
91
|
+
for await (const model of client.models.list()) {
|
|
92
|
+
modelIds.push(model.id);
|
|
93
|
+
}
|
|
94
|
+
return modelIds.sort((left, right) => left.localeCompare(right, "en"));
|
|
95
|
+
}
|
|
96
|
+
function createOpenAiClient() {
|
|
97
|
+
const apiKey = process.env.OPENAI_API_KEY;
|
|
98
|
+
if (!apiKey) {
|
|
99
|
+
throw new Error("OPENAI_API_KEY is not set.");
|
|
100
|
+
}
|
|
101
|
+
return new OpenAI({ apiKey });
|
|
102
|
+
}
|
|
62
103
|
function buildPromptText(options) {
|
|
63
104
|
const outputExtensionHint = normalizeExtensionHint(options.outputExtensionHint);
|
|
64
105
|
if (options.mode === "prompt") {
|
|
@@ -111,6 +152,20 @@ function normalizeExtensionHint(extension) {
|
|
|
111
152
|
const normalized = extension.trim().replace(/^\.+/, "");
|
|
112
153
|
return normalized || undefined;
|
|
113
154
|
}
|
|
155
|
+
function isUnknownModelError(error, model) {
|
|
156
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
157
|
+
const normalizedMessage = message.toLowerCase();
|
|
158
|
+
const errorStatus = typeof error === "object" && error !== null && "status" in error ? error.status : undefined;
|
|
159
|
+
const errorCode = typeof error === "object" && error !== null && "code" in error ? error.code : undefined;
|
|
160
|
+
const quotedModel = model.toLowerCase();
|
|
161
|
+
if (errorStatus === 404 || errorCode === "model_not_found") {
|
|
162
|
+
return true;
|
|
163
|
+
}
|
|
164
|
+
return (normalizedMessage.includes(quotedModel) &&
|
|
165
|
+
(normalizedMessage.includes("does not exist") ||
|
|
166
|
+
normalizedMessage.includes("not found") ||
|
|
167
|
+
normalizedMessage.includes("unknown model")));
|
|
168
|
+
}
|
|
114
169
|
function parseAutoResponse(rawOutput) {
|
|
115
170
|
let candidate = rawOutput.trim();
|
|
116
171
|
const fencedMatch = candidate.match(/```(?:json)?\s*([\s\S]*?)```/i);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@robin7331/papyrus-cli",
|
|
3
|
-
"version": "0.1.9",
|
|
3
|
+
"version": "0.1.11",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Convert PDF to markdown or text with the OpenAI Agents SDK",
|
|
6
6
|
"repository": {
|
|
@@ -37,7 +37,6 @@
|
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@openai/agents": "^0.5.3",
|
|
40
|
-
"@robin7331/papyrus-cli": "^0.1.4",
|
|
41
40
|
"commander": "^14.0.0",
|
|
42
41
|
"dotenv": "^17.3.1",
|
|
43
42
|
"openai": "^6.7.0",
|
package/src/cli.ts
CHANGED
|
@@ -13,13 +13,17 @@ import {
|
|
|
13
13
|
setStoredApiKey
|
|
14
14
|
} from "./config.js";
|
|
15
15
|
import {
|
|
16
|
+
assertModelAvailable,
|
|
16
17
|
convertPdf,
|
|
17
18
|
type ConversionMode,
|
|
18
|
-
type ConvertUsage
|
|
19
|
+
type ConvertUsage,
|
|
20
|
+
listAvailableModels,
|
|
21
|
+
UnknownModelError
|
|
19
22
|
} from "./openaiPdfToMarkdown.js";
|
|
20
23
|
import {
|
|
21
24
|
defaultOutputPath,
|
|
22
25
|
formatDurationMs,
|
|
26
|
+
getSpinnerFrame,
|
|
23
27
|
isPdfPath,
|
|
24
28
|
looksLikeFileOutput,
|
|
25
29
|
parseConcurrency,
|
|
@@ -43,9 +47,10 @@ program
|
|
|
43
47
|
.name("papyrus")
|
|
44
48
|
.version(cliVersion, "-v, --version", "display version number")
|
|
45
49
|
.description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
|
|
46
|
-
.argument("<input>", "Path to input PDF file or folder")
|
|
50
|
+
.argument("[input]", "Path to input PDF file or folder")
|
|
47
51
|
.option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
|
|
48
52
|
.option("-m, --model <model>", "OpenAI model to use", "gpt-4o-mini")
|
|
53
|
+
.option("--models", "List available OpenAI models for the current API key and exit")
|
|
49
54
|
.option(
|
|
50
55
|
"--concurrency <n>",
|
|
51
56
|
"Max parallel workers for folder input (default: 10)",
|
|
@@ -59,16 +64,32 @@ program
|
|
|
59
64
|
)
|
|
60
65
|
.option("--prompt <text>", "Custom prompt text (enables prompt mode)")
|
|
61
66
|
.option("--prompt-file <path>", "Path to file containing prompt text (enables prompt mode)")
|
|
62
|
-
.action(async (input: string, options: CliOptions) => {
|
|
63
|
-
const inputPath = resolve(input);
|
|
67
|
+
.action(async (input: string | undefined, options: CliOptions) => {
|
|
64
68
|
const startedAt = Date.now();
|
|
65
69
|
|
|
66
70
|
try {
|
|
71
|
+
if (options.models) {
|
|
72
|
+
await ensureApiKey();
|
|
73
|
+
printAvailableModels(await listAvailableModels());
|
|
74
|
+
return;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
if (!input) {
|
|
78
|
+
throw new Error('Input path is required unless "--models" is used.');
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const inputPath = resolve(input);
|
|
67
82
|
validateOptionCombination(options);
|
|
68
83
|
|
|
69
84
|
const promptText = await resolvePromptText(options);
|
|
70
85
|
const conversionMode = resolveConversionMode(promptText);
|
|
71
86
|
const inputKind = await detectInputKind(inputPath);
|
|
87
|
+
if (inputKind === "file" && !isPdfPath(inputPath)) {
|
|
88
|
+
throw new Error("Input file must have a .pdf extension.");
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
await ensureApiKey();
|
|
92
|
+
await assertModelAvailable(options.model);
|
|
72
93
|
let usageTotals: ConvertUsage = emptyUsage();
|
|
73
94
|
|
|
74
95
|
if (inputKind === "file") {
|
|
@@ -84,6 +105,10 @@ program
|
|
|
84
105
|
printUsageTotals(usageTotals);
|
|
85
106
|
console.log(`Duration: ${((Date.now() - startedAt) / 1000).toFixed(2)}s`);
|
|
86
107
|
} catch (error) {
|
|
108
|
+
if (error instanceof UnknownModelError) {
|
|
109
|
+
printAvailableModels(error.availableModels);
|
|
110
|
+
}
|
|
111
|
+
|
|
87
112
|
const message = error instanceof Error ? error.message : String(error);
|
|
88
113
|
console.error(`Conversion failed: ${message}`);
|
|
89
114
|
console.error(`Duration: ${((Date.now() - startedAt) / 1000).toFixed(2)}s`);
|
|
@@ -160,11 +185,6 @@ async function processSingleFile(
|
|
|
160
185
|
mode: ConversionMode,
|
|
161
186
|
promptText?: string
|
|
162
187
|
): Promise<ConvertUsage> {
|
|
163
|
-
if (!isPdfPath(inputPath)) {
|
|
164
|
-
throw new Error("Input file must have a .pdf extension.");
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
await ensureApiKey();
|
|
168
188
|
const startedAt = Date.now();
|
|
169
189
|
const displayInput = relative(process.cwd(), inputPath) || inputPath;
|
|
170
190
|
const workerDashboard = process.stdout.isTTY
|
|
@@ -258,7 +278,6 @@ async function processFolder(
|
|
|
258
278
|
return { total: files.length, succeeded: 0, failed: 0, cancelled: true, usage: emptyUsage() };
|
|
259
279
|
}
|
|
260
280
|
|
|
261
|
-
await ensureApiKey();
|
|
262
281
|
const outputRoot = options.output ? resolve(options.output) : undefined;
|
|
263
282
|
let succeeded = 0;
|
|
264
283
|
let failed = 0;
|
|
@@ -587,21 +606,25 @@ type WorkerLane = {
|
|
|
587
606
|
state: "idle" | "running" | "done" | "failed";
|
|
588
607
|
file?: string;
|
|
589
608
|
message?: string;
|
|
609
|
+
spinnerFrame: number;
|
|
590
610
|
};
|
|
591
611
|
|
|
592
612
|
class AsciiWorkerDashboard {
|
|
613
|
+
private static readonly spinnerIntervalMs = 80;
|
|
593
614
|
private readonly lanes: WorkerLane[];
|
|
594
615
|
private readonly total: number;
|
|
595
616
|
private readonly workerCount: number;
|
|
596
617
|
private completed = 0;
|
|
597
618
|
private failed = 0;
|
|
598
619
|
private renderedLineCount = 0;
|
|
620
|
+
private spinnerTimer?: NodeJS.Timeout;
|
|
599
621
|
|
|
600
622
|
constructor(total: number, workerCount: number) {
|
|
601
623
|
this.total = total;
|
|
602
624
|
this.workerCount = workerCount;
|
|
603
625
|
this.lanes = Array.from({ length: workerCount }, () => ({
|
|
604
|
-
state: "idle"
|
|
626
|
+
state: "idle",
|
|
627
|
+
spinnerFrame: 0
|
|
605
628
|
}));
|
|
606
629
|
|
|
607
630
|
process.stdout.write("\x1b[?25l");
|
|
@@ -623,6 +646,8 @@ class AsciiWorkerDashboard {
|
|
|
623
646
|
lane.state = "running";
|
|
624
647
|
lane.file = file;
|
|
625
648
|
lane.message = "processing...";
|
|
649
|
+
lane.spinnerFrame = 0;
|
|
650
|
+
this.syncSpinnerTimer();
|
|
626
651
|
this.render();
|
|
627
652
|
}
|
|
628
653
|
|
|
@@ -635,6 +660,8 @@ class AsciiWorkerDashboard {
|
|
|
635
660
|
lane.state = "done";
|
|
636
661
|
lane.file = file;
|
|
637
662
|
lane.message = message;
|
|
663
|
+
lane.spinnerFrame = 0;
|
|
664
|
+
this.syncSpinnerTimer();
|
|
638
665
|
this.render();
|
|
639
666
|
}
|
|
640
667
|
|
|
@@ -647,10 +674,13 @@ class AsciiWorkerDashboard {
|
|
|
647
674
|
lane.state = "failed";
|
|
648
675
|
lane.file = file;
|
|
649
676
|
lane.message = message;
|
|
677
|
+
lane.spinnerFrame = 0;
|
|
678
|
+
this.syncSpinnerTimer();
|
|
650
679
|
this.render();
|
|
651
680
|
}
|
|
652
681
|
|
|
653
682
|
stop(): void {
|
|
683
|
+
this.clearSpinnerTimer();
|
|
654
684
|
this.render();
|
|
655
685
|
process.stdout.write("\x1b[?25h");
|
|
656
686
|
}
|
|
@@ -688,7 +718,7 @@ class AsciiWorkerDashboard {
|
|
|
688
718
|
|
|
689
719
|
private renderIcon(lane: WorkerLane): string {
|
|
690
720
|
if (lane.state === "running") {
|
|
691
|
-
return
|
|
721
|
+
return `${getSpinnerFrame(lane.spinnerFrame)} `;
|
|
692
722
|
}
|
|
693
723
|
|
|
694
724
|
if (lane.state === "done") {
|
|
@@ -701,6 +731,50 @@ class AsciiWorkerDashboard {
|
|
|
701
731
|
|
|
702
732
|
return "..";
|
|
703
733
|
}
|
|
734
|
+
|
|
735
|
+
private syncSpinnerTimer(): void {
|
|
736
|
+
if (this.lanes.some((lane) => lane.state === "running")) {
|
|
737
|
+
this.ensureSpinnerTimer();
|
|
738
|
+
return;
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
this.clearSpinnerTimer();
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
private ensureSpinnerTimer(): void {
|
|
745
|
+
if (this.spinnerTimer) {
|
|
746
|
+
return;
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
this.spinnerTimer = setInterval(() => {
|
|
750
|
+
let hasRunningLane = false;
|
|
751
|
+
for (const lane of this.lanes) {
|
|
752
|
+
if (lane.state !== "running") {
|
|
753
|
+
continue;
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
lane.spinnerFrame += 1;
|
|
757
|
+
hasRunningLane = true;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
if (!hasRunningLane) {
|
|
761
|
+
this.clearSpinnerTimer();
|
|
762
|
+
return;
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
this.render();
|
|
766
|
+
}, AsciiWorkerDashboard.spinnerIntervalMs);
|
|
767
|
+
this.spinnerTimer.unref?.();
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
private clearSpinnerTimer(): void {
|
|
771
|
+
if (!this.spinnerTimer) {
|
|
772
|
+
return;
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
clearInterval(this.spinnerTimer);
|
|
776
|
+
this.spinnerTimer = undefined;
|
|
777
|
+
}
|
|
704
778
|
}
|
|
705
779
|
|
|
706
780
|
async function confirmFolderProcessing(
|
|
@@ -765,6 +839,13 @@ function printUsageTotals(usage: ConvertUsage): void {
|
|
|
765
839
|
);
|
|
766
840
|
}
|
|
767
841
|
|
|
842
|
+
function printAvailableModels(models: string[]): void {
|
|
843
|
+
console.log(`Available models (${models.length}):`);
|
|
844
|
+
for (const model of models) {
|
|
845
|
+
console.log(model);
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
|
|
768
849
|
function getCliVersion(): string {
|
|
769
850
|
try {
|
|
770
851
|
const packageJsonPath = new URL("../package.json", import.meta.url);
|
package/src/cliHelpers.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { basename, dirname, extname, join, relative } from "node:path";
|
|
|
4
4
|
export type CliOptions = {
|
|
5
5
|
output?: string;
|
|
6
6
|
model: string;
|
|
7
|
+
models?: boolean;
|
|
7
8
|
concurrency?: number;
|
|
8
9
|
yes?: boolean;
|
|
9
10
|
format?: string;
|
|
@@ -99,6 +100,14 @@ export function truncate(value: string, maxLength: number): string {
|
|
|
99
100
|
return `${value.slice(0, maxLength - 3)}...`;
|
|
100
101
|
}
|
|
101
102
|
|
|
103
|
+
export const ASCII_SPINNER_FRAMES = ["|", "/", "-", "\\"];
|
|
104
|
+
|
|
105
|
+
export function getSpinnerFrame(frameIndex: number): string {
|
|
106
|
+
const normalizedIndex = ((Math.trunc(frameIndex) % ASCII_SPINNER_FRAMES.length) + ASCII_SPINNER_FRAMES.length)
|
|
107
|
+
% ASCII_SPINNER_FRAMES.length;
|
|
108
|
+
return ASCII_SPINNER_FRAMES[normalizedIndex] ?? ASCII_SPINNER_FRAMES[0];
|
|
109
|
+
}
|
|
110
|
+
|
|
102
111
|
export function formatDurationMs(durationMs: number): string {
|
|
103
112
|
return `${(durationMs / 1000).toFixed(2)}s`;
|
|
104
113
|
}
|
package/src/openaiPdfToMarkdown.ts
CHANGED
|
@@ -30,6 +30,18 @@ export type ConvertUsage = {
|
|
|
30
30
|
totalTokens: number;
|
|
31
31
|
};
|
|
32
32
|
|
|
33
|
+
export class UnknownModelError extends Error {
|
|
34
|
+
readonly model: string;
|
|
35
|
+
readonly availableModels: string[];
|
|
36
|
+
|
|
37
|
+
constructor(model: string, availableModels: string[]) {
|
|
38
|
+
super(`Model "${model}" is not available for this API key.`);
|
|
39
|
+
this.name = "UnknownModelError";
|
|
40
|
+
this.model = model;
|
|
41
|
+
this.availableModels = availableModels;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
33
45
|
const AUTO_RESPONSE_SCHEMA = z.object({
|
|
34
46
|
format: z.enum(["md", "txt"]),
|
|
35
47
|
content: z.string().min(1)
|
|
@@ -43,12 +55,7 @@ export async function convertPdf(options: ConvertOptions): Promise<ConvertResult
|
|
|
43
55
|
const inputPath = resolve(options.inputPath);
|
|
44
56
|
await access(inputPath);
|
|
45
57
|
|
|
46
|
-
const apiKey = process.env.OPENAI_API_KEY;
|
|
47
|
-
if (!apiKey) {
|
|
48
|
-
throw new Error("OPENAI_API_KEY is not set.");
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
const client = new OpenAI({ apiKey });
|
|
58
|
+
const client = createOpenAiClient();
|
|
52
59
|
|
|
53
60
|
const uploaded = await withRateLimitRetry("file upload", () =>
|
|
54
61
|
client.files.create({
|
|
@@ -64,23 +71,32 @@ export async function convertPdf(options: ConvertOptions): Promise<ConvertResult
|
|
|
64
71
|
});
|
|
65
72
|
|
|
66
73
|
const promptText = buildPromptText(options);
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
74
|
+
let result;
|
|
75
|
+
try {
|
|
76
|
+
result = await withRateLimitRetry("model run", () =>
|
|
77
|
+
run(agent, [
|
|
78
|
+
{
|
|
79
|
+
role: "user",
|
|
80
|
+
content: [
|
|
81
|
+
{
|
|
82
|
+
type: "input_text",
|
|
83
|
+
text: promptText
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
type: "input_file",
|
|
87
|
+
file: { id: uploaded.id }
|
|
88
|
+
}
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
])
|
|
92
|
+
);
|
|
93
|
+
} catch (error) {
|
|
94
|
+
if (isUnknownModelError(error, options.model)) {
|
|
95
|
+
throw new UnknownModelError(options.model, await listAvailableModels(client));
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
throw error;
|
|
99
|
+
}
|
|
84
100
|
|
|
85
101
|
const rawOutput = (result.finalOutput ?? "").trim();
|
|
86
102
|
if (!rawOutput) {
|
|
@@ -101,6 +117,39 @@ export async function convertPdf(options: ConvertOptions): Promise<ConvertResult
|
|
|
101
117
|
return { format: "txt", content: rawOutput, usage };
|
|
102
118
|
}
|
|
103
119
|
|
|
120
|
+
export async function assertModelAvailable(model: string): Promise<void> {
|
|
121
|
+
const client = createOpenAiClient();
|
|
122
|
+
|
|
123
|
+
try {
|
|
124
|
+
await client.models.retrieve(model);
|
|
125
|
+
} catch (error) {
|
|
126
|
+
if (!isUnknownModelError(error, model)) {
|
|
127
|
+
throw error;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
throw new UnknownModelError(model, await listAvailableModels(client));
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
export async function listAvailableModels(client = createOpenAiClient()): Promise<string[]> {
|
|
135
|
+
const modelIds: string[] = [];
|
|
136
|
+
|
|
137
|
+
for await (const model of client.models.list()) {
|
|
138
|
+
modelIds.push(model.id);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
return modelIds.sort((left, right) => left.localeCompare(right, "en"));
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
function createOpenAiClient(): OpenAI {
|
|
145
|
+
const apiKey = process.env.OPENAI_API_KEY;
|
|
146
|
+
if (!apiKey) {
|
|
147
|
+
throw new Error("OPENAI_API_KEY is not set.");
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
return new OpenAI({ apiKey });
|
|
151
|
+
}
|
|
152
|
+
|
|
104
153
|
function buildPromptText(options: ConvertOptions): string {
|
|
105
154
|
const outputExtensionHint = normalizeExtensionHint(options.outputExtensionHint);
|
|
106
155
|
if (options.mode === "prompt") {
|
|
@@ -171,6 +220,27 @@ function normalizeExtensionHint(extension: string | undefined): string | undefin
|
|
|
171
220
|
return normalized || undefined;
|
|
172
221
|
}
|
|
173
222
|
|
|
223
|
+
function isUnknownModelError(error: unknown, model: string): boolean {
|
|
224
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
225
|
+
const normalizedMessage = message.toLowerCase();
|
|
226
|
+
const errorStatus =
|
|
227
|
+
typeof error === "object" && error !== null && "status" in error ? error.status : undefined;
|
|
228
|
+
const errorCode =
|
|
229
|
+
typeof error === "object" && error !== null && "code" in error ? error.code : undefined;
|
|
230
|
+
const quotedModel = model.toLowerCase();
|
|
231
|
+
|
|
232
|
+
if (errorStatus === 404 || errorCode === "model_not_found") {
|
|
233
|
+
return true;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
return (
|
|
237
|
+
normalizedMessage.includes(quotedModel) &&
|
|
238
|
+
(normalizedMessage.includes("does not exist") ||
|
|
239
|
+
normalizedMessage.includes("not found") ||
|
|
240
|
+
normalizedMessage.includes("unknown model"))
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
|
|
174
244
|
function parseAutoResponse(rawOutput: string): Omit<ConvertResult, "usage"> {
|
|
175
245
|
let candidate = rawOutput.trim();
|
|
176
246
|
|
package/test/cliHelpers.test.ts
CHANGED
|
@@ -2,8 +2,10 @@ import assert from "node:assert/strict";
|
|
|
2
2
|
import test from "node:test";
|
|
3
3
|
import { InvalidArgumentError } from "commander";
|
|
4
4
|
import {
|
|
5
|
+
ASCII_SPINNER_FRAMES,
|
|
5
6
|
defaultOutputPath,
|
|
6
7
|
formatDurationMs,
|
|
8
|
+
getSpinnerFrame,
|
|
7
9
|
isPdfPath,
|
|
8
10
|
looksLikeFileOutput,
|
|
9
11
|
parseConcurrency,
|
|
@@ -130,6 +132,15 @@ test("truncate shortens long values and preserves short ones", () => {
|
|
|
130
132
|
assert.equal(truncate("abcdefghij", 8), "abcde...");
|
|
131
133
|
});
|
|
132
134
|
|
|
135
|
+
test("getSpinnerFrame cycles through the configured ASCII frames", () => {
|
|
136
|
+
assert.deepEqual(
|
|
137
|
+
ASCII_SPINNER_FRAMES.map((_, index) => getSpinnerFrame(index)),
|
|
138
|
+
ASCII_SPINNER_FRAMES
|
|
139
|
+
);
|
|
140
|
+
assert.equal(getSpinnerFrame(ASCII_SPINNER_FRAMES.length), ASCII_SPINNER_FRAMES[0]);
|
|
141
|
+
assert.equal(getSpinnerFrame(-1), ASCII_SPINNER_FRAMES.at(-1));
|
|
142
|
+
});
|
|
143
|
+
|
|
133
144
|
test("formatDurationMs formats to seconds with two decimals", () => {
|
|
134
145
|
assert.equal(formatDurationMs(0), "0.00s");
|
|
135
146
|
assert.equal(formatDurationMs(1543), "1.54s");
|