@robin7331/papyrus-cli 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/cli.js +63 -13
- package/dist/openaiPdfToMarkdown.js +192 -4
- package/package.json +2 -1
- package/src/cli.ts +77 -13
- package/src/openaiPdfToMarkdown.ts +273 -19
package/README.md
CHANGED
|
@@ -24,6 +24,9 @@ papyrus --help
|
|
|
24
24
|
## Usage
|
|
25
25
|
|
|
26
26
|
```bash
|
|
27
|
+
# Show installed CLI version
|
|
28
|
+
papyrus --version
|
|
29
|
+
|
|
27
30
|
# Single file (auto mode; if no API key is found, Papyrus prompts you to paste one)
|
|
28
31
|
papyrus ./path/to/input.pdf
|
|
29
32
|
|
|
@@ -95,6 +98,16 @@ Example:
|
|
|
95
98
|
papyrus ./docs/invoice.pdf
|
|
96
99
|
```
|
|
97
100
|
|
|
101
|
+
### `-v, --version`
|
|
102
|
+
|
|
103
|
+
Print the installed Papyrus CLI version.
|
|
104
|
+
|
|
105
|
+
Example:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
papyrus --version
|
|
109
|
+
```
|
|
110
|
+
|
|
98
111
|
### `--format <format>`
|
|
99
112
|
|
|
100
113
|
Output format override:
|
|
@@ -194,8 +207,14 @@ papyrus ./docs --yes
|
|
|
194
207
|
## Notes
|
|
195
208
|
|
|
196
209
|
- In `auto` mode without `--format`, the model returns structured JSON with `format` + `content`.
|
|
210
|
+
- Single-file input now also shows a live worker lane (spinner in TTY) while conversion is running.
|
|
197
211
|
- Folder input is scanned recursively for `.pdf` files and processed in parallel.
|
|
198
212
|
- In folder mode, `--output` must be a directory path and mirrored subfolders are preserved.
|
|
213
|
+
- OpenAI rate-limit (`429`) responses are retried automatically using `Retry-After` (when present) plus exponential backoff.
|
|
214
|
+
- Rate-limit retry tuning is available via environment variables:
|
|
215
|
+
- `PAPYRUS_RATE_LIMIT_MAX_RETRIES` (default `8`)
|
|
216
|
+
- `PAPYRUS_RATE_LIMIT_BASE_DELAY_MS` (default `2000`)
|
|
217
|
+
- `PAPYRUS_RATE_LIMIT_MAX_DELAY_MS` (default `120000`)
|
|
199
218
|
- For scanned PDFs, output quality depends on OCR quality from the model.
|
|
200
219
|
|
|
201
220
|
## Development
|
package/dist/cli.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import "dotenv/config";
|
|
3
|
+
import { readFileSync } from "node:fs";
|
|
3
4
|
import { mkdir, readFile, readdir, stat, writeFile } from "node:fs/promises";
|
|
4
5
|
import { dirname, join, relative, resolve } from "node:path";
|
|
5
6
|
import { Command } from "commander";
|
|
@@ -9,8 +10,10 @@ import { defaultOutputPath, formatDurationMs, isPdfPath, looksLikeFileOutput, pa
|
|
|
9
10
|
const program = new Command();
|
|
10
11
|
const configFilePath = getConfigFilePath();
|
|
11
12
|
const OPENAI_API_KEYS_URL = "https://platform.openai.com/settings/organization/api-keys";
|
|
13
|
+
const cliVersion = getCliVersion();
|
|
12
14
|
program
|
|
13
15
|
.name("papyrus")
|
|
16
|
+
.version(cliVersion, "-v, --version", "display version number")
|
|
14
17
|
.description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
|
|
15
18
|
.argument("<input>", "Path to input PDF file or folder")
|
|
16
19
|
.option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
|
|
@@ -114,19 +117,52 @@ async function processSingleFile(inputPath, options, promptText) {
|
|
|
114
117
|
throw new Error("Input file must have a .pdf extension.");
|
|
115
118
|
}
|
|
116
119
|
await ensureApiKey();
|
|
117
|
-
const
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
120
|
+
const startedAt = Date.now();
|
|
121
|
+
const displayInput = relative(process.cwd(), inputPath) || inputPath;
|
|
122
|
+
const workerDashboard = process.stdout.isTTY
|
|
123
|
+
? new AsciiWorkerDashboard(1, 1)
|
|
124
|
+
: null;
|
|
125
|
+
workerDashboard?.setSummary(0, 0);
|
|
126
|
+
workerDashboard?.setWorkerRunning(0, displayInput);
|
|
127
|
+
if (!workerDashboard) {
|
|
128
|
+
console.log(`[worker-1] Running ${displayInput}`);
|
|
129
|
+
}
|
|
130
|
+
try {
|
|
131
|
+
const result = await convertPdf({
|
|
132
|
+
inputPath,
|
|
133
|
+
model: options.model,
|
|
134
|
+
mode: options.mode,
|
|
135
|
+
format: options.format,
|
|
136
|
+
instructions: options.instructions,
|
|
137
|
+
promptText
|
|
138
|
+
});
|
|
139
|
+
const outputPath = resolve(options.output ?? defaultOutputPath(inputPath, result.format));
|
|
140
|
+
await mkdir(dirname(outputPath), { recursive: true });
|
|
141
|
+
await writeFile(outputPath, result.content, "utf8");
|
|
142
|
+
if (workerDashboard) {
|
|
143
|
+
workerDashboard.setWorkerDone(0, displayInput, `${result.format} in ${formatDurationMs(Date.now() - startedAt)}`);
|
|
144
|
+
workerDashboard.setSummary(1, 0);
|
|
145
|
+
}
|
|
146
|
+
else {
|
|
147
|
+
console.log(`[worker-1] Done ${displayInput} -> ${outputPath} (${result.format}, ${formatDurationMs(Date.now() - startedAt)})`);
|
|
148
|
+
}
|
|
149
|
+
console.log(`Output (${result.format}) written to: ${outputPath}`);
|
|
150
|
+
return result.usage;
|
|
151
|
+
}
|
|
152
|
+
catch (error) {
|
|
153
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
154
|
+
if (workerDashboard) {
|
|
155
|
+
workerDashboard.setWorkerFailed(0, displayInput, `${truncate(message, 42)} (${formatDurationMs(Date.now() - startedAt)})`);
|
|
156
|
+
workerDashboard.setSummary(1, 1);
|
|
157
|
+
}
|
|
158
|
+
else {
|
|
159
|
+
console.error(`[worker-1] Failed ${displayInput}: ${message} (${formatDurationMs(Date.now() - startedAt)})`);
|
|
160
|
+
}
|
|
161
|
+
throw error;
|
|
162
|
+
}
|
|
163
|
+
finally {
|
|
164
|
+
workerDashboard?.stop();
|
|
165
|
+
}
|
|
130
166
|
}
|
|
131
167
|
async function processFolder(inputDir, options, promptText) {
|
|
132
168
|
if (options.output && looksLikeFileOutput(options.output)) {
|
|
@@ -547,3 +583,17 @@ function mergeUsage(target, delta) {
|
|
|
547
583
|
function printUsageTotals(usage) {
|
|
548
584
|
console.log(`Token usage: input=${usage.inputTokens}, output=${usage.outputTokens}, total=${usage.totalTokens}, requests=${usage.requests}`);
|
|
549
585
|
}
|
|
586
|
+
function getCliVersion() {
|
|
587
|
+
try {
|
|
588
|
+
const packageJsonPath = new URL("../package.json", import.meta.url);
|
|
589
|
+
const raw = readFileSync(packageJsonPath, "utf8");
|
|
590
|
+
const parsed = JSON.parse(raw);
|
|
591
|
+
if (typeof parsed.version === "string" && parsed.version.trim().length > 0) {
|
|
592
|
+
return parsed.version;
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
catch {
|
|
596
|
+
// ignore and use fallback
|
|
597
|
+
}
|
|
598
|
+
return "0.0.0";
|
|
599
|
+
}
|
package/dist/openaiPdfToMarkdown.js
CHANGED
|
@@ -8,6 +8,9 @@ const AUTO_RESPONSE_SCHEMA = z.object({
|
|
|
8
8
|
format: z.enum(["md", "txt"]),
|
|
9
9
|
content: z.string().min(1)
|
|
10
10
|
});
|
|
11
|
+
const RATE_LIMIT_MAX_RETRIES = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_MAX_RETRIES", 8);
|
|
12
|
+
const RATE_LIMIT_BASE_DELAY_MS = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_BASE_DELAY_MS", 2_000);
|
|
13
|
+
const RATE_LIMIT_MAX_DELAY_MS = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_MAX_DELAY_MS", 120_000);
|
|
11
14
|
export async function convertPdf(options) {
|
|
12
15
|
const inputPath = resolve(options.inputPath);
|
|
13
16
|
await access(inputPath);
|
|
@@ -16,17 +19,17 @@ export async function convertPdf(options) {
|
|
|
16
19
|
throw new Error("OPENAI_API_KEY is not set.");
|
|
17
20
|
}
|
|
18
21
|
const client = new OpenAI({ apiKey });
|
|
19
|
-
const uploaded = await client.files.create({
|
|
22
|
+
const uploaded = await withRateLimitRetry("file upload", () => client.files.create({
|
|
20
23
|
file: createReadStream(inputPath),
|
|
21
24
|
purpose: "user_data"
|
|
22
|
-
});
|
|
25
|
+
}));
|
|
23
26
|
const agent = new Agent({
|
|
24
27
|
name: "PDF Converter",
|
|
25
28
|
instructions: "You convert PDF files precisely according to the requested output format.",
|
|
26
29
|
model: options.model
|
|
27
30
|
});
|
|
28
31
|
const promptText = buildPromptText(options);
|
|
29
|
-
const result = await run(agent, [
|
|
32
|
+
const result = await withRateLimitRetry("model run", () => run(agent, [
|
|
30
33
|
{
|
|
31
34
|
role: "user",
|
|
32
35
|
content: [
|
|
@@ -40,7 +43,7 @@ export async function convertPdf(options) {
|
|
|
40
43
|
}
|
|
41
44
|
]
|
|
42
45
|
}
|
|
43
|
-
]);
|
|
46
|
+
]));
|
|
44
47
|
const rawOutput = (result.finalOutput ?? "").trim();
|
|
45
48
|
if (!rawOutput) {
|
|
46
49
|
throw new Error("No content returned by the API.");
|
|
@@ -142,3 +145,188 @@ function parseAutoResponse(rawOutput) {
|
|
|
142
145
|
}
|
|
143
146
|
return { format: validated.data.format, content };
|
|
144
147
|
}
|
|
148
|
+
async function withRateLimitRetry(operationName, operation) {
|
|
149
|
+
let attempt = 0;
|
|
150
|
+
while (true) {
|
|
151
|
+
try {
|
|
152
|
+
return await operation();
|
|
153
|
+
}
|
|
154
|
+
catch (error) {
|
|
155
|
+
if (!isRetriableRateLimitError(error) || attempt >= RATE_LIMIT_MAX_RETRIES) {
|
|
156
|
+
throw error;
|
|
157
|
+
}
|
|
158
|
+
const retryAfterMs = getRetryAfterMs(error);
|
|
159
|
+
const exponentialBackoffMs = RATE_LIMIT_BASE_DELAY_MS * (2 ** attempt);
|
|
160
|
+
const jitterMs = Math.floor(Math.random() * 750);
|
|
161
|
+
const computedDelayMs = retryAfterMs ?? (exponentialBackoffMs + jitterMs);
|
|
162
|
+
const waitMs = clampDelayMs(computedDelayMs, RATE_LIMIT_MAX_DELAY_MS);
|
|
163
|
+
const nextAttempt = attempt + 2;
|
|
164
|
+
const totalAttempts = RATE_LIMIT_MAX_RETRIES + 1;
|
|
165
|
+
const reason = extractErrorMessage(error);
|
|
166
|
+
console.warn(`[retry] ${operationName} hit OpenAI rate limits. Waiting ${formatDelay(waitMs)} before retry ${nextAttempt}/${totalAttempts}. ${reason}`);
|
|
167
|
+
await sleep(waitMs);
|
|
168
|
+
attempt += 1;
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
function isRetriableRateLimitError(error) {
|
|
173
|
+
if (typeof error !== "object" || error === null) {
|
|
174
|
+
return false;
|
|
175
|
+
}
|
|
176
|
+
const candidate = error;
|
|
177
|
+
if (candidate.status === 429) {
|
|
178
|
+
const code = typeof candidate.code === "string" ? candidate.code : undefined;
|
|
179
|
+
const nestedCode = typeof candidate.error?.code === "string" ? candidate.error.code : undefined;
|
|
180
|
+
if (code === "insufficient_quota" || nestedCode === "insufficient_quota") {
|
|
181
|
+
return false;
|
|
182
|
+
}
|
|
183
|
+
return true;
|
|
184
|
+
}
|
|
185
|
+
const searchableText = [
|
|
186
|
+
toLowerCaseIfString(candidate.code),
|
|
187
|
+
toLowerCaseIfString(candidate.type),
|
|
188
|
+
toLowerCaseIfString(candidate.error?.code),
|
|
189
|
+
toLowerCaseIfString(candidate.error?.type),
|
|
190
|
+
toLowerCaseIfString(candidate.message),
|
|
191
|
+
toLowerCaseIfString(candidate.error?.message)
|
|
192
|
+
]
|
|
193
|
+
.filter(Boolean)
|
|
194
|
+
.join(" ");
|
|
195
|
+
if (searchableText.includes("insufficient_quota")) {
|
|
196
|
+
return false;
|
|
197
|
+
}
|
|
198
|
+
return (searchableText.includes("rate_limit") ||
|
|
199
|
+
searchableText.includes("rate limit") ||
|
|
200
|
+
searchableText.includes("too many requests"));
|
|
201
|
+
}
|
|
202
|
+
function getRetryAfterMs(error) {
|
|
203
|
+
const headerDelay = getRetryAfterMsFromHeaders(error);
|
|
204
|
+
if (typeof headerDelay === "number" && Number.isFinite(headerDelay) && headerDelay >= 0) {
|
|
205
|
+
return headerDelay;
|
|
206
|
+
}
|
|
207
|
+
const textDelay = getRetryAfterMsFromText(extractErrorMessage(error));
|
|
208
|
+
if (typeof textDelay === "number" && Number.isFinite(textDelay) && textDelay >= 0) {
|
|
209
|
+
return textDelay;
|
|
210
|
+
}
|
|
211
|
+
return undefined;
|
|
212
|
+
}
|
|
213
|
+
function getRetryAfterMsFromHeaders(error) {
|
|
214
|
+
if (typeof error !== "object" || error === null) {
|
|
215
|
+
return undefined;
|
|
216
|
+
}
|
|
217
|
+
const candidate = error;
|
|
218
|
+
const retryAfterMsHeader = readHeader(candidate.headers, "retry-after-ms")
|
|
219
|
+
?? readHeader(candidate.response?.headers, "retry-after-ms");
|
|
220
|
+
if (retryAfterMsHeader) {
|
|
221
|
+
const milliseconds = Number.parseInt(retryAfterMsHeader, 10);
|
|
222
|
+
if (Number.isFinite(milliseconds) && milliseconds >= 0) {
|
|
223
|
+
return milliseconds;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
const retryAfterHeader = readHeader(candidate.headers, "retry-after")
|
|
227
|
+
?? readHeader(candidate.response?.headers, "retry-after");
|
|
228
|
+
if (!retryAfterHeader) {
|
|
229
|
+
return undefined;
|
|
230
|
+
}
|
|
231
|
+
const seconds = Number.parseFloat(retryAfterHeader);
|
|
232
|
+
if (Number.isFinite(seconds)) {
|
|
233
|
+
return Math.max(0, Math.round(seconds * 1_000));
|
|
234
|
+
}
|
|
235
|
+
const parsedDate = Date.parse(retryAfterHeader);
|
|
236
|
+
if (Number.isFinite(parsedDate)) {
|
|
237
|
+
return Math.max(0, parsedDate - Date.now());
|
|
238
|
+
}
|
|
239
|
+
return undefined;
|
|
240
|
+
}
|
|
241
|
+
function getRetryAfterMsFromText(message) {
|
|
242
|
+
const match = message.match(/(?:try again in|retry after)\s*([0-9]+(?:\.[0-9]+)?)\s*(ms|msec|millisecond|milliseconds|s|sec|second|seconds|m|min|minute|minutes)?/i);
|
|
243
|
+
if (!match) {
|
|
244
|
+
return undefined;
|
|
245
|
+
}
|
|
246
|
+
const rawValue = Number.parseFloat(match[1] ?? "");
|
|
247
|
+
if (!Number.isFinite(rawValue) || rawValue < 0) {
|
|
248
|
+
return undefined;
|
|
249
|
+
}
|
|
250
|
+
const unit = (match[2] ?? "s").toLowerCase();
|
|
251
|
+
if (unit === "ms" || unit === "msec" || unit === "millisecond" || unit === "milliseconds") {
|
|
252
|
+
return Math.round(rawValue);
|
|
253
|
+
}
|
|
254
|
+
if (unit === "m" || unit === "min" || unit === "minute" || unit === "minutes") {
|
|
255
|
+
return Math.round(rawValue * 60_000);
|
|
256
|
+
}
|
|
257
|
+
return Math.round(rawValue * 1_000);
|
|
258
|
+
}
|
|
259
|
+
function readHeader(headersLike, headerName) {
|
|
260
|
+
if (!headersLike) {
|
|
261
|
+
return undefined;
|
|
262
|
+
}
|
|
263
|
+
if (typeof headersLike === "object"
|
|
264
|
+
&& "get" in headersLike
|
|
265
|
+
&& typeof headersLike.get === "function") {
|
|
266
|
+
const value = headersLike.get(headerName);
|
|
267
|
+
return value ?? undefined;
|
|
268
|
+
}
|
|
269
|
+
if (typeof headersLike !== "object") {
|
|
270
|
+
return undefined;
|
|
271
|
+
}
|
|
272
|
+
const headersRecord = headersLike;
|
|
273
|
+
const lowerTarget = headerName.toLowerCase();
|
|
274
|
+
for (const [key, value] of Object.entries(headersRecord)) {
|
|
275
|
+
if (key.toLowerCase() !== lowerTarget) {
|
|
276
|
+
continue;
|
|
277
|
+
}
|
|
278
|
+
if (typeof value === "string") {
|
|
279
|
+
return value;
|
|
280
|
+
}
|
|
281
|
+
if (Array.isArray(value)) {
|
|
282
|
+
const first = value.find((entry) => typeof entry === "string");
|
|
283
|
+
return typeof first === "string" ? first : undefined;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
return undefined;
|
|
287
|
+
}
|
|
288
|
+
function parsePositiveIntEnv(name, fallback) {
|
|
289
|
+
const raw = process.env[name];
|
|
290
|
+
if (!raw) {
|
|
291
|
+
return fallback;
|
|
292
|
+
}
|
|
293
|
+
const parsed = Number.parseInt(raw, 10);
|
|
294
|
+
if (!Number.isFinite(parsed) || parsed < 0) {
|
|
295
|
+
return fallback;
|
|
296
|
+
}
|
|
297
|
+
return parsed;
|
|
298
|
+
}
|
|
299
|
+
function clampDelayMs(value, max) {
|
|
300
|
+
return Math.max(250, Math.min(Math.round(value), max));
|
|
301
|
+
}
|
|
302
|
+
function formatDelay(milliseconds) {
|
|
303
|
+
if (milliseconds < 1_000) {
|
|
304
|
+
return `${milliseconds}ms`;
|
|
305
|
+
}
|
|
306
|
+
const seconds = milliseconds / 1_000;
|
|
307
|
+
return `${seconds.toFixed(seconds >= 10 ? 0 : 1)}s`;
|
|
308
|
+
}
|
|
309
|
+
function extractErrorMessage(error) {
|
|
310
|
+
if (error instanceof Error && error.message.trim().length > 0) {
|
|
311
|
+
return error.message;
|
|
312
|
+
}
|
|
313
|
+
if (typeof error === "object" && error !== null) {
|
|
314
|
+
const message = error.message;
|
|
315
|
+
if (typeof message === "string" && message.trim().length > 0) {
|
|
316
|
+
return message;
|
|
317
|
+
}
|
|
318
|
+
const nestedMessage = error.error?.message;
|
|
319
|
+
if (typeof nestedMessage === "string" && nestedMessage.trim().length > 0) {
|
|
320
|
+
return nestedMessage;
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
return String(error);
|
|
324
|
+
}
|
|
325
|
+
function toLowerCaseIfString(value) {
|
|
326
|
+
return typeof value === "string" ? value.toLowerCase() : "";
|
|
327
|
+
}
|
|
328
|
+
function sleep(milliseconds) {
|
|
329
|
+
return new Promise((resolveSleep) => {
|
|
330
|
+
setTimeout(resolveSleep, milliseconds);
|
|
331
|
+
});
|
|
332
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@robin7331/papyrus-cli",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Convert PDF to markdown or text with the OpenAI Agents SDK",
|
|
6
6
|
"repository": {
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@openai/agents": "^0.5.3",
|
|
40
|
+
"@robin7331/papyrus-cli": "^0.1.4",
|
|
40
41
|
"commander": "^14.0.0",
|
|
41
42
|
"dotenv": "^17.3.1",
|
|
42
43
|
"openai": "^6.7.0",
|
package/src/cli.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
import "dotenv/config";
|
|
4
|
+
import { readFileSync } from "node:fs";
|
|
4
5
|
import { mkdir, readFile, readdir, stat, writeFile } from "node:fs/promises";
|
|
5
6
|
import { dirname, join, relative, resolve } from "node:path";
|
|
6
7
|
import { Command } from "commander";
|
|
@@ -32,6 +33,7 @@ import {
|
|
|
32
33
|
const program = new Command();
|
|
33
34
|
const configFilePath = getConfigFilePath();
|
|
34
35
|
const OPENAI_API_KEYS_URL = "https://platform.openai.com/settings/organization/api-keys";
|
|
36
|
+
const cliVersion = getCliVersion();
|
|
35
37
|
|
|
36
38
|
type ConfigInitOptions = {
|
|
37
39
|
force?: boolean;
|
|
@@ -39,6 +41,7 @@ type ConfigInitOptions = {
|
|
|
39
41
|
|
|
40
42
|
program
|
|
41
43
|
.name("papyrus")
|
|
44
|
+
.version(cliVersion, "-v, --version", "display version number")
|
|
42
45
|
.description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
|
|
43
46
|
.argument("<input>", "Path to input PDF file or folder")
|
|
44
47
|
.option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
|
|
@@ -161,20 +164,66 @@ async function processSingleFile(
|
|
|
161
164
|
}
|
|
162
165
|
|
|
163
166
|
await ensureApiKey();
|
|
164
|
-
const
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
});
|
|
167
|
+
const startedAt = Date.now();
|
|
168
|
+
const displayInput = relative(process.cwd(), inputPath) || inputPath;
|
|
169
|
+
const workerDashboard = process.stdout.isTTY
|
|
170
|
+
? new AsciiWorkerDashboard(1, 1)
|
|
171
|
+
: null;
|
|
172
|
+
workerDashboard?.setSummary(0, 0);
|
|
173
|
+
workerDashboard?.setWorkerRunning(0, displayInput);
|
|
172
174
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
175
|
+
if (!workerDashboard) {
|
|
176
|
+
console.log(`[worker-1] Running ${displayInput}`);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
try {
|
|
180
|
+
const result = await convertPdf({
|
|
181
|
+
inputPath,
|
|
182
|
+
model: options.model,
|
|
183
|
+
mode: options.mode,
|
|
184
|
+
format: options.format,
|
|
185
|
+
instructions: options.instructions,
|
|
186
|
+
promptText
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
const outputPath = resolve(options.output ?? defaultOutputPath(inputPath, result.format));
|
|
190
|
+
await mkdir(dirname(outputPath), { recursive: true });
|
|
191
|
+
await writeFile(outputPath, result.content, "utf8");
|
|
192
|
+
|
|
193
|
+
if (workerDashboard) {
|
|
194
|
+
workerDashboard.setWorkerDone(
|
|
195
|
+
0,
|
|
196
|
+
displayInput,
|
|
197
|
+
`${result.format} in ${formatDurationMs(Date.now() - startedAt)}`
|
|
198
|
+
);
|
|
199
|
+
workerDashboard.setSummary(1, 0);
|
|
200
|
+
} else {
|
|
201
|
+
console.log(
|
|
202
|
+
`[worker-1] Done ${displayInput} -> ${outputPath} (${result.format}, ${formatDurationMs(Date.now() - startedAt)})`
|
|
203
|
+
);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
console.log(`Output (${result.format}) written to: ${outputPath}`);
|
|
207
|
+
return result.usage;
|
|
208
|
+
} catch (error) {
|
|
209
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
210
|
+
if (workerDashboard) {
|
|
211
|
+
workerDashboard.setWorkerFailed(
|
|
212
|
+
0,
|
|
213
|
+
displayInput,
|
|
214
|
+
`${truncate(message, 42)} (${formatDurationMs(Date.now() - startedAt)})`
|
|
215
|
+
);
|
|
216
|
+
workerDashboard.setSummary(1, 1);
|
|
217
|
+
} else {
|
|
218
|
+
console.error(
|
|
219
|
+
`[worker-1] Failed ${displayInput}: ${message} (${formatDurationMs(Date.now() - startedAt)})`
|
|
220
|
+
);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
throw error;
|
|
224
|
+
} finally {
|
|
225
|
+
workerDashboard?.stop();
|
|
226
|
+
}
|
|
178
227
|
}
|
|
179
228
|
|
|
180
229
|
type FolderSummary = {
|
|
@@ -733,3 +782,18 @@ function printUsageTotals(usage: ConvertUsage): void {
|
|
|
733
782
|
`Token usage: input=${usage.inputTokens}, output=${usage.outputTokens}, total=${usage.totalTokens}, requests=${usage.requests}`
|
|
734
783
|
);
|
|
735
784
|
}
|
|
785
|
+
|
|
786
|
+
function getCliVersion(): string {
|
|
787
|
+
try {
|
|
788
|
+
const packageJsonPath = new URL("../package.json", import.meta.url);
|
|
789
|
+
const raw = readFileSync(packageJsonPath, "utf8");
|
|
790
|
+
const parsed = JSON.parse(raw) as { version?: unknown };
|
|
791
|
+
if (typeof parsed.version === "string" && parsed.version.trim().length > 0) {
|
|
792
|
+
return parsed.version;
|
|
793
|
+
}
|
|
794
|
+
} catch {
|
|
795
|
+
// ignore and use fallback
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
return "0.0.0";
|
|
799
|
+
}
|
package/src/openaiPdfToMarkdown.ts
CHANGED
|
@@ -35,6 +35,10 @@ const AUTO_RESPONSE_SCHEMA = z.object({
|
|
|
35
35
|
content: z.string().min(1)
|
|
36
36
|
});
|
|
37
37
|
|
|
38
|
+
const RATE_LIMIT_MAX_RETRIES = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_MAX_RETRIES", 8);
|
|
39
|
+
const RATE_LIMIT_BASE_DELAY_MS = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_BASE_DELAY_MS", 2_000);
|
|
40
|
+
const RATE_LIMIT_MAX_DELAY_MS = parsePositiveIntEnv("PAPYRUS_RATE_LIMIT_MAX_DELAY_MS", 120_000);
|
|
41
|
+
|
|
38
42
|
export async function convertPdf(options: ConvertOptions): Promise<ConvertResult> {
|
|
39
43
|
const inputPath = resolve(options.inputPath);
|
|
40
44
|
await access(inputPath);
|
|
@@ -46,10 +50,12 @@ export async function convertPdf(options: ConvertOptions): Promise<ConvertResult
|
|
|
46
50
|
|
|
47
51
|
const client = new OpenAI({ apiKey });
|
|
48
52
|
|
|
49
|
-
const uploaded = await client.files.create({
|
|
50
|
-
file: createReadStream(inputPath),
|
|
51
|
-
purpose: "user_data"
|
|
52
|
-
});
|
|
53
|
+
const uploaded = await withRateLimitRetry("file upload", () =>
|
|
54
|
+
client.files.create({
|
|
55
|
+
file: createReadStream(inputPath),
|
|
56
|
+
purpose: "user_data"
|
|
57
|
+
})
|
|
58
|
+
);
|
|
53
59
|
|
|
54
60
|
const agent = new Agent({
|
|
55
61
|
name: "PDF Converter",
|
|
@@ -58,21 +64,23 @@ export async function convertPdf(options: ConvertOptions): Promise<ConvertResult
|
|
|
58
64
|
});
|
|
59
65
|
|
|
60
66
|
const promptText = buildPromptText(options);
|
|
61
|
-
const result = await run
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
67
|
+
const result = await withRateLimitRetry("model run", () =>
|
|
68
|
+
run(agent, [
|
|
69
|
+
{
|
|
70
|
+
role: "user",
|
|
71
|
+
content: [
|
|
72
|
+
{
|
|
73
|
+
type: "input_text",
|
|
74
|
+
text: promptText
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
type: "input_file",
|
|
78
|
+
file: { id: uploaded.id }
|
|
79
|
+
}
|
|
80
|
+
]
|
|
81
|
+
}
|
|
82
|
+
])
|
|
83
|
+
);
|
|
76
84
|
|
|
77
85
|
const rawOutput = (result.finalOutput ?? "").trim();
|
|
78
86
|
if (!rawOutput) {
|
|
@@ -201,3 +209,249 @@ function parseAutoResponse(rawOutput: string): Omit<ConvertResult, "usage"> {
|
|
|
201
209
|
|
|
202
210
|
return { format: validated.data.format, content };
|
|
203
211
|
}
|
|
212
|
+
|
|
213
|
+
async function withRateLimitRetry<T>(operationName: string, operation: () => Promise<T>): Promise<T> {
|
|
214
|
+
let attempt = 0;
|
|
215
|
+
while (true) {
|
|
216
|
+
try {
|
|
217
|
+
return await operation();
|
|
218
|
+
} catch (error) {
|
|
219
|
+
if (!isRetriableRateLimitError(error) || attempt >= RATE_LIMIT_MAX_RETRIES) {
|
|
220
|
+
throw error;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
const retryAfterMs = getRetryAfterMs(error);
|
|
224
|
+
const exponentialBackoffMs = RATE_LIMIT_BASE_DELAY_MS * (2 ** attempt);
|
|
225
|
+
const jitterMs = Math.floor(Math.random() * 750);
|
|
226
|
+
const computedDelayMs = retryAfterMs ?? (exponentialBackoffMs + jitterMs);
|
|
227
|
+
const waitMs = clampDelayMs(computedDelayMs, RATE_LIMIT_MAX_DELAY_MS);
|
|
228
|
+
const nextAttempt = attempt + 2;
|
|
229
|
+
const totalAttempts = RATE_LIMIT_MAX_RETRIES + 1;
|
|
230
|
+
const reason = extractErrorMessage(error);
|
|
231
|
+
|
|
232
|
+
console.warn(
|
|
233
|
+
`[retry] ${operationName} hit OpenAI rate limits. Waiting ${formatDelay(waitMs)} before retry ${nextAttempt}/${totalAttempts}. ${reason}`
|
|
234
|
+
);
|
|
235
|
+
|
|
236
|
+
await sleep(waitMs);
|
|
237
|
+
attempt += 1;
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
function isRetriableRateLimitError(error: unknown): boolean {
|
|
243
|
+
if (typeof error !== "object" || error === null) {
|
|
244
|
+
return false;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
const candidate = error as {
|
|
248
|
+
status?: unknown;
|
|
249
|
+
code?: unknown;
|
|
250
|
+
type?: unknown;
|
|
251
|
+
error?: { code?: unknown; type?: unknown; message?: unknown };
|
|
252
|
+
message?: unknown;
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
if (candidate.status === 429) {
|
|
256
|
+
const code = typeof candidate.code === "string" ? candidate.code : undefined;
|
|
257
|
+
const nestedCode = typeof candidate.error?.code === "string" ? candidate.error.code : undefined;
|
|
258
|
+
if (code === "insufficient_quota" || nestedCode === "insufficient_quota") {
|
|
259
|
+
return false;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
return true;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
const searchableText = [
|
|
266
|
+
toLowerCaseIfString(candidate.code),
|
|
267
|
+
toLowerCaseIfString(candidate.type),
|
|
268
|
+
toLowerCaseIfString(candidate.error?.code),
|
|
269
|
+
toLowerCaseIfString(candidate.error?.type),
|
|
270
|
+
toLowerCaseIfString(candidate.message),
|
|
271
|
+
toLowerCaseIfString(candidate.error?.message)
|
|
272
|
+
]
|
|
273
|
+
.filter(Boolean)
|
|
274
|
+
.join(" ");
|
|
275
|
+
|
|
276
|
+
if (searchableText.includes("insufficient_quota")) {
|
|
277
|
+
return false;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
return (
|
|
281
|
+
searchableText.includes("rate_limit") ||
|
|
282
|
+
searchableText.includes("rate limit") ||
|
|
283
|
+
searchableText.includes("too many requests")
|
|
284
|
+
);
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
function getRetryAfterMs(error: unknown): number | undefined {
|
|
288
|
+
const headerDelay = getRetryAfterMsFromHeaders(error);
|
|
289
|
+
if (typeof headerDelay === "number" && Number.isFinite(headerDelay) && headerDelay >= 0) {
|
|
290
|
+
return headerDelay;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
const textDelay = getRetryAfterMsFromText(extractErrorMessage(error));
|
|
294
|
+
if (typeof textDelay === "number" && Number.isFinite(textDelay) && textDelay >= 0) {
|
|
295
|
+
return textDelay;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
return undefined;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
function getRetryAfterMsFromHeaders(error: unknown): number | undefined {
|
|
302
|
+
if (typeof error !== "object" || error === null) {
|
|
303
|
+
return undefined;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
const candidate = error as {
|
|
307
|
+
headers?: unknown;
|
|
308
|
+
response?: { headers?: unknown };
|
|
309
|
+
};
|
|
310
|
+
|
|
311
|
+
const retryAfterMsHeader = readHeader(candidate.headers, "retry-after-ms")
|
|
312
|
+
?? readHeader(candidate.response?.headers, "retry-after-ms");
|
|
313
|
+
if (retryAfterMsHeader) {
|
|
314
|
+
const milliseconds = Number.parseInt(retryAfterMsHeader, 10);
|
|
315
|
+
if (Number.isFinite(milliseconds) && milliseconds >= 0) {
|
|
316
|
+
return milliseconds;
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
const retryAfterHeader = readHeader(candidate.headers, "retry-after")
|
|
321
|
+
?? readHeader(candidate.response?.headers, "retry-after");
|
|
322
|
+
if (!retryAfterHeader) {
|
|
323
|
+
return undefined;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
const seconds = Number.parseFloat(retryAfterHeader);
|
|
327
|
+
if (Number.isFinite(seconds)) {
|
|
328
|
+
return Math.max(0, Math.round(seconds * 1_000));
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
const parsedDate = Date.parse(retryAfterHeader);
|
|
332
|
+
if (Number.isFinite(parsedDate)) {
|
|
333
|
+
return Math.max(0, parsedDate - Date.now());
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
return undefined;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
function getRetryAfterMsFromText(message: string): number | undefined {
|
|
340
|
+
const match = message.match(
|
|
341
|
+
/(?:try again in|retry after)\s*([0-9]+(?:\.[0-9]+)?)\s*(ms|msec|millisecond|milliseconds|s|sec|second|seconds|m|min|minute|minutes)?/i
|
|
342
|
+
);
|
|
343
|
+
if (!match) {
|
|
344
|
+
return undefined;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
const rawValue = Number.parseFloat(match[1] ?? "");
|
|
348
|
+
if (!Number.isFinite(rawValue) || rawValue < 0) {
|
|
349
|
+
return undefined;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
const unit = (match[2] ?? "s").toLowerCase();
|
|
353
|
+
if (unit === "ms" || unit === "msec" || unit === "millisecond" || unit === "milliseconds") {
|
|
354
|
+
return Math.round(rawValue);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
if (unit === "m" || unit === "min" || unit === "minute" || unit === "minutes") {
|
|
358
|
+
return Math.round(rawValue * 60_000);
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
return Math.round(rawValue * 1_000);
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
function readHeader(headersLike: unknown, headerName: string): string | undefined {
|
|
365
|
+
if (!headersLike) {
|
|
366
|
+
return undefined;
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
if (
|
|
370
|
+
typeof headersLike === "object"
|
|
371
|
+
&& "get" in headersLike
|
|
372
|
+
&& typeof (headersLike as { get?: unknown }).get === "function"
|
|
373
|
+
) {
|
|
374
|
+
const value = (headersLike as { get: (name: string) => string | null }).get(headerName);
|
|
375
|
+
return value ?? undefined;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
if (typeof headersLike !== "object") {
|
|
379
|
+
return undefined;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
const headersRecord = headersLike as Record<string, unknown>;
|
|
383
|
+
const lowerTarget = headerName.toLowerCase();
|
|
384
|
+
for (const [key, value] of Object.entries(headersRecord)) {
|
|
385
|
+
if (key.toLowerCase() !== lowerTarget) {
|
|
386
|
+
continue;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
if (typeof value === "string") {
|
|
390
|
+
return value;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
if (Array.isArray(value)) {
|
|
394
|
+
const first = value.find((entry) => typeof entry === "string");
|
|
395
|
+
return typeof first === "string" ? first : undefined;
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
return undefined;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
function parsePositiveIntEnv(name: string, fallback: number): number {
|
|
403
|
+
const raw = process.env[name];
|
|
404
|
+
if (!raw) {
|
|
405
|
+
return fallback;
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
const parsed = Number.parseInt(raw, 10);
|
|
409
|
+
if (!Number.isFinite(parsed) || parsed < 0) {
|
|
410
|
+
return fallback;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
return parsed;
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
function clampDelayMs(value: number, max: number): number {
|
|
417
|
+
return Math.max(250, Math.min(Math.round(value), max));
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
function formatDelay(milliseconds: number): string {
|
|
421
|
+
if (milliseconds < 1_000) {
|
|
422
|
+
return `${milliseconds}ms`;
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
const seconds = milliseconds / 1_000;
|
|
426
|
+
return `${seconds.toFixed(seconds >= 10 ? 0 : 1)}s`;
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
function extractErrorMessage(error: unknown): string {
|
|
430
|
+
if (error instanceof Error && error.message.trim().length > 0) {
|
|
431
|
+
return error.message;
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
if (typeof error === "object" && error !== null) {
|
|
435
|
+
const message = (error as { message?: unknown; error?: { message?: unknown } }).message;
|
|
436
|
+
if (typeof message === "string" && message.trim().length > 0) {
|
|
437
|
+
return message;
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
const nestedMessage = (error as { error?: { message?: unknown } }).error?.message;
|
|
441
|
+
if (typeof nestedMessage === "string" && nestedMessage.trim().length > 0) {
|
|
442
|
+
return nestedMessage;
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
return String(error);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
function toLowerCaseIfString(value: unknown): string {
|
|
450
|
+
return typeof value === "string" ? value.toLowerCase() : "";
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
function sleep(milliseconds: number): Promise<void> {
|
|
454
|
+
return new Promise((resolveSleep) => {
|
|
455
|
+
setTimeout(resolveSleep, milliseconds);
|
|
456
|
+
});
|
|
457
|
+
}
|