@robin7331/papyrus-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -0
- package/README.md +234 -0
- package/assets/.gitkeep +0 -0
- package/assets/header.jpeg +0 -0
- package/assets/header.png +0 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +366 -0
- package/dist/cliHelpers.d.ts +22 -0
- package/dist/cliHelpers.js +75 -0
- package/dist/openaiPdfToMarkdown.d.ts +22 -0
- package/dist/openaiPdfToMarkdown.js +144 -0
- package/package.json +33 -0
- package/src/cli.ts +507 -0
- package/src/cliHelpers.ts +116 -0
- package/src/openaiPdfToMarkdown.ts +203 -0
- package/test/cliHelpers.test.ts +136 -0
- package/tsconfig.json +17 -0
package/.env.example
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
OPENAI_API_KEY=your_api_key_here
|
package/README.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="./assets/header.png" alt="Papyrus CLI logo" width="180" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Papyrus CLI</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">Convert PDFs into Markdown or plain text with the OpenAI Agents SDK.</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://www.npmjs.com/package/@robin7331/papyrus-cli"><img src="https://img.shields.io/npm/v/%40robin7331%2Fpapyrus-cli?logo=npm&label=npm" alt="npm version"></a>
|
|
11
|
+
<a href="https://www.npmjs.com/package/@robin7331/papyrus-cli"><img src="https://img.shields.io/npm/dm/%40robin7331%2Fpapyrus-cli?logo=npm&label=downloads" alt="npm downloads"></a>
|
|
12
|
+
<img src="https://img.shields.io/badge/node-%3E%3D22-339933?logo=node.js&logoColor=white" alt="node >= 22">
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Run directly with `npx`:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npx @robin7331/papyrus-cli --help
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or install globally:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
npm i -g @robin7331/papyrus-cli
|
|
27
|
+
papyrus --help
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## API Key Setup
|
|
31
|
+
|
|
32
|
+
Papyrus requires `OPENAI_API_KEY`.
|
|
33
|
+
|
|
34
|
+
macOS/Linux (persistent, zsh shown — use `~/.bashrc` instead of `~/.zshrc` if your shell is bash):
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
echo 'export OPENAI_API_KEY="your_api_key_here"' >> ~/.zshrc
|
|
38
|
+
source ~/.zshrc
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
PowerShell (persistent):
|
|
42
|
+
|
|
43
|
+
```powershell
|
|
44
|
+
setx OPENAI_API_KEY "your_api_key_here"
|
|
45
|
+
# restart PowerShell after running setx
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
One-off execution:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
OPENAI_API_KEY="your_api_key_here" npx @robin7331/papyrus-cli ./path/to/input.pdf
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Security note: Papyrus intentionally does not provide an `--api-key` flag to avoid leaking keys via shell history or process lists.
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
Single file (auto mode):
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
papyrus ./path/to/input.pdf
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Single file with explicit format/output/model:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
papyrus ./path/to/input.pdf --format md --output ./out/result.md --model gpt-4o-mini
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Auto mode with extra instructions:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
papyrus ./path/to/input.pdf --instructions "Prioritize table accuracy." --format txt
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Prompt mode (inline prompt):
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
papyrus ./path/to/input.pdf --mode prompt --prompt "Extract all invoice line items as bullet points." --format md
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Prompt mode (prompt file):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
papyrus ./path/to/input.pdf --mode prompt --prompt-file ./my-prompt.txt --format txt
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Folder mode (recursive scan, asks for confirmation):
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
papyrus ./path/to/folder
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Folder mode with explicit concurrency and output directory:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
papyrus ./path/to/folder --concurrency 4 --output ./out
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Folder mode without confirmation prompt:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
papyrus ./path/to/folder --yes
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
`npx` alternative for any command:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
npx @robin7331/papyrus-cli ./path/to/input.pdf --mode auto
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Arguments Reference
|
|
113
|
+
|
|
114
|
+
### `<input>`
|
|
115
|
+
|
|
116
|
+
Path to a single PDF file or a folder containing PDFs (processed recursively).
|
|
117
|
+
|
|
118
|
+
Example:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
papyrus ./docs/invoice.pdf
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### `--format <format>`
|
|
125
|
+
|
|
126
|
+
Output format override:
|
|
127
|
+
- `md` for GitHub-flavored Markdown
|
|
128
|
+
- `txt` for plain text
|
|
129
|
+
|
|
130
|
+
Example:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
papyrus ./docs/invoice.pdf --format md
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### `-o, --output <path>`
|
|
137
|
+
|
|
138
|
+
Output destination.
|
|
139
|
+
- Single file input: output file path.
|
|
140
|
+
- Folder input: output directory path (folder structure is mirrored).
|
|
141
|
+
|
|
142
|
+
Example:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
papyrus ./docs --output ./converted
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### `--mode <mode>`
|
|
149
|
+
|
|
150
|
+
Conversion mode:
|
|
151
|
+
- `auto` (default): built-in conversion behavior.
|
|
152
|
+
- `prompt`: use your own prompt via `--prompt` or `--prompt-file`.
|
|
153
|
+
|
|
154
|
+
Example:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
papyrus ./docs/invoice.pdf --mode prompt --prompt "Extract all line items."
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### `--instructions <text>`
|
|
161
|
+
|
|
162
|
+
Additional conversion instructions in `auto` mode only.
|
|
163
|
+
|
|
164
|
+
Example:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
papyrus ./docs/invoice.pdf --mode auto --instructions "Keep table columns aligned."
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### `--prompt <text>`
|
|
171
|
+
|
|
172
|
+
Inline prompt text for `prompt` mode. Must be non-empty. In `prompt` mode, use exactly one of `--prompt` or `--prompt-file`.
|
|
173
|
+
|
|
174
|
+
Example:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
papyrus ./docs/invoice.pdf --mode prompt --prompt "Summarize payment terms."
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### `--prompt-file <path>`
|
|
181
|
+
|
|
182
|
+
Path to a text file containing the prompt for `prompt` mode. File must contain non-empty text. In `prompt` mode, use exactly one of `--prompt` or `--prompt-file`.
|
|
183
|
+
|
|
184
|
+
Example:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
papyrus ./docs/invoice.pdf --mode prompt --prompt-file ./my-prompt.txt
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### `-m, --model <model>`
|
|
191
|
+
|
|
192
|
+
OpenAI model name used for conversion. Default is `gpt-4o-mini`.
|
|
193
|
+
|
|
194
|
+
Example:
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
papyrus ./docs/invoice.pdf --model gpt-4.1-mini
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### `--concurrency <n>`
|
|
201
|
+
|
|
202
|
+
Maximum parallel workers for folder input. Must be an integer between `1` and `100`. Default is `10`.
|
|
203
|
+
|
|
204
|
+
Example:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
papyrus ./docs --concurrency 4
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### `-y, --yes`
|
|
211
|
+
|
|
212
|
+
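Skips the interactive folder confirmation prompt. Required when Papyrus runs without an interactive terminal (for example in CI or when input/output is piped): in that case folder mode exits with an error instead of prompting.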
|
|
213
|
+
|
|
214
|
+
Example:
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
papyrus ./docs --yes
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Notes
|
|
221
|
+
|
|
222
|
+
- In `auto` mode without `--format`, the model returns structured JSON with `format` + `content`.
|
|
223
|
+
- Folder input is scanned recursively for `.pdf` files and processed in parallel.
|
|
224
|
+
- In folder mode, `--output` must be a directory path and mirrored subfolders are preserved.
|
|
225
|
+
- For scanned PDFs, output quality depends on OCR quality from the model.
|
|
226
|
+
|
|
227
|
+
## Development
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
npm install
|
|
231
|
+
npm run build
|
|
232
|
+
npm run dev -- ./path/to/input.pdf
|
|
233
|
+
npm test
|
|
234
|
+
```
|
package/assets/.gitkeep
ADDED
|
File without changes
|
|
Binary file
|
|
Binary file
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import "dotenv/config";
|
|
3
|
+
import { mkdir, readFile, readdir, stat, writeFile } from "node:fs/promises";
|
|
4
|
+
import { dirname, join, relative, resolve } from "node:path";
|
|
5
|
+
import { Command } from "commander";
|
|
6
|
+
import { convertPdf } from "./openaiPdfToMarkdown.js";
|
|
7
|
+
import { defaultOutputPath, formatDurationMs, isPdfPath, looksLikeFileOutput, parseConcurrency, parseFormat, parseMode, resolveFolderOutputPath, truncate, validateOptionCombination } from "./cliHelpers.js";
|
|
8
|
+
const program = new Command();

/**
 * Action handler for the `papyrus` command.
 *
 * Validates the option combination, resolves the prompt text, then dispatches
 * to single-file or folder processing depending on what `input` points at.
 * Token usage and the elapsed wall-clock time are printed on success; on any
 * error the message and the elapsed time go to stderr and the exit code is 1.
 *
 * @param input   Path to a PDF file or a folder, as typed on the command line.
 * @param options Parsed commander options.
 */
async function runPapyrus(input, options) {
    const inputPath = resolve(input);
    const startedAt = Date.now();
    // Elapsed seconds since the command started, formatted with two decimals.
    const elapsedSeconds = () => ((Date.now() - startedAt) / 1000).toFixed(2);
    try {
        validateOptionCombination(options);
        const promptText = await resolvePromptText(options);
        const inputKind = await detectInputKind(inputPath);
        let usageTotals = emptyUsage();
        if (inputKind !== "file") {
            const summary = await processFolder(inputPath, options, promptText);
            usageTotals = summary.usage;
            // A cancelled run is not an error; a run with failed files is.
            if (!summary.cancelled && summary.failed > 0) {
                process.exitCode = 1;
            }
        }
        else {
            usageTotals = await processSingleFile(inputPath, options, promptText);
        }
        printUsageTotals(usageTotals);
        console.log(`Duration: ${elapsedSeconds()}s`);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(`Conversion failed: ${message}`);
        console.error(`Duration: ${elapsedSeconds()}s`);
        process.exitCode = 1;
    }
}

program
    .name("papyrus")
    .description("Convert PDF files to Markdown or text using the OpenAI Agents SDK")
    .argument("<input>", "Path to input PDF file or folder")
    .option("-o, --output <path>", "Path to output file (single input) or output directory (folder input)")
    .option("-m, --model <model>", "OpenAI model to use", "gpt-4o-mini")
    .option("--concurrency <n>", "Max parallel workers for folder input (default: 10)", parseConcurrency)
    .option("-y, --yes", "Skip confirmation prompt in folder mode")
    .option("--mode <mode>", "Conversion mode: auto or prompt", parseMode, "auto")
    .option("--format <format>", "Output format override: md or txt", parseFormat)
    .option("--instructions <text>", "Additional conversion instructions for auto mode")
    .option("--prompt <text>", "Custom prompt text for prompt mode")
    .option("--prompt-file <path>", "Path to file containing prompt text for prompt mode")
    .action(runPapyrus);

program.parseAsync(process.argv);
|
|
51
|
+
/**
 * Converts a single PDF and writes the converted content to disk.
 *
 * The destination is `options.output` when given, otherwise whatever
 * `defaultOutputPath` derives from the input path and the resulting format.
 * Missing parent directories of the destination are created.
 *
 * @param inputPath  Absolute path of the PDF to convert.
 * @param options    Parsed CLI options (model, mode, format, instructions, output).
 * @param promptText Resolved custom prompt, or undefined outside prompt mode.
 * @returns The usage object reported by `convertPdf`.
 * @throws Error when `inputPath` is not recognised as a PDF path.
 */
async function processSingleFile(inputPath, options, promptText) {
    const hasPdfExtension = isPdfPath(inputPath);
    if (!hasPdfExtension) {
        throw new Error("Input file must have a .pdf extension.");
    }
    const { model, mode, format, instructions } = options;
    const conversion = await convertPdf({ inputPath, model, mode, format, instructions, promptText });
    // The default path is only computed when the user supplied no --output.
    const destination = resolve(options.output ?? defaultOutputPath(inputPath, conversion.format));
    const destinationDir = dirname(destination);
    await mkdir(destinationDir, { recursive: true });
    await writeFile(destination, conversion.content, "utf8");
    console.log(`Output (${conversion.format}) written to: ${destination}`);
    return conversion.usage;
}
|
|
69
|
+
/**
 * Converts every PDF found under `inputDir`, several files at a time.
 *
 * Steps: reject an `--output` that looks like a file path, collect the PDF
 * files, ask for confirmation (unless `options.yes`), then convert the files
 * with at most `options.concurrency` (default 10) workers. Progress is shown
 * on an in-place dashboard when stdout is a TTY and as one log line per file
 * otherwise. A failing file is recorded and does not stop the other files.
 *
 * @param inputDir   Absolute path of the folder to scan.
 * @param options    Parsed CLI options.
 * @param promptText Resolved custom prompt, or undefined outside prompt mode.
 * @returns `{ total, succeeded, failed, cancelled, usage }` for the run.
 * @throws Error when `--output` looks like a file path or no PDF is found.
 */
async function processFolder(inputDir, options, promptText) {
    if (options.output && looksLikeFileOutput(options.output)) {
        throw new Error("In folder mode, --output must be a directory path (not a .md/.txt file path).");
    }
    const pdfPaths = await collectPdfFiles(inputDir);
    const total = pdfPaths.length;
    if (total === 0) {
        throw new Error(`No PDF files found in directory: ${inputDir}`);
    }
    const concurrency = options.concurrency ?? 10;
    const confirmed = await confirmFolderProcessing(total, concurrency, Boolean(options.yes));
    if (!confirmed) {
        console.log("Cancelled. No files were processed.");
        return { total, succeeded: 0, failed: 0, cancelled: true, usage: emptyUsage() };
    }
    const outputRoot = options.output ? resolve(options.output) : undefined;
    const tally = { succeeded: 0, failed: 0, completed: 0 };
    const usage = emptyUsage();
    const failures = [];
    // The dashboard gets one lane per worker that can actually be busy.
    const laneCount = Math.min(concurrency, total);
    console.log(`Found ${total} PDF file(s). Using concurrency: ${concurrency}`);
    const dashboard = process.stdout.isTTY ? new AsciiWorkerDashboard(total, laneCount) : null;
    dashboard?.setSummary(tally.completed, tally.failed);

    // Converts one file and reports the outcome; never rejects, so one bad
    // file cannot abort the whole batch.
    const convertOne = async (filePath, _index, workerId) => {
        const relativeInput = relative(inputDir, filePath);
        const startedAt = Date.now();
        const elapsed = () => formatDurationMs(Date.now() - startedAt);
        dashboard?.setWorkerRunning(workerId, relativeInput);
        try {
            const result = await convertPdf({
                inputPath: filePath,
                model: options.model,
                mode: options.mode,
                format: options.format,
                instructions: options.instructions,
                promptText
            });
            const outputPath = resolveFolderOutputPath(filePath, inputDir, outputRoot, result.format);
            await mkdir(dirname(outputPath), { recursive: true });
            await writeFile(outputPath, result.content, "utf8");
            tally.succeeded += 1;
            mergeUsage(usage, result.usage);
            if (dashboard) {
                dashboard.setWorkerDone(workerId, relativeInput, `${result.format} in ${elapsed()}`);
            }
            else {
                console.log(`[worker-${workerId + 1}] Done ${relativeInput} -> ${outputPath} (${result.format}, ${elapsed()})`);
            }
        }
        catch (error) {
            tally.failed += 1;
            const message = error instanceof Error ? error.message : String(error);
            failures.push({ file: relativeInput, message });
            if (dashboard) {
                // Dashboard lanes are single lines, so the message is shortened.
                dashboard.setWorkerFailed(workerId, relativeInput, `${truncate(message, 42)} (${elapsed()})`);
            }
            else {
                console.error(`[worker-${workerId + 1}] Failed ${relativeInput}: ${message} (${elapsed()})`);
            }
        }
        finally {
            tally.completed += 1;
            dashboard?.setSummary(tally.completed, tally.failed);
        }
    };

    try {
        await runWithConcurrency(pdfPaths, concurrency, convertOne);
    }
    finally {
        // Always stop the spinner timer and restore the cursor.
        dashboard?.stop();
    }
    console.log(`Summary: total=${total}, succeeded=${tally.succeeded}, failed=${tally.failed}`);
    if (failures.length > 0) {
        console.error("Failures:");
        failures.forEach((failure) => {
            console.error(`- ${failure.file}: ${failure.message}`);
        });
    }
    return { total, succeeded: tally.succeeded, failed: tally.failed, cancelled: false, usage };
}
|
|
153
|
+
/**
 * Resolves the custom prompt used in `prompt` mode.
 *
 * Outside `prompt` mode the prompt options are ignored and `undefined` is
 * returned. In `prompt` mode an inline `--prompt` wins over `--prompt-file`;
 * either source is trimmed and must not be empty.
 *
 * @param options Parsed CLI options (`mode`, `prompt`, `promptFile`).
 * @returns The trimmed prompt text, or `undefined` when no prompt applies.
 * @throws Error when `--prompt` or the prompt file is empty or whitespace-only;
 *         file-system errors from reading the prompt file propagate unchanged.
 */
async function resolvePromptText(options) {
    if (options.mode !== "prompt") {
        return undefined;
    }
    // Test for presence, not truthiness: `--prompt ""` is an explicit (empty)
    // value and must be rejected here instead of being treated as "not given".
    if (options.prompt != null) {
        const prompt = options.prompt.trim();
        if (!prompt) {
            throw new Error("--prompt cannot be empty.");
        }
        return prompt;
    }
    if (!options.promptFile) {
        return undefined;
    }
    const promptPath = resolve(options.promptFile);
    const promptFromFile = (await readFile(promptPath, "utf8")).trim();
    if (!promptFromFile) {
        throw new Error("--prompt-file must contain non-empty text.");
    }
    return promptFromFile;
}
|
|
174
|
+
/**
 * Classifies the input path as a regular file or a directory.
 *
 * @param inputPath Path to inspect.
 * @returns `"file"` or `"directory"`.
 * @throws Error when the path is neither; errors from `stat` (for example a
 *         missing path) propagate unchanged.
 */
async function detectInputKind(inputPath) {
    const info = await stat(inputPath);
    const kind = info.isFile() ? "file" : info.isDirectory() ? "directory" : null;
    if (kind === null) {
        throw new Error("Input path must be a PDF file or directory.");
    }
    return kind;
}
|
|
184
|
+
/**
 * Gathers the PDF files found under `rootDir`, including nested folders.
 *
 * @param rootDir Directory where the recursive scan starts.
 * @returns Paths of the PDFs, in the order `walkDirectory` visits them.
 */
async function collectPdfFiles(rootDir) {
    // walkDirectory appends to the array it is handed.
    const pdfPaths = [];
    await walkDirectory(rootDir, pdfPaths);
    return pdfPaths;
}
|
|
189
|
+
/**
 * Depth-first walk that appends every PDF below `currentDir` to `collected`.
 *
 * Entries are visited in case-insensitive name order so the result is stable
 * between runs. Symbolic links are skipped, both to files and to directories.
 *
 * @param currentDir Directory to list.
 * @param collected  Output array; matching paths are pushed onto it.
 */
async function walkDirectory(currentDir, collected) {
    const byName = (left, right) => left.name.localeCompare(right.name, "en", { sensitivity: "base" });
    const listing = await readdir(currentDir, { withFileTypes: true });
    listing.sort(byName);
    for (const entry of listing) {
        if (entry.isSymbolicLink()) {
            continue;
        }
        const entryPath = join(currentDir, entry.name);
        if (entry.isDirectory()) {
            await walkDirectory(entryPath, collected);
        }
        else if (entry.isFile() && isPdfPath(entry.name)) {
            collected.push(entryPath);
        }
    }
}
|
|
206
|
+
async function runWithConcurrency(items, concurrency, worker) {
|
|
207
|
+
const maxWorkers = Math.min(concurrency, items.length);
|
|
208
|
+
let nextIndex = 0;
|
|
209
|
+
const workers = Array.from({ length: maxWorkers }, async (_, workerId) => {
|
|
210
|
+
while (true) {
|
|
211
|
+
const currentIndex = nextIndex;
|
|
212
|
+
nextIndex += 1;
|
|
213
|
+
if (currentIndex >= items.length) {
|
|
214
|
+
return;
|
|
215
|
+
}
|
|
216
|
+
await worker(items[currentIndex], currentIndex, workerId);
|
|
217
|
+
}
|
|
218
|
+
});
|
|
219
|
+
await Promise.all(workers);
|
|
220
|
+
}
|
|
221
|
+
// Frames cycled through for a lane that is currently running.
const SPINNER_FRAMES = ["-", "\\", "|", "/"];

/**
 * Live, in-place terminal view of folder processing: one progress header
 * followed by one line ("lane") per worker.
 *
 * The constructor hides the cursor, draws immediately and then redraws every
 * 100 ms to animate the spinners. `stop()` must be called to clear the timer
 * and show the cursor again.
 */
class AsciiWorkerDashboard {
    lanes;
    total;
    workerCount;
    spinnerTimer;
    completed = 0;
    failed = 0;
    renderedLineCount = 0;

    /**
     * @param total       Number of files in the run (shown in the header).
     * @param workerCount Number of lanes to draw.
     */
    constructor(total, workerCount) {
        this.total = total;
        this.workerCount = workerCount;
        this.lanes = Array.from({ length: workerCount }, () => ({
            state: "idle",
            spinnerFrame: 0
        }));
        // ANSI: hide the cursor while the dashboard is being redrawn.
        process.stdout.write("\x1b[?25l");
        this.render();
        this.spinnerTimer = setInterval(() => {
            this.tickSpinners();
            this.render();
        }, 100);
    }

    /** Updates the header counters and redraws. */
    setSummary(completed, failed) {
        this.completed = completed;
        this.failed = failed;
        this.render();
    }

    /** Marks a lane as busy with `file`. */
    setWorkerRunning(workerId, file) {
        this.applyLaneUpdate(workerId, "running", file, "processing");
    }

    /** Marks a lane as finished successfully, with a short result message. */
    setWorkerDone(workerId, file, message) {
        this.applyLaneUpdate(workerId, "done", file, message);
    }

    /** Marks a lane as failed, with a short error message. */
    setWorkerFailed(workerId, file, message) {
        this.applyLaneUpdate(workerId, "failed", file, message);
    }

    /** Stops the animation, draws a final frame and shows the cursor again. */
    stop() {
        clearInterval(this.spinnerTimer);
        this.render();
        process.stdout.write("\x1b[?25h");
    }

    /** Shared body of the lane setters; an unknown `workerId` is ignored. */
    applyLaneUpdate(workerId, state, file, message) {
        const lane = this.lanes[workerId];
        if (!lane) {
            return;
        }
        lane.state = state;
        lane.file = file;
        lane.message = message;
        this.render();
    }

    /** Redraws the dashboard over the previously drawn frame. */
    render() {
        const frame = this.composeLines();
        if (this.renderedLineCount > 0) {
            // ANSI: move the cursor up to the first line of the previous frame.
            process.stdout.write(`\x1b[${this.renderedLineCount}F`);
        }
        frame.forEach((line) => {
            // ANSI: clear the whole line before writing its new content.
            process.stdout.write(`\x1b[2K${line}\n`);
        });
        this.renderedLineCount = frame.length;
    }

    /** Builds the header line plus one line per lane. */
    composeLines() {
        let active = 0;
        for (const lane of this.lanes) {
            if (lane.state === "running") {
                active += 1;
            }
        }
        const header = `Progress: ${this.completed}/${this.total} complete | active ${active}/${this.workerCount} | failed ${this.failed}`;
        const laneLines = this.lanes.map((lane, index) => {
            const label = `worker-${String(index + 1).padStart(2, "0")}`;
            const icon = this.renderIcon(lane);
            const file = truncate(lane.file ?? "idle", 64);
            const message = lane.message ? ` | ${lane.message}` : "";
            return `${icon} ${label} | ${file}${message}`;
        });
        return [header, ...laneLines];
    }

    /** Advances the spinner of every running lane by one frame. */
    tickSpinners() {
        this.lanes
            .filter((lane) => lane.state === "running")
            .forEach((lane) => {
                lane.spinnerFrame = (lane.spinnerFrame + 1) % SPINNER_FRAMES.length;
            });
    }

    /** Two-character (or spinner) status marker for a lane. */
    renderIcon(lane) {
        switch (lane.state) {
            case "running":
                return SPINNER_FRAMES[lane.spinnerFrame];
            case "done":
                return "OK";
            case "failed":
                return "!!";
            default:
                return "..";
        }
    }
}
|
|
330
|
+
/**
 * Asks the user whether a folder run should start.
 *
 * @param totalFiles  Number of PDFs that would be processed (shown in the prompt).
 * @param concurrency Worker count that would be used (shown in the prompt).
 * @param skipPrompt  When true, no question is asked and the answer is "yes".
 * @returns true for an empty answer, "y" or "yes" (case-insensitive); false otherwise.
 * @throws Error when a prompt is needed but stdin or stdout is not a TTY.
 */
async function confirmFolderProcessing(totalFiles, concurrency, skipPrompt) {
    if (skipPrompt) {
        return true;
    }
    const interactive = process.stdin.isTTY && process.stdout.isTTY;
    if (!interactive) {
        throw new Error("Folder mode requires an interactive terminal confirmation. Use --yes to skip the prompt.");
    }
    // readline is loaded lazily; it is only needed on this interactive path.
    const readline = await import("node:readline/promises");
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    try {
        const reply = await rl.question(`Process ${totalFiles} PDF file(s) with concurrency ${concurrency}? [Y/n] `);
        const normalized = reply.trim().toLowerCase();
        return ["", "y", "yes"].includes(normalized);
    }
    finally {
        rl.close();
    }
}
|
|
350
|
+
/**
 * Creates a fresh usage accumulator with every counter at zero.
 *
 * @returns A new `{ requests, inputTokens, outputTokens, totalTokens }` object.
 */
function emptyUsage() {
    const counters = { requests: 0, inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    return counters;
}
|
|
358
|
+
/**
 * Adds the counters of `delta` onto `target`, mutating `target` in place.
 *
 * @param target Accumulator to update.
 * @param delta  Usage of a single conversion.
 */
function mergeUsage(target, delta) {
    for (const counter of ["requests", "inputTokens", "outputTokens", "totalTokens"]) {
        target[counter] += delta[counter];
    }
}
|
|
364
|
+
function printUsageTotals(usage) {
|
|
365
|
+
console.log(`Token usage: input=${usage.inputTokens}, output=${usage.outputTokens}, total=${usage.totalTokens}, requests=${usage.requests}`);
|
|
366
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { type ConversionMode, type OutputFormat } from "./openaiPdfToMarkdown.js";
|
|
2
|
+
/** Options object produced by parsing the `papyrus` command line. */
export type CliOptions = {
    /** `--mode`: conversion mode (`auto` or `prompt`); the CLI defaults it to `auto`. */
    mode: ConversionMode;
    /** `--format`: output format override (`md` or `txt`). */
    format?: OutputFormat;
    /** `--instructions`: additional conversion instructions for `auto` mode. */
    instructions?: string;
    /** `--prompt`: inline prompt text for `prompt` mode. */
    prompt?: string;
    /** `--prompt-file`: path to a file containing the prompt for `prompt` mode. */
    promptFile?: string;
    /** `-m, --model`: OpenAI model name; the CLI defaults it to `gpt-4o-mini`. */
    model: string;
    /** `-o, --output`: output file (single input) or output directory (folder input). */
    output?: string;
    /** `--concurrency`: max parallel workers for folder input; the CLI uses 10 when omitted. */
    concurrency?: number;
    /** `-y, --yes`: skip the confirmation prompt in folder mode. */
    yes?: boolean;
};
|
|
13
|
+
/** Option parser for `--mode`; converts the raw argument into a `ConversionMode`. */
export declare function parseMode(value: string): ConversionMode;

/** Option parser for `--format`; converts the raw argument into an `OutputFormat`. */
export declare function parseFormat(value: string): OutputFormat;

/**
 * Option parser for `--concurrency`; converts the raw argument into a number.
 * The README documents the accepted range as an integer between 1 and 100.
 */
export declare function parseConcurrency(value: string): number;

/**
 * Checks that the parsed options are compatible with each other.
 * NOTE(review): the exact rules live in the implementation, which is not part
 * of this declaration file; presumably it throws on invalid combinations.
 */
export declare function validateOptionCombination(options: CliOptions): void;

/** Derives the output path for a single input file from its path and the output format. */
export declare function defaultOutputPath(inputPath: string, format: OutputFormat): string;

/**
 * Derives the output path for one file of a folder run.
 *
 * @param inputPath  Path of the PDF being converted.
 * @param inputRoot  Folder that was passed as CLI input.
 * @param outputRoot Output directory, or `undefined` when `--output` was not given.
 * @param format     Output format of the converted content.
 */
export declare function resolveFolderOutputPath(inputPath: string, inputRoot: string, outputRoot: string | undefined, format: OutputFormat): string;

/** Reports whether a path is treated as a PDF; the CLI's error message refers to a `.pdf` extension. */
export declare function isPdfPath(inputPath: string): boolean;

/** Reports whether an `--output` value looks like a file path (the CLI's error message mentions `.md`/`.txt`) rather than a directory. */
export declare function looksLikeFileOutput(outputPath: string): boolean;

/** Shortens `value` with respect to `maxLength`; NOTE(review): the exact truncation marker is defined in the implementation. */
export declare function truncate(value: string, maxLength: number): string;

/** Formats a duration given in milliseconds as display text. */
export declare function formatDurationMs(durationMs: number): string;
|