@robin7331/papyrus-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -0
- package/README.md +234 -0
- package/assets/.gitkeep +0 -0
- package/assets/header.jpeg +0 -0
- package/assets/header.png +0 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +366 -0
- package/dist/cliHelpers.d.ts +22 -0
- package/dist/cliHelpers.js +75 -0
- package/dist/openaiPdfToMarkdown.d.ts +22 -0
- package/dist/openaiPdfToMarkdown.js +144 -0
- package/package.json +33 -0
- package/src/cli.ts +507 -0
- package/src/cliHelpers.ts +116 -0
- package/src/openaiPdfToMarkdown.ts +203 -0
- package/test/cliHelpers.test.ts +136 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import { createReadStream } from "node:fs";
|
|
2
|
+
import { access } from "node:fs/promises";
|
|
3
|
+
import { resolve } from "node:path";
|
|
4
|
+
import { Agent, run } from "@openai/agents";
|
|
5
|
+
import OpenAI from "openai";
|
|
6
|
+
import { z } from "zod";
|
|
7
|
+
|
|
8
|
+
/** Selects how the prompt is produced: built-in conversion prompts ("auto") or a caller-supplied prompt ("prompt"). */
export type ConversionMode = "auto" | "prompt";

/** Output formats a conversion can produce: Markdown ("md") or plain text ("txt"). */
export type OutputFormat = "md" | "txt";

/** Request and token counters copied from the agent run that performed the conversion. */
export type ConvertUsage = {
  requests: number;
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
};

/** Everything `convertPdf` needs to run a single conversion. */
export type ConvertOptions = {
  /** Path of the file to convert; it is resolved to an absolute path before use. */
  inputPath: string;
  /** Model name handed to the agent. */
  model: string;
  /** Whether to use the built-in prompts or the caller's own prompt. */
  mode: ConversionMode;
  /** Forces the output format. When omitted in auto mode the model picks one. */
  format?: OutputFormat;
  /** Extra instructions appended to the built-in prompts (not used when mode is "prompt"). */
  instructions?: string;
  /** The caller's prompt; required when mode is "prompt". */
  promptText?: string;
};

/** Outcome of a conversion: the format used, the converted text, and usage counters. */
export type ConvertResult = {
  format: OutputFormat;
  content: string;
  usage: ConvertUsage;
};
|
|
32
|
+
|
|
33
|
+
/**
 * Shape of the JSON object the model must return in auto mode when no output
 * format was requested: the format it chose plus the converted content.
 */
const AUTO_RESPONSE_SCHEMA = z.object({
  // The model's format decision; only the two supported formats are accepted.
  format: z.enum(["md", "txt"]),
  // The converted document; an empty string is rejected.
  content: z.string().min(1)
});
|
|
37
|
+
|
|
38
|
+
/**
 * Converts a PDF to Markdown or plain text by uploading it to OpenAI and
 * running an agent over the uploaded file.
 *
 * The prompt is built before anything is uploaded, so invalid options fail
 * without any network I/O. The uploaded file is deleted again once the run has
 * finished (whether it succeeded or not) so repeated conversions do not
 * accumulate files in the account.
 *
 * @param options - Input path, model name, mode, and optional format/prompt settings.
 * @returns The output format, the converted content, and the usage counters of the run.
 * @throws If the input path is not accessible, `OPENAI_API_KEY` is unset, the
 *   prompt cannot be built, an API call fails, the model returns no content, or
 *   (auto mode without a format) the response is not the expected JSON object.
 */
export async function convertPdf(options: ConvertOptions): Promise<ConvertResult> {
  const inputPath = resolve(options.inputPath);
  // Surface the filesystem error early instead of failing inside the upload.
  await access(inputPath);

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is not set.");
  }

  // Validate the options and build the prompt before paying for an upload.
  const promptText = buildPromptText(options);

  const agent = new Agent({
    name: "PDF Converter",
    instructions: "You convert PDF files precisely according to the requested output format.",
    model: options.model
  });

  const client = new OpenAI({ apiKey });
  const uploaded = await client.files.create({
    file: createReadStream(inputPath),
    purpose: "user_data"
  });

  try {
    const result = await run(agent, [
      {
        role: "user",
        content: [
          {
            type: "input_text",
            text: promptText
          },
          {
            type: "input_file",
            file: { id: uploaded.id }
          }
        ]
      }
    ]);

    const rawOutput = (result.finalOutput ?? "").trim();
    if (!rawOutput) {
      throw new Error("No content returned by the API.");
    }

    const usage: ConvertUsage = {
      requests: result.state.usage.requests,
      inputTokens: result.state.usage.inputTokens,
      outputTokens: result.state.usage.outputTokens,
      totalTokens: result.state.usage.totalTokens
    };

    // Only auto mode without a forced format asks the model for a JSON envelope.
    if (options.mode === "auto" && !options.format) {
      return { ...parseAutoResponse(rawOutput), usage };
    }

    const format = options.format ?? "txt";
    return { format, content: rawOutput, usage };
  } finally {
    // Best-effort cleanup: the upload is only needed for this run. A failed
    // delete must not mask the conversion result or the original error.
    await client.files.delete(uploaded.id).catch(() => undefined);
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Builds the user-message text that accompanies the uploaded PDF.
 *
 * - Prompt mode: wraps the caller's prompt and adds a format requirement
 *   (or a plain-text preference when no format was requested).
 * - Auto mode with a format: a fixed Markdown or plain-text conversion prompt.
 * - Auto mode without a format: asks the model to choose the format and reply
 *   with a JSON object containing `format` and `content`.
 *
 * In the two auto-mode cases `options.instructions` is appended when present.
 *
 * @param options - Conversion options; `promptText` is required in prompt mode.
 * @returns The prompt text to send to the model.
 * @throws Error if mode is "prompt" and `promptText` is missing, empty, or
 *   contains only whitespace.
 */
function buildPromptText(options: ConvertOptions): string {
  if (options.mode === "prompt") {
    // A whitespace-only prompt carries no instruction, so reject it like a
    // missing one instead of sending an empty "User prompt:" section.
    if (!options.promptText?.trim()) {
      throw new Error("promptText is required when mode is 'prompt'.");
    }

    const promptModeParts = [
      "Apply the following user prompt to the PDF.",
      "Return only the final converted content.",
      `User prompt:\n${options.promptText}`
    ];

    if (options.format === "md") {
      promptModeParts.push("Output format requirement: Return only GitHub-flavored Markdown.");
    } else if (options.format === "txt") {
      promptModeParts.push("Output format requirement: Return plain text only and do not use Markdown syntax.");
    } else {
      promptModeParts.push("If the prompt does not enforce a format, prefer plain text without Markdown syntax.");
    }

    return promptModeParts.join("\n\n");
  }

  if (options.format === "md") {
    return withAdditionalInstructions(
      [
        "Convert this PDF into clean GitHub-flavored Markdown.",
        "Preserve headings, paragraphs, lists, and tables.",
        "Render tables as Markdown pipe tables with header separators.",
        "If cells are empty due to merged cells, keep the table readable and consistent.",
        "Return only Markdown without code fences."
      ].join(" "),
      options.instructions
    );
  }

  if (options.format === "txt") {
    return withAdditionalInstructions(
      [
        "Convert this PDF into clean plain text.",
        "Preserve reading order and paragraph boundaries.",
        "Represent tables in readable plain text (no Markdown syntax).",
        "Return plain text only and do not use Markdown syntax or code fences."
      ].join(" "),
      options.instructions
    );
  }

  // No format requested: let the model decide and answer with a JSON envelope.
  return withAdditionalInstructions(
    [
      "Decide the best output format for this PDF: Markdown ('md') or plain text ('txt').",
      "Choose 'md' for documents with meaningful headings, lists, and tables that benefit from Markdown.",
      "Choose 'txt' for mostly linear text where Markdown adds little value.",
      "Respond with JSON only, using this exact schema:",
      "{\"format\":\"md|txt\",\"content\":\"<converted content>\"}",
      "If format is 'md', use clean GitHub-flavored Markdown and pipe tables where appropriate.",
      "If format is 'txt', output plain text only and do not use Markdown syntax.",
      "Do not wrap the JSON in code fences."
    ].join("\n"),
    options.instructions
  );
}
|
|
159
|
+
|
|
160
|
+
/**
 * Appends optional user instructions to a base prompt under an
 * "Additional user instructions:" heading.
 *
 * @param base - The prompt text to extend.
 * @param additional - Extra instructions. Missing, empty, or whitespace-only
 *   values leave the base prompt untouched so no empty heading is emitted.
 * @returns The base prompt, followed by the instructions section when there is
 *   something to add. The instructions themselves are inserted unmodified.
 */
function withAdditionalInstructions(base: string, additional?: string): string {
  if (!additional?.trim()) {
    return base;
  }

  return `${base}\n\nAdditional user instructions:\n${additional}`;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Parses the model's auto-mode reply into a format and content pair.
 *
 * The reply is expected to be a JSON object, possibly surrounded by stray text
 * or wrapped in a Markdown code fence. The whole reply is tried first; the
 * interior of the first code fence is only a fallback. Trying the fence first
 * would break valid replies whose Markdown `content` itself contains a fenced
 * code block, because the fence regex would match inside the JSON string.
 *
 * @param rawOutput - The raw text returned by the model.
 * @returns The validated format and the trimmed content.
 * @throws Error if no JSON object can be located or parsed, if the object does
 *   not match the expected shape, or if the content is blank after trimming.
 */
function parseAutoResponse(rawOutput: string): Omit<ConvertResult, "usage"> {
  const trimmedOutput = rawOutput.trim();

  // Candidates in order of preference: the full reply, then the fenced section.
  const candidates = [trimmedOutput];
  const fencedMatch = trimmedOutput.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fencedMatch?.[1]) {
    candidates.push(fencedMatch[1].trim());
  }

  let parsed: unknown;
  let hasParsed = false;
  let parseFailure: string | undefined;

  for (const candidate of candidates) {
    // Tolerate text around the object by slicing from the first "{" to the last "}".
    const firstBrace = candidate.indexOf("{");
    const lastBrace = candidate.lastIndexOf("}");
    if (firstBrace === -1 || lastBrace < firstBrace) {
      continue;
    }

    try {
      parsed = JSON.parse(candidate.slice(firstBrace, lastBrace + 1));
      hasParsed = true;
      break;
    } catch (error) {
      parseFailure = error instanceof Error ? error.message : String(error);
    }
  }

  if (!hasParsed) {
    if (parseFailure === undefined) {
      // No candidate even contained a brace-delimited span.
      throw new Error("Auto mode response is not valid JSON.");
    }
    throw new Error(`Failed to parse auto mode JSON response: ${parseFailure}`);
  }

  const validated = AUTO_RESPONSE_SCHEMA.safeParse(parsed);
  if (!validated.success) {
    throw new Error("Auto mode JSON must match { format: 'md' | 'txt', content: string }.");
  }

  // The schema only guarantees a non-empty string; reject whitespace-only content too.
  const content = validated.data.content.trim();
  if (!content) {
    throw new Error("Auto mode returned empty content.");
  }

  return { format: validated.data.format, content };
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import test from "node:test";
|
|
3
|
+
import { InvalidArgumentError } from "commander";
|
|
4
|
+
import {
|
|
5
|
+
defaultOutputPath,
|
|
6
|
+
formatDurationMs,
|
|
7
|
+
isPdfPath,
|
|
8
|
+
looksLikeFileOutput,
|
|
9
|
+
parseConcurrency,
|
|
10
|
+
parseFormat,
|
|
11
|
+
parseMode,
|
|
12
|
+
resolveFolderOutputPath,
|
|
13
|
+
truncate,
|
|
14
|
+
validateOptionCombination,
|
|
15
|
+
type CliOptions
|
|
16
|
+
} from "../src/cliHelpers.js";
|
|
17
|
+
|
|
18
|
+
test("parseMode accepts valid values", () => {
  // Each supported mode string must come back unchanged.
  for (const mode of ["auto", "prompt"]) {
    assert.equal(parseMode(mode), mode);
  }
});
|
|
22
|
+
|
|
23
|
+
test("parseMode rejects invalid values", () => {
  // Unknown mode names must surface as commander argument errors.
  const parseUnknownMode = (): unknown => parseMode("invalid");
  assert.throws(parseUnknownMode, InvalidArgumentError);
});
|
|
26
|
+
|
|
27
|
+
test("parseFormat accepts valid values", () => {
  // Each supported format string must come back unchanged.
  for (const format of ["md", "txt"]) {
    assert.equal(parseFormat(format), format);
  }
});
|
|
31
|
+
|
|
32
|
+
test("parseFormat rejects invalid values", () => {
  // Formats other than the supported ones must surface as commander argument errors.
  const parseUnknownFormat = (): unknown => parseFormat("json");
  assert.throws(parseUnknownFormat, InvalidArgumentError);
});
|
|
35
|
+
|
|
36
|
+
test("parseConcurrency accepts in-range integers", () => {
  // Lower and upper bound of the accepted range, as raw CLI strings.
  const accepted: Array<[string, number]> = [
    ["1", 1],
    ["100", 100]
  ];

  for (const [raw, expected] of accepted) {
    assert.equal(parseConcurrency(raw), expected);
  }
});
|
|
40
|
+
|
|
41
|
+
test("parseConcurrency rejects invalid values", () => {
  // Below range, above range, non-integer, and non-numeric input.
  const rejected = ["0", "101", "1.5", "abc"];

  for (const raw of rejected) {
    assert.throws(() => parseConcurrency(raw), InvalidArgumentError);
  }
});
|
|
47
|
+
|
|
48
|
+
test("validateOptionCombination enforces prompt mode requirements", () => {
  const promptModeOnly: CliOptions = {
    model: "gpt-4o-mini",
    mode: "prompt"
  };
  const withInlinePrompt: CliOptions = { ...promptModeOnly, prompt: "Convert this" };
  const withPromptFile: CliOptions = { ...promptModeOnly, promptFile: "./prompt.txt" };
  const withBothSources: CliOptions = { ...promptModeOnly, prompt: "x", promptFile: "./prompt.txt" };
  const withInstructions: CliOptions = { ...promptModeOnly, prompt: "x", instructions: "Extra" };

  const exactlyOneSource = /Prompt mode requires exactly one of --prompt or --prompt-file\./;

  // No prompt source at all is rejected.
  assert.throws(() => validateOptionCombination(promptModeOnly), exactlyOneSource);

  // Exactly one prompt source is fine, whichever it is.
  assert.doesNotThrow(() => validateOptionCombination(withInlinePrompt));
  assert.doesNotThrow(() => validateOptionCombination(withPromptFile));

  // Both prompt sources at once are rejected.
  assert.throws(() => validateOptionCombination(withBothSources), exactlyOneSource);

  // Instructions belong to auto mode only.
  assert.throws(
    () => validateOptionCombination(withInstructions),
    /--instructions is only supported in auto mode\./
  );
});
|
|
69
|
+
|
|
70
|
+
test("validateOptionCombination rejects prompt flags in auto mode", () => {
  const autoModeOnly: CliOptions = {
    model: "gpt-4o-mini",
    mode: "auto"
  };
  const promptFlagsRejected = /--prompt and --prompt-file are only supported in prompt mode\./;

  // Plain auto mode is valid.
  assert.doesNotThrow(() => validateOptionCombination(autoModeOnly));

  // Either prompt flag is rejected outside prompt mode.
  const withInlinePrompt: CliOptions = { ...autoModeOnly, prompt: "Convert" };
  assert.throws(() => validateOptionCombination(withInlinePrompt), promptFlagsRejected);

  const withPromptFile: CliOptions = { ...autoModeOnly, promptFile: "./prompt.txt" };
  assert.throws(() => validateOptionCombination(withPromptFile), promptFlagsRejected);
});
|
|
86
|
+
|
|
87
|
+
test("defaultOutputPath replaces .pdf extension and appends for other files", () => {
  // [input path, requested format, expected output path]
  const cases: Array<["md" | "txt", string, string]> = [
    ["md", "/tmp/input.pdf", "/tmp/input.md"],
    ["txt", "/tmp/input.PDF", "/tmp/input.txt"],
    ["md", "/tmp/input", "/tmp/input.md"]
  ];

  for (const [format, inputPath, expected] of cases) {
    assert.equal(defaultOutputPath(inputPath, format), expected);
  }
});
|
|
92
|
+
|
|
93
|
+
test("resolveFolderOutputPath preserves nested structure when output root is set", () => {
  const inputRoot = "/data/invoices";
  const outputRoot = "/exports";

  // A file nested below the input root keeps its relative sub-folders.
  const nestedOutput = resolveFolderOutputPath(
    "/data/invoices/2025/jan/file.pdf",
    inputRoot,
    outputRoot,
    "md"
  );
  assert.equal(nestedOutput, "/exports/2025/jan/file.md");

  // A file directly in the input root lands directly in the output root.
  const topLevelOutput = resolveFolderOutputPath("/data/invoices/file.pdf", inputRoot, outputRoot, "txt");
  assert.equal(topLevelOutput, "/exports/file.txt");
});
|
|
109
|
+
|
|
110
|
+
test("resolveFolderOutputPath falls back to default path when no output root", () => {
  // Without an output root the result sits next to the input file.
  const resolved = resolveFolderOutputPath("/data/invoices/file.pdf", "/data/invoices", undefined, "md");
  assert.equal(resolved, "/data/invoices/file.md");
});
|
|
116
|
+
|
|
117
|
+
test("isPdfPath and looksLikeFileOutput detect supported extensions case-insensitively", () => {
  // [path, expected isPdfPath result]
  const pdfCases: Array<[string, boolean]> = [
    ["report.pdf", true],
    ["report.PDF", true],
    ["report.txt", false]
  ];
  for (const [path, expected] of pdfCases) {
    assert.equal(isPdfPath(path), expected);
  }

  // [path, expected looksLikeFileOutput result]
  const outputCases: Array<[string, boolean]> = [
    ["out.md", true],
    ["out.TXT", true],
    ["out.json", false]
  ];
  for (const [path, expected] of outputCases) {
    assert.equal(looksLikeFileOutput(path), expected);
  }
});
|
|
126
|
+
|
|
127
|
+
test("truncate shortens long values and preserves short ones", () => {
  // [value, maximum length, expected result]
  const cases: Array<[string, number, string]> = [
    ["abcdef", 10, "abcdef"],
    ["abcdef", 3, "abc"],
    ["abcdefghij", 8, "abcde..."]
  ];

  for (const [value, maxLength, expected] of cases) {
    assert.equal(truncate(value, maxLength), expected);
  }
});
|
|
132
|
+
|
|
133
|
+
test("formatDurationMs formats to seconds with two decimals", () => {
  // [duration in milliseconds, expected label]
  const cases: Array<[number, string]> = [
    [0, "0.00s"],
    [1543, "1.54s"]
  ];

  for (const [durationMs, expected] of cases) {
    assert.equal(formatDurationMs(durationMs), expected);
  }
});
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2022",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"moduleResolution": "NodeNext",
|
|
6
|
+
"strict": true,
|
|
7
|
+
"skipLibCheck": true,
|
|
8
|
+
"forceConsistentCasingInFileNames": true,
|
|
9
|
+
"esModuleInterop": true,
|
|
10
|
+
"declaration": true,
|
|
11
|
+
"outDir": "dist",
|
|
12
|
+
"rootDir": "src"
|
|
13
|
+
},
|
|
14
|
+
"include": [
|
|
15
|
+
"src/**/*.ts"
|
|
16
|
+
]
|
|
17
|
+
}
|