@opendataloader/pdf 1.2.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NOTICE.md +1 -1
- package/README.md +52 -371
- package/dist/cli.cjs +140 -94
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +140 -94
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +104 -75
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -12
- package/dist/index.d.ts +48 -12
- package/dist/index.js +103 -75
- package/dist/index.js.map +1 -1
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +9 -10
package/dist/cli.cjs
CHANGED
|
@@ -31,12 +31,126 @@ var import_child_process = require("child_process");
|
|
|
31
31
|
var path = __toESM(require("path"), 1);
|
|
32
32
|
var fs = __toESM(require("fs"), 1);
|
|
33
33
|
var import_url = require("url");
|
|
34
|
+
|
|
35
|
+
// src/convert-options.generated.ts
|
|
36
|
+
function buildConvertOptions(cliOptions) {
|
|
37
|
+
const convertOptions = {};
|
|
38
|
+
if (cliOptions.outputDir) {
|
|
39
|
+
convertOptions.outputDir = cliOptions.outputDir;
|
|
40
|
+
}
|
|
41
|
+
if (cliOptions.password) {
|
|
42
|
+
convertOptions.password = cliOptions.password;
|
|
43
|
+
}
|
|
44
|
+
if (cliOptions.format) {
|
|
45
|
+
convertOptions.format = cliOptions.format;
|
|
46
|
+
}
|
|
47
|
+
if (cliOptions.quiet) {
|
|
48
|
+
convertOptions.quiet = true;
|
|
49
|
+
}
|
|
50
|
+
if (cliOptions.contentSafetyOff) {
|
|
51
|
+
convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
|
|
52
|
+
}
|
|
53
|
+
if (cliOptions.keepLineBreaks) {
|
|
54
|
+
convertOptions.keepLineBreaks = true;
|
|
55
|
+
}
|
|
56
|
+
if (cliOptions.replaceInvalidChars) {
|
|
57
|
+
convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
|
|
58
|
+
}
|
|
59
|
+
if (cliOptions.useStructTree) {
|
|
60
|
+
convertOptions.useStructTree = true;
|
|
61
|
+
}
|
|
62
|
+
if (cliOptions.tableMethod) {
|
|
63
|
+
convertOptions.tableMethod = cliOptions.tableMethod;
|
|
64
|
+
}
|
|
65
|
+
if (cliOptions.readingOrder) {
|
|
66
|
+
convertOptions.readingOrder = cliOptions.readingOrder;
|
|
67
|
+
}
|
|
68
|
+
if (cliOptions.markdownPageSeparator) {
|
|
69
|
+
convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
|
|
70
|
+
}
|
|
71
|
+
if (cliOptions.textPageSeparator) {
|
|
72
|
+
convertOptions.textPageSeparator = cliOptions.textPageSeparator;
|
|
73
|
+
}
|
|
74
|
+
if (cliOptions.htmlPageSeparator) {
|
|
75
|
+
convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
|
|
76
|
+
}
|
|
77
|
+
if (cliOptions.embedImages) {
|
|
78
|
+
convertOptions.embedImages = true;
|
|
79
|
+
}
|
|
80
|
+
if (cliOptions.imageFormat) {
|
|
81
|
+
convertOptions.imageFormat = cliOptions.imageFormat;
|
|
82
|
+
}
|
|
83
|
+
return convertOptions;
|
|
84
|
+
}
|
|
85
|
+
function buildArgs(options) {
|
|
86
|
+
const args = [];
|
|
87
|
+
if (options.outputDir) {
|
|
88
|
+
args.push("--output-dir", options.outputDir);
|
|
89
|
+
}
|
|
90
|
+
if (options.password) {
|
|
91
|
+
args.push("--password", options.password);
|
|
92
|
+
}
|
|
93
|
+
if (options.format) {
|
|
94
|
+
if (Array.isArray(options.format)) {
|
|
95
|
+
if (options.format.length > 0) {
|
|
96
|
+
args.push("--format", options.format.join(","));
|
|
97
|
+
}
|
|
98
|
+
} else {
|
|
99
|
+
args.push("--format", options.format);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
if (options.quiet) {
|
|
103
|
+
args.push("--quiet");
|
|
104
|
+
}
|
|
105
|
+
if (options.contentSafetyOff) {
|
|
106
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
107
|
+
if (options.contentSafetyOff.length > 0) {
|
|
108
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
109
|
+
}
|
|
110
|
+
} else {
|
|
111
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
if (options.keepLineBreaks) {
|
|
115
|
+
args.push("--keep-line-breaks");
|
|
116
|
+
}
|
|
117
|
+
if (options.replaceInvalidChars) {
|
|
118
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
119
|
+
}
|
|
120
|
+
if (options.useStructTree) {
|
|
121
|
+
args.push("--use-struct-tree");
|
|
122
|
+
}
|
|
123
|
+
if (options.tableMethod) {
|
|
124
|
+
args.push("--table-method", options.tableMethod);
|
|
125
|
+
}
|
|
126
|
+
if (options.readingOrder) {
|
|
127
|
+
args.push("--reading-order", options.readingOrder);
|
|
128
|
+
}
|
|
129
|
+
if (options.markdownPageSeparator) {
|
|
130
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
131
|
+
}
|
|
132
|
+
if (options.textPageSeparator) {
|
|
133
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
134
|
+
}
|
|
135
|
+
if (options.htmlPageSeparator) {
|
|
136
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
137
|
+
}
|
|
138
|
+
if (options.embedImages) {
|
|
139
|
+
args.push("--embed-images");
|
|
140
|
+
}
|
|
141
|
+
if (options.imageFormat) {
|
|
142
|
+
args.push("--image-format", options.imageFormat);
|
|
143
|
+
}
|
|
144
|
+
return args;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// src/index.ts
|
|
34
148
|
var import_meta = {};
|
|
35
149
|
var __filename = (0, import_url.fileURLToPath)(import_meta.url);
|
|
36
150
|
var __dirname = path.dirname(__filename);
|
|
37
151
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
38
152
|
function executeJar(args, executionOptions = {}) {
|
|
39
|
-
const {
|
|
153
|
+
const { streamOutput = false } = executionOptions;
|
|
40
154
|
return new Promise((resolve, reject) => {
|
|
41
155
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
42
156
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -90,67 +204,45 @@ ${errorOutput}`
|
|
|
90
204
|
});
|
|
91
205
|
}
|
|
92
206
|
function convert(inputPaths, options = {}) {
|
|
93
|
-
|
|
207
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
208
|
+
if (inputList.length === 0) {
|
|
94
209
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
95
210
|
}
|
|
96
|
-
for (const input of
|
|
211
|
+
for (const input of inputList) {
|
|
97
212
|
if (!fs.existsSync(input)) {
|
|
98
213
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
99
214
|
}
|
|
100
215
|
}
|
|
101
|
-
const args = [...
|
|
102
|
-
if (options.outputDir) {
|
|
103
|
-
args.push("--output-dir", options.outputDir);
|
|
104
|
-
}
|
|
105
|
-
if (options.password) {
|
|
106
|
-
args.push("--password", options.password);
|
|
107
|
-
}
|
|
108
|
-
if (options.format && options.format.length > 0) {
|
|
109
|
-
args.push("--format", ...options.format);
|
|
110
|
-
}
|
|
111
|
-
if (options.quiet) {
|
|
112
|
-
args.push("--quiet");
|
|
113
|
-
}
|
|
114
|
-
if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
|
|
115
|
-
args.push("--content-safety-off", ...options.contentSafetyOff);
|
|
116
|
-
}
|
|
117
|
-
if (options.keepLineBreaks) {
|
|
118
|
-
args.push("--keep-line-breaks");
|
|
119
|
-
}
|
|
120
|
-
if (options.replaceInvalidChars) {
|
|
121
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
122
|
-
}
|
|
123
|
-
if (options.useStructTree) {
|
|
124
|
-
args.push("--use-struct-tree");
|
|
125
|
-
}
|
|
216
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
126
217
|
return executeJar(args, {
|
|
127
218
|
streamOutput: !options.quiet
|
|
128
219
|
});
|
|
129
220
|
}
|
|
130
221
|
|
|
222
|
+
// src/cli-options.generated.ts
|
|
223
|
+
function registerCliOptions(program) {
|
|
224
|
+
program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
|
|
225
|
+
program.option("-p, --password <value>", "Password for encrypted PDF files");
|
|
226
|
+
program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
|
|
227
|
+
program.option("-q, --quiet", "Suppress console logging output");
|
|
228
|
+
program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
|
|
229
|
+
program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
|
|
230
|
+
program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
|
|
231
|
+
program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
|
|
232
|
+
program.option("--table-method <value>", "Table detection method. Values: cluster");
|
|
233
|
+
program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
|
|
234
|
+
program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
|
|
235
|
+
program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
|
|
236
|
+
program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
|
|
237
|
+
program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
|
|
238
|
+
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
239
|
+
}
|
|
240
|
+
|
|
131
241
|
// src/cli.ts
|
|
132
|
-
var VALID_FORMATS = /* @__PURE__ */ new Set([
|
|
133
|
-
"json",
|
|
134
|
-
"text",
|
|
135
|
-
"html",
|
|
136
|
-
"pdf",
|
|
137
|
-
"markdown",
|
|
138
|
-
"markdown-with-html",
|
|
139
|
-
"markdown-with-images"
|
|
140
|
-
]);
|
|
141
|
-
var VALID_CONTENT_SAFETY_MODES = /* @__PURE__ */ new Set([
|
|
142
|
-
"all",
|
|
143
|
-
"hidden-text",
|
|
144
|
-
"off-page",
|
|
145
|
-
"tiny",
|
|
146
|
-
"hidden-ocg"
|
|
147
|
-
]);
|
|
148
242
|
function createProgram() {
|
|
149
243
|
const program = new import_commander.Command();
|
|
150
|
-
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert")
|
|
151
|
-
|
|
152
|
-
"Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)"
|
|
153
|
-
).option("-q, --quiet", "Suppress CLI logging output").option("--content-safety-off <mode...>", "Disable one or more content safety filters").option("--keep-line-breaks", "Preserve line breaks in text output").option("--replace-invalid-chars <c>", "Replacement character for invalid characters").option("--use-struct-tree", "Enable processing structure tree (disabled by default)");
|
|
244
|
+
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
|
|
245
|
+
registerCliOptions(program);
|
|
154
246
|
program.configureOutput({
|
|
155
247
|
writeErr: (str) => {
|
|
156
248
|
console.error(str.trimEnd());
|
|
@@ -161,34 +253,6 @@ function createProgram() {
|
|
|
161
253
|
});
|
|
162
254
|
return program;
|
|
163
255
|
}
|
|
164
|
-
function buildConvertOptions(options) {
|
|
165
|
-
const convertOptions = {};
|
|
166
|
-
if (options.outputDir) {
|
|
167
|
-
convertOptions.outputDir = options.outputDir;
|
|
168
|
-
}
|
|
169
|
-
if (options.password) {
|
|
170
|
-
convertOptions.password = options.password;
|
|
171
|
-
}
|
|
172
|
-
if (options.format && options.format.length > 0) {
|
|
173
|
-
convertOptions.format = options.format;
|
|
174
|
-
}
|
|
175
|
-
if (options.quiet) {
|
|
176
|
-
convertOptions.quiet = true;
|
|
177
|
-
}
|
|
178
|
-
if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
|
|
179
|
-
convertOptions.contentSafetyOff = options.contentSafetyOff;
|
|
180
|
-
}
|
|
181
|
-
if (options.keepLineBreaks) {
|
|
182
|
-
convertOptions.keepLineBreaks = true;
|
|
183
|
-
}
|
|
184
|
-
if (options.replaceInvalidChars) {
|
|
185
|
-
convertOptions.replaceInvalidChars = options.replaceInvalidChars;
|
|
186
|
-
}
|
|
187
|
-
if (options.useStructTree) {
|
|
188
|
-
convertOptions.useStructTree = true;
|
|
189
|
-
}
|
|
190
|
-
return convertOptions;
|
|
191
|
-
}
|
|
192
256
|
async function main() {
|
|
193
257
|
const program = createProgram();
|
|
194
258
|
program.exitOverride();
|
|
@@ -208,24 +272,6 @@ async function main() {
|
|
|
208
272
|
}
|
|
209
273
|
const cliOptions = program.opts();
|
|
210
274
|
const inputPaths = program.args;
|
|
211
|
-
if (cliOptions.format) {
|
|
212
|
-
for (const value of cliOptions.format) {
|
|
213
|
-
if (!VALID_FORMATS.has(value)) {
|
|
214
|
-
console.error(`Invalid format '${value}'. See '--help' for allowed values.`);
|
|
215
|
-
console.error("Use '--help' to see available options.");
|
|
216
|
-
return 1;
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
if (cliOptions.contentSafetyOff) {
|
|
221
|
-
for (const value of cliOptions.contentSafetyOff) {
|
|
222
|
-
if (!VALID_CONTENT_SAFETY_MODES.has(value)) {
|
|
223
|
-
console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);
|
|
224
|
-
console.error("Use '--help' to see available options.");
|
|
225
|
-
return 1;
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
275
|
const convertOptions = buildConvertOptions(cliOptions);
|
|
230
276
|
try {
|
|
231
277
|
const output = await convert(inputPaths, convertOptions);
|
package/dist/cli.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/index.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert, ConvertOptions } from './index.js';\n\ninterface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string[];\n quiet?: boolean;\n contentSafetyOff?: string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nconst VALID_FORMATS = new Set([\n 'json',\n 'text',\n 'html',\n 'pdf',\n 'markdown',\n 'markdown-with-html',\n 'markdown-with-images',\n]);\n\nconst VALID_CONTENT_SAFETY_MODES = new Set([\n 'all',\n 'hidden-text',\n 'off-page',\n 'tiny',\n 'hidden-ocg',\n]);\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert')\n .option('-o, --output-dir <path>', 'Directory where outputs are written')\n .option('-p, --password <password>', 'Password for encrypted PDFs')\n .option(\n '-f, --format <value...>',\n 'Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)',\n )\n .option('-q, --quiet', 'Suppress CLI logging output')\n .option('--content-safety-off <mode...>', 'Disable one or more content safety filters')\n .option('--keep-line-breaks', 'Preserve line breaks in text output')\n .option('--replace-invalid-chars <c>', 'Replacement character for invalid characters')\n .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)');\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nfunction buildConvertOptions(options: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (options.outputDir) {\n convertOptions.outputDir = options.outputDir;\n }\n if (options.password) {\n convertOptions.password = options.password;\n }\n if (options.format && options.format.length > 0) {\n convertOptions.format = options.format;\n }\n if (options.quiet) {\n convertOptions.quiet = true;\n }\n if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {\n convertOptions.contentSafetyOff = options.contentSafetyOff;\n }\n if (options.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (options.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = options.replaceInvalidChars;\n }\n if (options.useStructTree) {\n convertOptions.useStructTree = true;\n }\n\n return convertOptions;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n\n if (cliOptions.format) {\n for (const value of cliOptions.format) {\n if (!VALID_FORMATS.has(value)) {\n console.error(`Invalid format '${value}'. See '--help' for allowed values.`);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n }\n }\n\n if (cliOptions.contentSafetyOff) {\n for (const value of cliOptions.contentSafetyOff) {\n if (!VALID_CONTENT_SAFETY_MODES.has(value)) {\n console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n }\n }\n\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n debug?: boolean;\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { debug = false, streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n if (options.noJson) {\n args.push('--no-json');\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n args.push(inputPath);\n executeJar(args, {\n debug: options.debug,\n streamOutput: Boolean(options.debug),\n })\n .then(resolve)\n .catch(reject);\n });\n}\n\nexport interface ConvertOptions {\n outputDir?: string;\n password?: string;\n format?: string[];\n quiet?: boolean;\n contentSafetyOff?: string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nexport function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {\n if (inputPaths.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputPaths) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputPaths];\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format && options.format.length > 0) {\n args.push('--format', ...options.format);\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', ...options.contentSafetyOff);\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;AAH9B;AAKA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAOjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,QAAQ,OAAO,eAAe,MAAM,IAAI;AAEhD,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAmFO,SAAS,QAAQ,YAAsB,UAA0B,CAAC,GAAoB;AAC3F,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,YAAY;AAC9B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,UAAU;AACrC,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,UAAU,QAAQ,OAAO,SAAS,GAAG;AAC/C,SAAK,KAAK,YAAY,GAAG,QAAQ,MAAM;AAAA,EACzC;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,oBAAoB,QAAQ,iBAAiB,SAAS,GAAG;AACnE,SAAK,KAAK,wBAAwB,GAAG,QAAQ,gBAAgB;AAAA,EAC/D;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;ADvLA,IAAM,gBAAgB,oBAAI,IAAI;AAAA,EAC5B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,IAAM,6BAA6B,oBAAI,IAAI;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC,EAC9D,OAAO,2BAA2B,qCAAqC,EACvE,OAAO,6BAA6B,6BAA6B,EACjE;AAAA,IACC;AAAA,IACA;AAAA,EACF,EACC,OAAO,eAAe,6BAA6B,EACnD,OAAO,kCAAkC,4CAA4C,EACrF,OAAO,sBAAsB,qCAAqC,EAClE,OAAO,+BAA+B,8CAA8C,EACpF,OAAO,qBAAqB,wDAAwD;AAEvF,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,SAAS,oBAAoB,SAAqC;AAChE,QAAM,iBAAiC,CAAC;AAExC,MAAI,QAAQ,WAAW;AACrB,mBAAe,YAAY,QAAQ;AAAA,EACrC;AACA,MAAI,QAAQ,UAAU;AACpB,mBAAe,WAAW,QAAQ;AAAA,EACpC;AACA,MAAI,QAAQ,UAAU,QAAQ,OAAO,SAAS,GAAG;AAC/C,mBAAe,SAAS,QAAQ;AAAA,EAClC;AACA,MAAI,QAAQ,OAAO;AACjB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,QAAQ,oBAAoB,QAAQ,iBAAiB,SAAS,GAAG;AACnE,mBAAe,mBAAmB,QAAQ;AAAA,EAC5C;AACA,MAAI,QAAQ,gBAAgB;AAC1B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,mBAAe,sBAAsB,QAAQ;AAAA,EAC/C;AACA,MAAI,QAAQ,eAAe;AACzB,mBAAe,gBAAgB;AAAA,EACjC;AAEA,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAE3B,MAAI,WAAW,QAAQ;AACrB,eAAW,SAAS,WAAW,QAAQ;AACrC,UAAI,CAAC,cAAc,IAAI,KAAK,GAAG;AAC7B,gBAAQ,MAAM,mBAAmB,KAAK,qCAAqC;AAC3E,gBAAQ,MAAM,wCAAwC;AACtD,eAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,MAAI,WAAW,kBAAkB;AAC/B,eAAW,SAAS,WAAW,kBAAkB;AAC/C,UAAI,CAAC,2BAA2B,IAAI,KAAK,GAAG;AAC1C,gBAAQ,MAAM,gCAAgC,KAAK,qCAAqC;AACxF,gBAAQ,MAAM,wCAAwC;AACtD,eAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: cluster */\n tableMethod?: string;\n /** Reading order algorithm. Values: none, xycut. Default: none */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Embed images as Base64 data URIs instead of file path references */\n embedImages?: boolean;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n embedImages?: boolean;\n imageFormat?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.embedImages) {\n convertOptions.embedImages = true;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.embedImages) {\n args.push('--embed-images');\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: cluster');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: none, xycut. Default: none');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--embed-images', 'Embed images as Base64 data URIs instead of file path references');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;AC4DvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc;AAAA,EAC/B;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,gBAAgB;AAAA,EAC5B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AAEA,SAAO;AACT;;;ADpLA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,yCAAyC;AAClF,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,kBAAkB,kEAAkE;AACnG,UAAQ,OAAO,0BAA0B,qEAAqE;AAChH;;;AHlBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -8,11 +8,125 @@ import { spawn } from "child_process";
|
|
|
8
8
|
import * as path from "path";
|
|
9
9
|
import * as fs from "fs";
|
|
10
10
|
import { fileURLToPath } from "url";
|
|
11
|
+
|
|
12
|
+
// src/convert-options.generated.ts
|
|
13
|
+
function buildConvertOptions(cliOptions) {
|
|
14
|
+
const convertOptions = {};
|
|
15
|
+
if (cliOptions.outputDir) {
|
|
16
|
+
convertOptions.outputDir = cliOptions.outputDir;
|
|
17
|
+
}
|
|
18
|
+
if (cliOptions.password) {
|
|
19
|
+
convertOptions.password = cliOptions.password;
|
|
20
|
+
}
|
|
21
|
+
if (cliOptions.format) {
|
|
22
|
+
convertOptions.format = cliOptions.format;
|
|
23
|
+
}
|
|
24
|
+
if (cliOptions.quiet) {
|
|
25
|
+
convertOptions.quiet = true;
|
|
26
|
+
}
|
|
27
|
+
if (cliOptions.contentSafetyOff) {
|
|
28
|
+
convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
|
|
29
|
+
}
|
|
30
|
+
if (cliOptions.keepLineBreaks) {
|
|
31
|
+
convertOptions.keepLineBreaks = true;
|
|
32
|
+
}
|
|
33
|
+
if (cliOptions.replaceInvalidChars) {
|
|
34
|
+
convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
|
|
35
|
+
}
|
|
36
|
+
if (cliOptions.useStructTree) {
|
|
37
|
+
convertOptions.useStructTree = true;
|
|
38
|
+
}
|
|
39
|
+
if (cliOptions.tableMethod) {
|
|
40
|
+
convertOptions.tableMethod = cliOptions.tableMethod;
|
|
41
|
+
}
|
|
42
|
+
if (cliOptions.readingOrder) {
|
|
43
|
+
convertOptions.readingOrder = cliOptions.readingOrder;
|
|
44
|
+
}
|
|
45
|
+
if (cliOptions.markdownPageSeparator) {
|
|
46
|
+
convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
|
|
47
|
+
}
|
|
48
|
+
if (cliOptions.textPageSeparator) {
|
|
49
|
+
convertOptions.textPageSeparator = cliOptions.textPageSeparator;
|
|
50
|
+
}
|
|
51
|
+
if (cliOptions.htmlPageSeparator) {
|
|
52
|
+
convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
|
|
53
|
+
}
|
|
54
|
+
if (cliOptions.embedImages) {
|
|
55
|
+
convertOptions.embedImages = true;
|
|
56
|
+
}
|
|
57
|
+
if (cliOptions.imageFormat) {
|
|
58
|
+
convertOptions.imageFormat = cliOptions.imageFormat;
|
|
59
|
+
}
|
|
60
|
+
return convertOptions;
|
|
61
|
+
}
|
|
62
|
+
function buildArgs(options) {
|
|
63
|
+
const args = [];
|
|
64
|
+
if (options.outputDir) {
|
|
65
|
+
args.push("--output-dir", options.outputDir);
|
|
66
|
+
}
|
|
67
|
+
if (options.password) {
|
|
68
|
+
args.push("--password", options.password);
|
|
69
|
+
}
|
|
70
|
+
if (options.format) {
|
|
71
|
+
if (Array.isArray(options.format)) {
|
|
72
|
+
if (options.format.length > 0) {
|
|
73
|
+
args.push("--format", options.format.join(","));
|
|
74
|
+
}
|
|
75
|
+
} else {
|
|
76
|
+
args.push("--format", options.format);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
if (options.quiet) {
|
|
80
|
+
args.push("--quiet");
|
|
81
|
+
}
|
|
82
|
+
if (options.contentSafetyOff) {
|
|
83
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
84
|
+
if (options.contentSafetyOff.length > 0) {
|
|
85
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
86
|
+
}
|
|
87
|
+
} else {
|
|
88
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
if (options.keepLineBreaks) {
|
|
92
|
+
args.push("--keep-line-breaks");
|
|
93
|
+
}
|
|
94
|
+
if (options.replaceInvalidChars) {
|
|
95
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
96
|
+
}
|
|
97
|
+
if (options.useStructTree) {
|
|
98
|
+
args.push("--use-struct-tree");
|
|
99
|
+
}
|
|
100
|
+
if (options.tableMethod) {
|
|
101
|
+
args.push("--table-method", options.tableMethod);
|
|
102
|
+
}
|
|
103
|
+
if (options.readingOrder) {
|
|
104
|
+
args.push("--reading-order", options.readingOrder);
|
|
105
|
+
}
|
|
106
|
+
if (options.markdownPageSeparator) {
|
|
107
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
108
|
+
}
|
|
109
|
+
if (options.textPageSeparator) {
|
|
110
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
111
|
+
}
|
|
112
|
+
if (options.htmlPageSeparator) {
|
|
113
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
114
|
+
}
|
|
115
|
+
if (options.embedImages) {
|
|
116
|
+
args.push("--embed-images");
|
|
117
|
+
}
|
|
118
|
+
if (options.imageFormat) {
|
|
119
|
+
args.push("--image-format", options.imageFormat);
|
|
120
|
+
}
|
|
121
|
+
return args;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// src/index.ts
|
|
11
125
|
var __filename = fileURLToPath(import.meta.url);
|
|
12
126
|
var __dirname = path.dirname(__filename);
|
|
13
127
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
14
128
|
function executeJar(args, executionOptions = {}) {
|
|
15
|
-
const {
|
|
129
|
+
const { streamOutput = false } = executionOptions;
|
|
16
130
|
return new Promise((resolve, reject) => {
|
|
17
131
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
18
132
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -66,67 +180,45 @@ ${errorOutput}`
|
|
|
66
180
|
});
|
|
67
181
|
}
|
|
68
182
|
function convert(inputPaths, options = {}) {
|
|
69
|
-
|
|
183
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
184
|
+
if (inputList.length === 0) {
|
|
70
185
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
71
186
|
}
|
|
72
|
-
for (const input of
|
|
187
|
+
for (const input of inputList) {
|
|
73
188
|
if (!fs.existsSync(input)) {
|
|
74
189
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
75
190
|
}
|
|
76
191
|
}
|
|
77
|
-
const args = [...
|
|
78
|
-
if (options.outputDir) {
|
|
79
|
-
args.push("--output-dir", options.outputDir);
|
|
80
|
-
}
|
|
81
|
-
if (options.password) {
|
|
82
|
-
args.push("--password", options.password);
|
|
83
|
-
}
|
|
84
|
-
if (options.format && options.format.length > 0) {
|
|
85
|
-
args.push("--format", ...options.format);
|
|
86
|
-
}
|
|
87
|
-
if (options.quiet) {
|
|
88
|
-
args.push("--quiet");
|
|
89
|
-
}
|
|
90
|
-
if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
|
|
91
|
-
args.push("--content-safety-off", ...options.contentSafetyOff);
|
|
92
|
-
}
|
|
93
|
-
if (options.keepLineBreaks) {
|
|
94
|
-
args.push("--keep-line-breaks");
|
|
95
|
-
}
|
|
96
|
-
if (options.replaceInvalidChars) {
|
|
97
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
98
|
-
}
|
|
99
|
-
if (options.useStructTree) {
|
|
100
|
-
args.push("--use-struct-tree");
|
|
101
|
-
}
|
|
192
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
102
193
|
return executeJar(args, {
|
|
103
194
|
streamOutput: !options.quiet
|
|
104
195
|
});
|
|
105
196
|
}
|
|
106
197
|
|
|
198
|
+
// src/cli-options.generated.ts
|
|
199
|
+
function registerCliOptions(program) {
|
|
200
|
+
program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
|
|
201
|
+
program.option("-p, --password <value>", "Password for encrypted PDF files");
|
|
202
|
+
program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
|
|
203
|
+
program.option("-q, --quiet", "Suppress console logging output");
|
|
204
|
+
program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
|
|
205
|
+
program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
|
|
206
|
+
program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
|
|
207
|
+
program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
|
|
208
|
+
program.option("--table-method <value>", "Table detection method. Values: cluster");
|
|
209
|
+
program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
|
|
210
|
+
program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
|
|
211
|
+
program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
|
|
212
|
+
program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
|
|
213
|
+
program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
|
|
214
|
+
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
215
|
+
}
|
|
216
|
+
|
|
107
217
|
// src/cli.ts
|
|
108
|
-
var VALID_FORMATS = /* @__PURE__ */ new Set([
|
|
109
|
-
"json",
|
|
110
|
-
"text",
|
|
111
|
-
"html",
|
|
112
|
-
"pdf",
|
|
113
|
-
"markdown",
|
|
114
|
-
"markdown-with-html",
|
|
115
|
-
"markdown-with-images"
|
|
116
|
-
]);
|
|
117
|
-
var VALID_CONTENT_SAFETY_MODES = /* @__PURE__ */ new Set([
|
|
118
|
-
"all",
|
|
119
|
-
"hidden-text",
|
|
120
|
-
"off-page",
|
|
121
|
-
"tiny",
|
|
122
|
-
"hidden-ocg"
|
|
123
|
-
]);
|
|
124
218
|
function createProgram() {
|
|
125
219
|
const program = new Command();
|
|
126
|
-
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert")
|
|
127
|
-
|
|
128
|
-
"Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)"
|
|
129
|
-
).option("-q, --quiet", "Suppress CLI logging output").option("--content-safety-off <mode...>", "Disable one or more content safety filters").option("--keep-line-breaks", "Preserve line breaks in text output").option("--replace-invalid-chars <c>", "Replacement character for invalid characters").option("--use-struct-tree", "Enable processing structure tree (disabled by default)");
|
|
220
|
+
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
|
|
221
|
+
registerCliOptions(program);
|
|
130
222
|
program.configureOutput({
|
|
131
223
|
writeErr: (str) => {
|
|
132
224
|
console.error(str.trimEnd());
|
|
@@ -137,34 +229,6 @@ function createProgram() {
|
|
|
137
229
|
});
|
|
138
230
|
return program;
|
|
139
231
|
}
|
|
140
|
-
function buildConvertOptions(options) {
|
|
141
|
-
const convertOptions = {};
|
|
142
|
-
if (options.outputDir) {
|
|
143
|
-
convertOptions.outputDir = options.outputDir;
|
|
144
|
-
}
|
|
145
|
-
if (options.password) {
|
|
146
|
-
convertOptions.password = options.password;
|
|
147
|
-
}
|
|
148
|
-
if (options.format && options.format.length > 0) {
|
|
149
|
-
convertOptions.format = options.format;
|
|
150
|
-
}
|
|
151
|
-
if (options.quiet) {
|
|
152
|
-
convertOptions.quiet = true;
|
|
153
|
-
}
|
|
154
|
-
if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
|
|
155
|
-
convertOptions.contentSafetyOff = options.contentSafetyOff;
|
|
156
|
-
}
|
|
157
|
-
if (options.keepLineBreaks) {
|
|
158
|
-
convertOptions.keepLineBreaks = true;
|
|
159
|
-
}
|
|
160
|
-
if (options.replaceInvalidChars) {
|
|
161
|
-
convertOptions.replaceInvalidChars = options.replaceInvalidChars;
|
|
162
|
-
}
|
|
163
|
-
if (options.useStructTree) {
|
|
164
|
-
convertOptions.useStructTree = true;
|
|
165
|
-
}
|
|
166
|
-
return convertOptions;
|
|
167
|
-
}
|
|
168
232
|
async function main() {
|
|
169
233
|
const program = createProgram();
|
|
170
234
|
program.exitOverride();
|
|
@@ -184,24 +248,6 @@ async function main() {
|
|
|
184
248
|
}
|
|
185
249
|
const cliOptions = program.opts();
|
|
186
250
|
const inputPaths = program.args;
|
|
187
|
-
if (cliOptions.format) {
|
|
188
|
-
for (const value of cliOptions.format) {
|
|
189
|
-
if (!VALID_FORMATS.has(value)) {
|
|
190
|
-
console.error(`Invalid format '${value}'. See '--help' for allowed values.`);
|
|
191
|
-
console.error("Use '--help' to see available options.");
|
|
192
|
-
return 1;
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
if (cliOptions.contentSafetyOff) {
|
|
197
|
-
for (const value of cliOptions.contentSafetyOff) {
|
|
198
|
-
if (!VALID_CONTENT_SAFETY_MODES.has(value)) {
|
|
199
|
-
console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);
|
|
200
|
-
console.error("Use '--help' to see available options.");
|
|
201
|
-
return 1;
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
}
|
|
205
251
|
const convertOptions = buildConvertOptions(cliOptions);
|
|
206
252
|
try {
|
|
207
253
|
const output = await convert(inputPaths, convertOptions);
|