@opendataloader/pdf 1.3.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NOTICE.md +1 -1
- package/README.md +193 -369
- package/dist/cli.cjs +140 -65
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +140 -65
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +102 -81
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -12
- package/dist/index.d.ts +48 -12
- package/dist/index.js +101 -81
- package/dist/index.js.map +1 -1
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +2 -2
package/dist/cli.cjs
CHANGED
|
@@ -31,12 +31,126 @@ var import_child_process = require("child_process");
|
|
|
31
31
|
var path = __toESM(require("path"), 1);
|
|
32
32
|
var fs = __toESM(require("fs"), 1);
|
|
33
33
|
var import_url = require("url");
|
|
34
|
+
|
|
35
|
+
// src/convert-options.generated.ts
|
|
36
|
+
function buildConvertOptions(cliOptions) {
|
|
37
|
+
const convertOptions = {};
|
|
38
|
+
if (cliOptions.outputDir) {
|
|
39
|
+
convertOptions.outputDir = cliOptions.outputDir;
|
|
40
|
+
}
|
|
41
|
+
if (cliOptions.password) {
|
|
42
|
+
convertOptions.password = cliOptions.password;
|
|
43
|
+
}
|
|
44
|
+
if (cliOptions.format) {
|
|
45
|
+
convertOptions.format = cliOptions.format;
|
|
46
|
+
}
|
|
47
|
+
if (cliOptions.quiet) {
|
|
48
|
+
convertOptions.quiet = true;
|
|
49
|
+
}
|
|
50
|
+
if (cliOptions.contentSafetyOff) {
|
|
51
|
+
convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
|
|
52
|
+
}
|
|
53
|
+
if (cliOptions.keepLineBreaks) {
|
|
54
|
+
convertOptions.keepLineBreaks = true;
|
|
55
|
+
}
|
|
56
|
+
if (cliOptions.replaceInvalidChars) {
|
|
57
|
+
convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
|
|
58
|
+
}
|
|
59
|
+
if (cliOptions.useStructTree) {
|
|
60
|
+
convertOptions.useStructTree = true;
|
|
61
|
+
}
|
|
62
|
+
if (cliOptions.tableMethod) {
|
|
63
|
+
convertOptions.tableMethod = cliOptions.tableMethod;
|
|
64
|
+
}
|
|
65
|
+
if (cliOptions.readingOrder) {
|
|
66
|
+
convertOptions.readingOrder = cliOptions.readingOrder;
|
|
67
|
+
}
|
|
68
|
+
if (cliOptions.markdownPageSeparator) {
|
|
69
|
+
convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
|
|
70
|
+
}
|
|
71
|
+
if (cliOptions.textPageSeparator) {
|
|
72
|
+
convertOptions.textPageSeparator = cliOptions.textPageSeparator;
|
|
73
|
+
}
|
|
74
|
+
if (cliOptions.htmlPageSeparator) {
|
|
75
|
+
convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
|
|
76
|
+
}
|
|
77
|
+
if (cliOptions.embedImages) {
|
|
78
|
+
convertOptions.embedImages = true;
|
|
79
|
+
}
|
|
80
|
+
if (cliOptions.imageFormat) {
|
|
81
|
+
convertOptions.imageFormat = cliOptions.imageFormat;
|
|
82
|
+
}
|
|
83
|
+
return convertOptions;
|
|
84
|
+
}
|
|
85
|
+
function buildArgs(options) {
|
|
86
|
+
const args = [];
|
|
87
|
+
if (options.outputDir) {
|
|
88
|
+
args.push("--output-dir", options.outputDir);
|
|
89
|
+
}
|
|
90
|
+
if (options.password) {
|
|
91
|
+
args.push("--password", options.password);
|
|
92
|
+
}
|
|
93
|
+
if (options.format) {
|
|
94
|
+
if (Array.isArray(options.format)) {
|
|
95
|
+
if (options.format.length > 0) {
|
|
96
|
+
args.push("--format", options.format.join(","));
|
|
97
|
+
}
|
|
98
|
+
} else {
|
|
99
|
+
args.push("--format", options.format);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
if (options.quiet) {
|
|
103
|
+
args.push("--quiet");
|
|
104
|
+
}
|
|
105
|
+
if (options.contentSafetyOff) {
|
|
106
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
107
|
+
if (options.contentSafetyOff.length > 0) {
|
|
108
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
109
|
+
}
|
|
110
|
+
} else {
|
|
111
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
if (options.keepLineBreaks) {
|
|
115
|
+
args.push("--keep-line-breaks");
|
|
116
|
+
}
|
|
117
|
+
if (options.replaceInvalidChars) {
|
|
118
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
119
|
+
}
|
|
120
|
+
if (options.useStructTree) {
|
|
121
|
+
args.push("--use-struct-tree");
|
|
122
|
+
}
|
|
123
|
+
if (options.tableMethod) {
|
|
124
|
+
args.push("--table-method", options.tableMethod);
|
|
125
|
+
}
|
|
126
|
+
if (options.readingOrder) {
|
|
127
|
+
args.push("--reading-order", options.readingOrder);
|
|
128
|
+
}
|
|
129
|
+
if (options.markdownPageSeparator) {
|
|
130
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
131
|
+
}
|
|
132
|
+
if (options.textPageSeparator) {
|
|
133
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
134
|
+
}
|
|
135
|
+
if (options.htmlPageSeparator) {
|
|
136
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
137
|
+
}
|
|
138
|
+
if (options.embedImages) {
|
|
139
|
+
args.push("--embed-images");
|
|
140
|
+
}
|
|
141
|
+
if (options.imageFormat) {
|
|
142
|
+
args.push("--image-format", options.imageFormat);
|
|
143
|
+
}
|
|
144
|
+
return args;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// src/index.ts
|
|
34
148
|
var import_meta = {};
|
|
35
149
|
var __filename = (0, import_url.fileURLToPath)(import_meta.url);
|
|
36
150
|
var __dirname = path.dirname(__filename);
|
|
37
151
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
38
152
|
function executeJar(args, executionOptions = {}) {
|
|
39
|
-
const {
|
|
153
|
+
const { streamOutput = false } = executionOptions;
|
|
40
154
|
return new Promise((resolve, reject) => {
|
|
41
155
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
42
156
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -90,56 +204,45 @@ ${errorOutput}`
|
|
|
90
204
|
});
|
|
91
205
|
}
|
|
92
206
|
function convert(inputPaths, options = {}) {
|
|
93
|
-
|
|
207
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
208
|
+
if (inputList.length === 0) {
|
|
94
209
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
95
210
|
}
|
|
96
|
-
for (const input of
|
|
211
|
+
for (const input of inputList) {
|
|
97
212
|
if (!fs.existsSync(input)) {
|
|
98
213
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
99
214
|
}
|
|
100
215
|
}
|
|
101
|
-
const args = [...
|
|
102
|
-
if (options.outputDir) {
|
|
103
|
-
args.push("--output-dir", options.outputDir);
|
|
104
|
-
}
|
|
105
|
-
if (options.password) {
|
|
106
|
-
args.push("--password", options.password);
|
|
107
|
-
}
|
|
108
|
-
if (options.format) {
|
|
109
|
-
if (Array.isArray(options.format)) {
|
|
110
|
-
args.push("--format", options.format.join(","));
|
|
111
|
-
} else {
|
|
112
|
-
args.push("--format", options.format);
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
if (options.quiet) {
|
|
116
|
-
args.push("--quiet");
|
|
117
|
-
}
|
|
118
|
-
if (options.contentSafetyOff) {
|
|
119
|
-
if (Array.isArray(options.contentSafetyOff)) {
|
|
120
|
-
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
121
|
-
} else {
|
|
122
|
-
args.push("--content-safety-off", options.contentSafetyOff);
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
if (options.keepLineBreaks) {
|
|
126
|
-
args.push("--keep-line-breaks");
|
|
127
|
-
}
|
|
128
|
-
if (options.replaceInvalidChars) {
|
|
129
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
130
|
-
}
|
|
131
|
-
if (options.useStructTree) {
|
|
132
|
-
args.push("--use-struct-tree");
|
|
133
|
-
}
|
|
216
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
134
217
|
return executeJar(args, {
|
|
135
218
|
streamOutput: !options.quiet
|
|
136
219
|
});
|
|
137
220
|
}
|
|
138
221
|
|
|
222
|
+
// src/cli-options.generated.ts
|
|
223
|
+
function registerCliOptions(program) {
|
|
224
|
+
program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
|
|
225
|
+
program.option("-p, --password <value>", "Password for encrypted PDF files");
|
|
226
|
+
program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
|
|
227
|
+
program.option("-q, --quiet", "Suppress console logging output");
|
|
228
|
+
program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
|
|
229
|
+
program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
|
|
230
|
+
program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
|
|
231
|
+
program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
|
|
232
|
+
program.option("--table-method <value>", "Table detection method. Values: cluster");
|
|
233
|
+
program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
|
|
234
|
+
program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
|
|
235
|
+
program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
|
|
236
|
+
program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
|
|
237
|
+
program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
|
|
238
|
+
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
239
|
+
}
|
|
240
|
+
|
|
139
241
|
// src/cli.ts
|
|
140
242
|
function createProgram() {
|
|
141
243
|
const program = new import_commander.Command();
|
|
142
|
-
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert")
|
|
244
|
+
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
|
|
245
|
+
registerCliOptions(program);
|
|
143
246
|
program.configureOutput({
|
|
144
247
|
writeErr: (str) => {
|
|
145
248
|
console.error(str.trimEnd());
|
|
@@ -150,34 +253,6 @@ function createProgram() {
|
|
|
150
253
|
});
|
|
151
254
|
return program;
|
|
152
255
|
}
|
|
153
|
-
function buildConvertOptions(options) {
|
|
154
|
-
const convertOptions = {};
|
|
155
|
-
if (options.outputDir) {
|
|
156
|
-
convertOptions.outputDir = options.outputDir;
|
|
157
|
-
}
|
|
158
|
-
if (options.password) {
|
|
159
|
-
convertOptions.password = options.password;
|
|
160
|
-
}
|
|
161
|
-
if (options.format) {
|
|
162
|
-
convertOptions.format = options.format;
|
|
163
|
-
}
|
|
164
|
-
if (options.quiet) {
|
|
165
|
-
convertOptions.quiet = true;
|
|
166
|
-
}
|
|
167
|
-
if (options.contentSafetyOff) {
|
|
168
|
-
convertOptions.contentSafetyOff = options.contentSafetyOff;
|
|
169
|
-
}
|
|
170
|
-
if (options.keepLineBreaks) {
|
|
171
|
-
convertOptions.keepLineBreaks = true;
|
|
172
|
-
}
|
|
173
|
-
if (options.replaceInvalidChars) {
|
|
174
|
-
convertOptions.replaceInvalidChars = options.replaceInvalidChars;
|
|
175
|
-
}
|
|
176
|
-
if (options.useStructTree) {
|
|
177
|
-
convertOptions.useStructTree = true;
|
|
178
|
-
}
|
|
179
|
-
return convertOptions;
|
|
180
|
-
}
|
|
181
256
|
async function main() {
|
|
182
257
|
const program = createProgram();
|
|
183
258
|
program.exitOverride();
|
package/dist/cli.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/index.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert, ConvertOptions } from './index.js';\n\ninterface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert')\n .option('-o, --output-dir <path>', 'Directory where outputs are written')\n .option('-p, --password <password>', 'Password for encrypted PDFs')\n .option('-f, --format <format>', 'Comma-separated output format(s) to generate.')\n .option('-q, --quiet', 'Suppress CLI logging output')\n .option('--content-safety-off <modes>', 'Comma-separated content safety filters to disable.')\n .option('--keep-line-breaks', 'Preserve line breaks in text output')\n .option('--replace-invalid-chars <c>', 'Replacement character for invalid characters')\n .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)');\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nfunction buildConvertOptions(options: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (options.outputDir) {\n convertOptions.outputDir = options.outputDir;\n }\n if (options.password) {\n convertOptions.password = options.password;\n }\n if (options.format) {\n convertOptions.format = options.format;\n }\n if (options.quiet) {\n convertOptions.quiet = true;\n }\n if (options.contentSafetyOff) {\n convertOptions.contentSafetyOff = options.contentSafetyOff;\n }\n if (options.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (options.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = options.replaceInvalidChars;\n }\n if (options.useStructTree) {\n convertOptions.useStructTree = true;\n }\n\n return convertOptions;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n debug?: boolean;\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { debug = false, streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n if (options.noJson) {\n args.push('--no-json');\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n args.push(inputPath);\n executeJar(args, {\n debug: options.debug,\n streamOutput: Boolean(options.debug),\n })\n .then(resolve)\n .catch(reject);\n });\n}\n\nexport interface ConvertOptions {\n outputDir?: string;\n password?: string;\n format?: string | string[];\n quiet?: boolean;\n contentSafetyOff?: string | string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nexport function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {\n if (inputPaths.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputPaths) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputPaths];\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n args.push('--format', options.format.join(','));\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;AAH9B;AAKA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAOjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,QAAQ,OAAO,eAAe,MAAM,IAAI;AAEhD,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAmFO,SAAS,QAAQ,YAAsB,UAA0B,CAAC,GAAoB;AAC3F,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,YAAY;AAC9B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,UAAU;AACrC,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,WAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,IAChD,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,WAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,IACtE,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AD/LA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC,EAC9D,OAAO,2BAA2B,qCAAqC,EACvE,OAAO,6BAA6B,6BAA6B,EACjE,OAAO,yBAAyB,+CAA+C,EAC/E,OAAO,eAAe,6BAA6B,EACnD,OAAO,gCAAgC,oDAAoD,EAC3F,OAAO,sBAAsB,qCAAqC,EAClE,OAAO,+BAA+B,8CAA8C,EACpF,OAAO,qBAAqB,wDAAwD;AAEvF,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,SAAS,oBAAoB,SAAqC;AAChE,QAAM,iBAAiC,CAAC;AAExC,MAAI,QAAQ,WAAW;AACrB,mBAAe,YAAY,QAAQ;AAAA,EACrC;AACA,MAAI,QAAQ,UAAU;AACpB,mBAAe,WAAW,QAAQ;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,mBAAe,SAAS,QAAQ;AAAA,EAClC;AACA,MAAI,QAAQ,OAAO;AACjB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,mBAAe,mBAAmB,QAAQ;AAAA,EAC5C;AACA,MAAI,QAAQ,gBAAgB;AAC1B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,mBAAe,sBAAsB,QAAQ;AAAA,EAC/C;AACA,MAAI,QAAQ,eAAe;AACzB,mBAAe,gBAAgB;AAAA,EACjC;AAEA,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: cluster */\n tableMethod?: string;\n /** Reading order algorithm. Values: none, xycut. Default: none */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Embed images as Base64 data URIs instead of file path references */\n embedImages?: boolean;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n embedImages?: boolean;\n imageFormat?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.embedImages) {\n convertOptions.embedImages = true;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.embedImages) {\n args.push('--embed-images');\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: cluster');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: none, xycut. Default: none');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--embed-images', 'Embed images as Base64 data URIs instead of file path references');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;AC4DvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc;AAAA,EAC/B;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,gBAAgB;AAAA,EAC5B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AAEA,SAAO;AACT;;;ADpLA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,yCAAyC;AAClF,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,kBAAkB,kEAAkE;AACnG,UAAQ,OAAO,0BAA0B,qEAAqE;AAChH;;;AHlBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -8,11 +8,125 @@ import { spawn } from "child_process";
|
|
|
8
8
|
import * as path from "path";
|
|
9
9
|
import * as fs from "fs";
|
|
10
10
|
import { fileURLToPath } from "url";
|
|
11
|
+
|
|
12
|
+
// src/convert-options.generated.ts
|
|
13
|
+
function buildConvertOptions(cliOptions) {
|
|
14
|
+
const convertOptions = {};
|
|
15
|
+
if (cliOptions.outputDir) {
|
|
16
|
+
convertOptions.outputDir = cliOptions.outputDir;
|
|
17
|
+
}
|
|
18
|
+
if (cliOptions.password) {
|
|
19
|
+
convertOptions.password = cliOptions.password;
|
|
20
|
+
}
|
|
21
|
+
if (cliOptions.format) {
|
|
22
|
+
convertOptions.format = cliOptions.format;
|
|
23
|
+
}
|
|
24
|
+
if (cliOptions.quiet) {
|
|
25
|
+
convertOptions.quiet = true;
|
|
26
|
+
}
|
|
27
|
+
if (cliOptions.contentSafetyOff) {
|
|
28
|
+
convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
|
|
29
|
+
}
|
|
30
|
+
if (cliOptions.keepLineBreaks) {
|
|
31
|
+
convertOptions.keepLineBreaks = true;
|
|
32
|
+
}
|
|
33
|
+
if (cliOptions.replaceInvalidChars) {
|
|
34
|
+
convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
|
|
35
|
+
}
|
|
36
|
+
if (cliOptions.useStructTree) {
|
|
37
|
+
convertOptions.useStructTree = true;
|
|
38
|
+
}
|
|
39
|
+
if (cliOptions.tableMethod) {
|
|
40
|
+
convertOptions.tableMethod = cliOptions.tableMethod;
|
|
41
|
+
}
|
|
42
|
+
if (cliOptions.readingOrder) {
|
|
43
|
+
convertOptions.readingOrder = cliOptions.readingOrder;
|
|
44
|
+
}
|
|
45
|
+
if (cliOptions.markdownPageSeparator) {
|
|
46
|
+
convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
|
|
47
|
+
}
|
|
48
|
+
if (cliOptions.textPageSeparator) {
|
|
49
|
+
convertOptions.textPageSeparator = cliOptions.textPageSeparator;
|
|
50
|
+
}
|
|
51
|
+
if (cliOptions.htmlPageSeparator) {
|
|
52
|
+
convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
|
|
53
|
+
}
|
|
54
|
+
if (cliOptions.embedImages) {
|
|
55
|
+
convertOptions.embedImages = true;
|
|
56
|
+
}
|
|
57
|
+
if (cliOptions.imageFormat) {
|
|
58
|
+
convertOptions.imageFormat = cliOptions.imageFormat;
|
|
59
|
+
}
|
|
60
|
+
return convertOptions;
|
|
61
|
+
}
|
|
62
|
+
function buildArgs(options) {
|
|
63
|
+
const args = [];
|
|
64
|
+
if (options.outputDir) {
|
|
65
|
+
args.push("--output-dir", options.outputDir);
|
|
66
|
+
}
|
|
67
|
+
if (options.password) {
|
|
68
|
+
args.push("--password", options.password);
|
|
69
|
+
}
|
|
70
|
+
if (options.format) {
|
|
71
|
+
if (Array.isArray(options.format)) {
|
|
72
|
+
if (options.format.length > 0) {
|
|
73
|
+
args.push("--format", options.format.join(","));
|
|
74
|
+
}
|
|
75
|
+
} else {
|
|
76
|
+
args.push("--format", options.format);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
if (options.quiet) {
|
|
80
|
+
args.push("--quiet");
|
|
81
|
+
}
|
|
82
|
+
if (options.contentSafetyOff) {
|
|
83
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
84
|
+
if (options.contentSafetyOff.length > 0) {
|
|
85
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
86
|
+
}
|
|
87
|
+
} else {
|
|
88
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
if (options.keepLineBreaks) {
|
|
92
|
+
args.push("--keep-line-breaks");
|
|
93
|
+
}
|
|
94
|
+
if (options.replaceInvalidChars) {
|
|
95
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
96
|
+
}
|
|
97
|
+
if (options.useStructTree) {
|
|
98
|
+
args.push("--use-struct-tree");
|
|
99
|
+
}
|
|
100
|
+
if (options.tableMethod) {
|
|
101
|
+
args.push("--table-method", options.tableMethod);
|
|
102
|
+
}
|
|
103
|
+
if (options.readingOrder) {
|
|
104
|
+
args.push("--reading-order", options.readingOrder);
|
|
105
|
+
}
|
|
106
|
+
if (options.markdownPageSeparator) {
|
|
107
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
108
|
+
}
|
|
109
|
+
if (options.textPageSeparator) {
|
|
110
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
111
|
+
}
|
|
112
|
+
if (options.htmlPageSeparator) {
|
|
113
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
114
|
+
}
|
|
115
|
+
if (options.embedImages) {
|
|
116
|
+
args.push("--embed-images");
|
|
117
|
+
}
|
|
118
|
+
if (options.imageFormat) {
|
|
119
|
+
args.push("--image-format", options.imageFormat);
|
|
120
|
+
}
|
|
121
|
+
return args;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// src/index.ts
|
|
11
125
|
var __filename = fileURLToPath(import.meta.url);
|
|
12
126
|
var __dirname = path.dirname(__filename);
|
|
13
127
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
14
128
|
function executeJar(args, executionOptions = {}) {
|
|
15
|
-
const {
|
|
129
|
+
const { streamOutput = false } = executionOptions;
|
|
16
130
|
return new Promise((resolve, reject) => {
|
|
17
131
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
18
132
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -66,56 +180,45 @@ ${errorOutput}`
|
|
|
66
180
|
});
|
|
67
181
|
}
|
|
68
182
|
function convert(inputPaths, options = {}) {
|
|
69
|
-
|
|
183
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
184
|
+
if (inputList.length === 0) {
|
|
70
185
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
71
186
|
}
|
|
72
|
-
for (const input of
|
|
187
|
+
for (const input of inputList) {
|
|
73
188
|
if (!fs.existsSync(input)) {
|
|
74
189
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
75
190
|
}
|
|
76
191
|
}
|
|
77
|
-
const args = [...
|
|
78
|
-
if (options.outputDir) {
|
|
79
|
-
args.push("--output-dir", options.outputDir);
|
|
80
|
-
}
|
|
81
|
-
if (options.password) {
|
|
82
|
-
args.push("--password", options.password);
|
|
83
|
-
}
|
|
84
|
-
if (options.format) {
|
|
85
|
-
if (Array.isArray(options.format)) {
|
|
86
|
-
args.push("--format", options.format.join(","));
|
|
87
|
-
} else {
|
|
88
|
-
args.push("--format", options.format);
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
if (options.quiet) {
|
|
92
|
-
args.push("--quiet");
|
|
93
|
-
}
|
|
94
|
-
if (options.contentSafetyOff) {
|
|
95
|
-
if (Array.isArray(options.contentSafetyOff)) {
|
|
96
|
-
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
97
|
-
} else {
|
|
98
|
-
args.push("--content-safety-off", options.contentSafetyOff);
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
if (options.keepLineBreaks) {
|
|
102
|
-
args.push("--keep-line-breaks");
|
|
103
|
-
}
|
|
104
|
-
if (options.replaceInvalidChars) {
|
|
105
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
106
|
-
}
|
|
107
|
-
if (options.useStructTree) {
|
|
108
|
-
args.push("--use-struct-tree");
|
|
109
|
-
}
|
|
192
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
110
193
|
return executeJar(args, {
|
|
111
194
|
streamOutput: !options.quiet
|
|
112
195
|
});
|
|
113
196
|
}
|
|
114
197
|
|
|
198
|
+
// src/cli-options.generated.ts
|
|
199
|
+
function registerCliOptions(program) {
|
|
200
|
+
program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
|
|
201
|
+
program.option("-p, --password <value>", "Password for encrypted PDF files");
|
|
202
|
+
program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
|
|
203
|
+
program.option("-q, --quiet", "Suppress console logging output");
|
|
204
|
+
program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
|
|
205
|
+
program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
|
|
206
|
+
program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
|
|
207
|
+
program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
|
|
208
|
+
program.option("--table-method <value>", "Table detection method. Values: cluster");
|
|
209
|
+
program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
|
|
210
|
+
program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
|
|
211
|
+
program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
|
|
212
|
+
program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
|
|
213
|
+
program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
|
|
214
|
+
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
215
|
+
}
|
|
216
|
+
|
|
115
217
|
// src/cli.ts
|
|
116
218
|
function createProgram() {
|
|
117
219
|
const program = new Command();
|
|
118
|
-
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert")
|
|
220
|
+
program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
|
|
221
|
+
registerCliOptions(program);
|
|
119
222
|
program.configureOutput({
|
|
120
223
|
writeErr: (str) => {
|
|
121
224
|
console.error(str.trimEnd());
|
|
@@ -126,34 +229,6 @@ function createProgram() {
|
|
|
126
229
|
});
|
|
127
230
|
return program;
|
|
128
231
|
}
|
|
129
|
-
function buildConvertOptions(options) {
|
|
130
|
-
const convertOptions = {};
|
|
131
|
-
if (options.outputDir) {
|
|
132
|
-
convertOptions.outputDir = options.outputDir;
|
|
133
|
-
}
|
|
134
|
-
if (options.password) {
|
|
135
|
-
convertOptions.password = options.password;
|
|
136
|
-
}
|
|
137
|
-
if (options.format) {
|
|
138
|
-
convertOptions.format = options.format;
|
|
139
|
-
}
|
|
140
|
-
if (options.quiet) {
|
|
141
|
-
convertOptions.quiet = true;
|
|
142
|
-
}
|
|
143
|
-
if (options.contentSafetyOff) {
|
|
144
|
-
convertOptions.contentSafetyOff = options.contentSafetyOff;
|
|
145
|
-
}
|
|
146
|
-
if (options.keepLineBreaks) {
|
|
147
|
-
convertOptions.keepLineBreaks = true;
|
|
148
|
-
}
|
|
149
|
-
if (options.replaceInvalidChars) {
|
|
150
|
-
convertOptions.replaceInvalidChars = options.replaceInvalidChars;
|
|
151
|
-
}
|
|
152
|
-
if (options.useStructTree) {
|
|
153
|
-
convertOptions.useStructTree = true;
|
|
154
|
-
}
|
|
155
|
-
return convertOptions;
|
|
156
|
-
}
|
|
157
232
|
async function main() {
|
|
158
233
|
const program = createProgram();
|
|
159
234
|
program.exitOverride();
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/index.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert, ConvertOptions } from './index.js';\n\ninterface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert')\n .option('-o, --output-dir <path>', 'Directory where outputs are written')\n .option('-p, --password <password>', 'Password for encrypted PDFs')\n .option('-f, --format <format>', 'Comma-separated output format(s) to generate.')\n .option('-q, --quiet', 'Suppress CLI logging output')\n .option('--content-safety-off <modes>', 'Comma-separated content safety filters to disable.')\n .option('--keep-line-breaks', 'Preserve line breaks in text output')\n .option('--replace-invalid-chars <c>', 'Replacement character for invalid characters')\n .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)');\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nfunction buildConvertOptions(options: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (options.outputDir) {\n convertOptions.outputDir = options.outputDir;\n }\n if (options.password) {\n convertOptions.password = options.password;\n }\n if (options.format) {\n convertOptions.format = options.format;\n }\n if (options.quiet) {\n convertOptions.quiet = true;\n }\n if (options.contentSafetyOff) {\n convertOptions.contentSafetyOff = options.contentSafetyOff;\n }\n if (options.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (options.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = options.replaceInvalidChars;\n }\n if (options.useStructTree) {\n convertOptions.useStructTree = true;\n }\n\n return convertOptions;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n debug?: boolean;\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { debug = false, streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n if (options.noJson) {\n args.push('--no-json');\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n args.push(inputPath);\n executeJar(args, {\n debug: options.debug,\n streamOutput: Boolean(options.debug),\n })\n .then(resolve)\n .catch(reject);\n });\n}\n\nexport interface ConvertOptions {\n outputDir?: string;\n password?: string;\n format?: string | string[];\n quiet?: boolean;\n contentSafetyOff?: string | string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nexport function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {\n if (inputPaths.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputPaths) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputPaths];\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n args.push('--format', options.format.join(','));\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAE9B,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAOjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,QAAQ,OAAO,eAAe,MAAM,IAAI;AAEhD,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAmFO,SAAS,QAAQ,YAAsB,UAA0B,CAAC,GAAoB;AAC3F,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,YAAY;AAC9B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,UAAU;AACrC,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,WAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,IAChD,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,WAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,IACtE,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AD/LA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC,EAC9D,OAAO,2BAA2B,qCAAqC,EACvE,OAAO,6BAA6B,6BAA6B,EACjE,OAAO,yBAAyB,+CAA+C,EAC/E,OAAO,eAAe,6BAA6B,EACnD,OAAO,gCAAgC,oDAAoD,EAC3F,OAAO,sBAAsB,qCAAqC,EAClE,OAAO,+BAA+B,8CAA8C,EACpF,OAAO,qBAAqB,wDAAwD;AAEvF,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,SAAS,oBAAoB,SAAqC;AAChE,QAAM,iBAAiC,CAAC;AAExC,MAAI,QAAQ,WAAW;AACrB,mBAAe,YAAY,QAAQ;AAAA,EACrC;AACA,MAAI,QAAQ,UAAU;AACpB,mBAAe,WAAW,QAAQ;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,mBAAe,SAAS,QAAQ;AAAA,EAClC;AACA,MAAI,QAAQ,OAAO;AACjB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,mBAAe,mBAAmB,QAAQ;AAAA,EAC5C;AACA,MAAI,QAAQ,gBAAgB;AAC1B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,mBAAe,sBAAsB,QAAQ;AAAA,EAC/C;AACA,MAAI,QAAQ,eAAe;AACzB,mBAAe,gBAAgB;AAAA,EACjC;AAEA,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: cluster */\n tableMethod?: string;\n /** Reading order algorithm. Values: none, xycut. Default: none */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Embed images as Base64 data URIs instead of file path references */\n embedImages?: boolean;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n embedImages?: boolean;\n imageFormat?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.embedImages) {\n convertOptions.embedImages = true;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.embedImages) {\n args.push('--embed-images');\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: cluster');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: none, xycut. Default: none');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--embed-images', 'Embed images as Base64 data URIs instead of file path references');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;AC4DvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc;AAAA,EAC/B;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,gBAAgB;AAAA,EAC5B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AAEA,SAAO;AACT;;;ADzKA,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,yCAAyC;AAClF,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,kBAAkB,kEAAkE;AACnG,UAAQ,OAAO,0BAA0B,qEAAqE;AAChH;;;AHlBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|