@opendataloader/pdf 1.9.1 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -4
- package/dist/cli.cjs +7 -0
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +7 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +9 -9
package/README.md
CHANGED
|
@@ -222,6 +222,88 @@ opendataloader_pdf.convert(
|
|
|
222
222
|
- **Fallback**: If backend unavailable, gracefully falls back to local processing
|
|
223
223
|
- **Privacy**: Run the backend locally in Docker for 100% on-premise
|
|
224
224
|
|
|
225
|
+
### Formula Extraction (LaTeX)
|
|
226
|
+
|
|
227
|
+
For PDFs containing mathematical formulas, enable formula enrichment to extract LaTeX representations:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
# Start backend with formula enrichment
|
|
231
|
+
opendataloader-pdf-hybrid --enrich-formula
|
|
232
|
+
|
|
233
|
+
# Process with full backend mode (required for formula extraction)
|
|
234
|
+
opendataloader-pdf --hybrid docling-fast --hybrid-mode full input.pdf
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
Output in JSON:
|
|
238
|
+
```json
|
|
239
|
+
{
|
|
240
|
+
"type": "formula",
|
|
241
|
+
"page number": 1,
|
|
242
|
+
"bounding box": [226.2, 144.7, 377.1, 168.7],
|
|
243
|
+
"content": "\\frac{f(x+h) - f(x)}{h}"
|
|
244
|
+
}
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Output in Markdown:
|
|
248
|
+
```markdown
|
|
249
|
+
$$
|
|
250
|
+
\frac{f(x+h) - f(x)}{h}
|
|
251
|
+
$$
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Output in HTML (MathJax/KaTeX compatible):
|
|
255
|
+
```html
|
|
256
|
+
<div class="math-display">\[\frac{f(x+h) - f(x)}{h}\]</div>
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
> **Note**: Formula extraction requires `--hybrid-mode full` to route all pages to the backend where the formula enrichment model runs.
|
|
260
|
+
|
|
261
|
+
### Picture / Chart Description (Alt Text)
|
|
262
|
+
|
|
263
|
+
Generate AI-powered descriptions for images and charts in your PDFs. Useful for accessibility (alt text) and making visual content searchable in RAG pipelines.
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
# Start backend with picture description
|
|
267
|
+
opendataloader-pdf-hybrid --enrich-picture-description
|
|
268
|
+
|
|
269
|
+
# Process with full backend mode (required for picture description)
|
|
270
|
+
opendataloader-pdf --hybrid docling-fast --hybrid-mode full input.pdf
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
Output in JSON:
|
|
274
|
+
```json
|
|
275
|
+
{
|
|
276
|
+
"type": "picture",
|
|
277
|
+
"page number": 1,
|
|
278
|
+
"bounding box": [72.0, 400.0, 540.0, 650.0],
|
|
279
|
+
"description": "A bar chart showing waste generation by region from 2016 to 2030..."
|
|
280
|
+
}
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Output in Markdown:
|
|
284
|
+
```markdown
|
|
285
|
+

|
|
286
|
+
|
|
287
|
+
*A bar chart showing waste generation by region from 2016 to 2030...*
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Output in HTML:
|
|
291
|
+
```html
|
|
292
|
+
<figure>
|
|
293
|
+
<img src="document_images/imageFile1.png" alt="figure1">
|
|
294
|
+
<figcaption>A bar chart showing waste generation by region from 2016 to 2030...</figcaption>
|
|
295
|
+
</figure>
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
You can also customize the prompt for better results with specific document types:
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
opendataloader-pdf-hybrid --enrich-picture-description \
|
|
302
|
+
--picture-description-prompt "Describe this scientific figure in detail."
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
> **Note**: Picture description uses SmolVLM (256M), a lightweight vision model. Results are suitable for general context but may not capture precise data values from complex charts.
|
|
306
|
+
|
|
225
307
|
[Hybrid Mode Guide →](https://opendataloader.org/docs/hybrid-mode)
|
|
226
308
|
|
|
227
309
|
<br/>
|
|
@@ -264,10 +346,10 @@ We continuously benchmark against real-world documents.
|
|
|
264
346
|
|
|
265
347
|
| Engine | Overall | Reading Order | Table | Heading | Speed (s/page) |
|
|
266
348
|
|-----------------------------|----------|---------------|----------|----------|----------------|
|
|
267
|
-
| **opendataloader** | 0.
|
|
268
|
-
| **opendataloader [hybrid]** | **0.
|
|
269
|
-
| docling | 0.86 | 0.90 | 0.89 |
|
|
270
|
-
| marker | 0.83 | 0.89 | 0.81 |
|
|
349
|
+
| **opendataloader** | 0.72 | 0.91 | 0.49 | 0.76 | **0.05** |
|
|
350
|
+
| **opendataloader [hybrid]** | **0.90** | **0.94** | **0.93** | **0.83** | 0.43 |
|
|
351
|
+
| docling | 0.86 | 0.90 | 0.89 | 0.80 | 0.73 |
|
|
352
|
+
| marker | 0.83 | 0.89 | 0.81 | 0.80 | 53.93 |
|
|
271
353
|
| mineru | 0.82 | 0.86 | 0.87 | 0.74 | 5.96 |
|
|
272
354
|
| pymupdf4llm | 0.57 | 0.89 | 0.40 | 0.41 | 0.09 |
|
|
273
355
|
| markitdown | 0.29 | 0.88 | 0.00 | 0.00 | **0.04** |
|
package/dist/cli.cjs
CHANGED
|
@@ -86,6 +86,9 @@ function buildConvertOptions(cliOptions) {
|
|
|
86
86
|
if (cliOptions.pages) {
|
|
87
87
|
convertOptions.pages = cliOptions.pages;
|
|
88
88
|
}
|
|
89
|
+
if (cliOptions.includeHeaderFooter) {
|
|
90
|
+
convertOptions.includeHeaderFooter = true;
|
|
91
|
+
}
|
|
89
92
|
if (cliOptions.hybrid) {
|
|
90
93
|
convertOptions.hybrid = cliOptions.hybrid;
|
|
91
94
|
}
|
|
@@ -168,6 +171,9 @@ function buildArgs(options) {
|
|
|
168
171
|
if (options.pages) {
|
|
169
172
|
args.push("--pages", options.pages);
|
|
170
173
|
}
|
|
174
|
+
if (options.includeHeaderFooter) {
|
|
175
|
+
args.push("--include-header-footer");
|
|
176
|
+
}
|
|
171
177
|
if (options.hybrid) {
|
|
172
178
|
args.push("--hybrid", options.hybrid);
|
|
173
179
|
}
|
|
@@ -280,6 +286,7 @@ function registerCliOptions(program) {
|
|
|
280
286
|
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
281
287
|
program.option("--image-dir <value>", "Directory for extracted images");
|
|
282
288
|
program.option("--pages <value>", 'Pages to extract (e.g., "1,3,5-7"). Default: all pages');
|
|
289
|
+
program.option("--include-header-footer", "Include page headers and footers in output");
|
|
283
290
|
program.option("--hybrid <value>", "Hybrid backend for AI processing. Values: off (default), docling-fast");
|
|
284
291
|
program.option("--hybrid-mode <value>", "Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)");
|
|
285
292
|
program.option("--hybrid-url <value>", "Hybrid backend server URL (overrides default)");
|
package/dist/cli.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--hybrid <value>', 'Hybrid backend for AI processing. Values: off (default), docling-fast');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds. Default: 30000');\n program.option('--hybrid-fallback', 'Fallback to Java processing on hybrid backend error. Default: true');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;ACiFvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADnPA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,oBAAoB,uEAAuE;AAC1G,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,gEAAgE;AAC3G,UAAQ,OAAO,qBAAqB,oEAAoE;AAC1G;;;AHzBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--hybrid <value>', 'Hybrid backend for AI processing. Values: off (default), docling-fast');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds. Default: 30000');\n program.option('--hybrid-fallback', 'Fallback to Java processing on hybrid backend error. Default: true');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;ACoFvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;AD5PA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,oBAAoB,uEAAuE;AAC1G,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,gEAAgE;AAC3G,UAAQ,OAAO,qBAAqB,oEAAoE;AAC1G;;;AH1BA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -63,6 +63,9 @@ function buildConvertOptions(cliOptions) {
|
|
|
63
63
|
if (cliOptions.pages) {
|
|
64
64
|
convertOptions.pages = cliOptions.pages;
|
|
65
65
|
}
|
|
66
|
+
if (cliOptions.includeHeaderFooter) {
|
|
67
|
+
convertOptions.includeHeaderFooter = true;
|
|
68
|
+
}
|
|
66
69
|
if (cliOptions.hybrid) {
|
|
67
70
|
convertOptions.hybrid = cliOptions.hybrid;
|
|
68
71
|
}
|
|
@@ -145,6 +148,9 @@ function buildArgs(options) {
|
|
|
145
148
|
if (options.pages) {
|
|
146
149
|
args.push("--pages", options.pages);
|
|
147
150
|
}
|
|
151
|
+
if (options.includeHeaderFooter) {
|
|
152
|
+
args.push("--include-header-footer");
|
|
153
|
+
}
|
|
148
154
|
if (options.hybrid) {
|
|
149
155
|
args.push("--hybrid", options.hybrid);
|
|
150
156
|
}
|
|
@@ -256,6 +262,7 @@ function registerCliOptions(program) {
|
|
|
256
262
|
program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
|
|
257
263
|
program.option("--image-dir <value>", "Directory for extracted images");
|
|
258
264
|
program.option("--pages <value>", 'Pages to extract (e.g., "1,3,5-7"). Default: all pages');
|
|
265
|
+
program.option("--include-header-footer", "Include page headers and footers in output");
|
|
259
266
|
program.option("--hybrid <value>", "Hybrid backend for AI processing. Values: off (default), docling-fast");
|
|
260
267
|
program.option("--hybrid-mode <value>", "Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)");
|
|
261
268
|
program.option("--hybrid-url <value>", "Hybrid backend server URL (overrides default)");
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--hybrid <value>', 'Hybrid backend for AI processing. Values: off (default), docling-fast');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds. Default: 30000');\n program.option('--hybrid-fallback', 'Fallback to Java processing on hybrid backend error. Default: true');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;ACiFvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADxOA,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,oBAAoB,uEAAuE;AAC1G,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,gEAAgE;AAC3G,UAAQ,OAAO,qBAAqB,oEAAoE;AAC1G;;;AHzBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--hybrid <value>', 'Hybrid backend for AI processing. Values: off (default), docling-fast');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds. Default: 30000');\n program.option('--hybrid-fallback', 'Fallback to Java processing on hybrid backend error. Default: true');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;ACoFvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADjPA,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,oBAAoB,uEAAuE;AAC1G,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,gEAAgE;AAC3G,UAAQ,OAAO,qBAAqB,oEAAoE;AAC1G;;;AH1BA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
|
package/dist/index.cjs
CHANGED
|
@@ -106,6 +106,9 @@ function buildArgs(options) {
|
|
|
106
106
|
if (options.pages) {
|
|
107
107
|
args.push("--pages", options.pages);
|
|
108
108
|
}
|
|
109
|
+
if (options.includeHeaderFooter) {
|
|
110
|
+
args.push("--include-header-footer");
|
|
111
|
+
}
|
|
109
112
|
if (options.hybrid) {
|
|
110
113
|
args.push("--hybrid", options.hybrid);
|
|
111
114
|
}
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;AC6JvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADnPA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;ACmKvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;AD5PA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
|
package/dist/index.d.cts
CHANGED
|
@@ -36,6 +36,8 @@ interface ConvertOptions {
|
|
|
36
36
|
imageDir?: string;
|
|
37
37
|
/** Pages to extract (e.g., "1,3,5-7"). Default: all pages */
|
|
38
38
|
pages?: string;
|
|
39
|
+
/** Include page headers and footers in output */
|
|
40
|
+
includeHeaderFooter?: boolean;
|
|
39
41
|
/** Hybrid backend for AI processing. Values: off (default), docling-fast */
|
|
40
42
|
hybrid?: string;
|
|
41
43
|
/** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */
|
package/dist/index.d.ts
CHANGED
|
@@ -36,6 +36,8 @@ interface ConvertOptions {
|
|
|
36
36
|
imageDir?: string;
|
|
37
37
|
/** Pages to extract (e.g., "1,3,5-7"). Default: all pages */
|
|
38
38
|
pages?: string;
|
|
39
|
+
/** Include page headers and footers in output */
|
|
40
|
+
includeHeaderFooter?: boolean;
|
|
39
41
|
/** Hybrid backend for AI processing. Values: off (default), docling-fast */
|
|
40
42
|
hybrid?: string;
|
|
41
43
|
/** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */
|
package/dist/index.js
CHANGED
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n"],"mappings":";AAAA,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;AC6JvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADxOA,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Hybrid backend for AI processing. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds. Default: 30000 */\n hybridTimeout?: string;\n /** Fallback to Java processing on hybrid backend error. Default: true */\n hybridFallback?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n\n return args;\n}\n"],"mappings":";AAAA,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;ACmKvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO;AACT;;;ADjPA,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@opendataloader/pdf",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.10.0",
|
|
4
4
|
"description": "A Node.js wrapper for the opendataloader-pdf Java CLI.",
|
|
5
5
|
"main": "./dist/index.cjs",
|
|
6
6
|
"module": "./dist/index.js",
|
|
@@ -41,19 +41,19 @@
|
|
|
41
41
|
"access": "public"
|
|
42
42
|
},
|
|
43
43
|
"dependencies": {
|
|
44
|
-
"commander": "^14.0.
|
|
44
|
+
"commander": "^14.0.3"
|
|
45
45
|
},
|
|
46
46
|
"devDependencies": {
|
|
47
|
-
"@types/node": "^
|
|
48
|
-
"@typescript-eslint/eslint-plugin": "^8.
|
|
49
|
-
"@typescript-eslint/parser": "^8.
|
|
47
|
+
"@types/node": "^25.2.0",
|
|
48
|
+
"@typescript-eslint/eslint-plugin": "^8.54.0",
|
|
49
|
+
"@typescript-eslint/parser": "^8.54.0",
|
|
50
50
|
"eslint": "^9.28.0",
|
|
51
|
-
"glob": "^
|
|
52
|
-
"prettier": "^3.
|
|
51
|
+
"glob": "^13.0.1",
|
|
52
|
+
"prettier": "^3.8.1",
|
|
53
53
|
"tsup": "^8.5.1",
|
|
54
54
|
"typescript": "^5.9.3",
|
|
55
|
-
"vite": "^7.
|
|
56
|
-
"vitest": "^4.0.
|
|
55
|
+
"vite": "^7.3.1",
|
|
56
|
+
"vitest": "^4.0.18"
|
|
57
57
|
},
|
|
58
58
|
"files": [
|
|
59
59
|
"dist",
|