@opendataloader/pdf 2.4.3 → 2.4.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -328,6 +328,8 @@ Combine formats: `format="json,markdown"`
328
328
 
329
329
  When a PDF has structure tags, OpenDataLoader extracts the **exact layout** the author intended — no guessing, no heuristics. Headings, lists, tables, and reading order are preserved from the source.
330
330
 
331
+ > **Output quality depends on tag quality.** Not all tagged PDFs are well-tagged. For PDFs with sparse or incorrect tags, the default heuristic mode or `--hybrid docling-fast` often produces better results.
332
+
331
333
  ```python
332
334
  # Batch all files in one call — each convert() spawns a JVM process, so repeated calls are slow
333
335
  opendataloader_pdf.convert(
package/dist/cli.cjs CHANGED
@@ -76,6 +76,9 @@ function buildConvertOptions(cliOptions) {
76
76
  if (cliOptions.markdownPageSeparator) {
77
77
  convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
78
78
  }
79
+ if (cliOptions.markdownWithHtml) {
80
+ convertOptions.markdownWithHtml = true;
81
+ }
79
82
  if (cliOptions.textPageSeparator) {
80
83
  convertOptions.textPageSeparator = cliOptions.textPageSeparator;
81
84
  }
@@ -182,6 +185,9 @@ function buildArgs(options) {
182
185
  if (options.markdownPageSeparator) {
183
186
  args.push("--markdown-page-separator", options.markdownPageSeparator);
184
187
  }
188
+ if (options.markdownWithHtml) {
189
+ args.push("--markdown-with-html");
190
+ }
185
191
  if (options.textPageSeparator) {
186
192
  args.push("--text-page-separator", options.textPageSeparator);
187
193
  }
@@ -346,16 +352,17 @@ async function _runForCli(inputPaths, options = {}) {
346
352
  function registerCliOptions(program) {
347
353
  program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
348
354
  program.option("-p, --password <value>", "Password for encrypted PDF files");
349
- program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json");
355
+ program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output.");
350
356
  program.option("-q, --quiet", "Suppress console logging output");
351
357
  program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
352
358
  program.option("--sanitize", "Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders");
353
359
  program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
354
360
  program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
355
- program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
361
+ program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality");
356
362
  program.option("--table-method <value>", "Table detection method. Values: default (border-based), cluster (border + cluster). Default: default");
357
363
  program.option("--reading-order <value>", "Reading order algorithm. Values: off, xycut. Default: xycut");
358
364
  program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
365
+ program.option("--markdown-with-html", "Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown.");
359
366
  program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
360
367
  program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
361
368
  program.option("--image-output <value>", "Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external");
package/dist/cli.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.12_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { _runForCli } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 
0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n // _runForCli streams stdout/stderr to the parent process as they arrive;\n // we deliberately do not re-print anything here. (Issue #398.)\n await _runForCli(inputPaths, convertOptions);\n return 0;\n } catch (err) {\n // Subprocess-exit errors are already on the user's terminal via the live\n // stderr stream — re-printing would duplicate output and risk leaking\n // anything sensitive Java logged (e.g. a --password value echoed by an\n // underlying library). Wrapper-side failures (JAR not found, java not in\n // PATH, bad input path) still need to be surfaced.\n const isJavaExit =\n err instanceof Error && (err as Error & { isJavaExit?: boolean }).isJavaExit === true;\n if (!isJavaExit) {\n const message = err instanceof Error ? 
err.message : String(err);\n console.error(message);\n }\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. 
Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). 
Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: 
string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if 
(cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n 
args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if 
(options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. 
Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--hybrid-hancom-ai-regionlist-strategy <value>', 'DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. 
Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list)');\n program.option('--hybrid-hancom-ai-ocr-strategy <value>', 'OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only)');\n program.option('--hybrid-hancom-ai-image-cache <value>', 'Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n program.option('--threads <value>', 'Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ACX9D,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,4BAA8B;AAC9B,iBAA8B;;;ACwGvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,
MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,kCAAkC;AAC/C,mBAAe,mCAAmC,WAAW;AAAA,EAC/D;AACA,MAAI,WAAW,2BAA2B;AACxC,mBAAe,4BAA4B,WAAW;AAAA,EACxD;AACA,MAAI,WAAW,0BAA0B;AACvC,mBAAe,2BAA2B,WAAW;AAAA,EACvD;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,SAAS;AACtB,mBAAe,UAAU,WAAW;AAAA,EACtC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAA
iB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AD/SA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,
CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AA2BA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;;;AEjLO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,gJAAgJ;AACvL,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,iOAAiO;AACpQ,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,kDAAkD,8JAA8J;AAC/N,UAAQ,OAAO,2CAA2C,oIAAoI;AAC9L,UAAQ,OAAO,0
CAA0C,uFAAuF;AAChJ,UAAQ,OAAO,eAAe,6DAA6D;AAC3F,UAAQ,OAAO,qBAAqB,iTAAiT;AACvV;;;AHjCA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AAGF,UAAM,WAAW,YAAY,cAAc;AAC3C,WAAO;AAAA,EACT,SAAS,KAAK;AAMZ,UAAM,aACJ,eAAe,SAAU,IAAyC,eAAe;AACnF,QAAI,CAAC,YAAY;AACf,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,cAAQ,MAAM,OAAO;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename"]}
1
+ {"version":3,"sources":["../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.12_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { _runForCli } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 
0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n // _runForCli streams stdout/stderr to the parent process as they arrive;\n // we deliberately do not re-print anything here. (Issue #398.)\n await _runForCli(inputPaths, convertOptions);\n return 0;\n } catch (err) {\n // Subprocess-exit errors are already on the user's terminal via the live\n // stderr stream — re-printing would duplicate output and risk leaking\n // anything sensitive Java logged (e.g. a --password value echoed by an\n // underlying library). Wrapper-side failures (JAR not found, java not in\n // PATH, bad input path) still need to be surfaced.\n const isJavaExit =\n err instanceof Error && (err as Error & { isJavaExit?: boolean }).isJavaExit === true;\n if (!isJavaExit) {\n const message = err instanceof Error ? 
err.message : String(err);\n console.error(message);\n }\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output. */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. 
Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */\n markdownWithHtml?: boolean;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. 
Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. 
Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n markdownWithHtml?: boolean;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n 
convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.markdownWithHtml) {\n convertOptions.markdownWithHtml = true;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n 
if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.markdownWithHtml) {\n args.push('--markdown-with-html');\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', 
options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. 
For image extraction control use --image-output.');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--markdown-with-html', 'Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown.');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. 
Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--hybrid-hancom-ai-regionlist-strategy <value>', 'DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list)');\n program.option('--hybrid-hancom-ai-ocr-strategy <value>', 'OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only)');\n program.option('--hybrid-hancom-ai-image-cache <value>', 'Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n program.option('--threads <value>', 'Number of worker threads for per-page processing. Default: 1 (sequential, stable). 
Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ACX9D,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,4BAA8B;AAC9B,iBAA8B;;;AC2GvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB;AAAA,EACpC;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,kCAAkC;AAC/C,m
BAAe,mCAAmC,WAAW;AAAA,EAC/D;AACA,MAAI,WAAW,2BAA2B;AACxC,mBAAe,4BAA4B,WAAW;AAAA,EACxD;AACA,MAAI,WAAW,0BAA0B;AACvC,mBAAe,2BAA2B,WAAW;AAAA,EACvD;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,SAAS;AACtB,mBAAe,UAAU,WAAW;AAAA,EACtC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,kBAAkB;AAC5B,SAAK,KAAK,sBAAsB;AAAA,EAClC;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB
,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;ADxTA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SA
AS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AA2BA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;;;AEjLO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,2MAA2M;AAClP,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,qHAAqH;AACzJ,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,wBAAwB,yHAAyH;AAChK,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,iOAAiO;AACpQ,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,kDAAkD,8JAA8J;AAC/N,UAAQ,OAAO,2CAA2C,oIAAoI;AAC9L,UAAQ,OAAO,0CAA0C,uFAAuF;AAChJ,UAAQ,OAAO,eAAe,6DAA6D;AAC3F,UAAQ,OAAO,qBAAqB,iTAAiT;AACvV;;;AHlCA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IA
AI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AAGF,UAAM,WAAW,YAAY,cAAc;AAC3C,WAAO;AAAA,EACT,SAAS,KAAK;AAMZ,UAAM,aACJ,eAAe,SAAU,IAAyC,eAAe;AACnF,QAAI,CAAC,YAAY;AACf,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,cAAQ,MAAM,OAAO;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename"]}
package/dist/cli.js CHANGED
@@ -49,6 +49,9 @@ function buildConvertOptions(cliOptions) {
49
49
  if (cliOptions.markdownPageSeparator) {
50
50
  convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
51
51
  }
52
+ if (cliOptions.markdownWithHtml) {
53
+ convertOptions.markdownWithHtml = true;
54
+ }
52
55
  if (cliOptions.textPageSeparator) {
53
56
  convertOptions.textPageSeparator = cliOptions.textPageSeparator;
54
57
  }
@@ -155,6 +158,9 @@ function buildArgs(options) {
155
158
  if (options.markdownPageSeparator) {
156
159
  args.push("--markdown-page-separator", options.markdownPageSeparator);
157
160
  }
161
+ if (options.markdownWithHtml) {
162
+ args.push("--markdown-with-html");
163
+ }
158
164
  if (options.textPageSeparator) {
159
165
  args.push("--text-page-separator", options.textPageSeparator);
160
166
  }
@@ -319,16 +325,17 @@ async function _runForCli(inputPaths, options = {}) {
319
325
  function registerCliOptions(program) {
320
326
  program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
321
327
  program.option("-p, --password <value>", "Password for encrypted PDF files");
322
- program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json");
328
+ program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output.");
323
329
  program.option("-q, --quiet", "Suppress console logging output");
324
330
  program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
325
331
  program.option("--sanitize", "Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders");
326
332
  program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
327
333
  program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
328
- program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
334
+ program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality");
329
335
  program.option("--table-method <value>", "Table detection method. Values: default (border-based), cluster (border + cluster). Default: default");
330
336
  program.option("--reading-order <value>", "Reading order algorithm. Values: off, xycut. Default: xycut");
331
337
  program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
338
+ program.option("--markdown-with-html", "Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown.");
332
339
  program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
333
340
  program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
334
341
  program.option("--image-output <value>", "Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external");
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { _runForCli } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n // _runForCli streams stdout/stderr to the parent process as they arrive;\n // we deliberately do not re-print anything here. 
(Issue #398.)\n await _runForCli(inputPaths, convertOptions);\n return 0;\n } catch (err) {\n // Subprocess-exit errors are already on the user's terminal via the live\n // stderr stream — re-printing would duplicate output and risk leaking\n // anything sensitive Java logged (e.g. a --password value echoed by an\n // underlying library). Wrapper-side failures (JAR not found, java not in\n // PATH, bad input path) still need to be surfaced.\n const isJavaExit =\n err instanceof Error && (err as Error & { isJavaExit?: boolean }).isJavaExit === true;\n if (!isJavaExit) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n }\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. 
Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. 
One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. 
a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. 
Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? 
formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). 
Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). 
Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = 
cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if 
(cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', 
options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. 
Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). 
Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--hybrid-hancom-ai-regionlist-strategy <value>', 'DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list)');\n program.option('--hybrid-hancom-ai-ocr-strategy <value>', 'OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only)');\n program.option('--hybrid-hancom-ai-image-cache <value>', 'Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n program.option('--threads <value>', 'Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. 
Applies to the native Java pipeline only; ignored in --hybrid mode');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;;;ACwGvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,kCAAkC;AAC/C,mBAAe,mCAAmC,WAAW;AAAA,EAC/D;AACA,MAAI,WAAW,2BAA2B;AACxC,mBAAe,4BAA4B,WAAW;AAAA,EACxD;AACA,MAAI,WAAW,0BAA0B;AACvC,mBAAe,2BAA2B,WAAW;AAAA,EACvD;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,SAAS;AACtB,mBAAe,UAAU,WAAW;AAAA,EACtC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACp
B,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,
aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AD/SA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AA2
BA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;;;AEjLO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,gJAAgJ;AACvL,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,iOAAiO;AACpQ,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,kDAAkD,8JAA8J;AAC/N,UAAQ,OAAO,2CAA2C,oIAAoI;AAC9L,UAAQ,OAAO,0CAA0C,uFAAuF;AAChJ,UAAQ,OAAO,eAAe,6DAA6D;AAC3F,UAAQ,OAAO,qBAAqB,iTAAiT;AACvV;;;AHjCA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AAGF,UAAM,WAAW,YAAY,cAAc;AAC3C,WAAO;AAAA,EACT,SAAS,KAAK;AAMZ,UAAM,aACJ,eAAe,SAAU,IAAyC,eAAe;AACnF,QAAI,CAAC,YAAY;AACf,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO
,GAAG;AAC/D,cAAQ,MAAM,OAAO;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename","__dirname"]}
1
+ {"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { _runForCli } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n // _runForCli streams stdout/stderr to the parent process as they arrive;\n // we deliberately do not re-print anything here. 
(Issue #398.)\n await _runForCli(inputPaths, convertOptions);\n return 0;\n } catch (err) {\n // Subprocess-exit errors are already on the user's terminal via the live\n // stderr stream — re-printing would duplicate output and risk leaking\n // anything sensitive Java logged (e.g. a --password value echoed by an\n // underlying library). Wrapper-side failures (JAR not found, java not in\n // PATH, bad input path) still need to be surfaced.\n const isJavaExit =\n err instanceof Error && (err as Error & { isJavaExit?: boolean }).isJavaExit === true;\n if (!isJavaExit) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n }\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. 
Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. 
One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. 
a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. 
Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? 
formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output. */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */\n markdownWithHtml?: boolean;\n /** Separator between pages in text output. Use %page-number% for page numbers. 
Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. 
Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n markdownWithHtml?: boolean;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = 
cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.markdownWithHtml) {\n convertOptions.markdownWithHtml = true;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n 
convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n 
if (options.markdownWithHtml) {\n args.push('--markdown-with-html');\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n 
program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output.');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--markdown-with-html', 'Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown.');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. 
Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--hybrid-hancom-ai-regionlist-strategy <value>', 'DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list)');\n program.option('--hybrid-hancom-ai-ocr-strategy <value>', 'OCR strategy. Requires --hybrid=hancom-ai. 
Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only)');\n program.option('--hybrid-hancom-ai-image-cache <value>', 'Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n program.option('--threads <value>', 'Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;;;AC2GvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB;AAAA,EACpC;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;A
ACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,kCAAkC;AAC/C,mBAAe,mCAAmC,WAAW;AAAA,EAC/D;AACA,MAAI,WAAW,2BAA2B;AACxC,mBAAe,4BAA4B,WAAW;AAAA,EACxD;AACA,MAAI,WAAW,0BAA0B;AACvC,mBAAe,2BAA2B,WAAW;AAAA,EACvD;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,SAAS;AACtB,mBAAe,UAAU,WAAW;AAAA,EACtC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,kBAAkB;AAC5B,SAAK,KAAK,sBAAsB;AAAA,EAClC;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA
,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;ADxTA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAA
U;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AA2BA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;;;AEjLO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,2MAA2M;AAClP,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,qHAAqH;AACzJ,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,wBAAwB,yHAAyH;AAChK,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,iOAAiO;AACpQ,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,kDAAkD,8JAA8J;AAC/N,UAAQ,OAAO,2CAA2C,oIAAoI;AAC9L,UAAQ,OAAO,0CAA0C,uFAAuF;AAChJ,UAAQ,OAAO,eAAe,6DAA6D;AAC3F,UAAQ,OAAO,qBAAqB,iTAAiT;AACvV;;;AHlCA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AA
GjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AAGF,UAAM,WAAW,YAAY,cAAc;AAC3C,WAAO;AAAA,EACT,SAAS,KAAK;AAMZ,UAAM,aACJ,eAAe,SAAU,IAAyC,eAAe;AACnF,QAAI,CAAC,YAAY;AACf,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,cAAQ,MAAM,OAAO;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename","__dirname"]}
package/dist/index.cjs CHANGED
@@ -99,6 +99,9 @@ function buildArgs(options) {
99
99
  if (options.markdownPageSeparator) {
100
100
  args.push("--markdown-page-separator", options.markdownPageSeparator);
101
101
  }
102
+ if (options.markdownWithHtml) {
103
+ args.push("--markdown-with-html");
104
+ }
102
105
  if (options.textPageSeparator) {
103
106
  args.push("--text-page-separator", options.textPageSeparator);
104
107
  }
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.12_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. 
Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). 
Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. 
Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = 
cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return 
convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n 
if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return 
args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ADZ9D,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,4BAA8B;AAC9B,iBAA8B;;;AE4MvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;A
ACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AF/SA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GA
AG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,WAAO,QAAQ,OAAO,WAAW;AAAA,EACnC;AAIA,SAAO,WAAW,aAAa,EAAE,cAAc,MAAM,CAAC;AACxD;AAaA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename"]}
1
+ {"version":3,"sources":["../src/index.ts","../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.12_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. 
For image extraction control use --image-output. */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */\n markdownWithHtml?: boolean;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). 
Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. 
Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n markdownWithHtml?: boolean;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n 
convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.markdownWithHtml) {\n convertOptions.markdownWithHtml = true;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n 
if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.markdownWithHtml) {\n args.push('--markdown-with-html');\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', 
options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return 
args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ADZ9D,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,4BAA8B;AAC9B,iBAA8B;;;AEkNvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,kBAAkB;AAC5B,SAAK,KAAK,sBAAsB;AAAA,EAClC;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MA
AI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AFxTA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAA
a;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,WAAO,QAAQ,OAAO,WAAW;AAAA,EACnC;AAIA,SAAO,WAAW,aAAa,EAAE,cAAc,MAAM,CAAC;AACxD;AAaA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename"]}
package/dist/index.d.cts CHANGED
@@ -6,7 +6,7 @@ interface ConvertOptions {
6
6
  outputDir?: string;
7
7
  /** Password for encrypted PDF files */
8
8
  password?: string;
9
- /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */
9
+ /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output. */
10
10
  format?: string | string[];
11
11
  /** Suppress console logging output */
12
12
  quiet?: boolean;
@@ -18,7 +18,7 @@ interface ConvertOptions {
18
18
  keepLineBreaks?: boolean;
19
19
  /** Replacement character for invalid/unrecognized characters. Default: space */
20
20
  replaceInvalidChars?: string;
21
- /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */
21
+ /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */
22
22
  useStructTree?: boolean;
23
23
  /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */
24
24
  tableMethod?: string;
@@ -26,6 +26,8 @@ interface ConvertOptions {
26
26
  readingOrder?: string;
27
27
  /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */
28
28
  markdownPageSeparator?: string;
29
+ /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */
30
+ markdownWithHtml?: boolean;
29
31
  /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */
30
32
  textPageSeparator?: string;
31
33
  /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */
package/dist/index.d.ts CHANGED
@@ -6,7 +6,7 @@ interface ConvertOptions {
6
6
  outputDir?: string;
7
7
  /** Password for encrypted PDF files */
8
8
  password?: string;
9
- /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */
9
+ /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output. */
10
10
  format?: string | string[];
11
11
  /** Suppress console logging output */
12
12
  quiet?: boolean;
@@ -18,7 +18,7 @@ interface ConvertOptions {
18
18
  keepLineBreaks?: boolean;
19
19
  /** Replacement character for invalid/unrecognized characters. Default: space */
20
20
  replaceInvalidChars?: string;
21
- /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */
21
+ /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */
22
22
  useStructTree?: boolean;
23
23
  /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */
24
24
  tableMethod?: string;
@@ -26,6 +26,8 @@ interface ConvertOptions {
26
26
  readingOrder?: string;
27
27
  /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */
28
28
  markdownPageSeparator?: string;
29
+ /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */
30
+ markdownWithHtml?: boolean;
29
31
  /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */
30
32
  textPageSeparator?: string;
31
33
  /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */
package/dist/index.js CHANGED
@@ -56,6 +56,9 @@ function buildArgs(options) {
56
56
  if (options.markdownPageSeparator) {
57
57
  args.push("--markdown-page-separator", options.markdownPageSeparator);
58
58
  }
59
+ if (options.markdownWithHtml) {
60
+ args.push("--markdown-with-html");
61
+ }
59
62
  if (options.textPageSeparator) {
60
63
  args.push("--text-page-separator", options.textPageSeparator);
61
64
  }
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. 
Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). 
Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: 
string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if 
(cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n 
args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if 
(options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n"],"mappings":";AAAA,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;;;AC4MvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,
QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AD/SA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;
AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,WAAO,QAAQ,OAAO,WAAW;AAAA,EACnC;AAIA,SAAO,WAAW,aAAa,EAAE,cAAc,MAAM,CAAC;AACxD;AAaA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename","__dirname"]}
1
+ {"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n // Force headless AWT so macOS doesn't surface a Dock icon (and steal focus)\n // every time the JVM touches ImageIO/PDFBox rendering. 
Safe on all OSes —\n // the CLI never opens a UI window, only manipulates BufferedImages.\n const commandArgs = [\n '-Djava.awt.headless=true',\n '-Dapple.awt.UIElement=true',\n '-jar',\n jarPath,\n ...args,\n ];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n 
} else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n // Tag so the CLI can suppress re-printing this message — Java's\n // stderr was already streamed live to the parent in CLI mode, and\n // re-printing risks leaking anything sensitive Java logged\n // (e.g. a --password value echoed by an underlying library).\n (error as Error & { isJavaExit?: boolean }).isJavaExit = true;\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. 
Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. 
Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, tagged-pdf. Default: json. For HTML inside Markdown use --markdown-with-html. For image extraction control use --image-output. */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. 
Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure. Output quality depends on tag quality */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Allow HTML tags inside Markdown output for complex structures such as multi-row-span tables. Implies --format markdown. */\n markdownWithHtml?: boolean;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. 
Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. 
Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n markdownWithHtml?: boolean;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n 
convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.markdownWithHtml) {\n convertOptions.markdownWithHtml = true;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n 
if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.markdownWithHtml) {\n args.push('--markdown-with-html');\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', 
options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return 
args;\n}\n"],"mappings":";AAAA,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;;;ACkNvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,kBAAkB;AAC5B,SAAK,KAAK,sBAAsB;AAAA,EAClC;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,E
ACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;ADxTA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAIhB,UAAM,cAAc;AAAA,MAClB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAG;AAAA,IACL;AAEA,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AAKA,QAAC,MAA2C,aAAa;AACzD,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,
SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,WAAO,QAAQ,OAAO,WAAW;AAAA,EACnC;AAIA,SAAO,WAAW,aAAa,EAAE,cAAc,MAAM,CAAC;AACxD;AAaA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename","__dirname"]}
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opendataloader/pdf",
3
- "version": "2.4.3",
3
+ "version": "2.4.4",
4
4
  "description": "A Node.js wrapper for the opendataloader-pdf Java CLI.",
5
5
  "main": "./dist/index.cjs",
6
6
  "module": "./dist/index.js",