@opendataloader/pdf 2.3.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n toStdout?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;;;AC6FvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AAEA,SAAO;AACT;;;AD5QA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,gJAAgJ;AACvL,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,sNAAsN;AACzP,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,eAAe,6DAA6D;AAC7F;;;AH7BA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename","__dirname"]}
1
+ {"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { _runForCli } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n // _runForCli streams stdout/stderr to the parent process as they arrive;\n // we deliberately do not re-print anything here. (Issue #398.)\n await _runForCli(inputPaths, convertOptions);\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--sanitize', 'Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: default (border-based), cluster (border + cluster). Default: default');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: off, xycut. Default: xycut');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--image-output <value>', 'Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n program.option('--image-dir <value>', 'Directory for extracted images');\n program.option('--pages <value>', 'Pages to extract (e.g., \"1,3,5-7\"). Default: all pages');\n program.option('--include-header-footer', 'Include page headers and footers in output');\n program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');\n program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai');\n program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');\n program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');\n program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0');\n program.option('--hybrid-fallback', 'Opt in to Java fallback on hybrid backend error (default: disabled)');\n program.option('--hybrid-hancom-ai-regionlist-strategy <value>', 'DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list)');\n program.option('--hybrid-hancom-ai-ocr-strategy <value>', 'OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only)');\n program.option('--hybrid-hancom-ai-image-cache <value>', 'Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk');\n program.option('--to-stdout', 'Write output to stdout instead of file (single format only)');\n program.option('--threads <value>', 'Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode');\n}\n"],"mappings":";;;AACA,SAAS,SAAS,sBAAsB;;;ACDxC,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;;;ACwGvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ,WAAW;AAAA,EACpC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,YAAY;AACzB,mBAAe,aAAa,WAAW;AAAA,EACzC;AACA,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB,WAAW;AAAA,EAC5C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,kCAAkC;AAC/C,mBAAe,mCAAmC,WAAW;AAAA,EAC/D;AACA,MAAI,WAAW,2BAA2B;AACxC,mBAAe,4BAA4B,WAAW;AAAA,EACxD;AACA,MAAI,WAAW,0BAA0B;AACvC,mBAAe,2BAA2B,WAAW;AAAA,EACvD;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW;AAAA,EAC5B;AACA,MAAI,WAAW,SAAS;AACtB,mBAAe,UAAU,WAAW;AAAA,EACtC;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AD/SA,IAAMA,cAAa,cAAc,YAAY,GAAG;AAChD,IAAMC,aAAiB,aAAQD,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAKC,YAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,cAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AA2BA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;;;AEnKO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,gJAAgJ;AACvL,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,cAAc,mHAAmH;AAChJ,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,sGAAsG;AAC/I,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,0BAA0B,wHAAwH;AACjK,UAAQ,OAAO,0BAA0B,qEAAqE;AAC9G,UAAQ,OAAO,uBAAuB,gCAAgC;AACtE,UAAQ,OAAO,mBAAmB,wDAAwD;AAC1F,UAAQ,OAAO,2BAA2B,4CAA4C;AACtF,UAAQ,OAAO,0BAA0B,gHAAgH;AACzJ,UAAQ,OAAO,oBAAoB,iOAAiO;AACpQ,UAAQ,OAAO,yBAAyB,sGAAsG;AAC9I,UAAQ,OAAO,wBAAwB,+CAA+C;AACtF,UAAQ,OAAO,4BAA4B,6EAA6E;AACxH,UAAQ,OAAO,qBAAqB,qEAAqE;AACzG,UAAQ,OAAO,kDAAkD,8JAA8J;AAC/N,UAAQ,OAAO,2CAA2C,oIAAoI;AAC9L,UAAQ,OAAO,0CAA0C,uFAAuF;AAChJ,UAAQ,OAAO,eAAe,6DAA6D;AAC3F,UAAQ,OAAO,qBAAqB,iTAAiT;AACvV;;;AHjCA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,QAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AAGF,UAAM,WAAW,YAAY,cAAc;AAC3C,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":["__filename","__dirname"]}
package/dist/index.cjs CHANGED
@@ -30,6 +30,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
+ _runForCli: () => _runForCli,
33
34
  buildArgs: () => buildArgs,
34
35
  convert: () => convert,
35
36
  run: () => run
@@ -44,6 +45,7 @@ var importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
44
45
  var import_child_process = require("child_process");
45
46
  var path = __toESM(require("path"), 1);
46
47
  var fs = __toESM(require("fs"), 1);
48
+ var import_string_decoder = require("string_decoder");
47
49
  var import_url = require("url");
48
50
 
49
51
  // src/convert-options.generated.ts
@@ -136,9 +138,21 @@ function buildArgs(options) {
136
138
  if (options.hybridFallback) {
137
139
  args.push("--hybrid-fallback");
138
140
  }
141
+ if (options.hybridHancomAiRegionlistStrategy) {
142
+ args.push("--hybrid-hancom-ai-regionlist-strategy", options.hybridHancomAiRegionlistStrategy);
143
+ }
144
+ if (options.hybridHancomAiOcrStrategy) {
145
+ args.push("--hybrid-hancom-ai-ocr-strategy", options.hybridHancomAiOcrStrategy);
146
+ }
147
+ if (options.hybridHancomAiImageCache) {
148
+ args.push("--hybrid-hancom-ai-image-cache", options.hybridHancomAiImageCache);
149
+ }
139
150
  if (options.toStdout) {
140
151
  args.push("--to-stdout");
141
152
  }
153
+ if (options.threads) {
154
+ args.push("--threads", options.threads);
155
+ }
142
156
  return args;
143
157
  }
144
158
 
@@ -160,21 +174,39 @@ function executeJar(args, executionOptions = {}) {
160
174
  const javaProcess = (0, import_child_process.spawn)(command, commandArgs);
161
175
  let stdout = "";
162
176
  let stderr = "";
177
+ const stdoutDecoder = new import_string_decoder.StringDecoder("utf8");
178
+ const stderrDecoder = new import_string_decoder.StringDecoder("utf8");
163
179
  javaProcess.stdout.on("data", (data) => {
164
- const chunk = data.toString();
180
+ const chunk = stdoutDecoder.write(data);
181
+ if (chunk.length === 0) return;
165
182
  if (streamOutput) {
166
183
  process.stdout.write(chunk);
184
+ } else {
185
+ stdout += chunk;
167
186
  }
168
- stdout += chunk;
169
187
  });
170
188
  javaProcess.stderr.on("data", (data) => {
171
- const chunk = data.toString();
189
+ const chunk = stderrDecoder.write(data);
190
+ if (chunk.length === 0) return;
172
191
  if (streamOutput) {
173
192
  process.stderr.write(chunk);
174
193
  }
175
194
  stderr += chunk;
176
195
  });
177
196
  javaProcess.on("close", (code) => {
197
+ const stdoutTail = stdoutDecoder.end();
198
+ const stderrTail = stderrDecoder.end();
199
+ if (stdoutTail.length > 0) {
200
+ if (streamOutput) {
201
+ process.stdout.write(stdoutTail);
202
+ } else {
203
+ stdout += stdoutTail;
204
+ }
205
+ }
206
+ if (stderrTail.length > 0) {
207
+ if (streamOutput) process.stderr.write(stderrTail);
208
+ stderr += stderrTail;
209
+ }
178
210
  if (code === 0) {
179
211
  resolve(stdout);
180
212
  } else {
@@ -200,20 +232,31 @@ ${errorOutput}`
200
232
  });
201
233
  });
202
234
  }
203
- function convert(inputPaths, options = {}) {
235
+ function buildJarArgs(inputPaths, options) {
204
236
  const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
205
237
  if (inputList.length === 0) {
206
- return Promise.reject(new Error("At least one input path must be provided."));
238
+ return new Error("At least one input path must be provided.");
207
239
  }
208
240
  for (const input of inputList) {
209
241
  if (!fs.existsSync(input)) {
210
- return Promise.reject(new Error(`Input file or folder not found: ${input}`));
242
+ return new Error(`Input file or folder not found: ${input}`);
211
243
  }
212
244
  }
213
- const args = [...inputList, ...buildArgs(options)];
214
- return executeJar(args, {
215
- streamOutput: !options.quiet
216
- });
245
+ return [...inputList, ...buildArgs(options)];
246
+ }
247
+ function convert(inputPaths, options = {}) {
248
+ const argsOrError = buildJarArgs(inputPaths, options);
249
+ if (argsOrError instanceof Error) {
250
+ return Promise.reject(argsOrError);
251
+ }
252
+ return executeJar(argsOrError, { streamOutput: false });
253
+ }
254
+ async function _runForCli(inputPaths, options = {}) {
255
+ const argsOrError = buildJarArgs(inputPaths, options);
256
+ if (argsOrError instanceof Error) {
257
+ throw argsOrError;
258
+ }
259
+ await executeJar(argsOrError, { streamOutput: true });
217
260
  }
218
261
  function run(inputPath, options = {}) {
219
262
  console.warn(
@@ -251,6 +294,7 @@ function run(inputPath, options = {}) {
251
294
  }
252
295
  // Annotate the CommonJS export names for ESM import in node:
253
296
  0 && (module.exports = {
297
+ _runForCli,
254
298
  buildArgs,
255
299
  convert,
256
300
  run
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.9_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n toStdout?: boolean;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n\n return args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ADZ9D,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;AEqLvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AAEA,SAAO;AACT;;;AF5QA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename"]}
1
+ {"version":3,"sources":["../src/index.ts","../node_modules/.pnpm/tsup@8.5.1_postcss@8.5.9_tsx@4.20.5_typescript@6.0.2/node_modules/tsup/assets/cjs_shims.js","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { StringDecoder } from 'string_decoder';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n /**\n * When true, forwards Java's stdout and stderr chunks to the parent\n * process in real time as well as accumulating them. Used by the bundled\n * CLI so long-running conversions show progress as it happens.\n */\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n // StringDecoder buffers incomplete multi-byte UTF-8 sequences across\n // chunk boundaries — Buffer.toString() alone would emit replacement\n // characters when, e.g., a 3-byte Korean codepoint splits across two\n // 'data' events. One decoder per stream so they don't share state.\n const stdoutDecoder = new StringDecoder('utf8');\n const stderrDecoder = new StringDecoder('utf8');\n\n javaProcess.stdout.on('data', (data: Buffer) => {\n const chunk = stdoutDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n // Stream-only: don't also accumulate, or a multi-hour conversion\n // would buffer its entire (potentially gigabyte) stdout in memory\n // for no consumer.\n process.stdout.write(chunk);\n } else {\n stdout += chunk;\n }\n });\n\n javaProcess.stderr.on('data', (data: Buffer) => {\n const chunk = stderrDecoder.write(data);\n if (chunk.length === 0) return;\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n // stderr is always accumulated (progress logs are small and we need\n // them for error messages on non-zero exit).\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n // Flush any trailing bytes the decoder is still holding (always emit\n // them — if we drop them on error paths, error messages with non-ASCII\n // characters lose their tail).\n const stdoutTail = stdoutDecoder.end();\n const stderrTail = stderrDecoder.end();\n if (stdoutTail.length > 0) {\n if (streamOutput) {\n process.stdout.write(stdoutTail);\n } else {\n stdout += stdoutTail;\n }\n }\n if (stderrTail.length > 0) {\n if (streamOutput) process.stderr.write(stderrTail);\n stderr += stderrTail;\n }\n\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nfunction buildJarArgs(\n inputPaths: string | string[],\n options: ConvertOptions,\n): string[] | Error {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return new Error('At least one input path must be provided.');\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return new Error(`Input file or folder not found: ${input}`);\n }\n }\n\n return [...inputList, ...buildArgs(options)];\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n return Promise.reject(argsOrError);\n }\n // Library API: never streams to the parent process. Returns the full stdout\n // string so callers can do `const out = await convert(...)` without surprise\n // side-effects on process.stdout / process.stderr.\n return executeJar(argsOrError, { streamOutput: false });\n}\n\n/**\n * Internal entry point used by the bundled CLI. Streams Java's stdout and\n * stderr to the parent process in real time (so long-running conversions like\n * hybrid mode show progress as it happens) and resolves without a stdout\n * payload — preventing the caller from re-printing what was already streamed.\n *\n * Not part of the public API: do not import this from application code. Use\n * {@link convert} instead.\n *\n * @internal\n */\nexport async function _runForCli(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<void> {\n const argsOrError = buildJarArgs(inputPaths, options);\n if (argsOrError instanceof Error) {\n throw argsOrError;\n }\n await executeJar(argsOrError, { streamOutput: true });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () => \n typeof document === \"undefined\" \n ? new URL(`file:${__filename}`).href \n : (document.currentScript && document.currentScript.tagName.toUpperCase() === 'SCRIPT') \n ? document.currentScript.src \n : new URL(\"main.js\", document.baseURI).href;\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images, tagged-pdf. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Enable sensitive data sanitization. Replaces emails, phone numbers, IPs, credit cards, and URLs with placeholders */\n sanitize?: boolean;\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: default (border-based), cluster (border + cluster). Default: default */\n tableMethod?: string;\n /** Reading order algorithm. Values: off, xycut. Default: xycut */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Image output mode. Values: off (no images), embedded (Base64 data URIs), external (file references). Default: external */\n imageOutput?: string;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n /** Directory for extracted images */\n imageDir?: string;\n /** Pages to extract (e.g., \"1,3,5-7\"). Default: all pages */\n pages?: string;\n /** Include page headers and footers in output */\n includeHeaderFooter?: boolean;\n /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */\n detectStrikethrough?: boolean;\n /** Hybrid backend (requires a running server). Quick start: pip install \"opendataloader-pdf[hybrid]\" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */\n hybrid?: string;\n /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */\n hybridMode?: string;\n /** Hybrid backend server URL (overrides default) */\n hybridUrl?: string;\n /** Hybrid backend request timeout in milliseconds (0 = no timeout). Default: 0 */\n hybridTimeout?: string;\n /** Opt in to Java fallback on hybrid backend error (default: disabled) */\n hybridFallback?: boolean;\n /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */\n hybridHancomAiRegionlistStrategy?: string;\n /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */\n hybridHancomAiOcrStrategy?: string;\n /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */\n hybridHancomAiImageCache?: string;\n /** Write output to stdout instead of file (single format only) */\n toStdout?: boolean;\n /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */\n threads?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n sanitize?: boolean;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n imageOutput?: string;\n imageFormat?: string;\n imageDir?: string;\n pages?: string;\n includeHeaderFooter?: boolean;\n detectStrikethrough?: boolean;\n hybrid?: string;\n hybridMode?: string;\n hybridUrl?: string;\n hybridTimeout?: string;\n hybridFallback?: boolean;\n hybridHancomAiRegionlistStrategy?: string;\n hybridHancomAiOcrStrategy?: string;\n hybridHancomAiImageCache?: string;\n toStdout?: boolean;\n threads?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.sanitize) {\n convertOptions.sanitize = true;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.imageOutput) {\n convertOptions.imageOutput = cliOptions.imageOutput;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n if (cliOptions.imageDir) {\n convertOptions.imageDir = cliOptions.imageDir;\n }\n if (cliOptions.pages) {\n convertOptions.pages = cliOptions.pages;\n }\n if (cliOptions.includeHeaderFooter) {\n convertOptions.includeHeaderFooter = true;\n }\n if (cliOptions.detectStrikethrough) {\n convertOptions.detectStrikethrough = true;\n }\n if (cliOptions.hybrid) {\n convertOptions.hybrid = cliOptions.hybrid;\n }\n if (cliOptions.hybridMode) {\n convertOptions.hybridMode = cliOptions.hybridMode;\n }\n if (cliOptions.hybridUrl) {\n convertOptions.hybridUrl = cliOptions.hybridUrl;\n }\n if (cliOptions.hybridTimeout) {\n convertOptions.hybridTimeout = cliOptions.hybridTimeout;\n }\n if (cliOptions.hybridFallback) {\n convertOptions.hybridFallback = true;\n }\n if (cliOptions.hybridHancomAiRegionlistStrategy) {\n convertOptions.hybridHancomAiRegionlistStrategy = cliOptions.hybridHancomAiRegionlistStrategy;\n }\n if (cliOptions.hybridHancomAiOcrStrategy) {\n convertOptions.hybridHancomAiOcrStrategy = cliOptions.hybridHancomAiOcrStrategy;\n }\n if (cliOptions.hybridHancomAiImageCache) {\n convertOptions.hybridHancomAiImageCache = cliOptions.hybridHancomAiImageCache;\n }\n if (cliOptions.toStdout) {\n convertOptions.toStdout = true;\n }\n if (cliOptions.threads) {\n convertOptions.threads = cliOptions.threads;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.sanitize) {\n args.push('--sanitize');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.imageOutput) {\n args.push('--image-output', options.imageOutput);\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n if (options.imageDir) {\n args.push('--image-dir', options.imageDir);\n }\n if (options.pages) {\n args.push('--pages', options.pages);\n }\n if (options.includeHeaderFooter) {\n args.push('--include-header-footer');\n }\n if (options.detectStrikethrough) {\n args.push('--detect-strikethrough');\n }\n if (options.hybrid) {\n args.push('--hybrid', options.hybrid);\n }\n if (options.hybridMode) {\n args.push('--hybrid-mode', options.hybridMode);\n }\n if (options.hybridUrl) {\n args.push('--hybrid-url', options.hybridUrl);\n }\n if (options.hybridTimeout) {\n args.push('--hybrid-timeout', options.hybridTimeout);\n }\n if (options.hybridFallback) {\n args.push('--hybrid-fallback');\n }\n if (options.hybridHancomAiRegionlistStrategy) {\n args.push('--hybrid-hancom-ai-regionlist-strategy', options.hybridHancomAiRegionlistStrategy);\n }\n if (options.hybridHancomAiOcrStrategy) {\n args.push('--hybrid-hancom-ai-ocr-strategy', options.hybridHancomAiOcrStrategy);\n }\n if (options.hybridHancomAiImageCache) {\n args.push('--hybrid-hancom-ai-image-cache', options.hybridHancomAiImageCache);\n }\n if (options.toStdout) {\n args.push('--to-stdout');\n }\n if (options.threads) {\n args.push('--threads', options.threads);\n }\n\n return args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,QAAQ,YAAY,MAAM,WAC1E,SAAS,cAAc,MACvB,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEtC,IAAM,gBAAgC,iCAAiB;;;ADZ9D,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,4BAA8B;AAC9B,iBAA8B;;;AE4MvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,YAAY;AAAA,EACxB;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,eAAe,QAAQ,QAAQ;AAAA,EAC3C;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,WAAW,QAAQ,KAAK;AAAA,EACpC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,yBAAyB;AAAA,EACrC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,wBAAwB;AAAA,EACpC;AACA,MAAI,QAAQ,QAAQ;AAClB,SAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,EACtC;AACA,MAAI,QAAQ,YAAY;AACtB,SAAK,KAAK,iBAAiB,QAAQ,UAAU;AAAA,EAC/C;AACA,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,oBAAoB,QAAQ,aAAa;AAAA,EACrD;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,kCAAkC;AAC5C,SAAK,KAAK,0CAA0C,QAAQ,gCAAgC;AAAA,EAC9F;AACA,MAAI,QAAQ,2BAA2B;AACrC,SAAK,KAAK,mCAAmC,QAAQ,yBAAyB;AAAA,EAChF;AACA,MAAI,QAAQ,0BAA0B;AACpC,SAAK,KAAK,kCAAkC,QAAQ,wBAAwB;AAAA,EAC9E;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,aAAa;AAAA,EACzB;AACA,MAAI,QAAQ,SAAS;AACnB,SAAK,KAAK,aAAa,QAAQ,OAAO;AAAA,EACxC;AAEA,SAAO;AACT;;;AF/SA,IAAMA,kBAAa,0BAAc,aAAe;AAChD,IAAM,YAAiB,aAAQA,WAAU;AAEzC,IAAM,WAAW;AAWjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAKb,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAC9C,UAAM,gBAAgB,IAAI,oCAAc,MAAM;AAE9C,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAIhB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B,OAAO;AACL,kBAAU;AAAA,MACZ;AAAA,IACF,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAiB;AAC9C,YAAM,QAAQ,cAAc,MAAM,IAAI;AACtC,UAAI,MAAM,WAAW,EAAG;AACxB,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AAGA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAIhC,YAAM,aAAa,cAAc,IAAI;AACrC,YAAM,aAAa,cAAc,IAAI;AACrC,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,cAAc;AAChB,kBAAQ,OAAO,MAAM,UAAU;AAAA,QACjC,OAAO;AACL,oBAAU;AAAA,QACZ;AAAA,MACF;AACA,UAAI,WAAW,SAAS,GAAG;AACzB,YAAI,aAAc,SAAQ,OAAO,MAAM,UAAU;AACjD,kBAAU;AAAA,MACZ;AAEA,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEA,SAAS,aACP,YACA,SACkB;AAClB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,IAAI,MAAM,2CAA2C;AAAA,EAC9D;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE;AAAA,IAC7D;AAAA,EACF;AAEA,SAAO,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAC7C;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,WAAO,QAAQ,OAAO,WAAW;AAAA,EACnC;AAIA,SAAO,WAAW,aAAa,EAAE,cAAc,MAAM,CAAC;AACxD;AAaA,eAAsB,WACpB,YACA,UAA0B,CAAC,GACZ;AACf,QAAM,cAAc,aAAa,YAAY,OAAO;AACpD,MAAI,uBAAuB,OAAO;AAChC,UAAM;AAAA,EACR;AACA,QAAM,WAAW,aAAa,EAAE,cAAc,KAAK,CAAC;AACtD;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":["__filename"]}
package/dist/index.d.cts CHANGED
@@ -42,7 +42,7 @@ interface ConvertOptions {
42
42
  includeHeaderFooter?: boolean;
43
43
  /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */
44
44
  detectStrikethrough?: boolean;
45
- /** Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast */
45
+ /** Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */
46
46
  hybrid?: string;
47
47
  /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */
48
48
  hybridMode?: string;
@@ -52,8 +52,16 @@ interface ConvertOptions {
52
52
  hybridTimeout?: string;
53
53
  /** Opt in to Java fallback on hybrid backend error (default: disabled) */
54
54
  hybridFallback?: boolean;
55
+ /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */
56
+ hybridHancomAiRegionlistStrategy?: string;
57
+ /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */
58
+ hybridHancomAiOcrStrategy?: string;
59
+ /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */
60
+ hybridHancomAiImageCache?: string;
55
61
  /** Write output to stdout instead of file (single format only) */
56
62
  toStdout?: boolean;
63
+ /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */
64
+ threads?: string;
57
65
  }
58
66
  /**
59
67
  * Build CLI arguments array from ConvertOptions.
package/dist/index.d.ts CHANGED
@@ -42,7 +42,7 @@ interface ConvertOptions {
42
42
  includeHeaderFooter?: boolean;
43
43
  /** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */
44
44
  detectStrikethrough?: boolean;
45
- /** Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast */
45
+ /** Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast, hancom-ai */
46
46
  hybrid?: string;
47
47
  /** Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) */
48
48
  hybridMode?: string;
@@ -52,8 +52,16 @@ interface ConvertOptions {
52
52
  hybridTimeout?: string;
53
53
  /** Opt in to Java fallback on hybrid backend error (default: disabled) */
54
54
  hybridFallback?: boolean;
55
+ /** DLA label 7 (regionlist) handling. Requires --hybrid=hancom-ai. Values: table-first (default; check TSR overlap), list-only (skip TSR, always treat as list) */
56
+ hybridHancomAiRegionlistStrategy?: string;
57
+ /** OCR strategy. Requires --hybrid=hancom-ai. Values: off (stream-only), auto (default; stream first, OCR fallback), force (OCR-only) */
58
+ hybridHancomAiOcrStrategy?: string;
59
+ /** Page image cache backing. Requires --hybrid=hancom-ai. Values: memory (default), disk */
60
+ hybridHancomAiImageCache?: string;
55
61
  /** Write output to stdout instead of file (single format only) */
56
62
  toStdout?: boolean;
63
+ /** Number of worker threads for per-page processing. Default: 1 (sequential, stable). Values >1 (experimental) run pages in parallel for faster throughput; output may vary slightly on some PDFs. Capped at the number of available CPU cores. Applies to the native Java pipeline only; ignored in --hybrid mode */
64
+ threads?: string;
57
65
  }
58
66
  /**
59
67
  * Build CLI arguments array from ConvertOptions.
package/dist/index.js CHANGED
@@ -2,6 +2,7 @@
2
2
  import { spawn } from "child_process";
3
3
  import * as path from "path";
4
4
  import * as fs from "fs";
5
+ import { StringDecoder } from "string_decoder";
5
6
  import { fileURLToPath } from "url";
6
7
 
7
8
  // src/convert-options.generated.ts
@@ -94,9 +95,21 @@ function buildArgs(options) {
94
95
  if (options.hybridFallback) {
95
96
  args.push("--hybrid-fallback");
96
97
  }
98
+ if (options.hybridHancomAiRegionlistStrategy) {
99
+ args.push("--hybrid-hancom-ai-regionlist-strategy", options.hybridHancomAiRegionlistStrategy);
100
+ }
101
+ if (options.hybridHancomAiOcrStrategy) {
102
+ args.push("--hybrid-hancom-ai-ocr-strategy", options.hybridHancomAiOcrStrategy);
103
+ }
104
+ if (options.hybridHancomAiImageCache) {
105
+ args.push("--hybrid-hancom-ai-image-cache", options.hybridHancomAiImageCache);
106
+ }
97
107
  if (options.toStdout) {
98
108
  args.push("--to-stdout");
99
109
  }
110
+ if (options.threads) {
111
+ args.push("--threads", options.threads);
112
+ }
100
113
  return args;
101
114
  }
102
115
 
@@ -118,21 +131,39 @@ function executeJar(args, executionOptions = {}) {
118
131
  const javaProcess = spawn(command, commandArgs);
119
132
  let stdout = "";
120
133
  let stderr = "";
134
+ const stdoutDecoder = new StringDecoder("utf8");
135
+ const stderrDecoder = new StringDecoder("utf8");
121
136
  javaProcess.stdout.on("data", (data) => {
122
- const chunk = data.toString();
137
+ const chunk = stdoutDecoder.write(data);
138
+ if (chunk.length === 0) return;
123
139
  if (streamOutput) {
124
140
  process.stdout.write(chunk);
141
+ } else {
142
+ stdout += chunk;
125
143
  }
126
- stdout += chunk;
127
144
  });
128
145
  javaProcess.stderr.on("data", (data) => {
129
- const chunk = data.toString();
146
+ const chunk = stderrDecoder.write(data);
147
+ if (chunk.length === 0) return;
130
148
  if (streamOutput) {
131
149
  process.stderr.write(chunk);
132
150
  }
133
151
  stderr += chunk;
134
152
  });
135
153
  javaProcess.on("close", (code) => {
154
+ const stdoutTail = stdoutDecoder.end();
155
+ const stderrTail = stderrDecoder.end();
156
+ if (stdoutTail.length > 0) {
157
+ if (streamOutput) {
158
+ process.stdout.write(stdoutTail);
159
+ } else {
160
+ stdout += stdoutTail;
161
+ }
162
+ }
163
+ if (stderrTail.length > 0) {
164
+ if (streamOutput) process.stderr.write(stderrTail);
165
+ stderr += stderrTail;
166
+ }
136
167
  if (code === 0) {
137
168
  resolve(stdout);
138
169
  } else {
@@ -158,20 +189,31 @@ ${errorOutput}`
158
189
  });
159
190
  });
160
191
  }
161
- function convert(inputPaths, options = {}) {
192
+ function buildJarArgs(inputPaths, options) {
162
193
  const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
163
194
  if (inputList.length === 0) {
164
- return Promise.reject(new Error("At least one input path must be provided."));
195
+ return new Error("At least one input path must be provided.");
165
196
  }
166
197
  for (const input of inputList) {
167
198
  if (!fs.existsSync(input)) {
168
- return Promise.reject(new Error(`Input file or folder not found: ${input}`));
199
+ return new Error(`Input file or folder not found: ${input}`);
169
200
  }
170
201
  }
171
- const args = [...inputList, ...buildArgs(options)];
172
- return executeJar(args, {
173
- streamOutput: !options.quiet
174
- });
202
+ return [...inputList, ...buildArgs(options)];
203
+ }
204
+ function convert(inputPaths, options = {}) {
205
+ const argsOrError = buildJarArgs(inputPaths, options);
206
+ if (argsOrError instanceof Error) {
207
+ return Promise.reject(argsOrError);
208
+ }
209
+ return executeJar(argsOrError, { streamOutput: false });
210
+ }
211
+ async function _runForCli(inputPaths, options = {}) {
212
+ const argsOrError = buildJarArgs(inputPaths, options);
213
+ if (argsOrError instanceof Error) {
214
+ throw argsOrError;
215
+ }
216
+ await executeJar(argsOrError, { streamOutput: true });
175
217
  }
176
218
  function run(inputPath, options = {}) {
177
219
  console.warn(
@@ -208,6 +250,7 @@ function run(inputPath, options = {}) {
208
250
  });
209
251
  }
210
252
  export {
253
+ _runForCli,
211
254
  buildArgs,
212
255
  convert,
213
256
  run