@opendataloader/pdf 1.2.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -31,12 +31,126 @@ var import_child_process = require("child_process");
31
31
  var path = __toESM(require("path"), 1);
32
32
  var fs = __toESM(require("fs"), 1);
33
33
  var import_url = require("url");
34
+
35
+ // src/convert-options.generated.ts
36
+ function buildConvertOptions(cliOptions) {
37
+ const convertOptions = {};
38
+ if (cliOptions.outputDir) {
39
+ convertOptions.outputDir = cliOptions.outputDir;
40
+ }
41
+ if (cliOptions.password) {
42
+ convertOptions.password = cliOptions.password;
43
+ }
44
+ if (cliOptions.format) {
45
+ convertOptions.format = cliOptions.format;
46
+ }
47
+ if (cliOptions.quiet) {
48
+ convertOptions.quiet = true;
49
+ }
50
+ if (cliOptions.contentSafetyOff) {
51
+ convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
52
+ }
53
+ if (cliOptions.keepLineBreaks) {
54
+ convertOptions.keepLineBreaks = true;
55
+ }
56
+ if (cliOptions.replaceInvalidChars) {
57
+ convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
58
+ }
59
+ if (cliOptions.useStructTree) {
60
+ convertOptions.useStructTree = true;
61
+ }
62
+ if (cliOptions.tableMethod) {
63
+ convertOptions.tableMethod = cliOptions.tableMethod;
64
+ }
65
+ if (cliOptions.readingOrder) {
66
+ convertOptions.readingOrder = cliOptions.readingOrder;
67
+ }
68
+ if (cliOptions.markdownPageSeparator) {
69
+ convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
70
+ }
71
+ if (cliOptions.textPageSeparator) {
72
+ convertOptions.textPageSeparator = cliOptions.textPageSeparator;
73
+ }
74
+ if (cliOptions.htmlPageSeparator) {
75
+ convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
76
+ }
77
+ if (cliOptions.embedImages) {
78
+ convertOptions.embedImages = true;
79
+ }
80
+ if (cliOptions.imageFormat) {
81
+ convertOptions.imageFormat = cliOptions.imageFormat;
82
+ }
83
+ return convertOptions;
84
+ }
85
+ function buildArgs(options) {
86
+ const args = [];
87
+ if (options.outputDir) {
88
+ args.push("--output-dir", options.outputDir);
89
+ }
90
+ if (options.password) {
91
+ args.push("--password", options.password);
92
+ }
93
+ if (options.format) {
94
+ if (Array.isArray(options.format)) {
95
+ if (options.format.length > 0) {
96
+ args.push("--format", options.format.join(","));
97
+ }
98
+ } else {
99
+ args.push("--format", options.format);
100
+ }
101
+ }
102
+ if (options.quiet) {
103
+ args.push("--quiet");
104
+ }
105
+ if (options.contentSafetyOff) {
106
+ if (Array.isArray(options.contentSafetyOff)) {
107
+ if (options.contentSafetyOff.length > 0) {
108
+ args.push("--content-safety-off", options.contentSafetyOff.join(","));
109
+ }
110
+ } else {
111
+ args.push("--content-safety-off", options.contentSafetyOff);
112
+ }
113
+ }
114
+ if (options.keepLineBreaks) {
115
+ args.push("--keep-line-breaks");
116
+ }
117
+ if (options.replaceInvalidChars) {
118
+ args.push("--replace-invalid-chars", options.replaceInvalidChars);
119
+ }
120
+ if (options.useStructTree) {
121
+ args.push("--use-struct-tree");
122
+ }
123
+ if (options.tableMethod) {
124
+ args.push("--table-method", options.tableMethod);
125
+ }
126
+ if (options.readingOrder) {
127
+ args.push("--reading-order", options.readingOrder);
128
+ }
129
+ if (options.markdownPageSeparator) {
130
+ args.push("--markdown-page-separator", options.markdownPageSeparator);
131
+ }
132
+ if (options.textPageSeparator) {
133
+ args.push("--text-page-separator", options.textPageSeparator);
134
+ }
135
+ if (options.htmlPageSeparator) {
136
+ args.push("--html-page-separator", options.htmlPageSeparator);
137
+ }
138
+ if (options.embedImages) {
139
+ args.push("--embed-images");
140
+ }
141
+ if (options.imageFormat) {
142
+ args.push("--image-format", options.imageFormat);
143
+ }
144
+ return args;
145
+ }
146
+
147
+ // src/index.ts
34
148
  var import_meta = {};
35
149
  var __filename = (0, import_url.fileURLToPath)(import_meta.url);
36
150
  var __dirname = path.dirname(__filename);
37
151
  var JAR_NAME = "opendataloader-pdf-cli.jar";
38
152
  function executeJar(args, executionOptions = {}) {
39
- const { debug = false, streamOutput = false } = executionOptions;
153
+ const { streamOutput = false } = executionOptions;
40
154
  return new Promise((resolve, reject) => {
41
155
  const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
42
156
  if (!fs.existsSync(jarPath)) {
@@ -90,67 +204,45 @@ ${errorOutput}`
90
204
  });
91
205
  }
92
206
  function convert(inputPaths, options = {}) {
93
- if (inputPaths.length === 0) {
207
+ const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
208
+ if (inputList.length === 0) {
94
209
  return Promise.reject(new Error("At least one input path must be provided."));
95
210
  }
96
- for (const input of inputPaths) {
211
+ for (const input of inputList) {
97
212
  if (!fs.existsSync(input)) {
98
213
  return Promise.reject(new Error(`Input file or folder not found: ${input}`));
99
214
  }
100
215
  }
101
- const args = [...inputPaths];
102
- if (options.outputDir) {
103
- args.push("--output-dir", options.outputDir);
104
- }
105
- if (options.password) {
106
- args.push("--password", options.password);
107
- }
108
- if (options.format && options.format.length > 0) {
109
- args.push("--format", ...options.format);
110
- }
111
- if (options.quiet) {
112
- args.push("--quiet");
113
- }
114
- if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
115
- args.push("--content-safety-off", ...options.contentSafetyOff);
116
- }
117
- if (options.keepLineBreaks) {
118
- args.push("--keep-line-breaks");
119
- }
120
- if (options.replaceInvalidChars) {
121
- args.push("--replace-invalid-chars", options.replaceInvalidChars);
122
- }
123
- if (options.useStructTree) {
124
- args.push("--use-struct-tree");
125
- }
216
+ const args = [...inputList, ...buildArgs(options)];
126
217
  return executeJar(args, {
127
218
  streamOutput: !options.quiet
128
219
  });
129
220
  }
130
221
 
222
+ // src/cli-options.generated.ts
223
+ function registerCliOptions(program) {
224
+ program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
225
+ program.option("-p, --password <value>", "Password for encrypted PDF files");
226
+ program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
227
+ program.option("-q, --quiet", "Suppress console logging output");
228
+ program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
229
+ program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
230
+ program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
231
+ program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
232
+ program.option("--table-method <value>", "Table detection method. Values: cluster");
233
+ program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
234
+ program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
235
+ program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
236
+ program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
237
+ program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
238
+ program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
239
+ }
240
+
131
241
  // src/cli.ts
132
- var VALID_FORMATS = /* @__PURE__ */ new Set([
133
- "json",
134
- "text",
135
- "html",
136
- "pdf",
137
- "markdown",
138
- "markdown-with-html",
139
- "markdown-with-images"
140
- ]);
141
- var VALID_CONTENT_SAFETY_MODES = /* @__PURE__ */ new Set([
142
- "all",
143
- "hidden-text",
144
- "off-page",
145
- "tiny",
146
- "hidden-ocg"
147
- ]);
148
242
  function createProgram() {
149
243
  const program = new import_commander.Command();
150
- program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert").option("-o, --output-dir <path>", "Directory where outputs are written").option("-p, --password <password>", "Password for encrypted PDFs").option(
151
- "-f, --format <value...>",
152
- "Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)"
153
- ).option("-q, --quiet", "Suppress CLI logging output").option("--content-safety-off <mode...>", "Disable one or more content safety filters").option("--keep-line-breaks", "Preserve line breaks in text output").option("--replace-invalid-chars <c>", "Replacement character for invalid characters").option("--use-struct-tree", "Enable processing structure tree (disabled by default)");
244
+ program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
245
+ registerCliOptions(program);
154
246
  program.configureOutput({
155
247
  writeErr: (str) => {
156
248
  console.error(str.trimEnd());
@@ -161,34 +253,6 @@ function createProgram() {
161
253
  });
162
254
  return program;
163
255
  }
164
- function buildConvertOptions(options) {
165
- const convertOptions = {};
166
- if (options.outputDir) {
167
- convertOptions.outputDir = options.outputDir;
168
- }
169
- if (options.password) {
170
- convertOptions.password = options.password;
171
- }
172
- if (options.format && options.format.length > 0) {
173
- convertOptions.format = options.format;
174
- }
175
- if (options.quiet) {
176
- convertOptions.quiet = true;
177
- }
178
- if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
179
- convertOptions.contentSafetyOff = options.contentSafetyOff;
180
- }
181
- if (options.keepLineBreaks) {
182
- convertOptions.keepLineBreaks = true;
183
- }
184
- if (options.replaceInvalidChars) {
185
- convertOptions.replaceInvalidChars = options.replaceInvalidChars;
186
- }
187
- if (options.useStructTree) {
188
- convertOptions.useStructTree = true;
189
- }
190
- return convertOptions;
191
- }
192
256
  async function main() {
193
257
  const program = createProgram();
194
258
  program.exitOverride();
@@ -208,24 +272,6 @@ async function main() {
208
272
  }
209
273
  const cliOptions = program.opts();
210
274
  const inputPaths = program.args;
211
- if (cliOptions.format) {
212
- for (const value of cliOptions.format) {
213
- if (!VALID_FORMATS.has(value)) {
214
- console.error(`Invalid format '${value}'. See '--help' for allowed values.`);
215
- console.error("Use '--help' to see available options.");
216
- return 1;
217
- }
218
- }
219
- }
220
- if (cliOptions.contentSafetyOff) {
221
- for (const value of cliOptions.contentSafetyOff) {
222
- if (!VALID_CONTENT_SAFETY_MODES.has(value)) {
223
- console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);
224
- console.error("Use '--help' to see available options.");
225
- return 1;
226
- }
227
- }
228
- }
229
275
  const convertOptions = buildConvertOptions(cliOptions);
230
276
  try {
231
277
  const output = await convert(inputPaths, convertOptions);
package/dist/cli.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/cli.ts","../src/index.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert, ConvertOptions } from './index.js';\n\ninterface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string[];\n quiet?: boolean;\n contentSafetyOff?: string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nconst VALID_FORMATS = new Set([\n 'json',\n 'text',\n 'html',\n 'pdf',\n 'markdown',\n 'markdown-with-html',\n 'markdown-with-images',\n]);\n\nconst VALID_CONTENT_SAFETY_MODES = new Set([\n 'all',\n 'hidden-text',\n 'off-page',\n 'tiny',\n 'hidden-ocg',\n]);\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert')\n .option('-o, --output-dir <path>', 'Directory where outputs are written')\n .option('-p, --password <password>', 'Password for encrypted PDFs')\n .option(\n '-f, --format <value...>',\n 'Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)',\n )\n .option('-q, --quiet', 'Suppress CLI logging output')\n .option('--content-safety-off <mode...>', 'Disable one or more content safety filters')\n .option('--keep-line-breaks', 'Preserve line breaks in text output')\n .option('--replace-invalid-chars <c>', 'Replacement character for invalid characters')\n .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)');\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nfunction buildConvertOptions(options: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (options.outputDir) {\n convertOptions.outputDir = options.outputDir;\n }\n if (options.password) {\n convertOptions.password = options.password;\n }\n if (options.format && options.format.length > 0) {\n convertOptions.format = options.format;\n }\n if (options.quiet) {\n convertOptions.quiet = true;\n }\n if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {\n convertOptions.contentSafetyOff = options.contentSafetyOff;\n }\n if (options.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (options.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = options.replaceInvalidChars;\n }\n if (options.useStructTree) {\n convertOptions.useStructTree = true;\n }\n\n return convertOptions;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n\n if (cliOptions.format) {\n for (const value of cliOptions.format) {\n if (!VALID_FORMATS.has(value)) {\n console.error(`Invalid format '${value}'. See '--help' for allowed values.`);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n }\n }\n\n if (cliOptions.contentSafetyOff) {\n for (const value of cliOptions.contentSafetyOff) {\n if (!VALID_CONTENT_SAFETY_MODES.has(value)) {\n console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n }\n }\n\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n debug?: boolean;\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { debug = false, streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n if (options.noJson) {\n args.push('--no-json');\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n args.push(inputPath);\n executeJar(args, {\n debug: options.debug,\n streamOutput: Boolean(options.debug),\n })\n .then(resolve)\n .catch(reject);\n });\n}\n\nexport interface ConvertOptions {\n outputDir?: string;\n password?: string;\n format?: string[];\n quiet?: boolean;\n contentSafetyOff?: string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nexport function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {\n if (inputPaths.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputPaths) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputPaths];\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format && options.format.length > 0) {\n args.push('--format', ...options.format);\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', ...options.contentSafetyOff);\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;AAH9B;AAKA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAOjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,QAAQ,OAAO,eAAe,MAAM,IAAI;AAEhD,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAmFO,SAAS,QAAQ,YAAsB,UAA0B,CAAC,GAAoB;AAC3F,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,YAAY;AAC9B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,UAAU;AACrC,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,UAAU,QAAQ,OAAO,SAAS,GAAG;AAC/C,SAAK,KAAK,YAAY,GAAG,QAAQ,MAAM;AAAA,EACzC;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,oBAAoB,QAAQ,iBAAiB,SAAS,GAAG;AACnE,SAAK,KAAK,wBAAwB,GAAG,QAAQ,gBAAgB;AAAA,EAC/D;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;ADvLA,IAAM,gBAAgB,oBAAI,IAAI;AAAA,EAC5B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,IAAM,6BAA6B,oBAAI,IAAI;AAAA,EACzC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC,EAC9D,OAAO,2BAA2B,qCAAqC,EACvE,OAAO,6BAA6B,6BAA6B,EACjE;AAAA,IACC;AAAA,IACA;AAAA,EACF,EACC,OAAO,eAAe,6BAA6B,EACnD,OAAO,kCAAkC,4CAA4C,EACrF,OAAO,sBAAsB,qCAAqC,EAClE,OAAO,+BAA+B,8CAA8C,EACpF,OAAO,qBAAqB,wDAAwD;AAEvF,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,SAAS,oBAAoB,SAAqC;AAChE,QAAM,iBAAiC,CAAC;AAExC,MAAI,QAAQ,WAAW;AACrB,mBAAe,YAAY,QAAQ;AAAA,EACrC;AACA,MAAI,QAAQ,UAAU;AACpB,mBAAe,WAAW,QAAQ;AAAA,EACpC;AACA,MAAI,QAAQ,UAAU,QAAQ,OAAO,SAAS,GAAG;AAC/C,mBAAe,SAAS,QAAQ;AAAA,EAClC;AACA,MAAI,QAAQ,OAAO;AACjB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,QAAQ,oBAAoB,QAAQ,iBAAiB,SAAS,GAAG;AACnE,mBAAe,mBAAmB,QAAQ;AAAA,EAC5C;AACA,MAAI,QAAQ,gBAAgB;AAC1B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,mBAAe,sBAAsB,QAAQ;AAAA,EAC/C;AACA,MAAI,QAAQ,eAAe;AACzB,mBAAe,gBAAgB;AAAA,EACjC;AAEA,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAE3B,MAAI,WAAW,QAAQ;AACrB,eAAW,SAAS,WAAW,QAAQ;AACrC,UAAI,CAAC,cAAc,IAAI,KAAK,GAAG;AAC7B,gBAAQ,MAAM,mBAAmB,KAAK,qCAAqC;AAC3E,gBAAQ,MAAM,wCAAwC;AACtD,eAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,MAAI,WAAW,kBAAkB;AAC/B,eAAW,SAAS,WAAW,kBAAkB;AAC/C,UAAI,CAAC,2BAA2B,IAAI,KAAK,GAAG;AAC1C,gBAAQ,MAAM,gCAAgC,KAAK,qCAAqC;AACxF,gBAAQ,MAAM,wCAAwC;AACtD,eAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
1
+ {"version":3,"sources":["../src/cli.ts","../src/index.ts","../src/convert-options.generated.ts","../src/cli-options.generated.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { Command, CommanderError } from 'commander';\nimport { convert } from './index.js';\nimport { CliOptions, buildConvertOptions } from './convert-options.generated.js';\nimport { registerCliOptions } from './cli-options.generated.js';\n\nfunction createProgram(): Command {\n const program = new Command();\n\n program\n .name('opendataloader-pdf')\n .usage('[options] <input...>')\n .description('Convert PDFs using the OpenDataLoader CLI.')\n .showHelpAfterError(\"Use '--help' to see available options.\")\n .showSuggestionAfterError(false)\n .argument('<input...>', 'Input files or directories to convert');\n\n // Register CLI options from auto-generated file\n registerCliOptions(program);\n\n program.configureOutput({\n writeErr: (str) => {\n console.error(str.trimEnd());\n },\n outputError: (str, write) => {\n write(str);\n },\n });\n\n return program;\n}\n\nasync function main(): Promise<number> {\n const program = createProgram();\n\n program.exitOverride();\n\n try {\n program.parse(process.argv);\n } catch (err) {\n if (err instanceof CommanderError) {\n if (err.code === 'commander.helpDisplayed') {\n return 0;\n }\n return err.exitCode ?? 1;\n }\n\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n console.error(\"Use '--help' to see available options.\");\n return 1;\n }\n\n const cliOptions = program.opts<CliOptions>();\n const inputPaths = program.args;\n const convertOptions = buildConvertOptions(cliOptions);\n\n try {\n const output = await convert(inputPaths, convertOptions);\n if (output && !convertOptions.quiet) {\n process.stdout.write(output);\n if (!output.endsWith('\\n')) {\n process.stdout.write('\\n');\n }\n }\n return 0;\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n console.error(message);\n return 1;\n }\n}\n\nmain().then((code) => {\n if (code !== 0) {\n process.exit(code);\n }\n});\n","import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: cluster */\n tableMethod?: string;\n /** Reading order algorithm. Values: none, xycut. Default: none */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Embed images as Base64 data URIs instead of file path references */\n embedImages?: boolean;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n embedImages?: boolean;\n imageFormat?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.embedImages) {\n convertOptions.embedImages = true;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.embedImages) {\n args.push('--embed-images');\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n\n return args;\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\nimport { Command } from 'commander';\n\n/**\n * Register all CLI options on the given Commander program.\n */\nexport function registerCliOptions(program: Command): void {\n program.option('-o, --output-dir <value>', 'Directory where output files are written. Default: input file directory');\n program.option('-p, --password <value>', 'Password for encrypted PDF files');\n program.option('-f, --format <value>', 'Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json');\n program.option('-q, --quiet', 'Suppress console logging output');\n program.option('--content-safety-off <value>', 'Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg');\n program.option('--keep-line-breaks', 'Preserve original line breaks in extracted text');\n program.option('--replace-invalid-chars <value>', 'Replacement character for invalid/unrecognized characters. Default: space');\n program.option('--use-struct-tree', 'Use PDF structure tree (tagged PDF) for reading order and semantic structure');\n program.option('--table-method <value>', 'Table detection method. Values: cluster');\n program.option('--reading-order <value>', 'Reading order algorithm. Values: none, xycut. Default: none');\n program.option('--markdown-page-separator <value>', 'Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none');\n program.option('--text-page-separator <value>', 'Separator between pages in text output. Use %page-number% for page numbers. Default: none');\n program.option('--html-page-separator <value>', 'Separator between pages in HTML output. Use %page-number% for page numbers. Default: none');\n program.option('--embed-images', 'Embed images as Base64 data URIs instead of file path references');\n program.option('--image-format <value>', 'Output format for extracted images. Values: png, jpeg. Default: png');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AACA,uBAAwC;;;ACDxC,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;AC4DvB,SAAS,oBAAoB,YAAwC;AAC1E,QAAM,iBAAiC,CAAC;AAExC,MAAI,WAAW,WAAW;AACxB,mBAAe,YAAY,WAAW;AAAA,EACxC;AACA,MAAI,WAAW,UAAU;AACvB,mBAAe,WAAW,WAAW;AAAA,EACvC;AACA,MAAI,WAAW,QAAQ;AACrB,mBAAe,SAAS,WAAW;AAAA,EACrC;AACA,MAAI,WAAW,OAAO;AACpB,mBAAe,QAAQ;AAAA,EACzB;AACA,MAAI,WAAW,kBAAkB;AAC/B,mBAAe,mBAAmB,WAAW;AAAA,EAC/C;AACA,MAAI,WAAW,gBAAgB;AAC7B,mBAAe,iBAAiB;AAAA,EAClC;AACA,MAAI,WAAW,qBAAqB;AAClC,mBAAe,sBAAsB,WAAW;AAAA,EAClD;AACA,MAAI,WAAW,eAAe;AAC5B,mBAAe,gBAAgB;AAAA,EACjC;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AACA,MAAI,WAAW,cAAc;AAC3B,mBAAe,eAAe,WAAW;AAAA,EAC3C;AACA,MAAI,WAAW,uBAAuB;AACpC,mBAAe,wBAAwB,WAAW;AAAA,EACpD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,mBAAmB;AAChC,mBAAe,oBAAoB,WAAW;AAAA,EAChD;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc;AAAA,EAC/B;AACA,MAAI,WAAW,aAAa;AAC1B,mBAAe,cAAc,WAAW;AAAA,EAC1C;AAEA,SAAO;AACT;AAKO,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,gBAAgB;AAAA,EAC5B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AAEA,SAAO;AACT;;;ADpLA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;;;AE9FO,SAAS,mBAAmB,SAAwB;AACzD,UAAQ,OAAO,4BAA4B,yEAAyE;AACpH,UAAQ,OAAO,0BAA0B,kCAAkC;AAC3E,UAAQ,OAAO,wBAAwB,oIAAoI;AAC3K,UAAQ,OAAO,eAAe,iCAAiC;AAC/D,UAAQ,OAAO,gCAAgC,sFAAsF;AACrI,UAAQ,OAAO,sBAAsB,iDAAiD;AACtF,UAAQ,OAAO,mCAAmC,2EAA2E;AAC7H,UAAQ,OAAO,qBAAqB,8EAA8E;AAClH,UAAQ,OAAO,0BAA0B,yCAAyC;AAClF,UAAQ,OAAO,2BAA2B,6DAA6D;AACvG,UAAQ,OAAO,qCAAqC,+FAA+F;AACnJ,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,iCAAiC,2FAA2F;AAC3I,UAAQ,OAAO,kBAAkB,kEAAkE;AACnG,UAAQ,OAAO,0BAA0B,qEAAqE;AAChH;;;AHlBA,SAAS,gBAAyB;AAChC,QAAM,UAAU,IAAI,yBAAQ;AAE5B,UACG,KAAK,oBAAoB,EACzB,MAAM,sBAAsB,EAC5B,YAAY,4CAA4C,EACxD,mBAAmB,wCAAwC,EAC3D,yBAAyB,KAAK,EAC9B,SAAS,cAAc,uCAAuC;AAGjE,qBAAmB,OAAO;AAE1B,UAAQ,gBAAgB;AAAA,IACtB,UAAU,CAAC,QAAQ;AACjB,cAAQ,MAAM,IAAI,QAAQ,CAAC;AAAA,IAC7B;AAAA,IACA,aAAa,CAAC,KAAK,UAAU;AAC3B,YAAM,GAAG;AAAA,IACX;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,eAAe,OAAwB;AACrC,QAAM,UAAU,cAAc;AAE9B,UAAQ,aAAa;AAErB,MAAI;AACF,YAAQ,MAAM,QAAQ,IAAI;AAAA,EAC5B,SAAS,KAAK;AACZ,QAAI,eAAe,iCAAgB;AACjC,UAAI,IAAI,SAAS,2BAA2B;AAC1C,eAAO;AAAA,MACT;AACA,aAAO,IAAI,YAAY;AAAA,IACzB;AAEA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,YAAQ,MAAM,wCAAwC;AACtD,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,QAAQ,KAAiB;AAC5C,QAAM,aAAa,QAAQ;AAC3B,QAAM,iBAAiB,oBAAoB,UAAU;AAErD,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,YAAY,cAAc;AACvD,QAAI,UAAU,CAAC,eAAe,OAAO;AACnC,cAAQ,OAAO,MAAM,MAAM;AAC3B,UAAI,CAAC,OAAO,SAAS,IAAI,GAAG;AAC1B,gBAAQ,OAAO,MAAM,IAAI;AAAA,MAC3B;AAAA,IACF;AACA,WAAO;AAAA,EACT,SAAS,KAAK;AACZ,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAQ,MAAM,OAAO;AACrB,WAAO;AAAA,EACT;AACF;AAEA,KAAK,EAAE,KAAK,CAAC,SAAS;AACpB,MAAI,SAAS,GAAG;AACd,YAAQ,KAAK,IAAI;AAAA,EACnB;AACF,CAAC;","names":[]}
package/dist/cli.js CHANGED
@@ -8,11 +8,125 @@ import { spawn } from "child_process";
8
8
  import * as path from "path";
9
9
  import * as fs from "fs";
10
10
  import { fileURLToPath } from "url";
11
+
12
+ // src/convert-options.generated.ts
13
+ function buildConvertOptions(cliOptions) {
14
+ const convertOptions = {};
15
+ if (cliOptions.outputDir) {
16
+ convertOptions.outputDir = cliOptions.outputDir;
17
+ }
18
+ if (cliOptions.password) {
19
+ convertOptions.password = cliOptions.password;
20
+ }
21
+ if (cliOptions.format) {
22
+ convertOptions.format = cliOptions.format;
23
+ }
24
+ if (cliOptions.quiet) {
25
+ convertOptions.quiet = true;
26
+ }
27
+ if (cliOptions.contentSafetyOff) {
28
+ convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;
29
+ }
30
+ if (cliOptions.keepLineBreaks) {
31
+ convertOptions.keepLineBreaks = true;
32
+ }
33
+ if (cliOptions.replaceInvalidChars) {
34
+ convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;
35
+ }
36
+ if (cliOptions.useStructTree) {
37
+ convertOptions.useStructTree = true;
38
+ }
39
+ if (cliOptions.tableMethod) {
40
+ convertOptions.tableMethod = cliOptions.tableMethod;
41
+ }
42
+ if (cliOptions.readingOrder) {
43
+ convertOptions.readingOrder = cliOptions.readingOrder;
44
+ }
45
+ if (cliOptions.markdownPageSeparator) {
46
+ convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;
47
+ }
48
+ if (cliOptions.textPageSeparator) {
49
+ convertOptions.textPageSeparator = cliOptions.textPageSeparator;
50
+ }
51
+ if (cliOptions.htmlPageSeparator) {
52
+ convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;
53
+ }
54
+ if (cliOptions.embedImages) {
55
+ convertOptions.embedImages = true;
56
+ }
57
+ if (cliOptions.imageFormat) {
58
+ convertOptions.imageFormat = cliOptions.imageFormat;
59
+ }
60
+ return convertOptions;
61
+ }
62
+ function buildArgs(options) {
63
+ const args = [];
64
+ if (options.outputDir) {
65
+ args.push("--output-dir", options.outputDir);
66
+ }
67
+ if (options.password) {
68
+ args.push("--password", options.password);
69
+ }
70
+ if (options.format) {
71
+ if (Array.isArray(options.format)) {
72
+ if (options.format.length > 0) {
73
+ args.push("--format", options.format.join(","));
74
+ }
75
+ } else {
76
+ args.push("--format", options.format);
77
+ }
78
+ }
79
+ if (options.quiet) {
80
+ args.push("--quiet");
81
+ }
82
+ if (options.contentSafetyOff) {
83
+ if (Array.isArray(options.contentSafetyOff)) {
84
+ if (options.contentSafetyOff.length > 0) {
85
+ args.push("--content-safety-off", options.contentSafetyOff.join(","));
86
+ }
87
+ } else {
88
+ args.push("--content-safety-off", options.contentSafetyOff);
89
+ }
90
+ }
91
+ if (options.keepLineBreaks) {
92
+ args.push("--keep-line-breaks");
93
+ }
94
+ if (options.replaceInvalidChars) {
95
+ args.push("--replace-invalid-chars", options.replaceInvalidChars);
96
+ }
97
+ if (options.useStructTree) {
98
+ args.push("--use-struct-tree");
99
+ }
100
+ if (options.tableMethod) {
101
+ args.push("--table-method", options.tableMethod);
102
+ }
103
+ if (options.readingOrder) {
104
+ args.push("--reading-order", options.readingOrder);
105
+ }
106
+ if (options.markdownPageSeparator) {
107
+ args.push("--markdown-page-separator", options.markdownPageSeparator);
108
+ }
109
+ if (options.textPageSeparator) {
110
+ args.push("--text-page-separator", options.textPageSeparator);
111
+ }
112
+ if (options.htmlPageSeparator) {
113
+ args.push("--html-page-separator", options.htmlPageSeparator);
114
+ }
115
+ if (options.embedImages) {
116
+ args.push("--embed-images");
117
+ }
118
+ if (options.imageFormat) {
119
+ args.push("--image-format", options.imageFormat);
120
+ }
121
+ return args;
122
+ }
123
+
124
+ // src/index.ts
11
125
  var __filename = fileURLToPath(import.meta.url);
12
126
  var __dirname = path.dirname(__filename);
13
127
  var JAR_NAME = "opendataloader-pdf-cli.jar";
14
128
  function executeJar(args, executionOptions = {}) {
15
- const { debug = false, streamOutput = false } = executionOptions;
129
+ const { streamOutput = false } = executionOptions;
16
130
  return new Promise((resolve, reject) => {
17
131
  const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
18
132
  if (!fs.existsSync(jarPath)) {
@@ -66,67 +180,45 @@ ${errorOutput}`
66
180
  });
67
181
  }
68
182
  function convert(inputPaths, options = {}) {
69
- if (inputPaths.length === 0) {
183
+ const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
184
+ if (inputList.length === 0) {
70
185
  return Promise.reject(new Error("At least one input path must be provided."));
71
186
  }
72
- for (const input of inputPaths) {
187
+ for (const input of inputList) {
73
188
  if (!fs.existsSync(input)) {
74
189
  return Promise.reject(new Error(`Input file or folder not found: ${input}`));
75
190
  }
76
191
  }
77
- const args = [...inputPaths];
78
- if (options.outputDir) {
79
- args.push("--output-dir", options.outputDir);
80
- }
81
- if (options.password) {
82
- args.push("--password", options.password);
83
- }
84
- if (options.format && options.format.length > 0) {
85
- args.push("--format", ...options.format);
86
- }
87
- if (options.quiet) {
88
- args.push("--quiet");
89
- }
90
- if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
91
- args.push("--content-safety-off", ...options.contentSafetyOff);
92
- }
93
- if (options.keepLineBreaks) {
94
- args.push("--keep-line-breaks");
95
- }
96
- if (options.replaceInvalidChars) {
97
- args.push("--replace-invalid-chars", options.replaceInvalidChars);
98
- }
99
- if (options.useStructTree) {
100
- args.push("--use-struct-tree");
101
- }
192
+ const args = [...inputList, ...buildArgs(options)];
102
193
  return executeJar(args, {
103
194
  streamOutput: !options.quiet
104
195
  });
105
196
  }
106
197
 
198
+ // src/cli-options.generated.ts
199
+ function registerCliOptions(program) {
200
+ program.option("-o, --output-dir <value>", "Directory where output files are written. Default: input file directory");
201
+ program.option("-p, --password <value>", "Password for encrypted PDF files");
202
+ program.option("-f, --format <value>", "Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json");
203
+ program.option("-q, --quiet", "Suppress console logging output");
204
+ program.option("--content-safety-off <value>", "Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg");
205
+ program.option("--keep-line-breaks", "Preserve original line breaks in extracted text");
206
+ program.option("--replace-invalid-chars <value>", "Replacement character for invalid/unrecognized characters. Default: space");
207
+ program.option("--use-struct-tree", "Use PDF structure tree (tagged PDF) for reading order and semantic structure");
208
+ program.option("--table-method <value>", "Table detection method. Values: cluster");
209
+ program.option("--reading-order <value>", "Reading order algorithm. Values: none, xycut. Default: none");
210
+ program.option("--markdown-page-separator <value>", "Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none");
211
+ program.option("--text-page-separator <value>", "Separator between pages in text output. Use %page-number% for page numbers. Default: none");
212
+ program.option("--html-page-separator <value>", "Separator between pages in HTML output. Use %page-number% for page numbers. Default: none");
213
+ program.option("--embed-images", "Embed images as Base64 data URIs instead of file path references");
214
+ program.option("--image-format <value>", "Output format for extracted images. Values: png, jpeg. Default: png");
215
+ }
216
+
107
217
  // src/cli.ts
108
- var VALID_FORMATS = /* @__PURE__ */ new Set([
109
- "json",
110
- "text",
111
- "html",
112
- "pdf",
113
- "markdown",
114
- "markdown-with-html",
115
- "markdown-with-images"
116
- ]);
117
- var VALID_CONTENT_SAFETY_MODES = /* @__PURE__ */ new Set([
118
- "all",
119
- "hidden-text",
120
- "off-page",
121
- "tiny",
122
- "hidden-ocg"
123
- ]);
124
218
  function createProgram() {
125
219
  const program = new Command();
126
- program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert").option("-o, --output-dir <path>", "Directory where outputs are written").option("-p, --password <password>", "Password for encrypted PDFs").option(
127
- "-f, --format <value...>",
128
- "Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)"
129
- ).option("-q, --quiet", "Suppress CLI logging output").option("--content-safety-off <mode...>", "Disable one or more content safety filters").option("--keep-line-breaks", "Preserve line breaks in text output").option("--replace-invalid-chars <c>", "Replacement character for invalid characters").option("--use-struct-tree", "Enable processing structure tree (disabled by default)");
220
+ program.name("opendataloader-pdf").usage("[options] <input...>").description("Convert PDFs using the OpenDataLoader CLI.").showHelpAfterError("Use '--help' to see available options.").showSuggestionAfterError(false).argument("<input...>", "Input files or directories to convert");
221
+ registerCliOptions(program);
130
222
  program.configureOutput({
131
223
  writeErr: (str) => {
132
224
  console.error(str.trimEnd());
@@ -137,34 +229,6 @@ function createProgram() {
137
229
  });
138
230
  return program;
139
231
  }
140
- function buildConvertOptions(options) {
141
- const convertOptions = {};
142
- if (options.outputDir) {
143
- convertOptions.outputDir = options.outputDir;
144
- }
145
- if (options.password) {
146
- convertOptions.password = options.password;
147
- }
148
- if (options.format && options.format.length > 0) {
149
- convertOptions.format = options.format;
150
- }
151
- if (options.quiet) {
152
- convertOptions.quiet = true;
153
- }
154
- if (options.contentSafetyOff && options.contentSafetyOff.length > 0) {
155
- convertOptions.contentSafetyOff = options.contentSafetyOff;
156
- }
157
- if (options.keepLineBreaks) {
158
- convertOptions.keepLineBreaks = true;
159
- }
160
- if (options.replaceInvalidChars) {
161
- convertOptions.replaceInvalidChars = options.replaceInvalidChars;
162
- }
163
- if (options.useStructTree) {
164
- convertOptions.useStructTree = true;
165
- }
166
- return convertOptions;
167
- }
168
232
  async function main() {
169
233
  const program = createProgram();
170
234
  program.exitOverride();
@@ -184,24 +248,6 @@ async function main() {
184
248
  }
185
249
  const cliOptions = program.opts();
186
250
  const inputPaths = program.args;
187
- if (cliOptions.format) {
188
- for (const value of cliOptions.format) {
189
- if (!VALID_FORMATS.has(value)) {
190
- console.error(`Invalid format '${value}'. See '--help' for allowed values.`);
191
- console.error("Use '--help' to see available options.");
192
- return 1;
193
- }
194
- }
195
- }
196
- if (cliOptions.contentSafetyOff) {
197
- for (const value of cliOptions.contentSafetyOff) {
198
- if (!VALID_CONTENT_SAFETY_MODES.has(value)) {
199
- console.error(`Invalid content safety mode '${value}'. See '--help' for allowed values.`);
200
- console.error("Use '--help' to see available options.");
201
- return 1;
202
- }
203
- }
204
- }
205
251
  const convertOptions = buildConvertOptions(cliOptions);
206
252
  try {
207
253
  const output = await convert(inputPaths, convertOptions);