@opendataloader/pdf 1.3.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NOTICE.md +1 -1
- package/README.md +193 -369
- package/dist/cli.cjs +140 -65
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +140 -65
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +102 -81
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -12
- package/dist/index.d.ts +48 -12
- package/dist/index.js +101 -81
- package/dist/index.js.map +1 -1
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -30,6 +30,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
+
buildArgs: () => buildArgs,
|
|
33
34
|
convert: () => convert,
|
|
34
35
|
run: () => run
|
|
35
36
|
});
|
|
@@ -38,12 +39,77 @@ var import_child_process = require("child_process");
|
|
|
38
39
|
var path = __toESM(require("path"), 1);
|
|
39
40
|
var fs = __toESM(require("fs"), 1);
|
|
40
41
|
var import_url = require("url");
|
|
42
|
+
|
|
43
|
+
// src/convert-options.generated.ts
|
|
44
|
+
function buildArgs(options) {
|
|
45
|
+
const args = [];
|
|
46
|
+
if (options.outputDir) {
|
|
47
|
+
args.push("--output-dir", options.outputDir);
|
|
48
|
+
}
|
|
49
|
+
if (options.password) {
|
|
50
|
+
args.push("--password", options.password);
|
|
51
|
+
}
|
|
52
|
+
if (options.format) {
|
|
53
|
+
if (Array.isArray(options.format)) {
|
|
54
|
+
if (options.format.length > 0) {
|
|
55
|
+
args.push("--format", options.format.join(","));
|
|
56
|
+
}
|
|
57
|
+
} else {
|
|
58
|
+
args.push("--format", options.format);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
if (options.quiet) {
|
|
62
|
+
args.push("--quiet");
|
|
63
|
+
}
|
|
64
|
+
if (options.contentSafetyOff) {
|
|
65
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
66
|
+
if (options.contentSafetyOff.length > 0) {
|
|
67
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
68
|
+
}
|
|
69
|
+
} else {
|
|
70
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
if (options.keepLineBreaks) {
|
|
74
|
+
args.push("--keep-line-breaks");
|
|
75
|
+
}
|
|
76
|
+
if (options.replaceInvalidChars) {
|
|
77
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
78
|
+
}
|
|
79
|
+
if (options.useStructTree) {
|
|
80
|
+
args.push("--use-struct-tree");
|
|
81
|
+
}
|
|
82
|
+
if (options.tableMethod) {
|
|
83
|
+
args.push("--table-method", options.tableMethod);
|
|
84
|
+
}
|
|
85
|
+
if (options.readingOrder) {
|
|
86
|
+
args.push("--reading-order", options.readingOrder);
|
|
87
|
+
}
|
|
88
|
+
if (options.markdownPageSeparator) {
|
|
89
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
90
|
+
}
|
|
91
|
+
if (options.textPageSeparator) {
|
|
92
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
93
|
+
}
|
|
94
|
+
if (options.htmlPageSeparator) {
|
|
95
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
96
|
+
}
|
|
97
|
+
if (options.embedImages) {
|
|
98
|
+
args.push("--embed-images");
|
|
99
|
+
}
|
|
100
|
+
if (options.imageFormat) {
|
|
101
|
+
args.push("--image-format", options.imageFormat);
|
|
102
|
+
}
|
|
103
|
+
return args;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// src/index.ts
|
|
41
107
|
var import_meta = {};
|
|
42
108
|
var __filename = (0, import_url.fileURLToPath)(import_meta.url);
|
|
43
109
|
var __dirname = path.dirname(__filename);
|
|
44
110
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
45
111
|
function executeJar(args, executionOptions = {}) {
|
|
46
|
-
const {
|
|
112
|
+
const { streamOutput = false } = executionOptions;
|
|
47
113
|
return new Promise((resolve, reject) => {
|
|
48
114
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
49
115
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -96,103 +162,58 @@ ${errorOutput}`
|
|
|
96
162
|
});
|
|
97
163
|
});
|
|
98
164
|
}
|
|
99
|
-
function run(inputPath, options = {}) {
|
|
100
|
-
return new Promise((resolve, reject) => {
|
|
101
|
-
if (!fs.existsSync(inputPath)) {
|
|
102
|
-
return reject(new Error(`Input file or folder not found: ${inputPath}`));
|
|
103
|
-
}
|
|
104
|
-
const args = [];
|
|
105
|
-
if (options.outputFolder) {
|
|
106
|
-
args.push("--output-dir", options.outputFolder);
|
|
107
|
-
}
|
|
108
|
-
if (options.password) {
|
|
109
|
-
args.push("--password", options.password);
|
|
110
|
-
}
|
|
111
|
-
if (options.replaceInvalidChars) {
|
|
112
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
113
|
-
}
|
|
114
|
-
if (options.generateMarkdown) {
|
|
115
|
-
args.push("--markdown");
|
|
116
|
-
}
|
|
117
|
-
if (options.generateHtml) {
|
|
118
|
-
args.push("--html");
|
|
119
|
-
}
|
|
120
|
-
if (options.generateAnnotatedPdf) {
|
|
121
|
-
args.push("--pdf");
|
|
122
|
-
}
|
|
123
|
-
if (options.keepLineBreaks) {
|
|
124
|
-
args.push("--keep-line-breaks");
|
|
125
|
-
}
|
|
126
|
-
if (options.contentSafetyOff) {
|
|
127
|
-
args.push("--content-safety-off", options.contentSafetyOff);
|
|
128
|
-
}
|
|
129
|
-
if (options.htmlInMarkdown) {
|
|
130
|
-
args.push("--markdown-with-html");
|
|
131
|
-
}
|
|
132
|
-
if (options.addImageToMarkdown) {
|
|
133
|
-
args.push("--markdown-with-images");
|
|
134
|
-
}
|
|
135
|
-
if (options.noJson) {
|
|
136
|
-
args.push("--no-json");
|
|
137
|
-
}
|
|
138
|
-
if (options.useStructTree) {
|
|
139
|
-
args.push("--use-struct-tree");
|
|
140
|
-
}
|
|
141
|
-
args.push(inputPath);
|
|
142
|
-
executeJar(args, {
|
|
143
|
-
debug: options.debug,
|
|
144
|
-
streamOutput: Boolean(options.debug)
|
|
145
|
-
}).then(resolve).catch(reject);
|
|
146
|
-
});
|
|
147
|
-
}
|
|
148
165
|
function convert(inputPaths, options = {}) {
|
|
149
|
-
|
|
166
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
167
|
+
if (inputList.length === 0) {
|
|
150
168
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
151
169
|
}
|
|
152
|
-
for (const input of
|
|
170
|
+
for (const input of inputList) {
|
|
153
171
|
if (!fs.existsSync(input)) {
|
|
154
172
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
155
173
|
}
|
|
156
174
|
}
|
|
157
|
-
const args = [...
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
if (options.quiet) {
|
|
172
|
-
args.push("--quiet");
|
|
175
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
176
|
+
return executeJar(args, {
|
|
177
|
+
streamOutput: !options.quiet
|
|
178
|
+
});
|
|
179
|
+
}
|
|
180
|
+
function run(inputPath, options = {}) {
|
|
181
|
+
console.warn(
|
|
182
|
+
"Warning: run() is deprecated and will be removed in a future version. Use convert() instead."
|
|
183
|
+
);
|
|
184
|
+
const formats = [];
|
|
185
|
+
if (!options.noJson) {
|
|
186
|
+
formats.push("json");
|
|
173
187
|
}
|
|
174
|
-
if (options.
|
|
175
|
-
if (
|
|
176
|
-
|
|
188
|
+
if (options.generateMarkdown) {
|
|
189
|
+
if (options.addImageToMarkdown) {
|
|
190
|
+
formats.push("markdown-with-images");
|
|
191
|
+
} else if (options.htmlInMarkdown) {
|
|
192
|
+
formats.push("markdown-with-html");
|
|
177
193
|
} else {
|
|
178
|
-
|
|
194
|
+
formats.push("markdown");
|
|
179
195
|
}
|
|
180
196
|
}
|
|
181
|
-
if (options.
|
|
182
|
-
|
|
197
|
+
if (options.generateHtml) {
|
|
198
|
+
formats.push("html");
|
|
183
199
|
}
|
|
184
|
-
if (options.
|
|
185
|
-
|
|
200
|
+
if (options.generateAnnotatedPdf) {
|
|
201
|
+
formats.push("pdf");
|
|
186
202
|
}
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
203
|
+
return convert(inputPath, {
|
|
204
|
+
outputDir: options.outputFolder,
|
|
205
|
+
password: options.password,
|
|
206
|
+
replaceInvalidChars: options.replaceInvalidChars,
|
|
207
|
+
keepLineBreaks: options.keepLineBreaks,
|
|
208
|
+
contentSafetyOff: options.contentSafetyOff,
|
|
209
|
+
useStructTree: options.useStructTree,
|
|
210
|
+
format: formats.length > 0 ? formats : void 0,
|
|
211
|
+
quiet: !options.debug
|
|
192
212
|
});
|
|
193
213
|
}
|
|
194
214
|
// Annotate the CommonJS export names for ESM import in node:
|
|
195
215
|
0 && (module.exports = {
|
|
216
|
+
buildArgs,
|
|
196
217
|
convert,
|
|
197
218
|
run
|
|
198
219
|
});
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n debug?: boolean;\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { debug = false, streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n if (options.noJson) {\n args.push('--no-json');\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n args.push(inputPath);\n executeJar(args, {\n debug: options.debug,\n streamOutput: Boolean(options.debug),\n })\n .then(resolve)\n .catch(reject);\n });\n}\n\nexport interface ConvertOptions {\n outputDir?: string;\n password?: string;\n format?: string | string[];\n quiet?: boolean;\n contentSafetyOff?: string | string[];\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n}\n\nexport function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {\n if (inputPaths.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputPaths) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputPaths];\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n args.push('--format', options.format.join(','));\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree')\n }\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;AAH9B;AAKA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAOjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,QAAQ,OAAO,eAAe,MAAM,IAAI;AAEhD,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAkBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,QAAI,CAAI,cAAW,SAAS,GAAG;AAC7B,aAAO,OAAO,IAAI,MAAM,mCAAmC,SAAS,EAAE,CAAC;AAAA,IACzE;AAEA,UAAM,OAAiB,CAAC;AACxB,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,gBAAgB,QAAQ,YAAY;AAAA,IAChD;AACA,QAAI,QAAQ,UAAU;AACpB,WAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,IAC1C;AACA,QAAI,QAAQ,qBAAqB;AAC/B,WAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,IAClE;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,YAAY;AAAA,IACxB;AACA,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,QAAQ;AAAA,IACpB;AACA,QAAI,QAAQ,sBAAsB;AAChC,WAAK,KAAK,OAAO;AAAA,IACnB;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,oBAAoB;AAAA,IAChC;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,sBAAsB;AAAA,IAClC;AACA,QAAI,QAAQ,oBAAoB;AAC9B,WAAK,KAAK,wBAAwB;AAAA,IACpC;AACA,QAAI,QAAQ,QAAQ;AAClB,WAAK,KAAK,WAAW;AAAA,IACvB;AACA,QAAI,QAAQ,eAAe;AACzB,WAAK,KAAK,mBAAmB;AAAA,IAC/B;AAEA,SAAK,KAAK,SAAS;AACnB,eAAW,MAAM;AAAA,MACf,OAAO,QAAQ;AAAA,MACf,cAAc,QAAQ,QAAQ,KAAK;AAAA,IACrC,CAAC,EACE,KAAK,OAAO,EACZ,MAAM,MAAM;AAAA,EACjB,CAAC;AACH;AAaO,SAAS,QAAQ,YAAsB,UAA0B,CAAC,GAAoB;AAC3F,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,YAAY;AAC9B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,UAAU;AACrC,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,WAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,IAChD,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,WAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,IACtE,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AAEA,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/convert-options.generated.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\n// Re-export types and utilities from auto-generated file\nexport type { ConvertOptions } from './convert-options.generated.js';\nexport { buildArgs } from './convert-options.generated.js';\nimport type { ConvertOptions } from './convert-options.generated.js';\nimport { buildArgs } from './convert-options.generated.js';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\ninterface JarExecutionOptions {\n streamOutput?: boolean;\n}\n\nfunction executeJar(args: string[], executionOptions: JarExecutionOptions = {}): Promise<string> {\n const { streamOutput = false } = executionOptions;\n\n return new Promise((resolve, reject) => {\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (streamOutput) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const errorOutput = stderr || stdout;\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${errorOutput}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err: Error) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}\n\nexport function convert(\n inputPaths: string | string[],\n options: ConvertOptions = {},\n): Promise<string> {\n const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];\n if (inputList.length === 0) {\n return Promise.reject(new Error('At least one input path must be provided.'));\n }\n\n for (const input of inputList) {\n if (!fs.existsSync(input)) {\n return Promise.reject(new Error(`Input file or folder not found: ${input}`));\n }\n }\n\n const args: string[] = [...inputList, ...buildArgs(options)];\n\n return executeJar(args, {\n streamOutput: !options.quiet,\n });\n}\n\n/**\n * @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.\n */\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n noJson?: boolean;\n debug?: boolean;\n useStructTree?: boolean;\n}\n\n/**\n * @deprecated Use `convert()` instead. This function will be removed in a future version.\n */\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n console.warn(\n 'Warning: run() is deprecated and will be removed in a future version. Use convert() instead.',\n );\n\n // Build format array based on legacy boolean options\n const formats: string[] = [];\n if (!options.noJson) {\n formats.push('json');\n }\n if (options.generateMarkdown) {\n if (options.addImageToMarkdown) {\n formats.push('markdown-with-images');\n } else if (options.htmlInMarkdown) {\n formats.push('markdown-with-html');\n } else {\n formats.push('markdown');\n }\n }\n if (options.generateHtml) {\n formats.push('html');\n }\n if (options.generateAnnotatedPdf) {\n formats.push('pdf');\n }\n\n return convert(inputPath, {\n outputDir: options.outputFolder,\n password: options.password,\n replaceInvalidChars: options.replaceInvalidChars,\n keepLineBreaks: options.keepLineBreaks,\n contentSafetyOff: options.contentSafetyOff,\n useStructTree: options.useStructTree,\n format: formats.length > 0 ? formats : undefined,\n quiet: !options.debug,\n });\n}\n","// AUTO-GENERATED FROM options.json - DO NOT EDIT DIRECTLY\n// Run `npm run generate-options` to regenerate\n\n/**\n * Options for the convert function.\n */\nexport interface ConvertOptions {\n /** Directory where output files are written. Default: input file directory */\n outputDir?: string;\n /** Password for encrypted PDF files */\n password?: string;\n /** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */\n format?: string | string[];\n /** Suppress console logging output */\n quiet?: boolean;\n /** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */\n contentSafetyOff?: string | string[];\n /** Preserve original line breaks in extracted text */\n keepLineBreaks?: boolean;\n /** Replacement character for invalid/unrecognized characters. Default: space */\n replaceInvalidChars?: string;\n /** Use PDF structure tree (tagged PDF) for reading order and semantic structure */\n useStructTree?: boolean;\n /** Table detection method. Values: cluster */\n tableMethod?: string;\n /** Reading order algorithm. Values: none, xycut. Default: none */\n readingOrder?: string;\n /** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */\n markdownPageSeparator?: string;\n /** Separator between pages in text output. Use %page-number% for page numbers. Default: none */\n textPageSeparator?: string;\n /** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */\n htmlPageSeparator?: string;\n /** Embed images as Base64 data URIs instead of file path references */\n embedImages?: boolean;\n /** Output format for extracted images. Values: png, jpeg. Default: png */\n imageFormat?: string;\n}\n\n/**\n * Options as parsed from CLI (all values are strings from commander).\n */\nexport interface CliOptions {\n outputDir?: string;\n password?: string;\n format?: string;\n quiet?: boolean;\n contentSafetyOff?: string;\n keepLineBreaks?: boolean;\n replaceInvalidChars?: string;\n useStructTree?: boolean;\n tableMethod?: string;\n readingOrder?: string;\n markdownPageSeparator?: string;\n textPageSeparator?: string;\n htmlPageSeparator?: string;\n embedImages?: boolean;\n imageFormat?: string;\n}\n\n/**\n * Convert CLI options to ConvertOptions.\n */\nexport function buildConvertOptions(cliOptions: CliOptions): ConvertOptions {\n const convertOptions: ConvertOptions = {};\n\n if (cliOptions.outputDir) {\n convertOptions.outputDir = cliOptions.outputDir;\n }\n if (cliOptions.password) {\n convertOptions.password = cliOptions.password;\n }\n if (cliOptions.format) {\n convertOptions.format = cliOptions.format;\n }\n if (cliOptions.quiet) {\n convertOptions.quiet = true;\n }\n if (cliOptions.contentSafetyOff) {\n convertOptions.contentSafetyOff = cliOptions.contentSafetyOff;\n }\n if (cliOptions.keepLineBreaks) {\n convertOptions.keepLineBreaks = true;\n }\n if (cliOptions.replaceInvalidChars) {\n convertOptions.replaceInvalidChars = cliOptions.replaceInvalidChars;\n }\n if (cliOptions.useStructTree) {\n convertOptions.useStructTree = true;\n }\n if (cliOptions.tableMethod) {\n convertOptions.tableMethod = cliOptions.tableMethod;\n }\n if (cliOptions.readingOrder) {\n convertOptions.readingOrder = cliOptions.readingOrder;\n }\n if (cliOptions.markdownPageSeparator) {\n convertOptions.markdownPageSeparator = cliOptions.markdownPageSeparator;\n }\n if (cliOptions.textPageSeparator) {\n convertOptions.textPageSeparator = cliOptions.textPageSeparator;\n }\n if (cliOptions.htmlPageSeparator) {\n convertOptions.htmlPageSeparator = cliOptions.htmlPageSeparator;\n }\n if (cliOptions.embedImages) {\n convertOptions.embedImages = true;\n }\n if (cliOptions.imageFormat) {\n convertOptions.imageFormat = cliOptions.imageFormat;\n }\n\n return convertOptions;\n}\n\n/**\n * Build CLI arguments array from ConvertOptions.\n */\nexport function buildArgs(options: ConvertOptions): string[] {\n const args: string[] = [];\n\n if (options.outputDir) {\n args.push('--output-dir', options.outputDir);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.format) {\n if (Array.isArray(options.format)) {\n if (options.format.length > 0) {\n args.push('--format', options.format.join(','));\n }\n } else {\n args.push('--format', options.format);\n }\n }\n if (options.quiet) {\n args.push('--quiet');\n }\n if (options.contentSafetyOff) {\n if (Array.isArray(options.contentSafetyOff)) {\n if (options.contentSafetyOff.length > 0) {\n args.push('--content-safety-off', options.contentSafetyOff.join(','));\n }\n } else {\n args.push('--content-safety-off', options.contentSafetyOff);\n }\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.useStructTree) {\n args.push('--use-struct-tree');\n }\n if (options.tableMethod) {\n args.push('--table-method', options.tableMethod);\n }\n if (options.readingOrder) {\n args.push('--reading-order', options.readingOrder);\n }\n if (options.markdownPageSeparator) {\n args.push('--markdown-page-separator', options.markdownPageSeparator);\n }\n if (options.textPageSeparator) {\n args.push('--text-page-separator', options.textPageSeparator);\n }\n if (options.htmlPageSeparator) {\n args.push('--html-page-separator', options.htmlPageSeparator);\n }\n if (options.embedImages) {\n args.push('--embed-images');\n }\n if (options.imageFormat) {\n args.push('--image-format', options.imageFormat);\n }\n\n return args;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;;;ACmHvB,SAAS,UAAU,SAAmC;AAC3D,QAAM,OAAiB,CAAC;AAExB,MAAI,QAAQ,WAAW;AACrB,SAAK,KAAK,gBAAgB,QAAQ,SAAS;AAAA,EAC7C;AACA,MAAI,QAAQ,UAAU;AACpB,SAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,EAC1C;AACA,MAAI,QAAQ,QAAQ;AAClB,QAAI,MAAM,QAAQ,QAAQ,MAAM,GAAG;AACjC,UAAI,QAAQ,OAAO,SAAS,GAAG;AAC7B,aAAK,KAAK,YAAY,QAAQ,OAAO,KAAK,GAAG,CAAC;AAAA,MAChD;AAAA,IACF,OAAO;AACL,WAAK,KAAK,YAAY,QAAQ,MAAM;AAAA,IACtC;AAAA,EACF;AACA,MAAI,QAAQ,OAAO;AACjB,SAAK,KAAK,SAAS;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,MAAM,QAAQ,QAAQ,gBAAgB,GAAG;AAC3C,UAAI,QAAQ,iBAAiB,SAAS,GAAG;AACvC,aAAK,KAAK,wBAAwB,QAAQ,iBAAiB,KAAK,GAAG,CAAC;AAAA,MACtE;AAAA,IACF,OAAO;AACL,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AAAA,EACF;AACA,MAAI,QAAQ,gBAAgB;AAC1B,SAAK,KAAK,oBAAoB;AAAA,EAChC;AACA,MAAI,QAAQ,qBAAqB;AAC/B,SAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,EAClE;AACA,MAAI,QAAQ,eAAe;AACzB,SAAK,KAAK,mBAAmB;AAAA,EAC/B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AACA,MAAI,QAAQ,cAAc;AACxB,SAAK,KAAK,mBAAmB,QAAQ,YAAY;AAAA,EACnD;AACA,MAAI,QAAQ,uBAAuB;AACjC,SAAK,KAAK,6BAA6B,QAAQ,qBAAqB;AAAA,EACtE;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,mBAAmB;AAC7B,SAAK,KAAK,yBAAyB,QAAQ,iBAAiB;AAAA,EAC9D;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,gBAAgB;AAAA,EAC5B;AACA,MAAI,QAAQ,aAAa;AACvB,SAAK,KAAK,kBAAkB,QAAQ,WAAW;AAAA,EACjD;AAEA,SAAO;AACT;;;ADpLA;AAWA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAMjB,SAAS,WAAW,MAAgB,mBAAwC,CAAC,GAAoB;AAC/F,QAAM,EAAE,eAAe,MAAM,IAAI;AAEjC,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,cAAc;AAChB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,cAAc,UAAU;AAC9B,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,WAAW;AAAA,QACxE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAe;AACtC,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,QACd,YACA,UAA0B,CAAC,GACV;AACjB,QAAM,YAAY,MAAM,QAAQ,UAAU,IAAI,aAAa,CAAC,UAAU;AACtE,MAAI,UAAU,WAAW,GAAG;AAC1B,WAAO,QAAQ,OAAO,IAAI,MAAM,2CAA2C,CAAC;AAAA,EAC9E;AAEA,aAAW,SAAS,WAAW;AAC7B,QAAI,CAAI,cAAW,KAAK,GAAG;AACzB,aAAO,QAAQ,OAAO,IAAI,MAAM,mCAAmC,KAAK,EAAE,CAAC;AAAA,IAC7E;AAAA,EACF;AAEA,QAAM,OAAiB,CAAC,GAAG,WAAW,GAAG,UAAU,OAAO,CAAC;AAE3D,SAAO,WAAW,MAAM;AAAA,IACtB,cAAc,CAAC,QAAQ;AAAA,EACzB,CAAC;AACH;AAwBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,UAAQ;AAAA,IACN;AAAA,EACF;AAGA,QAAM,UAAoB,CAAC;AAC3B,MAAI,CAAC,QAAQ,QAAQ;AACnB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,kBAAkB;AAC5B,QAAI,QAAQ,oBAAoB;AAC9B,cAAQ,KAAK,sBAAsB;AAAA,IACrC,WAAW,QAAQ,gBAAgB;AACjC,cAAQ,KAAK,oBAAoB;AAAA,IACnC,OAAO;AACL,cAAQ,KAAK,UAAU;AAAA,IACzB;AAAA,EACF;AACA,MAAI,QAAQ,cAAc;AACxB,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,MAAI,QAAQ,sBAAsB;AAChC,YAAQ,KAAK,KAAK;AAAA,EACpB;AAEA,SAAO,QAAQ,WAAW;AAAA,IACxB,WAAW,QAAQ;AAAA,IACnB,UAAU,QAAQ;AAAA,IAClB,qBAAqB,QAAQ;AAAA,IAC7B,gBAAgB,QAAQ;AAAA,IACxB,kBAAkB,QAAQ;AAAA,IAC1B,eAAe,QAAQ;AAAA,IACvB,QAAQ,QAAQ,SAAS,IAAI,UAAU;AAAA,IACvC,OAAO,CAAC,QAAQ;AAAA,EAClB,CAAC;AACH;","names":[]}
|
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Options for the convert function.
|
|
3
|
+
*/
|
|
4
|
+
interface ConvertOptions {
|
|
5
|
+
/** Directory where output files are written. Default: input file directory */
|
|
6
|
+
outputDir?: string;
|
|
7
|
+
/** Password for encrypted PDF files */
|
|
8
|
+
password?: string;
|
|
9
|
+
/** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */
|
|
10
|
+
format?: string | string[];
|
|
11
|
+
/** Suppress console logging output */
|
|
12
|
+
quiet?: boolean;
|
|
13
|
+
/** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */
|
|
14
|
+
contentSafetyOff?: string | string[];
|
|
15
|
+
/** Preserve original line breaks in extracted text */
|
|
16
|
+
keepLineBreaks?: boolean;
|
|
17
|
+
/** Replacement character for invalid/unrecognized characters. Default: space */
|
|
18
|
+
replaceInvalidChars?: string;
|
|
19
|
+
/** Use PDF structure tree (tagged PDF) for reading order and semantic structure */
|
|
20
|
+
useStructTree?: boolean;
|
|
21
|
+
/** Table detection method. Values: cluster */
|
|
22
|
+
tableMethod?: string;
|
|
23
|
+
/** Reading order algorithm. Values: none, xycut. Default: none */
|
|
24
|
+
readingOrder?: string;
|
|
25
|
+
/** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */
|
|
26
|
+
markdownPageSeparator?: string;
|
|
27
|
+
/** Separator between pages in text output. Use %page-number% for page numbers. Default: none */
|
|
28
|
+
textPageSeparator?: string;
|
|
29
|
+
/** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */
|
|
30
|
+
htmlPageSeparator?: string;
|
|
31
|
+
/** Embed images as Base64 data URIs instead of file path references */
|
|
32
|
+
embedImages?: boolean;
|
|
33
|
+
/** Output format for extracted images. Values: png, jpeg. Default: png */
|
|
34
|
+
imageFormat?: string;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Build CLI arguments array from ConvertOptions.
|
|
38
|
+
*/
|
|
39
|
+
declare function buildArgs(options: ConvertOptions): string[];
|
|
40
|
+
|
|
41
|
+
declare function convert(inputPaths: string | string[], options?: ConvertOptions): Promise<string>;
|
|
42
|
+
/**
|
|
43
|
+
* @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.
|
|
44
|
+
*/
|
|
1
45
|
interface RunOptions {
|
|
2
46
|
outputFolder?: string;
|
|
3
47
|
password?: string;
|
|
@@ -13,17 +57,9 @@ interface RunOptions {
|
|
|
13
57
|
debug?: boolean;
|
|
14
58
|
useStructTree?: boolean;
|
|
15
59
|
}
|
|
60
|
+
/**
|
|
61
|
+
* @deprecated Use `convert()` instead. This function will be removed in a future version.
|
|
62
|
+
*/
|
|
16
63
|
declare function run(inputPath: string, options?: RunOptions): Promise<string>;
|
|
17
|
-
interface ConvertOptions {
|
|
18
|
-
outputDir?: string;
|
|
19
|
-
password?: string;
|
|
20
|
-
format?: string | string[];
|
|
21
|
-
quiet?: boolean;
|
|
22
|
-
contentSafetyOff?: string | string[];
|
|
23
|
-
keepLineBreaks?: boolean;
|
|
24
|
-
replaceInvalidChars?: string;
|
|
25
|
-
useStructTree?: boolean;
|
|
26
|
-
}
|
|
27
|
-
declare function convert(inputPaths: string[], options?: ConvertOptions): Promise<string>;
|
|
28
64
|
|
|
29
|
-
export { type ConvertOptions, type RunOptions, convert, run };
|
|
65
|
+
export { type ConvertOptions, type RunOptions, buildArgs, convert, run };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Options for the convert function.
|
|
3
|
+
*/
|
|
4
|
+
interface ConvertOptions {
|
|
5
|
+
/** Directory where output files are written. Default: input file directory */
|
|
6
|
+
outputDir?: string;
|
|
7
|
+
/** Password for encrypted PDF files */
|
|
8
|
+
password?: string;
|
|
9
|
+
/** Output formats (comma-separated). Values: json, text, html, pdf, markdown, markdown-with-html, markdown-with-images. Default: json */
|
|
10
|
+
format?: string | string[];
|
|
11
|
+
/** Suppress console logging output */
|
|
12
|
+
quiet?: boolean;
|
|
13
|
+
/** Disable content safety filters. Values: all, hidden-text, off-page, tiny, hidden-ocg */
|
|
14
|
+
contentSafetyOff?: string | string[];
|
|
15
|
+
/** Preserve original line breaks in extracted text */
|
|
16
|
+
keepLineBreaks?: boolean;
|
|
17
|
+
/** Replacement character for invalid/unrecognized characters. Default: space */
|
|
18
|
+
replaceInvalidChars?: string;
|
|
19
|
+
/** Use PDF structure tree (tagged PDF) for reading order and semantic structure */
|
|
20
|
+
useStructTree?: boolean;
|
|
21
|
+
/** Table detection method. Values: cluster */
|
|
22
|
+
tableMethod?: string;
|
|
23
|
+
/** Reading order algorithm. Values: none, xycut. Default: none */
|
|
24
|
+
readingOrder?: string;
|
|
25
|
+
/** Separator between pages in Markdown output. Use %page-number% for page numbers. Default: none */
|
|
26
|
+
markdownPageSeparator?: string;
|
|
27
|
+
/** Separator between pages in text output. Use %page-number% for page numbers. Default: none */
|
|
28
|
+
textPageSeparator?: string;
|
|
29
|
+
/** Separator between pages in HTML output. Use %page-number% for page numbers. Default: none */
|
|
30
|
+
htmlPageSeparator?: string;
|
|
31
|
+
/** Embed images as Base64 data URIs instead of file path references */
|
|
32
|
+
embedImages?: boolean;
|
|
33
|
+
/** Output format for extracted images. Values: png, jpeg. Default: png */
|
|
34
|
+
imageFormat?: string;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Build CLI arguments array from ConvertOptions.
|
|
38
|
+
*/
|
|
39
|
+
declare function buildArgs(options: ConvertOptions): string[];
|
|
40
|
+
|
|
41
|
+
declare function convert(inputPaths: string | string[], options?: ConvertOptions): Promise<string>;
|
|
42
|
+
/**
|
|
43
|
+
* @deprecated Use `convert()` and `ConvertOptions` instead. This function will be removed in a future version.
|
|
44
|
+
*/
|
|
1
45
|
interface RunOptions {
|
|
2
46
|
outputFolder?: string;
|
|
3
47
|
password?: string;
|
|
@@ -13,17 +57,9 @@ interface RunOptions {
|
|
|
13
57
|
debug?: boolean;
|
|
14
58
|
useStructTree?: boolean;
|
|
15
59
|
}
|
|
60
|
+
/**
|
|
61
|
+
* @deprecated Use `convert()` instead. This function will be removed in a future version.
|
|
62
|
+
*/
|
|
16
63
|
declare function run(inputPath: string, options?: RunOptions): Promise<string>;
|
|
17
|
-
interface ConvertOptions {
|
|
18
|
-
outputDir?: string;
|
|
19
|
-
password?: string;
|
|
20
|
-
format?: string | string[];
|
|
21
|
-
quiet?: boolean;
|
|
22
|
-
contentSafetyOff?: string | string[];
|
|
23
|
-
keepLineBreaks?: boolean;
|
|
24
|
-
replaceInvalidChars?: string;
|
|
25
|
-
useStructTree?: boolean;
|
|
26
|
-
}
|
|
27
|
-
declare function convert(inputPaths: string[], options?: ConvertOptions): Promise<string>;
|
|
28
64
|
|
|
29
|
-
export { type ConvertOptions, type RunOptions, convert, run };
|
|
65
|
+
export { type ConvertOptions, type RunOptions, buildArgs, convert, run };
|
package/dist/index.js
CHANGED
|
@@ -3,11 +3,76 @@ import { spawn } from "child_process";
|
|
|
3
3
|
import * as path from "path";
|
|
4
4
|
import * as fs from "fs";
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
|
+
|
|
7
|
+
// src/convert-options.generated.ts
|
|
8
|
+
function buildArgs(options) {
|
|
9
|
+
const args = [];
|
|
10
|
+
if (options.outputDir) {
|
|
11
|
+
args.push("--output-dir", options.outputDir);
|
|
12
|
+
}
|
|
13
|
+
if (options.password) {
|
|
14
|
+
args.push("--password", options.password);
|
|
15
|
+
}
|
|
16
|
+
if (options.format) {
|
|
17
|
+
if (Array.isArray(options.format)) {
|
|
18
|
+
if (options.format.length > 0) {
|
|
19
|
+
args.push("--format", options.format.join(","));
|
|
20
|
+
}
|
|
21
|
+
} else {
|
|
22
|
+
args.push("--format", options.format);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
if (options.quiet) {
|
|
26
|
+
args.push("--quiet");
|
|
27
|
+
}
|
|
28
|
+
if (options.contentSafetyOff) {
|
|
29
|
+
if (Array.isArray(options.contentSafetyOff)) {
|
|
30
|
+
if (options.contentSafetyOff.length > 0) {
|
|
31
|
+
args.push("--content-safety-off", options.contentSafetyOff.join(","));
|
|
32
|
+
}
|
|
33
|
+
} else {
|
|
34
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
if (options.keepLineBreaks) {
|
|
38
|
+
args.push("--keep-line-breaks");
|
|
39
|
+
}
|
|
40
|
+
if (options.replaceInvalidChars) {
|
|
41
|
+
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
42
|
+
}
|
|
43
|
+
if (options.useStructTree) {
|
|
44
|
+
args.push("--use-struct-tree");
|
|
45
|
+
}
|
|
46
|
+
if (options.tableMethod) {
|
|
47
|
+
args.push("--table-method", options.tableMethod);
|
|
48
|
+
}
|
|
49
|
+
if (options.readingOrder) {
|
|
50
|
+
args.push("--reading-order", options.readingOrder);
|
|
51
|
+
}
|
|
52
|
+
if (options.markdownPageSeparator) {
|
|
53
|
+
args.push("--markdown-page-separator", options.markdownPageSeparator);
|
|
54
|
+
}
|
|
55
|
+
if (options.textPageSeparator) {
|
|
56
|
+
args.push("--text-page-separator", options.textPageSeparator);
|
|
57
|
+
}
|
|
58
|
+
if (options.htmlPageSeparator) {
|
|
59
|
+
args.push("--html-page-separator", options.htmlPageSeparator);
|
|
60
|
+
}
|
|
61
|
+
if (options.embedImages) {
|
|
62
|
+
args.push("--embed-images");
|
|
63
|
+
}
|
|
64
|
+
if (options.imageFormat) {
|
|
65
|
+
args.push("--image-format", options.imageFormat);
|
|
66
|
+
}
|
|
67
|
+
return args;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// src/index.ts
|
|
6
71
|
var __filename = fileURLToPath(import.meta.url);
|
|
7
72
|
var __dirname = path.dirname(__filename);
|
|
8
73
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
9
74
|
function executeJar(args, executionOptions = {}) {
|
|
10
|
-
const {
|
|
75
|
+
const { streamOutput = false } = executionOptions;
|
|
11
76
|
return new Promise((resolve, reject) => {
|
|
12
77
|
const jarPath = path.join(__dirname, "..", "lib", JAR_NAME);
|
|
13
78
|
if (!fs.existsSync(jarPath)) {
|
|
@@ -60,102 +125,57 @@ ${errorOutput}`
|
|
|
60
125
|
});
|
|
61
126
|
});
|
|
62
127
|
}
|
|
63
|
-
function run(inputPath, options = {}) {
|
|
64
|
-
return new Promise((resolve, reject) => {
|
|
65
|
-
if (!fs.existsSync(inputPath)) {
|
|
66
|
-
return reject(new Error(`Input file or folder not found: ${inputPath}`));
|
|
67
|
-
}
|
|
68
|
-
const args = [];
|
|
69
|
-
if (options.outputFolder) {
|
|
70
|
-
args.push("--output-dir", options.outputFolder);
|
|
71
|
-
}
|
|
72
|
-
if (options.password) {
|
|
73
|
-
args.push("--password", options.password);
|
|
74
|
-
}
|
|
75
|
-
if (options.replaceInvalidChars) {
|
|
76
|
-
args.push("--replace-invalid-chars", options.replaceInvalidChars);
|
|
77
|
-
}
|
|
78
|
-
if (options.generateMarkdown) {
|
|
79
|
-
args.push("--markdown");
|
|
80
|
-
}
|
|
81
|
-
if (options.generateHtml) {
|
|
82
|
-
args.push("--html");
|
|
83
|
-
}
|
|
84
|
-
if (options.generateAnnotatedPdf) {
|
|
85
|
-
args.push("--pdf");
|
|
86
|
-
}
|
|
87
|
-
if (options.keepLineBreaks) {
|
|
88
|
-
args.push("--keep-line-breaks");
|
|
89
|
-
}
|
|
90
|
-
if (options.contentSafetyOff) {
|
|
91
|
-
args.push("--content-safety-off", options.contentSafetyOff);
|
|
92
|
-
}
|
|
93
|
-
if (options.htmlInMarkdown) {
|
|
94
|
-
args.push("--markdown-with-html");
|
|
95
|
-
}
|
|
96
|
-
if (options.addImageToMarkdown) {
|
|
97
|
-
args.push("--markdown-with-images");
|
|
98
|
-
}
|
|
99
|
-
if (options.noJson) {
|
|
100
|
-
args.push("--no-json");
|
|
101
|
-
}
|
|
102
|
-
if (options.useStructTree) {
|
|
103
|
-
args.push("--use-struct-tree");
|
|
104
|
-
}
|
|
105
|
-
args.push(inputPath);
|
|
106
|
-
executeJar(args, {
|
|
107
|
-
debug: options.debug,
|
|
108
|
-
streamOutput: Boolean(options.debug)
|
|
109
|
-
}).then(resolve).catch(reject);
|
|
110
|
-
});
|
|
111
|
-
}
|
|
112
128
|
function convert(inputPaths, options = {}) {
|
|
113
|
-
|
|
129
|
+
const inputList = Array.isArray(inputPaths) ? inputPaths : [inputPaths];
|
|
130
|
+
if (inputList.length === 0) {
|
|
114
131
|
return Promise.reject(new Error("At least one input path must be provided."));
|
|
115
132
|
}
|
|
116
|
-
for (const input of
|
|
133
|
+
for (const input of inputList) {
|
|
117
134
|
if (!fs.existsSync(input)) {
|
|
118
135
|
return Promise.reject(new Error(`Input file or folder not found: ${input}`));
|
|
119
136
|
}
|
|
120
137
|
}
|
|
121
|
-
const args = [...
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
if (options.quiet) {
|
|
136
|
-
args.push("--quiet");
|
|
138
|
+
const args = [...inputList, ...buildArgs(options)];
|
|
139
|
+
return executeJar(args, {
|
|
140
|
+
streamOutput: !options.quiet
|
|
141
|
+
});
|
|
142
|
+
}
|
|
143
|
+
function run(inputPath, options = {}) {
|
|
144
|
+
console.warn(
|
|
145
|
+
"Warning: run() is deprecated and will be removed in a future version. Use convert() instead."
|
|
146
|
+
);
|
|
147
|
+
const formats = [];
|
|
148
|
+
if (!options.noJson) {
|
|
149
|
+
formats.push("json");
|
|
137
150
|
}
|
|
138
|
-
if (options.
|
|
139
|
-
if (
|
|
140
|
-
|
|
151
|
+
if (options.generateMarkdown) {
|
|
152
|
+
if (options.addImageToMarkdown) {
|
|
153
|
+
formats.push("markdown-with-images");
|
|
154
|
+
} else if (options.htmlInMarkdown) {
|
|
155
|
+
formats.push("markdown-with-html");
|
|
141
156
|
} else {
|
|
142
|
-
|
|
157
|
+
formats.push("markdown");
|
|
143
158
|
}
|
|
144
159
|
}
|
|
145
|
-
if (options.
|
|
146
|
-
|
|
160
|
+
if (options.generateHtml) {
|
|
161
|
+
formats.push("html");
|
|
147
162
|
}
|
|
148
|
-
if (options.
|
|
149
|
-
|
|
163
|
+
if (options.generateAnnotatedPdf) {
|
|
164
|
+
formats.push("pdf");
|
|
150
165
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
166
|
+
return convert(inputPath, {
|
|
167
|
+
outputDir: options.outputFolder,
|
|
168
|
+
password: options.password,
|
|
169
|
+
replaceInvalidChars: options.replaceInvalidChars,
|
|
170
|
+
keepLineBreaks: options.keepLineBreaks,
|
|
171
|
+
contentSafetyOff: options.contentSafetyOff,
|
|
172
|
+
useStructTree: options.useStructTree,
|
|
173
|
+
format: formats.length > 0 ? formats : void 0,
|
|
174
|
+
quiet: !options.debug
|
|
156
175
|
});
|
|
157
176
|
}
|
|
158
177
|
export {
|
|
178
|
+
buildArgs,
|
|
159
179
|
convert,
|
|
160
180
|
run
|
|
161
181
|
};
|