@opendataloader/pdf 0.0.16 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -38
- package/dist/index.cjs +11 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +11 -3
- package/dist/index.js.map +1 -1
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# OpenDataLoader PDF
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
|
|
4
4
|
[](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
|
|
5
5
|

|
|
6
|
-

|
|
7
7
|
[](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
|
|
8
8
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
9
9
|
[](https://www.npmjs.com/package/@opendataloader/pdf)
|
|
@@ -50,7 +50,7 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
|
|
|
50
50
|
## Prerequisites
|
|
51
51
|
|
|
52
52
|
- Java 11 or higher must be installed and available in your system's PATH.
|
|
53
|
-
- Python 3.
|
|
53
|
+
- Python 3.9+
|
|
54
54
|
|
|
55
55
|
<br/>
|
|
56
56
|
|
|
@@ -71,8 +71,8 @@ pip install -U opendataloader-pdf
|
|
|
71
71
|
import opendataloader_pdf
|
|
72
72
|
|
|
73
73
|
opendataloader_pdf.run(
|
|
74
|
-
input_path="path
|
|
75
|
-
output_folder="path
|
|
74
|
+
input_path="path/to/document.pdf",
|
|
75
|
+
output_folder="path/to/output",
|
|
76
76
|
generate_markdown=True,
|
|
77
77
|
generate_html=True,
|
|
78
78
|
generate_annotated_pdf=True,
|
|
@@ -83,20 +83,20 @@ opendataloader_pdf.run(
|
|
|
83
83
|
|
|
84
84
|
The main function to process PDFs.
|
|
85
85
|
|
|
86
|
-
| Parameter | Type | Required | Default | Description
|
|
87
|
-
|--------------------------| ------ | --------
|
|
88
|
-
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder.
|
|
89
|
-
| `output_folder` | `str` | No | input folder | Path to the output folder.
|
|
90
|
-
| `password` | `str` | No | `None` | Password for the PDF file.
|
|
91
|
-
| `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000)
|
|
92
|
-
| `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
|
|
93
|
-
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file.
|
|
94
|
-
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file.
|
|
95
|
-
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file.
|
|
96
|
-
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output.
|
|
97
|
-
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output.
|
|
98
|
-
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output.
|
|
99
|
-
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution.
|
|
86
|
+
| Parameter | Type | Required | Default | Description |
|
|
87
|
+
|--------------------------| ------ | -------- |--------------|-------------------------------------------------------------------------------------------------------------------------------------------|
|
|
88
|
+
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
|
|
89
|
+
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
90
|
+
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
91
|
+
| `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
|
|
92
|
+
| `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
|
|
93
|
+
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
94
|
+
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
|
|
95
|
+
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
96
|
+
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
97
|
+
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
98
|
+
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
|
|
99
|
+
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
|
|
100
100
|
|
|
101
101
|
<br/>
|
|
102
102
|
|
|
@@ -124,8 +124,8 @@ import { run } from '@opendataloader/pdf';
|
|
|
124
124
|
|
|
125
125
|
async function main() {
|
|
126
126
|
try {
|
|
127
|
-
const output = await run('path
|
|
128
|
-
outputFolder: 'path
|
|
127
|
+
const output = await run('path/to/document.pdf', {
|
|
128
|
+
outputFolder: 'path/to/output',
|
|
129
129
|
generateMarkdown: true,
|
|
130
130
|
generateHtml: true,
|
|
131
131
|
generateAnnotatedPdf: true,
|
|
@@ -155,19 +155,19 @@ The main function to process PDFs.
|
|
|
155
155
|
|
|
156
156
|
**RunOptions**
|
|
157
157
|
|
|
158
|
-
| Property | Type | Default | Description
|
|
159
|
-
| ----------------------- | --------- | -------------
|
|
160
|
-
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input.
|
|
161
|
-
| `password` | `string` | `undefined` | Password for the PDF file.
|
|
162
|
-
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., , \u0000).
|
|
163
|
-
| `
|
|
164
|
-
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file.
|
|
165
|
-
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file.
|
|
166
|
-
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file.
|
|
167
|
-
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output.
|
|
168
|
-
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output.
|
|
169
|
-
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output.
|
|
170
|
-
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution.
|
|
158
|
+
| Property | Type | Default | Description |
|
|
159
|
+
| ----------------------- | --------- | ------------- |-------------------------------------------------------------------------------------------------------------------------------------------|
|
|
160
|
+
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
|
|
161
|
+
| `password` | `string` | `undefined` | Password for the PDF file. |
|
|
162
|
+
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000). |
|
|
163
|
+
| `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
|
|
164
|
+
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
|
|
165
|
+
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
|
|
166
|
+
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
|
|
167
|
+
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
|
|
168
|
+
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
|
|
169
|
+
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
|
|
170
|
+
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
|
|
171
171
|
|
|
172
172
|
<br/>
|
|
173
173
|
|
|
@@ -189,7 +189,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
189
189
|
<dependency>
|
|
190
190
|
<groupId>org.opendataloader</groupId>
|
|
191
191
|
<artifactId>opendataloader-pdf-core</artifactId>
|
|
192
|
-
<version>0.0
|
|
192
|
+
<version>1.0.0</version>
|
|
193
193
|
</dependency>
|
|
194
194
|
</dependencies>
|
|
195
195
|
|
|
@@ -273,10 +273,10 @@ docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pd
|
|
|
273
273
|
|
|
274
274
|
### Build
|
|
275
275
|
|
|
276
|
-
Build and
|
|
276
|
+
Build and install using Maven command:
|
|
277
277
|
|
|
278
278
|
```sh
|
|
279
|
-
mvn clean
|
|
279
|
+
mvn clean install -f java/pom.xml
|
|
280
280
|
```
|
|
281
281
|
|
|
282
282
|
If the build is successful, the resulting `jar` file will be created in the path below.
|
|
@@ -309,7 +309,7 @@ The images are extracted from PDF as individual files and stored in a subfolder
|
|
|
309
309
|
Options:
|
|
310
310
|
-o,--output-dir <arg> Specifies the output directory for generated files
|
|
311
311
|
--keep-line-breaks Preserves original line breaks in the extracted text
|
|
312
|
-
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page
|
|
312
|
+
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny
|
|
313
313
|
--markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
|
|
314
314
|
--markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
|
|
315
315
|
--markdown Sets the data extraction output format to Markdown
|
package/dist/index.cjs
CHANGED
|
@@ -41,6 +41,14 @@ var import_meta = {};
|
|
|
41
41
|
var __filename = (0, import_url.fileURLToPath)(import_meta.url);
|
|
42
42
|
var __dirname = path.dirname(__filename);
|
|
43
43
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
44
|
+
/**
 * Builds a single printable string for a command line, with password values
 * masked so the result is safe to write to a debug log.
 *
 * The input array is never modified; a new array is built for logging.
 *
 * @param {string} command - The executable name (e.g. "java").
 * @param {string[]} commandArgs - The arguments that will be passed to the command.
 * @returns {string} The command followed by its space-joined arguments, with
 *   every password value replaced by "[REDACTED]".
 */
function getRedactedCommandString(command, commandArgs) {
  const passwordFlag = "--password";
  const redacted = "[REDACTED]";
  const commandArgsForLogging = commandArgs.map((arg, index) => {
    // Mask every argument that directly follows a "--password" token, not just
    // the first one: a repeated flag, or an earlier option whose value happens
    // to be the text "--password", must not leave the real secret visible.
    if (index > 0 && commandArgs[index - 1] === passwordFlag) {
      return redacted;
    }
    // Also mask the single-token "--password=value" spelling, in case a caller
    // builds the argument list that way.
    if (typeof arg === "string" && arg.startsWith(`${passwordFlag}=`)) {
      return `${passwordFlag}=${redacted}`;
    }
    return arg;
  });
  return `${command} ${commandArgsForLogging.join(" ")}`;
}
|
|
44
52
|
function run(inputPath, options = {}) {
|
|
45
53
|
return new Promise((resolve, reject) => {
|
|
46
54
|
if (!fs.existsSync(inputPath)) {
|
|
@@ -68,8 +76,8 @@ function run(inputPath, options = {}) {
|
|
|
68
76
|
if (options.keepLineBreaks) {
|
|
69
77
|
args.push("--keep-line-breaks");
|
|
70
78
|
}
|
|
71
|
-
if (options.
|
|
72
|
-
args.push("--
|
|
79
|
+
if (options.contentSafetyOff) {
|
|
80
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
73
81
|
}
|
|
74
82
|
if (options.htmlInMarkdown) {
|
|
75
83
|
args.push("--markdown-with-html");
|
|
@@ -87,7 +95,7 @@ function run(inputPath, options = {}) {
|
|
|
87
95
|
const command = "java";
|
|
88
96
|
const commandArgs = ["-jar", jarPath, ...args];
|
|
89
97
|
if (options.debug) {
|
|
90
|
-
console.error(`Running command: ${command
|
|
98
|
+
console.error(`Running command: ${getRedactedCommandString(command, commandArgs)}`);
|
|
91
99
|
}
|
|
92
100
|
const javaProcess = (0, import_child_process.spawn)(command, commandArgs);
|
|
93
101
|
let stdout = "";
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\nfunction getRedactedCommandString(command: string, commandArgs: string[]): string {\n const commandArgsForLogging = [...commandArgs];\n const passwordIndex = commandArgsForLogging.indexOf('--password');\n if (passwordIndex > -1 && passwordIndex + 1 < commandArgsForLogging.length) {\n commandArgsForLogging[passwordIndex + 1] = '[REDACTED]';\n }\n return `${command} ${commandArgsForLogging.join(' ')}`;\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n debug?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n 
}\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n\n args.push(inputPath);\n\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n if (options.debug) {\n console.error(`Running command: ${getRedactedCommandString(command, commandArgs)}`);\n }\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (options.debug) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (options.debug) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${stderr}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. 
Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAAsB;AACtB,WAAsB;AACtB,SAAoB;AACpB,iBAA8B;AAH9B;AAKA,IAAM,iBAAa,0BAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAEjB,SAAS,yBAAyB,SAAiB,aAA+B;AAChF,QAAM,wBAAwB,CAAC,GAAG,WAAW;AAC7C,QAAM,gBAAgB,sBAAsB,QAAQ,YAAY;AAChE,MAAI,gBAAgB,MAAM,gBAAgB,IAAI,sBAAsB,QAAQ;AAC1E,0BAAsB,gBAAgB,CAAC,IAAI;AAAA,EAC7C;AACA,SAAO,GAAG,OAAO,IAAI,sBAAsB,KAAK,GAAG,CAAC;AACtD;AAgBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,QAAI,CAAI,cAAW,SAAS,GAAG;AAC7B,aAAO,OAAO,IAAI,MAAM,mCAAmC,SAAS,EAAE,CAAC;AAAA,IACzE;AAEA,UAAM,OAAiB,CAAC;AACxB,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,gBAAgB,QAAQ,YAAY;AAAA,IAChD;AACA,QAAI,QAAQ,UAAU;AACpB,WAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,IAC1C;AACA,QAAI,QAAQ,qBAAqB;AAC/B,WAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,IAClE;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,YAAY;AAAA,IACxB;AACA,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,QAAQ;AAAA,IACpB;AACA,QAAI,QAAQ,sBAAsB;AAChC,WAAK,KAAK,OAAO;AAAA,IACnB;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,oBAAoB;AAAA,IAChC;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,sBAAsB;AAAA,IAClC;AACA,QAAI,QAAQ,oBAAoB;AAC9B,WAAK,KAAK,wBAAwB;AAAA,IACpC;AAEA,SAAK,KAAK,SAAS;AAEnB,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,QAAI,QAAQ,OAAO;AACjB,cAAQ,MAAM,oBAAoB,yBAAyB,SAAS,WAAW,CAAC,EAAE;AAAA,IACpF;AAEA,UAAM,kBAAc,4BAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,QAAQ,OAAO;AACjB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,QAAQ,OAAO;AACjB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY
,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,MAAM;AAAA,QACnE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAQ;AAC/B,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;","names":[]}
|
package/dist/index.d.cts
CHANGED
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -6,6 +6,14 @@ import { fileURLToPath } from "url";
|
|
|
6
6
|
var __filename = fileURLToPath(import.meta.url);
|
|
7
7
|
var __dirname = path.dirname(__filename);
|
|
8
8
|
var JAR_NAME = "opendataloader-pdf-cli.jar";
|
|
9
|
+
/**
 * Builds a single printable string for a command line, with password values
 * masked so the result is safe to write to a debug log.
 *
 * The input array is never modified; a new array is built for logging.
 *
 * @param {string} command - The executable name (e.g. "java").
 * @param {string[]} commandArgs - The arguments that will be passed to the command.
 * @returns {string} The command followed by its space-joined arguments, with
 *   every password value replaced by "[REDACTED]".
 */
function getRedactedCommandString(command, commandArgs) {
  const passwordFlag = "--password";
  const redacted = "[REDACTED]";
  const commandArgsForLogging = commandArgs.map((arg, index) => {
    // Mask every argument that directly follows a "--password" token, not just
    // the first one: a repeated flag, or an earlier option whose value happens
    // to be the text "--password", must not leave the real secret visible.
    if (index > 0 && commandArgs[index - 1] === passwordFlag) {
      return redacted;
    }
    // Also mask the single-token "--password=value" spelling, in case a caller
    // builds the argument list that way.
    if (typeof arg === "string" && arg.startsWith(`${passwordFlag}=`)) {
      return `${passwordFlag}=${redacted}`;
    }
    return arg;
  });
  return `${command} ${commandArgsForLogging.join(" ")}`;
}
|
|
9
17
|
function run(inputPath, options = {}) {
|
|
10
18
|
return new Promise((resolve, reject) => {
|
|
11
19
|
if (!fs.existsSync(inputPath)) {
|
|
@@ -33,8 +41,8 @@ function run(inputPath, options = {}) {
|
|
|
33
41
|
if (options.keepLineBreaks) {
|
|
34
42
|
args.push("--keep-line-breaks");
|
|
35
43
|
}
|
|
36
|
-
if (options.
|
|
37
|
-
args.push("--
|
|
44
|
+
if (options.contentSafetyOff) {
|
|
45
|
+
args.push("--content-safety-off", options.contentSafetyOff);
|
|
38
46
|
}
|
|
39
47
|
if (options.htmlInMarkdown) {
|
|
40
48
|
args.push("--markdown-with-html");
|
|
@@ -52,7 +60,7 @@ function run(inputPath, options = {}) {
|
|
|
52
60
|
const command = "java";
|
|
53
61
|
const commandArgs = ["-jar", jarPath, ...args];
|
|
54
62
|
if (options.debug) {
|
|
55
|
-
console.error(`Running command: ${command
|
|
63
|
+
console.error(`Running command: ${getRedactedCommandString(command, commandArgs)}`);
|
|
56
64
|
}
|
|
57
65
|
const javaProcess = spawn(command, commandArgs);
|
|
58
66
|
let stdout = "";
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { spawn } from 'child_process';\nimport * as path from 'path';\nimport * as fs from 'fs';\nimport { fileURLToPath } from 'url';\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = path.dirname(__filename);\n\nconst JAR_NAME = 'opendataloader-pdf-cli.jar';\n\nfunction getRedactedCommandString(command: string, commandArgs: string[]): string {\n const commandArgsForLogging = [...commandArgs];\n const passwordIndex = commandArgsForLogging.indexOf('--password');\n if (passwordIndex > -1 && passwordIndex + 1 < commandArgsForLogging.length) {\n commandArgsForLogging[passwordIndex + 1] = '[REDACTED]';\n }\n return `${command} ${commandArgsForLogging.join(' ')}`;\n}\n\nexport interface RunOptions {\n outputFolder?: string;\n password?: string;\n replaceInvalidChars?: string;\n generateMarkdown?: boolean;\n generateHtml?: boolean;\n generateAnnotatedPdf?: boolean;\n keepLineBreaks?: boolean;\n contentSafetyOff?: string;\n htmlInMarkdown?: boolean;\n addImageToMarkdown?: boolean;\n debug?: boolean;\n}\n\nexport function run(inputPath: string, options: RunOptions = {}): Promise<string> {\n return new Promise((resolve, reject) => {\n if (!fs.existsSync(inputPath)) {\n return reject(new Error(`Input file or folder not found: ${inputPath}`));\n }\n\n const args: string[] = [];\n if (options.outputFolder) {\n args.push('--output-dir', options.outputFolder);\n }\n if (options.password) {\n args.push('--password', options.password);\n }\n if (options.replaceInvalidChars) {\n args.push('--replace-invalid-chars', options.replaceInvalidChars);\n }\n if (options.generateMarkdown) {\n args.push('--markdown');\n }\n if (options.generateHtml) {\n args.push('--html');\n }\n if (options.generateAnnotatedPdf) {\n args.push('--pdf');\n }\n if (options.keepLineBreaks) {\n args.push('--keep-line-breaks');\n }\n if (options.contentSafetyOff) {\n args.push('--content-safety-off', options.contentSafetyOff);\n 
}\n if (options.htmlInMarkdown) {\n args.push('--markdown-with-html');\n }\n if (options.addImageToMarkdown) {\n args.push('--markdown-with-images');\n }\n\n args.push(inputPath);\n\n const jarPath = path.join(__dirname, '..', 'lib', JAR_NAME);\n\n if (!fs.existsSync(jarPath)) {\n return reject(\n new Error(`JAR file not found at ${jarPath}. Please run the build script first.`),\n );\n }\n\n const command = 'java';\n const commandArgs = ['-jar', jarPath, ...args];\n\n if (options.debug) {\n console.error(`Running command: ${getRedactedCommandString(command, commandArgs)}`);\n }\n\n const javaProcess = spawn(command, commandArgs);\n\n let stdout = '';\n let stderr = '';\n\n javaProcess.stdout.on('data', (data) => {\n const chunk = data.toString();\n if (options.debug) {\n process.stdout.write(chunk);\n }\n stdout += chunk;\n });\n\n javaProcess.stderr.on('data', (data) => {\n const chunk = data.toString();\n if (options.debug) {\n process.stderr.write(chunk);\n }\n stderr += chunk;\n });\n\n javaProcess.on('close', (code) => {\n if (code === 0) {\n resolve(stdout);\n } else {\n const error = new Error(\n `The opendataloader-pdf CLI exited with code ${code}.\\n\\n${stderr}`,\n );\n reject(error);\n }\n });\n\n javaProcess.on('error', (err) => {\n if (err.message.includes('ENOENT')) {\n reject(\n new Error(\n \"'java' command not found. 
Please ensure Java is installed and in your system's PATH.\",\n ),\n );\n } else {\n reject(err);\n }\n });\n });\n}"],"mappings":";AAAA,SAAS,aAAa;AACtB,YAAY,UAAU;AACtB,YAAY,QAAQ;AACpB,SAAS,qBAAqB;AAE9B,IAAM,aAAa,cAAc,YAAY,GAAG;AAChD,IAAM,YAAiB,aAAQ,UAAU;AAEzC,IAAM,WAAW;AAEjB,SAAS,yBAAyB,SAAiB,aAA+B;AAChF,QAAM,wBAAwB,CAAC,GAAG,WAAW;AAC7C,QAAM,gBAAgB,sBAAsB,QAAQ,YAAY;AAChE,MAAI,gBAAgB,MAAM,gBAAgB,IAAI,sBAAsB,QAAQ;AAC1E,0BAAsB,gBAAgB,CAAC,IAAI;AAAA,EAC7C;AACA,SAAO,GAAG,OAAO,IAAI,sBAAsB,KAAK,GAAG,CAAC;AACtD;AAgBO,SAAS,IAAI,WAAmB,UAAsB,CAAC,GAAoB;AAChF,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,QAAI,CAAI,cAAW,SAAS,GAAG;AAC7B,aAAO,OAAO,IAAI,MAAM,mCAAmC,SAAS,EAAE,CAAC;AAAA,IACzE;AAEA,UAAM,OAAiB,CAAC;AACxB,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,gBAAgB,QAAQ,YAAY;AAAA,IAChD;AACA,QAAI,QAAQ,UAAU;AACpB,WAAK,KAAK,cAAc,QAAQ,QAAQ;AAAA,IAC1C;AACA,QAAI,QAAQ,qBAAqB;AAC/B,WAAK,KAAK,2BAA2B,QAAQ,mBAAmB;AAAA,IAClE;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,YAAY;AAAA,IACxB;AACA,QAAI,QAAQ,cAAc;AACxB,WAAK,KAAK,QAAQ;AAAA,IACpB;AACA,QAAI,QAAQ,sBAAsB;AAChC,WAAK,KAAK,OAAO;AAAA,IACnB;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,oBAAoB;AAAA,IAChC;AACA,QAAI,QAAQ,kBAAkB;AAC5B,WAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAAA,IAC5D;AACA,QAAI,QAAQ,gBAAgB;AAC1B,WAAK,KAAK,sBAAsB;AAAA,IAClC;AACA,QAAI,QAAQ,oBAAoB;AAC9B,WAAK,KAAK,wBAAwB;AAAA,IACpC;AAEA,SAAK,KAAK,SAAS;AAEnB,UAAM,UAAe,UAAK,WAAW,MAAM,OAAO,QAAQ;AAE1D,QAAI,CAAI,cAAW,OAAO,GAAG;AAC3B,aAAO;AAAA,QACL,IAAI,MAAM,yBAAyB,OAAO,sCAAsC;AAAA,MAClF;AAAA,IACF;AAEA,UAAM,UAAU;AAChB,UAAM,cAAc,CAAC,QAAQ,SAAS,GAAG,IAAI;AAE7C,QAAI,QAAQ,OAAO;AACjB,cAAQ,MAAM,oBAAoB,yBAAyB,SAAS,WAAW,CAAC,EAAE;AAAA,IACpF;AAEA,UAAM,cAAc,MAAM,SAAS,WAAW;AAE9C,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,QAAQ,OAAO;AACjB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,OAAO,GAAG,QAAQ,CAAC,SAAS;AACtC,YAAM,QAAQ,KAAK,SAAS;AAC5B,UAAI,QAAQ,OAAO;AACjB,gBAAQ,OAAO,MAAM,KAAK;AAAA,MAC5B;AACA,gBAAU;AAAA,IACZ,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,SAAS;AAChC,UAAI,SAAS,GAAG;AACd,
gBAAQ,MAAM;AAAA,MAChB,OAAO;AACL,cAAM,QAAQ,IAAI;AAAA,UAChB,+CAA+C,IAAI;AAAA;AAAA,EAAQ,MAAM;AAAA,QACnE;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,gBAAY,GAAG,SAAS,CAAC,QAAQ;AAC/B,UAAI,IAAI,QAAQ,SAAS,QAAQ,GAAG;AAClC;AAAA,UACE,IAAI;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,MACF,OAAO;AACL,eAAO,GAAG;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACH;","names":[]}
|
|
Binary file
|