@opendataloader/pdf 0.0.0 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -62
- package/lib/opendataloader-pdf-cli.jar +0 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
[](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
|
|
5
5
|

|
|
6
6
|

|
|
7
|
-
[](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
|
|
8
8
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
9
|
+
[](https://www.npmjs.com/package/@opendataloader/pdf)
|
|
9
10
|
[](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
|
|
10
11
|
[](https://app.codecov.io/gh/opendataloader-project/opendataloader-pdf)
|
|
11
12
|
[](https://cla-assistant.io/opendataloader-project/opendataloader-pdf)
|
|
@@ -70,8 +71,8 @@ pip install -U opendataloader-pdf
|
|
|
70
71
|
import opendataloader_pdf
|
|
71
72
|
|
|
72
73
|
opendataloader_pdf.run(
|
|
73
|
-
input_path="path
|
|
74
|
-
output_folder="path
|
|
74
|
+
input_path="path-to-document.pdf",
|
|
75
|
+
output_folder="path-to-output",
|
|
75
76
|
generate_markdown=True,
|
|
76
77
|
generate_html=True,
|
|
77
78
|
generate_annotated_pdf=True,
|
|
@@ -82,37 +83,115 @@ opendataloader_pdf.run(
|
|
|
82
83
|
|
|
83
84
|
The main function to process PDFs.
|
|
84
85
|
|
|
85
|
-
| Parameter
|
|
86
|
-
|
|
87
|
-
| `input_path`
|
|
88
|
-
| `output_folder`
|
|
89
|
-
| `password`
|
|
90
|
-
| `replace_invalid_chars`
|
|
91
|
-
| `
|
|
92
|
-
| `
|
|
93
|
-
| `
|
|
94
|
-
| `
|
|
95
|
-
| `
|
|
96
|
-
| `html_in_markdown`
|
|
97
|
-
| `add_image_to_markdown`
|
|
98
|
-
| `debug`
|
|
86
|
+
| Parameter | Type | Required | Default | Description |
|
|
87
|
+
|--------------------------| ------ | -------- |--------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|
88
|
+
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
|
|
89
|
+
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
90
|
+
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
91
|
+
| `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
|
|
92
|
+
| `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
|
|
93
|
+
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
94
|
+
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
|
|
95
|
+
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
96
|
+
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
97
|
+
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
98
|
+
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
|
|
99
|
+
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
|
|
100
|
+
|
|
101
|
+
<br/>
|
|
102
|
+
|
|
103
|
+
## Node.js / NPM
|
|
104
|
+
|
|
105
|
+
**Note:** This package is a wrapper around a Java CLI and is intended for use in a Node.js backend environment. It cannot be used in a browser-based frontend.
|
|
106
|
+
|
|
107
|
+
### Prerequisites
|
|
108
|
+
|
|
109
|
+
- Java 11 or higher must be installed and available in your system's PATH.
|
|
110
|
+
|
|
111
|
+
### Installation
|
|
112
|
+
|
|
113
|
+
```sh
|
|
114
|
+
npm install @opendataloader/pdf
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Usage
|
|
118
|
+
|
|
119
|
+
- `inputPath` can be either the path to a single document or the path to a folder.
|
|
120
|
+
- If you don’t specify an `outputFolder`, the output data will be saved in the same directory as the input document.
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
import { run } from '@opendataloader/pdf';
|
|
124
|
+
|
|
125
|
+
async function main() {
|
|
126
|
+
try {
|
|
127
|
+
const output = await run('path-to-document.pdf', {
|
|
128
|
+
outputFolder: 'path-to-output',
|
|
129
|
+
generateMarkdown: true,
|
|
130
|
+
generateHtml: true,
|
|
131
|
+
generateAnnotatedPdf: true,
|
|
132
|
+
debug: true,
|
|
133
|
+
});
|
|
134
|
+
console.log('PDF processing complete.', output);
|
|
135
|
+
} catch (error) {
|
|
136
|
+
console.error('Error processing PDF:', error);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
main();
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Function: run()
|
|
144
|
+
|
|
145
|
+
`run(inputPath: string, options?: RunOptions): Promise<string>`
|
|
146
|
+
|
|
147
|
+
The main function to process PDFs.
|
|
148
|
+
|
|
149
|
+
**Parameters**
|
|
150
|
+
|
|
151
|
+
| Parameter | Type | Required | Description |
|
|
152
|
+
| ----------- | -------- | -------- | ------------------------------------- |
|
|
153
|
+
| `inputPath` | `string` | ✅ Yes | Path to the input PDF file or folder. |
|
|
154
|
+
| `options` | `RunOptions` | No | Configuration options for the run. |
|
|
155
|
+
|
|
156
|
+
**RunOptions**
|
|
157
|
+
|
|
158
|
+
| Property | Type | Default | Description |
|
|
159
|
+
| ----------------------- | --------- | ------------- | --------------------------------------------------------------------------- |
|
|
160
|
+
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
|
|
161
|
+
| `password` | `string` | `undefined` | Password for the PDF file. |
|
|
162
|
+
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000). |
|
|
163
|
+
| `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
|
|
164
|
+
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
|
|
165
|
+
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
|
|
166
|
+
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
|
|
167
|
+
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
|
|
168
|
+
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
|
|
169
|
+
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
|
|
170
|
+
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
|
|
99
171
|
|
|
100
172
|
<br/>
|
|
101
173
|
|
|
102
174
|
## Java
|
|
103
175
|
|
|
176
|
+
For various example templates, including Gradle and Maven, please refer to https://github.com/opendataloader-project/opendataloader-pdf/tree/main/examples/java.
|
|
177
|
+
|
|
104
178
|
### Dependency
|
|
105
179
|
|
|
106
180
|
To include OpenDataLoader PDF in your Maven project, add the dependency below to your `pom.xml` file.
|
|
107
181
|
|
|
108
|
-
Check for the latest version on [Maven Central](https://search.maven.org/artifact/
|
|
182
|
+
Check for the latest version on [Maven Central](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core).
|
|
109
183
|
|
|
110
184
|
```xml
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
185
|
+
<project>
|
|
186
|
+
<!-- other configurations... -->
|
|
187
|
+
|
|
188
|
+
<dependencies>
|
|
189
|
+
<dependency>
|
|
190
|
+
<groupId>org.opendataloader</groupId>
|
|
191
|
+
<artifactId>opendataloader-pdf-core</artifactId>
|
|
192
|
+
<version>0.0.15</version>
|
|
193
|
+
</dependency>
|
|
194
|
+
</dependencies>
|
|
116
195
|
|
|
117
196
|
<repositories>
|
|
118
197
|
<repository>
|
|
@@ -134,6 +213,9 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
134
213
|
<url>https://artifactory.openpreservation.org/artifactory/vera-dev</url>
|
|
135
214
|
</pluginRepository>
|
|
136
215
|
</pluginRepositories>
|
|
216
|
+
|
|
217
|
+
<!-- other configurations... -->
|
|
218
|
+
</project>
|
|
137
219
|
```
|
|
138
220
|
|
|
139
221
|
|
|
@@ -142,54 +224,22 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
142
224
|
To integrate the layout recognition API into Java code, follow the sample code below.
|
|
143
225
|
|
|
144
226
|
```java
|
|
145
|
-
import
|
|
146
|
-
import
|
|
227
|
+
import org.opendataloader.pdf.api.Config;
|
|
228
|
+
import org.opendataloader.pdf.api.OpenDataLoaderPDF;
|
|
147
229
|
|
|
148
230
|
import java.io.IOException;
|
|
149
231
|
|
|
150
232
|
public class Sample {
|
|
151
233
|
|
|
152
234
|
public static void main(String[] args) {
|
|
153
|
-
//create default config
|
|
154
235
|
Config config = new Config();
|
|
155
|
-
|
|
156
|
-
//set output folder relative to the input PDF
|
|
157
|
-
//if the output folder is not set, the current folder of the input PDF is used
|
|
158
|
-
config.setOutputFolder("output");
|
|
159
|
-
|
|
160
|
-
//generating pdf output file
|
|
236
|
+
config.setOutputFolder("path/to/output");
|
|
161
237
|
config.setGeneratePDF(true);
|
|
162
|
-
|
|
163
|
-
//set password of input pdf file
|
|
164
|
-
config.setPassword("password");
|
|
165
|
-
|
|
166
|
-
//generate markdown output file
|
|
167
238
|
config.setGenerateMarkdown(true);
|
|
168
|
-
|
|
169
|
-
//generate html output file
|
|
170
239
|
config.setGenerateHtml(true);
|
|
171
240
|
|
|
172
|
-
//enable html in markdown output file
|
|
173
|
-
config.setUseHTMLInMarkdown(true);
|
|
174
|
-
|
|
175
|
-
//add images to markdown output file
|
|
176
|
-
config.setAddImageToMarkdown(true);
|
|
177
|
-
|
|
178
|
-
//disable json output file
|
|
179
|
-
config.setGenerateJSON(false);
|
|
180
|
-
|
|
181
|
-
//keep line breaks
|
|
182
|
-
config.setKeepLineBreaks(true);
|
|
183
|
-
|
|
184
|
-
//find hidden text
|
|
185
|
-
config.setFindHiddenText(true);
|
|
186
|
-
|
|
187
|
-
//replace invalid chars with specified character
|
|
188
|
-
config.setReplaceInvalidChars("character");
|
|
189
|
-
|
|
190
241
|
try {
|
|
191
|
-
|
|
192
|
-
OpenDataLoaderPDF.processFile("input.pdf", config);
|
|
242
|
+
OpenDataLoaderPDF.processFile("path/to/document.pdf", config);
|
|
193
243
|
} catch (Exception exception) {
|
|
194
244
|
//exception during processing
|
|
195
245
|
}
|
|
@@ -199,7 +249,7 @@ public class Sample {
|
|
|
199
249
|
|
|
200
250
|
### API Documentation
|
|
201
251
|
|
|
202
|
-
The full API documentation is available at [javadoc](https://javadoc.io/doc/
|
|
252
|
+
The full API documentation is available at [javadoc](https://javadoc.io/doc/org.opendataloader/opendataloader-pdf-core/latest/).
|
|
203
253
|
|
|
204
254
|
<br/>
|
|
205
255
|
|
|
@@ -247,7 +297,7 @@ Additionally, annotated PDF with recognized structures, Markdown and Html are ge
|
|
|
247
297
|
By default all line breaks and hyphenation characters are removed, the Markdown does not include any images and does not use any HTML.
|
|
248
298
|
|
|
249
299
|
The option `--keep-line-breaks` preserves the original line breaks of the text content in JSON and Markdown output.
|
|
250
|
-
|
|
300
|
+
The option `--content-safety-off` disables one or more content safety filters. Accepts a comma-separated list of filter names.
|
|
251
301
|
The option `--markdown-with-html` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
|
|
252
302
|
The option `--markdown-with-images` enables inclusion of image references into the output Markdown.
|
|
253
303
|
The option `--replace-invalid-chars` replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character.
|
|
@@ -259,7 +309,7 @@ The images are extracted from PDF as individual files and stored in a subfolder
|
|
|
259
309
|
Options:
|
|
260
310
|
-o,--output-dir <arg> Specifies the output directory for generated files
|
|
261
311
|
--keep-line-breaks Preserves original line breaks in the extracted text
|
|
262
|
-
-
|
|
312
|
+
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page
|
|
263
313
|
--markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
|
|
264
314
|
--markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
|
|
265
315
|
--markdown Sets the data extraction output format to Markdown
|
|
Binary file
|