opendataloader-pdf 0.0.16__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- {opendataloader_pdf-0.0.16.dist-info → opendataloader_pdf-1.0.0.dist-info}/METADATA +36 -36
- {opendataloader_pdf-0.0.16.dist-info → opendataloader_pdf-1.0.0.dist-info}/RECORD +5 -5
- {opendataloader_pdf-0.0.16.dist-info → opendataloader_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-0.0.16.dist-info → opendataloader_pdf-1.0.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 0.0.16
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -95,8 +95,8 @@ pip install -U opendataloader-pdf
|
|
|
95
95
|
import opendataloader_pdf
|
|
96
96
|
|
|
97
97
|
opendataloader_pdf.run(
|
|
98
|
-
input_path="path
|
|
99
|
-
output_folder="path
|
|
98
|
+
input_path="path/to/document.pdf",
|
|
99
|
+
output_folder="path/to/output",
|
|
100
100
|
generate_markdown=True,
|
|
101
101
|
generate_html=True,
|
|
102
102
|
generate_annotated_pdf=True,
|
|
@@ -107,20 +107,20 @@ opendataloader_pdf.run(
|
|
|
107
107
|
|
|
108
108
|
The main function to process PDFs.
|
|
109
109
|
|
|
110
|
-
| Parameter | Type | Required | Default | Description
|
|
111
|
-
|--------------------------| ------ | --------
|
|
112
|
-
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder.
|
|
113
|
-
| `output_folder` | `str` | No | input folder | Path to the output folder.
|
|
114
|
-
| `password` | `str` | No | `None` | Password for the PDF file.
|
|
115
|
-
| `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000)
|
|
116
|
-
| `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
|
|
117
|
-
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file.
|
|
118
|
-
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file.
|
|
119
|
-
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file.
|
|
120
|
-
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output.
|
|
121
|
-
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output.
|
|
122
|
-
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output.
|
|
123
|
-
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution.
|
|
110
|
+
| Parameter | Type | Required | Default | Description |
|
|
111
|
+
|--------------------------| ------ | -------- |--------------|-------------------------------------------------------------------------------------------------------------------------------------------|
|
|
112
|
+
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
|
|
113
|
+
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
114
|
+
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
115
|
+
| `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
|
|
116
|
+
| `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
|
|
117
|
+
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
118
|
+
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
|
|
119
|
+
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
120
|
+
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
121
|
+
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
122
|
+
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
|
|
123
|
+
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
|
|
124
124
|
|
|
125
125
|
<br/>
|
|
126
126
|
|
|
@@ -148,8 +148,8 @@ import { run } from '@opendataloader/pdf';
|
|
|
148
148
|
|
|
149
149
|
async function main() {
|
|
150
150
|
try {
|
|
151
|
-
const output = await run('path
|
|
152
|
-
outputFolder: 'path
|
|
151
|
+
const output = await run('path/to/document.pdf', {
|
|
152
|
+
outputFolder: 'path/to/output',
|
|
153
153
|
generateMarkdown: true,
|
|
154
154
|
generateHtml: true,
|
|
155
155
|
generateAnnotatedPdf: true,
|
|
@@ -179,19 +179,19 @@ The main function to process PDFs.
|
|
|
179
179
|
|
|
180
180
|
**RunOptions**
|
|
181
181
|
|
|
182
|
-
| Property | Type | Default | Description
|
|
183
|
-
| ----------------------- | --------- | -------------
|
|
184
|
-
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input.
|
|
185
|
-
| `password` | `string` | `undefined` | Password for the PDF file.
|
|
186
|
-
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., , \u0000).
|
|
187
|
-
| `
|
|
188
|
-
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file.
|
|
189
|
-
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file.
|
|
190
|
-
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file.
|
|
191
|
-
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output.
|
|
192
|
-
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output.
|
|
193
|
-
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output.
|
|
194
|
-
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution.
|
|
182
|
+
| Property | Type | Default | Description |
|
|
183
|
+
| ----------------------- | --------- | ------------- |-------------------------------------------------------------------------------------------------------------------------------------------|
|
|
184
|
+
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
|
|
185
|
+
| `password` | `string` | `undefined` | Password for the PDF file. |
|
|
186
|
+
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000). |
|
|
187
|
+
| `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
|
|
188
|
+
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
|
|
189
|
+
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
|
|
190
|
+
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
|
|
191
|
+
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
|
|
192
|
+
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
|
|
193
|
+
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
|
|
194
|
+
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
|
|
195
195
|
|
|
196
196
|
<br/>
|
|
197
197
|
|
|
@@ -213,7 +213,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
213
213
|
<dependency>
|
|
214
214
|
<groupId>org.opendataloader</groupId>
|
|
215
215
|
<artifactId>opendataloader-pdf-core</artifactId>
|
|
216
|
-
<version>0.0.
|
|
216
|
+
<version>0.0.16</version>
|
|
217
217
|
</dependency>
|
|
218
218
|
</dependencies>
|
|
219
219
|
|
|
@@ -297,10 +297,10 @@ docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pd
|
|
|
297
297
|
|
|
298
298
|
### Build
|
|
299
299
|
|
|
300
|
-
Build and
|
|
300
|
+
Build and install using the following Maven command:
|
|
301
301
|
|
|
302
302
|
```sh
|
|
303
|
-
mvn clean
|
|
303
|
+
mvn clean install -f java/pom.xml
|
|
304
304
|
```
|
|
305
305
|
|
|
306
306
|
If the build is successful, the resulting `jar` file will be created in the path below.
|
|
@@ -333,7 +333,7 @@ The images are extracted from PDF as individual files and stored in a subfolder
|
|
|
333
333
|
Options:
|
|
334
334
|
-o,--output-dir <arg> Specifies the output directory for generated files
|
|
335
335
|
--keep-line-breaks Preserves original line breaks in the extracted text
|
|
336
|
-
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page
|
|
336
|
+
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny
|
|
337
337
|
--markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
|
|
338
338
|
--markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
|
|
339
339
|
--markdown Sets the data extraction output format to Markdown
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-0.0.
|
|
18
|
-
opendataloader_pdf-0.0.
|
|
19
|
-
opendataloader_pdf-0.0.
|
|
20
|
-
opendataloader_pdf-0.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=YpRYRxfbUO2W_oPYigKziGqX1awk9xNWsGHfN_XS_tw,20470354
|
|
17
|
+
opendataloader_pdf-1.0.0.dist-info/METADATA,sha256=IfUtznw8ufhoJXi1j38sbtYHZhIJebF_940cOhbsjqo,24527
|
|
18
|
+
opendataloader_pdf-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-1.0.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|