opendataloader-pdf 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +9 -2
- {opendataloader_pdf-1.0.3.dist-info → opendataloader_pdf-1.0.4.dist-info}/METADATA +19 -5
- {opendataloader_pdf-1.0.3.dist-info → opendataloader_pdf-1.0.4.dist-info}/RECORD +6 -6
- {opendataloader_pdf-1.0.3.dist-info → opendataloader_pdf-1.0.4.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-1.0.3.dist-info → opendataloader_pdf-1.0.4.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -33,6 +33,7 @@ def run(
|
|
|
33
33
|
content_safety_off: str = None,
|
|
34
34
|
html_in_markdown: bool = False,
|
|
35
35
|
add_image_to_markdown: bool = False,
|
|
36
|
+
no_json: bool = False,
|
|
36
37
|
debug: bool = False,
|
|
37
38
|
):
|
|
38
39
|
"""
|
|
@@ -49,6 +50,7 @@ def run(
|
|
|
49
50
|
keep_line_breaks: If True, keeps line breaks in the output.
|
|
50
51
|
html_in_markdown: If True, uses HTML in the Markdown output.
|
|
51
52
|
add_image_to_markdown: If True, adds images to the Markdown output.
|
|
53
|
+
no_json: If True, disables the JSON output.
|
|
52
54
|
debug: If True, prints all messages from the CLI to the console during execution.
|
|
53
55
|
|
|
54
56
|
Returns:
|
|
@@ -82,6 +84,8 @@ def run(
|
|
|
82
84
|
args.append("--markdown-with-html")
|
|
83
85
|
if add_image_to_markdown:
|
|
84
86
|
args.append("--markdown-with-images")
|
|
87
|
+
if no_json:
|
|
88
|
+
args.append("--no-json")
|
|
85
89
|
|
|
86
90
|
args.append(input_path)
|
|
87
91
|
|
|
@@ -94,7 +98,10 @@ def run(
|
|
|
94
98
|
command = ["java", "-jar", str(jar_path)] + args
|
|
95
99
|
|
|
96
100
|
if debug:
|
|
97
|
-
print(
|
|
101
|
+
print(
|
|
102
|
+
f"Running command: {_get_redacted_command_string(command)}",
|
|
103
|
+
file=sys.stderr,
|
|
104
|
+
)
|
|
98
105
|
process = subprocess.Popen(
|
|
99
106
|
command,
|
|
100
107
|
stdout=subprocess.PIPE,
|
|
@@ -144,4 +151,4 @@ def run(
|
|
|
144
151
|
print(f"Stderr: {e.stderr}", file=sys.stderr)
|
|
145
152
|
if e.stdout:
|
|
146
153
|
print(f"Stdout: {e.stdout}", file=sys.stderr)
|
|
147
|
-
raise e
|
|
154
|
+
raise e
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -62,13 +62,24 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
|
|
|
62
62
|
|
|
63
63
|
## 🚀 Upcoming Features
|
|
64
64
|
|
|
65
|
-
|
|
66
|
-
-
|
|
67
|
-
- ⚡ **Performance
|
|
68
|
-
-
|
|
65
|
+
**Scheduled for November**
|
|
66
|
+
- 📄 **Tagged PDF** — Develop advanced data extraction technology based on Tagged PDF.
|
|
67
|
+
- ⚡ **Performance Improvement** — Enhance the inference skill for greater accuracy and speed.
|
|
68
|
+
- 📊 **Benchmarks & Datasets** — Publish transparent evaluations using open datasets and standardized metrics.
|
|
69
|
+
- 🎯 **Metrics** — Publish the calculation methods to transparently share benchmark results.
|
|
69
70
|
|
|
70
71
|
<br/>
|
|
71
72
|
|
|
73
|
+
**Scheduled for December**
|
|
74
|
+
- 🖨️ **OCR for scanned PDFs** — Extract data from image-only pages.
|
|
75
|
+
- 🧠 **Table AI option** — Higher accuracy for tables with borderless or merged cells.
|
|
76
|
+
<br/>
|
|
77
|
+
|
|
78
|
+
**Scheduled for 2026**
|
|
79
|
+
- 🛡️ **AI Red Teaming** — Transparent adversarial benchmarks with datasets and metrics, reported regularly.
|
|
80
|
+
|
|
81
|
+
<br/>
|
|
82
|
+
|
|
72
83
|
## Prerequisites
|
|
73
84
|
|
|
74
85
|
- Java 11 or higher must be installed and available in your system's PATH.
|
|
@@ -118,6 +129,7 @@ The main function to process PDFs.
|
|
|
118
129
|
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
119
130
|
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
120
131
|
| `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
|
|
132
|
+
| `no_json` | `bool` | No | `False` | If `True`, disables the JSON output. |
|
|
121
133
|
| `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
|
|
122
134
|
|
|
123
135
|
<br/>
|
|
@@ -189,6 +201,7 @@ The main function to process PDFs.
|
|
|
189
201
|
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
|
|
190
202
|
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
|
|
191
203
|
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
|
|
204
|
+
| `noJson` | `boolean` | `false` | If `true`, disables the JSON output. |
|
|
192
205
|
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
|
|
193
206
|
|
|
194
207
|
<br/>
|
|
@@ -336,6 +349,7 @@ Options:
|
|
|
336
349
|
--markdown-with-images Sets the data extraction output format to Markdown, extracts images from the PDF, and includes them as links
|
|
337
350
|
--markdown Sets the data extraction output format to Markdown
|
|
338
351
|
--html Sets the data extraction output format to HTML
|
|
352
|
+
--no-json Disables the JSON output format
|
|
339
353
|
-p,--password <arg> Specifies the password for an encrypted PDF
|
|
340
354
|
--pdf Generates a new PDF file where the extracted layout data is visualized as annotations
|
|
341
355
|
--replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/wrapper.py,sha256=
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=57qjWXTtLzkqg9S8Pg-9vx_8CNQ6_GUABqrQnX6CHGk,5624
|
|
5
5
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
7
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-1.0.
|
|
18
|
-
opendataloader_pdf-1.0.
|
|
19
|
-
opendataloader_pdf-1.0.
|
|
20
|
-
opendataloader_pdf-1.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=JUJ5g4-Y9aYW0PayVf9M9-2LBl1PjuUl3zGmV6Yykmo,20477542
|
|
17
|
+
opendataloader_pdf-1.0.4.dist-info/METADATA,sha256=TrFzqF0E1VoXuxfBxf8f42SLToB8EmbAkK2jNfAdjGM,25452
|
|
18
|
+
opendataloader_pdf-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-1.0.4.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-1.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|