opendataloader-pdf 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +6 -6
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.10.dist-info}/METADATA +22 -14
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.10.dist-info}/RECORD +6 -6
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.10.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.10.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -12,8 +12,8 @@ def run(
|
|
|
12
12
|
input_path: str,
|
|
13
13
|
output_folder: str = None,
|
|
14
14
|
password: str = None,
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
generate_markdown: bool = False,
|
|
16
|
+
generate_annotated_pdf: bool = False,
|
|
17
17
|
keep_line_breaks: bool = False,
|
|
18
18
|
find_hidden_text: bool = False,
|
|
19
19
|
html_in_markdown: bool = False,
|
|
@@ -27,8 +27,8 @@ def run(
|
|
|
27
27
|
input_path: Path to the input PDF file or folder.
|
|
28
28
|
output_folder: Path to the output folder. Defaults to the input folder.
|
|
29
29
|
password: Password for the PDF file.
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
generate_markdown: If True, generates a Markdown output file.
|
|
31
|
+
generate_annotated_pdf: If True, generates an annotated PDF output file.
|
|
32
32
|
keep_line_breaks: If True, keeps line breaks in the output.
|
|
33
33
|
find_hidden_text: If True, finds hidden text in the PDF.
|
|
34
34
|
html_in_markdown: If True, uses HTML in the Markdown output.
|
|
@@ -50,9 +50,9 @@ def run(
|
|
|
50
50
|
args.extend(["--folder", output_folder])
|
|
51
51
|
if password:
|
|
52
52
|
args.extend(["--password", password])
|
|
53
|
-
if
|
|
53
|
+
if generate_markdown:
|
|
54
54
|
args.append("--markdown")
|
|
55
|
-
if
|
|
55
|
+
if generate_annotated_pdf:
|
|
56
56
|
args.append("--pdf")
|
|
57
57
|
if keep_line_breaks:
|
|
58
58
|
args.append("--keeplinebreaks")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 0.0.9
|
|
3
|
+
Version: 0.0.10
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -32,7 +32,10 @@ Dynamic: summary
|
|
|
32
32
|
|
|
33
33
|
<br/>
|
|
34
34
|
|
|
35
|
-
|
|
35
|
+
**Safe, Open, High-Performance — OpenDataLoader PDF for AI**
|
|
36
|
+
|
|
37
|
+
OpenDataLoader-PDF converts PDFs into JSON, Markdown or HTML — ready to feed into modern AI stacks (LLMs, vector search, and RAG).
|
|
38
|
+
|
|
36
39
|
It reconstructs document layout (headings, lists, tables, and reading order) so the content is easier to chunk, index, and query.
|
|
37
40
|
Powered by fast, heuristic, rule-based inference, it runs entirely on your local machine and delivers high-throughput processing for large document sets.
|
|
38
41
|
AI-safety is enabled by default and automatically filters likely prompt-injection content embedded in PDFs to reduce downstream risk.
|
|
@@ -41,13 +44,13 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
|
|
|
41
44
|
|
|
42
45
|
## 🌟 Key Features
|
|
43
46
|
|
|
44
|
-
- 🧾 **Rich,
|
|
45
|
-
- 🧩 **Layout
|
|
46
|
-
- 🔒 **Local-
|
|
47
|
-
- ⚡ **Fast &
|
|
48
|
-
- 🛡️ **AI-
|
|
49
|
-
- 🆓 **Open
|
|
50
|
-
- 🖍️ **Annotated PDF
|
|
47
|
+
- 🧾 **Rich, Structured Output** — JSON, Markdown or HTML
|
|
48
|
+
- 🧩 **Layout Reconstruction** — Headings, Lists, Tables, Images, Reading Order
|
|
49
|
+
- 🔒 **Local-First Privacy** — Runs fully on your machine
|
|
50
|
+
- ⚡ **Fast & Lightweight** — Rule-Based Heuristic, High-Throughput, No GPU
|
|
51
|
+
- 🛡️ **AI-Safety** — Auto-Filters likely prompt-injection content
|
|
52
|
+
- 🆓 **Open-Source** — Free for commercial use
|
|
53
|
+
- 🖍️ **Annotated PDF Visualization** — See detected structures overlaid on the original
|
|
51
54
|
|
|
52
55
|

|
|
53
56
|
|
|
@@ -88,7 +91,8 @@ import opendataloader_pdf
|
|
|
88
91
|
opendataloader_pdf.run(
|
|
89
92
|
input_path="path/to/document.pdf",
|
|
90
93
|
output_folder="path/to/output",
|
|
91
|
-
|
|
94
|
+
generate_markdown=True,
|
|
95
|
+
generate_annotated_pdf=True
|
|
92
96
|
)
|
|
93
97
|
```
|
|
94
98
|
|
|
@@ -101,8 +105,8 @@ The main function to process PDFs.
|
|
|
101
105
|
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
|
|
102
106
|
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
103
107
|
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
104
|
-
| `
|
|
105
|
-
| `
|
|
108
|
+
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
109
|
+
| `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
106
110
|
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
107
111
|
| `find_hidden_text` | `bool` | No | `False` | If `True`, finds hidden text in the PDF. |
|
|
108
112
|
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
@@ -205,11 +209,15 @@ public class Sample {
|
|
|
205
209
|
|
|
206
210
|
## Docker
|
|
207
211
|
|
|
212
|
+
Download sample PDF
|
|
213
|
+
|
|
208
214
|
```sh
|
|
209
|
-
# Download sample PDF
|
|
210
215
|
curl -L -o 1901.03003.pdf https://arxiv.org/pdf/1901.03003
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Run opendataloader-pdf in Docker container
|
|
211
219
|
|
|
212
|
-
|
|
220
|
+
```sh
|
|
213
221
|
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --pdf
|
|
214
222
|
```
|
|
215
223
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/wrapper.py,sha256=
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=DGwzBVy1DyNxUFPLxi8Mzwb68u3fo0k0B5YEBufy0vI,4518
|
|
5
5
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
7
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-0.0.
|
|
18
|
-
opendataloader_pdf-0.0.
|
|
19
|
-
opendataloader_pdf-0.0.
|
|
20
|
-
opendataloader_pdf-0.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=Qp9qnNbptrsdrL2UJn8bw-WStRqfI9EGYd883EtDZfE,22114700
|
|
17
|
+
opendataloader_pdf-0.0.10.dist-info/METADATA,sha256=ESHbbQmEr8L5VqKwpll0u7h4EiiG7YTnYiD10p7Z7h0,17626
|
|
18
|
+
opendataloader_pdf-0.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-0.0.10.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-0.0.10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|