opendataloader-pdf 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +10 -6
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.11.dist-info}/METADATA +36 -20
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.11.dist-info}/RECORD +6 -6
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.11.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-0.0.9.dist-info → opendataloader_pdf-0.0.11.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -12,8 +12,9 @@ def run(
|
|
|
12
12
|
input_path: str,
|
|
13
13
|
output_folder: str = None,
|
|
14
14
|
password: str = None,
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
generate_markdown: bool = False,
|
|
16
|
+
generate_html: bool = False,
|
|
17
|
+
generate_annotated_pdf: bool = False,
|
|
17
18
|
keep_line_breaks: bool = False,
|
|
18
19
|
find_hidden_text: bool = False,
|
|
19
20
|
html_in_markdown: bool = False,
|
|
@@ -27,8 +28,9 @@ def run(
|
|
|
27
28
|
input_path: Path to the input PDF file or folder.
|
|
28
29
|
output_folder: Path to the output folder. Defaults to the input folder.
|
|
29
30
|
password: Password for the PDF file.
|
|
30
|
-
|
|
31
|
-
|
|
31
|
+
generate_markdown: If True, generates a Markdown output file.
|
|
32
|
+
generate_html: If True, generates an HTML output file.
|
|
33
|
+
generate_annotated_pdf: If True, generates an annotated PDF output file.
|
|
32
34
|
keep_line_breaks: If True, keeps line breaks in the output.
|
|
33
35
|
find_hidden_text: If True, finds hidden text in the PDF.
|
|
34
36
|
html_in_markdown: If True, uses HTML in the Markdown output.
|
|
@@ -50,9 +52,11 @@ def run(
|
|
|
50
52
|
args.extend(["--folder", output_folder])
|
|
51
53
|
if password:
|
|
52
54
|
args.extend(["--password", password])
|
|
53
|
-
if
|
|
55
|
+
if generate_markdown:
|
|
54
56
|
args.append("--markdown")
|
|
55
|
-
if
|
|
57
|
+
if generate_html:
|
|
58
|
+
args.append("--html")
|
|
59
|
+
if generate_annotated_pdf:
|
|
56
60
|
args.append("--pdf")
|
|
57
61
|
if keep_line_breaks:
|
|
58
62
|
args.append("--keeplinebreaks")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 0.0.9
|
|
3
|
+
Version: 0.0.11
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -24,15 +24,20 @@ Dynamic: summary
|
|
|
24
24
|
|
|
25
25
|

|
|
26
26
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
27
|
+

|
|
28
|
+

|
|
27
29
|
[](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core)
|
|
28
30
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
29
|
-
[](https://pypi.org/project/opendataloader-pdf/)
|
|
30
31
|
[](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
|
|
32
|
+
[](https://app.codecov.io/gh/opendataloader-project/opendataloader-pdf)
|
|
31
33
|
[](https://cla-assistant.io/opendataloader-project/opendataloader-pdf)
|
|
32
34
|
|
|
33
35
|
<br/>
|
|
34
36
|
|
|
35
|
-
|
|
37
|
+
**Safe, Open, High-Performance — OpenDataLoader PDF for AI**
|
|
38
|
+
|
|
39
|
+
OpenDataLoader-PDF converts PDFs into JSON, Markdown, or HTML — ready to feed into modern AI stacks (LLMs, vector search, and RAG).
|
|
40
|
+
|
|
36
41
|
It reconstructs document layout (headings, lists, tables, and reading order) so the content is easier to chunk, index, and query.
|
|
37
42
|
Powered by fast, heuristic, rule-based inference, it runs entirely on your local machine and delivers high-throughput processing for large document sets.
|
|
38
43
|
AI-safety is enabled by default and automatically filters likely prompt-injection content embedded in PDFs to reduce downstream risk.
|
|
@@ -41,13 +46,13 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
|
|
|
41
46
|
|
|
42
47
|
## 🌟 Key Features
|
|
43
48
|
|
|
44
|
-
- 🧾 **Rich,
|
|
45
|
-
- 🧩 **Layout
|
|
46
|
-
- 🔒 **Local-
|
|
47
|
-
- ⚡ **Fast &
|
|
48
|
-
- 🛡️ **AI-
|
|
49
|
-
- 🆓 **Open
|
|
50
|
-
- 🖍️ **Annotated PDF
|
|
49
|
+
- 🧾 **Rich, Structured Output** — JSON, Markdown, or HTML
|
|
50
|
+
- 🧩 **Layout Reconstruction** — Headings, Lists, Tables, Images, Reading Order
|
|
51
|
+
- 🔒 **Local-First Privacy** — Runs fully on your machine
|
|
52
|
+
- ⚡ **Fast & Lightweight** — Rule-Based Heuristic, High-Throughput, No GPU
|
|
53
|
+
- 🛡️ **AI-Safety** — Auto-Filters likely prompt-injection content
|
|
54
|
+
- 🆓 **Open-Source** — Free for commercial use
|
|
55
|
+
- 🖍️ **Annotated PDF Visualization** — See detected structures overlaid on the original
|
|
51
56
|
|
|
52
57
|

|
|
53
58
|
|
|
@@ -88,7 +93,9 @@ import opendataloader_pdf
|
|
|
88
93
|
opendataloader_pdf.run(
|
|
89
94
|
input_path="path/to/document.pdf",
|
|
90
95
|
output_folder="path/to/output",
|
|
91
|
-
|
|
96
|
+
generate_markdown=True,
|
|
97
|
+
generate_html=True,
|
|
98
|
+
generate_annotated_pdf=True,
|
|
92
99
|
)
|
|
93
100
|
```
|
|
94
101
|
|
|
@@ -101,8 +108,9 @@ The main function to process PDFs.
|
|
|
101
108
|
| `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
|
|
102
109
|
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
103
110
|
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
104
|
-
| `
|
|
105
|
-
| `
|
|
111
|
+
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
112
|
+
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
|
|
113
|
+
| `generate_annotated_pdf`| `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
106
114
|
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
107
115
|
| `find_hidden_text` | `bool` | No | `False` | If `True`, finds hidden text in the PDF. |
|
|
108
116
|
| `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
|
|
@@ -176,6 +184,9 @@ public class Sample {
|
|
|
176
184
|
//generate markdown output file
|
|
177
185
|
config.setGenerateMarkdown(true);
|
|
178
186
|
|
|
187
|
+
//generate html output file
|
|
188
|
+
config.setGenerateHtml(true);
|
|
189
|
+
|
|
179
190
|
//enable html in markdown output file
|
|
180
191
|
config.setUseHTMLInMarkdown(true);
|
|
181
192
|
|
|
@@ -205,12 +216,16 @@ public class Sample {
|
|
|
205
216
|
|
|
206
217
|
## Docker
|
|
207
218
|
|
|
219
|
+
Download sample PDF
|
|
220
|
+
|
|
208
221
|
```sh
|
|
209
|
-
# Download sample PDF
|
|
210
222
|
curl -L -o 1901.03003.pdf https://arxiv.org/pdf/1901.03003
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Run opendataloader-pdf in Docker container
|
|
211
226
|
|
|
212
|
-
|
|
213
|
-
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --pdf
|
|
227
|
+
```
|
|
228
|
+
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --html --pdf
|
|
214
229
|
```
|
|
215
230
|
|
|
216
231
|
<br/>
|
|
@@ -234,17 +249,17 @@ java/opendataloader-pdf-cli/target
|
|
|
234
249
|
### CLI usage
|
|
235
250
|
|
|
236
251
|
```sh
|
|
237
|
-
java -jar
|
|
252
|
+
java -jar opendataloader-pdf-cli-<VERSION>.jar [options] <INPUT FILE OR FOLDER>
|
|
238
253
|
```
|
|
239
254
|
|
|
240
255
|
This generates a JSON file with layout recognition results in the specified output folder.
|
|
241
|
-
Additionally, annotated PDF with recognized structures and
|
|
256
|
+
Additionally, an annotated PDF with recognized structures, Markdown, and HTML are generated if the options `--pdf`, `--markdown`, and `--html` are specified.
|
|
242
257
|
|
|
243
258
|
By default all line breaks and hyphenation characters are removed, the Markdown does not include any images and does not use any HTML.
|
|
244
259
|
|
|
245
260
|
The option `--keeplinebreaks` preserves the original line breaks of the text content in the JSON and Markdown output.
|
|
246
261
|
|
|
247
|
-
The option `--
|
|
262
|
+
The option `--htmlinmarkdown` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
|
|
248
263
|
The option `--addimagetomarkdown` enables inclusion of image references into the output Markdown.
|
|
249
264
|
The images are extracted from PDF as individual files and stored in a subfolder next to the Markdown output.
|
|
250
265
|
|
|
@@ -255,9 +270,10 @@ Options:
|
|
|
255
270
|
-f,--folder <arg> Specify output folder (default the folder of the input PDF)
|
|
256
271
|
-klb,--keeplinebreaks Keep line breaks
|
|
257
272
|
-ht,--findhiddentext Find hidden text
|
|
258
|
-
-
|
|
273
|
+
-htmlmd,--htmlinmarkdown Use html in markdown
|
|
259
274
|
-im,--addimagetomarkdown Add images to markdown
|
|
260
275
|
-markdown,--markdown Generates markdown output
|
|
276
|
+
-html,--html Generates html output
|
|
261
277
|
-p,--password <arg> Specifies password
|
|
262
278
|
-pdf,--pdf Generates pdf output
|
|
263
279
|
```
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/wrapper.py,sha256=
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=YuCPVrqZdoA6kg-_MiXYo9KvIkmRIY_QxDqem8Sd8V0,4666
|
|
5
5
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
7
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-0.0.
|
|
18
|
-
opendataloader_pdf-0.0.
|
|
19
|
-
opendataloader_pdf-0.0.
|
|
20
|
-
opendataloader_pdf-0.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=sM8mynVYq8p4gqQwXxI3sBf-jQukTtFp_0ts1VAMSpM,22122871
|
|
17
|
+
opendataloader_pdf-0.0.11.dist-info/METADATA,sha256=fKVr3VPqq2vmV6RKE2WvztIbAFxs-g6fAvHMO15QQ1k,18128
|
|
18
|
+
opendataloader_pdf-0.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-0.0.11.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-0.0.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|