opendataloader-pdf 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opendataloader_pdf/LICENSE +362 -0
- opendataloader_pdf/NOTICE.md +15 -0
- opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md +57 -0
- opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md +258 -0
- opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt +202 -0
- opendataloader_pdf/THIRD_PARTY/licenses/BSD-3-Clause.txt +30 -0
- opendataloader_pdf/THIRD_PARTY/licenses/CDDL-1.1.txt +352 -0
- opendataloader_pdf/THIRD_PARTY/licenses/EDL-1.0.txt +31 -0
- opendataloader_pdf/THIRD_PARTY/licenses/EPL-2.0.txt +267 -0
- opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt +28 -0
- opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt +21 -0
- opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt +408 -0
- opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt +37 -0
- opendataloader_pdf/__init__.py +3 -0
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +126 -0
- opendataloader_pdf-0.0.0.dist-info/METADATA +91 -0
- opendataloader_pdf-0.0.0.dist-info/RECORD +20 -0
- opendataloader_pdf-0.0.0.dist-info/WHEEL +5 -0
- opendataloader_pdf-0.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opendataloader-pdf
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
|
+
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
|
+
Author: opendataloader-project
|
|
7
|
+
Author-email: open.dataloader@hancom.com
|
|
8
|
+
License: MPL-2.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.7
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: author-email
|
|
15
|
+
Dynamic: classifier
|
|
16
|
+
Dynamic: description
|
|
17
|
+
Dynamic: description-content-type
|
|
18
|
+
Dynamic: home-page
|
|
19
|
+
Dynamic: license
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
22
|
+
|
|
23
|
+
# Opendataloader PDF Python Wrapper
|
|
24
|
+
|
|
25
|
+
This package is a Python wrapper for the `opendataloader-pdf` Java command-line tool.
|
|
26
|
+
|
|
27
|
+
It allows you to process PDF files and convert them to JSON or Markdown format directly from Python.
|
|
28
|
+
|
|
29
|
+
## Prerequisites
|
|
30
|
+
|
|
31
|
+
- Java 11 or higher must be installed and available in your system's PATH.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install opendataloader-pdf
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
Here is a basic example of how to use the wrapper:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import opendataloader_pdf
|
|
45
|
+
import json
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
# Process a PDF and get the structured JSON output from stdout
|
|
49
|
+
json_output_str = opendataloader_pdf.run(input_path="path/to/your/document.pdf")
|
|
50
|
+
|
|
51
|
+
# The output is a JSON string, so you can parse it
|
|
52
|
+
if json_output_str:
|
|
53
|
+
data = json.loads(json_output_str)
|
|
54
|
+
print(f"Successfully processed {data['file name']}.")
|
|
55
|
+
|
|
56
|
+
# Example: Generate Markdown and annotated PDF in a specific folder
|
|
57
|
+
opendataloader_pdf.run(
|
|
58
|
+
input_path="path/to/your/document.pdf",
|
|
59
|
+
output_folder="path/to/output",
|
|
60
|
+
generate_markdown=True,
|
|
61
|
+
generate_pdf=True,
|
|
62
|
+
keep_line_breaks=True
|
|
63
|
+
)
|
|
64
|
+
print("Successfully generated Markdown and annotated PDF.")
|
|
65
|
+
|
|
66
|
+
except FileNotFoundError:
|
|
67
|
+
print("Error: Input file not found or Java is not installed correctly.")
|
|
68
|
+
except Exception as e:
|
|
69
|
+
print(f"An error occurred: {e}")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Function: `run()`
|
|
73
|
+
|
|
74
|
+
The main function to process PDFs.
|
|
75
|
+
|
|
76
|
+
### Parameters
|
|
77
|
+
|
|
78
|
+
- `input_path` (str): Path to the input PDF file or folder. **(Required)**
|
|
79
|
+
- `output_folder` (str, optional): Path to the output folder. Defaults to the input folder.
|
|
80
|
+
- `password` (str, optional): Password for the PDF file.
|
|
81
|
+
- `generate_markdown` (bool, optional): If `True`, generates a Markdown output file. Defaults to `False`.
|
|
82
|
+
- `generate_pdf` (bool, optional): If `True`, generates an annotated PDF output file. Defaults to `False`.
|
|
83
|
+
- `keep_line_breaks` (bool, optional): If `True`, keeps line breaks in the output. Defaults to `False`.
|
|
84
|
+
- `find_hidden_text` (bool, optional): If `True`, finds hidden text in the PDF. Defaults to `False`.
|
|
85
|
+
- `html_in_markdown` (bool, optional): If `True`, uses HTML in the Markdown output. Defaults to `False`.
|
|
86
|
+
- `add_image_to_markdown` (bool, optional): If `True`, adds images to the Markdown output. Defaults to `False`.
|
|
87
|
+
- `debug` (bool, optional): If `True`, prints all messages from the CLI to the console during execution. Defaults to `False`.
|
|
88
|
+
|
|
89
|
+
### Returns
|
|
90
|
+
|
|
91
|
+
- `str`: The standard output from the CLI tool. When a single file is processed and no other output format is requested, this is the JSON content. When `debug=True`, it is the combined stdout and stderr of the CLI tool.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
|
+
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
|
+
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=ahbxo7YYunsNt3w66eXn_oa0XKy1LLfoA5GQcSuMvvM,4407
|
|
5
|
+
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
|
+
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
|
+
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
8
|
+
opendataloader_pdf/THIRD_PARTY/licenses/BSD-3-Clause.txt,sha256=rlv4jgrZMCB6mUuhQk3Osruj4Ug7N5jLP3iPEPine68,1542
|
|
9
|
+
opendataloader_pdf/THIRD_PARTY/licenses/CDDL-1.1.txt,sha256=O_1Dq0kHvqPR75IYLVl4YcLEFrKk7w6Np9sVhGTdAng,18340
|
|
10
|
+
opendataloader_pdf/THIRD_PARTY/licenses/EDL-1.0.txt,sha256=4_5e3V-pvs4tqytV1G9ayEIQkmSRz3XhUJ_DOUKv_-o,1620
|
|
11
|
+
opendataloader_pdf/THIRD_PARTY/licenses/EPL-2.0.txt,sha256=MIGRPWmScojG3ufiUpg810F5dq8tAVI7uymuRMlDto4,14437
|
|
12
|
+
opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJGibiZbR_QhJi2vURV-QfUCvNrI-5s,1785
|
|
13
|
+
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
|
+
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
|
+
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=jUVcuDXxNBFweOQbusLebCX1AZ3l3ssns4nJtSWsRTI,22116373
|
|
17
|
+
opendataloader_pdf-0.0.0.dist-info/METADATA,sha256=y-T7UA9NodfcETSd2tX5FJdXMNArmpgXZG22wINIHRo,3382
|
|
18
|
+
opendataloader_pdf-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-0.0.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-0.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
opendataloader_pdf
|