opendataloader-pdf 1.0.5__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/__main__.py +5 -0
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +82 -0
- {opendataloader_pdf-1.0.5.dist-info → opendataloader_pdf-1.0.6.dist-info}/METADATA +26 -1
- {opendataloader_pdf-1.0.5.dist-info → opendataloader_pdf-1.0.6.dist-info}/RECORD +8 -6
- opendataloader_pdf-1.0.6.dist-info/entry_points.txt +2 -0
- {opendataloader_pdf-1.0.5.dist-info → opendataloader_pdf-1.0.6.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-1.0.5.dist-info → opendataloader_pdf-1.0.6.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import argparse
|
|
1
2
|
import subprocess
|
|
2
3
|
import sys
|
|
3
4
|
import importlib.resources as importlib_resources
|
|
@@ -135,3 +136,84 @@ def run(
|
|
|
135
136
|
if e.stdout:
|
|
136
137
|
print(f"Stdout: {e.stdout}", file=sys.stderr)
|
|
137
138
|
raise e
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def main(argv=None) -> int:
    """CLI entry point for running the wrapper from the command line.

    Parses the command-line options and forwards them, unchanged, as keyword
    arguments to ``run()``; each option's ``dest`` name is therefore the
    keyword that ``run()`` receives.

    Args:
        argv: Argument list to parse (without the program name). When
            ``None``, argparse falls back to ``sys.argv[1:]``.

    Returns:
        Process exit status: ``0`` on success, ``1`` when ``run()`` raises
        ``FileNotFoundError``, or the failing subprocess's return code
        (``1`` if that code is zero or unset) when ``run()`` raises
        ``subprocess.CalledProcessError``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Run the opendataloader-pdf CLI using the bundled JAR."
    )
    parser.add_argument("input_path", help="Path to the input PDF file or directory.")
    parser.add_argument(
        "-o",
        "--output-dir",
        dest="output_folder",
        help="Directory where outputs are written.",
    )
    parser.add_argument("-p", "--password", help="Password for encrypted PDFs.")
    parser.add_argument(
        "--replace-invalid-chars",
        help="Replacement character for invalid or unrecognized characters.",
    )
    parser.add_argument(
        "--content-safety-off",
        help="Disable content safety filtering (expects the desired mode).",
    )
    parser.add_argument(
        "--markdown",
        dest="generate_markdown",
        action="store_true",
        help="Generate Markdown output.",
    )
    parser.add_argument(
        "--html",
        dest="generate_html",
        action="store_true",
        help="Generate HTML output.",
    )
    parser.add_argument(
        "--pdf",
        dest="generate_annotated_pdf",
        action="store_true",
        help="Generate annotated PDF output.",
    )
    parser.add_argument(
        "--keep-line-breaks",
        action="store_true",
        help="Preserve line breaks in text output.",
    )
    parser.add_argument(
        "--markdown-with-html",
        dest="html_in_markdown",
        action="store_true",
        help="Allow raw HTML within Markdown output.",
    )
    parser.add_argument(
        "--markdown-with-images",
        dest="add_image_to_markdown",
        action="store_true",
        help="Embed images in Markdown output.",
    )
    parser.add_argument(
        "--no-json",
        action="store_true",
        help="Disable JSON output generation.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Stream CLI logs directly to stdout.",
    )
    args = parser.parse_args(argv)

    try:
        run(**vars(args))
    except FileNotFoundError as err:
        print(err, file=sys.stderr)
        return 1
    except subprocess.CalledProcessError as err:
        # Propagate the child's exit status; never report success on failure.
        return err.returncode or 1
    # Explicit success status so the declared ``-> int`` contract holds for
    # callers that inspect the return value instead of passing it to sys.exit.
    return 0
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
if __name__ == "__main__":
    # Script entry: hand main()'s result to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.6
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -109,9 +109,16 @@ opendataloader_pdf.run(
|
|
|
109
109
|
generate_markdown=True,
|
|
110
110
|
generate_html=True,
|
|
111
111
|
generate_annotated_pdf=True,
|
|
112
|
+
debug=True,
|
|
112
113
|
)
|
|
113
114
|
```
|
|
114
115
|
|
|
116
|
+
- If you want to run it via CLI, you can use the following command:
|
|
117
|
+
|
|
118
|
+
```sh
|
|
119
|
+
opendataloader-pdf path/to/document.pdf --markdown --html --pdf
|
|
120
|
+
```
|
|
121
|
+
|
|
115
122
|
### Function: run()
|
|
116
123
|
|
|
117
124
|
The main function to process PDFs.
|
|
@@ -174,6 +181,24 @@ async function main() {
|
|
|
174
181
|
main();
|
|
175
182
|
```
|
|
176
183
|
|
|
184
|
+
If you want to run it via CLI, you can use the following command:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
npx @opendataloader/pdf path/to/document.pdf -o path/to/output --markdown --html --pdf
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
or you can install it globally:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
npm install -g @opendataloader/pdf
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
then run:
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
opendataloader-pdf path/to/document.pdf -o path/to/output --markdown --html --pdf
|
|
200
|
+
```
|
|
201
|
+
|
|
177
202
|
### Function: run()
|
|
178
203
|
|
|
179
204
|
`run(inputPath: string, options?: RunOptions): Promise<string>`
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/
|
|
4
|
+
opendataloader_pdf/__main__.py,sha256=lmla4yz3SaYBfRJXOXnwO_8ID31-Ja20aQmomiz1eEc,84
|
|
5
|
+
opendataloader_pdf/wrapper.py,sha256=Dsvw5un_HROLcy2xX0WqoKKRnOjL081LEYC6YfpViLE,7331
|
|
5
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
7
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
8
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +14,9 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
15
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
16
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-1.0.
|
|
18
|
-
opendataloader_pdf-1.0.
|
|
19
|
-
opendataloader_pdf-1.0.
|
|
20
|
-
opendataloader_pdf-1.0.
|
|
17
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=HmcxP25ZCOJNRV9U1IXy-beAN243_iCWTBIV6JB-6S8,20477911
|
|
18
|
+
opendataloader_pdf-1.0.6.dist-info/METADATA,sha256=2BWSSScAW3mmpWum3N7g-01fMZITHsmQcDBqSmGSkU0,25966
|
|
19
|
+
opendataloader_pdf-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
opendataloader_pdf-1.0.6.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
|
|
21
|
+
opendataloader_pdf-1.0.6.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
22
|
+
opendataloader_pdf-1.0.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|