opendataloader-pdf 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +17 -5
- {opendataloader_pdf-1.0.0.dist-info → opendataloader_pdf-1.0.1.dist-info}/METADATA +6 -8
- {opendataloader_pdf-1.0.0.dist-info → opendataloader_pdf-1.0.1.dist-info}/RECORD +6 -6
- {opendataloader_pdf-1.0.0.dist-info → opendataloader_pdf-1.0.1.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-1.0.0.dist-info → opendataloader_pdf-1.0.1.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -1,13 +1,26 @@
|
|
|
1
1
|
import subprocess
|
|
2
2
|
import sys
|
|
3
|
-
import importlib_resources
|
|
3
|
+
import importlib.resources as importlib_resources
|
|
4
4
|
import locale
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import List
|
|
6
7
|
|
|
7
8
|
# The consistent name of the JAR file bundled with the package
|
|
8
9
|
_JAR_NAME = "opendataloader-pdf-cli.jar"
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
def _get_redacted_command_string(command: List[str]) -> str:
|
|
13
|
+
"""Redacts the password from a command list for safe logging."""
|
|
14
|
+
command_for_logging = list(command)
|
|
15
|
+
try:
|
|
16
|
+
password_index = command_for_logging.index("--password")
|
|
17
|
+
if password_index + 1 < len(command_for_logging):
|
|
18
|
+
command_for_logging[password_index + 1] = "[REDACTED]"
|
|
19
|
+
except ValueError:
|
|
20
|
+
pass # '--password' not in command
|
|
21
|
+
return " ".join(command_for_logging)
|
|
22
|
+
|
|
23
|
+
|
|
11
24
|
def run(
|
|
12
25
|
input_path: str,
|
|
13
26
|
output_folder: str = None,
|
|
@@ -29,12 +42,11 @@ def run(
|
|
|
29
42
|
input_path: Path to the input PDF file or folder.
|
|
30
43
|
output_folder: Path to the output folder. Defaults to the input folder.
|
|
31
44
|
password: Password for the PDF file.
|
|
32
|
-
replace_invalid_chars: Character to replace invalid or unrecognized characters (e.g.,
|
|
45
|
+
replace_invalid_chars: Character to replace invalid or unrecognized characters (e.g., \uFFFD, \u0000) with.
|
|
33
46
|
generate_markdown: If True, generates a Markdown output file.
|
|
34
47
|
generate_html: If True, generates an HTML output file.
|
|
35
48
|
generate_annotated_pdf: If True, generates an annotated PDF output file.
|
|
36
49
|
keep_line_breaks: If True, keeps line breaks in the output.
|
|
37
|
-
find_hidden_text: If True, finds hidden text in the PDF.
|
|
38
50
|
html_in_markdown: If True, uses HTML in the Markdown output.
|
|
39
51
|
add_image_to_markdown: If True, adds images to the Markdown output.
|
|
40
52
|
debug: If True, prints all messages from the CLI to the console during execution.
|
|
@@ -82,7 +94,7 @@ def run(
|
|
|
82
94
|
command = ["java", "-jar", str(jar_path)] + args
|
|
83
95
|
|
|
84
96
|
if debug:
|
|
85
|
-
print(f"Running command: {
|
|
97
|
+
print(f"Running command: {_get_redacted_command_string(command)}", file=sys.stderr)
|
|
86
98
|
process = subprocess.Popen(
|
|
87
99
|
command,
|
|
88
100
|
stdout=subprocess.PIPE,
|
|
@@ -132,4 +144,4 @@ def run(
|
|
|
132
144
|
print(f"Stderr: {e.stderr}", file=sys.stderr)
|
|
133
145
|
if e.stdout:
|
|
134
146
|
print(f"Stdout: {e.stdout}", file=sys.stderr)
|
|
135
|
-
raise e
|
|
147
|
+
raise e
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -8,9 +8,8 @@ Author-email: open.dataloader@hancom.com
|
|
|
8
8
|
License: MPL-2.0
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.9, <4.0
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
|
-
Requires-Dist: importlib_resources; python_version < "3.9"
|
|
14
13
|
Dynamic: author
|
|
15
14
|
Dynamic: author-email
|
|
16
15
|
Dynamic: classifier
|
|
@@ -18,16 +17,15 @@ Dynamic: description
|
|
|
18
17
|
Dynamic: description-content-type
|
|
19
18
|
Dynamic: home-page
|
|
20
19
|
Dynamic: license
|
|
21
|
-
Dynamic: requires-dist
|
|
22
20
|
Dynamic: requires-python
|
|
23
21
|
Dynamic: summary
|
|
24
22
|
|
|
25
23
|
# OpenDataLoader PDF
|
|
26
24
|
|
|
27
|
-
|
|
25
|
+
|
|
28
26
|
[License](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
|
|
29
27
|

|
|
30
|
-

|
|
31
29
|
[Maven Central](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
|
|
32
30
|
[PyPI](https://pypi.org/project/opendataloader-pdf/)
|
|
33
31
|
[npm](https://www.npmjs.com/package/@opendataloader/pdf)
|
|
@@ -74,7 +72,7 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
|
|
|
74
72
|
## Prerequisites
|
|
75
73
|
|
|
76
74
|
- Java 11 or higher must be installed and available in your system's PATH.
|
|
77
|
-
- Python 3.
|
|
75
|
+
- Python 3.9+
|
|
78
76
|
|
|
79
77
|
<br/>
|
|
80
78
|
|
|
@@ -213,7 +211,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
213
211
|
<dependency>
|
|
214
212
|
<groupId>org.opendataloader</groupId>
|
|
215
213
|
<artifactId>opendataloader-pdf-core</artifactId>
|
|
216
|
-
<version>0.0
|
|
214
|
+
<version>1.0.0</version>
|
|
217
215
|
</dependency>
|
|
218
216
|
</dependencies>
|
|
219
217
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/wrapper.py,sha256=
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=723K0YL0P9JSD_2pQ0w8je3dINy5s8rvYoQhyi6Z8PY,5437
|
|
5
5
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
7
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-1.0.
|
|
18
|
-
opendataloader_pdf-1.0.
|
|
19
|
-
opendataloader_pdf-1.0.
|
|
20
|
-
opendataloader_pdf-1.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=pW9pLp40AhKPBn6UalczNCpzN2zesq0q7hhdl-hOSTw,20470360
|
|
17
|
+
opendataloader_pdf-1.0.1.dist-info/METADATA,sha256=43ddIC8BVAzij8L3akH3MUq5Yd-C_80HVsVGb5cODz8,24374
|
|
18
|
+
opendataloader_pdf-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-1.0.1.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-1.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|