opendataloader-pdf 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -1,13 +1,26 @@
1
1
  import subprocess
2
2
  import sys
3
- import importlib_resources
3
+ import importlib.resources as importlib_resources
4
4
  import locale
5
5
  from pathlib import Path
6
+ from typing import List
6
7
 
7
8
  # The consistent name of the JAR file bundled with the package
8
9
  _JAR_NAME = "opendataloader-pdf-cli.jar"
9
10
 
10
11
 
12
+ def _get_redacted_command_string(command: List[str]) -> str:
13
+ """Redacts the password from a command list for safe logging."""
14
+ command_for_logging = list(command)
15
+ try:
16
+ password_index = command_for_logging.index("--password")
17
+ if password_index + 1 < len(command_for_logging):
18
+ command_for_logging[password_index + 1] = "[REDACTED]"
19
+ except ValueError:
20
+ pass # '--password' not in command
21
+ return " ".join(command_for_logging)
22
+
23
+
11
24
  def run(
12
25
  input_path: str,
13
26
  output_folder: str = None,
@@ -29,12 +42,11 @@ def run(
29
42
  input_path: Path to the input PDF file or folder.
30
43
  output_folder: Path to the output folder. Defaults to the input folder.
31
44
  password: Password for the PDF file.
32
- replace_invalid_chars: Character to replace invalid or unrecognized characters (e.g., �, \u0000) with.
45
+ replace_invalid_chars: Character to replace invalid or unrecognized characters (e.g., , \u0000) with.
33
46
  generate_markdown: If True, generates a Markdown output file.
34
47
  generate_html: If True, generates an HTML output file.
35
48
  generate_annotated_pdf: If True, generates an annotated PDF output file.
36
49
  keep_line_breaks: If True, keeps line breaks in the output.
37
- find_hidden_text: If True, finds hidden text in the PDF.
38
50
  html_in_markdown: If True, uses HTML in the Markdown output.
39
51
  add_image_to_markdown: If True, adds images to the Markdown output.
40
52
  debug: If True, prints all messages from the CLI to the console during execution.
@@ -82,7 +94,7 @@ def run(
82
94
  command = ["java", "-jar", str(jar_path)] + args
83
95
 
84
96
  if debug:
85
- print(f"Running command: {' '.join(command)}", file=sys.stderr)
97
+ print(f"Running command: {_get_redacted_command_string(command)}", file=sys.stderr)
86
98
  process = subprocess.Popen(
87
99
  command,
88
100
  stdout=subprocess.PIPE,
@@ -132,4 +144,4 @@ def run(
132
144
  print(f"Stderr: {e.stderr}", file=sys.stderr)
133
145
  if e.stdout:
134
146
  print(f"Stdout: {e.stdout}", file=sys.stderr)
135
- raise e
147
+ raise e
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -8,9 +8,8 @@ Author-email: open.dataloader@hancom.com
8
8
  License: MPL-2.0
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.7
11
+ Requires-Python: >=3.9, <4.0
12
12
  Description-Content-Type: text/markdown
13
- Requires-Dist: importlib_resources; python_version < "3.9"
14
13
  Dynamic: author
15
14
  Dynamic: author-email
16
15
  Dynamic: classifier
@@ -18,16 +17,15 @@ Dynamic: description
18
17
  Dynamic: description-content-type
19
18
  Dynamic: home-page
20
19
  Dynamic: license
21
- Dynamic: requires-dist
22
20
  Dynamic: requires-python
23
21
  Dynamic: summary
24
22
 
25
23
  # OpenDataLoader PDF
26
24
 
27
- ![Pre-release](https://img.shields.io/badge/Pre--release-FFA500&logo=github)
25
+
28
26
  [![License](https://img.shields.io/pypi/l/opendataloader-pdf.svg)](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
29
27
  ![Java](https://img.shields.io/badge/Java-11+-blue.svg)
30
- ![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)
28
+ ![Python](https://img.shields.io/badge/Python-3.9+-blue.svg)
31
29
  [![Maven Central](https://img.shields.io/maven-central/v/org.opendataloader/opendataloader-pdf-core.svg)](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
32
30
  [![PyPI version](https://img.shields.io/pypi/v/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
33
31
  [![npm version](https://img.shields.io/npm/v/@opendataloader/pdf.svg)](https://www.npmjs.com/package/@opendataloader/pdf)
@@ -74,7 +72,7 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
74
72
  ## Prerequisites
75
73
 
76
74
  - Java 11 or higher must be installed and available in your system's PATH.
77
- - Python 3.8+
75
+ - Python 3.9+
78
76
 
79
77
  <br/>
80
78
 
@@ -107,20 +105,20 @@ opendataloader_pdf.run(
107
105
 
108
106
  The main function to process PDFs.
109
107
 
110
- | Parameter | Type | Required | Default | Description |
111
- |--------------------------| ------ | -------- |--------------|-------------------------------------------------------------------------------------------------------------------------------------------|
112
- | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
113
- | `output_folder` | `str` | No | input folder | Path to the output folder. |
114
- | `password` | `str` | No | `None` | Password for the PDF file. |
115
- | `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
116
- | `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
117
- | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
118
- | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
119
- | `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
120
- | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
121
- | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
122
- | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
123
- | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
108
+ | Parameter | Type | Required | Default | Description |
109
+ |--------------------------| ------ | -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
110
+ | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
111
+ | `output_folder` | `str` | No | input folder | Path to the output folder. |
112
+ | `password` | `str` | No | `None` | Password for the PDF file. |
113
+ | `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
114
+ | `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
115
+ | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
116
+ | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
117
+ | `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
118
+ | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
119
+ | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
120
+ | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
121
+ | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
124
122
 
125
123
  <br/>
126
124
 
@@ -179,19 +177,19 @@ The main function to process PDFs.
179
177
 
180
178
  **RunOptions**
181
179
 
182
- | Property | Type | Default | Description |
183
- | ----------------------- | --------- | ------------- |-------------------------------------------------------------------------------------------------------------------------------------------|
184
- | `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
185
- | `password` | `string` | `undefined` | Password for the PDF file. |
186
- | `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., , \u0000). |
187
- | `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny. |
188
- | `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
189
- | `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
190
- | `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
191
- | `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
192
- | `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
193
- | `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
194
- | `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
180
+ | Property | Type | Default | Description |
181
+ | ----------------------- | --------- | ------------- |-------------------------------------------------------------------------------------------------------------------------------------------------------|
182
+ | `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
183
+ | `password` | `string` | `undefined` | Password for the PDF file. |
184
+ | `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., , \u0000). |
185
+ | `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
186
+ | `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
187
+ | `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
188
+ | `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
189
+ | `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
190
+ | `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
191
+ | `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
192
+ | `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
195
193
 
196
194
  <br/>
197
195
 
@@ -213,7 +211,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
213
211
  <dependency>
214
212
  <groupId>org.opendataloader</groupId>
215
213
  <artifactId>opendataloader-pdf-core</artifactId>
216
- <version>0.0.16</version>
214
+ <version>1.0.0</version>
217
215
  </dependency>
218
216
  </dependencies>
219
217
 
@@ -333,7 +331,7 @@ The images are extracted from PDF as individual files and stored in a subfolder
333
331
  Options:
334
332
  -o,--output-dir <arg> Specifies the output directory for generated files
335
333
  --keep-line-breaks Preserves original line breaks in the extracted text
336
- --content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny
334
+ --content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
337
335
  --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
338
336
  --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
339
337
  --markdown Sets the data extraction output format to Markdown
@@ -1,7 +1,7 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
3
  opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=bPy-wNmQfJpmCg9dVx9uNTrGfW446GdGNrlJnt0cosA,4960
4
+ opendataloader_pdf/wrapper.py,sha256=723K0YL0P9JSD_2pQ0w8je3dINy5s8rvYoQhyi6Z8PY,5437
5
5
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
7
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
13
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
14
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
15
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=YpRYRxfbUO2W_oPYigKziGqX1awk9xNWsGHfN_XS_tw,20470354
17
- opendataloader_pdf-1.0.0.dist-info/METADATA,sha256=IfUtznw8ufhoJXi1j38sbtYHZhIJebF_940cOhbsjqo,24527
18
- opendataloader_pdf-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-1.0.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-1.0.0.dist-info/RECORD,,
16
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=9ysDEMbBVBEJ9UdPkrLvSSNK7OK3f9ufxdusL_GdlpA,20471019
17
+ opendataloader_pdf-1.0.2.dist-info/METADATA,sha256=Bt6BaWFzPVd99gjPjgupZjt1a1VJFNkrDmH5MdlVqVk,24580
18
+ opendataloader_pdf-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ opendataloader_pdf-1.0.2.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
+ opendataloader_pdf-1.0.2.dist-info/RECORD,,