opendataloader-pdf 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -3,24 +3,11 @@ import sys
3
3
  import importlib.resources as importlib_resources
4
4
  import locale
5
5
  from pathlib import Path
6
- from typing import List
7
6
 
8
7
  # The consistent name of the JAR file bundled with the package
9
8
  _JAR_NAME = "opendataloader-pdf-cli.jar"
10
9
 
11
10
 
12
- def _get_redacted_command_string(command: List[str]) -> str:
13
- """Redacts the password from a command list for safe logging."""
14
- command_for_logging = list(command)
15
- try:
16
- password_index = command_for_logging.index("--password")
17
- if password_index + 1 < len(command_for_logging):
18
- command_for_logging[password_index + 1] = "[REDACTED]"
19
- except ValueError:
20
- pass # '--password' not in command
21
- return " ".join(command_for_logging)
22
-
23
-
24
11
  def run(
25
12
  input_path: str,
26
13
  output_folder: str = None,
@@ -33,6 +20,7 @@ def run(
33
20
  content_safety_off: str = None,
34
21
  html_in_markdown: bool = False,
35
22
  add_image_to_markdown: bool = False,
23
+ no_json: bool = False,
36
24
  debug: bool = False,
37
25
  ):
38
26
  """
@@ -49,6 +37,7 @@ def run(
49
37
  keep_line_breaks: If True, keeps line breaks in the output.
50
38
  html_in_markdown: If True, uses HTML in the Markdown output.
51
39
  add_image_to_markdown: If True, adds images to the Markdown output.
40
+ no_json: If True, disable the JSON output.
52
41
  debug: If True, prints all messages from the CLI to the console during execution.
53
42
 
54
43
  Returns:
@@ -68,6 +57,8 @@ def run(
68
57
  args.extend(["--password", password])
69
58
  if replace_invalid_chars:
70
59
  args.extend(["--replace-invalid-chars", replace_invalid_chars])
60
+ if content_safety_off:
61
+ args.extend(["--content-safety-off", content_safety_off])
71
62
  if generate_markdown:
72
63
  args.append("--markdown")
73
64
  if generate_html:
@@ -76,12 +67,12 @@ def run(
76
67
  args.append("--pdf")
77
68
  if keep_line_breaks:
78
69
  args.append("--keep-line-breaks")
79
- if content_safety_off:
80
- args.append(["--content-safety-off", content_safety_off])
81
70
  if html_in_markdown:
82
71
  args.append("--markdown-with-html")
83
72
  if add_image_to_markdown:
84
73
  args.append("--markdown-with-images")
74
+ if no_json:
75
+ args.append("--no-json")
85
76
 
86
77
  args.append(input_path)
87
78
 
@@ -94,7 +85,6 @@ def run(
94
85
  command = ["java", "-jar", str(jar_path)] + args
95
86
 
96
87
  if debug:
97
- print(f"Running command: {_get_redacted_command_string(command)}", file=sys.stderr)
98
88
  process = subprocess.Popen(
99
89
  command,
100
90
  stdout=subprocess.PIPE,
@@ -144,4 +134,4 @@ def run(
144
134
  print(f"Stderr: {e.stderr}", file=sys.stderr)
145
135
  if e.stdout:
146
136
  print(f"Stdout: {e.stdout}", file=sys.stderr)
147
- raise e
137
+ raise e
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -62,13 +62,24 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
62
62
 
63
63
  ## 🚀 Upcoming Features
64
64
 
65
- - 🖨️ **OCR for scanned PDFs** — Extract data from image-only pages
66
- - 🧠 **Table AI option** — Higher accuracy for tables with borderless or merged cells
67
- - ⚡ **Performance Benchmarks** — Transparent evaluations with open datasets and metrics, reported regularly
68
- - 🛡️ **AI Red Teaming** — Transparent adversarial benchmarks with datasets and metrics, reported regularly
65
+ **Scheduled for November**
66
+ - 📄 **Tagged PDF** — Develop advanced data extraction technology based on Tagged PDF.
67
+ - ⚡ **Performance Improvement** — Enhance the inference skill for greater accuracy and speed.
68
+ - 📊 **Benchmarks & Datasets** — Publish transparent evaluations using open datasets and standardized metrics.
69
+ - 🎯 **Metrics** — Publish the calculation methods to transparently share benchmark results.
69
70
 
70
71
  <br/>
71
72
 
73
+ **Scheduled for December**
74
+ - 🖨️ **OCR for scanned PDFs** — Extract data from image-only pages.
75
+ - 🧠 **Table AI option** — Higher accuracy for tables with borderless or merged cells.
76
+ <br/>
77
+
78
+ **Scheduled for 2026**
79
+ - 🛡️ **AI Red Teaming** — Transparent adversarial benchmarks with datasets and metrics, reported regularly.
80
+
81
+ <br/>
82
+
72
83
  ## Prerequisites
73
84
 
74
85
  - Java 11 or higher must be installed and available in your system's PATH.
@@ -118,6 +129,7 @@ The main function to process PDFs.
118
129
  | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
119
130
  | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
120
131
  | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
132
+ | `no_json` | `bool` | No | `False` | If `True`, disables the JSON output. |
121
133
  | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
122
134
 
123
135
  <br/>
@@ -189,6 +201,7 @@ The main function to process PDFs.
189
201
  | `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
190
202
  | `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
191
203
  | `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
204
+ | `noJson` | `boolean` | `false` | If `true`, disables the JSON output. |
192
205
  | `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
193
206
 
194
207
  <br/>
@@ -336,6 +349,7 @@ Options:
336
349
  --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
337
350
  --markdown Sets the data extraction output format to Markdown
338
351
  --html Sets the data extraction output format to HTML
352
+ --no-json Disables the JSON output format
339
353
  -p,--password <arg> Specifies the password for an encrypted PDF
340
354
  --pdf Generates a new PDF file where the extracted layout data is visualized as annotations
341
355
  --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
@@ -1,7 +1,7 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
3
  opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=723K0YL0P9JSD_2pQ0w8je3dINy5s8rvYoQhyi6Z8PY,5437
4
+ opendataloader_pdf/wrapper.py,sha256=WL7qTsX214L0jXxlSDesYadRVpdrsLQd2Hgum5BdD1s,4962
5
5
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
7
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
13
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
14
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
15
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=Xj3vHN5EyNydUtNheaZT81gvATHhX_q1tYt9yRCt8EA,20472831
17
- opendataloader_pdf-1.0.3.dist-info/METADATA,sha256=1nB-I81XSIeqVrFdiODmdJyKRx0yfbyNLPO_jhqRz5Q,24580
18
- opendataloader_pdf-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-1.0.3.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-1.0.3.dist-info/RECORD,,
16
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=Z9WU68Tw5ckOTgnlUPJs_Jub_C6ZGyQ-0sqjjSNMYYk,20477542
17
+ opendataloader_pdf-1.0.5.dist-info/METADATA,sha256=RNIDw03Rwl4wGRSPIhbHR6VyTzhc7cnlYHEEIajZBTk,25452
18
+ opendataloader_pdf-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ opendataloader_pdf-1.0.5.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
+ opendataloader_pdf-1.0.5.dist-info/RECORD,,