opendataloader-pdf 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -1,3 +1,3 @@
1
- from .wrapper import run
1
+ from .wrapper import run, convert
2
2
 
3
- __all__ = ["run"]
3
+ __all__ = ["run", "convert"]
@@ -1,9 +1,9 @@
1
1
  import argparse
2
2
  import subprocess
3
3
  import sys
4
- import importlib.resources as importlib_resources
5
- import locale
4
+ import importlib.resources as resources
6
5
  from pathlib import Path
6
+ from typing import List, Optional
7
7
 
8
8
  # The consistent name of the JAR file bundled with the package
9
9
  _JAR_NAME = "opendataloader-pdf-cli.jar"
@@ -41,9 +41,6 @@ def run(
41
41
  no_json: If True, disable the JSON output.
42
42
  debug: If True, prints all messages from the CLI to the console during execution.
43
43
 
44
- Returns:
45
- The stdout from the CLI tool if successful.
46
-
47
44
  Raises:
48
45
  FileNotFoundError: If the 'java' command is not found or input_path is invalid.
49
46
  subprocess.CalledProcessError: If the CLI tool returns a non-zero exit code.
@@ -52,6 +49,7 @@ def run(
52
49
  raise FileNotFoundError(f"Input file or folder not found: {input_path}")
53
50
 
54
51
  args = []
52
+ args.append(input_path)
55
53
  if output_folder:
56
54
  args.extend(["--output-dir", output_folder])
57
55
  if password:
@@ -75,49 +73,94 @@ def run(
75
73
  if no_json:
76
74
  args.append("--no-json")
77
75
 
78
- args.append(input_path)
76
+ # Run the command
77
+ run_jar(args, quiet=not debug)
78
+
79
+
80
+ def convert(
81
+ input_path: List[str],
82
+ output_dir: Optional[str] = None,
83
+ password: Optional[str] = None,
84
+ format: Optional[List[str]] = None,
85
+ quiet: bool = False,
86
+ content_safety_off: Optional[List[str]] = None,
87
+ keep_line_breaks: bool = False,
88
+ replace_invalid_chars: Optional[str] = None,
89
+ ) -> None:
90
+ """
91
+ Convert PDF(s) into the requested output format(s).
92
+
93
+ Args:
94
+ input_path: One or more input PDF file paths or directories
95
+ output_dir: Directory where outputs are written
96
+ password: Password for encrypted PDFs
97
+ format: List of output formats (e.g., ["json", "html"])
98
+ quiet: Suppress CLI logging output
99
+ content_safety_off: List of content safety filters to disable
100
+ keep_line_breaks: Preserve line breaks in text output
101
+ replace_invalid_chars: Replacement character for invalid/unrecognized characters
102
+ """
103
+ args: List[str] = []
104
+ args.extend(input_path)
105
+ if output_dir:
106
+ args.extend(["--output-dir", output_dir])
107
+ if password:
108
+ args.extend(["--password", password])
109
+ if format:
110
+ args.extend(["--format", *format])
111
+ if quiet:
112
+ args.append("--quiet")
113
+ if content_safety_off:
114
+ args.extend(["--content-safety-off", *content_safety_off])
115
+ if keep_line_breaks:
116
+ args.append("--keep-line-breaks")
117
+ if replace_invalid_chars:
118
+ args.extend(["--replace-invalid-chars", replace_invalid_chars])
119
+
120
+ # Run the command
121
+ run_jar(args, quiet)
79
122
 
123
+
124
+ def run_jar(args: List[str], quiet: bool = False) -> str:
125
+ """Run the opendataloader-pdf JAR with the given arguments."""
80
126
  try:
81
- # Find the JAR file within the package
82
- jar_ref = importlib_resources.files("opendataloader_pdf").joinpath(
83
- "jar", _JAR_NAME
84
- )
85
- with importlib_resources.as_file(jar_ref) as jar_path:
86
- command = ["java", "-jar", str(jar_path)] + args
127
+ # Access the embedded JAR inside the package
128
+ jar_ref = resources.files("opendataloader_pdf").joinpath("jar", _JAR_NAME)
129
+ with resources.as_file(jar_ref) as jar_path:
130
+ command = ["java", "-jar", str(jar_path), *args]
87
131
 
88
- if debug:
89
- process = subprocess.Popen(
132
+ if quiet:
133
+ # Quiet mode → capture all output
134
+ result = subprocess.run(
90
135
  command,
91
- stdout=subprocess.PIPE,
92
- stderr=subprocess.STDOUT,
136
+ capture_output=True,
93
137
  text=True,
94
- encoding=locale.getpreferredencoding(False),
138
+ check=True,
139
+ encoding="utf-8",
95
140
  )
141
+ return result.stdout
96
142
 
97
- output_lines = []
98
- for line in iter(process.stdout.readline, ""):
143
+ # Streaming mode → live output
144
+ with subprocess.Popen(
145
+ command,
146
+ stdout=subprocess.PIPE,
147
+ stderr=subprocess.STDOUT,
148
+ text=True,
149
+ encoding="utf-8",
150
+ ) as process:
151
+ output_lines: List[str] = []
152
+ for line in process.stdout:
99
153
  sys.stdout.write(line)
100
154
  output_lines.append(line)
101
155
 
102
- process.stdout.close()
103
156
  return_code = process.wait()
104
157
  captured_output = "".join(output_lines)
105
158
 
106
159
  if return_code:
107
- # Manually raise error with the combined output
108
160
  raise subprocess.CalledProcessError(
109
161
  return_code, command, output=captured_output
110
162
  )
111
163
  return captured_output
112
- else:
113
- result = subprocess.run(
114
- command,
115
- capture_output=True,
116
- text=True,
117
- check=True,
118
- encoding=locale.getpreferredencoding(False),
119
- )
120
- return result.stdout
121
164
 
122
165
  except FileNotFoundError:
123
166
  print(
@@ -126,16 +169,16 @@ def run(
126
169
  )
127
170
  raise
128
171
 
129
- except subprocess.CalledProcessError as e:
172
+ except subprocess.CalledProcessError as error:
130
173
  print("Error running opendataloader-pdf CLI.", file=sys.stderr)
131
- print(f"Return code: {e.returncode}", file=sys.stderr)
132
- if e.output:
133
- print(f"Output: {e.output}", file=sys.stderr)
134
- elif e.stderr:
135
- print(f"Stderr: {e.stderr}", file=sys.stderr)
136
- if e.stdout:
137
- print(f"Stdout: {e.stdout}", file=sys.stderr)
138
- raise e
174
+ print(f"Return code: {error.returncode}", file=sys.stderr)
175
+ if error.output:
176
+ print(f"Output: {error.output}", file=sys.stderr)
177
+ if error.stderr:
178
+ print(f"Stderr: {error.stderr}", file=sys.stderr)
179
+ if error.stdout:
180
+ print(f"Stdout: {error.stdout}", file=sys.stderr)
181
+ raise
139
182
 
140
183
 
141
184
  def main(argv=None) -> int:
@@ -143,39 +186,47 @@ def main(argv=None) -> int:
143
186
  parser = argparse.ArgumentParser(
144
187
  description="Run the opendataloader-pdf CLI using the bundled JAR."
145
188
  )
146
- parser.add_argument("input_path", help="Path to the input PDF file or directory.")
189
+ parser.add_argument(
190
+ "input_path", nargs="+", help="Path to the input PDF file or directory."
191
+ )
147
192
  parser.add_argument(
148
193
  "-o",
149
194
  "--output-dir",
150
- dest="output_folder",
151
195
  help="Directory where outputs are written.",
152
196
  )
153
197
  parser.add_argument("-p", "--password", help="Password for encrypted PDFs.")
154
198
  parser.add_argument(
155
- "--replace-invalid-chars",
156
- help="Replacement character for invalid or unrecognized characters.",
157
- )
158
- parser.add_argument(
159
- "--content-safety-off",
160
- help="Disable content safety filtering (expects the desired mode).",
161
- )
162
- parser.add_argument(
163
- "--markdown",
164
- dest="generate_markdown",
165
- action="store_true",
166
- help="Generate Markdown output.",
199
+ "-f",
200
+ "--format",
201
+ nargs="+",
202
+ choices=[
203
+ "json",
204
+ "text",
205
+ "html",
206
+ "pdf",
207
+ "markdown",
208
+ "markdown-with-html",
209
+ "markdown-with-images",
210
+ ],
211
+ help="Output format(s) to generate.",
167
212
  )
168
213
  parser.add_argument(
169
- "--html",
170
- dest="generate_html",
214
+ "-q",
215
+ "--quiet",
171
216
  action="store_true",
172
- help="Generate HTML output.",
217
+ help="Suppress CLI logging output.",
173
218
  )
174
219
  parser.add_argument(
175
- "--pdf",
176
- dest="generate_annotated_pdf",
177
- action="store_true",
178
- help="Generate annotated PDF output.",
220
+ "--content-safety-off",
221
+ nargs="+",
222
+ choices=[
223
+ "all",
224
+ "hidden-text",
225
+ "off-page",
226
+ "tiny",
227
+ "hidden-ocg",
228
+ ],
229
+ help="Disables one or more content safety filters. Accepts a list of filter names.",
179
230
  )
180
231
  parser.add_argument(
181
232
  "--keep-line-breaks",
@@ -183,31 +234,13 @@ def main(argv=None) -> int:
183
234
  help="Preserve line breaks in text output.",
184
235
  )
185
236
  parser.add_argument(
186
- "--markdown-with-html",
187
- dest="html_in_markdown",
188
- action="store_true",
189
- help="Allow raw HTML within Markdown output.",
190
- )
191
- parser.add_argument(
192
- "--markdown-with-images",
193
- dest="add_image_to_markdown",
194
- action="store_true",
195
- help="Embed images in Markdown output.",
196
- )
197
- parser.add_argument(
198
- "--no-json",
199
- action="store_true",
200
- help="Disable JSON output generation.",
201
- )
202
- parser.add_argument(
203
- "--debug",
204
- action="store_true",
205
- help="Stream CLI logs directly to stdout.",
237
+ "--replace-invalid-chars",
238
+ help="Replacement character for invalid or unrecognized characters.",
206
239
  )
207
240
  args = parser.parse_args(argv)
208
241
 
209
242
  try:
210
- run(**vars(args))
243
+ convert(**vars(args))
211
244
  except FileNotFoundError as err:
212
245
  print(err, file=sys.stderr)
213
246
  return 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 1.0.6
3
+ Version: 1.1.0
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -97,47 +97,42 @@ pip install -U opendataloader-pdf
97
97
 
98
98
  ### Usage
99
99
 
100
- - input_path can be either the path to a single document or the path to a folder.
101
- - If you don’t specify an output_folder, the output data will be saved in the same directory as the input document.
100
+ input_path can be either the path to a single document or the path to a folder.
102
101
 
103
102
  ```python
104
103
  import opendataloader_pdf
105
104
 
106
- opendataloader_pdf.run(
107
- input_path="path/to/document.pdf",
108
- output_folder="path/to/output",
109
- generate_markdown=True,
110
- generate_html=True,
111
- generate_annotated_pdf=True,
112
- debug=True,
105
+ opendataloader_pdf.convert(
106
+ input_path=["path/to/document.pdf", "path/to/folder"],
107
+ output_dir="path/to/output",
108
+ format=["json", "html", "pdf", "markdown"]
113
109
  )
114
110
  ```
115
111
 
116
- - If you want to run it via CLI, you can use the following command:
112
+ To run it from the command line, use the following command in a terminal:
117
113
 
118
- ```sh
119
- opendataloader-pdf path/to/document.pdf --markdown --html --pdf
114
+ ```bash
115
+ opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
120
116
  ```
121
117
 
122
- ### Function: run()
118
+ ### Function: convert()
123
119
 
124
120
  The main function to process PDFs.
125
121
 
126
- | Parameter | Type | Required | Default | Description |
127
- |--------------------------| ------ | -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
128
- | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
129
- | `output_folder` | `str` | No | input folder | Path to the output folder. |
130
- | `password` | `str` | No | `None` | Password for the PDF file. |
131
- | `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
132
- | `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
133
- | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
134
- | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
135
- | `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
136
- | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
137
- | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
138
- | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
139
- | `no_json` | `bool` | No | `False` | If `True`, disables the JSON output. |
140
- | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
122
+ | Parameter | Type | Required | Default | Description |
123
+ |--------------------------|----------------| -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
124
+ | `input_path` | `List[str]` | ✅ Yes | — | One or more PDF file paths or directories to process. |
125
+ | `output_dir` | `Optional[str]` | No | input folder | Directory where outputs are written. |
126
+ | `password` | `Optional[str]` | No | `None` | Password used for encrypted PDFs. |
127
+ | `format` | `Optional[List[str]]` | No | `None` | Output formats to generate (e.g. `"json"`, `"html"`, `"pdf"`, `"text"`, `"markdown"`, `"markdown-with-html"`, `"markdown-with-images"`). |
128
+ | `quiet` | `bool` | No | `False` | Suppresses CLI logging output when `True`. |
129
+ | `content_safety_off` | `Optional[List[str]]` | No | `None` | List of content safety filters to disable (e.g. `"all"`, `"hidden-text"`, `"off-page"`, `"tiny"`, `"hidden-ocg"`). |
130
+ | `keep_line_breaks` | `bool` | No | `False` | Preserves line breaks in text output when `True`. |
131
+ | `replace_invalid_chars` | `Optional[str]` | No | `None` | Replacement character for invalid or unrecognized characters (e.g., �, `\u0000`). |
132
+
133
+ ### Function: run()
134
+
135
+ Deprecated. Use `convert()` instead.
141
136
 
142
137
  <br/>
143
138
 
@@ -368,16 +363,23 @@ The images are extracted from PDF as individual files and stored in a subfolder
368
363
  ```
369
364
  Options:
370
365
  -o,--output-dir <arg> Specifies the output directory for generated files
366
+ -p,--password <arg> Specifies the password for an encrypted PDF
367
+ -f,--format <arg> List of output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images). Default: json
368
+ -q,--quiet Suppresses console logging output
369
+ --content-safety-off <arg> Disables one or more content safety filters. Accepts a list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
371
370
  --keep-line-breaks Preserves original line breaks in the extracted text
372
- --content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
373
- --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
374
- --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
375
- --markdown Sets the data extraction output format to Markdown
376
- --html Sets the data extraction output format to HTML
371
+ --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
372
+ ```
373
+
374
+ Legacy options (retained for backward compatibility):
375
+
376
+ ```
377
377
  --no-json Disables the JSON output format
378
- -p,--password <arg> Specifies the password for an encrypted PDF
378
+ --html Sets the data extraction output format to HTML
379
379
  --pdf Generates a new PDF file where the extracted layout data is visualized as annotations
380
- --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
380
+ --markdown Sets the data extraction output format to Markdown
381
+ --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
382
+ --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
381
383
  ```
382
384
 
383
385
  ### Schema of the JSON output
@@ -1,8 +1,8 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
- opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
3
+ opendataloader_pdf/__init__.py,sha256=xkTyVWNu1W2YrI1tPpGnd11DwwcwFDyBp8b4agLdd7A,64
4
4
  opendataloader_pdf/__main__.py,sha256=lmla4yz3SaYBfRJXOXnwO_8ID31-Ja20aQmomiz1eEc,84
5
- opendataloader_pdf/wrapper.py,sha256=Dsvw5un_HROLcy2xX0WqoKKRnOjL081LEYC6YfpViLE,7331
5
+ opendataloader_pdf/wrapper.py,sha256=0Erld0Cpu5rXhT46WpqNkrg0bBg15iBmaH-AU9CfdNU,8409
6
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
7
7
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
8
8
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -14,9 +14,9 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
14
14
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
15
15
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
16
16
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
17
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=HmcxP25ZCOJNRV9U1IXy-beAN243_iCWTBIV6JB-6S8,20477911
18
- opendataloader_pdf-1.0.6.dist-info/METADATA,sha256=2BWSSScAW3mmpWum3N7g-01fMZITHsmQcDBqSmGSkU0,25966
19
- opendataloader_pdf-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- opendataloader_pdf-1.0.6.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
21
- opendataloader_pdf-1.0.6.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
22
- opendataloader_pdf-1.0.6.dist-info/RECORD,,
17
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=0mrZFl8pVTrUIf8Nk09wN9KJPWW1U8ZMvdyB9sEPt-c,20485612
18
+ opendataloader_pdf-1.1.0.dist-info/METADATA,sha256=jk-YV7lX5a9wqL57cLhAhJr_bgXEOBSY2RGL67lDN34,25270
19
+ opendataloader_pdf-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ opendataloader_pdf-1.1.0.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
21
+ opendataloader_pdf-1.1.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
22
+ opendataloader_pdf-1.1.0.dist-info/RECORD,,