opendataloader-pdf 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -1,3 +1,3 @@
1
- from .wrapper import run
1
+ from .wrapper import run, convert
2
2
 
3
- __all__ = ["run"]
3
+ __all__ = ["run", "convert"]
@@ -0,0 +1,5 @@
1
+ from .wrapper import main
2
+
3
+
4
+ if __name__ == "__main__":
5
+ raise SystemExit(main())
@@ -1,8 +1,9 @@
1
+ import argparse
1
2
  import subprocess
2
3
  import sys
3
- import importlib.resources as importlib_resources
4
- import locale
4
+ import importlib.resources as resources
5
5
  from pathlib import Path
6
+ from typing import List, Optional
6
7
 
7
8
  # The consistent name of the JAR file bundled with the package
8
9
  _JAR_NAME = "opendataloader-pdf-cli.jar"
@@ -40,9 +41,6 @@ def run(
40
41
  no_json: If True, disable the JSON output.
41
42
  debug: If True, prints all messages from the CLI to the console during execution.
42
43
 
43
- Returns:
44
- The stdout from the CLI tool if successful.
45
-
46
44
  Raises:
47
45
  FileNotFoundError: If the 'java' command is not found or input_path is invalid.
48
46
  subprocess.CalledProcessError: If the CLI tool returns a non-zero exit code.
@@ -51,6 +49,7 @@ def run(
51
49
  raise FileNotFoundError(f"Input file or folder not found: {input_path}")
52
50
 
53
51
  args = []
52
+ args.append(input_path)
54
53
  if output_folder:
55
54
  args.extend(["--output-dir", output_folder])
56
55
  if password:
@@ -74,49 +73,94 @@ def run(
74
73
  if no_json:
75
74
  args.append("--no-json")
76
75
 
77
- args.append(input_path)
76
+ # Run the command
77
+ run_jar(args, quiet=not debug)
78
+
79
+
80
def convert(
    input_path: List[str],
    output_dir: Optional[str] = None,
    password: Optional[str] = None,
    format: Optional[List[str]] = None,
    quiet: bool = False,
    content_safety_off: Optional[List[str]] = None,
    keep_line_breaks: bool = False,
    replace_invalid_chars: Optional[str] = None,
) -> None:
    """
    Convert PDF(s) into the requested output format(s).

    Builds the argument list for the bundled CLI (inputs first, then the
    options that were set) and passes it to ``run_jar``.

    Args:
        input_path: One or more input PDF file paths or directories. A single
            string is also accepted and treated as a one-element list.
        output_dir: Directory where outputs are written
        password: Password for encrypted PDFs
        format: List of output formats (e.g., ["json", "html"]). A single
            string is treated as a one-element list.
        quiet: Suppress CLI logging output. Adds ``--quiet`` and is also
            forwarded to ``run_jar``.
        content_safety_off: List of content safety filters to disable. A
            single string is treated as a one-element list.
        keep_line_breaks: Preserve line breaks in text output
        replace_invalid_chars: Replacement character for invalid/unrecognized characters

    Raises:
        Whatever ``run_jar`` raises for the assembled command.
    """

    def _as_list(value) -> List[str]:
        # A bare string is itself iterable, so extending or unpacking it
        # would turn "doc.pdf" into ["d", "o", "c", ...]. Wrap it instead.
        if isinstance(value, str):
            return [value]
        return list(value)

    # Inputs go first, ahead of every option.
    args: List[str] = _as_list(input_path)
    if output_dir:
        args.extend(["--output-dir", output_dir])
    if password:
        args.extend(["--password", password])
    if format:
        args.extend(["--format", *_as_list(format)])
    if quiet:
        args.append("--quiet")
    if content_safety_off:
        args.extend(["--content-safety-off", *_as_list(content_safety_off)])
    if keep_line_breaks:
        args.append("--keep-line-breaks")
    if replace_invalid_chars:
        args.extend(["--replace-invalid-chars", replace_invalid_chars])

    # Run the command
    run_jar(args, quiet)
122
+
78
123
 
124
+ def run_jar(args: List[str], quiet: bool = False) -> str:
125
+ """Run the opendataloader-pdf JAR with the given arguments."""
79
126
  try:
80
- # Find the JAR file within the package
81
- jar_ref = importlib_resources.files("opendataloader_pdf").joinpath(
82
- "jar", _JAR_NAME
83
- )
84
- with importlib_resources.as_file(jar_ref) as jar_path:
85
- command = ["java", "-jar", str(jar_path)] + args
127
+ # Access the embedded JAR inside the package
128
+ jar_ref = resources.files("opendataloader_pdf").joinpath("jar", _JAR_NAME)
129
+ with resources.as_file(jar_ref) as jar_path:
130
+ command = ["java", "-jar", str(jar_path), *args]
86
131
 
87
- if debug:
88
- process = subprocess.Popen(
132
+ if quiet:
133
+ # Quiet mode → capture all output
134
+ result = subprocess.run(
89
135
  command,
90
- stdout=subprocess.PIPE,
91
- stderr=subprocess.STDOUT,
136
+ capture_output=True,
92
137
  text=True,
93
- encoding=locale.getpreferredencoding(False),
138
+ check=True,
139
+ encoding="utf-8",
94
140
  )
141
+ return result.stdout
95
142
 
96
- output_lines = []
97
- for line in iter(process.stdout.readline, ""):
143
+ # Streaming mode → live output
144
+ with subprocess.Popen(
145
+ command,
146
+ stdout=subprocess.PIPE,
147
+ stderr=subprocess.STDOUT,
148
+ text=True,
149
+ encoding="utf-8",
150
+ ) as process:
151
+ output_lines: List[str] = []
152
+ for line in process.stdout:
98
153
  sys.stdout.write(line)
99
154
  output_lines.append(line)
100
155
 
101
- process.stdout.close()
102
156
  return_code = process.wait()
103
157
  captured_output = "".join(output_lines)
104
158
 
105
159
  if return_code:
106
- # Manually raise error with the combined output
107
160
  raise subprocess.CalledProcessError(
108
161
  return_code, command, output=captured_output
109
162
  )
110
163
  return captured_output
111
- else:
112
- result = subprocess.run(
113
- command,
114
- capture_output=True,
115
- text=True,
116
- check=True,
117
- encoding=locale.getpreferredencoding(False),
118
- )
119
- return result.stdout
120
164
 
121
165
  except FileNotFoundError:
122
166
  print(
@@ -125,13 +169,84 @@ def run(
125
169
  )
126
170
  raise
127
171
 
128
- except subprocess.CalledProcessError as e:
172
+ except subprocess.CalledProcessError as error:
129
173
  print("Error running opendataloader-pdf CLI.", file=sys.stderr)
130
- print(f"Return code: {e.returncode}", file=sys.stderr)
131
- if e.output:
132
- print(f"Output: {e.output}", file=sys.stderr)
133
- elif e.stderr:
134
- print(f"Stderr: {e.stderr}", file=sys.stderr)
135
- if e.stdout:
136
- print(f"Stdout: {e.stdout}", file=sys.stderr)
137
- raise e
174
+ print(f"Return code: {error.returncode}", file=sys.stderr)
175
+ if error.output:
176
+ print(f"Output: {error.output}", file=sys.stderr)
177
+ if error.stderr:
178
+ print(f"Stderr: {error.stderr}", file=sys.stderr)
179
+ if error.stdout:
180
+ print(f"Stdout: {error.stdout}", file=sys.stderr)
181
+ raise
182
+
183
+
184
def main(argv=None) -> int:
    """CLI entry point for running the wrapper from the command line.

    Parses the command line and forwards the parsed values to ``convert``
    as keyword arguments, so every argparse destination name must match a
    ``convert`` parameter name.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read the
            process arguments.

    Returns:
        0 on success, 1 if ``convert`` raises ``FileNotFoundError``, and
        the failing process's return code (or 1 if that is zero/empty) if
        it raises ``subprocess.CalledProcessError``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Run the opendataloader-pdf CLI using the bundled JAR."
    )
    parser.add_argument(
        "input_path", nargs="+", help="Path to the input PDF file or directory."
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        help="Directory where outputs are written.",
    )
    parser.add_argument("-p", "--password", help="Password for encrypted PDFs.")
    parser.add_argument(
        "-f",
        "--format",
        nargs="+",
        choices=[
            "json",
            "text",
            "html",
            "pdf",
            "markdown",
            "markdown-with-html",
            "markdown-with-images",
        ],
        help="Output format(s) to generate.",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress CLI logging output.",
    )
    parser.add_argument(
        "--content-safety-off",
        nargs="+",
        choices=[
            "all",
            "hidden-text",
            "off-page",
            "tiny",
            "hidden-ocg",
        ],
        help="Disables one or more content safety filters. Accepts a list of filter names.",
    )
    parser.add_argument(
        "--keep-line-breaks",
        action="store_true",
        help="Preserve line breaks in text output.",
    )
    parser.add_argument(
        "--replace-invalid-chars",
        help="Replacement character for invalid or unrecognized characters.",
    )
    args = parser.parse_args(argv)

    try:
        convert(**vars(args))
    except FileNotFoundError as err:
        print(err, file=sys.stderr)
        return 1
    except subprocess.CalledProcessError as err:
        # Propagate the CLI's own exit status; never report success here.
        return err.returncode or 1

    # Explicit success code so the declared ``-> int`` contract holds
    # instead of implicitly returning None.
    return 0
249
+
250
+
251
+ if __name__ == "__main__":
252
+ sys.exit(main())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 1.0.5
3
+ Version: 1.1.0
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -97,40 +97,42 @@ pip install -U opendataloader-pdf
97
97
 
98
98
  ### Usage
99
99
 
100
- - input_path can be either the path to a single document or the path to a folder.
101
- - If you don’t specify an output_folder, the output data will be saved in the same directory as the input document.
100
+ Each entry in `input_path` can be either the path to a single document or the path to a folder.
102
101
 
103
102
  ```python
104
103
  import opendataloader_pdf
105
104
 
106
- opendataloader_pdf.run(
107
- input_path="path/to/document.pdf",
108
- output_folder="path/to/output",
109
- generate_markdown=True,
110
- generate_html=True,
111
- generate_annotated_pdf=True,
105
+ opendataloader_pdf.convert(
106
+ input_path=["path/to/document.pdf", "path/to/folder"],
107
+ output_dir="path/to/output",
108
+ format=["json", "html", "pdf", "markdown"]
112
109
  )
113
110
  ```
114
111
 
115
- ### Function: run()
112
+ To run it from the command line, use the following command in a terminal:
113
+
114
+ ```bash
115
+ opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
116
+ ```
117
+
118
+ ### Function: convert()
116
119
 
117
120
  The main function to process PDFs.
118
121
 
119
- | Parameter | Type | Required | Default | Description |
120
- |--------------------------| ------ | -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
121
- | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
122
- | `output_folder` | `str` | No | input folder | Path to the output folder. |
123
- | `password` | `str` | No | `None` | Password for the PDF file. |
124
- | `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
125
- | `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
126
- | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
127
- | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
128
- | `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
129
- | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
130
- | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
131
- | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
132
- | `no_json` | `bool` | No | `False` | If `True`, disables the JSON output. |
133
- | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
122
+ | Parameter | Type | Required | Default | Description |
123
+ |--------------------------|----------------| -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
124
+ | `input_path` | `List[str]` | ✅ Yes | — | One or more PDF file paths or directories to process. |
125
+ | `output_dir` | `Optional[str]` | No | input folder | Directory where outputs are written. |
126
+ | `password` | `Optional[str]` | No | `None` | Password used for encrypted PDFs. |
127
+ | `format` | `Optional[List[str]]` | No | `None` | Output formats to generate (e.g. `"json"`, `"html"`, `"pdf"`, `"text"`, `"markdown"`, `"markdown-with-html"`, `"markdown-with-images"`). |
128
+ | `quiet` | `bool` | No | `False` | Suppresses CLI logging output when `True`. |
129
+ | `content_safety_off` | `Optional[List[str]]` | No | `None` | List of content safety filters to disable (e.g. `"all"`, `"hidden-text"`, `"off-page"`, `"tiny"`, `"hidden-ocg"`). |
130
+ | `keep_line_breaks` | `bool` | No | `False` | Preserves line breaks in text output when `True`. |
131
+ | `replace_invalid_chars` | `Optional[str]` | No | `None` | Replacement character for invalid or unrecognized characters (e.g., �, `\u0000`). |
132
+
133
+ ### Function: run()
134
+
135
+ Deprecated. Use `convert()` instead.
134
136
 
135
137
  <br/>
136
138
 
@@ -174,6 +176,24 @@ async function main() {
174
176
  main();
175
177
  ```
176
178
 
179
+ If you want to run it via CLI, you can use the following command:
180
+
181
+ ```bash
182
+ npx @opendataloader/pdf path/to/document.pdf -o path/to/output --markdown --html --pdf
183
+ ```
184
+
185
+ or you can install it globally:
186
+
187
+ ```bash
188
+ npm install -g @opendataloader/pdf
189
+ ```
190
+
191
+ then run:
192
+
193
+ ```bash
194
+ opendataloader-pdf path/to/document.pdf -o path/to/output --markdown --html --pdf
195
+ ```
196
+
177
197
  ### Function: run()
178
198
 
179
199
  `run(inputPath: string, options?: RunOptions): Promise<string>`
@@ -343,16 +363,23 @@ The images are extracted from PDF as individual files and stored in a subfolder
343
363
  ```
344
364
  Options:
345
365
  -o,--output-dir <arg> Specifies the output directory for generated files
366
+ -p,--password <arg> Specifies the password for an encrypted PDF
367
+ -f,--format <arg> List of output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images). Default: json
368
+ -q,--quiet Suppresses console logging output
369
+ --content-safety-off <arg> Disables one or more content safety filters. Accepts a list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
346
370
  --keep-line-breaks Preserves original line breaks in the extracted text
347
- --content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
348
- --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
349
- --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
350
- --markdown Sets the data extraction output format to Markdown
351
- --html Sets the data extraction output format to HTML
371
+ --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
372
+ ```
373
+
374
+ The legacy options (for backward compatibility):
375
+
376
+ ```
352
377
  --no-json Disables the JSON output format
353
- -p,--password <arg> Specifies the password for an encrypted PDF
378
+ --html Sets the data extraction output format to HTML
354
379
  --pdf Generates a new PDF file where the extracted layout data is visualized as annotations
355
- --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
380
+ --markdown Sets the data extraction output format to Markdown
381
+ --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
382
+ --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
356
383
  ```
357
384
 
358
385
  ### Schema of the JSON output
@@ -1,7 +1,8 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
- opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=WL7qTsX214L0jXxlSDesYadRVpdrsLQd2Hgum5BdD1s,4962
3
+ opendataloader_pdf/__init__.py,sha256=xkTyVWNu1W2YrI1tPpGnd11DwwcwFDyBp8b4agLdd7A,64
4
+ opendataloader_pdf/__main__.py,sha256=lmla4yz3SaYBfRJXOXnwO_8ID31-Ja20aQmomiz1eEc,84
5
+ opendataloader_pdf/wrapper.py,sha256=0Erld0Cpu5rXhT46WpqNkrg0bBg15iBmaH-AU9CfdNU,8409
5
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
7
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
8
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +14,9 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
14
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
15
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
16
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=Z9WU68Tw5ckOTgnlUPJs_Jub_C6ZGyQ-0sqjjSNMYYk,20477542
17
- opendataloader_pdf-1.0.5.dist-info/METADATA,sha256=RNIDw03Rwl4wGRSPIhbHR6VyTzhc7cnlYHEEIajZBTk,25452
18
- opendataloader_pdf-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-1.0.5.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-1.0.5.dist-info/RECORD,,
17
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=0mrZFl8pVTrUIf8Nk09wN9KJPWW1U8ZMvdyB9sEPt-c,20485612
18
+ opendataloader_pdf-1.1.0.dist-info/METADATA,sha256=jk-YV7lX5a9wqL57cLhAhJr_bgXEOBSY2RGL67lDN34,25270
19
+ opendataloader_pdf-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ opendataloader_pdf-1.1.0.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
21
+ opendataloader_pdf-1.1.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
22
+ opendataloader_pdf-1.1.0.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ opendataloader-pdf = opendataloader_pdf.wrapper:main