PyPI - opendataloader-pdf - Versions diffs - 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

opendataloader-pdf 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of opendataloader-pdf might be problematic. Click here for more details.

Files changed (8) hide show

opendataloader_pdf/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
-from .wrapper import run
+from .wrapper import run, convert
-__all__ = ["run"]
+__all__ = ["run", "convert"]

opendataloader_pdf/jar/opendataloader-pdf-cli.jar CHANGED Viewed

Binary file

opendataloader_pdf/wrapper.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import argparse
 import subprocess
 import sys
-import importlib.resources as importlib_resources
-import locale
+import importlib.resources as resources
 from pathlib import Path
+from typing import List, Optional
 # The consistent name of the JAR file bundled with the package
 _JAR_NAME = "opendataloader-pdf-cli.jar"
@@ -41,9 +41,6 @@ def run(
         no_json: If True, disable the JSON output.
         debug: If True, prints all messages from the CLI to the console during execution.
-    Returns:
-        The stdout from the CLI tool if successful.
     Raises:
         FileNotFoundError: If the 'java' command is not found or input_path is invalid.
         subprocess.CalledProcessError: If the CLI tool returns a non-zero exit code.
@@ -52,6 +49,7 @@ def run(
         raise FileNotFoundError(f"Input file or folder not found: {input_path}")
     args = []
+    args.append(input_path)
     if output_folder:
         args.extend(["--output-dir", output_folder])
     if password:
@@ -75,49 +73,94 @@ def run(
     if no_json:
         args.append("--no-json")
-    args.append(input_path)
+    # Run the command
+    run_jar(args, quiet=not debug)
+def convert(
+    input_path: List[str],
+    output_dir: Optional[str] = None,
+    password: Optional[str] = None,
+    format: Optional[List[str]] = None,
+    quiet: bool = False,
+    content_safety_off: Optional[List[str]] = None,
+    keep_line_breaks: bool = False,
+    replace_invalid_chars: Optional[str] = None,
+) -> None:
+    """
+    Convert PDF(s) into the requested output format(s).
+    Args:
+        input_path: One or more input PDF file paths or directories
+        output_dir: Directory where outputs are written
+        password: Password for encrypted PDFs
+        format: List of output formats (e.g., ["json", "html"])
+        quiet: Suppress CLI logging output
+        content_safety_off: List of content safety filters to disable
+        keep_line_breaks: Preserve line breaks in text output
+        replace_invalid_chars: Replacement character for invalid/unrecognized characters
+    """
+    args: List[str] = []
+    args.extend(input_path)
+    if output_dir:
+        args.extend(["--output-dir", output_dir])
+    if password:
+        args.extend(["--password", password])
+    if format:
+        args.extend(["--format", *format])
+    if quiet:
+        args.append("--quiet")
+    if content_safety_off:
+        args.extend(["--content-safety-off", *content_safety_off])
+    if keep_line_breaks:
+        args.append("--keep-line-breaks")
+    if replace_invalid_chars:
+        args.extend(["--replace-invalid-chars", replace_invalid_chars])
+    # Run the command
+    run_jar(args, quiet)
+def run_jar(args: List[str], quiet: bool = False) -> str:
+    """Run the opendataloader-pdf JAR with the given arguments."""
     try:
-        # Find the JAR file within the package
-        jar_ref = importlib_resources.files("opendataloader_pdf").joinpath(
-            "jar", _JAR_NAME
-        )
-        with importlib_resources.as_file(jar_ref) as jar_path:
-            command = ["java", "-jar", str(jar_path)] + args
+        # Access the embedded JAR inside the package
+        jar_ref = resources.files("opendataloader_pdf").joinpath("jar", _JAR_NAME)
+        with resources.as_file(jar_ref) as jar_path:
+            command = ["java", "-jar", str(jar_path), *args]
-            if debug:
-                process = subprocess.Popen(
+            if quiet:
+                # Quiet mode → capture all output
+                result = subprocess.run(
                     command,
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.STDOUT,
+                    capture_output=True,
                     text=True,
-                    encoding=locale.getpreferredencoding(False),
+                    check=True,
+                    encoding="utf-8",
                 )
+                return result.stdout
-                output_lines = []
-                for line in iter(process.stdout.readline, ""):
+            # Streaming mode → live output
+            with subprocess.Popen(
+                command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                encoding="utf-8",
+            ) as process:
+                output_lines: List[str] = []
+                for line in process.stdout:
                     sys.stdout.write(line)
                     output_lines.append(line)
-                process.stdout.close()
                 return_code = process.wait()
                 captured_output = "".join(output_lines)
                 if return_code:
-                    # Manually raise error with the combined output
                     raise subprocess.CalledProcessError(
                         return_code, command, output=captured_output
                     )
                 return captured_output
-            else:
-                result = subprocess.run(
-                    command,
-                    capture_output=True,
-                    text=True,
-                    check=True,
-                    encoding=locale.getpreferredencoding(False),
-                )
-                return result.stdout
     except FileNotFoundError:
         print(
@@ -126,16 +169,16 @@ def run(
         )
         raise
-    except subprocess.CalledProcessError as e:
+    except subprocess.CalledProcessError as error:
         print("Error running opendataloader-pdf CLI.", file=sys.stderr)
-        print(f"Return code: {e.returncode}", file=sys.stderr)
-        if e.output:
-            print(f"Output: {e.output}", file=sys.stderr)
-        elif e.stderr:
-            print(f"Stderr: {e.stderr}", file=sys.stderr)
-        if e.stdout:
-            print(f"Stdout: {e.stdout}", file=sys.stderr)
-        raise e
+        print(f"Return code: {error.returncode}", file=sys.stderr)
+        if error.output:
+            print(f"Output: {error.output}", file=sys.stderr)
+        if error.stderr:
+            print(f"Stderr: {error.stderr}", file=sys.stderr)
+        if error.stdout:
+            print(f"Stdout: {error.stdout}", file=sys.stderr)
+        raise
 def main(argv=None) -> int:
@@ -143,39 +186,47 @@ def main(argv=None) -> int:
     parser = argparse.ArgumentParser(
         description="Run the opendataloader-pdf CLI using the bundled JAR."
     )
-    parser.add_argument("input_path", help="Path to the input PDF file or directory.")
+    parser.add_argument(
+        "input_path", nargs="+", help="Path to the input PDF file or directory."
+    )
     parser.add_argument(
         "-o",
         "--output-dir",
-        dest="output_folder",
         help="Directory where outputs are written.",
     )
     parser.add_argument("-p", "--password", help="Password for encrypted PDFs.")
     parser.add_argument(
-        "--replace-invalid-chars",
-        help="Replacement character for invalid or unrecognized characters.",
-    )
-    parser.add_argument(
-        "--content-safety-off",
-        help="Disable content safety filtering (expects the desired mode).",
-    )
-    parser.add_argument(
-        "--markdown",
-        dest="generate_markdown",
-        action="store_true",
-        help="Generate Markdown output.",
+        "-f",
+        "--format",
+        nargs="+",
+        choices=[
+            "json",
+            "text",
+            "html",
+            "pdf",
+            "markdown",
+            "markdown-with-html",
+            "markdown-with-images",
+        ],
+        help="Output format(s) to generate.",
     )
     parser.add_argument(
-        "--html",
-        dest="generate_html",
+        "-q",
+        "--quiet",
         action="store_true",
-        help="Generate HTML output.",
+        help="Suppress CLI logging output.",
     )
     parser.add_argument(
-        "--pdf",
-        dest="generate_annotated_pdf",
-        action="store_true",
-        help="Generate annotated PDF output.",
+        "--content-safety-off",
+        nargs="+",
+        choices=[
+            "all",
+            "hidden-text",
+            "off-page",
+            "tiny",
+            "hidden-ocg",
+        ],
+        help="Disables one or more content safety filters. Accepts a list of filter names.",
     )
     parser.add_argument(
         "--keep-line-breaks",
@@ -183,31 +234,13 @@ def main(argv=None) -> int:
         help="Preserve line breaks in text output.",
     )
     parser.add_argument(
-        "--markdown-with-html",
-        dest="html_in_markdown",
-        action="store_true",
-        help="Allow raw HTML within Markdown output.",
-    )
-    parser.add_argument(
-        "--markdown-with-images",
-        dest="add_image_to_markdown",
-        action="store_true",
-        help="Embed images in Markdown output.",
-    )
-    parser.add_argument(
-        "--no-json",
-        action="store_true",
-        help="Disable JSON output generation.",
-    )
-    parser.add_argument(
-        "--debug",
-        action="store_true",
-        help="Stream CLI logs directly to stdout.",
+        "--replace-invalid-chars",
+        help="Replacement character for invalid or unrecognized characters.",
     )
     args = parser.parse_args(argv)
     try:
-        run(**vars(args))
+        convert(**vars(args))
     except FileNotFoundError as err:
         print(err, file=sys.stderr)
         return 1

{opendataloader_pdf-1.0.6.dist-info → opendataloader_pdf-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opendataloader-pdf
-Version: 1.0.6
+Version: 1.1.0
 Summary: A Python wrapper for the opendataloader-pdf Java CLI.
 Home-page: https://github.com/opendataloader-project/opendataloader-pdf
 Author: opendataloader-project
@@ -97,47 +97,42 @@ pip install -U opendataloader-pdf
 ### Usage
-- input_path can be either the path to a single document or the path to a folder.
-- If you don’t specify an output_folder, the output data will be saved in the same directory as the input document.
+input_path can be either the path to a single document or the path to a folder.
 ```python
 import opendataloader_pdf
-opendataloader_pdf.run(
-    input_path="path/to/document.pdf",
-    output_folder="path/to/output",
-    generate_markdown=True,
-    generate_html=True,
-    generate_annotated_pdf=True,
-    debug=True,
+opendataloader_pdf.convert(
+    input_path=["path/to/document.pdf", "path/to/folder"],
+    output_dir="path/to/output",
+    format=["json", "html", "pdf", "markdown"]
 )
 ```
-- If you want to run it via CLI, you can use the following command:
+If you want to run it via CLI, you can use the following command on the terminal:
-```sh
-opendataloader-pdf path/to/document.pdf --markdown --html --pdf
+```bash
+opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
 ```
-### Function: run()
+### Function: convert()
 The main function to process PDFs.
-| Parameter                | Type   | Required | Default      | Description                                                                                                                                 |
-|--------------------------| ------ | -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
-| `input_path`             | `str`  | ✅ Yes    | —            | Path to the input PDF file or folder.                                                                                                       |
-| `output_folder`          | `str`  | No       | input folder | Path to the output folder.                                                                                                                  |
-| `password`               | `str`  | No       | `None`       | Password for the PDF file.                                                                                                                  |
-| `replace_invalid_chars`  | `str`  | No       | `" "`       | Character to replace invalid or unrecognized characters (e.g., �, \u0000)                                                                   |
-| `content_safety_off`     | `str`  | No       | `None`       | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
-| `generate_markdown`      | `bool` | No       | `False`      | If `True`, generates a Markdown output file.                                                                                                |
-| `generate_html`          | `bool` | No       | `False`      | If `True`, generates an HTML output file.                                                                                                   |
-| `generate_annotated_pdf` | `bool` | No       | `False`      | If `True`, generates an annotated PDF output file.                                                                                          |
-| `keep_line_breaks`       | `bool` | No       | `False`      | If `True`, keeps line breaks in the output.                                                                                                 |
-| `html_in_markdown`       | `bool` | No       | `False`      | If `True`, uses HTML in the Markdown output.                                                                                                |
-| `add_image_to_markdown`  | `bool` | No       | `False`      | If `True`, adds images to the Markdown output.                                                                                              |
-| `no_json`                | `bool` | No       | `False`      | If `True`, disables the JSON output.                                                                                                        |
-| `debug`                  | `bool` | No       | `False`      | If `True`, prints CLI messages to the console during execution.                                                                             |
+| Parameter                | Type           | Required | Default      | Description                                                                                                                                 |
+|--------------------------|----------------| -------- |--------------|---------------------------------------------------------------------------------------------------------------------------------------------|
+| `input_path`             | `List[str]`     | ✅ Yes    | —            | One or more PDF file paths or directories to process.                                                                                       |
+| `output_dir`             | `Optional[str]` | No       | input folder | Directory where outputs are written.                                                                                                       |
+| `password`               | `Optional[str]` | No       | `None`       | Password used for encrypted PDFs.                                                                                                           |
+| `format`                 | `Optional[List[str]]` | No | `None`       | Output formats to generate (e.g. `"json"`, `"html"`, `"pdf"`, `"text"`, `"markdown"`, `"markdown-with-html"`, `"markdown-with-images"`).                                                             |
+| `quiet`                  | `bool`          | No       | `False`      | Suppresses CLI logging output when `True`.                                                                                                  |
+| `content_safety_off`     | `Optional[List[str]]` | No | `None`       | List of content safety filters to disable (e.g. `"all"`, `"hidden-text"`, `"off-page"`, `"tiny"`, `"hidden-ocg"`).                      |
+| `keep_line_breaks`       | `bool`          | No       | `False`      | Preserves line breaks in text output when `True`.                                                                                           |
+| `replace_invalid_chars`  | `Optional[str]` | No       | `None`       | Replacement character for invalid or unrecognized characters (e.g., �, `\u0000`).                                                           |
+### Function: run()
+Deprecated.
 <br/>
@@ -368,16 +363,23 @@ The images are extracted from PDF as individual files and stored in a subfolder
 ```
 Options:
 -o,--output-dir <arg>           Specifies the output directory for generated files
+-p,--password <arg>             Specifies the password for an encrypted PDF
+-f,--format <arg>               List of output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images). Default: json
+-q,--quiet                      Suppresses console logging output
+--content-safety-off <arg>      Disables one or more content safety filters. Accepts a list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
 --keep-line-breaks              Preserves original line breaks in the extracted text
---content-safety-off <arg>      Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
---markdown-with-html            Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
---markdown-with-images          Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
---markdown                      Sets the data extraction output format to Markdown
---html                          Sets the data extraction output format to HTML
+--replace-invalid-chars <arg>   Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
+```
+The legacy options (for backward compatibility):
+```
 --no-json                       Disables the JSON output format
--p,--password <arg>             Specifies the password for an encrypted PDF
+--html                          Sets the data extraction output format to HTML
 --pdf                           Generates a new PDF file where the extracted layout data is visualized as annotations
---replace-invalid-chars <arg>   Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
+--markdown                      Sets the data extraction output format to Markdown
+--markdown-with-html            Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
+--markdown-with-images          Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
 ```
 ### Schema of the JSON output

{opendataloader_pdf-1.0.6.dist-info → opendataloader_pdf-1.1.0.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
 opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
-opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
+opendataloader_pdf/__init__.py,sha256=xkTyVWNu1W2YrI1tPpGnd11DwwcwFDyBp8b4agLdd7A,64
 opendataloader_pdf/__main__.py,sha256=lmla4yz3SaYBfRJXOXnwO_8ID31-Ja20aQmomiz1eEc,84
-opendataloader_pdf/wrapper.py,sha256=Dsvw5un_HROLcy2xX0WqoKKRnOjL081LEYC6YfpViLE,7331
+opendataloader_pdf/wrapper.py,sha256=0Erld0Cpu5rXhT46WpqNkrg0bBg15iBmaH-AU9CfdNU,8409
 opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
 opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
 opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -14,9 +14,9 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
 opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
 opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
 opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
-opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=HmcxP25ZCOJNRV9U1IXy-beAN243_iCWTBIV6JB-6S8,20477911
-opendataloader_pdf-1.0.6.dist-info/METADATA,sha256=2BWSSScAW3mmpWum3N7g-01fMZITHsmQcDBqSmGSkU0,25966
-opendataloader_pdf-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opendataloader_pdf-1.0.6.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
-opendataloader_pdf-1.0.6.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
-opendataloader_pdf-1.0.6.dist-info/RECORD,,
+opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=0mrZFl8pVTrUIf8Nk09wN9KJPWW1U8ZMvdyB9sEPt-c,20485612
+opendataloader_pdf-1.1.0.dist-info/METADATA,sha256=jk-YV7lX5a9wqL57cLhAhJr_bgXEOBSY2RGL67lDN34,25270
+opendataloader_pdf-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+opendataloader_pdf-1.1.0.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
+opendataloader_pdf-1.1.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
+opendataloader_pdf-1.1.0.dist-info/RECORD,,

{opendataloader_pdf-1.0.6.dist-info → opendataloader_pdf-1.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{opendataloader_pdf-1.0.6.dist-info → opendataloader_pdf-1.1.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{opendataloader_pdf-1.0.6.dist-info → opendataloader_pdf-1.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

opendataloader-pdf 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

Potentially problematic release.

opendataloader-pdf 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl