PyPI - paddleocr-haystack - Versions diffs - 0.1.0__tar.gz → 1.0.0__tar.gz - Mend

paddleocr-haystack 0.1.0.tar.gz → 1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

paddleocr_haystack-1.0.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,9 @@
+# Changelog
+## [integrations/paddleocr-v0.1.0] - 2025-12-10
+### 🚀 Features
+- Add PaddleOCR-VL document converter (#2567)
+<!-- generated by git-cliff -->

{paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: paddleocr-haystack
-Version: 0.1.0
+Version: 1.0.0
 Summary: An integration of PaddleOCR with Haystack
 Project-URL: Documentation, https://github.com/haystack-core-integrations/tree/main/integrations/paddleocr#readme
 Project-URL: Issues, https://github.com/haystack-core-integrations/issues
@@ -10,14 +10,13 @@ License-Expression: Apache-2.0
 License-File: LICENSE.txt
 Classifier: Development Status :: 4 - Beta
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=3.9
-Requires-Dist: haystack-ai>=2.19.0
+Requires-Python: >=3.10
+Requires-Dist: haystack-ai>=2.22.0
 Requires-Dist: paddleocr>=3.3.2
 Requires-Dist: paddlex[serving]>=3.3.10
 Requires-Dist: requests>=2.25.0

{paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ name = "paddleocr-haystack"
 dynamic = ["version"]
 description = 'An integration of PaddleOCR with Haystack'
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = "Apache-2.0"
 keywords = []
 authors = [
@@ -16,7 +16,6 @@ authors = [
 classifiers = [
   "Development Status :: 4 - Beta",
   "Programming Language :: Python",
-  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
@@ -24,7 +23,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "haystack-ai>=2.19.0",
+  "haystack-ai>=2.22.0",
   "paddleocr>=3.3.2",
   "paddlex[serving]>=3.3.10",
   "requests>=2.25.0",
@@ -52,7 +51,7 @@ dependencies = ["haystack-pydoc-tools", "ruff"]
 [tool.hatch.envs.default.scripts]
 docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
-fmt = "ruff check --fix {args} && ruff format {args}"
+fmt = "ruff check --fix {args}; ruff format {args}"
 fmt-check = "ruff check {args} && ruff format --check {args}"
 [tool.hatch.envs.test]
@@ -80,7 +79,6 @@ check_untyped_defs = true
 disallow_incomplete_defs = true
 [tool.ruff]
-target-version = "py39"
 line-length = 120
 [tool.ruff.lint]
@@ -127,10 +125,6 @@ ignore = [
   "B008",
   "S101",
 ]
-unfixable = [
-  # Don't touch unused imports
-  "F401",
-]
 [tool.ruff.lint.isort]
 known-first-party = ["haystack_integrations"]

{paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import base64
 from pathlib import Path
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 import requests
 from haystack import Document, component, default_from_dict, default_to_dict, logging
@@ -24,7 +24,7 @@ from paddlex.inference.serving.schemas.shared.ocr import FileType  # type: ignor
 logger = logging.getLogger(__name__)
-FileTypeInput = Union[Literal["pdf", "image"], None]
+FileTypeInput = Literal["pdf", "image"] | None
 # Supported image file extensions
 _IMAGE_EXTENSIONS = {
@@ -41,9 +41,9 @@ _PDF_EXTENSIONS = {".pdf"}
 def _infer_file_type_from_source(
-    source: Union[str, Path, ByteStream],
-    mime_type: Optional[str] = None,
-) -> Optional[FileType]:
+    source: str | Path | ByteStream,
+    mime_type: str | None = None,
+) -> FileType | None:
     """
     Infer file type from file extension or MIME type.
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
         determined.
     """
     # Try to get extension from file path
-    file_path: Optional[str] = None
+    file_path: str | None = None
     # Check if source is a file path
     if isinstance(source, (str, Path)):
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
     return None
-def _normalize_file_type(file_type: Optional[FileTypeInput]) -> Optional[FileType]:
+def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
     """
     Normalize file type input to the numeric format expected by the API.
@@ -145,26 +145,26 @@ class PaddleOCRVLDocumentConverter:
         *,
         api_url: str,
         access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
-        file_type: Optional[FileTypeInput] = None,
-        use_doc_orientation_classify: Optional[bool] = None,
-        use_doc_unwarping: Optional[bool] = None,
-        use_layout_detection: Optional[bool] = None,
-        use_chart_recognition: Optional[bool] = None,
-        layout_threshold: Optional[Union[float, dict]] = None,
-        layout_nms: Optional[bool] = None,
-        layout_unclip_ratio: Optional[Union[float, tuple[float, float], dict]] = None,
-        layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
-        prompt_label: Optional[str] = None,
-        format_block_content: Optional[bool] = None,
-        repetition_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        prettify_markdown: Optional[bool] = None,
-        show_formula_number: Optional[bool] = None,
-        visualize: Optional[bool] = None,
-        additional_params: Optional[dict[str, Any]] = None,
+        file_type: FileTypeInput = None,
+        use_doc_orientation_classify: bool | None = None,
+        use_doc_unwarping: bool | None = None,
+        use_layout_detection: bool | None = None,
+        use_chart_recognition: bool | None = None,
+        layout_threshold: float | dict | None = None,
+        layout_nms: bool | None = None,
+        layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
+        layout_merge_bboxes_mode: str | dict | None = None,
+        prompt_label: str | None = None,
+        format_block_content: bool | None = None,
+        repetition_penalty: float | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        min_pixels: int | None = None,
+        max_pixels: int | None = None,
+        prettify_markdown: bool | None = None,
+        show_formula_number: bool | None = None,
+        visualize: bool | None = None,
+        additional_params: dict[str, Any] | None = None,
     ):
         """
         Create a `PaddleOCRVLDocumentConverter` component.
@@ -372,7 +372,7 @@ class PaddleOCRVLDocumentConverter:
         # Prepare headers with authentication
         access_token_value = self.access_token.resolve_value() if self.access_token else None
-        headers = {"Content-Type": "application/json"}
+        headers = {"Content-Type": "application/json", "Client-Platform": "haystack"}
         if access_token_value:
             headers["Authorization"] = f"token {access_token_value}"
@@ -421,8 +421,8 @@ class PaddleOCRVLDocumentConverter:
     @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
     def run(
         self,
-        sources: list[Union[str, Path, ByteStream]],
-        meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
+        sources: list[str | Path | ByteStream],
+        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
     ) -> dict[str, Any]:
         """
         Convert image or PDF files to Documents.
@@ -448,7 +448,7 @@ class PaddleOCRVLDocumentConverter:
         meta_list = normalize_metadata(meta, sources_count=len(sources))
-        for source, metadata in zip(sources, meta_list):
+        for source, metadata in zip(sources, meta_list, strict=True):
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e: