paddleocr-haystack 0.1.0__tar.gz → 1.0.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ # Changelog
2
+
3
+ ## [integrations/paddleocr-v0.1.0] - 2025-12-10
4
+
5
+ ### 🚀 Features
6
+
7
+ - Add PaddleOCR-VL document converter (#2567)
8
+
9
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paddleocr-haystack
3
- Version: 0.1.0
3
+ Version: 1.0.0
4
4
  Summary: An integration of PaddleOCR with Haystack
5
5
  Project-URL: Documentation, https://github.com/haystack-core-integrations/tree/main/integrations/paddleocr#readme
6
6
  Project-URL: Issues, https://github.com/haystack-core-integrations/issues
@@ -10,14 +10,13 @@ License-Expression: Apache-2.0
10
10
  License-File: LICENSE.txt
11
11
  Classifier: Development Status :: 4 - Beta
12
12
  Classifier: Programming Language :: Python
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
17
16
  Classifier: Programming Language :: Python :: Implementation :: CPython
18
17
  Classifier: Programming Language :: Python :: Implementation :: PyPy
19
- Requires-Python: >=3.9
20
- Requires-Dist: haystack-ai>=2.19.0
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: haystack-ai>=2.22.0
21
20
  Requires-Dist: paddleocr>=3.3.2
22
21
  Requires-Dist: paddlex[serving]>=3.3.10
23
22
  Requires-Dist: requests>=2.25.0
@@ -7,7 +7,7 @@ name = "paddleocr-haystack"
7
7
  dynamic = ["version"]
8
8
  description = 'An integration of PaddleOCR with Haystack'
9
9
  readme = "README.md"
10
- requires-python = ">=3.9"
10
+ requires-python = ">=3.10"
11
11
  license = "Apache-2.0"
12
12
  keywords = []
13
13
  authors = [
@@ -16,7 +16,6 @@ authors = [
16
16
  classifiers = [
17
17
  "Development Status :: 4 - Beta",
18
18
  "Programming Language :: Python",
19
- "Programming Language :: Python :: 3.9",
20
19
  "Programming Language :: Python :: 3.10",
21
20
  "Programming Language :: Python :: 3.11",
22
21
  "Programming Language :: Python :: 3.12",
@@ -24,7 +23,7 @@ classifiers = [
24
23
  "Programming Language :: Python :: Implementation :: PyPy",
25
24
  ]
26
25
  dependencies = [
27
- "haystack-ai>=2.19.0",
26
+ "haystack-ai>=2.22.0",
28
27
  "paddleocr>=3.3.2",
29
28
  "paddlex[serving]>=3.3.10",
30
29
  "requests>=2.25.0",
@@ -52,7 +51,7 @@ dependencies = ["haystack-pydoc-tools", "ruff"]
52
51
 
53
52
  [tool.hatch.envs.default.scripts]
54
53
  docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
55
- fmt = "ruff check --fix {args} && ruff format {args}"
54
+ fmt = "ruff check --fix {args}; ruff format {args}"
56
55
  fmt-check = "ruff check {args} && ruff format --check {args}"
57
56
 
58
57
  [tool.hatch.envs.test]
@@ -80,7 +79,6 @@ check_untyped_defs = true
80
79
  disallow_incomplete_defs = true
81
80
 
82
81
  [tool.ruff]
83
- target-version = "py39"
84
82
  line-length = 120
85
83
 
86
84
  [tool.ruff.lint]
@@ -127,10 +125,6 @@ ignore = [
127
125
  "B008",
128
126
  "S101",
129
127
  ]
130
- unfixable = [
131
- # Don't touch unused imports
132
- "F401",
133
- ]
134
128
 
135
129
  [tool.ruff.lint.isort]
136
130
  known-first-party = ["haystack_integrations"]
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
  import base64
5
5
  from pathlib import Path
6
- from typing import Any, Literal, Optional, Union
6
+ from typing import Any, Literal
7
7
 
8
8
  import requests
9
9
  from haystack import Document, component, default_from_dict, default_to_dict, logging
@@ -24,7 +24,7 @@ from paddlex.inference.serving.schemas.shared.ocr import FileType # type: ignor
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- FileTypeInput = Union[Literal["pdf", "image"], None]
27
+ FileTypeInput = Literal["pdf", "image"] | None
28
28
 
29
29
  # Supported image file extensions
30
30
  _IMAGE_EXTENSIONS = {
@@ -41,9 +41,9 @@ _PDF_EXTENSIONS = {".pdf"}
41
41
 
42
42
 
43
43
  def _infer_file_type_from_source(
44
- source: Union[str, Path, ByteStream],
45
- mime_type: Optional[str] = None,
46
- ) -> Optional[FileType]:
44
+ source: str | Path | ByteStream,
45
+ mime_type: str | None = None,
46
+ ) -> FileType | None:
47
47
  """
48
48
  Infer file type from file extension or MIME type.
49
49
 
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
56
56
  determined.
57
57
  """
58
58
  # Try to get extension from file path
59
- file_path: Optional[str] = None
59
+ file_path: str | None = None
60
60
 
61
61
  # Check if source is a file path
62
62
  if isinstance(source, (str, Path)):
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
86
86
  return None
87
87
 
88
88
 
89
- def _normalize_file_type(file_type: Optional[FileTypeInput]) -> Optional[FileType]:
89
+ def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
90
90
  """
91
91
  Normalize file type input to the numeric format expected by the API.
92
92
 
@@ -145,26 +145,26 @@ class PaddleOCRVLDocumentConverter:
145
145
  *,
146
146
  api_url: str,
147
147
  access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
148
- file_type: Optional[FileTypeInput] = None,
149
- use_doc_orientation_classify: Optional[bool] = None,
150
- use_doc_unwarping: Optional[bool] = None,
151
- use_layout_detection: Optional[bool] = None,
152
- use_chart_recognition: Optional[bool] = None,
153
- layout_threshold: Optional[Union[float, dict]] = None,
154
- layout_nms: Optional[bool] = None,
155
- layout_unclip_ratio: Optional[Union[float, tuple[float, float], dict]] = None,
156
- layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
157
- prompt_label: Optional[str] = None,
158
- format_block_content: Optional[bool] = None,
159
- repetition_penalty: Optional[float] = None,
160
- temperature: Optional[float] = None,
161
- top_p: Optional[float] = None,
162
- min_pixels: Optional[int] = None,
163
- max_pixels: Optional[int] = None,
164
- prettify_markdown: Optional[bool] = None,
165
- show_formula_number: Optional[bool] = None,
166
- visualize: Optional[bool] = None,
167
- additional_params: Optional[dict[str, Any]] = None,
148
+ file_type: FileTypeInput = None,
149
+ use_doc_orientation_classify: bool | None = None,
150
+ use_doc_unwarping: bool | None = None,
151
+ use_layout_detection: bool | None = None,
152
+ use_chart_recognition: bool | None = None,
153
+ layout_threshold: float | dict | None = None,
154
+ layout_nms: bool | None = None,
155
+ layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
156
+ layout_merge_bboxes_mode: str | dict | None = None,
157
+ prompt_label: str | None = None,
158
+ format_block_content: bool | None = None,
159
+ repetition_penalty: float | None = None,
160
+ temperature: float | None = None,
161
+ top_p: float | None = None,
162
+ min_pixels: int | None = None,
163
+ max_pixels: int | None = None,
164
+ prettify_markdown: bool | None = None,
165
+ show_formula_number: bool | None = None,
166
+ visualize: bool | None = None,
167
+ additional_params: dict[str, Any] | None = None,
168
168
  ):
169
169
  """
170
170
  Create a `PaddleOCRVLDocumentConverter` component.
@@ -372,7 +372,7 @@ class PaddleOCRVLDocumentConverter:
372
372
 
373
373
  # Prepare headers with authentication
374
374
  access_token_value = self.access_token.resolve_value() if self.access_token else None
375
- headers = {"Content-Type": "application/json"}
375
+ headers = {"Content-Type": "application/json", "Client-Platform": "haystack"}
376
376
  if access_token_value:
377
377
  headers["Authorization"] = f"token {access_token_value}"
378
378
 
@@ -421,8 +421,8 @@ class PaddleOCRVLDocumentConverter:
421
421
  @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
422
422
  def run(
423
423
  self,
424
- sources: list[Union[str, Path, ByteStream]],
425
- meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
424
+ sources: list[str | Path | ByteStream],
425
+ meta: dict[str, Any] | list[dict[str, Any]] | None = None,
426
426
  ) -> dict[str, Any]:
427
427
  """
428
428
  Convert image or PDF files to Documents.
@@ -448,7 +448,7 @@ class PaddleOCRVLDocumentConverter:
448
448
 
449
449
  meta_list = normalize_metadata(meta, sources_count=len(sources))
450
450
 
451
- for source, metadata in zip(sources, meta_list):
451
+ for source, metadata in zip(sources, meta_list, strict=True):
452
452
  try:
453
453
  bytestream = get_bytestream_from_source(source)
454
454
  except Exception as e: