paddleocr-haystack 1.0.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. paddleocr_haystack-2.0.0/CHANGELOG.md +27 -0
  2. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/PKG-INFO +3 -5
  3. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/README.md +1 -1
  4. paddleocr_haystack-2.0.0/pydoc/config_docusaurus.yml +13 -0
  5. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/pyproject.toml +16 -6
  6. paddleocr_haystack-2.0.0/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +448 -0
  7. paddleocr_haystack-2.0.0/tests/test_paddleocr_vl_document_converter.py +599 -0
  8. paddleocr_haystack-1.0.0/CHANGELOG.md +0 -9
  9. paddleocr_haystack-1.0.0/pydoc/config_docusaurus.yml +0 -28
  10. paddleocr_haystack-1.0.0/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +0 -494
  11. paddleocr_haystack-1.0.0/tests/test_paddleocr_vl_document_converter.py +0 -748
  12. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/.gitignore +0 -0
  13. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/LICENSE.txt +0 -0
  14. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/src/haystack_integrations/components/converters/paddleocr/__init__.py +0 -0
  15. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/src/haystack_integrations/components/converters/py.typed +0 -0
  16. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/tests/__init__.py +0 -0
  17. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/tests/conftest.py +0 -0
  18. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/tests/test_files/sample_img.jpg +0 -0
  19. {paddleocr_haystack-1.0.0 → paddleocr_haystack-2.0.0}/tests/test_files/sample_pdf.pdf +0 -0
@@ -0,0 +1,27 @@
1
+ # Changelog
2
+
3
+ ## [integrations/paddleocr-v1.1.0] - 2026-01-29
4
+
5
+ ### 🚀 Features
6
+
7
+ - Update for PaddleOCR-VL-1.5 interface (#2782)
8
+
9
+
10
+ ## [integrations/paddleocr-v1.0.0] - 2026-01-12
11
+
12
+ ### 🧹 Chores
13
+
14
+ - Make fmt command more forgiving (#2671)
15
+ - [**breaking**] Paddleocr - drop Python 3.9 and use X|Y typing (#2714)
16
+
17
+ ### 🌀 Miscellaneous
18
+
19
+ - Feat: Add 'Client-Platform' header for server processing for PaddleOCR (#2657)
20
+
21
+ ## [integrations/paddleocr-v0.1.0] - 2025-12-10
22
+
23
+ ### 🚀 Features
24
+
25
+ - Add PaddleOCR-VL document converter (#2567)
26
+
27
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: paddleocr-haystack
3
- Version: 1.0.0
3
+ Version: 2.0.0
4
4
  Summary: An integration of PaddleOCR with Haystack
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/paddleocr#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -17,9 +17,7 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
17
17
  Classifier: Programming Language :: Python :: Implementation :: PyPy
18
18
  Requires-Python: >=3.10
19
19
  Requires-Dist: haystack-ai>=2.22.0
20
- Requires-Dist: paddleocr>=3.3.2
21
- Requires-Dist: paddlex[serving]>=3.3.10
22
- Requires-Dist: requests>=2.25.0
20
+ Requires-Dist: paddleocr>=3.7.0
23
21
  Description-Content-Type: text/markdown
24
22
 
25
23
  # paddleocr-haystack
@@ -36,4 +34,4 @@ Description-Content-Type: text/markdown
36
34
 
37
35
  Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
38
36
 
39
- To run integration tests locally, you need to export the `PADDLEOCR_VL_API_URL` and `AISTUDIO_ACCESS_TOKEN` environment variables.
37
+ To run integration tests locally, you need to export the `PADDLEOCR_BASE_URL` and `PADDLEOCR_ACCESS_TOKEN` environment variables.
@@ -12,4 +12,4 @@
12
12
 
13
13
  Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
14
14
 
15
- To run integration tests locally, you need to export the `PADDLEOCR_VL_API_URL` and `AISTUDIO_ACCESS_TOKEN` environment variables.
15
+ To run integration tests locally, you need to export the `PADDLEOCR_BASE_URL` and `PADDLEOCR_ACCESS_TOKEN` environment variables.
@@ -0,0 +1,13 @@
1
+ loaders:
2
+ - modules:
3
+ - haystack_integrations.components.converters.paddleocr.paddleocr_vl_document_converter
4
+ search_path: [../src]
5
+ processors:
6
+ - type: filter
7
+ documented_only: true
8
+ skip_empty_modules: true
9
+ renderer:
10
+ description: PaddleOCR integration for Haystack
11
+ id: integrations-paddleocr
12
+ filename: paddleocr.md
13
+ title: PaddleOCR
@@ -24,9 +24,7 @@ classifiers = [
24
24
  ]
25
25
  dependencies = [
26
26
  "haystack-ai>=2.22.0",
27
- "paddleocr>=3.3.2",
28
- "paddlex[serving]>=3.3.10",
29
- "requests>=2.25.0",
27
+ "paddleocr>=3.7.0",
30
28
  ]
31
29
 
32
30
  [project.urls]
@@ -50,7 +48,7 @@ installer = "uv"
50
48
  dependencies = ["haystack-pydoc-tools", "ruff"]
51
49
 
52
50
  [tool.hatch.envs.default.scripts]
53
- docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
51
+ docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
54
52
  fmt = "ruff check --fix {args}; ruff format {args}"
55
53
  fmt-check = "ruff check {args} && ruff format --check {args}"
56
54
 
@@ -69,7 +67,8 @@ dependencies = [
69
67
  unit = 'pytest -m "not integration" {args:tests}'
70
68
  integration = 'pytest -m "integration" {args:tests}'
71
69
  all = 'pytest {args:tests}'
72
- cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'
70
+ unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
71
+ integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}'
73
72
  types = "mypy -p haystack_integrations.components.converters.paddleocr {args}"
74
73
 
75
74
  [tool.mypy]
@@ -84,9 +83,17 @@ line-length = 120
84
83
  [tool.ruff.lint]
85
84
  select = [
86
85
  "A",
86
+ "ANN",
87
87
  "ARG",
88
88
  "B",
89
89
  "C",
90
+ "D102", # Missing docstring in public method
91
+ "D103", # Missing docstring in public function
92
+ "D205", # 1 blank line required between summary line and description
93
+ "D209", # Closing triple quotes go to new line
94
+ "D213", # summary lines must be positioned on the second physical line of the docstring
95
+ "D417", # Missing argument descriptions in the docstring
96
+ "D419", # Docstring is empty
90
97
  "DTZ",
91
98
  "E",
92
99
  "EM",
@@ -111,6 +118,8 @@ select = [
111
118
  ignore = [
112
119
  # Allow non-abstract empty methods in abstract base classes
113
120
  "B027",
121
+ # Allow Any in type annotations at dynamic boundaries
122
+ "ANN401",
114
123
  # Ignore checks for possible passwords
115
124
  "S105",
116
125
  "S106",
@@ -134,11 +143,12 @@ ban-relative-imports = "parents"
134
143
 
135
144
  [tool.ruff.lint.per-file-ignores]
136
145
  # Tests can use magic values, assertions, and relative imports
137
- "tests/**/*" = ["PLR2004", "S101", "TID252"]
146
+ "tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"]
138
147
 
139
148
  [tool.coverage.run]
140
149
  source = ["haystack_integrations"]
141
150
  branch = true
151
+ relative_files = true
142
152
  parallel = false
143
153
 
144
154
  [tool.coverage.report]
@@ -0,0 +1,448 @@
1
+ # SPDX-FileCopyrightText: 2025-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Any, Literal
7
+
8
+ from haystack import Document, component, default_from_dict, default_to_dict, logging
9
+ from haystack.components.converters.utils import (
10
+ get_bytestream_from_source,
11
+ normalize_metadata,
12
+ )
13
+ from haystack.dataclasses import ByteStream
14
+ from haystack.utils import Secret, deserialize_secrets_inplace
15
+
16
+ from paddleocr import Model, PaddleOCRClient, PaddleOCRVLOptions # type: ignore[import-untyped]
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ FileTypeInput = Literal["pdf", "image"] | None
21
+
22
+ _IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
23
+ _PDF_EXTENSIONS = {".pdf"}
24
+ _EXTENSION_FOR_FILE_TYPE = {0: ".pdf", 1: ".jpg"}
25
+
26
+
27
+ def _infer_file_type_from_source(
28
+ source: str | Path | ByteStream,
29
+ mime_type: str | None = None,
30
+ ) -> int | None:
31
+ """
32
+ Infer file type from file extension or MIME type.
33
+
34
+ :param source:
35
+ Original source (file path, Path object, or ByteStream).
36
+ :param mime_type:
37
+ MIME type of the source.
38
+ :returns:
39
+ Inferred file type: 0 for PDF, 1 for image, or None if it cannot be determined.
40
+ """
41
+ file_path: str | None = None
42
+
43
+ if isinstance(source, (str, Path)):
44
+ file_path = str(source)
45
+ elif isinstance(source, ByteStream) and source.meta:
46
+ file_path = source.meta.get("file_path")
47
+
48
+ if file_path:
49
+ extension = Path(file_path).suffix.lower()
50
+ if extension in _PDF_EXTENSIONS:
51
+ return 0
52
+ if extension in _IMAGE_EXTENSIONS:
53
+ return 1
54
+
55
+ if mime_type:
56
+ mime_lower = mime_type.lower()
57
+ if mime_lower == "application/pdf":
58
+ return 0
59
+ if mime_lower.startswith("image/"):
60
+ return 1
61
+
62
+ return None
63
+
64
+
65
+ def _normalize_file_type(file_type: FileTypeInput) -> int | None:
66
+ """
67
+ Normalize file type input to the numeric format used internally.
68
+
69
+ :param file_type:
70
+ "pdf", "image", or None for auto-detection.
71
+ Integers 0 and 1 are also accepted for deserialization round-trips.
72
+ :returns:
73
+ 0 for PDF, 1 for image, or None for auto-detection.
74
+ """
75
+ if file_type is None:
76
+ return None
77
+ if file_type in ("pdf", 0):
78
+ return 0
79
+ if file_type in ("image", 1):
80
+ return 1
81
+ if isinstance(file_type, str):
82
+ msg = f"Invalid `file_type` string: {file_type}. Must be 'pdf' or 'image'."
83
+ raise ValueError(msg)
84
+ msg = f"Invalid `file_type` value: {file_type}. Must be 'pdf', 'image', or `None`."
85
+ raise ValueError(msg)
86
+
87
+
88
+ @component
89
+ class PaddleOCRVLDocumentConverter:
90
+ """
91
+ Extracts text from documents using PaddleOCR's official document parsing API.
92
+
93
+ Uses `PaddleOCRClient` to parse documents via the PaddleOCR serving API.
94
+ For more information, please refer to:
95
+ https://www.paddleocr.ai/latest/en/version3.x/algorithm/PaddleOCR-VL/PaddleOCR-VL.html
96
+
97
+ **Usage Example:**
98
+
99
+ ```python
100
+ from haystack_integrations.components.converters.paddleocr import PaddleOCRVLDocumentConverter
101
+
102
+ converter = PaddleOCRVLDocumentConverter(
103
+ base_url="http://xxxxx.aistudio-app.com",
104
+ )
105
+ result = converter.run(sources=["sample.pdf"])
106
+ documents = result["documents"]
107
+ raw_responses = result["raw_paddleocr_responses"]
108
+ ```
109
+ """
110
+
111
+ def __init__(
112
+ self,
113
+ *,
114
+ base_url: str | None = None,
115
+ access_token: Secret = Secret.from_env_var(["PADDLEOCR_ACCESS_TOKEN", "AISTUDIO_ACCESS_TOKEN"]),
116
+ model: Model | str = Model.PADDLE_OCR_VL_16,
117
+ file_type: FileTypeInput = None,
118
+ use_doc_orientation_classify: bool | None = False,
119
+ use_doc_unwarping: bool | None = False,
120
+ use_layout_detection: bool | None = None,
121
+ use_chart_recognition: bool | None = None,
122
+ use_seal_recognition: bool | None = None,
123
+ use_ocr_for_image_block: bool | None = None,
124
+ layout_threshold: float | dict | None = None,
125
+ layout_nms: bool | None = None,
126
+ layout_unclip_ratio: float | list | dict | None = None,
127
+ layout_merge_bboxes_mode: str | dict | None = None,
128
+ layout_shape_mode: str | None = None,
129
+ prompt_label: str | None = None,
130
+ format_block_content: bool | None = None,
131
+ repetition_penalty: float | None = None,
132
+ temperature: float | None = None,
133
+ top_p: float | None = None,
134
+ min_pixels: int | None = None,
135
+ max_pixels: int | None = None,
136
+ max_new_tokens: int | None = None,
137
+ merge_layout_blocks: bool | None = None,
138
+ markdown_ignore_labels: list[str] | None = None,
139
+ vlm_extra_args: dict | None = None,
140
+ prettify_markdown: bool | None = None,
141
+ show_formula_number: bool | None = None,
142
+ restructure_pages: bool | None = None,
143
+ merge_tables: bool | None = None,
144
+ relevel_titles: bool | None = None,
145
+ visualize: bool | None = None,
146
+ additional_params: dict[str, Any] | None = None,
147
+ ) -> None:
148
+ """
149
+ Create a `PaddleOCRVLDocumentConverter` component.
150
+
151
+ :param base_url:
152
+ Base URL for the PaddleOCR API. Falls back to `PADDLEOCR_BASE_URL`
153
+ env var, then the SDK default.
154
+ :param access_token:
155
+ PaddleOCR access token. Falls back to the `PADDLEOCR_ACCESS_TOKEN` env var, then to `AISTUDIO_ACCESS_TOKEN`.
156
+ :param model:
157
+ Document parsing model. Defaults to `Model.PADDLE_OCR_VL_16`.
158
+ :param file_type:
159
+ "pdf", "image", or None for auto-detection.
160
+ :param use_doc_orientation_classify:
161
+ Enable document orientation classification.
162
+ :param use_doc_unwarping:
163
+ Enable text image unwarping.
164
+ :param use_layout_detection:
165
+ Enable layout detection.
166
+ :param use_chart_recognition:
167
+ Enable chart recognition.
168
+ :param use_seal_recognition:
169
+ Enable seal recognition.
170
+ :param use_ocr_for_image_block:
171
+ Recognize text in image blocks.
172
+ :param layout_threshold:
173
+ Layout detection threshold.
174
+ :param layout_nms:
175
+ Perform NMS on layout detection results.
176
+ :param layout_unclip_ratio:
177
+ Layout unclip ratio.
178
+ :param layout_merge_bboxes_mode:
179
+ Layout merge bounding boxes mode.
180
+ :param layout_shape_mode:
181
+ Layout shape mode.
182
+ :param prompt_label:
183
+ Prompt type for the VLM ("ocr", "formula", "table", "chart", "seal", "spotting").
184
+ :param format_block_content:
185
+ Format block content.
186
+ :param repetition_penalty:
187
+ Repetition penalty for VLM sampling.
188
+ :param temperature:
189
+ Temperature for VLM sampling.
190
+ :param top_p:
191
+ Top-p for VLM sampling.
192
+ :param min_pixels:
193
+ Minimum pixels for VLM preprocessing.
194
+ :param max_pixels:
195
+ Maximum pixels for VLM preprocessing.
196
+ :param max_new_tokens:
197
+ Maximum tokens generated by the VLM.
198
+ :param merge_layout_blocks:
199
+ Merge layout detection boxes for cross-column content.
200
+ :param markdown_ignore_labels:
201
+ Layout labels to ignore in Markdown output.
202
+ :param vlm_extra_args:
203
+ Extra configuration for the VLM.
204
+ :param prettify_markdown:
205
+ Prettify output Markdown.
206
+ :param show_formula_number:
207
+ Include formula numbers in Markdown output.
208
+ :param restructure_pages:
209
+ Restructure results across multiple pages.
210
+ :param merge_tables:
211
+ Merge tables across pages.
212
+ :param relevel_titles:
213
+ Relevel titles.
214
+ :param visualize:
215
+ Return visualization results.
216
+ :param additional_params:
217
+ Extra options passed to `PaddleOCRVLOptions.extra_options`.
218
+ """
219
+ self.base_url = base_url
220
+ self.access_token = access_token
221
+ self.model = model
222
+ self.file_type = _normalize_file_type(file_type)
223
+ self.use_doc_orientation_classify = use_doc_orientation_classify
224
+ self.use_doc_unwarping = use_doc_unwarping
225
+ self.use_layout_detection = use_layout_detection
226
+ self.use_chart_recognition = use_chart_recognition
227
+ self.use_seal_recognition = use_seal_recognition
228
+ self.use_ocr_for_image_block = use_ocr_for_image_block
229
+ self.layout_threshold = layout_threshold
230
+ self.layout_nms = layout_nms
231
+ self.layout_unclip_ratio = layout_unclip_ratio
232
+ self.layout_merge_bboxes_mode = layout_merge_bboxes_mode
233
+ self.layout_shape_mode = layout_shape_mode
234
+ self.prompt_label = prompt_label
235
+ self.format_block_content = format_block_content
236
+ self.repetition_penalty = repetition_penalty
237
+ self.temperature = temperature
238
+ self.top_p = top_p
239
+ self.min_pixels = min_pixels
240
+ self.max_pixels = max_pixels
241
+ self.max_new_tokens = max_new_tokens
242
+ self.merge_layout_blocks = merge_layout_blocks
243
+ self.markdown_ignore_labels = markdown_ignore_labels
244
+ self.vlm_extra_args = vlm_extra_args
245
+ self.prettify_markdown = prettify_markdown
246
+ self.show_formula_number = show_formula_number
247
+ self.restructure_pages = restructure_pages
248
+ self.merge_tables = merge_tables
249
+ self.relevel_titles = relevel_titles
250
+ self.visualize = visualize
251
+ self.additional_params = additional_params
252
+
253
+ def to_dict(self) -> dict[str, Any]:
254
+ """
255
+ Serialize the component to a dictionary.
256
+
257
+ :returns:
258
+ Dictionary with serialized data.
259
+ """
260
+ return default_to_dict(
261
+ self,
262
+ base_url=self.base_url,
263
+ access_token=self.access_token.to_dict(),
264
+ model=self.model if isinstance(self.model, str) else self.model.value,
265
+ file_type=self.file_type,
266
+ use_doc_orientation_classify=self.use_doc_orientation_classify,
267
+ use_doc_unwarping=self.use_doc_unwarping,
268
+ use_layout_detection=self.use_layout_detection,
269
+ use_chart_recognition=self.use_chart_recognition,
270
+ use_seal_recognition=self.use_seal_recognition,
271
+ use_ocr_for_image_block=self.use_ocr_for_image_block,
272
+ layout_threshold=self.layout_threshold,
273
+ layout_nms=self.layout_nms,
274
+ layout_unclip_ratio=self.layout_unclip_ratio,
275
+ layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
276
+ layout_shape_mode=self.layout_shape_mode,
277
+ prompt_label=self.prompt_label,
278
+ format_block_content=self.format_block_content,
279
+ repetition_penalty=self.repetition_penalty,
280
+ temperature=self.temperature,
281
+ top_p=self.top_p,
282
+ min_pixels=self.min_pixels,
283
+ max_pixels=self.max_pixels,
284
+ max_new_tokens=self.max_new_tokens,
285
+ merge_layout_blocks=self.merge_layout_blocks,
286
+ markdown_ignore_labels=self.markdown_ignore_labels,
287
+ vlm_extra_args=self.vlm_extra_args,
288
+ prettify_markdown=self.prettify_markdown,
289
+ show_formula_number=self.show_formula_number,
290
+ restructure_pages=self.restructure_pages,
291
+ merge_tables=self.merge_tables,
292
+ relevel_titles=self.relevel_titles,
293
+ visualize=self.visualize,
294
+ additional_params=self.additional_params,
295
+ )
296
+
297
+ @classmethod
298
+ def from_dict(cls, data: dict[str, Any]) -> "PaddleOCRVLDocumentConverter":
299
+ """
300
+ Deserialize the component from a dictionary.
301
+
302
+ :param data:
303
+ Dictionary to deserialize from.
304
+ :returns:
305
+ Deserialized component.
306
+ """
307
+ deserialize_secrets_inplace(data["init_parameters"], keys=["access_token"])
308
+ init_params = data["init_parameters"]
309
+ if "model" in init_params and isinstance(init_params["model"], str):
310
+ try:
311
+ init_params["model"] = Model(init_params["model"])
312
+ except ValueError:
313
+ pass
314
+ return default_from_dict(cls, data)
315
+
316
+ def _build_options(self) -> PaddleOCRVLOptions:
317
+ return PaddleOCRVLOptions(
318
+ use_doc_orientation_classify=self.use_doc_orientation_classify,
319
+ use_doc_unwarping=self.use_doc_unwarping,
320
+ use_layout_detection=self.use_layout_detection,
321
+ use_chart_recognition=self.use_chart_recognition,
322
+ use_seal_recognition=self.use_seal_recognition,
323
+ use_ocr_for_image_block=self.use_ocr_for_image_block,
324
+ layout_threshold=self.layout_threshold,
325
+ layout_nms=self.layout_nms,
326
+ layout_unclip_ratio=self.layout_unclip_ratio,
327
+ layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
328
+ layout_shape_mode=self.layout_shape_mode,
329
+ prompt_label=self.prompt_label,
330
+ format_block_content=self.format_block_content,
331
+ repetition_penalty=self.repetition_penalty,
332
+ temperature=self.temperature,
333
+ top_p=self.top_p,
334
+ min_pixels=self.min_pixels,
335
+ max_pixels=self.max_pixels,
336
+ max_new_tokens=self.max_new_tokens,
337
+ merge_layout_blocks=self.merge_layout_blocks,
338
+ markdown_ignore_labels=self.markdown_ignore_labels,
339
+ vlm_extra_args=self.vlm_extra_args,
340
+ prettify_markdown=self.prettify_markdown,
341
+ show_formula_number=self.show_formula_number,
342
+ restructure_pages=self.restructure_pages,
343
+ merge_tables=self.merge_tables,
344
+ relevel_titles=self.relevel_titles,
345
+ visualize=self.visualize,
346
+ extra_options=self.additional_params,
347
+ )
348
+
349
+ def _parse(
350
+ self, data: bytes, file_type: int, client: PaddleOCRClient, source_extension: str | None = None
351
+ ) -> tuple[str, dict[str, Any]]:
352
+ extension = source_extension if source_extension else _EXTENSION_FOR_FILE_TYPE[file_type]
353
+ with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
354
+ tmp_path = tmp.name
355
+ tmp.write(data)
356
+ try:
357
+ result = client.parse_document(
358
+ model=self.model,
359
+ file_path=tmp_path,
360
+ options=self._build_options(),
361
+ )
362
+ finally:
363
+ Path(tmp_path).unlink(missing_ok=True)
364
+
365
+ text = "\f".join(page.markdown_text for page in result.pages)
366
+ raw: dict[str, Any] = {
367
+ "job_id": result.job_id,
368
+ "pages": [page.raw for page in result.pages],
369
+ "data_info": result.data_info,
370
+ }
371
+ return text, raw
372
+
373
+ @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
374
+ def run(
375
+ self,
376
+ sources: list[str | Path | ByteStream],
377
+ meta: dict[str, Any] | list[dict[str, Any]] | None = None,
378
+ ) -> dict[str, Any]:
379
+ """
380
+ Convert image or PDF files to Documents.
381
+
382
+ :param sources:
383
+ List of image or PDF file paths or ByteStream objects.
384
+ :param meta:
385
+ Optional metadata to attach to the Documents. A single dict is applied
386
+ to all documents; a list must match the number of sources.
387
+ :returns:
388
+ A dictionary with:
389
+ - `documents`: List of created Documents.
390
+ - `raw_paddleocr_responses`: List of raw PaddleOCR API responses.
391
+ """
392
+ documents: list[Document] = []
393
+ raw_responses: list[dict[str, Any]] = []
394
+
395
+ meta_list = normalize_metadata(meta, sources_count=len(sources))
396
+ token = self.access_token.resolve_value() if self.access_token else None
397
+
398
+ kwargs: dict[str, Any] = {"client_platform": "haystack", "token": token}
399
+ if self.base_url is not None:
400
+ kwargs["base_url"] = self.base_url
401
+
402
+ with PaddleOCRClient(**kwargs) as client:
403
+ for source, metadata in zip(sources, meta_list, strict=True):
404
+ try:
405
+ bytestream = get_bytestream_from_source(source)
406
+ except Exception as e:
407
+ logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
408
+ continue
409
+
410
+ if self.file_type is not None:
411
+ file_type: int | None = self.file_type
412
+ else:
413
+ mime_type = bytestream.mime_type if bytestream.mime_type else None
414
+ file_type = _infer_file_type_from_source(source, mime_type)
415
+
416
+ if file_type is None:
417
+ logger.warning("Could not determine file type for {source}. Skipping it.", source=source)
418
+ continue
419
+
420
+ source_ext: str | None = None
421
+ if self.file_type is None:
422
+ if isinstance(source, (str, Path)):
423
+ source_ext = Path(source).suffix.lower() or None
424
+ elif isinstance(source, ByteStream) and source.meta.get("file_path"):
425
+ source_ext = Path(str(source.meta["file_path"])).suffix.lower() or None
426
+
427
+ try:
428
+ text, raw_resp = self._parse(bytestream.data, file_type, client, source_extension=source_ext)
429
+ except Exception as e:
430
+ logger.warning(
431
+ "Could not convert {source} to Document, skipping. Error: {error}",
432
+ source=source,
433
+ error=e,
434
+ )
435
+ continue
436
+
437
+ if not text:
438
+ logger.warning(
439
+ "{cls} could not extract text from {source}. Returning an empty document.",
440
+ cls=self.__class__.__name__,
441
+ source=source,
442
+ )
443
+
444
+ merged_metadata = {**bytestream.meta, **metadata}
445
+ documents.append(Document(content=text, meta=merged_metadata))
446
+ raw_responses.append(raw_resp)
447
+
448
+ return {"documents": documents, "raw_paddleocr_responses": raw_responses}