paddleocr-haystack 1.1.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/CHANGELOG.md +7 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/PKG-INFO +3 -5
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/README.md +1 -1
- paddleocr_haystack-2.0.0/pydoc/config_docusaurus.yml +13 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/pyproject.toml +16 -6
- paddleocr_haystack-2.0.0/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +448 -0
- paddleocr_haystack-2.0.0/tests/test_paddleocr_vl_document_converter.py +599 -0
- paddleocr_haystack-1.1.0/pydoc/config_docusaurus.yml +0 -28
- paddleocr_haystack-1.1.0/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +0 -564
- paddleocr_haystack-1.1.0/tests/test_paddleocr_vl_document_converter.py +0 -778
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/.gitignore +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/LICENSE.txt +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/src/haystack_integrations/components/converters/paddleocr/__init__.py +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/tests/__init__.py +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/tests/conftest.py +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/tests/test_files/sample_img.jpg +0 -0
- {paddleocr_haystack-1.1.0 → paddleocr_haystack-2.0.0}/tests/test_files/sample_pdf.pdf +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paddleocr-haystack
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: An integration of PaddleOCR with Haystack
|
|
5
5
|
Project-URL: Documentation, https://github.com/haystack-core-integrations/tree/main/integrations/paddleocr#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/haystack-core-integrations/issues
|
|
@@ -17,9 +17,7 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
|
17
17
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
18
18
|
Requires-Python: >=3.10
|
|
19
19
|
Requires-Dist: haystack-ai>=2.22.0
|
|
20
|
-
Requires-Dist: paddleocr>=3.
|
|
21
|
-
Requires-Dist: paddlex[serving]>=3.4.0
|
|
22
|
-
Requires-Dist: requests>=2.25.0
|
|
20
|
+
Requires-Dist: paddleocr>=3.7.0
|
|
23
21
|
Description-Content-Type: text/markdown
|
|
24
22
|
|
|
25
23
|
# paddleocr-haystack
|
|
@@ -36,4 +34,4 @@ Description-Content-Type: text/markdown
|
|
|
36
34
|
|
|
37
35
|
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
38
36
|
|
|
39
|
-
To run integration tests locally, you need to export the `
|
|
37
|
+
To run integration tests locally, you need to export the `PADDLEOCR_BASE_URL` and `PADDLEOCR_ACCESS_TOKEN` environment variables.
|
|
@@ -12,4 +12,4 @@
|
|
|
12
12
|
|
|
13
13
|
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
14
14
|
|
|
15
|
-
To run integration tests locally, you need to export the `
|
|
15
|
+
To run integration tests locally, you need to export the `PADDLEOCR_BASE_URL` and `PADDLEOCR_ACCESS_TOKEN` environment variables.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
loaders:
|
|
2
|
+
- modules:
|
|
3
|
+
- haystack_integrations.components.converters.paddleocr.paddleocr_vl_document_converter
|
|
4
|
+
search_path: [../src]
|
|
5
|
+
processors:
|
|
6
|
+
- type: filter
|
|
7
|
+
documented_only: true
|
|
8
|
+
skip_empty_modules: true
|
|
9
|
+
renderer:
|
|
10
|
+
description: PaddleOCR integration for Haystack
|
|
11
|
+
id: integrations-paddleocr
|
|
12
|
+
filename: paddleocr.md
|
|
13
|
+
title: PaddleOCR
|
|
@@ -24,9 +24,7 @@ classifiers = [
|
|
|
24
24
|
]
|
|
25
25
|
dependencies = [
|
|
26
26
|
"haystack-ai>=2.22.0",
|
|
27
|
-
"paddleocr>=3.
|
|
28
|
-
"paddlex[serving]>=3.4.0",
|
|
29
|
-
"requests>=2.25.0",
|
|
27
|
+
"paddleocr>=3.7.0",
|
|
30
28
|
]
|
|
31
29
|
|
|
32
30
|
[project.urls]
|
|
@@ -50,7 +48,7 @@ installer = "uv"
|
|
|
50
48
|
dependencies = ["haystack-pydoc-tools", "ruff"]
|
|
51
49
|
|
|
52
50
|
[tool.hatch.envs.default.scripts]
|
|
53
|
-
docs = ["pydoc
|
|
51
|
+
docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
|
|
54
52
|
fmt = "ruff check --fix {args}; ruff format {args}"
|
|
55
53
|
fmt-check = "ruff check {args} && ruff format --check {args}"
|
|
56
54
|
|
|
@@ -69,7 +67,8 @@ dependencies = [
|
|
|
69
67
|
unit = 'pytest -m "not integration" {args:tests}'
|
|
70
68
|
integration = 'pytest -m "integration" {args:tests}'
|
|
71
69
|
all = 'pytest {args:tests}'
|
|
72
|
-
cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'
|
|
70
|
+
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
|
|
71
|
+
integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}'
|
|
73
72
|
types = "mypy -p haystack_integrations.components.converters.paddleocr {args}"
|
|
74
73
|
|
|
75
74
|
[tool.mypy]
|
|
@@ -84,9 +83,17 @@ line-length = 120
|
|
|
84
83
|
[tool.ruff.lint]
|
|
85
84
|
select = [
|
|
86
85
|
"A",
|
|
86
|
+
"ANN",
|
|
87
87
|
"ARG",
|
|
88
88
|
"B",
|
|
89
89
|
"C",
|
|
90
|
+
"D102", # Missing docstring in public method
|
|
91
|
+
"D103", # Missing docstring in public function
|
|
92
|
+
"D205", # 1 blank line required between summary line and description
|
|
93
|
+
"D209", # Closing triple quotes go to new line
|
|
94
|
+
"D213", # summary lines must be positioned on the second physical line of the docstring
|
|
95
|
+
"D417", # Missing argument descriptions in the docstring
|
|
96
|
+
"D419", # Docstring is empty
|
|
90
97
|
"DTZ",
|
|
91
98
|
"E",
|
|
92
99
|
"EM",
|
|
@@ -111,6 +118,8 @@ select = [
|
|
|
111
118
|
ignore = [
|
|
112
119
|
# Allow non-abstract empty methods in abstract base classes
|
|
113
120
|
"B027",
|
|
121
|
+
# Allow Any in type annotations at dynamic boundaries
|
|
122
|
+
"ANN401",
|
|
114
123
|
# Ignore checks for possible passwords
|
|
115
124
|
"S105",
|
|
116
125
|
"S106",
|
|
@@ -134,11 +143,12 @@ ban-relative-imports = "parents"
|
|
|
134
143
|
|
|
135
144
|
[tool.ruff.lint.per-file-ignores]
|
|
136
145
|
# Tests can use magic values, assertions, and relative imports
|
|
137
|
-
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
146
|
+
"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"]
|
|
138
147
|
|
|
139
148
|
[tool.coverage.run]
|
|
140
149
|
source = ["haystack_integrations"]
|
|
141
150
|
branch = true
|
|
151
|
+
relative_files = true
|
|
142
152
|
parallel = false
|
|
143
153
|
|
|
144
154
|
[tool.coverage.report]
|
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
from haystack import Document, component, default_from_dict, default_to_dict, logging
|
|
9
|
+
from haystack.components.converters.utils import (
|
|
10
|
+
get_bytestream_from_source,
|
|
11
|
+
normalize_metadata,
|
|
12
|
+
)
|
|
13
|
+
from haystack.dataclasses import ByteStream
|
|
14
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
15
|
+
|
|
16
|
+
from paddleocr import Model, PaddleOCRClient, PaddleOCRVLOptions # type: ignore[import-untyped]
|
|
17
|
+
|
|
18
|
+
# Module-level logger, created through Haystack's `logging` wrapper.
logger = logging.getLogger(__name__)

# Values accepted for the public `file_type` argument; `None` requests auto-detection.
FileTypeInput = Literal["pdf", "image"] | None

# Lower-case file suffixes recognized as images when inferring the file type.
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
# Lower-case file suffixes recognized as PDFs when inferring the file type.
_PDF_EXTENSIONS = {".pdf"}
# Temp-file suffix per numeric file type (0 = PDF, 1 = image), used when no source extension is available.
_EXTENSION_FOR_FILE_TYPE = {0: ".pdf", 1: ".jpg"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _infer_file_type_from_source(
    source: str | Path | ByteStream,
    mime_type: str | None = None,
) -> int | None:
    """
    Work out whether a source is a PDF or an image.

    The file extension is consulted first. The MIME type is only used when no
    path is available or its extension is not a recognized PDF/image suffix.

    :param source:
        Original source (file path, Path object, or ByteStream). For a
        ByteStream, the path is read from its `file_path` metadata entry.
    :param mime_type:
        MIME type of the source, compared case-insensitively.
    :returns:
        0 for PDF, 1 for image, or None when neither the extension nor the
        MIME type identifies the file type.
    """
    if isinstance(source, (str, Path)):
        candidate_path: str | None = str(source)
    elif isinstance(source, ByteStream) and source.meta:
        candidate_path = source.meta.get("file_path")
    else:
        candidate_path = None

    # Extension-based detection takes precedence over the MIME type.
    if candidate_path:
        suffix = Path(candidate_path).suffix.lower()
        if suffix in _PDF_EXTENSIONS:
            return 0
        if suffix in _IMAGE_EXTENSIONS:
            return 1

    if not mime_type:
        return None

    normalized_mime = mime_type.lower()
    if normalized_mime == "application/pdf":
        return 0
    return 1 if normalized_mime.startswith("image/") else None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _normalize_file_type(file_type: FileTypeInput) -> int | None:
    """
    Normalize file type input to the numeric format used internally.

    :param file_type:
        "pdf", "image", or None for auto-detection.
        Integers 0 and 1 are also accepted for deserialization round-trips.
    :returns:
        0 for PDF, 1 for image, or None for auto-detection.
    :raises ValueError:
        If `file_type` is any other value, including booleans and floats
        that merely compare equal to 0 or 1.
    """
    if file_type is None:
        return None

    if isinstance(file_type, str):
        if file_type == "pdf":
            return 0
        if file_type == "image":
            return 1
        msg = f"Invalid `file_type` string: {file_type}. Must be 'pdf' or 'image'."
        raise ValueError(msg)

    # `bool` is a subclass of `int` and `True == 1` / `False == 0`, so a plain
    # membership test would silently treat booleans (and 0.0 / 1.0) as valid
    # file types. Only genuine integers 0 and 1 are accepted here.
    if isinstance(file_type, int) and not isinstance(file_type, bool) and file_type in (0, 1):
        return int(file_type)

    msg = f"Invalid `file_type` value: {file_type}. Must be 'pdf', 'image', or `None`."
    raise ValueError(msg)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@component
class PaddleOCRVLDocumentConverter:
    """
    Extracts text from documents using PaddleOCR's official document parsing API.

    Uses `PaddleOCRClient` to parse documents via the PaddleOCR serving API.
    For more information, please refer to:
    https://www.paddleocr.ai/latest/en/version3.x/algorithm/PaddleOCR-VL/PaddleOCR-VL.html

    **Usage Example:**

    ```python
    from haystack_integrations.components.converters.paddleocr import PaddleOCRVLDocumentConverter

    converter = PaddleOCRVLDocumentConverter(
        base_url="http://xxxxx.aistudio-app.com",
    )
    result = converter.run(sources=["sample.pdf"])
    documents = result["documents"]
    raw_responses = result["raw_paddleocr_responses"]
    ```
    """

    def __init__(
        self,
        *,
        base_url: str | None = None,
        access_token: Secret = Secret.from_env_var(["PADDLEOCR_ACCESS_TOKEN", "AISTUDIO_ACCESS_TOKEN"]),
        model: Model | str = Model.PADDLE_OCR_VL_16,
        file_type: FileTypeInput = None,
        use_doc_orientation_classify: bool | None = False,
        use_doc_unwarping: bool | None = False,
        use_layout_detection: bool | None = None,
        use_chart_recognition: bool | None = None,
        use_seal_recognition: bool | None = None,
        use_ocr_for_image_block: bool | None = None,
        layout_threshold: float | dict | None = None,
        layout_nms: bool | None = None,
        layout_unclip_ratio: float | list | dict | None = None,
        layout_merge_bboxes_mode: str | dict | None = None,
        layout_shape_mode: str | None = None,
        prompt_label: str | None = None,
        format_block_content: bool | None = None,
        repetition_penalty: float | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        max_new_tokens: int | None = None,
        merge_layout_blocks: bool | None = None,
        markdown_ignore_labels: list[str] | None = None,
        vlm_extra_args: dict | None = None,
        prettify_markdown: bool | None = None,
        show_formula_number: bool | None = None,
        restructure_pages: bool | None = None,
        merge_tables: bool | None = None,
        relevel_titles: bool | None = None,
        visualize: bool | None = None,
        additional_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Create a `PaddleOCRVLDocumentConverter` component.

        :param base_url:
            Base URL for the PaddleOCR API. Falls back to `PADDLEOCR_BASE_URL`
            env var, then the SDK default.
        :param access_token:
            PaddleOCR access token. By default it is read from the
            `PADDLEOCR_ACCESS_TOKEN` or `AISTUDIO_ACCESS_TOKEN` env var.
        :param model:
            Document parsing model. Defaults to `Model.PADDLE_OCR_VL_16`.
        :param file_type:
            "pdf", "image", or None for auto-detection.
        :param use_doc_orientation_classify:
            Enable document orientation classification.
        :param use_doc_unwarping:
            Enable text image unwarping.
        :param use_layout_detection:
            Enable layout detection.
        :param use_chart_recognition:
            Enable chart recognition.
        :param use_seal_recognition:
            Enable seal recognition.
        :param use_ocr_for_image_block:
            Recognize text in image blocks.
        :param layout_threshold:
            Layout detection threshold.
        :param layout_nms:
            Perform NMS on layout detection results.
        :param layout_unclip_ratio:
            Layout unclip ratio.
        :param layout_merge_bboxes_mode:
            Layout merge bounding boxes mode.
        :param layout_shape_mode:
            Layout shape mode.
        :param prompt_label:
            Prompt type for the VLM ("ocr", "formula", "table", "chart", "seal", "spotting").
        :param format_block_content:
            Format block content.
        :param repetition_penalty:
            Repetition penalty for VLM sampling.
        :param temperature:
            Temperature for VLM sampling.
        :param top_p:
            Top-p for VLM sampling.
        :param min_pixels:
            Minimum pixels for VLM preprocessing.
        :param max_pixels:
            Maximum pixels for VLM preprocessing.
        :param max_new_tokens:
            Maximum tokens generated by the VLM.
        :param merge_layout_blocks:
            Merge layout detection boxes for cross-column content.
        :param markdown_ignore_labels:
            Layout labels to ignore in Markdown output.
        :param vlm_extra_args:
            Extra configuration for the VLM.
        :param prettify_markdown:
            Prettify output Markdown.
        :param show_formula_number:
            Include formula numbers in Markdown output.
        :param restructure_pages:
            Restructure results across multiple pages.
        :param merge_tables:
            Merge tables across pages.
        :param relevel_titles:
            Relevel titles.
        :param visualize:
            Return visualization results.
        :param additional_params:
            Extra options passed to `PaddleOCRVLOptions.extra_options`.
        """
        self.base_url = base_url
        self.access_token = access_token
        self.model = model
        # Stored in numeric form (0 = PDF, 1 = image, None = auto-detect).
        self.file_type = _normalize_file_type(file_type)
        self.use_doc_orientation_classify = use_doc_orientation_classify
        self.use_doc_unwarping = use_doc_unwarping
        self.use_layout_detection = use_layout_detection
        self.use_chart_recognition = use_chart_recognition
        self.use_seal_recognition = use_seal_recognition
        self.use_ocr_for_image_block = use_ocr_for_image_block
        self.layout_threshold = layout_threshold
        self.layout_nms = layout_nms
        self.layout_unclip_ratio = layout_unclip_ratio
        self.layout_merge_bboxes_mode = layout_merge_bboxes_mode
        self.layout_shape_mode = layout_shape_mode
        self.prompt_label = prompt_label
        self.format_block_content = format_block_content
        self.repetition_penalty = repetition_penalty
        self.temperature = temperature
        self.top_p = top_p
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.max_new_tokens = max_new_tokens
        self.merge_layout_blocks = merge_layout_blocks
        self.markdown_ignore_labels = markdown_ignore_labels
        self.vlm_extra_args = vlm_extra_args
        self.prettify_markdown = prettify_markdown
        self.show_formula_number = show_formula_number
        self.restructure_pages = restructure_pages
        self.merge_tables = merge_tables
        self.relevel_titles = relevel_titles
        self.visualize = visualize
        self.additional_params = additional_params

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            base_url=self.base_url,
            # `run` tolerates a missing token, so serialization must not crash on it either.
            access_token=self.access_token.to_dict() if self.access_token else None,
            # Enum members are stored by value; custom model names are already plain strings.
            model=self.model if isinstance(self.model, str) else self.model.value,
            # Serialized in numeric form; `_normalize_file_type` accepts 0/1 on the way back in.
            file_type=self.file_type,
            use_doc_orientation_classify=self.use_doc_orientation_classify,
            use_doc_unwarping=self.use_doc_unwarping,
            use_layout_detection=self.use_layout_detection,
            use_chart_recognition=self.use_chart_recognition,
            use_seal_recognition=self.use_seal_recognition,
            use_ocr_for_image_block=self.use_ocr_for_image_block,
            layout_threshold=self.layout_threshold,
            layout_nms=self.layout_nms,
            layout_unclip_ratio=self.layout_unclip_ratio,
            layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
            layout_shape_mode=self.layout_shape_mode,
            prompt_label=self.prompt_label,
            format_block_content=self.format_block_content,
            repetition_penalty=self.repetition_penalty,
            temperature=self.temperature,
            top_p=self.top_p,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
            max_new_tokens=self.max_new_tokens,
            merge_layout_blocks=self.merge_layout_blocks,
            markdown_ignore_labels=self.markdown_ignore_labels,
            vlm_extra_args=self.vlm_extra_args,
            prettify_markdown=self.prettify_markdown,
            show_formula_number=self.show_formula_number,
            restructure_pages=self.restructure_pages,
            merge_tables=self.merge_tables,
            relevel_titles=self.relevel_titles,
            visualize=self.visualize,
            additional_params=self.additional_params,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "PaddleOCRVLDocumentConverter":
        """
        Deserialize the component from a dictionary.

        :param data:
            Dictionary to deserialize from. Its `init_parameters` entry is
            modified in place.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["access_token"])
        init_params = data["init_parameters"]
        if "model" in init_params and isinstance(init_params["model"], str):
            try:
                init_params["model"] = Model(init_params["model"])
            except ValueError:
                # Not a known `Model` member: keep the plain string, which
                # `__init__` accepts as a custom model name (`Model | str`).
                pass
        return default_from_dict(cls, data)

    def _build_options(self) -> PaddleOCRVLOptions:
        """
        Build the `PaddleOCRVLOptions` for a parse request from the configured attributes.

        :returns:
            Options object mirroring the init parameters; `additional_params`
            is passed as `extra_options`.
        """
        return PaddleOCRVLOptions(
            use_doc_orientation_classify=self.use_doc_orientation_classify,
            use_doc_unwarping=self.use_doc_unwarping,
            use_layout_detection=self.use_layout_detection,
            use_chart_recognition=self.use_chart_recognition,
            use_seal_recognition=self.use_seal_recognition,
            use_ocr_for_image_block=self.use_ocr_for_image_block,
            layout_threshold=self.layout_threshold,
            layout_nms=self.layout_nms,
            layout_unclip_ratio=self.layout_unclip_ratio,
            layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
            layout_shape_mode=self.layout_shape_mode,
            prompt_label=self.prompt_label,
            format_block_content=self.format_block_content,
            repetition_penalty=self.repetition_penalty,
            temperature=self.temperature,
            top_p=self.top_p,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
            max_new_tokens=self.max_new_tokens,
            merge_layout_blocks=self.merge_layout_blocks,
            markdown_ignore_labels=self.markdown_ignore_labels,
            vlm_extra_args=self.vlm_extra_args,
            prettify_markdown=self.prettify_markdown,
            show_formula_number=self.show_formula_number,
            restructure_pages=self.restructure_pages,
            merge_tables=self.merge_tables,
            relevel_titles=self.relevel_titles,
            visualize=self.visualize,
            extra_options=self.additional_params,
        )

    def _parse(
        self, data: bytes, file_type: int, client: PaddleOCRClient, source_extension: str | None = None
    ) -> tuple[str, dict[str, Any]]:
        """
        Parse one in-memory document through the PaddleOCR client.

        The bytes are written to a temporary file because the client is called
        with a file path. The temporary file is always removed afterwards.

        :param data:
            Raw bytes of the PDF or image.
        :param file_type:
            Numeric file type (0 for PDF, 1 for image); selects the default
            temp-file suffix when `source_extension` is not given.
        :param client:
            Open `PaddleOCRClient` used to send the request.
        :param source_extension:
            Suffix (including the dot) to give the temporary file, if known.
        :returns:
            A tuple of the page Markdown texts joined with form feeds and a
            dictionary with the `job_id`, raw `pages` and `data_info` of the result.
        """
        extension = source_extension or _EXTENSION_FOR_FILE_TYPE[file_type]
        tmp_path: str | None = None
        try:
            # `delete=False` so the file can be closed and then read by path; it is
            # unlinked in `finally`, which also covers a failure while writing.
            with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(data)
            result = client.parse_document(
                model=self.model,
                file_path=tmp_path,
                options=self._build_options(),
            )
        finally:
            if tmp_path is not None:
                Path(tmp_path).unlink(missing_ok=True)

        # Pages are separated by form feeds in the resulting document content.
        text = "\f".join(page.markdown_text for page in result.pages)
        raw: dict[str, Any] = {
            "job_id": result.job_id,
            "pages": [page.raw for page in result.pages],
            "data_info": result.data_info,
        }
        return text, raw

    @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
    def run(
        self,
        sources: list[str | Path | ByteStream],
        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """
        Convert image or PDF files to Documents.

        Sources that cannot be read, whose file type cannot be determined, or
        whose conversion fails are skipped with a warning.

        :param sources:
            List of image or PDF file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents. A single dict is applied
            to all documents; a list must match the number of sources.
        :returns:
            A dictionary with:
            - `documents`: List of created Documents.
            - `raw_paddleocr_responses`: List of raw PaddleOCR API responses.
        """
        documents: list[Document] = []
        raw_responses: list[dict[str, Any]] = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))
        token = self.access_token.resolve_value() if self.access_token else None

        # Only pass `base_url` when explicitly configured so the client can apply its own default.
        kwargs: dict[str, Any] = {"client_platform": "haystack", "token": token}
        if self.base_url is not None:
            kwargs["base_url"] = self.base_url

        with PaddleOCRClient(**kwargs) as client:
            for source, metadata in zip(sources, meta_list, strict=True):
                try:
                    bytestream = get_bytestream_from_source(source)
                except Exception as e:
                    logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                    continue

                # An explicitly configured file type wins; otherwise infer it per source.
                if self.file_type is not None:
                    file_type: int | None = self.file_type
                else:
                    mime_type = bytestream.mime_type or None
                    file_type = _infer_file_type_from_source(source, mime_type)

                if file_type is None:
                    logger.warning("Could not determine file type for {source}. Skipping it.", source=source)
                    continue

                # In auto-detection mode keep the source's own suffix for the temp file;
                # with a forced file type the default suffix for that type is used instead.
                source_ext: str | None = None
                if self.file_type is None:
                    if isinstance(source, (str, Path)):
                        source_ext = Path(source).suffix.lower() or None
                    elif isinstance(source, ByteStream) and source.meta.get("file_path"):
                        source_ext = Path(str(source.meta["file_path"])).suffix.lower() or None

                try:
                    text, raw_resp = self._parse(bytestream.data, file_type, client, source_extension=source_ext)
                except Exception as e:
                    logger.warning(
                        "Could not convert {source} to Document, skipping. Error: {error}",
                        source=source,
                        error=e,
                    )
                    continue

                if not text:
                    logger.warning(
                        "{cls} could not extract text from {source}. Returning an empty document.",
                        cls=self.__class__.__name__,
                        source=source,
                    )

                # User-supplied metadata overrides keys coming from the ByteStream.
                merged_metadata = {**bytestream.meta, **metadata}
                documents.append(Document(content=text, meta=merged_metadata))
                raw_responses.append(raw_resp)

        return {"documents": documents, "raw_paddleocr_responses": raw_responses}
|