paddleocr-haystack 0.1.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddleocr_haystack-1.1.0/CHANGELOG.md +20 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/PKG-INFO +5 -6
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/pyproject.toml +5 -11
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +106 -36
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/test_paddleocr_vl_document_converter.py +34 -4
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/.gitignore +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/LICENSE.txt +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/README.md +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/pydoc/config_docusaurus.yml +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/src/haystack_integrations/components/converters/paddleocr/__init__.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/__init__.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/conftest.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/test_files/sample_img.jpg +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/test_files/sample_pdf.pdf +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [integrations/paddleocr-v1.0.0] - 2026-01-12
|
|
4
|
+
|
|
5
|
+
### 🧹 Chores
|
|
6
|
+
|
|
7
|
+
- Make fmt command more forgiving (#2671)
|
|
8
|
+
- [**breaking**] Paddleocr - drop Python 3.9 and use X|Y typing (#2714)
|
|
9
|
+
|
|
10
|
+
### 🌀 Miscellaneous
|
|
11
|
+
|
|
12
|
+
- Feat: Add 'Client-Platform' header for server processing for PaddleOCR (#2657)
|
|
13
|
+
|
|
14
|
+
## [integrations/paddleocr-v0.1.0] - 2025-12-10
|
|
15
|
+
|
|
16
|
+
### 🚀 Features
|
|
17
|
+
|
|
18
|
+
- Add PaddleOCR-VL document converter (#2567)
|
|
19
|
+
|
|
20
|
+
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paddleocr-haystack
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: An integration of PaddleOCR with Haystack
|
|
5
5
|
Project-URL: Documentation, https://github.com/haystack-core-integrations/tree/main/integrations/paddleocr#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/haystack-core-integrations/issues
|
|
@@ -10,16 +10,15 @@ License-Expression: Apache-2.0
|
|
|
10
10
|
License-File: LICENSE.txt
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Programming Language :: Python
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
16
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
17
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
19
|
-
Requires-Python: >=3.9
|
|
20
|
-
Requires-Dist: haystack-ai>=2.
|
|
21
|
-
Requires-Dist: paddleocr>=3.
|
|
22
|
-
Requires-Dist: paddlex[serving]>=3.
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: haystack-ai>=2.22.0
|
|
20
|
+
Requires-Dist: paddleocr>=3.4.0
|
|
21
|
+
Requires-Dist: paddlex[serving]>=3.4.0
|
|
23
22
|
Requires-Dist: requests>=2.25.0
|
|
24
23
|
Description-Content-Type: text/markdown
|
|
25
24
|
|
|
@@ -7,7 +7,7 @@ name = "paddleocr-haystack"
|
|
|
7
7
|
dynamic = ["version"]
|
|
8
8
|
description = 'An integration of PaddleOCR with Haystack'
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.9"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
11
|
license = "Apache-2.0"
|
|
12
12
|
keywords = []
|
|
13
13
|
authors = [
|
|
@@ -16,7 +16,6 @@ authors = [
|
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Development Status :: 4 - Beta",
|
|
18
18
|
"Programming Language :: Python",
|
|
19
|
-
"Programming Language :: Python :: 3.9",
|
|
20
19
|
"Programming Language :: Python :: 3.10",
|
|
21
20
|
"Programming Language :: Python :: 3.11",
|
|
22
21
|
"Programming Language :: Python :: 3.12",
|
|
@@ -24,9 +23,9 @@ classifiers = [
|
|
|
24
23
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
25
24
|
]
|
|
26
25
|
dependencies = [
|
|
27
|
-
"haystack-ai>=2.
|
|
28
|
-
"paddleocr>=3.
|
|
29
|
-
"paddlex[serving]>=3.
|
|
26
|
+
"haystack-ai>=2.22.0",
|
|
27
|
+
"paddleocr>=3.4.0",
|
|
28
|
+
"paddlex[serving]>=3.4.0",
|
|
30
29
|
"requests>=2.25.0",
|
|
31
30
|
]
|
|
32
31
|
|
|
@@ -52,7 +51,7 @@ dependencies = ["haystack-pydoc-tools", "ruff"]
|
|
|
52
51
|
|
|
53
52
|
[tool.hatch.envs.default.scripts]
|
|
54
53
|
docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
|
|
55
|
-
fmt = "ruff check --fix {args} && ruff format {args}"
|
|
54
|
+
fmt = "ruff check --fix {args}; ruff format {args}"
|
|
56
55
|
fmt-check = "ruff check {args} && ruff format --check {args}"
|
|
57
56
|
|
|
58
57
|
[tool.hatch.envs.test]
|
|
@@ -80,7 +79,6 @@ check_untyped_defs = true
|
|
|
80
79
|
disallow_incomplete_defs = true
|
|
81
80
|
|
|
82
81
|
[tool.ruff]
|
|
83
|
-
target-version = "py39"
|
|
84
82
|
line-length = 120
|
|
85
83
|
|
|
86
84
|
[tool.ruff.lint]
|
|
@@ -127,10 +125,6 @@ ignore = [
|
|
|
127
125
|
"B008",
|
|
128
126
|
"S101",
|
|
129
127
|
]
|
|
130
|
-
unfixable = [
|
|
131
|
-
# Don't touch unused imports
|
|
132
|
-
"F401",
|
|
133
|
-
]
|
|
134
128
|
|
|
135
129
|
[tool.ruff.lint.isort]
|
|
136
130
|
known-first-party = ["haystack_integrations"]
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import base64
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Literal
|
|
6
|
+
from typing import Any, Literal
|
|
7
7
|
|
|
8
8
|
import requests
|
|
9
9
|
from haystack import Document, component, default_from_dict, default_to_dict, logging
|
|
@@ -24,7 +24,7 @@ from paddlex.inference.serving.schemas.shared.ocr import FileType # type: ignor
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
FileTypeInput =
|
|
27
|
+
FileTypeInput = Literal["pdf", "image"] | None
|
|
28
28
|
|
|
29
29
|
# Supported image file extensions
|
|
30
30
|
_IMAGE_EXTENSIONS = {
|
|
@@ -41,9 +41,9 @@ _PDF_EXTENSIONS = {".pdf"}
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
def _infer_file_type_from_source(
|
|
44
|
-
source:
|
|
45
|
-
mime_type:
|
|
46
|
-
) ->
|
|
44
|
+
source: str | Path | ByteStream,
|
|
45
|
+
mime_type: str | None = None,
|
|
46
|
+
) -> FileType | None:
|
|
47
47
|
"""
|
|
48
48
|
Infer file type from file extension or MIME type.
|
|
49
49
|
|
|
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
|
|
|
56
56
|
determined.
|
|
57
57
|
"""
|
|
58
58
|
# Try to get extension from file path
|
|
59
|
-
file_path:
|
|
59
|
+
file_path: str | None = None
|
|
60
60
|
|
|
61
61
|
# Check if source is a file path
|
|
62
62
|
if isinstance(source, (str, Path)):
|
|
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
|
|
|
86
86
|
return None
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
def _normalize_file_type(file_type:
|
|
89
|
+
def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
|
|
90
90
|
"""
|
|
91
91
|
Normalize file type input to the numeric format expected by the API.
|
|
92
92
|
|
|
@@ -145,36 +145,45 @@ class PaddleOCRVLDocumentConverter:
|
|
|
145
145
|
*,
|
|
146
146
|
api_url: str,
|
|
147
147
|
access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
|
|
148
|
-
file_type:
|
|
149
|
-
use_doc_orientation_classify:
|
|
150
|
-
use_doc_unwarping:
|
|
151
|
-
use_layout_detection:
|
|
152
|
-
use_chart_recognition:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
148
|
+
file_type: FileTypeInput = None,
|
|
149
|
+
use_doc_orientation_classify: bool | None = False,
|
|
150
|
+
use_doc_unwarping: bool | None = False,
|
|
151
|
+
use_layout_detection: bool | None = None,
|
|
152
|
+
use_chart_recognition: bool | None = None,
|
|
153
|
+
use_seal_recognition: bool | None = None,
|
|
154
|
+
use_ocr_for_image_block: bool | None = None,
|
|
155
|
+
layout_threshold: float | dict | None = None,
|
|
156
|
+
layout_nms: bool | None = None,
|
|
157
|
+
layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
|
|
158
|
+
layout_merge_bboxes_mode: str | dict | None = None,
|
|
159
|
+
layout_shape_mode: str | None = None,
|
|
160
|
+
prompt_label: str | None = None,
|
|
161
|
+
format_block_content: bool | None = None,
|
|
162
|
+
repetition_penalty: float | None = None,
|
|
163
|
+
temperature: float | None = None,
|
|
164
|
+
top_p: float | None = None,
|
|
165
|
+
min_pixels: int | None = None,
|
|
166
|
+
max_pixels: int | None = None,
|
|
167
|
+
max_new_tokens: int | None = None,
|
|
168
|
+
merge_layout_blocks: bool | None = None,
|
|
169
|
+
markdown_ignore_labels: list[str] | None = None,
|
|
170
|
+
vlm_extra_args: dict | None = None,
|
|
171
|
+
prettify_markdown: bool | None = None,
|
|
172
|
+
show_formula_number: bool | None = None,
|
|
173
|
+
restructure_pages: bool | None = None,
|
|
174
|
+
merge_tables: bool | None = None,
|
|
175
|
+
relevel_titles: bool | None = None,
|
|
176
|
+
visualize: bool | None = None,
|
|
177
|
+
additional_params: dict[str, Any] | None = None,
|
|
168
178
|
):
|
|
169
179
|
"""
|
|
170
180
|
Create a `PaddleOCRVLDocumentConverter` component.
|
|
171
181
|
|
|
172
182
|
:param api_url:
|
|
173
183
|
API URL. To obtain the API URL, visit the [PaddleOCR official
|
|
174
|
-
website](https://aistudio.baidu.com/paddleocr
|
|
175
|
-
**API** button
|
|
176
|
-
|
|
177
|
-
`API_URL`.
|
|
184
|
+
website](https://aistudio.baidu.com/paddleocr), click the
|
|
185
|
+
**API** button, choose the example code for PaddleOCR-VL, and copy
|
|
186
|
+
the `API_URL`.
|
|
178
187
|
:param access_token:
|
|
179
188
|
AI Studio access token. You can obtain it from [this
|
|
180
189
|
page](https://aistudio.baidu.com/account/accessToken).
|
|
@@ -193,6 +202,10 @@ class PaddleOCRVLDocumentConverter:
|
|
|
193
202
|
Whether to enable the layout detection function.
|
|
194
203
|
:param use_chart_recognition:
|
|
195
204
|
Whether to enable the chart recognition function.
|
|
205
|
+
:param use_seal_recognition:
|
|
206
|
+
Whether to enable the seal recognition function.
|
|
207
|
+
:param use_ocr_for_image_block:
|
|
208
|
+
Whether to recognize text in image blocks.
|
|
196
209
|
:param layout_threshold:
|
|
197
210
|
Layout detection threshold. Can be a float or a dict with
|
|
198
211
|
page-specific thresholds.
|
|
@@ -204,9 +217,11 @@ class PaddleOCRVLDocumentConverter:
|
|
|
204
217
|
dict with page-specific values.
|
|
205
218
|
:param layout_merge_bboxes_mode:
|
|
206
219
|
Layout merge bounding boxes mode. Can be a string or a dict.
|
|
220
|
+
:param layout_shape_mode:
|
|
221
|
+
Layout shape mode.
|
|
207
222
|
:param prompt_label:
|
|
208
223
|
Prompt type for the VLM. Possible values are "ocr", "formula",
|
|
209
|
-
"table", and "
|
|
224
|
+
"table", "chart", "seal", and "spotting".
|
|
210
225
|
:param format_block_content:
|
|
211
226
|
Whether to format block content.
|
|
212
227
|
:param repetition_penalty:
|
|
@@ -219,10 +234,25 @@ class PaddleOCRVLDocumentConverter:
|
|
|
219
234
|
Minimum number of pixels allowed during VLM preprocessing.
|
|
220
235
|
:param max_pixels:
|
|
221
236
|
Maximum number of pixels allowed during VLM preprocessing.
|
|
237
|
+
:param max_new_tokens:
|
|
238
|
+
Maximum number of tokens generated by the VLM.
|
|
239
|
+
:param merge_layout_blocks:
|
|
240
|
+
Whether to merge the layout detection boxes for cross-column or
|
|
241
|
+
staggered top and bottom columns.
|
|
242
|
+
:param markdown_ignore_labels:
|
|
243
|
+
Layout labels that need to be ignored in Markdown.
|
|
244
|
+
:param vlm_extra_args:
|
|
245
|
+
Additional configuration parameters for the VLM.
|
|
222
246
|
:param prettify_markdown:
|
|
223
247
|
Whether to prettify the output Markdown text.
|
|
224
248
|
:param show_formula_number:
|
|
225
249
|
Whether to include formula numbers in the output markdown text.
|
|
250
|
+
:param restructure_pages:
|
|
251
|
+
Whether to restructure results across multiple pages.
|
|
252
|
+
:param merge_tables:
|
|
253
|
+
Whether to merge tables across pages.
|
|
254
|
+
:param relevel_titles:
|
|
255
|
+
Whether to relevel titles.
|
|
226
256
|
:param visualize:
|
|
227
257
|
Whether to return visualization results.
|
|
228
258
|
:param additional_params:
|
|
@@ -235,10 +265,13 @@ class PaddleOCRVLDocumentConverter:
|
|
|
235
265
|
self.use_doc_unwarping = use_doc_unwarping
|
|
236
266
|
self.use_layout_detection = use_layout_detection
|
|
237
267
|
self.use_chart_recognition = use_chart_recognition
|
|
268
|
+
self.use_seal_recognition = use_seal_recognition
|
|
269
|
+
self.use_ocr_for_image_block = use_ocr_for_image_block
|
|
238
270
|
self.layout_threshold = layout_threshold
|
|
239
271
|
self.layout_nms = layout_nms
|
|
240
272
|
self.layout_unclip_ratio = layout_unclip_ratio
|
|
241
273
|
self.layout_merge_bboxes_mode = layout_merge_bboxes_mode
|
|
274
|
+
self.layout_shape_mode = layout_shape_mode
|
|
242
275
|
self.prompt_label = prompt_label
|
|
243
276
|
self.format_block_content = format_block_content
|
|
244
277
|
self.repetition_penalty = repetition_penalty
|
|
@@ -246,8 +279,15 @@ class PaddleOCRVLDocumentConverter:
|
|
|
246
279
|
self.top_p = top_p
|
|
247
280
|
self.min_pixels = min_pixels
|
|
248
281
|
self.max_pixels = max_pixels
|
|
282
|
+
self.max_new_tokens = max_new_tokens
|
|
283
|
+
self.merge_layout_blocks = merge_layout_blocks
|
|
284
|
+
self.markdown_ignore_labels = markdown_ignore_labels
|
|
285
|
+
self.vlm_extra_args = vlm_extra_args
|
|
249
286
|
self.prettify_markdown = prettify_markdown
|
|
250
287
|
self.show_formula_number = show_formula_number
|
|
288
|
+
self.restructure_pages = restructure_pages
|
|
289
|
+
self.merge_tables = merge_tables
|
|
290
|
+
self.relevel_titles = relevel_titles
|
|
251
291
|
self.visualize = visualize
|
|
252
292
|
self.additional_params = additional_params
|
|
253
293
|
|
|
@@ -267,10 +307,13 @@ class PaddleOCRVLDocumentConverter:
|
|
|
267
307
|
use_doc_unwarping=self.use_doc_unwarping,
|
|
268
308
|
use_layout_detection=self.use_layout_detection,
|
|
269
309
|
use_chart_recognition=self.use_chart_recognition,
|
|
310
|
+
use_seal_recognition=self.use_seal_recognition,
|
|
311
|
+
use_ocr_for_image_block=self.use_ocr_for_image_block,
|
|
270
312
|
layout_threshold=self.layout_threshold,
|
|
271
313
|
layout_nms=self.layout_nms,
|
|
272
314
|
layout_unclip_ratio=self.layout_unclip_ratio,
|
|
273
315
|
layout_merge_bboxes_mode=self.layout_merge_bboxes_mode,
|
|
316
|
+
layout_shape_mode=self.layout_shape_mode,
|
|
274
317
|
prompt_label=self.prompt_label,
|
|
275
318
|
format_block_content=self.format_block_content,
|
|
276
319
|
repetition_penalty=self.repetition_penalty,
|
|
@@ -278,8 +321,15 @@ class PaddleOCRVLDocumentConverter:
|
|
|
278
321
|
top_p=self.top_p,
|
|
279
322
|
min_pixels=self.min_pixels,
|
|
280
323
|
max_pixels=self.max_pixels,
|
|
324
|
+
max_new_tokens=self.max_new_tokens,
|
|
325
|
+
merge_layout_blocks=self.merge_layout_blocks,
|
|
326
|
+
markdown_ignore_labels=self.markdown_ignore_labels,
|
|
327
|
+
vlm_extra_args=self.vlm_extra_args,
|
|
281
328
|
prettify_markdown=self.prettify_markdown,
|
|
282
329
|
show_formula_number=self.show_formula_number,
|
|
330
|
+
restructure_pages=self.restructure_pages,
|
|
331
|
+
merge_tables=self.merge_tables,
|
|
332
|
+
relevel_titles=self.relevel_titles,
|
|
283
333
|
visualize=self.visualize,
|
|
284
334
|
additional_params=self.additional_params,
|
|
285
335
|
)
|
|
@@ -331,6 +381,10 @@ class PaddleOCRVLDocumentConverter:
|
|
|
331
381
|
request_data["useLayoutDetection"] = self.use_layout_detection
|
|
332
382
|
if self.use_chart_recognition is not None:
|
|
333
383
|
request_data["useChartRecognition"] = self.use_chart_recognition
|
|
384
|
+
if self.use_seal_recognition is not None:
|
|
385
|
+
request_data["useSealRecognition"] = self.use_seal_recognition
|
|
386
|
+
if self.use_ocr_for_image_block is not None:
|
|
387
|
+
request_data["useOcrForImageBlock"] = self.use_ocr_for_image_block
|
|
334
388
|
if self.layout_threshold is not None:
|
|
335
389
|
request_data["layoutThreshold"] = self.layout_threshold
|
|
336
390
|
if self.layout_nms is not None:
|
|
@@ -339,6 +393,8 @@ class PaddleOCRVLDocumentConverter:
|
|
|
339
393
|
request_data["layoutUnclipRatio"] = self.layout_unclip_ratio
|
|
340
394
|
if self.layout_merge_bboxes_mode is not None:
|
|
341
395
|
request_data["layoutMergeBboxesMode"] = self.layout_merge_bboxes_mode
|
|
396
|
+
if self.layout_shape_mode is not None:
|
|
397
|
+
request_data["layoutShapeMode"] = self.layout_shape_mode
|
|
342
398
|
if self.prompt_label is not None:
|
|
343
399
|
request_data["promptLabel"] = self.prompt_label
|
|
344
400
|
if self.format_block_content is not None:
|
|
@@ -353,10 +409,24 @@ class PaddleOCRVLDocumentConverter:
|
|
|
353
409
|
request_data["minPixels"] = self.min_pixels
|
|
354
410
|
if self.max_pixels is not None:
|
|
355
411
|
request_data["maxPixels"] = self.max_pixels
|
|
412
|
+
if self.max_new_tokens is not None:
|
|
413
|
+
request_data["maxNewTokens"] = self.max_new_tokens
|
|
414
|
+
if self.merge_layout_blocks is not None:
|
|
415
|
+
request_data["mergeLayoutBlocks"] = self.merge_layout_blocks
|
|
416
|
+
if self.markdown_ignore_labels is not None:
|
|
417
|
+
request_data["markdownIgnoreLabels"] = self.markdown_ignore_labels
|
|
418
|
+
if self.vlm_extra_args is not None:
|
|
419
|
+
request_data["vlmExtraArgs"] = self.vlm_extra_args
|
|
356
420
|
if self.prettify_markdown is not None:
|
|
357
421
|
request_data["prettifyMarkdown"] = self.prettify_markdown
|
|
358
422
|
if self.show_formula_number is not None:
|
|
359
423
|
request_data["showFormulaNumber"] = self.show_formula_number
|
|
424
|
+
if self.restructure_pages is not None:
|
|
425
|
+
request_data["restructurePages"] = self.restructure_pages
|
|
426
|
+
if self.merge_tables is not None:
|
|
427
|
+
request_data["mergeTables"] = self.merge_tables
|
|
428
|
+
if self.relevel_titles is not None:
|
|
429
|
+
request_data["relevelTitles"] = self.relevel_titles
|
|
360
430
|
if self.visualize is not None:
|
|
361
431
|
request_data["visualize"] = self.visualize
|
|
362
432
|
if self.additional_params is not None:
|
|
@@ -372,7 +442,7 @@ class PaddleOCRVLDocumentConverter:
|
|
|
372
442
|
|
|
373
443
|
# Prepare headers with authentication
|
|
374
444
|
access_token_value = self.access_token.resolve_value() if self.access_token else None
|
|
375
|
-
headers = {"Content-Type": "application/json"}
|
|
445
|
+
headers = {"Content-Type": "application/json", "Client-Platform": "haystack"}
|
|
376
446
|
if access_token_value:
|
|
377
447
|
headers["Authorization"] = f"token {access_token_value}"
|
|
378
448
|
|
|
@@ -421,8 +491,8 @@ class PaddleOCRVLDocumentConverter:
|
|
|
421
491
|
@component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
|
|
422
492
|
def run(
|
|
423
493
|
self,
|
|
424
|
-
sources: list[
|
|
425
|
-
meta:
|
|
494
|
+
sources: list[str | Path | ByteStream],
|
|
495
|
+
meta: dict[str, Any] | list[dict[str, Any]] | None = None,
|
|
426
496
|
) -> dict[str, Any]:
|
|
427
497
|
"""
|
|
428
498
|
Convert image or PDF files to Documents.
|
|
@@ -448,7 +518,7 @@ class PaddleOCRVLDocumentConverter:
|
|
|
448
518
|
|
|
449
519
|
meta_list = normalize_metadata(meta, sources_count=len(sources))
|
|
450
520
|
|
|
451
|
-
for source, metadata in zip(sources, meta_list):
|
|
521
|
+
for source, metadata in zip(sources, meta_list, strict=True):
|
|
452
522
|
try:
|
|
453
523
|
bytestream = get_bytestream_from_source(source)
|
|
454
524
|
except Exception as e:
|
{paddleocr_haystack-0.1.0 → paddleocr_haystack-1.1.0}/tests/test_paddleocr_vl_document_converter.py
RENAMED
|
@@ -42,8 +42,8 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
42
42
|
assert converter.access_token == Secret.from_env_var("AISTUDIO_ACCESS_TOKEN")
|
|
43
43
|
assert converter.api_url == "http://test-api-url.com"
|
|
44
44
|
assert converter.file_type is None
|
|
45
|
-
assert converter.use_doc_orientation_classify is
|
|
46
|
-
assert converter.use_doc_unwarping is
|
|
45
|
+
assert converter.use_doc_orientation_classify is False
|
|
46
|
+
assert converter.use_doc_unwarping is False
|
|
47
47
|
assert converter.use_layout_detection is None
|
|
48
48
|
assert converter.use_chart_recognition is None
|
|
49
49
|
assert converter.layout_threshold is None
|
|
@@ -126,14 +126,17 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
126
126
|
"type": "env_var",
|
|
127
127
|
},
|
|
128
128
|
"file_type": None,
|
|
129
|
-
"use_doc_orientation_classify":
|
|
130
|
-
"use_doc_unwarping":
|
|
129
|
+
"use_doc_orientation_classify": False,
|
|
130
|
+
"use_doc_unwarping": False,
|
|
131
131
|
"use_layout_detection": None,
|
|
132
132
|
"use_chart_recognition": None,
|
|
133
|
+
"use_seal_recognition": None,
|
|
134
|
+
"use_ocr_for_image_block": None,
|
|
133
135
|
"layout_threshold": None,
|
|
134
136
|
"layout_nms": None,
|
|
135
137
|
"layout_unclip_ratio": None,
|
|
136
138
|
"layout_merge_bboxes_mode": None,
|
|
139
|
+
"layout_shape_mode": None,
|
|
137
140
|
"prompt_label": None,
|
|
138
141
|
"format_block_content": None,
|
|
139
142
|
"repetition_penalty": None,
|
|
@@ -141,8 +144,15 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
141
144
|
"top_p": None,
|
|
142
145
|
"min_pixels": None,
|
|
143
146
|
"max_pixels": None,
|
|
147
|
+
"max_new_tokens": None,
|
|
148
|
+
"merge_layout_blocks": None,
|
|
149
|
+
"markdown_ignore_labels": None,
|
|
150
|
+
"vlm_extra_args": None,
|
|
144
151
|
"prettify_markdown": None,
|
|
145
152
|
"show_formula_number": None,
|
|
153
|
+
"restructure_pages": None,
|
|
154
|
+
"merge_tables": None,
|
|
155
|
+
"relevel_titles": None,
|
|
146
156
|
"visualize": None,
|
|
147
157
|
"additional_params": None,
|
|
148
158
|
},
|
|
@@ -158,10 +168,13 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
158
168
|
use_doc_unwarping=False,
|
|
159
169
|
use_layout_detection=True,
|
|
160
170
|
use_chart_recognition=False,
|
|
171
|
+
use_seal_recognition=None,
|
|
172
|
+
use_ocr_for_image_block=None,
|
|
161
173
|
layout_threshold=0.7,
|
|
162
174
|
layout_nms=False,
|
|
163
175
|
layout_unclip_ratio=2.0,
|
|
164
176
|
layout_merge_bboxes_mode="separate",
|
|
177
|
+
layout_shape_mode=None,
|
|
165
178
|
prompt_label="formula",
|
|
166
179
|
format_block_content=False,
|
|
167
180
|
repetition_penalty=1.2,
|
|
@@ -169,8 +182,15 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
169
182
|
top_p=0.95,
|
|
170
183
|
min_pixels=200,
|
|
171
184
|
max_pixels=2000,
|
|
185
|
+
max_new_tokens=None,
|
|
186
|
+
merge_layout_blocks=None,
|
|
187
|
+
markdown_ignore_labels=None,
|
|
188
|
+
vlm_extra_args=None,
|
|
172
189
|
prettify_markdown=True,
|
|
173
190
|
show_formula_number=True,
|
|
191
|
+
restructure_pages=None,
|
|
192
|
+
merge_tables=None,
|
|
193
|
+
relevel_titles=None,
|
|
174
194
|
visualize=False,
|
|
175
195
|
additional_params={},
|
|
176
196
|
)
|
|
@@ -190,10 +210,13 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
190
210
|
"use_doc_unwarping": False,
|
|
191
211
|
"use_layout_detection": True,
|
|
192
212
|
"use_chart_recognition": False,
|
|
213
|
+
"use_seal_recognition": None,
|
|
214
|
+
"use_ocr_for_image_block": None,
|
|
193
215
|
"layout_threshold": 0.7,
|
|
194
216
|
"layout_nms": False,
|
|
195
217
|
"layout_unclip_ratio": 2.0,
|
|
196
218
|
"layout_merge_bboxes_mode": "separate",
|
|
219
|
+
"layout_shape_mode": None,
|
|
197
220
|
"prompt_label": "formula",
|
|
198
221
|
"format_block_content": False,
|
|
199
222
|
"repetition_penalty": 1.2,
|
|
@@ -201,8 +224,15 @@ class TestPaddleOCRVLDocumentConverter:
|
|
|
201
224
|
"top_p": 0.95,
|
|
202
225
|
"min_pixels": 200,
|
|
203
226
|
"max_pixels": 2000,
|
|
227
|
+
"max_new_tokens": None,
|
|
228
|
+
"merge_layout_blocks": None,
|
|
229
|
+
"markdown_ignore_labels": None,
|
|
230
|
+
"vlm_extra_args": None,
|
|
204
231
|
"prettify_markdown": True,
|
|
205
232
|
"show_formula_number": True,
|
|
233
|
+
"restructure_pages": None,
|
|
234
|
+
"merge_tables": None,
|
|
235
|
+
"relevel_titles": None,
|
|
206
236
|
"visualize": False,
|
|
207
237
|
"additional_params": {},
|
|
208
238
|
},
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|