paddleocr-haystack 0.1.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddleocr_haystack-1.0.0/CHANGELOG.md +9 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/PKG-INFO +3 -4
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/pyproject.toml +3 -9
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +31 -31
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/.gitignore +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/LICENSE.txt +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/README.md +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/pydoc/config_docusaurus.yml +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/src/haystack_integrations/components/converters/paddleocr/__init__.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/__init__.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/conftest.py +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/test_files/sample_img.jpg +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/test_files/sample_pdf.pdf +0 -0
- {paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/test_paddleocr_vl_document_converter.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: paddleocr-haystack
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: An integration of PaddleOCR with Haystack
|
|
5
5
|
Project-URL: Documentation, https://github.com/haystack-core-integrations/tree/main/integrations/paddleocr#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/haystack-core-integrations/issues
|
|
@@ -10,14 +10,13 @@ License-Expression: Apache-2.0
|
|
|
10
10
|
License-File: LICENSE.txt
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Programming Language :: Python
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
16
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
17
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
19
|
-
Requires-Python: >=3.9
|
|
20
|
-
Requires-Dist: haystack-ai>=2.
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: haystack-ai>=2.22.0
|
|
21
20
|
Requires-Dist: paddleocr>=3.3.2
|
|
22
21
|
Requires-Dist: paddlex[serving]>=3.3.10
|
|
23
22
|
Requires-Dist: requests>=2.25.0
|
|
@@ -7,7 +7,7 @@ name = "paddleocr-haystack"
|
|
|
7
7
|
dynamic = ["version"]
|
|
8
8
|
description = 'An integration of PaddleOCR with Haystack'
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.9"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
11
|
license = "Apache-2.0"
|
|
12
12
|
keywords = []
|
|
13
13
|
authors = [
|
|
@@ -16,7 +16,6 @@ authors = [
|
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Development Status :: 4 - Beta",
|
|
18
18
|
"Programming Language :: Python",
|
|
19
|
-
"Programming Language :: Python :: 3.9",
|
|
20
19
|
"Programming Language :: Python :: 3.10",
|
|
21
20
|
"Programming Language :: Python :: 3.11",
|
|
22
21
|
"Programming Language :: Python :: 3.12",
|
|
@@ -24,7 +23,7 @@ classifiers = [
|
|
|
24
23
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
25
24
|
]
|
|
26
25
|
dependencies = [
|
|
27
|
-
"haystack-ai>=2.
|
|
26
|
+
"haystack-ai>=2.22.0",
|
|
28
27
|
"paddleocr>=3.3.2",
|
|
29
28
|
"paddlex[serving]>=3.3.10",
|
|
30
29
|
"requests>=2.25.0",
|
|
@@ -52,7 +51,7 @@ dependencies = ["haystack-pydoc-tools", "ruff"]
|
|
|
52
51
|
|
|
53
52
|
[tool.hatch.envs.default.scripts]
|
|
54
53
|
docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
|
|
55
|
-
fmt = "ruff check --fix {args}
|
|
54
|
+
fmt = "ruff check --fix {args}; ruff format {args}"
|
|
56
55
|
fmt-check = "ruff check {args} && ruff format --check {args}"
|
|
57
56
|
|
|
58
57
|
[tool.hatch.envs.test]
|
|
@@ -80,7 +79,6 @@ check_untyped_defs = true
|
|
|
80
79
|
disallow_incomplete_defs = true
|
|
81
80
|
|
|
82
81
|
[tool.ruff]
|
|
83
|
-
target-version = "py39"
|
|
84
82
|
line-length = 120
|
|
85
83
|
|
|
86
84
|
[tool.ruff.lint]
|
|
@@ -127,10 +125,6 @@ ignore = [
|
|
|
127
125
|
"B008",
|
|
128
126
|
"S101",
|
|
129
127
|
]
|
|
130
|
-
unfixable = [
|
|
131
|
-
# Don't touch unused imports
|
|
132
|
-
"F401",
|
|
133
|
-
]
|
|
134
128
|
|
|
135
129
|
[tool.ruff.lint.isort]
|
|
136
130
|
known-first-party = ["haystack_integrations"]
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import base64
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Literal
|
|
6
|
+
from typing import Any, Literal
|
|
7
7
|
|
|
8
8
|
import requests
|
|
9
9
|
from haystack import Document, component, default_from_dict, default_to_dict, logging
|
|
@@ -24,7 +24,7 @@ from paddlex.inference.serving.schemas.shared.ocr import FileType # type: ignor
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
FileTypeInput =
|
|
27
|
+
FileTypeInput = Literal["pdf", "image"] | None
|
|
28
28
|
|
|
29
29
|
# Supported image file extensions
|
|
30
30
|
_IMAGE_EXTENSIONS = {
|
|
@@ -41,9 +41,9 @@ _PDF_EXTENSIONS = {".pdf"}
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
def _infer_file_type_from_source(
|
|
44
|
-
source:
|
|
45
|
-
mime_type:
|
|
46
|
-
) ->
|
|
44
|
+
source: str | Path | ByteStream,
|
|
45
|
+
mime_type: str | None = None,
|
|
46
|
+
) -> FileType | None:
|
|
47
47
|
"""
|
|
48
48
|
Infer file type from file extension or MIME type.
|
|
49
49
|
|
|
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
|
|
|
56
56
|
determined.
|
|
57
57
|
"""
|
|
58
58
|
# Try to get extension from file path
|
|
59
|
-
file_path:
|
|
59
|
+
file_path: str | None = None
|
|
60
60
|
|
|
61
61
|
# Check if source is a file path
|
|
62
62
|
if isinstance(source, (str, Path)):
|
|
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
|
|
|
86
86
|
return None
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
def _normalize_file_type(file_type:
|
|
89
|
+
def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
|
|
90
90
|
"""
|
|
91
91
|
Normalize file type input to the numeric format expected by the API.
|
|
92
92
|
|
|
@@ -145,26 +145,26 @@ class PaddleOCRVLDocumentConverter:
|
|
|
145
145
|
*,
|
|
146
146
|
api_url: str,
|
|
147
147
|
access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
|
|
148
|
-
file_type:
|
|
149
|
-
use_doc_orientation_classify:
|
|
150
|
-
use_doc_unwarping:
|
|
151
|
-
use_layout_detection:
|
|
152
|
-
use_chart_recognition:
|
|
153
|
-
layout_threshold:
|
|
154
|
-
layout_nms:
|
|
155
|
-
layout_unclip_ratio:
|
|
156
|
-
layout_merge_bboxes_mode:
|
|
157
|
-
prompt_label:
|
|
158
|
-
format_block_content:
|
|
159
|
-
repetition_penalty:
|
|
160
|
-
temperature:
|
|
161
|
-
top_p:
|
|
162
|
-
min_pixels:
|
|
163
|
-
max_pixels:
|
|
164
|
-
prettify_markdown:
|
|
165
|
-
show_formula_number:
|
|
166
|
-
visualize:
|
|
167
|
-
additional_params:
|
|
148
|
+
file_type: FileTypeInput = None,
|
|
149
|
+
use_doc_orientation_classify: bool | None = None,
|
|
150
|
+
use_doc_unwarping: bool | None = None,
|
|
151
|
+
use_layout_detection: bool | None = None,
|
|
152
|
+
use_chart_recognition: bool | None = None,
|
|
153
|
+
layout_threshold: float | dict | None = None,
|
|
154
|
+
layout_nms: bool | None = None,
|
|
155
|
+
layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
|
|
156
|
+
layout_merge_bboxes_mode: str | dict | None = None,
|
|
157
|
+
prompt_label: str | None = None,
|
|
158
|
+
format_block_content: bool | None = None,
|
|
159
|
+
repetition_penalty: float | None = None,
|
|
160
|
+
temperature: float | None = None,
|
|
161
|
+
top_p: float | None = None,
|
|
162
|
+
min_pixels: int | None = None,
|
|
163
|
+
max_pixels: int | None = None,
|
|
164
|
+
prettify_markdown: bool | None = None,
|
|
165
|
+
show_formula_number: bool | None = None,
|
|
166
|
+
visualize: bool | None = None,
|
|
167
|
+
additional_params: dict[str, Any] | None = None,
|
|
168
168
|
):
|
|
169
169
|
"""
|
|
170
170
|
Create a `PaddleOCRVLDocumentConverter` component.
|
|
@@ -372,7 +372,7 @@ class PaddleOCRVLDocumentConverter:
|
|
|
372
372
|
|
|
373
373
|
# Prepare headers with authentication
|
|
374
374
|
access_token_value = self.access_token.resolve_value() if self.access_token else None
|
|
375
|
-
headers = {"Content-Type": "application/json"}
|
|
375
|
+
headers = {"Content-Type": "application/json", "Client-Platform": "haystack"}
|
|
376
376
|
if access_token_value:
|
|
377
377
|
headers["Authorization"] = f"token {access_token_value}"
|
|
378
378
|
|
|
@@ -421,8 +421,8 @@ class PaddleOCRVLDocumentConverter:
|
|
|
421
421
|
@component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
|
|
422
422
|
def run(
|
|
423
423
|
self,
|
|
424
|
-
sources: list[
|
|
425
|
-
meta:
|
|
424
|
+
sources: list[str | Path | ByteStream],
|
|
425
|
+
meta: dict[str, Any] | list[dict[str, Any]] | None = None,
|
|
426
426
|
) -> dict[str, Any]:
|
|
427
427
|
"""
|
|
428
428
|
Convert image or PDF files to Documents.
|
|
@@ -448,7 +448,7 @@ class PaddleOCRVLDocumentConverter:
|
|
|
448
448
|
|
|
449
449
|
meta_list = normalize_metadata(meta, sources_count=len(sources))
|
|
450
450
|
|
|
451
|
-
for source, metadata in zip(sources, meta_list):
|
|
451
|
+
for source, metadata in zip(sources, meta_list, strict=True):
|
|
452
452
|
try:
|
|
453
453
|
bytestream = get_bytestream_from_source(source)
|
|
454
454
|
except Exception as e:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{paddleocr_haystack-0.1.0 → paddleocr_haystack-1.0.0}/tests/test_paddleocr_vl_document_converter.py
RENAMED
|
File without changes
|