kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -2,17 +2,25 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import platform
|
4
4
|
import warnings
|
5
|
-
from dataclasses import dataclass
|
6
5
|
from importlib.util import find_spec
|
7
|
-
from
|
8
|
-
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
6
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
9
7
|
|
10
8
|
from PIL import Image
|
11
9
|
|
12
10
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
13
11
|
from kreuzberg._ocr._base import OCRBackend
|
14
|
-
from kreuzberg._types import ExtractionResult, Metadata
|
15
|
-
from kreuzberg._utils._device import DeviceInfo,
|
12
|
+
from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
|
13
|
+
from kreuzberg._utils._device import DeviceInfo, validate_device_request
|
14
|
+
from kreuzberg._utils._ocr_cache import (
|
15
|
+
build_cache_kwargs,
|
16
|
+
cache_and_complete_async,
|
17
|
+
cache_and_complete_sync,
|
18
|
+
generate_image_hash,
|
19
|
+
get_file_info,
|
20
|
+
handle_cache_lookup_async,
|
21
|
+
handle_cache_lookup_sync,
|
22
|
+
mark_processing_complete,
|
23
|
+
)
|
16
24
|
from kreuzberg._utils._string import normalize_spaces
|
17
25
|
from kreuzberg._utils._sync import run_sync
|
18
26
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -20,154 +28,97 @@ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationErr
|
|
20
28
|
if TYPE_CHECKING:
|
21
29
|
from pathlib import Path
|
22
30
|
|
23
|
-
|
24
31
|
try: # pragma: no cover
|
25
32
|
from typing import Unpack # type: ignore[attr-defined]
|
26
33
|
except ImportError: # pragma: no cover
|
27
34
|
from typing_extensions import Unpack
|
28
35
|
|
36
|
+
if TYPE_CHECKING:
|
37
|
+
import numpy as np
|
38
|
+
from paddleocr import PaddleOCR
|
29
39
|
|
30
|
-
|
40
|
+
HAS_PADDLEOCR: bool
|
41
|
+
if not TYPE_CHECKING:
|
42
|
+
try:
|
43
|
+
import numpy as np
|
44
|
+
from paddleocr import PaddleOCR
|
31
45
|
|
46
|
+
HAS_PADDLEOCR = True
|
47
|
+
except ImportError:
|
48
|
+
HAS_PADDLEOCR = False
|
49
|
+
np: Any = None
|
50
|
+
PaddleOCR: Any = None
|
32
51
|
|
33
|
-
|
34
|
-
|
35
|
-
"""Configuration options for PaddleOCR.
|
36
|
-
|
37
|
-
This TypedDict provides type hints and documentation for all PaddleOCR parameters.
|
38
|
-
"""
|
39
|
-
|
40
|
-
cls_image_shape: str = "3,48,192"
|
41
|
-
"""Image shape for classification algorithm in format 'channels,height,width'."""
|
42
|
-
det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
|
43
|
-
"""Detection algorithm."""
|
44
|
-
det_db_box_thresh: float = 0.5
|
45
|
-
"""Score threshold for detected boxes. Boxes below this value are discarded."""
|
46
|
-
det_db_thresh: float = 0.3
|
47
|
-
"""Binarization threshold for DB output map."""
|
48
|
-
det_db_unclip_ratio: float = 2.0
|
49
|
-
"""Expansion ratio for detected text boxes."""
|
50
|
-
det_east_cover_thresh: float = 0.1
|
51
|
-
"""Score threshold for EAST output boxes."""
|
52
|
-
det_east_nms_thresh: float = 0.2
|
53
|
-
"""NMS threshold for EAST model output boxes."""
|
54
|
-
det_east_score_thresh: float = 0.8
|
55
|
-
"""Binarization threshold for EAST output map."""
|
56
|
-
det_max_side_len: int = 960
|
57
|
-
"""Maximum size of image long side. Images exceeding this will be proportionally resized."""
|
58
|
-
det_model_dir: str | None = None
|
59
|
-
"""Directory for detection model. If None, uses default model location."""
|
60
|
-
drop_score: float = 0.5
|
61
|
-
"""Filter recognition results by confidence score. Results below this are discarded."""
|
62
|
-
enable_mkldnn: bool = False
|
63
|
-
"""Whether to enable MKL-DNN acceleration (Intel CPU only)."""
|
64
|
-
gpu_mem: int = 8000
|
65
|
-
"""GPU memory size (in MB) to use for initialization."""
|
66
|
-
language: str = "en"
|
67
|
-
"""Language to use for OCR."""
|
68
|
-
max_text_length: int = 25
|
69
|
-
"""Maximum text length that the recognition algorithm can recognize."""
|
70
|
-
rec: bool = True
|
71
|
-
"""Enable text recognition when using the ocr() function."""
|
72
|
-
rec_algorithm: Literal[
|
73
|
-
"CRNN",
|
74
|
-
"SRN",
|
75
|
-
"NRTR",
|
76
|
-
"SAR",
|
77
|
-
"SEED",
|
78
|
-
"SVTR",
|
79
|
-
"SVTR_LCNet",
|
80
|
-
"ViTSTR",
|
81
|
-
"ABINet",
|
82
|
-
"VisionLAN",
|
83
|
-
"SPIN",
|
84
|
-
"RobustScanner",
|
85
|
-
"RFL",
|
86
|
-
] = "CRNN"
|
87
|
-
"""Recognition algorithm."""
|
88
|
-
rec_image_shape: str = "3,32,320"
|
89
|
-
"""Image shape for recognition algorithm in format 'channels,height,width'."""
|
90
|
-
rec_model_dir: str | None = None
|
91
|
-
"""Directory for recognition model. If None, uses default model location."""
|
92
|
-
table: bool = True
|
93
|
-
"""Whether to enable table recognition."""
|
94
|
-
use_angle_cls: bool = True
|
95
|
-
"""Whether to use text orientation classification model."""
|
96
|
-
use_gpu: bool = False
|
97
|
-
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
98
|
-
device: DeviceType = "auto"
|
99
|
-
"""Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
|
100
|
-
gpu_memory_limit: float | None = None
|
101
|
-
"""Maximum GPU memory to use in GB. None for no limit."""
|
102
|
-
fallback_to_cpu: bool = True
|
103
|
-
"""Whether to fallback to CPU if requested device is unavailable."""
|
104
|
-
use_space_char: bool = True
|
105
|
-
"""Whether to recognize spaces."""
|
106
|
-
use_zero_copy_run: bool = False
|
107
|
-
"""Whether to enable zero_copy_run for inference optimization."""
|
52
|
+
|
53
|
+
PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
|
108
54
|
|
109
55
|
|
110
56
|
class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
111
57
|
_paddle_ocr: ClassVar[Any] = None
|
112
58
|
|
113
59
|
async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
114
|
-
""
|
60
|
+
use_cache = kwargs.pop("use_cache", True)
|
115
61
|
|
116
|
-
|
117
|
-
|
118
|
-
|
62
|
+
cache_kwargs = None
|
63
|
+
if use_cache:
|
64
|
+
image_hash = generate_image_hash(image)
|
65
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
|
119
66
|
|
120
|
-
|
121
|
-
|
67
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
68
|
+
if cached_result:
|
69
|
+
return cached_result
|
122
70
|
|
123
|
-
|
124
|
-
|
125
|
-
"""
|
126
|
-
import numpy as np # noqa: PLC0415
|
71
|
+
try:
|
72
|
+
await self._init_paddle_ocr(**kwargs)
|
127
73
|
|
128
|
-
|
74
|
+
if image.mode != "RGB":
|
75
|
+
image = image.convert("RGB")
|
129
76
|
|
130
|
-
|
131
|
-
|
77
|
+
image_np = np.array(image)
|
78
|
+
use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
|
79
|
+
result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
|
132
80
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
81
|
+
extraction_result = self._process_paddle_result(result, image)
|
82
|
+
|
83
|
+
if use_cache and cache_kwargs:
|
84
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
85
|
+
|
86
|
+
return extraction_result
|
137
87
|
except Exception as e:
|
88
|
+
if use_cache and cache_kwargs:
|
89
|
+
mark_processing_complete(cache_kwargs)
|
138
90
|
raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
|
139
91
|
|
140
92
|
async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
141
|
-
""
|
93
|
+
use_cache = kwargs.pop("use_cache", True)
|
142
94
|
|
143
|
-
|
144
|
-
|
145
|
-
|
95
|
+
cache_kwargs = None
|
96
|
+
if use_cache:
|
97
|
+
file_info = get_file_info(path)
|
98
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
|
146
99
|
|
147
|
-
|
148
|
-
|
100
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
101
|
+
if cached_result:
|
102
|
+
return cached_result
|
149
103
|
|
150
|
-
Raises:
|
151
|
-
OCRError: If file loading or OCR processing fails.
|
152
|
-
"""
|
153
|
-
await self._init_paddle_ocr(**kwargs)
|
154
104
|
try:
|
105
|
+
await self._init_paddle_ocr(**kwargs)
|
155
106
|
image = await run_sync(Image.open, path)
|
156
|
-
|
107
|
+
|
108
|
+
kwargs["use_cache"] = False
|
109
|
+
extraction_result = await self.process_image(image, **kwargs)
|
110
|
+
|
111
|
+
if use_cache and cache_kwargs:
|
112
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
113
|
+
|
114
|
+
return extraction_result
|
157
115
|
except Exception as e:
|
116
|
+
if use_cache and cache_kwargs:
|
117
|
+
mark_processing_complete(cache_kwargs)
|
158
118
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
159
119
|
|
160
120
|
@staticmethod
|
161
121
|
def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
|
162
|
-
"""Process PaddleOCR result into an ExtractionResult with metadata.
|
163
|
-
|
164
|
-
Args:
|
165
|
-
result: The raw result from PaddleOCR.
|
166
|
-
image: The original PIL image.
|
167
|
-
|
168
|
-
Returns:
|
169
|
-
ExtractionResult: The extraction result containing text content, mime type, and metadata.
|
170
|
-
"""
|
171
122
|
text_content = ""
|
172
123
|
confidence_sum = 0
|
173
124
|
confidence_count = 0
|
@@ -227,11 +178,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
227
178
|
|
228
179
|
@classmethod
|
229
180
|
def _is_mkldnn_supported(cls) -> bool:
|
230
|
-
"""Check if the current architecture supports MKL-DNN optimization.
|
231
|
-
|
232
|
-
Returns:
|
233
|
-
True if MKL-DNN is supported on this architecture.
|
234
|
-
"""
|
235
181
|
system = platform.system().lower()
|
236
182
|
processor = platform.processor().lower()
|
237
183
|
machine = platform.machine().lower()
|
@@ -246,59 +192,44 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
246
192
|
|
247
193
|
@classmethod
|
248
194
|
async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
|
249
|
-
"""Initialize PaddleOCR with the provided configuration.
|
250
|
-
|
251
|
-
Args:
|
252
|
-
**kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
|
253
|
-
|
254
|
-
Raises:
|
255
|
-
MissingDependencyError: If PaddleOCR is not installed.
|
256
|
-
OCRError: If initialization fails.
|
257
|
-
"""
|
258
195
|
if cls._paddle_ocr is not None:
|
259
196
|
return
|
260
197
|
|
261
|
-
|
262
|
-
from paddleocr import PaddleOCR # noqa: PLC0415
|
263
|
-
except ImportError as e: # pragma: no cover
|
198
|
+
if not HAS_PADDLEOCR or PaddleOCR is None:
|
264
199
|
raise MissingDependencyError.create_for_package(
|
265
200
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
266
|
-
)
|
201
|
+
)
|
267
202
|
|
268
203
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
269
204
|
|
270
|
-
|
271
|
-
|
205
|
+
cls._resolve_device_config(**kwargs)
|
206
|
+
|
207
|
+
bool(find_spec("paddlepaddle_gpu"))
|
208
|
+
|
209
|
+
use_angle_cls = kwargs.pop("use_angle_cls", True)
|
210
|
+
kwargs.setdefault("use_textline_orientation", use_angle_cls)
|
211
|
+
|
212
|
+
det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
|
213
|
+
det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
|
214
|
+
det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
|
272
215
|
|
273
|
-
|
274
|
-
kwargs.setdefault("
|
275
|
-
kwargs
|
276
|
-
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
|
277
|
-
kwargs.setdefault("det_db_thresh", 0.3)
|
278
|
-
kwargs.setdefault("det_db_box_thresh", 0.5)
|
279
|
-
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
216
|
+
kwargs.setdefault("text_det_thresh", det_db_thresh)
|
217
|
+
kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
|
218
|
+
kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
|
280
219
|
|
281
|
-
|
282
|
-
|
220
|
+
kwargs.pop("use_gpu", None)
|
221
|
+
kwargs.pop("gpu_mem", None)
|
222
|
+
kwargs.pop("gpu_memory_limit", None)
|
223
|
+
|
224
|
+
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
|
283
225
|
|
284
226
|
try:
|
285
|
-
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language,
|
227
|
+
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
|
286
228
|
except Exception as e:
|
287
229
|
raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
|
288
230
|
|
289
231
|
@classmethod
|
290
232
|
def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
|
291
|
-
"""Resolve device configuration with backward compatibility.
|
292
|
-
|
293
|
-
Args:
|
294
|
-
**kwargs: Configuration parameters including device settings.
|
295
|
-
|
296
|
-
Returns:
|
297
|
-
DeviceInfo object for the selected device.
|
298
|
-
|
299
|
-
Raises:
|
300
|
-
ValidationError: If requested device is not available and fallback is disabled.
|
301
|
-
"""
|
302
233
|
use_gpu = kwargs.get("use_gpu", False)
|
303
234
|
device = kwargs.get("device", "auto")
|
304
235
|
memory_limit = kwargs.get("gpu_memory_limit")
|
@@ -343,17 +274,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
343
274
|
|
344
275
|
@staticmethod
|
345
276
|
def _validate_language_code(lang_code: str) -> str:
|
346
|
-
"""Convert a language code to PaddleOCR format.
|
347
|
-
|
348
|
-
Args:
|
349
|
-
lang_code: ISO language code or language name
|
350
|
-
|
351
|
-
Raises:
|
352
|
-
ValidationError: If the language is not supported by PaddleOCR
|
353
|
-
|
354
|
-
Returns:
|
355
|
-
Language code compatible with PaddleOCR
|
356
|
-
"""
|
357
277
|
normalized = lang_code.lower()
|
358
278
|
if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
|
359
279
|
return normalized
|
@@ -367,90 +287,100 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
367
287
|
)
|
368
288
|
|
369
289
|
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
370
|
-
""
|
290
|
+
use_cache = kwargs.pop("use_cache", True)
|
371
291
|
|
372
|
-
|
373
|
-
|
374
|
-
|
292
|
+
cache_kwargs = None
|
293
|
+
if use_cache:
|
294
|
+
image_hash = generate_image_hash(image)
|
295
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
|
375
296
|
|
376
|
-
|
377
|
-
|
297
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
298
|
+
if cached_result:
|
299
|
+
return cached_result
|
378
300
|
|
379
|
-
|
380
|
-
|
381
|
-
"""
|
382
|
-
import numpy as np # noqa: PLC0415
|
301
|
+
try:
|
302
|
+
self._init_paddle_ocr_sync(**kwargs)
|
383
303
|
|
384
|
-
|
304
|
+
if image.mode != "RGB":
|
305
|
+
image = image.convert("RGB")
|
385
306
|
|
386
|
-
|
387
|
-
|
307
|
+
image_np = np.array(image)
|
308
|
+
use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
|
309
|
+
result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
|
388
310
|
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
311
|
+
extraction_result = self._process_paddle_result(result, image)
|
312
|
+
|
313
|
+
if use_cache and cache_kwargs:
|
314
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
315
|
+
|
316
|
+
return extraction_result
|
393
317
|
except Exception as e:
|
318
|
+
if use_cache and cache_kwargs:
|
319
|
+
mark_processing_complete(cache_kwargs)
|
394
320
|
raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
|
395
321
|
|
396
322
|
def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
|
397
|
-
""
|
323
|
+
use_cache = kwargs.pop("use_cache", True)
|
398
324
|
|
399
|
-
|
400
|
-
|
401
|
-
|
325
|
+
cache_kwargs = None
|
326
|
+
if use_cache:
|
327
|
+
file_info = get_file_info(path)
|
328
|
+
cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
|
402
329
|
|
403
|
-
|
404
|
-
|
330
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
331
|
+
if cached_result:
|
332
|
+
return cached_result
|
405
333
|
|
406
|
-
Raises:
|
407
|
-
OCRError: If file loading or OCR processing fails.
|
408
|
-
"""
|
409
|
-
self._init_paddle_ocr_sync(**kwargs)
|
410
334
|
try:
|
335
|
+
self._init_paddle_ocr_sync(**kwargs)
|
411
336
|
image = Image.open(path)
|
412
|
-
|
337
|
+
|
338
|
+
kwargs["use_cache"] = False
|
339
|
+
extraction_result = self.process_image_sync(image, **kwargs)
|
340
|
+
|
341
|
+
if use_cache and cache_kwargs:
|
342
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
343
|
+
|
344
|
+
return extraction_result
|
413
345
|
except Exception as e:
|
346
|
+
if use_cache and cache_kwargs:
|
347
|
+
mark_processing_complete(cache_kwargs)
|
414
348
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
415
349
|
|
416
350
|
@classmethod
|
417
351
|
def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
|
418
|
-
"""Synchronously initialize PaddleOCR with the provided configuration.
|
419
|
-
|
420
|
-
Args:
|
421
|
-
**kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
|
422
|
-
|
423
|
-
Raises:
|
424
|
-
MissingDependencyError: If PaddleOCR is not installed.
|
425
|
-
OCRError: If initialization fails.
|
426
|
-
"""
|
427
352
|
if cls._paddle_ocr is not None:
|
428
353
|
return
|
429
354
|
|
430
|
-
|
431
|
-
from paddleocr import PaddleOCR # noqa: PLC0415
|
432
|
-
except ImportError as e: # pragma: no cover
|
355
|
+
if not HAS_PADDLEOCR or PaddleOCR is None:
|
433
356
|
raise MissingDependencyError.create_for_package(
|
434
357
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
435
|
-
)
|
358
|
+
)
|
436
359
|
|
437
360
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
438
361
|
|
439
|
-
|
440
|
-
|
362
|
+
cls._resolve_device_config(**kwargs)
|
363
|
+
|
364
|
+
bool(find_spec("paddlepaddle_gpu"))
|
365
|
+
|
366
|
+
use_angle_cls = kwargs.pop("use_angle_cls", True)
|
367
|
+
kwargs.setdefault("use_textline_orientation", use_angle_cls)
|
368
|
+
|
369
|
+
det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
|
370
|
+
det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
|
371
|
+
det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
|
372
|
+
|
373
|
+
kwargs.setdefault("text_det_thresh", det_db_thresh)
|
374
|
+
kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
|
375
|
+
kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
|
441
376
|
|
442
|
-
|
443
|
-
kwargs.
|
444
|
-
kwargs
|
445
|
-
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
|
446
|
-
kwargs.setdefault("det_db_thresh", 0.3)
|
447
|
-
kwargs.setdefault("det_db_box_thresh", 0.5)
|
448
|
-
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
377
|
+
kwargs.pop("use_gpu", None)
|
378
|
+
kwargs.pop("gpu_mem", None)
|
379
|
+
kwargs.pop("gpu_memory_limit", None)
|
449
380
|
|
450
|
-
|
451
|
-
kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
|
381
|
+
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
|
452
382
|
|
453
383
|
try:
|
454
|
-
cls._paddle_ocr = PaddleOCR(lang=language,
|
384
|
+
cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
|
455
385
|
except Exception as e:
|
456
386
|
raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
|