kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. kreuzberg/__init__.py +16 -2
  2. kreuzberg/_chunker.py +51 -0
  3. kreuzberg/_constants.py +2 -3
  4. kreuzberg/_extractors/__init__.py +0 -0
  5. kreuzberg/_extractors/_base.py +92 -0
  6. kreuzberg/_extractors/_html.py +34 -0
  7. kreuzberg/_extractors/_image.py +74 -0
  8. kreuzberg/_extractors/_pandoc.py +613 -0
  9. kreuzberg/_extractors/_pdf.py +163 -0
  10. kreuzberg/_extractors/_presentation.py +233 -0
  11. kreuzberg/_extractors/_spread_sheet.py +125 -0
  12. kreuzberg/_mime_types.py +19 -26
  13. kreuzberg/_ocr/__init__.py +17 -0
  14. kreuzberg/_ocr/_base.py +54 -0
  15. kreuzberg/_ocr/_easyocr.py +376 -0
  16. kreuzberg/_ocr/_paddleocr.py +291 -0
  17. kreuzberg/_ocr/_tesseract.py +342 -0
  18. kreuzberg/_playa.py +276 -0
  19. kreuzberg/_registry.py +108 -0
  20. kreuzberg/_types.py +133 -36
  21. kreuzberg/_utils/__init__.py +0 -0
  22. kreuzberg/{_string.py → _utils/_string.py} +0 -2
  23. kreuzberg/_utils/_sync.py +121 -0
  24. kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
  25. kreuzberg/exceptions.py +25 -0
  26. kreuzberg/extraction.py +114 -227
  27. kreuzberg-3.0.1.dist-info/METADATA +178 -0
  28. kreuzberg-3.0.1.dist-info/RECORD +32 -0
  29. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
  30. kreuzberg/_html.py +0 -31
  31. kreuzberg/_pandoc.py +0 -366
  32. kreuzberg/_pdf.py +0 -190
  33. kreuzberg/_pptx.py +0 -88
  34. kreuzberg/_sync.py +0 -74
  35. kreuzberg/_tesseract.py +0 -231
  36. kreuzberg/_xlsx.py +0 -88
  37. kreuzberg-2.1.2.dist-info/METADATA +0 -446
  38. kreuzberg-2.1.2.dist-info/RECORD +0 -21
  39. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
  40. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,376 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
5
+
6
+ from PIL import Image
7
+
8
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
9
+ from kreuzberg._ocr._base import OCRBackend
10
+ from kreuzberg._types import ExtractionResult, Metadata
11
+ from kreuzberg._utils._string import normalize_spaces
12
+ from kreuzberg._utils._sync import run_sync
13
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
14
+
15
+ if TYPE_CHECKING:
16
+ from pathlib import Path
17
+
18
+ try: # pragma: no cover
19
+ from typing import Unpack # type: ignore[attr-defined]
20
+ except ImportError: # pragma: no cover
21
+ from typing_extensions import Unpack
22
+
23
+
24
# Language codes accepted by the EasyOCR backend, grouped by initial letter.
# fmt: off
EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
    "abq", "ady", "af", "ang", "ar", "as", "ava", "az",
    "be", "bg", "bh", "bho", "bn", "bs",
    "ch_sim", "ch_tra", "che", "cs", "cy",
    "da", "dar", "de",
    "en", "es", "et",
    "fa", "fr",
    "ga", "gom",
    "hi", "hr", "hu",
    "id", "inh", "is", "it",  # codespell:ignore
    "ja",
    "kbd", "kn", "ko", "ku",
    "la", "lbe", "lez", "lt", "lv",
    "mah", "mai", "mi", "mn", "mr", "ms", "mt",
    "ne", "new", "nl", "no",
    "oc",
    "pi", "pl", "pt",
    "ro", "rs_cyrillic", "rs_latin", "ru",
    "sck", "sk", "sl", "sq", "sv", "sw",
    "ta", "tab", "te", "th", "tjk", "tl", "tr",  # codespell:ignore
    "ug", "uk", "ur", "uz",
    "vi",
}
# fmt: on
109
+
110
+
111
@dataclass(frozen=True, unsafe_hash=True)
class EasyOCRConfig:
    """Immutable bundle of the options understood by the EasyOCR backend.

    Every field has a default, so ``EasyOCRConfig()`` yields a usable configuration.
    """

    add_margin: float = 0.1
    """Amount by which bounding boxes are extended in every direction."""
    adjust_contrast: float = 0.5
    """Contrast level that low contrast text is adjusted towards."""
    beam_width: int = 5
    """Width of the beam used when recognition runs a beam search."""
    canvas_size: int = 2560
    """Upper bound on the image dimension used for detection."""
    contrast_ths: float = 0.1
    """Threshold on contrast applied during preprocessing."""
    decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
    """Decoding method: one of 'greedy', 'beamsearch' or 'wordbeamsearch'."""
    height_ths: float = 0.5
    """Largest difference in box height at which boxes are still merged."""
    language: str | list[str] = "en"
    """A single language code, or a list of codes, to run OCR with."""
    link_threshold: float = 0.4
    """Confidence threshold for links."""
    low_text: float = 0.4
    """Lower-bound score for text."""
    mag_ratio: float = 1.0
    """Ratio by which the image is magnified."""
    min_size: int = 10
    """Smallest text box size, in pixels."""
    rotation_info: list[int] | None = None
    """Angles to try during detection, if any."""
    slope_ths: float = 0.1
    """Largest slope at which text boxes are still merged."""
    text_threshold: float = 0.7
    """Confidence threshold for text."""
    use_gpu: bool = False
    """Run inference on the GPU when set."""
    width_ths: float = 0.5
    """Largest horizontal distance at which boxes are still merged."""
    x_ths: float = 1.0
    """Largest horizontal distance for merging into a paragraph."""
    y_ths: float = 0.5
    """Largest vertical distance for merging into a paragraph."""
    ycenter_ths: float = 0.5
    """Largest shift along y at which boxes are still merged."""
155
+
156
+
157
class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
    """OCR backend that delegates text recognition to EasyOCR.

    The ``easyocr.Reader`` is created lazily on first use and cached on the class, so it is shared
    by every instance. NOTE(review): because of that cache, the languages and GPU setting of the
    first call stay in effect for all later calls, even if they pass different values.
    """

    _reader: ClassVar[Any] = None

    async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
        """Asynchronously process an image and extract its text and metadata using EasyOCR.

        Args:
            image: An instance of PIL.Image representing the input image.
            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.

        Raises:
            OCRError: If OCR processing fails.
        """
        await self._init_easyocr(**kwargs)

        # Imported after initialization so a missing EasyOCR install is reported first.
        import numpy as np

        # ``beam_width`` maps to readtext's ``beamWidth``; fall back to the config default when omitted.
        beam_width = kwargs.pop("beam_width", 5)
        # These two options configure the Reader itself and are not readtext parameters.
        kwargs.pop("language", None)
        kwargs.pop("use_gpu", None)

        try:
            result = await run_sync(
                self._reader.readtext,
                # readtext treats ``bytes`` as an encoded image file, so hand over the decoded pixels.
                np.array(image),
                beamWidth=beam_width,
                **kwargs,
            )

            return self._process_easyocr_result(result, image)
        except Exception as e:
            raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e

    async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
        """Asynchronously process a file and extract its text and metadata using EasyOCR.

        Args:
            path: A Path object representing the file to be processed.
            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.

        Raises:
            OCRError: If file loading or OCR processing fails.
        """
        await self._init_easyocr(**kwargs)
        try:
            image = await run_sync(Image.open, path)
            return await self.process_image(image, **kwargs)
        except Exception as e:
            raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e

    @staticmethod
    def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
        """Process EasyOCR result into an ExtractionResult with metadata.

        Two result shapes are handled: ``(text, confidence)`` pairs, which are emitted one per line in
        the given order, and ``(box, text, confidence)`` triples, which are grouped into lines by the
        vertical position of their boxes and ordered left to right within each line.

        Args:
            result: The raw result from EasyOCR.
            image: The original PIL image.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.
        """
        metadata = Metadata(width=image.width, height=image.height)

        if not result:
            return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])

        expected_tuple_length = 2

        if all(len(item) == expected_tuple_length for item in result):
            text_content = "".join(text + "\n" for text, _ in result if text)
            return ExtractionResult(
                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
            )

        # Order top to bottom using the y coordinates of the first and third box points.
        sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
        line_groups: list[list[Any]] = []
        prev_y_center: float | None = None
        line_height_threshold = 20

        for item in sorted_results:
            box, _, _ = item
            y_center = sum(point[1] for point in box) / 4

            # A vertical jump larger than the threshold starts a new line.
            if prev_y_center is None or abs(y_center - prev_y_center) > line_height_threshold:
                line_groups.append([item])
            else:
                line_groups[-1].append(item)

            prev_y_center = y_center

        lines: list[str] = []
        for line in line_groups:
            # Within a line, read left to right by the x coordinate of the first box point.
            line_sorted = sorted(line, key=lambda x: x[0][0][0])
            lines.append("".join(text + " " for _, text, _ in line_sorted if text) + "\n")

        return ExtractionResult(
            content=normalize_spaces("".join(lines)), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
        )

    @classmethod
    def _is_gpu_available(cls) -> bool:
        """Check if GPU is available for EasyOCR.

        Returns:
            bool: True if torch is importable and reports CUDA as available.
        """
        try:
            import torch

            return torch.cuda.is_available()
        except ImportError:
            return False

    @classmethod
    async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
        """Initialize EasyOCR with the provided configuration.

        Does nothing when a reader has already been created.

        Args:
            **kwargs: Configuration parameters for EasyOCR including language, etc.

        Raises:
            MissingDependencyError: If EasyOCR is not installed.
            OCRError: If initialization fails.
        """
        if cls._reader is not None:
            return

        try:
            import easyocr
        except ImportError as e:
            raise MissingDependencyError.create_for_package(
                dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
            ) from e

        languages = cls._validate_language_code(kwargs.pop("language", "en"))

        # Honor an explicit ``use_gpu``; otherwise fall back to auto-detection so that the Reader
        # always receives a boolean rather than ``None``.
        use_gpu = kwargs.get("use_gpu")
        if use_gpu is None:
            use_gpu = cls._is_gpu_available()

        try:
            cls._reader = await run_sync(
                easyocr.Reader,
                languages,
                gpu=use_gpu,
                verbose=False,
            )
        except Exception as e:
            raise OCRError(f"Failed to initialize EasyOCR: {e}") from e

    @staticmethod
    def _validate_language_code(language_codes: str | list[str]) -> list[str]:
        """Validate and normalize the provided language code or codes.

        Args:
            language_codes: A single language code or a list of language codes.

        Raises:
            ValidationError: If any language is not supported by EasyOCR

        Returns:
            A list with the lower-cased language codes.
        """
        if not isinstance(language_codes, list):
            languages = [language_codes.lower()]
        else:
            languages = [lang.lower() for lang in language_codes]

        unsupported = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
        if not unsupported:
            return languages

        raise ValidationError(
            "The provided language codes are not supported by EasyOCR",
            context={
                "language_code": ",".join(unsupported),
                "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
            },
        )
@@ -0,0 +1,291 @@
1
+ from __future__ import annotations
2
+
3
+ import platform
4
+ import sys
5
+ from dataclasses import dataclass
6
+ from importlib.util import find_spec
7
+ from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
8
+
9
+ from PIL import Image
10
+
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._ocr._base import OCRBackend
13
+ from kreuzberg._types import ExtractionResult, Metadata
14
+ from kreuzberg._utils._string import normalize_spaces
15
+ from kreuzberg._utils._sync import run_sync
16
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
17
+
18
+ if TYPE_CHECKING:
19
+ from pathlib import Path
20
+
21
+
22
+ try: # pragma: no cover
23
+ from typing import Unpack # type: ignore[attr-defined]
24
+ except ImportError: # pragma: no cover
25
+ from typing_extensions import Unpack
26
+
27
+
28
# Language identifiers accepted by the PaddleOCR backend.
PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
    "ch",
    "en",
    "french",
    "german",
    "japan",
    "korean",
}
29
+
30
+
31
@dataclass(frozen=True, unsafe_hash=True)
class PaddleOCRConfig:
    """Immutable bundle of the options understood by the PaddleOCR backend.

    This frozen dataclass provides type hints and documentation for the PaddleOCR parameters.
    Every field has a default, so ``PaddleOCRConfig()`` yields a usable configuration.
    """

    cls_image_shape: str = "3,48,192"
    """Classification image shape, written as 'channels,height,width'."""
    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
    """Algorithm used for detection."""
    det_db_box_thresh: float = 0.5
    """Score threshold for detected boxes; boxes scoring below it are discarded."""
    det_db_thresh: float = 0.3
    """Threshold used to binarize the DB output map."""
    det_db_unclip_ratio: float = 2.0
    """Ratio by which detected text boxes are expanded."""
    det_east_cover_thresh: float = 0.1
    """Score threshold applied to EAST output boxes."""
    det_east_nms_thresh: float = 0.2
    """NMS threshold applied to EAST model output boxes."""
    det_east_score_thresh: float = 0.8
    """Threshold used to binarize the EAST output map."""
    det_max_side_len: int = 960
    """Largest allowed long side of an image; larger images are resized proportionally."""
    drop_score: float = 0.5
    """Confidence score below which recognition results are discarded."""
    enable_mkldnn: bool = False
    """Turn on MKL-DNN acceleration (Intel CPU only)."""
    gpu_mem: int = 8000
    """Amount of GPU memory, in MB, used for initialization."""
    language: str = "en"
    """Language OCR is run with."""
    max_text_length: int = 25
    """Longest text the recognition algorithm can recognize."""
    rec: bool = True
    """Turn on text recognition when the ocr() function is used."""
    rec_algorithm: Literal[
        "CRNN",
        "SRN",
        "NRTR",
        "SAR",
        "SEED",
        "SVTR",
        "SVTR_LCNet",
        "ViTSTR",
        "ABINet",
        "VisionLAN",
        "SPIN",
        "RobustScanner",
        "RFL",
    ] = "CRNN"
    """Algorithm used for recognition."""
    rec_image_shape: str = "3,32,320"
    """Recognition image shape, written as 'channels,height,width'."""
    table: bool = True
    """Turn on table recognition."""
    use_angle_cls: bool = True
    """Use the text orientation classification model."""
    use_gpu: bool = False
    """Run inference on the GPU. Requires installing the paddlepaddle-gpu package"""
    use_space_char: bool = True
    """Recognize spaces."""
    use_zero_copy_run: bool = False
    """Turn on zero_copy_run for inference optimization."""
96
+
97
+
98
class PaddleBackend(OCRBackend[PaddleOCRConfig]):
    """OCR backend that delegates text recognition to PaddleOCR.

    The ``PaddleOCR`` instance is created lazily on first use and cached on the class, so it is
    shared by every instance. NOTE(review): because of that cache, the configuration of the first
    call stays in effect for all later calls, even if they pass different values.
    """

    _paddle_ocr: ClassVar[Any] = None

    async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
        """Asynchronously process an image and extract its text and metadata using PaddleOCR.

        Args:
            image: An instance of PIL.Image representing the input image.
            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.

        Raises:
            OCRError: If OCR processing fails.
        """
        import numpy as np

        await self._init_paddle_ocr(**kwargs)
        image_np = np.array(image)
        try:
            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
            return self._process_paddle_result(result, image)
        except Exception as e:
            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e

    async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
        """Asynchronously process a file and extract its text and metadata using PaddleOCR.

        Args:
            path: A Path object representing the file to be processed.
            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.

        Raises:
            OCRError: If file loading or OCR processing fails.
        """
        await self._init_paddle_ocr(**kwargs)
        try:
            image = await run_sync(Image.open, path)
            return await self.process_image(image, **kwargs)
        except Exception as e:
            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e

    @staticmethod
    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
        """Process PaddleOCR result into an ExtractionResult with metadata.

        Each page's boxes are grouped into lines by the vertical position of their boxes and ordered
        left to right within each line. Empty pages, and a missing result, contribute no text.

        Args:
            result: The raw result from PaddleOCR.
            image: The original PIL image.

        Returns:
            ExtractionResult: The extraction result containing text content, mime type, and metadata.
        """
        min_box_distance = 20
        lines: list[str] = []

        # ``result or []`` guards against a ``None`` result so that it yields empty content.
        for page_result in result or []:
            if not page_result:
                continue

            # Order top to bottom by the y coordinate of the first box point.
            sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
            line_groups: list[list[Any]] = []
            prev_y: float | None = None

            for box in sorted_boxes:
                box_points, (_, _) = box
                current_y = sum(point[1] for point in box_points) / 4

                # A vertical jump larger than the threshold starts a new line.
                if prev_y is None or abs(current_y - prev_y) > min_box_distance:
                    line_groups.append([box])
                else:
                    line_groups[-1].append(box)

                prev_y = current_y

            for line in line_groups:
                # Within a line, read left to right by the x coordinate of the first box point.
                line_sorted = sorted(line, key=lambda x: x[0][0][0])
                lines.append("".join(text + " " for _, (text, _) in line_sorted if text) + "\n")

        width, height = image.size
        metadata = Metadata(
            width=width,
            height=height,
        )

        return ExtractionResult(
            content=normalize_spaces("".join(lines)), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
        )

    @classmethod
    def _is_mkldnn_supported(cls) -> bool:
        """Check if the current architecture supports MKL-DNN optimization.

        Returns:
            True if MKL-DNN is supported on this architecture.
        """
        system = platform.system().lower()
        processor = platform.processor().lower()
        machine = platform.machine().lower()

        if system in ("linux", "windows"):
            # "x86" also matches "x86_64", so no separate check is needed for it.
            return "intel" in processor or "x86" in machine or "amd64" in machine

        if system == "darwin":
            return machine == "x86_64"

        return False

    @classmethod
    async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
        """Initialize PaddleOCR with the provided configuration.

        Does nothing when an instance has already been created.

        Args:
            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.

        Raises:
            MissingDependencyError: If PaddleOCR is not installed.
            OCRError: If initialization fails.
            ValidationError: If the python version is too high.
        """
        if cls._paddle_ocr is not None:
            return

        if sys.version_info >= (3, 13):  # pragma: no cover
            raise ValidationError(
                "PaddleOCR is only available in python 3.12 and below. Please downgrade your Python or switch to a different OCR backend.",
                context={"issue": "https://github.com/PaddlePaddle/Paddle/issues/71616"},
            )

        try:
            from paddleocr import PaddleOCR
        except ImportError as e:
            raise MissingDependencyError.create_for_package(
                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
            ) from e

        language = cls._validate_language_code(kwargs.pop("language", "en"))
        # NOTE(review): this probes for an importable module named "paddlepaddle_gpu"; confirm that the
        # GPU distribution really exposes that module name, otherwise GPU use is never auto-enabled.
        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
        kwargs.setdefault("use_angle_cls", True)
        kwargs.setdefault("use_gpu", has_gpu_package)
        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
        kwargs.setdefault("det_db_thresh", 0.3)
        kwargs.setdefault("det_db_box_thresh", 0.5)
        kwargs.setdefault("det_db_unclip_ratio", 1.6)

        try:
            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
        except Exception as e:
            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e

    @staticmethod
    def _validate_language_code(lang_code: str) -> str:
        """Validate a language code and return it in lower case.

        Args:
            lang_code: The language code to check against the supported PaddleOCR codes.

        Raises:
            ValidationError: If the language is not supported by PaddleOCR

        Returns:
            Language code compatible with PaddleOCR
        """
        normalized = lang_code.lower()
        if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
            return normalized

        raise ValidationError(
            "The provided language code is not supported by PaddleOCR",
            context={
                "language_code": lang_code,
                "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
            },
        )