kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,227 @@
1
+ """Kreuzberg MCP server implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ from typing import Any
7
+
8
+ from mcp.server import FastMCP
9
+ from mcp.types import TextContent
10
+
11
+ from kreuzberg._types import ExtractionConfig, OcrBackendType
12
+ from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
13
+
14
+ # Create the MCP server
15
+ mcp = FastMCP("Kreuzberg Text Extraction")
16
+
17
+
18
@mcp.tool()
def extract_document(  # noqa: PLR0913
    file_path: str,
    mime_type: str | None = None,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Run a configurable extraction on a document stored on disk.

    Every option is forwarded unchanged into an ``ExtractionConfig`` and the
    file is processed with ``extract_file_sync``.

    Args:
        file_path: Location of the document to read.
        mime_type: MIME type of the document; passed through as ``None`` when omitted.
        force_ocr: Forwarded as ``ExtractionConfig.force_ocr``.
        chunk_content: Forwarded as ``ExtractionConfig.chunk_content``.
        extract_tables: Forwarded as ``ExtractionConfig.extract_tables``.
        extract_entities: Forwarded as ``ExtractionConfig.extract_entities``.
        extract_keywords: Forwarded as ``ExtractionConfig.extract_keywords``.
        ocr_backend: Name of the OCR backend (tesseract, easyocr, paddleocr).
        max_chars: Forwarded as ``ExtractionConfig.max_chars`` (chunk size).
        max_overlap: Forwarded as ``ExtractionConfig.max_overlap`` (chunk overlap).
        keyword_count: Forwarded as ``ExtractionConfig.keyword_count``.
        auto_detect_language: Forwarded as ``ExtractionConfig.auto_detect_language``.

    Returns:
        The extraction result converted with ``to_dict()``.
    """
    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_file_sync(file_path, mime_type, ExtractionConfig(**options))
    return extraction.to_dict()
67
+
68
+
69
@mcp.tool()
def extract_bytes(  # noqa: PLR0913
    content_base64: str,
    mime_type: str,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Run a configurable extraction on a base64-encoded document payload.

    The payload is decoded with ``base64.b64decode`` and processed with
    ``extract_bytes_sync`` using an ``ExtractionConfig`` built from the options.

    Args:
        content_base64: Document content encoded as base64 text.
        mime_type: MIME type of the decoded document.
        force_ocr: Forwarded as ``ExtractionConfig.force_ocr``.
        chunk_content: Forwarded as ``ExtractionConfig.chunk_content``.
        extract_tables: Forwarded as ``ExtractionConfig.extract_tables``.
        extract_entities: Forwarded as ``ExtractionConfig.extract_entities``.
        extract_keywords: Forwarded as ``ExtractionConfig.extract_keywords``.
        ocr_backend: Name of the OCR backend (tesseract, easyocr, paddleocr).
        max_chars: Forwarded as ``ExtractionConfig.max_chars`` (chunk size).
        max_overlap: Forwarded as ``ExtractionConfig.max_overlap`` (chunk overlap).
        keyword_count: Forwarded as ``ExtractionConfig.keyword_count``.
        auto_detect_language: Forwarded as ``ExtractionConfig.auto_detect_language``.

    Returns:
        The extraction result converted with ``to_dict()``.
    """
    # Decode first so a malformed payload fails before any config is built.
    raw_document = base64.b64decode(content_base64)

    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_bytes_sync(raw_document, mime_type, ExtractionConfig(**options))
    return extraction.to_dict()
120
+
121
+
122
@mcp.tool()
def extract_simple(
    file_path: str,
    mime_type: str | None = None,
) -> str:
    """Return only the text of a document, using a default ``ExtractionConfig``.

    Args:
        file_path: Location of the document to read.
        mime_type: MIME type of the document; passed through as ``None`` when omitted.

    Returns:
        The ``content`` attribute of the extraction result.
    """
    return extract_file_sync(file_path, mime_type, ExtractionConfig()).content
139
+
140
+
141
@mcp.resource("config://default")
def get_default_config() -> str:
    """Return the string form of a default ``ExtractionConfig``'s attribute dict."""
    default_attributes = ExtractionConfig().__dict__
    return str(default_attributes)
146
+
147
+
148
@mcp.resource("config://available-backends")
def get_available_backends() -> str:
    """Return the comma-separated names of the OCR backends exposed by this server."""
    backend_names = "tesseract, easyocr, paddleocr"
    return backend_names
152
+
153
+
154
@mcp.resource("extractors://supported-formats")
def get_supported_formats() -> str:
    """Return a human-readable overview of the document formats that can be extracted."""
    # NOTE(review): the literal is reproduced as it appears in the visible
    # text; confirm the leading whitespace of each line against the packaged file.
    formats_overview = """
Supported formats:
- PDF documents
- Images (PNG, JPG, JPEG, TIFF, BMP, WEBP)
- Office documents (DOCX, PPTX, XLSX)
- HTML files
- Text files (TXT, CSV, TSV)
- And more...
"""
    return formats_overview
166
+
167
+
168
@mcp.prompt()
def extract_and_summarize(file_path: str) -> list[TextContent]:
    """Build a summarization prompt around the extracted text of a document.

    Args:
        file_path: Location of the document to read.

    Returns:
        A single-element list holding one text message: the document content
        followed by a request for a concise summary.
    """
    extraction = extract_file_sync(file_path, None, ExtractionConfig())
    prompt_text = f"Document Content:\n{extraction.content}\n\nPlease provide a concise summary of this document."
    return [TextContent(type="text", text=prompt_text)]
186
+
187
+
188
@mcp.prompt()
def extract_structured(file_path: str) -> list[TextContent]:
    """Build a structured-analysis prompt from a document's text, entities, keywords and tables.

    Extraction runs with entity, keyword and table extraction enabled. Each of
    those sections is added to the prompt only when the result has a truthy
    value for it.

    Args:
        file_path: Location of the document to read.

    Returns:
        A single-element list holding one text message with the assembled prompt.
    """
    extraction = extract_file_sync(
        file_path,
        None,
        ExtractionConfig(extract_entities=True, extract_keywords=True, extract_tables=True),
    )

    sections = [f"Document Content:\n{extraction.content}\n\n"]

    if extraction.entities:
        entity_labels = [f"{e.text} ({e.type})" for e in extraction.entities]
        sections.append(f"Entities: {entity_labels}\n\n")

    if extraction.keywords:
        # Each keyword is indexed as (term, score); the score is shown with two decimals.
        keyword_labels = [f"{kw[0]} ({kw[1]:.2f})" for kw in extraction.keywords]
        sections.append(f"Keywords: {keyword_labels}\n\n")

    if extraction.tables:
        sections.append(f"Tables found: {len(extraction.tables)}\n\n")

    sections.append("Please analyze this document and provide structured insights.")

    return [TextContent(type="text", text="".join(sections))]
219
+
220
+
221
def main() -> None:
    """Start the module-level MCP server by calling its ``run()`` method."""
    mcp.run()


# Allow launching the server by executing this module directly.
if __name__ == "__main__":
    main()
kreuzberg/_mime_types.py CHANGED
@@ -17,6 +17,12 @@ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
17
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
18
  DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
19
19
 
20
+ EML_MIME_TYPE: Final = "message/rfc822"
21
+ MSG_MIME_TYPE: Final = "application/vnd.ms-outlook"
22
+ JSON_MIME_TYPE: Final = "application/json"
23
+ YAML_MIME_TYPE: Final = "application/x-yaml"
24
+ TOML_MIME_TYPE: Final = "application/toml"
25
+
20
26
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
21
27
  EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
22
28
  EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -127,6 +133,12 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
127
133
  ".org": "text/x-org",
128
134
  ".epub": "application/epub+zip",
129
135
  ".rtf": "application/rtf",
136
+ ".eml": EML_MIME_TYPE,
137
+ ".msg": MSG_MIME_TYPE,
138
+ ".json": JSON_MIME_TYPE,
139
+ ".yaml": YAML_MIME_TYPE,
140
+ ".yml": YAML_MIME_TYPE,
141
+ ".toml": TOML_MIME_TYPE,
130
142
  ".odt": "application/vnd.oasis.opendocument.text",
131
143
  ".docx": DOCX_MIME_TYPE,
132
144
  ".bib": "application/x-bibtex",
@@ -139,7 +151,21 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
139
151
  | IMAGE_MIME_TYPES
140
152
  | PANDOC_SUPPORTED_MIME_TYPES
141
153
  | SPREADSHEET_MIME_TYPES
142
- | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
154
+ | {
155
+ PDF_MIME_TYPE,
156
+ POWER_POINT_MIME_TYPE,
157
+ HTML_MIME_TYPE,
158
+ EML_MIME_TYPE,
159
+ MSG_MIME_TYPE,
160
+ JSON_MIME_TYPE,
161
+ YAML_MIME_TYPE,
162
+ TOML_MIME_TYPE,
163
+ "text/json",
164
+ "text/yaml",
165
+ "text/x-yaml",
166
+ "application/yaml",
167
+ "text/toml",
168
+ }
143
169
  )
144
170
 
145
171
 
@@ -4,9 +4,18 @@ from typing import Any
4
4
  from kreuzberg._ocr._base import OCRBackend
5
5
  from kreuzberg._ocr._easyocr import EasyOCRBackend
6
6
  from kreuzberg._ocr._paddleocr import PaddleBackend
7
- from kreuzberg._ocr._tesseract import TesseractBackend
7
+ from kreuzberg._ocr._tesseract import TesseractBackend, TesseractProcessPool
8
8
  from kreuzberg._types import OcrBackendType
9
9
 
10
+ __all__ = [
11
+ "EasyOCRBackend",
12
+ "OCRBackend",
13
+ "PaddleBackend",
14
+ "TesseractBackend",
15
+ "TesseractProcessPool",
16
+ "get_ocr_backend",
17
+ ]
18
+
10
19
 
11
20
  @lru_cache
12
21
  def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:
kreuzberg/_ocr/_base.py CHANGED
@@ -49,6 +49,65 @@ class OCRBackend(ABC, Generic[T]):
49
49
  """
50
50
  ...
51
51
 
52
@abstractmethod
def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
    """Run OCR on an in-memory image without using the event loop.

    Abstract hook: concrete backends must provide the implementation.

    Args:
        image: The PIL image to read text from.
        **kwargs: Backend-specific configuration options.

    Returns:
        The extraction result produced for the image.
    """
    ...
64
+
65
@abstractmethod
def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
    """Run OCR on a file without using the event loop.

    Abstract hook: concrete backends must provide the implementation.

    Args:
        path: Location of the file to process.
        **kwargs: Backend-specific configuration options.

    Returns:
        The extraction result produced for the file.
    """
    ...
77
+
78
def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
    """Process several files one after another with ``process_file_sync``.

    This default simply walks the paths in order; a backend may override it
    with a more efficient batch strategy.

    Args:
        paths: Files to process.
        **kwargs: Backend-specific configuration options, passed to every call.

    Returns:
        One extraction result per input path, in input order.
    """
    results: list[ExtractionResult] = []
    for file_path in paths:
        results.append(self.process_file_sync(file_path, **kwargs))
    return results
92
+
93
async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
    """Process several files by handing one ``process_file`` coroutine per path to ``run_taskgroup``.

    Backends may override this with a more efficient batch strategy.

    Args:
        paths: Files to process.
        **kwargs: Backend-specific configuration options, passed to every call.

    Returns:
        Whatever ``run_taskgroup`` yields for the submitted coroutines
        (documented upstream as results in input order).
    """
    # Imported inside the method, as in the original implementation.
    from kreuzberg._utils._sync import run_taskgroup

    pending = []
    for file_path in paths:
        pending.append(self.process_file(file_path, **kwargs))
    return await run_taskgroup(*pending)
110
+
52
111
  def __hash__(self) -> int:
53
112
  """Hash function for allowing caching."""
54
113
  return hash(type(self).__name__)
@@ -4,6 +4,7 @@ import warnings
4
4
  from dataclasses import dataclass
5
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
6
6
 
7
+ import numpy as np
7
8
  from PIL import Image
8
9
 
9
10
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -319,7 +320,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
319
320
  try:
320
321
  import torch
321
322
 
322
- return torch.cuda.is_available()
323
+ return bool(torch.cuda.is_available())
323
324
  except ImportError:
324
325
  return False
325
326
 
@@ -440,3 +441,93 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
440
441
  )
441
442
 
442
443
  return languages
444
+
445
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
    """Synchronously process an image and extract its text and metadata using EasyOCR.

    Args:
        image: An instance of PIL.Image representing the input image.
        **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
            ``beam_width`` is optional; when omitted, EasyOCR's own default of 5 is used.

    Returns:
        ExtractionResult: The extraction result containing text content, mime type, and metadata.

    Raises:
        OCRError: If OCR processing fails.
    """
    self._init_easyocr_sync(**kwargs)

    # ``readtext`` expects the camelCase ``beamWidth``. A missing key used to
    # raise a bare KeyError outside the try block; default to EasyOCR's value.
    beam_width = kwargs.pop("beam_width", 5)
    # These two options are stripped so they are not forwarded to ``readtext``.
    kwargs.pop("language", None)
    kwargs.pop("use_gpu", None)

    try:
        result = self._reader.readtext(
            np.array(image),
            beamWidth=beam_width,
            **kwargs,
        )

        return self._process_easyocr_result(result, image)
    except Exception as e:
        raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
474
+
475
def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
    """Synchronously process a file and extract its text and metadata using EasyOCR.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.

    Returns:
        ExtractionResult: The extraction result containing text content, mime type, and metadata.

    Raises:
        OCRError: If file loading or OCR processing fails.
    """
    self._init_easyocr_sync(**kwargs)
    try:
        # Open as a context manager so the underlying file handle is released
        # once OCR has finished (it was previously left open).
        with Image.open(path) as image:
            return self.process_image_sync(image, **kwargs)
    except Exception as e:
        raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
494
+
495
@classmethod
def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
    """Create the shared EasyOCR reader on first use; later calls return immediately.

    Args:
        **kwargs: Configuration parameters for EasyOCR; ``language`` selects the
            reader languages and the remaining options feed device resolution.

    Raises:
        MissingDependencyError: If EasyOCR is not installed.
        OCRError: If initialization fails.
    """
    reader_ready = cls._reader is not None
    if reader_ready:
        return

    try:
        import easyocr
    except ImportError as import_error:
        raise MissingDependencyError.create_for_package(
            dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
        ) from import_error

    # ``language`` is removed before device resolution so it is not passed along.
    reader_languages = cls._validate_language_code(kwargs.pop("language", "en"))

    resolved_device = cls._resolve_device_config(**kwargs)
    gpu_enabled = resolved_device.device_type in ("cuda", "mps")

    # NOTE(review): these defaults are written into the local kwargs but are
    # not forwarded to ``easyocr.Reader`` below — confirm whether that is intended.
    for option, default in (
        ("detector", True),
        ("recognizer", True),
        ("download_enabled", True),
        ("recog_network", "standard"),
    ):
        kwargs.setdefault(option, default)

    try:
        cls._reader = easyocr.Reader(reader_languages, gpu=gpu_enabled, verbose=False)
    except Exception as init_error:
        raise OCRError(f"Failed to initialize EasyOCR: {init_error}") from init_error
@@ -4,8 +4,10 @@ import platform
4
4
  import warnings
5
5
  from dataclasses import dataclass
6
6
  from importlib.util import find_spec
7
+ from pathlib import Path
7
8
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
8
9
 
10
+ import numpy as np
9
11
  from PIL import Image
10
12
 
11
13
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -364,3 +366,90 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
364
366
  "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
365
367
  },
366
368
  )
369
+
370
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
    """Run PaddleOCR on an in-memory image and return the processed result.

    Args:
        image: The PIL image to read text from; converted to RGB when needed.
        **kwargs: Configuration parameters for PaddleOCR; ``use_angle_cls``
            (default ``True``) is passed to the OCR call as ``cls``.

    Returns:
        ExtractionResult: The value produced by ``_process_paddle_result``.

    Raises:
        OCRError: If OCR processing fails.
    """
    self._init_paddle_ocr_sync(**kwargs)

    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    pixels = np.array(rgb_image)

    try:
        raw_result = self._paddle_ocr.ocr(pixels, cls=kwargs.get("use_angle_cls", True))
        return self._process_paddle_result(raw_result, rgb_image)
    except Exception as e:
        raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
394
+
395
def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
    """Synchronously process a file and extract its text and metadata using PaddleOCR.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.

    Returns:
        ExtractionResult: The extraction result containing text content, mime type, and metadata.

    Raises:
        OCRError: If file loading or OCR processing fails.
    """
    self._init_paddle_ocr_sync(**kwargs)
    try:
        # Open as a context manager so the underlying file handle is released
        # once OCR has finished (it was previously left open).
        with Image.open(path) as image:
            return self.process_image_sync(image, **kwargs)
    except Exception as e:
        raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
414
+
415
@classmethod
def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
    """Create the shared PaddleOCR engine on first use; later calls return immediately.

    Args:
        **kwargs: Configuration parameters for PaddleOCR. ``language`` selects the
            model language; the remaining options are completed with defaults
            and passed to the ``PaddleOCR`` constructor.

    Raises:
        MissingDependencyError: If PaddleOCR is not installed.
        OCRError: If initialization fails.
    """
    engine_ready = cls._paddle_ocr is not None
    if engine_ready:
        return

    try:
        from paddleocr import PaddleOCR
    except ImportError as import_error:
        raise MissingDependencyError.create_for_package(
            dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
        ) from import_error

    # ``language`` is removed before device resolution so it is not passed along.
    model_language = cls._validate_language_code(kwargs.pop("language", "en"))

    resolved_device = cls._resolve_device_config(**kwargs)
    on_cuda = resolved_device.device_type == "cuda"

    # GPU mode needs both a CUDA device and the ``paddlepaddle_gpu`` package.
    gpu_package_installed = bool(find_spec("paddlepaddle_gpu"))
    gpu_enabled = on_cuda and gpu_package_installed

    kwargs.setdefault("use_angle_cls", True)
    kwargs["use_gpu"] = gpu_enabled
    kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not gpu_enabled)
    for option, default in (
        ("det_db_thresh", 0.3),
        ("det_db_box_thresh", 0.5),
        ("det_db_unclip_ratio", 1.6),
    ):
        kwargs.setdefault(option, default)

    if on_cuda and kwargs.get("gpu_memory_limit"):
        # presumably converts a GB limit to the MB value ``gpu_mem`` expects — TODO confirm units
        kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)

    try:
        cls._paddle_ocr = PaddleOCR(lang=model_language, show_log=False, **kwargs)
    except Exception as init_error:
        raise OCRError(f"Failed to initialize PaddleOCR: {init_error}") from init_error