kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
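The new extractors and MIME registrations below wire the added formats into the existing extraction API, so callers need no new entry points. A minimal sketch, assuming kreuzberg's public extract_file_sync in kreuzberg.extraction (the routing is inferred from the file list above, not from the hunks below):

```python
# Hypothetical usage: .eml should route to _extractors/_email.py and
# .toml to _extractors/_structured.py via the new MIME mappings.
from kreuzberg.extraction import extract_file_sync

print(extract_file_sync("message.eml").content)
print(extract_file_sync("settings.toml").content)
```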
kreuzberg/_gmft.py CHANGED
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal
 
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 
 if TYPE_CHECKING:
     from os import PathLike
@@ -196,9 +201,7 @@ async def extract_tables( # noqa: PLR0915
 
     try:
         if use_isolated_process:
-            from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
-
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)
 
             await table_cache.aset(result, **cache_kwargs)
 
@@ -314,9 +317,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]
 
     if use_isolated_process:
-        from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
-
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)
 
         table_cache.set(result, **cache_kwargs)
 
@@ -378,3 +379,309 @@ def extract_tables_sync(
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                import io
+
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        ParsingError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of _extract_tables_isolated.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        ParsingError: If extraction fails or times out
+    """
+    import anyio
+
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
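The three functions above replace the deleted kreuzberg/_multiprocessing modules with one self-contained crash-isolation pattern: spawn a fresh interpreter, let the child post (success, payload) on a queue, and poll in the parent while watching exitcode for SIGSEGV. A stripped-down sketch of the same pattern with a toy worker standing in for GMFT (all names here are illustrative, not kreuzberg API):

```python
from __future__ import annotations

import multiprocessing as mp
import queue
import signal
import time


def _worker(result_queue) -> None:
    # A real worker would make the crash-prone native call here.
    result_queue.put((True, "payload"))


def run_isolated(timeout: float = 5.0) -> object:
    ctx = mp.get_context("spawn")  # fresh interpreter, no inherited state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_worker, args=(result_queue,))
    process.start()
    try:
        deadline = time.monotonic() + timeout
        while True:
            try:
                success, payload = result_queue.get_nowait()
                break
            except queue.Empty:
                if time.monotonic() > deadline:
                    raise TimeoutError("isolated worker timed out") from None
                if not process.is_alive():
                    # A segfault surfaces as a negative exit code.
                    if process.exitcode == -signal.SIGSEGV:
                        raise RuntimeError("worker segfaulted")
                    raise RuntimeError(f"worker died with exit code {process.exitcode}")
                time.sleep(0.1)
        if not success:
            raise RuntimeError(f"worker failed: {payload}")
        return payload
    finally:
        if process.is_alive():
            process.terminate()
            process.join(timeout=5)


if __name__ == "__main__":
    print(run_isolated())  # payload
```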
kreuzberg/_mime_types.py CHANGED
@@ -17,6 +17,12 @@ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 
+EML_MIME_TYPE: Final = "message/rfc822"
+MSG_MIME_TYPE: Final = "application/vnd.ms-outlook"
+JSON_MIME_TYPE: Final = "application/json"
+YAML_MIME_TYPE: Final = "application/x-yaml"
+TOML_MIME_TYPE: Final = "application/toml"
+
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
 EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -127,6 +133,12 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
     ".org": "text/x-org",
     ".epub": "application/epub+zip",
     ".rtf": "application/rtf",
+    ".eml": EML_MIME_TYPE,
+    ".msg": MSG_MIME_TYPE,
+    ".json": JSON_MIME_TYPE,
+    ".yaml": YAML_MIME_TYPE,
+    ".yml": YAML_MIME_TYPE,
+    ".toml": TOML_MIME_TYPE,
     ".odt": "application/vnd.oasis.opendocument.text",
     ".docx": DOCX_MIME_TYPE,
     ".bib": "application/x-bibtex",
@@ -139,7 +151,21 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
     | SPREADSHEET_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+    | {
+        PDF_MIME_TYPE,
+        POWER_POINT_MIME_TYPE,
+        HTML_MIME_TYPE,
+        EML_MIME_TYPE,
+        MSG_MIME_TYPE,
+        JSON_MIME_TYPE,
+        YAML_MIME_TYPE,
+        TOML_MIME_TYPE,
+        "text/json",
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        "text/toml",
+    }
 )
 
 
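Since every new format appears in both EXT_TO_MIME_TYPE and SUPPORTED_MIME_TYPES, detection reduces to a suffix lookup plus a membership check. A minimal sketch (guess_mime is illustrative, not a kreuzberg helper):

```python
from pathlib import Path

from kreuzberg._mime_types import EXT_TO_MIME_TYPE, SUPPORTED_MIME_TYPES


def guess_mime(filename: str) -> str | None:
    # Map the extension, then confirm kreuzberg can actually extract it.
    mime = EXT_TO_MIME_TYPE.get(Path(filename).suffix.lower())
    return mime if mime in SUPPORTED_MIME_TYPES else None


print(guess_mime("report.eml"))   # message/rfc822
print(guess_mime("config.toml"))  # application/toml
```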
kreuzberg/_ocr/__init__.py CHANGED
@@ -4,9 +4,18 @@ from typing import Any
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._easyocr import EasyOCRBackend
 from kreuzberg._ocr._paddleocr import PaddleBackend
-from kreuzberg._ocr._tesseract import TesseractBackend
+from kreuzberg._ocr._tesseract import TesseractBackend, TesseractProcessPool
 from kreuzberg._types import OcrBackendType
 
+__all__ = [
+    "EasyOCRBackend",
+    "OCRBackend",
+    "PaddleBackend",
+    "TesseractBackend",
+    "TesseractProcessPool",
+    "get_ocr_backend",
+]
+
 
 @lru_cache
 def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:
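Because get_ocr_backend is decorated with @lru_cache, each backend is instantiated once per process and reused. Assuming the "tesseract" key maps to TesseractBackend (the body of get_ocr_backend is not shown in this hunk):

```python
from kreuzberg._ocr import get_ocr_backend

backend = get_ocr_backend("tesseract")
# lru_cache guarantees the same instance on repeat lookups.
assert backend is get_ocr_backend("tesseract")
```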
kreuzberg/_ocr/_base.py CHANGED
@@ -49,6 +49,65 @@ class OCRBackend(ABC, Generic[T]):
         """
         ...
 
+    @abstractmethod
+    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata.
+
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Any kwargs related to the given backend
+
+        Returns:
+            The extraction result object
+        """
+        ...
+
+    @abstractmethod
+    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata.
+
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Any kwargs related to the given backend
+
+        Returns:
+            The extraction result object
+        """
+        ...
+
+    def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
+        """Synchronously process a batch of files and extract their text and metadata.
+
+        Default implementation processes files sequentially. Backends can override
+        for more efficient batch processing.
+
+        Args:
+            paths: List of Path objects representing files to be processed.
+            **kwargs: Any kwargs related to the given backend
+
+        Returns:
+            List of extraction result objects in the same order as input paths
+        """
+        return [self.process_file_sync(path, **kwargs) for path in paths]
+
+    async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
+        """Asynchronously process a batch of files and extract their text and metadata.
+
+        Default implementation processes files concurrently. Backends can override
+        for more efficient batch processing.
+
+        Args:
+            paths: List of Path objects representing files to be processed.
+            **kwargs: Any kwargs related to the given backend
+
+        Returns:
+            List of extraction result objects in the same order as input paths
+        """
+        from kreuzberg._utils._sync import run_taskgroup
+
+        tasks = [self.process_file(path, **kwargs) for path in paths]
+        return await run_taskgroup(*tasks)
+
     def __hash__(self) -> int:
         """Hash function for allowing caching."""
         return hash(type(self).__name__)
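The two batch methods are template methods: a backend implements only the per-file hooks and inherits order-preserving batch behavior. A toy illustration of that shape (Backend and Echo are hypothetical, not kreuzberg classes):

```python
from abc import ABC, abstractmethod


class Backend(ABC):
    @abstractmethod
    def process_file_sync(self, path: str) -> str: ...

    def process_batch_sync(self, paths: list[str]) -> list[str]:
        # Mirrors the default above: sequential, results in input order.
        return [self.process_file_sync(p) for p in paths]


class Echo(Backend):
    def process_file_sync(self, path: str) -> str:
        return f"text from {path}"


print(Echo().process_batch_sync(["a.png", "b.png"]))
```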
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -4,6 +4,7 @@ import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
 
+import numpy as np
 from PIL import Image
 
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -440,3 +441,93 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         )
 
         return languages
+
+    def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata using EasyOCR.
+
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        self._init_easyocr_sync(**kwargs)
+
+        beam_width = kwargs.pop("beam_width")
+        kwargs.pop("language", None)
+        kwargs.pop("use_gpu", None)
+
+        try:
+            result = self._reader.readtext(
+                np.array(image),
+                beamWidth=beam_width,
+                **kwargs,
+            )
+
+            return self._process_easyocr_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
+
+    def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata using EasyOCR.
+
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        self._init_easyocr_sync(**kwargs)
+        try:
+            image = Image.open(path)
+            return self.process_image_sync(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
+
+    @classmethod
+    def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
+        """Synchronously initialize EasyOCR with the provided configuration.
+
+        Args:
+            **kwargs: Configuration parameters for EasyOCR including language, etc.
+
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._reader is not None:
+            return
+
+        try:
+            import easyocr
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
+            ) from e
+
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type in ("cuda", "mps")
+
+        kwargs.setdefault("detector", True)
+        kwargs.setdefault("recognizer", True)
+        kwargs.setdefault("download_enabled", True)
+        kwargs.setdefault("recog_network", "standard")
+
+        try:
+            cls._reader = easyocr.Reader(
+                languages,
+                gpu=use_gpu,
+                verbose=False,
+            )
+        except Exception as e:
+            raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
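A hedged usage sketch for the new synchronous EasyOCR path. Note that process_image_sync pops beam_width without a default, so this sketch passes it explicitly; 5 is EasyOCR's own readtext default, an assumption rather than a kreuzberg default. Requires the easyocr extra:

```python
from pathlib import Path

from kreuzberg._ocr import EasyOCRBackend

backend = EasyOCRBackend()
# language and beam_width are EasyOCRConfig fields.
result = backend.process_file_sync(Path("scan.png"), language="en", beam_width=5)
print(result.content)
```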
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -4,8 +4,10 @@ import platform
 import warnings
 from dataclasses import dataclass
 from importlib.util import find_spec
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
 
+import numpy as np
 from PIL import Image
 
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -364,3 +366,90 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
         },
     )
+
+    def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata using PaddleOCR.
+
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        self._init_paddle_ocr_sync(**kwargs)
+
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+
+        image_np = np.array(image)
+        try:
+            result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
+            return self._process_paddle_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
+
+    def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata using PaddleOCR.
+
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        self._init_paddle_ocr_sync(**kwargs)
+        try:
+            image = Image.open(path)
+            return self.process_image_sync(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
+
+    @classmethod
+    def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
+        """Synchronously initialize PaddleOCR with the provided configuration.
+
+        Args:
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Raises:
+            MissingDependencyError: If PaddleOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._paddle_ocr is not None:
+            return
+
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+            ) from e
+
+        language = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type == "cuda"
+
+        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
+        kwargs.setdefault("use_angle_cls", True)
+        kwargs["use_gpu"] = use_gpu and has_gpu_package
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
+        kwargs.setdefault("det_db_thresh", 0.3)
+        kwargs.setdefault("det_db_box_thresh", 0.5)
+        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+
+        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
+            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+
+        try:
+            cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
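And the PaddleOCR counterpart, again as a hedged sketch (requires the paddleocr extra; GPU use and MKL-DNN are negotiated inside _init_paddle_ocr_sync above, and non-RGB images are converted automatically):

```python
from PIL import Image

from kreuzberg._ocr import PaddleBackend

backend = PaddleBackend()
with Image.open("invoice.png") as image:
    result = backend.process_image_sync(image, language="en")
print(result.content)
```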