kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,19 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import hashlib
4
+ import os
4
5
  import re
6
+ import subprocess
5
7
  import sys
8
+ import tempfile
6
9
  from dataclasses import dataclass
7
10
  from enum import Enum
11
+ from pathlib import Path
8
12
  from typing import TYPE_CHECKING, Any, ClassVar, Final
9
13
 
10
14
  from anyio import Path as AsyncPath
11
15
  from anyio import run_process
16
+ from typing_extensions import Self
12
17
 
13
18
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
14
19
  from kreuzberg._ocr._base import OCRBackend
@@ -19,8 +24,6 @@ from kreuzberg._utils._tmp import create_temp_file
19
24
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
20
25
 
21
26
  if TYPE_CHECKING:
22
- from pathlib import Path
23
-
24
27
  from PIL.Image import Image
25
28
 
26
29
  try: # pragma: no cover
@@ -202,8 +205,10 @@ class TesseractConfig:
202
205
  - 'deu' for German
203
206
  - multiple languages combined with '+', e.g. 'eng+deu')
204
207
  """
205
- language_model_ngram_on: bool = True
206
- """Enable or disable the use of n-gram-based language models for improved text recognition."""
208
+ language_model_ngram_on: bool = False
209
+ """Enable or disable the use of n-gram-based language models for improved text recognition.
210
+
211
+ Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
207
212
  psm: PSMMode = PSMMode.AUTO
208
213
  """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
209
214
  tessedit_dont_blkrej_good_wds: bool = True
@@ -212,6 +217,8 @@ class TesseractConfig:
212
217
  """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
213
218
  tessedit_enable_dict_correction: bool = True
214
219
  """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
220
+ tessedit_char_whitelist: str = ""
221
+ """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
215
222
  tessedit_use_primary_params_model: bool = True
216
223
  """If True, forces the use of the primary parameters model for text recognition."""
217
224
  textord_space_size_is_variable: bool = True
@@ -341,7 +348,11 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
341
348
  "OFF",
342
349
  ]
343
350
  for kwarg, value in kwargs.items():
344
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
351
+ if isinstance(value, bool):
352
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
353
+ else:
354
+ # Handle string parameters (like tessedit_char_whitelist)
355
+ command.extend(["-c", f"{kwarg}={value}"])
345
356
 
346
357
  env: dict[str, Any] | None = None
347
358
  if sys.platform.startswith("linux"):
@@ -399,6 +410,225 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
399
410
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
400
411
  ) from e
401
412
 
413
def process_image_sync(
    self,
    image: Image,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process an image and extract its text and metadata.

    The image is encoded to PNG exactly once. The resulting bytes are hashed for the
    cache key and written to a temporary file that is handed to ``process_file_sync``.

    Args:
        image: An instance of PIL.Image representing the input image.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object
    """
    import io

    from kreuzberg._utils._cache import get_ocr_cache

    image_buffer = io.BytesIO()
    image.save(image_buffer, format="PNG")
    image_content = image_buffer.getvalue()

    cache_kwargs = {
        "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # NOTE(review): mark_processing presumably returns an event-like object shared with
        # the worker already handling this key - confirm against the cache implementation.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try the cache again after waiting for the other worker to complete.
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()
        # Only reserve the file name here; the handle is closed before anything is written.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
            image_path = Path(tmp_file.name)
        try:
            # Reuse the PNG bytes produced above instead of encoding the image a second time.
            # Writing inside this try guarantees the temp file is removed even if the write fails.
            image_path.write_bytes(image_content)

            result = self.process_file_sync(image_path, **kwargs)

            ocr_cache.set(result, **cache_kwargs)

            return result
        finally:
            image_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
473
+
474
def process_file_sync(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process a file and extract its text and metadata.

    ``language`` and ``psm`` are normalized before the cache key is built, so the key
    used to look up a result is the same key the result is stored under.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object

    Raises:
        OCRError: If Tesseract exits with a non-zero status, or a ``RuntimeError`` /
            ``OSError`` occurs while running it or reading its output.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    language = self._validate_language_code(kwargs.get("language", "eng"))
    psm = kwargs.get("psm", PSMMode.AUTO)
    # Everything except language/psm is forwarded to Tesseract as "-c name=value".
    tesseract_options = {name: value for name, value in kwargs.items() if name not in {"language", "psm"}}

    file_info = self._get_file_info(path)

    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted({**tesseract_options, "language": language, "psm": psm}.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # NOTE(review): mark_processing presumably returns an event-like object shared with
        # the worker already handling this key - confirm against the cache implementation.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try the cache again after waiting for the other worker to complete.
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()
        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            output_path = Path(tmp_file.name)
        # Tesseract appends ".txt" to the output base itself, so strip only the final
        # suffix (a plain str.replace would also rewrite ".txt" elsewhere in the path).
        output_base = str(output_path.with_suffix(""))
        try:
            command = self._build_tesseract_command(path, output_base, language, psm, **tesseract_options)
            self._run_tesseract_sync(command)

            output = output_path.read_text(encoding="utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            ocr_cache.set(extraction_result, **cache_kwargs)

            return extraction_result
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            output_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
545
+
546
+ def _get_file_info(self, path: Path) -> dict[str, Any]:
547
+ """Get file information for caching."""
548
+ try:
549
+ stat = path.stat()
550
+ return {
551
+ "path": str(path.resolve()),
552
+ "size": stat.st_size,
553
+ "mtime": stat.st_mtime,
554
+ }
555
+ except OSError:
556
+ return {
557
+ "path": str(path),
558
+ "size": 0,
559
+ "mtime": 0,
560
+ }
561
+
562
+ def _build_tesseract_command(
563
+ self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
564
+ ) -> list[str]:
565
+ """Build tesseract command with all parameters."""
566
+ command = [
567
+ "tesseract",
568
+ str(path),
569
+ output_base,
570
+ "-l",
571
+ language,
572
+ "--psm",
573
+ str(psm.value),
574
+ "--oem",
575
+ "1",
576
+ "--loglevel",
577
+ "OFF",
578
+ ]
579
+ for kwarg, value in kwargs.items():
580
+ if isinstance(value, bool):
581
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
582
+ else:
583
+ command.extend(["-c", f"{kwarg}={value}"])
584
+ return command
585
+
586
def _run_tesseract_sync(self, command: list[str]) -> None:
    """Execute a prepared Tesseract command and fail loudly on a non-zero exit.

    The child process inherits a copy of the current environment; on Linux
    ``OMP_THREAD_LIMIT`` is additionally set to ``"1"``. The call is limited to 30 seconds.

    Raises:
        OCRError: If the process exits with a non-zero return code; the captured
            stderr is attached as error context.
    """
    environment = dict(os.environ)
    if sys.platform.startswith("linux"):
        environment["OMP_THREAD_LIMIT"] = "1"

    completed = subprocess.run(
        command,
        check=False,
        env=environment,
        capture_output=True,
        text=True,
        timeout=30,
    )

    if completed.returncode == 0:
        return

    raise OCRError(
        "OCR failed with a non-0 return code.",
        context={"error": completed.stderr},
    )
606
+
607
@classmethod
def _validate_tesseract_version_sync(cls) -> None:
    """Check once per class that a supported Tesseract binary is available.

    Runs ``tesseract --version`` and parses the major version from its stdout. After a
    successful check ``cls._version_checked`` is set, and later calls return immediately.

    Raises:
        MissingDependencyError: If the binary cannot be found, its version cannot be
            parsed, or the major version is below ``MINIMAL_SUPPORTED_TESSERACT_VERSION``.
    """
    if cls._version_checked:
        return

    missing_message = "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."

    try:
        completed = subprocess.run(["tesseract", "--version"], capture_output=True, text=True, check=False)
    except FileNotFoundError as e:
        raise MissingDependencyError(missing_message) from e

    major_version = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", completed.stdout)
    if major_version is None or int(major_version.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
        raise MissingDependencyError(missing_message)

    cls._version_checked = True
631
+
402
632
  @staticmethod
403
633
  def _validate_language_code(language_code: str) -> str:
404
634
  """Convert a language code to Tesseract format.
@@ -430,3 +660,337 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
430
660
  "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
431
661
  },
432
662
  )
663
+
664
+
665
+ def _process_image_with_tesseract(
666
+ image_path: str,
667
+ config_dict: dict[str, Any],
668
+ ) -> dict[str, Any]:
669
+ """Process a single image with Tesseract in a separate process.
670
+
671
+ This function is designed to be executed in a subprocess.
672
+ It uses direct tesseract command execution to avoid async complications.
673
+
674
+ Args:
675
+ image_path: Path to the image file.
676
+ config_dict: Tesseract configuration as dictionary.
677
+
678
+ Returns:
679
+ OCR result as dictionary.
680
+ """
681
+ try:
682
+ with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
683
+ output_base = tmp_file.name.replace(".txt", "")
684
+
685
+ try:
686
+ language = config_dict.get("language", "eng")
687
+ psm = config_dict.get("psm", 3)
688
+
689
+ command = [
690
+ "tesseract",
691
+ image_path,
692
+ output_base,
693
+ "-l",
694
+ language,
695
+ "--psm",
696
+ str(psm),
697
+ "--oem",
698
+ "1",
699
+ "--loglevel",
700
+ "OFF",
701
+ ]
702
+
703
+ boolean_options = [
704
+ "classify_use_pre_adapted_templates",
705
+ "language_model_ngram_on",
706
+ "tessedit_dont_blkrej_good_wds",
707
+ "tessedit_dont_rowrej_good_wds",
708
+ "tessedit_enable_dict_correction",
709
+ "tessedit_use_primary_params_model",
710
+ "textord_space_size_is_variable",
711
+ "thresholding_method",
712
+ ]
713
+
714
+ for option in boolean_options:
715
+ if option in config_dict:
716
+ value = 1 if config_dict[option] else 0
717
+ command.extend(["-c", f"{option}={value}"])
718
+
719
+ env = os.environ.copy()
720
+ env["OMP_THREAD_LIMIT"] = "1"
721
+
722
+ result = subprocess.run(
723
+ command,
724
+ check=False,
725
+ env=env,
726
+ capture_output=True,
727
+ text=True,
728
+ timeout=30,
729
+ )
730
+
731
+ if result.returncode != 0:
732
+ raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
733
+
734
+ output_file = output_base + ".txt"
735
+ with Path(output_file).open(encoding="utf-8") as f:
736
+ text = f.read()
737
+
738
+ text = normalize_spaces(text)
739
+
740
+ return {
741
+ "success": True,
742
+ "text": text,
743
+ "confidence": None,
744
+ "error": None,
745
+ }
746
+
747
+ finally:
748
+ for ext in [".txt"]:
749
+ temp_file = output_base + ext
750
+ temp_path = Path(temp_file)
751
+ if temp_path.exists():
752
+ temp_path.unlink()
753
+
754
+ except Exception as e: # noqa: BLE001
755
+ return {
756
+ "success": False,
757
+ "text": "",
758
+ "confidence": None,
759
+ "error": str(e),
760
+ }
761
+
762
+
763
+ def _process_image_bytes_with_tesseract(
764
+ image_bytes: bytes,
765
+ config_dict: dict[str, Any],
766
+ ) -> dict[str, Any]:
767
+ """Process image bytes with Tesseract in a separate process.
768
+
769
+ Args:
770
+ image_bytes: Image data as bytes.
771
+ config_dict: Tesseract configuration as dictionary.
772
+
773
+ Returns:
774
+ OCR result as dictionary.
775
+ """
776
+ try:
777
+ import io
778
+
779
+ from PIL import Image
780
+
781
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
782
+ with Image.open(io.BytesIO(image_bytes)) as image:
783
+ image.save(tmp_image.name, format="PNG")
784
+ image_path = tmp_image.name
785
+
786
+ try:
787
+ return _process_image_with_tesseract(image_path, config_dict)
788
+ finally:
789
+ image_file = Path(image_path)
790
+ if image_file.exists():
791
+ image_file.unlink()
792
+
793
+ except Exception as e: # noqa: BLE001
794
+ return {
795
+ "success": False,
796
+ "text": "",
797
+ "confidence": None,
798
+ "error": str(e),
799
+ }
800
+
801
+
802
class TesseractProcessPool:
    """Process pool for parallel Tesseract OCR processing.

    Work is delegated to a ``ProcessPoolManager``. Configurations and results cross the
    process boundary as plain dictionaries. The pool can be used as an async context
    manager; leaving the context shuts the pool down.
    """

    def __init__(
        self,
        config: TesseractConfig | None = None,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Initialize the Tesseract process pool.

        Args:
            config: Default Tesseract configuration; a default ``TesseractConfig()`` is used if None.
            max_processes: Maximum number of processes, forwarded to the process manager.
            memory_limit_gb: Memory limit in GB, forwarded to the process manager.
        """
        from kreuzberg._utils._process_pool import ProcessPoolManager

        self.config = config or TesseractConfig()
        self.process_manager = ProcessPoolManager(
            max_processes=max_processes,
            memory_limit_gb=memory_limit_gb,
        )

    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
        """Convert a TesseractConfig to a plain dictionary for pickling.

        Every dataclass field is copied. A value that exposes a ``value`` attribute
        (such as an enum member) is replaced by that attribute.

        Args:
            config: Configuration to convert (uses the pool default if None).

        Returns:
            Mapping of field name to plain value.
        """
        cfg = config or self.config

        config_dict: dict[str, Any] = {}
        for field_name in cfg.__dataclass_fields__:
            value = getattr(cfg, field_name)
            config_dict[field_name] = value.value if hasattr(value, "value") else value

        return config_dict

    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
        """Convert a worker result dictionary back to an ExtractionResult.

        Args:
            result_dict: Dictionary with the keys ``success``, ``text``, ``confidence`` and ``error``.

        Returns:
            The extraction result; ``metadata`` carries ``confidence`` whenever the worker reported one.

        Raises:
            OCRError: If the worker reported a failure.
        """
        if not result_dict["success"]:
            raise OCRError(f"Tesseract processing failed: {result_dict['error']}")

        confidence = result_dict["confidence"]
        return ExtractionResult(
            content=result_dict["text"],
            mime_type=PLAIN_TEXT_MIME_TYPE,
            # Compare against None so a legitimate confidence of 0 is not discarded.
            metadata={"confidence": confidence} if confidence is not None else {},  # type: ignore[typeddict-unknown-key]
            chunks=[],
        )

    async def process_image(
        self,
        image_path: str | Path,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process a single image file with Tesseract.

        Args:
            image_path: Path to the image file.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.

        Raises:
            OCRError: If the worker reported a failure.
        """
        config_dict = self._config_to_dict(config)

        # Fixed per-task memory figure passed to the process manager.
        task_memory_mb = 80

        result_dict = await self.process_manager.submit_task(
            _process_image_with_tesseract,
            str(image_path),
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_image_bytes(
        self,
        image_bytes: bytes,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process image bytes with Tesseract.

        Args:
            image_bytes: Image data as bytes.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.

        Raises:
            OCRError: If the worker reported a failure.
        """
        config_dict = self._config_to_dict(config)

        # Memory figure grows with the payload: twice the image size plus 50 MB, at least 80 MB.
        image_size_mb = len(image_bytes) / 1024 / 1024
        task_memory_mb = max(80, image_size_mb * 2 + 50)

        result_dict = await self.process_manager.submit_task(
            _process_image_bytes_with_tesseract,
            image_bytes,
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_batch_images(
        self,
        image_paths: list[str | Path],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of images in parallel.

        Args:
            image_paths: List of image file paths.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            One OCR result per submitted path, in the order returned by the process
            manager (NOTE(review): presumably input order - confirm in ``submit_batch``).
            An empty input yields an empty list without touching the pool.

        Raises:
            OCRError: If any worker reported a failure.
        """
        if not image_paths:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(str(path), config_dict) for path in image_paths]

        # Fixed per-task memory figure passed to the process manager.
        task_memory_mb = 80

        result_dicts = await self.process_manager.submit_batch(
            _process_image_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    async def process_batch_bytes(
        self,
        image_bytes_list: list[bytes],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of image bytes in parallel.

        Args:
            image_bytes_list: List of image data as bytes.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            One OCR result per submitted payload, in the order returned by the process
            manager (NOTE(review): presumably input order - confirm in ``submit_batch``).
            An empty input yields an empty list without touching the pool.

        Raises:
            OCRError: If any worker reported a failure.
        """
        if not image_bytes_list:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]

        # One memory figure for the whole batch, derived from the average payload size.
        avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
        task_memory_mb = max(80, avg_image_size_mb * 2 + 50)

        result_dicts = await self.process_manager.submit_batch(
            _process_image_bytes_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    def get_system_info(self) -> dict[str, Any]:
        """Get system information from the process manager."""
        return self.process_manager.get_system_info()

    def shutdown(self, wait: bool = True) -> None:
        """Shutdown the process pool.

        Args:
            wait: Forwarded unchanged to the process manager's ``shutdown``.
        """
        self.process_manager.shutdown(wait=wait)

    async def __aenter__(self) -> Self:
        """Async context manager entry; returns the pool itself."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Async context manager exit; shuts the pool down."""
        self.shutdown()
kreuzberg/_registry.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from functools import lru_cache
4
4
  from typing import TYPE_CHECKING, ClassVar
5
5
 
6
+ from kreuzberg._extractors._email import EmailExtractor
6
7
  from kreuzberg._extractors._html import HTMLExtractor
7
8
  from kreuzberg._extractors._image import ImageExtractor
8
9
  from kreuzberg._extractors._pandoc import (
@@ -19,6 +20,7 @@ from kreuzberg._extractors._pandoc import (
19
20
  from kreuzberg._extractors._pdf import PDFExtractor
20
21
  from kreuzberg._extractors._presentation import PresentationExtractor
21
22
  from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
23
+ from kreuzberg._extractors._structured import StructuredDataExtractor
22
24
 
23
25
  if TYPE_CHECKING:
24
26
  from kreuzberg._extractors._base import Extractor
@@ -40,6 +42,8 @@ class ExtractorRegistry:
40
42
  PresentationExtractor,
41
43
  SpreadSheetExtractor,
42
44
  HTMLExtractor,
45
+ EmailExtractor,
46
+ StructuredDataExtractor,
43
47
  MarkdownExtractor,
44
48
  ImageExtractor,
45
49
  BibliographyExtractor,