kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,19 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import hashlib
4
+ import os
4
5
  import re
6
+ import subprocess
5
7
  import sys
8
+ import tempfile
6
9
  from dataclasses import dataclass
7
10
  from enum import Enum
11
+ from pathlib import Path
8
12
  from typing import TYPE_CHECKING, Any, ClassVar, Final
9
13
 
10
14
  from anyio import Path as AsyncPath
11
15
  from anyio import run_process
16
+ from typing_extensions import Self
12
17
 
13
18
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
14
19
  from kreuzberg._ocr._base import OCRBackend
@@ -19,8 +24,6 @@ from kreuzberg._utils._tmp import create_temp_file
19
24
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
20
25
 
21
26
  if TYPE_CHECKING:
22
- from pathlib import Path
23
-
24
27
  from PIL.Image import Image
25
28
 
26
29
  try: # pragma: no cover
@@ -206,7 +209,7 @@ class TesseractConfig:
206
209
  """Enable or disable the use of n-gram-based language models for improved text recognition.
207
210
 
208
211
  Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
209
- psm: PSMMode = PSMMode.AUTO_ONLY
212
+ psm: PSMMode = PSMMode.AUTO
210
213
  """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
211
214
  tessedit_dont_blkrej_good_wds: bool = True
212
215
  """If True, prevents block rejection of words identified as good, improving text output quality."""
@@ -345,7 +348,11 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
345
348
  "OFF",
346
349
  ]
347
350
  for kwarg, value in kwargs.items():
348
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
351
+ if isinstance(value, bool):
352
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
353
+ else:
354
+ # Handle string parameters (like tessedit_char_whitelist)
355
+ command.extend(["-c", f"{kwarg}={value}"])
349
356
 
350
357
  env: dict[str, Any] | None = None
351
358
  if sys.platform.startswith("linux"):
@@ -403,6 +410,225 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
403
410
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
404
411
  ) from e
405
412
 
413
+ def process_image_sync(
414
+ self,
415
+ image: Image,
416
+ **kwargs: Unpack[TesseractConfig],
417
+ ) -> ExtractionResult:
418
+ """Synchronously process an image and extract its text and metadata.
419
+
420
+ Args:
421
+ image: An instance of PIL.Image representing the input image.
422
+ **kwargs: Any kwargs related to the given backend
423
+
424
+ Returns:
425
+ The extraction result object
426
+ """
427
+ import io
428
+
429
+ from kreuzberg._utils._cache import get_ocr_cache
430
+
431
+ image_buffer = io.BytesIO()
432
+ image.save(image_buffer, format="PNG")
433
+ image_content = image_buffer.getvalue()
434
+
435
+ cache_kwargs = {
436
+ "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
437
+ "ocr_backend": "tesseract",
438
+ "ocr_config": str(sorted(kwargs.items())),
439
+ }
440
+
441
+ ocr_cache = get_ocr_cache()
442
+ cached_result = ocr_cache.get(**cache_kwargs)
443
+ if cached_result is not None:
444
+ return cached_result
445
+
446
+ if ocr_cache.is_processing(**cache_kwargs):
447
+ event = ocr_cache.mark_processing(**cache_kwargs)
448
+ event.wait()
449
+
450
+ # Try cache again after waiting for other process to complete
451
+ cached_result = ocr_cache.get(**cache_kwargs)
452
+ if cached_result is not None:
453
+ return cached_result
454
+
455
+ ocr_cache.mark_processing(**cache_kwargs)
456
+
457
+ try:
458
+ self._validate_tesseract_version_sync()
459
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
460
+ image_path = Path(tmp_file.name)
461
+ image.save(str(image_path), format="PNG")
462
+ try:
463
+ result = self.process_file_sync(image_path, **kwargs)
464
+
465
+ ocr_cache.set(result, **cache_kwargs)
466
+
467
+ return result
468
+ finally:
469
+ if image_path.exists():
470
+ image_path.unlink()
471
+ finally:
472
+ ocr_cache.mark_complete(**cache_kwargs)
473
+
474
+ def process_file_sync(
475
+ self,
476
+ path: Path,
477
+ **kwargs: Unpack[TesseractConfig],
478
+ ) -> ExtractionResult:
479
+ """Synchronously process a file and extract its text and metadata.
480
+
481
+ Args:
482
+ path: A Path object representing the file to be processed.
483
+ **kwargs: Any kwargs related to the given backend
484
+
485
+ Returns:
486
+ The extraction result object
487
+ """
488
+ from kreuzberg._utils._cache import get_ocr_cache
489
+
490
+ file_info = self._get_file_info(path)
491
+
492
+ cache_kwargs = {
493
+ "file_info": str(sorted(file_info.items())),
494
+ "ocr_backend": "tesseract",
495
+ "ocr_config": str(sorted(kwargs.items())),
496
+ }
497
+
498
+ ocr_cache = get_ocr_cache()
499
+ cached_result = ocr_cache.get(**cache_kwargs)
500
+ if cached_result is not None:
501
+ return cached_result
502
+
503
+ if ocr_cache.is_processing(**cache_kwargs):
504
+ event = ocr_cache.mark_processing(**cache_kwargs)
505
+ event.wait()
506
+
507
+ # Try cache again after waiting for other process to complete
508
+ cached_result = ocr_cache.get(**cache_kwargs)
509
+ if cached_result is not None:
510
+ return cached_result
511
+
512
+ ocr_cache.mark_processing(**cache_kwargs)
513
+
514
+ try:
515
+ self._validate_tesseract_version_sync()
516
+ with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
517
+ output_base = tmp_file.name.replace(".txt", "")
518
+ language = self._validate_language_code(kwargs.pop("language", "eng"))
519
+ psm = kwargs.pop("psm", PSMMode.AUTO)
520
+ try:
521
+ command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
522
+ self._run_tesseract_sync(command)
523
+
524
+ output_path = Path(output_base + ".txt")
525
+ with output_path.open(encoding="utf-8") as f:
526
+ output = f.read()
527
+ extraction_result = ExtractionResult(
528
+ content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
529
+ )
530
+
531
+ final_cache_kwargs = cache_kwargs.copy()
532
+ final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
533
+ ocr_cache.set(extraction_result, **final_cache_kwargs)
534
+
535
+ return extraction_result
536
+ except (RuntimeError, OSError) as e:
537
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
538
+ finally:
539
+ for ext in [".txt"]:
540
+ temp_file = Path(output_base + ext)
541
+ if temp_file.exists():
542
+ temp_file.unlink()
543
+ finally:
544
+ ocr_cache.mark_complete(**cache_kwargs)
545
+
546
+ def _get_file_info(self, path: Path) -> dict[str, Any]:
547
+ """Get file information for caching."""
548
+ try:
549
+ stat = path.stat()
550
+ return {
551
+ "path": str(path.resolve()),
552
+ "size": stat.st_size,
553
+ "mtime": stat.st_mtime,
554
+ }
555
+ except OSError:
556
+ return {
557
+ "path": str(path),
558
+ "size": 0,
559
+ "mtime": 0,
560
+ }
561
+
562
+ def _build_tesseract_command(
563
+ self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
564
+ ) -> list[str]:
565
+ """Build tesseract command with all parameters."""
566
+ command = [
567
+ "tesseract",
568
+ str(path),
569
+ output_base,
570
+ "-l",
571
+ language,
572
+ "--psm",
573
+ str(psm.value),
574
+ "--oem",
575
+ "1",
576
+ "--loglevel",
577
+ "OFF",
578
+ ]
579
+ for kwarg, value in kwargs.items():
580
+ if isinstance(value, bool):
581
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
582
+ else:
583
+ command.extend(["-c", f"{kwarg}={value}"])
584
+ return command
585
+
586
+ def _run_tesseract_sync(self, command: list[str]) -> None:
587
+ """Run tesseract command synchronously."""
588
+ env = os.environ.copy()
589
+ if sys.platform.startswith("linux"):
590
+ env["OMP_THREAD_LIMIT"] = "1"
591
+
592
+ result = subprocess.run(
593
+ command,
594
+ check=False,
595
+ env=env,
596
+ capture_output=True,
597
+ text=True,
598
+ timeout=30,
599
+ )
600
+
601
+ if result.returncode != 0:
602
+ raise OCRError(
603
+ "OCR failed with a non-0 return code.",
604
+ context={"error": result.stderr},
605
+ )
606
+
607
+ @classmethod
608
+ def _validate_tesseract_version_sync(cls) -> None:
609
+ """Synchronously validate that Tesseract is installed and is version 5 or above.
610
+
611
+ Raises:
612
+ MissingDependencyError: If Tesseract is not installed or is below version 5.
613
+ """
614
+ try:
615
+ if cls._version_checked:
616
+ return
617
+
618
+ command = ["tesseract", "--version"]
619
+ result = subprocess.run(command, capture_output=True, text=True, check=False)
620
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
621
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
622
+ raise MissingDependencyError(
623
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
624
+ )
625
+
626
+ cls._version_checked = True
627
+ except FileNotFoundError as e:
628
+ raise MissingDependencyError(
629
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
630
+ ) from e
631
+
406
632
  @staticmethod
407
633
  def _validate_language_code(language_code: str) -> str:
408
634
  """Convert a language code to Tesseract format.
@@ -434,3 +660,337 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
434
660
  "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
435
661
  },
436
662
  )
663
+
664
+
665
+ def _process_image_with_tesseract(
666
+ image_path: str,
667
+ config_dict: dict[str, Any],
668
+ ) -> dict[str, Any]:
669
+ """Process a single image with Tesseract in a separate process.
670
+
671
+ This function is designed to be executed in a subprocess.
672
+ It uses direct tesseract command execution to avoid async complications.
673
+
674
+ Args:
675
+ image_path: Path to the image file.
676
+ config_dict: Tesseract configuration as dictionary.
677
+
678
+ Returns:
679
+ OCR result as dictionary.
680
+ """
681
+ try:
682
+ with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
683
+ output_base = tmp_file.name.replace(".txt", "")
684
+
685
+ try:
686
+ language = config_dict.get("language", "eng")
687
+ psm = config_dict.get("psm", 3)
688
+
689
+ command = [
690
+ "tesseract",
691
+ image_path,
692
+ output_base,
693
+ "-l",
694
+ language,
695
+ "--psm",
696
+ str(psm),
697
+ "--oem",
698
+ "1",
699
+ "--loglevel",
700
+ "OFF",
701
+ ]
702
+
703
+ boolean_options = [
704
+ "classify_use_pre_adapted_templates",
705
+ "language_model_ngram_on",
706
+ "tessedit_dont_blkrej_good_wds",
707
+ "tessedit_dont_rowrej_good_wds",
708
+ "tessedit_enable_dict_correction",
709
+ "tessedit_use_primary_params_model",
710
+ "textord_space_size_is_variable",
711
+ "thresholding_method",
712
+ ]
713
+
714
+ for option in boolean_options:
715
+ if option in config_dict:
716
+ value = 1 if config_dict[option] else 0
717
+ command.extend(["-c", f"{option}={value}"])
718
+
719
+ env = os.environ.copy()
720
+ env["OMP_THREAD_LIMIT"] = "1"
721
+
722
+ result = subprocess.run(
723
+ command,
724
+ check=False,
725
+ env=env,
726
+ capture_output=True,
727
+ text=True,
728
+ timeout=30,
729
+ )
730
+
731
+ if result.returncode != 0:
732
+ raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
733
+
734
+ output_file = output_base + ".txt"
735
+ with Path(output_file).open(encoding="utf-8") as f:
736
+ text = f.read()
737
+
738
+ text = normalize_spaces(text)
739
+
740
+ return {
741
+ "success": True,
742
+ "text": text,
743
+ "confidence": None,
744
+ "error": None,
745
+ }
746
+
747
+ finally:
748
+ for ext in [".txt"]:
749
+ temp_file = output_base + ext
750
+ temp_path = Path(temp_file)
751
+ if temp_path.exists():
752
+ temp_path.unlink()
753
+
754
+ except Exception as e: # noqa: BLE001
755
+ return {
756
+ "success": False,
757
+ "text": "",
758
+ "confidence": None,
759
+ "error": str(e),
760
+ }
761
+
762
+
763
+ def _process_image_bytes_with_tesseract(
764
+ image_bytes: bytes,
765
+ config_dict: dict[str, Any],
766
+ ) -> dict[str, Any]:
767
+ """Process image bytes with Tesseract in a separate process.
768
+
769
+ Args:
770
+ image_bytes: Image data as bytes.
771
+ config_dict: Tesseract configuration as dictionary.
772
+
773
+ Returns:
774
+ OCR result as dictionary.
775
+ """
776
+ try:
777
+ import io
778
+
779
+ from PIL import Image
780
+
781
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
782
+ with Image.open(io.BytesIO(image_bytes)) as image:
783
+ image.save(tmp_image.name, format="PNG")
784
+ image_path = tmp_image.name
785
+
786
+ try:
787
+ return _process_image_with_tesseract(image_path, config_dict)
788
+ finally:
789
+ image_file = Path(image_path)
790
+ if image_file.exists():
791
+ image_file.unlink()
792
+
793
+ except Exception as e: # noqa: BLE001
794
+ return {
795
+ "success": False,
796
+ "text": "",
797
+ "confidence": None,
798
+ "error": str(e),
799
+ }
800
+
801
+
802
+ class TesseractProcessPool:
803
+ """Process pool for parallel Tesseract OCR processing."""
804
+
805
+ def __init__(
806
+ self,
807
+ config: TesseractConfig | None = None,
808
+ max_processes: int | None = None,
809
+ memory_limit_gb: float | None = None,
810
+ ) -> None:
811
+ """Initialize the Tesseract process pool.
812
+
813
+ Args:
814
+ config: Default Tesseract configuration.
815
+ max_processes: Maximum number of processes.
816
+ memory_limit_gb: Memory limit in GB.
817
+ """
818
+ from kreuzberg._utils._process_pool import ProcessPoolManager
819
+
820
+ self.config = config or TesseractConfig()
821
+ self.process_manager = ProcessPoolManager(
822
+ max_processes=max_processes,
823
+ memory_limit_gb=memory_limit_gb,
824
+ )
825
+
826
+ def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
827
+ """Convert TesseractConfig to dictionary for pickling."""
828
+ cfg = config or self.config
829
+
830
+ config_dict = {}
831
+ for field_name in cfg.__dataclass_fields__:
832
+ value = getattr(cfg, field_name)
833
+
834
+ if hasattr(value, "value"):
835
+ config_dict[field_name] = value.value
836
+ else:
837
+ config_dict[field_name] = value
838
+
839
+ return config_dict
840
+
841
+ def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
842
+ """Convert result dictionary back to OCRResult."""
843
+ if not result_dict["success"]:
844
+ raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
845
+
846
+ return ExtractionResult(
847
+ content=result_dict["text"],
848
+ mime_type=PLAIN_TEXT_MIME_TYPE,
849
+ metadata={"confidence": result_dict["confidence"]} if result_dict["confidence"] else {}, # type: ignore[typeddict-unknown-key]
850
+ chunks=[],
851
+ )
852
+
853
+ async def process_image(
854
+ self,
855
+ image_path: str | Path,
856
+ config: TesseractConfig | None = None,
857
+ ) -> ExtractionResult:
858
+ """Process a single image file with Tesseract.
859
+
860
+ Args:
861
+ image_path: Path to the image file.
862
+ config: Tesseract configuration (uses default if None).
863
+
864
+ Returns:
865
+ OCR result.
866
+ """
867
+ config_dict = self._config_to_dict(config)
868
+
869
+ task_memory_mb = 80
870
+
871
+ result_dict = await self.process_manager.submit_task(
872
+ _process_image_with_tesseract,
873
+ str(image_path),
874
+ config_dict,
875
+ task_memory_mb=task_memory_mb,
876
+ )
877
+
878
+ return self._result_from_dict(result_dict)
879
+
880
+ async def process_image_bytes(
881
+ self,
882
+ image_bytes: bytes,
883
+ config: TesseractConfig | None = None,
884
+ ) -> ExtractionResult:
885
+ """Process image bytes with Tesseract.
886
+
887
+ Args:
888
+ image_bytes: Image data as bytes.
889
+ config: Tesseract configuration (uses default if None).
890
+
891
+ Returns:
892
+ OCR result.
893
+ """
894
+ config_dict = self._config_to_dict(config)
895
+
896
+ image_size_mb = len(image_bytes) / 1024 / 1024
897
+ task_memory_mb = max(80, image_size_mb * 2 + 50)
898
+
899
+ result_dict = await self.process_manager.submit_task(
900
+ _process_image_bytes_with_tesseract,
901
+ image_bytes,
902
+ config_dict,
903
+ task_memory_mb=task_memory_mb,
904
+ )
905
+
906
+ return self._result_from_dict(result_dict)
907
+
908
+ async def process_batch_images(
909
+ self,
910
+ image_paths: list[str | Path],
911
+ config: TesseractConfig | None = None,
912
+ max_concurrent: int | None = None,
913
+ ) -> list[ExtractionResult]:
914
+ """Process a batch of images in parallel.
915
+
916
+ Args:
917
+ image_paths: List of image file paths.
918
+ config: Tesseract configuration (uses default if None).
919
+ max_concurrent: Maximum concurrent processes.
920
+
921
+ Returns:
922
+ List of OCR results in the same order as input.
923
+ """
924
+ if not image_paths:
925
+ return []
926
+
927
+ config_dict = self._config_to_dict(config)
928
+
929
+ arg_batches = [(str(path), config_dict) for path in image_paths]
930
+
931
+ task_memory_mb = 80
932
+
933
+ result_dicts = await self.process_manager.submit_batch(
934
+ _process_image_with_tesseract,
935
+ arg_batches,
936
+ task_memory_mb=task_memory_mb,
937
+ max_concurrent=max_concurrent,
938
+ )
939
+
940
+ return [self._result_from_dict(result_dict) for result_dict in result_dicts]
941
+
942
+ async def process_batch_bytes(
943
+ self,
944
+ image_bytes_list: list[bytes],
945
+ config: TesseractConfig | None = None,
946
+ max_concurrent: int | None = None,
947
+ ) -> list[ExtractionResult]:
948
+ """Process a batch of image bytes in parallel.
949
+
950
+ Args:
951
+ image_bytes_list: List of image data as bytes.
952
+ config: Tesseract configuration (uses default if None).
953
+ max_concurrent: Maximum concurrent processes.
954
+
955
+ Returns:
956
+ List of OCR results in the same order as input.
957
+ """
958
+ if not image_bytes_list:
959
+ return []
960
+
961
+ config_dict = self._config_to_dict(config)
962
+
963
+ arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]
964
+
965
+ avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
966
+ task_memory_mb = max(80, avg_image_size_mb * 2 + 50)
967
+
968
+ result_dicts = await self.process_manager.submit_batch(
969
+ _process_image_bytes_with_tesseract,
970
+ arg_batches,
971
+ task_memory_mb=task_memory_mb,
972
+ max_concurrent=max_concurrent,
973
+ )
974
+
975
+ return [self._result_from_dict(result_dict) for result_dict in result_dicts]
976
+
977
+ def get_system_info(self) -> dict[str, Any]:
978
+ """Get system information from the process manager."""
979
+ return self.process_manager.get_system_info()
980
+
981
+ def shutdown(self, wait: bool = True) -> None:
982
+ """Shutdown the process pool."""
983
+ self.process_manager.shutdown(wait=wait)
984
+
985
+ async def __aenter__(self) -> Self:
986
+ """Async context manager entry."""
987
+ return self
988
+
989
+ async def __aexit__(
990
+ self,
991
+ exc_type: type[BaseException] | None,
992
+ exc_val: BaseException | None,
993
+ exc_tb: object,
994
+ ) -> None:
995
+ """Async context manager exit."""
996
+ self.shutdown()
kreuzberg/_registry.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from functools import lru_cache
4
4
  from typing import TYPE_CHECKING, ClassVar
5
5
 
6
+ from kreuzberg._extractors._email import EmailExtractor
6
7
  from kreuzberg._extractors._html import HTMLExtractor
7
8
  from kreuzberg._extractors._image import ImageExtractor
8
9
  from kreuzberg._extractors._pandoc import (
@@ -19,6 +20,7 @@ from kreuzberg._extractors._pandoc import (
19
20
  from kreuzberg._extractors._pdf import PDFExtractor
20
21
  from kreuzberg._extractors._presentation import PresentationExtractor
21
22
  from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
23
+ from kreuzberg._extractors._structured import StructuredDataExtractor
22
24
 
23
25
  if TYPE_CHECKING:
24
26
  from kreuzberg._extractors._base import Extractor
@@ -40,6 +42,8 @@ class ExtractorRegistry:
40
42
  PresentationExtractor,
41
43
  SpreadSheetExtractor,
42
44
  HTMLExtractor,
45
+ EmailExtractor,
46
+ StructuredDataExtractor,
43
47
  MarkdownExtractor,
44
48
  ImageExtractor,
45
49
  BibliographyExtractor,