kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_entity_extraction.py +1 -2
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +21 -36
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +81 -48
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +314 -7
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +91 -0
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +564 -4
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +180 -7
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +4 -22
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.7.0.dist-info/RECORD +0 -56
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import hashlib
|
4
|
+
import os
|
4
5
|
import re
|
6
|
+
import subprocess
|
5
7
|
import sys
|
8
|
+
import tempfile
|
6
9
|
from dataclasses import dataclass
|
7
10
|
from enum import Enum
|
11
|
+
from pathlib import Path
|
8
12
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
9
13
|
|
10
14
|
from anyio import Path as AsyncPath
|
11
15
|
from anyio import run_process
|
16
|
+
from typing_extensions import Self
|
12
17
|
|
13
18
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
14
19
|
from kreuzberg._ocr._base import OCRBackend
|
@@ -19,8 +24,6 @@ from kreuzberg._utils._tmp import create_temp_file
|
|
19
24
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
20
25
|
|
21
26
|
if TYPE_CHECKING:
|
22
|
-
from pathlib import Path
|
23
|
-
|
24
27
|
from PIL.Image import Image
|
25
28
|
|
26
29
|
try: # pragma: no cover
|
@@ -206,7 +209,7 @@ class TesseractConfig:
|
|
206
209
|
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
207
210
|
|
208
211
|
Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
|
209
|
-
psm: PSMMode = PSMMode.
|
212
|
+
psm: PSMMode = PSMMode.AUTO
|
210
213
|
"""Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
|
211
214
|
tessedit_dont_blkrej_good_wds: bool = True
|
212
215
|
"""If True, prevents block rejection of words identified as good, improving text output quality."""
|
@@ -345,7 +348,11 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
345
348
|
"OFF",
|
346
349
|
]
|
347
350
|
for kwarg, value in kwargs.items():
|
348
|
-
|
351
|
+
if isinstance(value, bool):
|
352
|
+
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
353
|
+
else:
|
354
|
+
# Handle string parameters (like tessedit_char_whitelist)
|
355
|
+
command.extend(["-c", f"{kwarg}={value}"])
|
349
356
|
|
350
357
|
env: dict[str, Any] | None = None
|
351
358
|
if sys.platform.startswith("linux"):
|
@@ -403,6 +410,225 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
403
410
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
404
411
|
) from e
|
405
412
|
|
413
|
+
def process_image_sync(
    self,
    image: Image,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously OCR an in-memory image, consulting the OCR cache first.

    The image is encoded to PNG once; a truncated SHA-256 of those bytes plus the
    stringified kwargs form the cache key. On a cache miss the same PNG bytes are
    written to a temporary file which is handed to ``process_file_sync``.

    Args:
        image: An instance of PIL.Image representing the input image.
        **kwargs: Any kwargs related to the given backend.

    Returns:
        The extraction result object (either the cached one or a freshly computed one).

    Raises:
        MissingDependencyError: Propagated from ``_validate_tesseract_version_sync``.
    """
    import io

    from kreuzberg._utils._cache import get_ocr_cache

    image_buffer = io.BytesIO()
    image.save(image_buffer, format="PNG")
    image_content = image_buffer.getvalue()

    cache_kwargs = {
        "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # The cache reports this key as in flight: wait on the event it hands back.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()

        tmp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        image_path = Path(tmp_file.name)
        try:
            # Reuse the PNG bytes that were already encoded for hashing and write
            # them through the open handle. Re-saving by file name would encode the
            # image a second time and reopen a file that is still open, which fails
            # on Windows. The handle is closed before Tesseract reads the file.
            with tmp_file:
                tmp_file.write(image_content)

            result = self.process_file_sync(image_path, **kwargs)

            ocr_cache.set(result, **cache_kwargs)

            return result
        finally:
            # Runs on success and on any failure above, so the temp file never leaks.
            image_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
|
473
|
+
|
474
|
+
def process_file_sync(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously OCR a file on disk with the Tesseract CLI, using the OCR cache.

    The cache key is built from ``_get_file_info(path)`` and the stringified kwargs.
    On a miss, Tesseract is run with a temporary ``.txt`` output file whose content
    is normalised and returned.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Any kwargs related to the given backend. ``language`` defaults to
            ``"eng"`` and ``psm`` to ``PSMMode.AUTO`` when absent.

    Returns:
        The extraction result object (either the cached one or a freshly computed one).

    Raises:
        OCRError: If running Tesseract or reading its output raises ``RuntimeError``
            or ``OSError``, or if ``_run_tesseract_sync`` reports a non-zero exit.
        MissingDependencyError: Propagated from ``_validate_tesseract_version_sync``.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    file_info = self._get_file_info(path)

    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # The cache reports this key as in flight: wait on the event it hands back.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()

        # Resolve the options before creating the temp file so that a rejected
        # language code cannot leave a stray file behind.
        language = self._validate_language_code(kwargs.pop("language", "eng"))
        psm = kwargs.pop("psm", PSMMode.AUTO)

        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            output_path = Path(tmp_file.name)
        # Tesseract appends ".txt" to the output base itself. Strip only the trailing
        # suffix; str.replace would also remove ".txt" from any directory component.
        output_base = str(output_path.with_suffix(""))

        try:
            command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
            self._run_tesseract_sync(command)

            output = output_path.read_text(encoding="utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            # Store under the exact key used for the lookup above; otherwise calls
            # that omit or abbreviate language/psm would never get a cache hit.
            ocr_cache.set(extraction_result, **cache_kwargs)

            # Also keep the normalised-config entry so equivalent, fully specified
            # calls can share the result.
            normalized_config = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
            if normalized_config != cache_kwargs["ocr_config"]:
                ocr_cache.set(extraction_result, **{**cache_kwargs, "ocr_config": normalized_config})

            return extraction_result
        except (RuntimeError, OSError) as e:
            # NOTE(review): subprocess.TimeoutExpired from _run_tesseract_sync is neither
            # of these and propagates unchanged -- confirm that is intended.
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            output_path.unlink(missing_ok=True)
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
|
545
|
+
|
546
|
+
def _get_file_info(self, path: Path) -> dict[str, Any]:
    """Describe a file by resolved path, size and modification time for cache keys.

    Args:
        path: File to describe.

    Returns:
        A dict with ``path``, ``size`` and ``mtime``. If the file cannot be stat'ed
        or resolved (``OSError``), the unresolved path is returned with size and
        mtime set to 0.
    """
    try:
        stat_result = path.stat()
        resolved = str(path.resolve())
    except OSError:
        # Unreadable or missing file: fall back to a placeholder description.
        return {"path": str(path), "size": 0, "mtime": 0}

    return {"path": resolved, "size": stat_result.st_size, "mtime": stat_result.st_mtime}
|
561
|
+
|
562
|
+
def _build_tesseract_command(
    self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
) -> list[str]:
    """Assemble the Tesseract CLI argument vector.

    Args:
        path: Input image file.
        output_base: Output path without extension, passed to Tesseract as-is.
        language: Language code passed to ``-l``.
        psm: Page segmentation mode; its ``.value`` is passed to ``--psm``.
        **kwargs: Extra config variables, each emitted as ``-c name=value``.
            Booleans are rendered as ``1``/``0``; everything else via ``str()``.

    Returns:
        The command as a list of strings, starting with ``"tesseract"``.
    """
    argv = [
        "tesseract",
        str(path),
        output_base,
        "-l",
        language,
        "--psm",
        str(psm.value),
        "--oem",
        "1",
        "--loglevel",
        "OFF",
    ]
    for name, setting in kwargs.items():
        # Tesseract expects 1/0 for boolean variables rather than True/False.
        rendered = (1 if setting else 0) if isinstance(setting, bool) else setting
        argv += ["-c", f"{name}={rendered}"]
    return argv
|
585
|
+
|
586
|
+
def _run_tesseract_sync(self, command: list[str]) -> None:
    """Execute a prepared Tesseract command line and raise on a non-zero exit.

    The child process inherits a copy of the current environment; on Linux
    ``OMP_THREAD_LIMIT`` is set to ``"1"``. ``subprocess.run`` is called with
    ``timeout=30`` and a timeout is not handled here.

    Args:
        command: Full argument vector to execute.

    Raises:
        OCRError: If the process exits with a non-zero return code. The captured
            stderr is attached as context.
    """
    run_env = dict(os.environ)
    if sys.platform.startswith("linux"):
        run_env["OMP_THREAD_LIMIT"] = "1"

    completed = subprocess.run(
        command,
        check=False,
        env=run_env,
        capture_output=True,
        text=True,
        timeout=30,
    )

    if completed.returncode == 0:
        return

    raise OCRError(
        "OCR failed with a non-0 return code.",
        context={"error": completed.stderr},
    )
|
606
|
+
|
607
|
+
@classmethod
def _validate_tesseract_version_sync(cls) -> None:
    """Check once per class that a supported Tesseract binary is on ``$PATH``.

    Runs ``tesseract --version``, parses the major version from stdout and compares
    it with ``MINIMAL_SUPPORTED_TESSERACT_VERSION``. A successful check is remembered
    in ``cls._version_checked`` so later calls return immediately.

    Raises:
        MissingDependencyError: If the binary cannot be found, its version cannot
            be parsed, or the major version is below the supported minimum.
    """
    if cls._version_checked:
        return

    try:
        completed = subprocess.run(["tesseract", "--version"], capture_output=True, text=True, check=False)
    except FileNotFoundError as e:
        raise MissingDependencyError(
            "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
        ) from e

    found = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", completed.stdout)
    supported = bool(found) and int(found.group(1)) >= MINIMAL_SUPPORTED_TESSERACT_VERSION
    if not supported:
        raise MissingDependencyError(
            "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
        )

    cls._version_checked = True
|
631
|
+
|
406
632
|
@staticmethod
|
407
633
|
def _validate_language_code(language_code: str) -> str:
|
408
634
|
"""Convert a language code to Tesseract format.
|
@@ -434,3 +660,337 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
434
660
|
"supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
|
435
661
|
},
|
436
662
|
)
|
663
|
+
|
664
|
+
|
665
|
+
def _process_image_with_tesseract(
    image_path: str,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process a single image with Tesseract in a separate process.

    This function is designed to be executed in a subprocess. It runs the
    ``tesseract`` command directly and never raises: every failure is reported
    through the returned dictionary.

    Args:
        image_path: Path to the image file.
        config_dict: Tesseract configuration as dictionary. ``language`` defaults to
            ``"eng"`` and ``psm`` to ``3``; the boolean options listed below are
            forwarded as ``-c name=1|0`` when present.

    Returns:
        A dictionary with the keys ``success``, ``text``, ``confidence`` (always
        ``None``) and ``error`` (``None`` on success, the error message otherwise).
    """
    try:
        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            output_path = Path(tmp_file.name)
        # Tesseract appends ".txt" to the output base itself. Strip only the trailing
        # suffix: str.replace(".txt", "") would also rewrite directory names that
        # contain ".txt", pointing Tesseract and the cleanup at the wrong location.
        output_base = str(output_path.with_suffix(""))

        try:
            language = config_dict.get("language", "eng")
            psm = config_dict.get("psm", 3)

            command = [
                "tesseract",
                image_path,
                output_base,
                "-l",
                language,
                "--psm",
                str(psm),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]

            boolean_options = (
                "classify_use_pre_adapted_templates",
                "language_model_ngram_on",
                "tessedit_dont_blkrej_good_wds",
                "tessedit_dont_rowrej_good_wds",
                "tessedit_enable_dict_correction",
                "tessedit_use_primary_params_model",
                "textord_space_size_is_variable",
                "thresholding_method",
            )

            for option in boolean_options:
                if option in config_dict:
                    # Tesseract expects 1/0 rather than True/False.
                    command.extend(["-c", f"{option}={1 if config_dict[option] else 0}"])

            env = os.environ.copy()
            env["OMP_THREAD_LIMIT"] = "1"

            result = subprocess.run(
                command,
                check=False,
                env=env,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                # Caught by the outer handler and turned into a failure dictionary.
                raise RuntimeError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

            text = normalize_spaces(output_path.read_text(encoding="utf-8"))

            return {
                "success": True,
                "text": text,
                "confidence": None,
                "error": None,
            }

        finally:
            # Remove the output file regardless of whether OCR succeeded.
            output_path.unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
|
761
|
+
|
762
|
+
|
763
|
+
def _process_image_bytes_with_tesseract(
    image_bytes: bytes,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process image bytes with Tesseract in a separate process.

    The bytes are decoded with PIL, re-encoded as PNG into a temporary file and then
    handed to ``_process_image_with_tesseract``. This function never raises: every
    failure is reported through the returned dictionary.

    Args:
        image_bytes: Image data as bytes.
        config_dict: Tesseract configuration as dictionary.

    Returns:
        A dictionary with the keys ``success``, ``text``, ``confidence`` and
        ``error``; on failure ``success`` is ``False`` and ``error`` holds the message.
    """
    try:
        import io

        from PIL import Image

        # Reserve a file name only and close the handle straight away, so that PIL
        # can reopen the path for writing (an open handle blocks this on Windows).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
            image_path = tmp_image.name

        try:
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(image_path, format="PNG")

            return _process_image_with_tesseract(image_path, config_dict)
        finally:
            # Covers decode/encode failures as well as the OCR call, so the
            # temporary PNG is removed on every path.
            Path(image_path).unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
|
800
|
+
|
801
|
+
|
802
|
+
class TesseractProcessPool:
    """Run Tesseract OCR jobs in worker processes through a ``ProcessPoolManager``."""

    def __init__(
        self,
        config: TesseractConfig | None = None,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Create the pool and its underlying process manager.

        Args:
            config: Default Tesseract configuration; a fresh ``TesseractConfig()`` is
                used when omitted.
            max_processes: Forwarded to ``ProcessPoolManager``.
            memory_limit_gb: Forwarded to ``ProcessPoolManager``.
        """
        from kreuzberg._utils._process_pool import ProcessPoolManager

        self.config = config or TesseractConfig()
        self.process_manager = ProcessPoolManager(max_processes=max_processes, memory_limit_gb=memory_limit_gb)

    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
        """Flatten a config dataclass into a plain dict that can be sent to a worker.

        Field values exposing a ``value`` attribute (e.g. enum members) are replaced
        by that attribute; all other values are copied unchanged.
        """
        source = config or self.config

        flattened = {}
        for name in source.__dataclass_fields__:
            raw = getattr(source, name)
            flattened[name] = getattr(raw, "value", raw)
        return flattened

    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
        """Turn a worker's result dictionary into an ``ExtractionResult``.

        Raises:
            OCRError: If the worker reported ``success`` as falsy.
        """
        if result_dict["success"]:
            return ExtractionResult(
                content=result_dict["text"],
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={"confidence": result_dict["confidence"]} if result_dict["confidence"] else {},  # type: ignore[typeddict-unknown-key]
                chunks=[],
            )

        raise OCRError(f"Tesseract processing failed: {result_dict['error']}")

    async def process_image(
        self,
        image_path: str | Path,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """OCR a single image file in a worker process.

        Args:
            image_path: Path to the image file.
            config: Tesseract configuration (the pool default is used if None).

        Returns:
            The OCR result.
        """
        payload = self._config_to_dict(config)

        outcome = await self.process_manager.submit_task(
            _process_image_with_tesseract,
            str(image_path),
            payload,
            task_memory_mb=80,
        )
        return self._result_from_dict(outcome)

    async def process_image_bytes(
        self,
        image_bytes: bytes,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """OCR raw image bytes in a worker process.

        Args:
            image_bytes: Image data as bytes.
            config: Tesseract configuration (the pool default is used if None).

        Returns:
            The OCR result.
        """
        payload = self._config_to_dict(config)

        # Memory estimate handed to the manager: twice the payload size plus 50,
        # but never less than 80.
        payload_mb = len(image_bytes) / 1024 / 1024
        estimated_mb = max(80, payload_mb * 2 + 50)

        outcome = await self.process_manager.submit_task(
            _process_image_bytes_with_tesseract,
            image_bytes,
            payload,
            task_memory_mb=estimated_mb,
        )
        return self._result_from_dict(outcome)

    async def process_batch_images(
        self,
        image_paths: list[str | Path],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """OCR several image files through the process manager's batch API.

        Args:
            image_paths: List of image file paths.
            config: Tesseract configuration (the pool default is used if None).
            max_concurrent: Forwarded to ``submit_batch``.

        Returns:
            One result per entry returned by ``submit_batch``; an empty list for
            empty input.
        """
        if not image_paths:
            return []

        payload = self._config_to_dict(config)
        jobs = [(str(entry), payload) for entry in image_paths]

        outcomes = await self.process_manager.submit_batch(
            _process_image_with_tesseract,
            jobs,
            task_memory_mb=80,
            max_concurrent=max_concurrent,
        )
        return list(map(self._result_from_dict, outcomes))

    async def process_batch_bytes(
        self,
        image_bytes_list: list[bytes],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """OCR several in-memory images through the process manager's batch API.

        Args:
            image_bytes_list: List of image data as bytes.
            config: Tesseract configuration (the pool default is used if None).
            max_concurrent: Forwarded to ``submit_batch``.

        Returns:
            One result per entry returned by ``submit_batch``; an empty list for
            empty input.
        """
        if not image_bytes_list:
            return []

        payload = self._config_to_dict(config)
        jobs = [(blob, payload) for blob in image_bytes_list]

        # Per-task memory estimate based on the average payload size of the batch.
        total_bytes = sum(len(blob) for blob in image_bytes_list)
        average_mb = total_bytes / len(image_bytes_list) / 1024 / 1024
        estimated_mb = max(80, average_mb * 2 + 50)

        outcomes = await self.process_manager.submit_batch(
            _process_image_bytes_with_tesseract,
            jobs,
            task_memory_mb=estimated_mb,
            max_concurrent=max_concurrent,
        )
        return list(map(self._result_from_dict, outcomes))

    def get_system_info(self) -> dict[str, Any]:
        """Return whatever the process manager reports as system information."""
        return self.process_manager.get_system_info()

    def shutdown(self, wait: bool = True) -> None:
        """Shut the underlying process manager down, forwarding ``wait``."""
        self.process_manager.shutdown(wait=wait)

    async def __aenter__(self) -> Self:
        """Enter the async context; the pool itself is the context value."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Leave the async context and shut the pool down."""
        self.shutdown()
|
kreuzberg/_registry.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
from functools import lru_cache
|
4
4
|
from typing import TYPE_CHECKING, ClassVar
|
5
5
|
|
6
|
+
from kreuzberg._extractors._email import EmailExtractor
|
6
7
|
from kreuzberg._extractors._html import HTMLExtractor
|
7
8
|
from kreuzberg._extractors._image import ImageExtractor
|
8
9
|
from kreuzberg._extractors._pandoc import (
|
@@ -19,6 +20,7 @@ from kreuzberg._extractors._pandoc import (
|
|
19
20
|
from kreuzberg._extractors._pdf import PDFExtractor
|
20
21
|
from kreuzberg._extractors._presentation import PresentationExtractor
|
21
22
|
from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
|
23
|
+
from kreuzberg._extractors._structured import StructuredDataExtractor
|
22
24
|
|
23
25
|
if TYPE_CHECKING:
|
24
26
|
from kreuzberg._extractors._base import Extractor
|
@@ -40,6 +42,8 @@ class ExtractorRegistry:
|
|
40
42
|
PresentationExtractor,
|
41
43
|
SpreadSheetExtractor,
|
42
44
|
HTMLExtractor,
|
45
|
+
EmailExtractor,
|
46
|
+
StructuredDataExtractor,
|
43
47
|
MarkdownExtractor,
|
44
48
|
ImageExtractor,
|
45
49
|
BibliographyExtractor,
|