kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import hashlib
|
4
|
+
import os
|
4
5
|
import re
|
6
|
+
import subprocess
|
5
7
|
import sys
|
8
|
+
import tempfile
|
6
9
|
from dataclasses import dataclass
|
7
10
|
from enum import Enum
|
11
|
+
from pathlib import Path
|
8
12
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
9
13
|
|
10
14
|
from anyio import Path as AsyncPath
|
11
15
|
from anyio import run_process
|
16
|
+
from typing_extensions import Self
|
12
17
|
|
13
18
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
14
19
|
from kreuzberg._ocr._base import OCRBackend
|
@@ -19,8 +24,6 @@ from kreuzberg._utils._tmp import create_temp_file
|
|
19
24
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
20
25
|
|
21
26
|
if TYPE_CHECKING:
|
22
|
-
from pathlib import Path
|
23
|
-
|
24
27
|
from PIL.Image import Image
|
25
28
|
|
26
29
|
try: # pragma: no cover
|
@@ -202,8 +205,10 @@ class TesseractConfig:
|
|
202
205
|
- 'deu' for German
|
203
206
|
- multiple languages combined with '+', e.g. 'eng+deu')
|
204
207
|
"""
|
205
|
-
language_model_ngram_on: bool =
|
206
|
-
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
208
|
+
language_model_ngram_on: bool = False
|
209
|
+
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
210
|
+
|
211
|
+
Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
|
207
212
|
psm: PSMMode = PSMMode.AUTO
|
208
213
|
"""Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
|
209
214
|
tessedit_dont_blkrej_good_wds: bool = True
|
@@ -212,6 +217,8 @@ class TesseractConfig:
|
|
212
217
|
"""If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
|
213
218
|
tessedit_enable_dict_correction: bool = True
|
214
219
|
"""Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
|
220
|
+
tessedit_char_whitelist: str = ""
|
221
|
+
"""Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
|
215
222
|
tessedit_use_primary_params_model: bool = True
|
216
223
|
"""If True, forces the use of the primary parameters model for text recognition."""
|
217
224
|
textord_space_size_is_variable: bool = True
|
@@ -341,7 +348,11 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
341
348
|
"OFF",
|
342
349
|
]
|
343
350
|
for kwarg, value in kwargs.items():
|
344
|
-
|
351
|
+
if isinstance(value, bool):
|
352
|
+
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
353
|
+
else:
|
354
|
+
# Handle string parameters (like tessedit_char_whitelist)
|
355
|
+
command.extend(["-c", f"{kwarg}={value}"])
|
345
356
|
|
346
357
|
env: dict[str, Any] | None = None
|
347
358
|
if sys.platform.startswith("linux"):
|
@@ -399,6 +410,225 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
399
410
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
400
411
|
) from e
|
401
412
|
|
413
|
+
def process_image_sync(
    self,
    image: Image,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process an image and extract its text and metadata.

    The image is encoded to PNG exactly once. The encoded bytes are hashed to
    build the cache key and are also what gets written to the temporary file
    handed to ``process_file_sync``.

    Args:
        image: An instance of PIL.Image representing the input image.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object
    """
    import io

    from kreuzberg._utils._cache import get_ocr_cache

    image_buffer = io.BytesIO()
    image.save(image_buffer, format="PNG")
    image_content = image_buffer.getvalue()

    cache_kwargs = {
        "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # NOTE(review): presumably the returned event is set once the other
        # worker calls mark_complete - confirm against the cache implementation.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    image_path: Path | None = None
    try:
        self._validate_tesseract_version_sync()
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
            image_path = Path(tmp_file.name)
            # Reuse the bytes encoded above instead of re-encoding the image,
            # and write through the open handle rather than reopening by name.
            tmp_file.write(image_content)

        result = self.process_file_sync(image_path, **kwargs)
        ocr_cache.set(result, **cache_kwargs)
        return result
    finally:
        try:
            # delete=False means nothing removes the file for us; this runs
            # even when writing the PNG or the OCR step itself failed.
            if image_path is not None and image_path.exists():
                image_path.unlink()
        finally:
            ocr_cache.mark_complete(**cache_kwargs)
|
473
|
+
|
474
|
+
def process_file_sync(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Synchronously process a file and extract its text and metadata.

    ``language`` and ``psm`` are resolved (defaults applied, language code
    validated) before the cache key is built, so the key used for the lookup
    is the same key the result is stored under.

    Args:
        path: A Path object representing the file to be processed.
        **kwargs: Any kwargs related to the given backend

    Returns:
        The extraction result object

    Raises:
        OCRError: If tesseract cannot be run or its output cannot be read.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    language = self._validate_language_code(kwargs.pop("language", "eng"))
    psm = kwargs.pop("psm", PSMMode.AUTO)

    file_info = self._get_file_info(path)

    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted({**kwargs, "language": language, "psm": psm}.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = ocr_cache.get(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        # NOTE(review): presumably the returned event is set once the other
        # worker calls mark_complete - confirm against the cache implementation.
        event = ocr_cache.mark_processing(**cache_kwargs)
        event.wait()

        # Try cache again after waiting for other process to complete
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        self._validate_tesseract_version_sync()
        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
            # The command below passes an output base and the result is read
            # back from "<base>.txt". Strip only the trailing suffix: a
            # str.replace would also mangle any ".txt" in the directory part.
            output_base = tmp_file.name[: -len(".txt")]
        output_path = Path(output_base + ".txt")

        try:
            command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
            self._run_tesseract_sync(command)

            output = output_path.read_text(encoding="utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            ocr_cache.set(extraction_result, **cache_kwargs)

            return extraction_result
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            # delete=False above, so the output file must be removed here.
            if output_path.exists():
                output_path.unlink()
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
|
545
|
+
|
546
|
+
def _get_file_info(self, path: Path) -> dict[str, Any]:
|
547
|
+
"""Get file information for caching."""
|
548
|
+
try:
|
549
|
+
stat = path.stat()
|
550
|
+
return {
|
551
|
+
"path": str(path.resolve()),
|
552
|
+
"size": stat.st_size,
|
553
|
+
"mtime": stat.st_mtime,
|
554
|
+
}
|
555
|
+
except OSError:
|
556
|
+
return {
|
557
|
+
"path": str(path),
|
558
|
+
"size": 0,
|
559
|
+
"mtime": 0,
|
560
|
+
}
|
561
|
+
|
562
|
+
def _build_tesseract_command(
|
563
|
+
self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
|
564
|
+
) -> list[str]:
|
565
|
+
"""Build tesseract command with all parameters."""
|
566
|
+
command = [
|
567
|
+
"tesseract",
|
568
|
+
str(path),
|
569
|
+
output_base,
|
570
|
+
"-l",
|
571
|
+
language,
|
572
|
+
"--psm",
|
573
|
+
str(psm.value),
|
574
|
+
"--oem",
|
575
|
+
"1",
|
576
|
+
"--loglevel",
|
577
|
+
"OFF",
|
578
|
+
]
|
579
|
+
for kwarg, value in kwargs.items():
|
580
|
+
if isinstance(value, bool):
|
581
|
+
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
582
|
+
else:
|
583
|
+
command.extend(["-c", f"{kwarg}={value}"])
|
584
|
+
return command
|
585
|
+
|
586
|
+
def _run_tesseract_sync(self, command: list[str]) -> None:
    """Run a tesseract command synchronously.

    Args:
        command: The full argv, starting with the tesseract executable.

    Raises:
        OCRError: If tesseract exits with a non-zero return code, or does not
            finish within the 30 second timeout.
    """
    env = os.environ.copy()
    if sys.platform.startswith("linux"):
        # NOTE(review): presumably caps Tesseract's OpenMP threading so parallel
        # OCR jobs do not oversubscribe the CPU - confirm against Tesseract docs.
        env["OMP_THREAD_LIMIT"] = "1"

    try:
        result = subprocess.run(
            command,
            check=False,
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired as e:
        # Surface a timeout as the backend's own error type instead of leaking
        # a raw subprocess exception to callers.
        raise OCRError(
            "OCR timed out before tesseract finished.",
            context={"error": str(e), "timeout": e.timeout},
        ) from e

    if result.returncode != 0:
        raise OCRError(
            "OCR failed with a non-0 return code.",
            context={"error": result.stderr},
        )
|
606
|
+
|
607
|
+
@classmethod
def _validate_tesseract_version_sync(cls) -> None:
    """Check synchronously that a supported Tesseract is available.

    The outcome of a successful check is remembered on the class, so the
    ``tesseract --version`` subprocess is skipped on later calls.

    Raises:
        MissingDependencyError: If Tesseract is not installed or is below version 5.
    """
    if cls._version_checked:
        return

    try:
        completed = subprocess.run(["tesseract", "--version"], capture_output=True, text=True, check=False)
    except FileNotFoundError as e:
        raise MissingDependencyError(
            "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
        ) from e

    found = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", completed.stdout)
    # An unparseable version banner is treated the same as an outdated install.
    if found is None or int(found.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
        raise MissingDependencyError(
            "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
        )

    cls._version_checked = True
|
631
|
+
|
402
632
|
@staticmethod
|
403
633
|
def _validate_language_code(language_code: str) -> str:
|
404
634
|
"""Convert a language code to Tesseract format.
|
@@ -430,3 +660,337 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
430
660
|
"supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
|
431
661
|
},
|
432
662
|
)
|
663
|
+
|
664
|
+
|
665
|
+
def _process_image_with_tesseract(
|
666
|
+
image_path: str,
|
667
|
+
config_dict: dict[str, Any],
|
668
|
+
) -> dict[str, Any]:
|
669
|
+
"""Process a single image with Tesseract in a separate process.
|
670
|
+
|
671
|
+
This function is designed to be executed in a subprocess.
|
672
|
+
It uses direct tesseract command execution to avoid async complications.
|
673
|
+
|
674
|
+
Args:
|
675
|
+
image_path: Path to the image file.
|
676
|
+
config_dict: Tesseract configuration as dictionary.
|
677
|
+
|
678
|
+
Returns:
|
679
|
+
OCR result as dictionary.
|
680
|
+
"""
|
681
|
+
try:
|
682
|
+
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
|
683
|
+
output_base = tmp_file.name.replace(".txt", "")
|
684
|
+
|
685
|
+
try:
|
686
|
+
language = config_dict.get("language", "eng")
|
687
|
+
psm = config_dict.get("psm", 3)
|
688
|
+
|
689
|
+
command = [
|
690
|
+
"tesseract",
|
691
|
+
image_path,
|
692
|
+
output_base,
|
693
|
+
"-l",
|
694
|
+
language,
|
695
|
+
"--psm",
|
696
|
+
str(psm),
|
697
|
+
"--oem",
|
698
|
+
"1",
|
699
|
+
"--loglevel",
|
700
|
+
"OFF",
|
701
|
+
]
|
702
|
+
|
703
|
+
boolean_options = [
|
704
|
+
"classify_use_pre_adapted_templates",
|
705
|
+
"language_model_ngram_on",
|
706
|
+
"tessedit_dont_blkrej_good_wds",
|
707
|
+
"tessedit_dont_rowrej_good_wds",
|
708
|
+
"tessedit_enable_dict_correction",
|
709
|
+
"tessedit_use_primary_params_model",
|
710
|
+
"textord_space_size_is_variable",
|
711
|
+
"thresholding_method",
|
712
|
+
]
|
713
|
+
|
714
|
+
for option in boolean_options:
|
715
|
+
if option in config_dict:
|
716
|
+
value = 1 if config_dict[option] else 0
|
717
|
+
command.extend(["-c", f"{option}={value}"])
|
718
|
+
|
719
|
+
env = os.environ.copy()
|
720
|
+
env["OMP_THREAD_LIMIT"] = "1"
|
721
|
+
|
722
|
+
result = subprocess.run(
|
723
|
+
command,
|
724
|
+
check=False,
|
725
|
+
env=env,
|
726
|
+
capture_output=True,
|
727
|
+
text=True,
|
728
|
+
timeout=30,
|
729
|
+
)
|
730
|
+
|
731
|
+
if result.returncode != 0:
|
732
|
+
raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
|
733
|
+
|
734
|
+
output_file = output_base + ".txt"
|
735
|
+
with Path(output_file).open(encoding="utf-8") as f:
|
736
|
+
text = f.read()
|
737
|
+
|
738
|
+
text = normalize_spaces(text)
|
739
|
+
|
740
|
+
return {
|
741
|
+
"success": True,
|
742
|
+
"text": text,
|
743
|
+
"confidence": None,
|
744
|
+
"error": None,
|
745
|
+
}
|
746
|
+
|
747
|
+
finally:
|
748
|
+
for ext in [".txt"]:
|
749
|
+
temp_file = output_base + ext
|
750
|
+
temp_path = Path(temp_file)
|
751
|
+
if temp_path.exists():
|
752
|
+
temp_path.unlink()
|
753
|
+
|
754
|
+
except Exception as e: # noqa: BLE001
|
755
|
+
return {
|
756
|
+
"success": False,
|
757
|
+
"text": "",
|
758
|
+
"confidence": None,
|
759
|
+
"error": str(e),
|
760
|
+
}
|
761
|
+
|
762
|
+
|
763
|
+
def _process_image_bytes_with_tesseract(
|
764
|
+
image_bytes: bytes,
|
765
|
+
config_dict: dict[str, Any],
|
766
|
+
) -> dict[str, Any]:
|
767
|
+
"""Process image bytes with Tesseract in a separate process.
|
768
|
+
|
769
|
+
Args:
|
770
|
+
image_bytes: Image data as bytes.
|
771
|
+
config_dict: Tesseract configuration as dictionary.
|
772
|
+
|
773
|
+
Returns:
|
774
|
+
OCR result as dictionary.
|
775
|
+
"""
|
776
|
+
try:
|
777
|
+
import io
|
778
|
+
|
779
|
+
from PIL import Image
|
780
|
+
|
781
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
|
782
|
+
with Image.open(io.BytesIO(image_bytes)) as image:
|
783
|
+
image.save(tmp_image.name, format="PNG")
|
784
|
+
image_path = tmp_image.name
|
785
|
+
|
786
|
+
try:
|
787
|
+
return _process_image_with_tesseract(image_path, config_dict)
|
788
|
+
finally:
|
789
|
+
image_file = Path(image_path)
|
790
|
+
if image_file.exists():
|
791
|
+
image_file.unlink()
|
792
|
+
|
793
|
+
except Exception as e: # noqa: BLE001
|
794
|
+
return {
|
795
|
+
"success": False,
|
796
|
+
"text": "",
|
797
|
+
"confidence": None,
|
798
|
+
"error": str(e),
|
799
|
+
}
|
800
|
+
|
801
|
+
|
802
|
+
class TesseractProcessPool:
    """Process pool for parallel Tesseract OCR processing.

    Configurations are flattened to plain dictionaries before being submitted,
    and workers answer with plain dictionaries that are converted back into
    ``ExtractionResult`` objects here.
    """

    # Memory estimate (in MB) passed to the process manager for one OCR task.
    _DEFAULT_TASK_MEMORY_MB = 80

    def __init__(
        self,
        config: TesseractConfig | None = None,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Initialize the Tesseract process pool.

        Args:
            config: Default Tesseract configuration.
            max_processes: Maximum number of processes.
            memory_limit_gb: Memory limit in GB.
        """
        from kreuzberg._utils._process_pool import ProcessPoolManager

        self.config = config or TesseractConfig()
        self.process_manager = ProcessPoolManager(
            max_processes=max_processes,
            memory_limit_gb=memory_limit_gb,
        )

    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
        """Convert TesseractConfig to dictionary for pickling.

        Enum members are replaced by their ``.value``; every other field is
        copied unchanged. Falls back to the pool's default config when
        ``config`` is not given.
        """
        cfg = config or self.config

        config_dict: dict[str, Any] = {}
        for field_name in cfg.__dataclass_fields__:
            value = getattr(cfg, field_name)
            config_dict[field_name] = value.value if isinstance(value, Enum) else value

        return config_dict

    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
        """Convert a worker result dictionary back to an ExtractionResult.

        Raises:
            OCRError: If the worker reported a failure.
        """
        if not result_dict["success"]:
            raise OCRError(f"Tesseract processing failed: {result_dict['error']}")

        confidence = result_dict["confidence"]
        return ExtractionResult(
            content=result_dict["text"],
            mime_type=PLAIN_TEXT_MIME_TYPE,
            # Test against None so a reported confidence of 0 is not dropped.
            metadata={"confidence": confidence} if confidence is not None else {},  # type: ignore[typeddict-unknown-key]
            chunks=[],
        )

    async def process_image(
        self,
        image_path: str | Path,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process a single image file with Tesseract.

        Args:
            image_path: Path to the image file.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.
        """
        config_dict = self._config_to_dict(config)

        result_dict = await self.process_manager.submit_task(
            _process_image_with_tesseract,
            str(image_path),
            config_dict,
            task_memory_mb=self._DEFAULT_TASK_MEMORY_MB,
        )

        return self._result_from_dict(result_dict)

    async def process_image_bytes(
        self,
        image_bytes: bytes,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process image bytes with Tesseract.

        Args:
            image_bytes: Image data as bytes.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.
        """
        config_dict = self._config_to_dict(config)

        # Estimate: twice the payload size plus 50 MB, never below the default.
        image_size_mb = len(image_bytes) / 1024 / 1024
        task_memory_mb = max(self._DEFAULT_TASK_MEMORY_MB, image_size_mb * 2 + 50)

        result_dict = await self.process_manager.submit_task(
            _process_image_bytes_with_tesseract,
            image_bytes,
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_batch_images(
        self,
        image_paths: list[str | Path],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of images in parallel.

        Args:
            image_paths: List of image file paths.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.
        """
        if not image_paths:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(str(path), config_dict) for path in image_paths]

        result_dicts = await self.process_manager.submit_batch(
            _process_image_with_tesseract,
            arg_batches,
            task_memory_mb=self._DEFAULT_TASK_MEMORY_MB,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    async def process_batch_bytes(
        self,
        image_bytes_list: list[bytes],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of image bytes in parallel.

        Args:
            image_bytes_list: List of image data as bytes.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.
        """
        if not image_bytes_list:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]

        # One estimate for the whole batch, based on the average payload size.
        avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
        task_memory_mb = max(self._DEFAULT_TASK_MEMORY_MB, avg_image_size_mb * 2 + 50)

        result_dicts = await self.process_manager.submit_batch(
            _process_image_bytes_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    def get_system_info(self) -> dict[str, Any]:
        """Get system information from the process manager."""
        return self.process_manager.get_system_info()

    def shutdown(self, wait: bool = True) -> None:
        """Shutdown the process pool."""
        self.process_manager.shutdown(wait=wait)

    async def __aenter__(self) -> Self:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Async context manager exit; shuts the pool down with the default ``wait=True``."""
        self.shutdown()
|
kreuzberg/_registry.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
from functools import lru_cache
|
4
4
|
from typing import TYPE_CHECKING, ClassVar
|
5
5
|
|
6
|
+
from kreuzberg._extractors._email import EmailExtractor
|
6
7
|
from kreuzberg._extractors._html import HTMLExtractor
|
7
8
|
from kreuzberg._extractors._image import ImageExtractor
|
8
9
|
from kreuzberg._extractors._pandoc import (
|
@@ -19,6 +20,7 @@ from kreuzberg._extractors._pandoc import (
|
|
19
20
|
from kreuzberg._extractors._pdf import PDFExtractor
|
20
21
|
from kreuzberg._extractors._presentation import PresentationExtractor
|
21
22
|
from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
|
23
|
+
from kreuzberg._extractors._structured import StructuredDataExtractor
|
22
24
|
|
23
25
|
if TYPE_CHECKING:
|
24
26
|
from kreuzberg._extractors._base import Extractor
|
@@ -40,6 +42,8 @@ class ExtractorRegistry:
|
|
40
42
|
PresentationExtractor,
|
41
43
|
SpreadSheetExtractor,
|
42
44
|
HTMLExtractor,
|
45
|
+
EmailExtractor,
|
46
|
+
StructuredDataExtractor,
|
43
47
|
MarkdownExtractor,
|
44
48
|
ImageExtractor,
|
45
49
|
BibliographyExtractor,
|