kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -231,7 +231,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
231
231
|
ocr_cache.mark_complete(**cache_kwargs)
|
232
232
|
|
233
233
|
async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
|
234
|
-
"""Handle cache lookup before processing."""
|
235
234
|
ocr_cache = get_ocr_cache()
|
236
235
|
|
237
236
|
cached_result = await ocr_cache.aget(**cache_kwargs)
|
@@ -249,7 +248,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
249
248
|
return None
|
250
249
|
|
251
250
|
def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
|
252
|
-
"""Prepare configuration for a Tesseract run."""
|
253
251
|
language = self._validate_language_code(kwargs.pop("language", "eng"))
|
254
252
|
psm = kwargs.pop("psm", PSMMode.AUTO)
|
255
253
|
output_format = kwargs.pop("output_format", "markdown")
|
@@ -282,7 +280,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
282
280
|
}
|
283
281
|
|
284
282
|
async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
|
285
|
-
"""Build and execute the Tesseract command."""
|
286
283
|
command = [
|
287
284
|
"tesseract",
|
288
285
|
str(path),
|
@@ -327,7 +324,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
327
324
|
) from e
|
328
325
|
|
329
326
|
async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
|
330
|
-
"""Process the raw output from Tesseract based on the requested format."""
|
331
327
|
output_format = run_config["output_format"]
|
332
328
|
enable_table_detection = run_config["enable_table_detection"]
|
333
329
|
kwargs = run_config["remaining_kwargs"]
|
@@ -413,17 +409,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
413
409
|
table_row_threshold_ratio: float = 0.5,
|
414
410
|
table_min_confidence: float = 30.0,
|
415
411
|
) -> ExtractionResult:
|
416
|
-
"""Process TSV output and extract tables if detected.
|
417
|
-
|
418
|
-
Args:
|
419
|
-
tsv_content: Raw TSV output from Tesseract.
|
420
|
-
table_column_threshold: Pixel threshold for column clustering.
|
421
|
-
table_row_threshold_ratio: Row threshold as ratio of mean text height.
|
422
|
-
table_min_confidence: Minimum confidence score to include a word.
|
423
|
-
|
424
|
-
Returns:
|
425
|
-
ExtractionResult with extracted content and tables.
|
426
|
-
"""
|
427
412
|
text_result = self._extract_text_from_tsv(tsv_content)
|
428
413
|
|
429
414
|
try:
|
@@ -460,14 +445,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
460
445
|
return text_result
|
461
446
|
|
462
447
|
def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
|
463
|
-
"""Extract plain text from TSV output.
|
464
|
-
|
465
|
-
Args:
|
466
|
-
tsv_content: Raw TSV output from Tesseract.
|
467
|
-
|
468
|
-
Returns:
|
469
|
-
ExtractionResult with extracted text.
|
470
|
-
"""
|
471
448
|
try:
|
472
449
|
reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
|
473
450
|
|
@@ -527,20 +504,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
527
504
|
table_min_confidence: float = 30.0,
|
528
505
|
**_kwargs: Any,
|
529
506
|
) -> ExtractionResult:
|
530
|
-
"""Convert hOCR content to Markdown with table detection.
|
531
|
-
|
532
|
-
Args:
|
533
|
-
hocr_content: Raw hOCR HTML/XML content from Tesseract.
|
534
|
-
enable_table_detection: Whether to detect and format tables.
|
535
|
-
html_to_markdown_config: Configuration for HTML to Markdown conversion.
|
536
|
-
table_column_threshold: Pixel threshold for column clustering.
|
537
|
-
table_row_threshold_ratio: Row threshold as ratio of mean text height.
|
538
|
-
table_min_confidence: Minimum confidence score to include a word.
|
539
|
-
**kwargs: Additional configuration options.
|
540
|
-
|
541
|
-
Returns:
|
542
|
-
ExtractionResult with Markdown content and detected tables.
|
543
|
-
"""
|
544
507
|
config = html_to_markdown_config or HTMLToMarkdownConfig(
|
545
508
|
escape_asterisks=False,
|
546
509
|
escape_underscores=False,
|
@@ -610,20 +573,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
610
573
|
)
|
611
574
|
|
612
575
|
def _create_basic_converters(self) -> dict[str, Any]:
|
613
|
-
"""Create basic converters for individual hOCR elements."""
|
614
|
-
|
615
576
|
def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
616
|
-
"""Custom converter for hOCR word elements - adds spaces between words."""
|
617
577
|
del tag
|
618
578
|
return f"{text.strip()} "
|
619
579
|
|
620
580
|
def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
621
|
-
"""Custom converter for hOCR line elements - handles line breaks."""
|
622
581
|
del tag
|
623
582
|
return f"{text.strip()}\n"
|
624
583
|
|
625
584
|
def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
626
|
-
"""Custom converter for hOCR paragraph elements - handles paragraph breaks."""
|
627
585
|
del tag
|
628
586
|
content = text.strip()
|
629
587
|
if not content:
|
@@ -631,7 +589,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
631
589
|
return f"{content}\n\n"
|
632
590
|
|
633
591
|
def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
634
|
-
"""Custom converter for hOCR content area elements."""
|
635
592
|
del tag
|
636
593
|
content = text.strip()
|
637
594
|
if not content:
|
@@ -639,17 +596,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
639
596
|
return f"{content}\n\n"
|
640
597
|
|
641
598
|
def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
642
|
-
"""Custom converter for hOCR page elements."""
|
643
599
|
del tag
|
644
600
|
return text.strip()
|
645
601
|
|
646
602
|
def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
647
|
-
"""Custom converter for hOCR separator elements - convert to horizontal rules."""
|
648
603
|
del tag, text
|
649
604
|
return "---\n"
|
650
605
|
|
651
606
|
def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
652
|
-
"""Custom converter for hOCR photo/image elements - indicate image presence."""
|
653
607
|
del text
|
654
608
|
title = tag.get("title", "")
|
655
609
|
if isinstance(title, str):
|
@@ -672,18 +626,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
672
626
|
}
|
673
627
|
|
674
628
|
def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
|
675
|
-
"""Create custom converters for hOCR elements that preserve spacing.
|
676
|
-
|
677
|
-
Args:
|
678
|
-
tables: List of detected tables (not used for filtering, tables added separately).
|
679
|
-
|
680
|
-
Returns:
|
681
|
-
Dictionary mapping HTML tags to converter functions.
|
682
|
-
"""
|
683
629
|
basic_converters = self._create_basic_converters()
|
684
630
|
|
685
631
|
def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
686
|
-
"""Generic converter for div elements based on class."""
|
687
632
|
class_attr = tag.get("class", "")
|
688
633
|
if isinstance(class_attr, list):
|
689
634
|
class_attr = " ".join(class_attr)
|
@@ -697,7 +642,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
697
642
|
return text
|
698
643
|
|
699
644
|
def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
700
|
-
"""Generic converter for span elements based on class."""
|
701
645
|
class_attr = tag.get("class", "")
|
702
646
|
if isinstance(class_attr, list):
|
703
647
|
class_attr = " ".join(class_attr)
|
@@ -717,15 +661,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
717
661
|
}
|
718
662
|
|
719
663
|
def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
|
720
|
-
"""Synchronously process hOCR content to markdown format.
|
721
|
-
|
722
|
-
Args:
|
723
|
-
hocr_content: Raw hOCR content as string
|
724
|
-
config: Tesseract configuration object
|
725
|
-
|
726
|
-
Returns:
|
727
|
-
ExtractionResult with markdown content
|
728
|
-
"""
|
729
664
|
tables: list[TableData] = []
|
730
665
|
|
731
666
|
if config.enable_table_detection:
|
@@ -795,17 +730,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
795
730
|
table_row_threshold_ratio: float = 0.5,
|
796
731
|
table_min_confidence: float = 30.0,
|
797
732
|
) -> ExtractionResult:
|
798
|
-
"""Synchronously process TSV output and extract tables if detected.
|
799
|
-
|
800
|
-
Args:
|
801
|
-
tsv_content: Raw TSV output from Tesseract.
|
802
|
-
table_column_threshold: Pixel threshold for column clustering.
|
803
|
-
table_row_threshold_ratio: Row threshold as ratio of mean text height.
|
804
|
-
table_min_confidence: Minimum confidence score to include a word.
|
805
|
-
|
806
|
-
Returns:
|
807
|
-
ExtractionResult with extracted content and tables.
|
808
|
-
"""
|
809
733
|
text_result = self._extract_text_from_tsv(tsv_content)
|
810
734
|
|
811
735
|
try:
|
@@ -848,17 +772,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
848
772
|
row_threshold_ratio: float = 0.5,
|
849
773
|
min_confidence: float = 30.0,
|
850
774
|
) -> list[TableData]:
|
851
|
-
"""Extract tables from hOCR structure using coordinate analysis.
|
852
|
-
|
853
|
-
Args:
|
854
|
-
soup: Parsed hOCR BeautifulSoup object.
|
855
|
-
column_threshold: Pixel threshold for column clustering.
|
856
|
-
row_threshold_ratio: Row threshold as ratio of mean text height.
|
857
|
-
min_confidence: Minimum confidence score to include a word.
|
858
|
-
|
859
|
-
Returns:
|
860
|
-
List of detected tables as TableData objects.
|
861
|
-
"""
|
862
775
|
tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
|
863
776
|
|
864
777
|
if not tsv_data:
|
@@ -903,15 +816,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
903
816
|
return tables
|
904
817
|
|
905
818
|
async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
|
906
|
-
"""Convert hOCR structure to TSV format for table extraction.
|
907
|
-
|
908
|
-
Args:
|
909
|
-
soup: Parsed hOCR BeautifulSoup object.
|
910
|
-
min_confidence: Minimum confidence score to include.
|
911
|
-
|
912
|
-
Returns:
|
913
|
-
TSV formatted string compatible with table extractor.
|
914
|
-
"""
|
915
819
|
tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
|
916
820
|
|
917
821
|
words = soup.find_all("span", class_="ocrx_word")
|
@@ -947,14 +851,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
947
851
|
return "\n".join(tsv_lines)
|
948
852
|
|
949
853
|
def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
|
950
|
-
"""Identify potential table regions from word coordinates.
|
951
|
-
|
952
|
-
Args:
|
953
|
-
words: List of word dictionaries with coordinates.
|
954
|
-
|
955
|
-
Returns:
|
956
|
-
List of word groups representing potential tables.
|
957
|
-
"""
|
958
854
|
if not words:
|
959
855
|
return []
|
960
856
|
|
@@ -962,11 +858,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
962
858
|
|
963
859
|
@classmethod
|
964
860
|
async def _validate_tesseract_version(cls) -> None:
|
965
|
-
"""Validate that Tesseract is installed and is version 5 or above.
|
966
|
-
|
967
|
-
Raises:
|
968
|
-
MissingDependencyError: If Tesseract is not installed or is below version 5.
|
969
|
-
"""
|
970
861
|
try:
|
971
862
|
if cls._version_checked:
|
972
863
|
return
|
@@ -992,7 +883,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
992
883
|
) from e
|
993
884
|
|
994
885
|
def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
|
995
|
-
"""Handle cache lookup before processing (sync)."""
|
996
886
|
ocr_cache = get_ocr_cache()
|
997
887
|
|
998
888
|
cached_result = ocr_cache.get(**cache_kwargs)
|
@@ -1010,7 +900,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1010
900
|
return None
|
1011
901
|
|
1012
902
|
def _execute_tesseract_sync(self, command: list[str]) -> None:
|
1013
|
-
"""Run tesseract command synchronously."""
|
1014
903
|
env = os.environ.copy()
|
1015
904
|
if sys.platform.startswith("linux"):
|
1016
905
|
env["OMP_THREAD_LIMIT"] = "1"
|
@@ -1038,7 +927,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1038
927
|
) from e
|
1039
928
|
|
1040
929
|
def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
|
1041
|
-
"""Process the raw output from Tesseract based on the requested format (sync)."""
|
1042
930
|
output_format = run_config["output_format"]
|
1043
931
|
enable_table_detection = run_config["enable_table_detection"]
|
1044
932
|
kwargs = run_config["remaining_kwargs"]
|
@@ -1063,7 +951,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1063
951
|
)
|
1064
952
|
|
1065
953
|
def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
1066
|
-
"""Synchronously process an image and extract its text and metadata."""
|
1067
954
|
use_cache = kwargs.pop("use_cache", True)
|
1068
955
|
|
1069
956
|
save_image = image
|
@@ -1107,7 +994,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1107
994
|
ocr_cache.mark_complete(**cache_kwargs)
|
1108
995
|
|
1109
996
|
def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
1110
|
-
"""Synchronously process a file and extract its text and metadata."""
|
1111
997
|
use_cache = kwargs.pop("use_cache", True)
|
1112
998
|
|
1113
999
|
file_info = self._get_file_info(path)
|
@@ -1188,7 +1074,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1188
1074
|
ocr_cache.mark_complete(**cache_kwargs)
|
1189
1075
|
|
1190
1076
|
def _get_file_info(self, path: Path) -> dict[str, Any]:
|
1191
|
-
"""Get file information for caching."""
|
1192
1077
|
try:
|
1193
1078
|
stat = path.stat()
|
1194
1079
|
return {
|
@@ -1206,7 +1091,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1206
1091
|
def _build_tesseract_command(
|
1207
1092
|
self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
|
1208
1093
|
) -> list[str]:
|
1209
|
-
"""Build tesseract command with all parameters."""
|
1210
1094
|
command = [
|
1211
1095
|
"tesseract",
|
1212
1096
|
str(path),
|
@@ -1235,11 +1119,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1235
1119
|
|
1236
1120
|
@classmethod
|
1237
1121
|
def _validate_tesseract_version_sync(cls) -> None:
|
1238
|
-
"""Synchronously validate that Tesseract is installed and is version 5 or above.
|
1239
|
-
|
1240
|
-
Raises:
|
1241
|
-
MissingDependencyError: If Tesseract is not installed or is below version 5.
|
1242
|
-
"""
|
1243
1122
|
try:
|
1244
1123
|
if cls._version_checked:
|
1245
1124
|
return
|
@@ -1265,17 +1144,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1265
1144
|
|
1266
1145
|
@staticmethod
|
1267
1146
|
def _validate_language_code(language_code: str) -> str:
|
1268
|
-
"""Convert a language code to Tesseract format.
|
1269
|
-
|
1270
|
-
Args:
|
1271
|
-
language_code: Tesseract supported language code or multiple language codes connected with '+'
|
1272
|
-
|
1273
|
-
Raises:
|
1274
|
-
ValidationError: If the language is not supported by Tesseract
|
1275
|
-
|
1276
|
-
Returns:
|
1277
|
-
Language code compatible with Tesseract
|
1278
|
-
"""
|
1279
1147
|
normalized = language_code.lower()
|
1280
1148
|
if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
|
1281
1149
|
return normalized
|
@@ -1300,18 +1168,6 @@ def _process_image_with_tesseract(
|
|
1300
1168
|
image_path: str,
|
1301
1169
|
config_dict: dict[str, Any],
|
1302
1170
|
) -> dict[str, Any]:
|
1303
|
-
"""Process a single image with Tesseract in a separate process.
|
1304
|
-
|
1305
|
-
This function is designed to be executed in a subprocess.
|
1306
|
-
It uses direct tesseract command execution to avoid async complications.
|
1307
|
-
|
1308
|
-
Args:
|
1309
|
-
image_path: Path to the image file.
|
1310
|
-
config_dict: Tesseract configuration as dictionary.
|
1311
|
-
|
1312
|
-
Returns:
|
1313
|
-
OCR result as dictionary.
|
1314
|
-
"""
|
1315
1171
|
try:
|
1316
1172
|
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
|
1317
1173
|
output_base = tmp_file.name.replace(".txt", "")
|
@@ -1399,15 +1255,6 @@ def _process_image_bytes_with_tesseract(
|
|
1399
1255
|
image_bytes: bytes,
|
1400
1256
|
config_dict: dict[str, Any],
|
1401
1257
|
) -> dict[str, Any]:
|
1402
|
-
"""Process image bytes with Tesseract in a separate process.
|
1403
|
-
|
1404
|
-
Args:
|
1405
|
-
image_bytes: Image data as bytes.
|
1406
|
-
config_dict: Tesseract configuration as dictionary.
|
1407
|
-
|
1408
|
-
Returns:
|
1409
|
-
OCR result as dictionary.
|
1410
|
-
"""
|
1411
1258
|
try:
|
1412
1259
|
with (
|
1413
1260
|
tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
|
@@ -1433,21 +1280,12 @@ def _process_image_bytes_with_tesseract(
|
|
1433
1280
|
|
1434
1281
|
|
1435
1282
|
class TesseractProcessPool:
|
1436
|
-
"""Process pool for parallel Tesseract OCR processing."""
|
1437
|
-
|
1438
1283
|
def __init__(
|
1439
1284
|
self,
|
1440
1285
|
config: TesseractConfig | None = None,
|
1441
1286
|
max_processes: int | None = None,
|
1442
1287
|
memory_limit_gb: float | None = None,
|
1443
1288
|
) -> None:
|
1444
|
-
"""Initialize the Tesseract process pool.
|
1445
|
-
|
1446
|
-
Args:
|
1447
|
-
config: Default Tesseract configuration.
|
1448
|
-
max_processes: Maximum number of processes.
|
1449
|
-
memory_limit_gb: Memory limit in GB.
|
1450
|
-
"""
|
1451
1289
|
from kreuzberg._utils._process_pool import ProcessPoolManager # noqa: PLC0415
|
1452
1290
|
|
1453
1291
|
self.config = config or TesseractConfig()
|
@@ -1457,7 +1295,6 @@ class TesseractProcessPool:
|
|
1457
1295
|
)
|
1458
1296
|
|
1459
1297
|
def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
|
1460
|
-
"""Convert TesseractConfig to dictionary for pickling."""
|
1461
1298
|
cfg = config or self.config
|
1462
1299
|
|
1463
1300
|
config_dict = {}
|
@@ -1472,7 +1309,6 @@ class TesseractProcessPool:
|
|
1472
1309
|
return config_dict
|
1473
1310
|
|
1474
1311
|
def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
|
1475
|
-
"""Convert result dictionary back to OCRResult."""
|
1476
1312
|
if not result_dict["success"]:
|
1477
1313
|
raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
|
1478
1314
|
|
@@ -1488,15 +1324,6 @@ class TesseractProcessPool:
|
|
1488
1324
|
image_path: str | Path,
|
1489
1325
|
config: TesseractConfig | None = None,
|
1490
1326
|
) -> ExtractionResult:
|
1491
|
-
"""Process a single image file with Tesseract.
|
1492
|
-
|
1493
|
-
Args:
|
1494
|
-
image_path: Path to the image file.
|
1495
|
-
config: Tesseract configuration (uses default if None).
|
1496
|
-
|
1497
|
-
Returns:
|
1498
|
-
OCR result.
|
1499
|
-
"""
|
1500
1327
|
config_dict = self._config_to_dict(config)
|
1501
1328
|
|
1502
1329
|
task_memory_mb = 80
|
@@ -1515,15 +1342,6 @@ class TesseractProcessPool:
|
|
1515
1342
|
image_bytes: bytes,
|
1516
1343
|
config: TesseractConfig | None = None,
|
1517
1344
|
) -> ExtractionResult:
|
1518
|
-
"""Process image bytes with Tesseract.
|
1519
|
-
|
1520
|
-
Args:
|
1521
|
-
image_bytes: Image data as bytes.
|
1522
|
-
config: Tesseract configuration (uses default if None).
|
1523
|
-
|
1524
|
-
Returns:
|
1525
|
-
OCR result.
|
1526
|
-
"""
|
1527
1345
|
config_dict = self._config_to_dict(config)
|
1528
1346
|
|
1529
1347
|
image_size_mb = len(image_bytes) / 1024 / 1024
|
@@ -1544,16 +1362,6 @@ class TesseractProcessPool:
|
|
1544
1362
|
config: TesseractConfig | None = None,
|
1545
1363
|
max_concurrent: int | None = None,
|
1546
1364
|
) -> list[ExtractionResult]:
|
1547
|
-
"""Process a batch of images in parallel.
|
1548
|
-
|
1549
|
-
Args:
|
1550
|
-
image_paths: List of image file paths.
|
1551
|
-
config: Tesseract configuration (uses default if None).
|
1552
|
-
max_concurrent: Maximum concurrent processes.
|
1553
|
-
|
1554
|
-
Returns:
|
1555
|
-
List of OCR results in the same order as input.
|
1556
|
-
"""
|
1557
1365
|
if not image_paths:
|
1558
1366
|
return []
|
1559
1367
|
|
@@ -1578,16 +1386,6 @@ class TesseractProcessPool:
|
|
1578
1386
|
config: TesseractConfig | None = None,
|
1579
1387
|
max_concurrent: int | None = None,
|
1580
1388
|
) -> list[ExtractionResult]:
|
1581
|
-
"""Process a batch of image bytes in parallel.
|
1582
|
-
|
1583
|
-
Args:
|
1584
|
-
image_bytes_list: List of image data as bytes.
|
1585
|
-
config: Tesseract configuration (uses default if None).
|
1586
|
-
max_concurrent: Maximum concurrent processes.
|
1587
|
-
|
1588
|
-
Returns:
|
1589
|
-
List of OCR results in the same order as input.
|
1590
|
-
"""
|
1591
1389
|
if not image_bytes_list:
|
1592
1390
|
return []
|
1593
1391
|
|
@@ -1608,15 +1406,12 @@ class TesseractProcessPool:
|
|
1608
1406
|
return [self._result_from_dict(result_dict) for result_dict in result_dicts]
|
1609
1407
|
|
1610
1408
|
def get_system_info(self) -> dict[str, Any]:
|
1611
|
-
"""Get system information from the process manager."""
|
1612
1409
|
return self.process_manager.get_system_info()
|
1613
1410
|
|
1614
1411
|
def shutdown(self, wait: bool = True) -> None:
|
1615
|
-
"""Shutdown the process pool."""
|
1616
1412
|
self.process_manager.shutdown(wait=wait)
|
1617
1413
|
|
1618
1414
|
async def __aenter__(self) -> Self:
|
1619
|
-
"""Async context manager entry."""
|
1620
1415
|
return self
|
1621
1416
|
|
1622
1417
|
async def __aexit__(
|
@@ -1625,5 +1420,4 @@ class TesseractProcessPool:
|
|
1625
1420
|
exc_val: BaseException | None,
|
1626
1421
|
exc_tb: object,
|
1627
1422
|
) -> None:
|
1628
|
-
"""Async context manager exit."""
|
1629
1423
|
self.shutdown()
|
kreuzberg/_playa.py
CHANGED
@@ -25,18 +25,6 @@ BOM_CHAR = "\ufeff"
|
|
25
25
|
|
26
26
|
|
27
27
|
async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
|
28
|
-
"""Extract metadata from a PDF document.
|
29
|
-
|
30
|
-
Args:
|
31
|
-
pdf_content: The bytes of the PDF document.
|
32
|
-
password: Password for encrypted PDF files.
|
33
|
-
|
34
|
-
Raises:
|
35
|
-
ParsingError: If the PDF metadata could not be extracted.
|
36
|
-
|
37
|
-
Returns:
|
38
|
-
A dictionary of metadata extracted from the PDF.
|
39
|
-
"""
|
40
28
|
try:
|
41
29
|
document = parse(pdf_content, max_workers=1, password=password)
|
42
30
|
metadata: Metadata = {}
|
@@ -247,7 +235,6 @@ def _collect_document_permissions(document: Document) -> list[str]:
|
|
247
235
|
|
248
236
|
|
249
237
|
def _extract_structure_information(document: Document, result: Metadata) -> None:
|
250
|
-
"""Extract language and subtitle from document structure."""
|
251
238
|
if document.structure:
|
252
239
|
languages = set()
|
253
240
|
subtitle = None
|
@@ -280,20 +267,6 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
280
267
|
|
281
268
|
|
282
269
|
def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
|
283
|
-
"""Synchronous version of extract_pdf_metadata.
|
284
|
-
|
285
|
-
Extract metadata from a PDF document without using async/await.
|
286
|
-
|
287
|
-
Args:
|
288
|
-
pdf_content: The bytes of the PDF document.
|
289
|
-
password: Password for encrypted PDF files.
|
290
|
-
|
291
|
-
Raises:
|
292
|
-
ParsingError: If the PDF metadata could not be extracted.
|
293
|
-
|
294
|
-
Returns:
|
295
|
-
A dictionary of metadata extracted from the PDF.
|
296
|
-
"""
|
297
270
|
try:
|
298
271
|
document = parse(pdf_content, max_workers=1, password=password)
|
299
272
|
metadata: Metadata = {}
|
kreuzberg/_registry.py
CHANGED
@@ -28,14 +28,6 @@ if TYPE_CHECKING:
|
|
28
28
|
|
29
29
|
|
30
30
|
class ExtractorRegistry:
|
31
|
-
"""Manages extractors for different MIME types and their configurations.
|
32
|
-
|
33
|
-
This class provides functionality to register, unregister, and retrieve
|
34
|
-
extractors based on MIME types. It supports both synchronous and asynchronous
|
35
|
-
operations for managing extractors. A default set of extractors is also
|
36
|
-
maintained alongside user-registered extractors.
|
37
|
-
"""
|
38
|
-
|
39
31
|
_default_extractors: ClassVar[list[type[Extractor]]] = [
|
40
32
|
PDFExtractor,
|
41
33
|
OfficeDocumentExtractor,
|
@@ -59,15 +51,6 @@ class ExtractorRegistry:
|
|
59
51
|
@classmethod
|
60
52
|
@lru_cache
|
61
53
|
def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
|
62
|
-
"""Gets the extractor for the mimetype.
|
63
|
-
|
64
|
-
Args:
|
65
|
-
mime_type: The mime type of the content.
|
66
|
-
config: Extraction options object, defaults to the default object.
|
67
|
-
|
68
|
-
Returns:
|
69
|
-
The extractor
|
70
|
-
"""
|
71
54
|
extractors: list[type[Extractor]] = [
|
72
55
|
*cls._registered_extractors,
|
73
56
|
*cls._default_extractors,
|
@@ -81,30 +64,11 @@ class ExtractorRegistry:
|
|
81
64
|
|
82
65
|
@classmethod
|
83
66
|
def add_extractor(cls, extractor: type[Extractor]) -> None:
|
84
|
-
"""Add an extractor to the registry.
|
85
|
-
|
86
|
-
Note:
|
87
|
-
Extractors are tried in the order they are added: first added, first tried.
|
88
|
-
|
89
|
-
Args:
|
90
|
-
extractor: The extractor to add.
|
91
|
-
|
92
|
-
Returns:
|
93
|
-
None
|
94
|
-
"""
|
95
67
|
cls._registered_extractors.append(extractor)
|
96
68
|
cls.get_extractor.cache_clear()
|
97
69
|
|
98
70
|
@classmethod
|
99
71
|
def remove_extractor(cls, extractor: type[Extractor]) -> None:
|
100
|
-
"""Remove an extractor from the registry.
|
101
|
-
|
102
|
-
Args:
|
103
|
-
extractor: The extractor to remove.
|
104
|
-
|
105
|
-
Returns:
|
106
|
-
None
|
107
|
-
"""
|
108
72
|
try:
|
109
73
|
cls._registered_extractors.remove(extractor)
|
110
74
|
cls.get_extractor.cache_clear()
|