kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -8,6 +8,7 @@ import re
|
|
8
8
|
import subprocess
|
9
9
|
import sys
|
10
10
|
import tempfile
|
11
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
11
12
|
from io import StringIO
|
12
13
|
from pathlib import Path
|
13
14
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
@@ -28,10 +29,11 @@ from kreuzberg._ocr._base import OCRBackend
|
|
28
29
|
from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
|
29
30
|
from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
|
30
31
|
from kreuzberg._utils._cache import get_ocr_cache
|
31
|
-
from kreuzberg._utils.
|
32
|
+
from kreuzberg._utils._html_streaming import should_use_streaming
|
33
|
+
from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
|
32
34
|
from kreuzberg._utils._string import normalize_spaces
|
33
35
|
from kreuzberg._utils._sync import run_sync
|
34
|
-
from kreuzberg._utils._tmp import create_temp_file
|
36
|
+
from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
|
35
37
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
36
38
|
|
37
39
|
if TYPE_CHECKING:
|
@@ -257,18 +259,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
257
259
|
if enable_table_detection and output_format == "text":
|
258
260
|
output_format = "tsv"
|
259
261
|
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
262
|
+
match output_format:
|
263
|
+
case "markdown":
|
264
|
+
tesseract_format = "hocr"
|
265
|
+
ext = ".hocr"
|
266
|
+
case "tsv":
|
267
|
+
tesseract_format = "tsv"
|
268
|
+
ext = ".tsv"
|
269
|
+
case "hocr":
|
270
|
+
tesseract_format = "hocr"
|
271
|
+
ext = ".hocr"
|
272
|
+
case _:
|
273
|
+
tesseract_format = "text"
|
274
|
+
ext = ".txt"
|
272
275
|
|
273
276
|
return {
|
274
277
|
"language": language,
|
@@ -344,11 +347,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
344
347
|
if output_format == "tsv":
|
345
348
|
return self._extract_text_from_tsv(output)
|
346
349
|
if output_format == "hocr":
|
347
|
-
return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
|
350
|
+
return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
|
348
351
|
|
349
|
-
return ExtractionResult(
|
350
|
-
content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
351
|
-
)
|
352
|
+
return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
352
353
|
|
353
354
|
async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
354
355
|
use_cache = kwargs.pop("use_cache", True)
|
@@ -494,9 +495,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
494
495
|
content += parts[11] + " "
|
495
496
|
content = content.strip()
|
496
497
|
|
497
|
-
return ExtractionResult(
|
498
|
-
content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
499
|
-
)
|
498
|
+
return ExtractionResult(content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
500
499
|
|
501
500
|
async def _process_hocr_to_markdown(
|
502
501
|
self,
|
@@ -512,12 +511,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
512
511
|
escape_asterisks=False,
|
513
512
|
escape_underscores=False,
|
514
513
|
extract_metadata=False,
|
515
|
-
strip="meta title",
|
514
|
+
strip=["meta", "title"],
|
516
515
|
)
|
517
516
|
|
518
517
|
tables: list[TableData] = []
|
519
518
|
if enable_table_detection:
|
520
|
-
soup = BeautifulSoup(hocr_content, "
|
519
|
+
soup = BeautifulSoup(hocr_content, "xml")
|
521
520
|
tables = await self._extract_tables_from_hocr(
|
522
521
|
soup,
|
523
522
|
table_column_threshold,
|
@@ -534,12 +533,16 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
534
533
|
config_dict = config.to_dict()
|
535
534
|
config_dict["custom_converters"] = all_converters
|
536
535
|
|
536
|
+
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
|
537
|
+
config_dict["stream_processing"] = use_streaming
|
538
|
+
config_dict["chunk_size"] = chunk_size
|
539
|
+
|
537
540
|
try:
|
538
541
|
markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
|
539
542
|
markdown_content = normalize_spaces(markdown_content)
|
540
543
|
except (ValueError, TypeError, AttributeError):
|
541
544
|
try:
|
542
|
-
soup = BeautifulSoup(hocr_content, "
|
545
|
+
soup = BeautifulSoup(hocr_content, "xml")
|
543
546
|
words = soup.find_all("span", class_="ocrx_word")
|
544
547
|
text_parts = []
|
545
548
|
for word in words:
|
@@ -678,19 +681,25 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
678
681
|
escape_asterisks=False,
|
679
682
|
escape_underscores=False,
|
680
683
|
extract_metadata=False,
|
681
|
-
strip="meta title",
|
684
|
+
strip=["meta", "title"],
|
682
685
|
)
|
683
686
|
|
687
|
+
config_dict = html_config.to_dict()
|
688
|
+
|
689
|
+
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
|
690
|
+
config_dict["stream_processing"] = use_streaming
|
691
|
+
config_dict["chunk_size"] = chunk_size
|
692
|
+
|
684
693
|
markdown_content = html_to_markdown.convert_to_markdown(
|
685
694
|
hocr_content,
|
686
|
-
**
|
695
|
+
**config_dict,
|
687
696
|
)
|
688
697
|
|
689
698
|
markdown_content = normalize_spaces(markdown_content)
|
690
699
|
|
691
700
|
except (ValueError, TypeError, AttributeError):
|
692
701
|
try:
|
693
|
-
soup = BeautifulSoup(hocr_content, "
|
702
|
+
soup = BeautifulSoup(hocr_content, "xml")
|
694
703
|
words = soup.find_all("span", class_="ocrx_word")
|
695
704
|
text_parts = []
|
696
705
|
for word in words:
|
@@ -948,11 +957,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
948
957
|
if output_format == "tsv":
|
949
958
|
return self._extract_text_from_tsv(output)
|
950
959
|
if output_format == "hocr":
|
951
|
-
return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
|
960
|
+
return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
|
952
961
|
|
953
|
-
return ExtractionResult(
|
954
|
-
content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
955
|
-
)
|
962
|
+
return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
956
963
|
|
957
964
|
def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
958
965
|
use_cache = kwargs.pop("use_cache", True)
|
@@ -979,10 +986,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
979
986
|
ocr_cache = get_ocr_cache()
|
980
987
|
try:
|
981
988
|
self._validate_tesseract_version_sync()
|
982
|
-
with
|
983
|
-
image_path = Path(tmp_file.name)
|
989
|
+
with temporary_file_sync(".png") as image_path:
|
984
990
|
save_image.save(str(image_path), format="PNG")
|
985
|
-
try:
|
986
991
|
kwargs_with_cache = {**kwargs, "use_cache": use_cache}
|
987
992
|
result = self.process_file_sync(image_path, **kwargs_with_cache)
|
988
993
|
|
@@ -990,9 +995,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
990
995
|
ocr_cache.set(result, **cache_kwargs)
|
991
996
|
|
992
997
|
return result
|
993
|
-
finally:
|
994
|
-
if image_path.exists():
|
995
|
-
image_path.unlink()
|
996
998
|
finally:
|
997
999
|
if use_cache:
|
998
1000
|
ocr_cache.mark_complete(**cache_kwargs)
|
@@ -1092,6 +1094,55 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1092
1094
|
"mtime": 0,
|
1093
1095
|
}
|
1094
1096
|
|
1097
|
+
def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
|
1098
|
+
"""Convert a worker result dict to ExtractionResult."""
|
1099
|
+
if result_dict.get("success"):
|
1100
|
+
return ExtractionResult(
|
1101
|
+
content=str(result_dict.get("text", "")),
|
1102
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
1103
|
+
metadata={},
|
1104
|
+
chunks=[],
|
1105
|
+
)
|
1106
|
+
return ExtractionResult(
|
1107
|
+
content=f"[OCR error: {result_dict.get('error', 'Unknown error')}]",
|
1108
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
1109
|
+
metadata={},
|
1110
|
+
chunks=[],
|
1111
|
+
)
|
1112
|
+
|
1113
|
+
def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[TesseractConfig]) -> list[ExtractionResult]:
|
1114
|
+
if not paths:
|
1115
|
+
return []
|
1116
|
+
|
1117
|
+
results: list[ExtractionResult] = [
|
1118
|
+
ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
1119
|
+
] * len(paths)
|
1120
|
+
|
1121
|
+
run_config = self._prepare_tesseract_run_config(**kwargs)
|
1122
|
+
config_dict: dict[str, Any] = {
|
1123
|
+
**run_config["remaining_kwargs"],
|
1124
|
+
"language": run_config["language"],
|
1125
|
+
"psm": run_config["psm"],
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
|
1129
|
+
|
1130
|
+
with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
|
1131
|
+
future_to_idx = {
|
1132
|
+
pool.submit(_process_image_with_tesseract, str(p), config_dict): idx for idx, p in enumerate(paths)
|
1133
|
+
}
|
1134
|
+
for future in as_completed(future_to_idx):
|
1135
|
+
idx = future_to_idx[future]
|
1136
|
+
try:
|
1137
|
+
result_dict = future.result()
|
1138
|
+
results[idx] = self._result_from_dict(result_dict)
|
1139
|
+
except Exception as e: # noqa: BLE001
|
1140
|
+
results[idx] = ExtractionResult(
|
1141
|
+
content=f"[OCR error: {e}]", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
|
1142
|
+
)
|
1143
|
+
|
1144
|
+
return results
|
1145
|
+
|
1095
1146
|
def _build_tesseract_command(
|
1096
1147
|
self,
|
1097
1148
|
path: Path,
|