kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
@@ -8,6 +8,7 @@ import re
8
8
  import subprocess
9
9
  import sys
10
10
  import tempfile
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
11
12
  from io import StringIO
12
13
  from pathlib import Path
13
14
  from typing import TYPE_CHECKING, Any, ClassVar, Final
@@ -28,10 +29,11 @@ from kreuzberg._ocr._base import OCRBackend
28
29
  from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
29
30
  from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
30
31
  from kreuzberg._utils._cache import get_ocr_cache
31
- from kreuzberg._utils._process_pool import ProcessPoolManager
32
+ from kreuzberg._utils._html_streaming import should_use_streaming
33
+ from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
32
34
  from kreuzberg._utils._string import normalize_spaces
33
35
  from kreuzberg._utils._sync import run_sync
34
- from kreuzberg._utils._tmp import create_temp_file
36
+ from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
35
37
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
36
38
 
37
39
  if TYPE_CHECKING:
@@ -257,18 +259,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
257
259
  if enable_table_detection and output_format == "text":
258
260
  output_format = "tsv"
259
261
 
260
- if output_format == "markdown":
261
- tesseract_format = "hocr"
262
- ext = ".hocr"
263
- elif output_format == "tsv":
264
- tesseract_format = "tsv"
265
- ext = ".tsv"
266
- elif output_format == "hocr":
267
- tesseract_format = "hocr"
268
- ext = ".hocr"
269
- else:
270
- tesseract_format = "text"
271
- ext = ".txt"
262
+ match output_format:
263
+ case "markdown":
264
+ tesseract_format = "hocr"
265
+ ext = ".hocr"
266
+ case "tsv":
267
+ tesseract_format = "tsv"
268
+ ext = ".tsv"
269
+ case "hocr":
270
+ tesseract_format = "hocr"
271
+ ext = ".hocr"
272
+ case _:
273
+ tesseract_format = "text"
274
+ ext = ".txt"
272
275
 
273
276
  return {
274
277
  "language": language,
@@ -344,11 +347,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
344
347
  if output_format == "tsv":
345
348
  return self._extract_text_from_tsv(output)
346
349
  if output_format == "hocr":
347
- return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
350
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
348
351
 
349
- return ExtractionResult(
350
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
351
- )
352
+ return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
352
353
 
353
354
  async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
354
355
  use_cache = kwargs.pop("use_cache", True)
@@ -494,9 +495,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
494
495
  content += parts[11] + " "
495
496
  content = content.strip()
496
497
 
497
- return ExtractionResult(
498
- content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
499
- )
498
+ return ExtractionResult(content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
500
499
 
501
500
  async def _process_hocr_to_markdown(
502
501
  self,
@@ -512,12 +511,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
512
511
  escape_asterisks=False,
513
512
  escape_underscores=False,
514
513
  extract_metadata=False,
515
- strip="meta title",
514
+ strip=["meta", "title"],
516
515
  )
517
516
 
518
517
  tables: list[TableData] = []
519
518
  if enable_table_detection:
520
- soup = BeautifulSoup(hocr_content, "lxml")
519
+ soup = BeautifulSoup(hocr_content, "xml")
521
520
  tables = await self._extract_tables_from_hocr(
522
521
  soup,
523
522
  table_column_threshold,
@@ -534,12 +533,16 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
534
533
  config_dict = config.to_dict()
535
534
  config_dict["custom_converters"] = all_converters
536
535
 
536
+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
537
+ config_dict["stream_processing"] = use_streaming
538
+ config_dict["chunk_size"] = chunk_size
539
+
537
540
  try:
538
541
  markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
539
542
  markdown_content = normalize_spaces(markdown_content)
540
543
  except (ValueError, TypeError, AttributeError):
541
544
  try:
542
- soup = BeautifulSoup(hocr_content, "lxml")
545
+ soup = BeautifulSoup(hocr_content, "xml")
543
546
  words = soup.find_all("span", class_="ocrx_word")
544
547
  text_parts = []
545
548
  for word in words:
@@ -678,19 +681,25 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
678
681
  escape_asterisks=False,
679
682
  escape_underscores=False,
680
683
  extract_metadata=False,
681
- strip="meta title",
684
+ strip=["meta", "title"],
682
685
  )
683
686
 
687
+ config_dict = html_config.to_dict()
688
+
689
+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
690
+ config_dict["stream_processing"] = use_streaming
691
+ config_dict["chunk_size"] = chunk_size
692
+
684
693
  markdown_content = html_to_markdown.convert_to_markdown(
685
694
  hocr_content,
686
- **html_config.to_dict(),
695
+ **config_dict,
687
696
  )
688
697
 
689
698
  markdown_content = normalize_spaces(markdown_content)
690
699
 
691
700
  except (ValueError, TypeError, AttributeError):
692
701
  try:
693
- soup = BeautifulSoup(hocr_content, "lxml")
702
+ soup = BeautifulSoup(hocr_content, "xml")
694
703
  words = soup.find_all("span", class_="ocrx_word")
695
704
  text_parts = []
696
705
  for word in words:
@@ -948,11 +957,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
948
957
  if output_format == "tsv":
949
958
  return self._extract_text_from_tsv(output)
950
959
  if output_format == "hocr":
951
- return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
960
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
952
961
 
953
- return ExtractionResult(
954
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
955
- )
962
+ return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
956
963
 
957
964
  def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
958
965
  use_cache = kwargs.pop("use_cache", True)
@@ -979,10 +986,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
979
986
  ocr_cache = get_ocr_cache()
980
987
  try:
981
988
  self._validate_tesseract_version_sync()
982
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
983
- image_path = Path(tmp_file.name)
989
+ with temporary_file_sync(".png") as image_path:
984
990
  save_image.save(str(image_path), format="PNG")
985
- try:
986
991
  kwargs_with_cache = {**kwargs, "use_cache": use_cache}
987
992
  result = self.process_file_sync(image_path, **kwargs_with_cache)
988
993
 
@@ -990,9 +995,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
990
995
  ocr_cache.set(result, **cache_kwargs)
991
996
 
992
997
  return result
993
- finally:
994
- if image_path.exists():
995
- image_path.unlink()
996
998
  finally:
997
999
  if use_cache:
998
1000
  ocr_cache.mark_complete(**cache_kwargs)
@@ -1092,6 +1094,55 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1092
1094
  "mtime": 0,
1093
1095
  }
1094
1096
 
1097
+ def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
1098
+ """Convert a worker result dict to ExtractionResult."""
1099
+ if result_dict.get("success"):
1100
+ return ExtractionResult(
1101
+ content=str(result_dict.get("text", "")),
1102
+ mime_type=PLAIN_TEXT_MIME_TYPE,
1103
+ metadata={},
1104
+ chunks=[],
1105
+ )
1106
+ return ExtractionResult(
1107
+ content=f"[OCR error: {result_dict.get('error', 'Unknown error')}]",
1108
+ mime_type=PLAIN_TEXT_MIME_TYPE,
1109
+ metadata={},
1110
+ chunks=[],
1111
+ )
1112
+
1113
+ def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[TesseractConfig]) -> list[ExtractionResult]:
1114
+ if not paths:
1115
+ return []
1116
+
1117
+ results: list[ExtractionResult] = [
1118
+ ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
1119
+ ] * len(paths)
1120
+
1121
+ run_config = self._prepare_tesseract_run_config(**kwargs)
1122
+ config_dict: dict[str, Any] = {
1123
+ **run_config["remaining_kwargs"],
1124
+ "language": run_config["language"],
1125
+ "psm": run_config["psm"],
1126
+ }
1127
+
1128
+ optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
1129
+
1130
+ with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
1131
+ future_to_idx = {
1132
+ pool.submit(_process_image_with_tesseract, str(p), config_dict): idx for idx, p in enumerate(paths)
1133
+ }
1134
+ for future in as_completed(future_to_idx):
1135
+ idx = future_to_idx[future]
1136
+ try:
1137
+ result_dict = future.result()
1138
+ results[idx] = self._result_from_dict(result_dict)
1139
+ except Exception as e: # noqa: BLE001
1140
+ results[idx] = ExtractionResult(
1141
+ content=f"[OCR error: {e}]", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
1142
+ )
1143
+
1144
+ return results
1145
+
1095
1146
  def _build_tesseract_command(
1096
1147
  self,
1097
1148
  path: Path,