PyPI - kreuzberg - Versions diffs - 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl - Mend

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (42)

kreuzberg/__init__.py +10 -0
kreuzberg/_api/_config_cache.py +247 -0
kreuzberg/_api/main.py +74 -45
kreuzberg/_chunker.py +7 -6
kreuzberg/_config.py +11 -1
kreuzberg/_constants.py +2 -0
kreuzberg/_document_classification.py +5 -7
kreuzberg/_entity_extraction.py +9 -4
kreuzberg/_extractors/_base.py +269 -3
kreuzberg/_extractors/_email.py +101 -27
kreuzberg/_extractors/_html.py +112 -7
kreuzberg/_extractors/_image.py +23 -22
kreuzberg/_extractors/_pandoc.py +106 -75
kreuzberg/_extractors/_pdf.py +208 -99
kreuzberg/_extractors/_presentation.py +76 -8
kreuzberg/_extractors/_spread_sheet.py +24 -30
kreuzberg/_extractors/_structured.py +83 -15
kreuzberg/_gmft.py +5 -0
kreuzberg/_mcp/server.py +324 -25
kreuzberg/_mime_types.py +42 -0
kreuzberg/_ocr/_easyocr.py +53 -21
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +88 -37
kreuzberg/_types.py +291 -61
kreuzberg/_utils/_cache.py +10 -4
kreuzberg/_utils/_device.py +2 -4
kreuzberg/_utils/_html_streaming.py +20 -0
kreuzberg/_utils/_image_preprocessing.py +12 -39
kreuzberg/_utils/_process_pool.py +29 -8
kreuzberg/_utils/_quality.py +7 -2
kreuzberg/_utils/_resource_managers.py +65 -0
kreuzberg/_utils/_serialization.py +13 -6
kreuzberg/_utils/_sync.py +39 -10
kreuzberg/_utils/_tmp.py +37 -1
kreuzberg/cli.py +34 -20
kreuzberg/extraction.py +44 -28
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
kreuzberg-3.16.0.dist-info/RECORD +61 -0
kreuzberg-3.14.1.dist-info/RECORD +0 -58
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -8,6 +8,7 @@ import re
 import subprocess
 import sys
 import tempfile
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final
@@ -28,10 +29,11 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
-from kreuzberg._utils._process_pool import ProcessPoolManager
+from kreuzberg._utils._html_streaming import should_use_streaming
+from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
-from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
 if TYPE_CHECKING:
@@ -257,18 +259,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         if enable_table_detection and output_format == "text":
             output_format = "tsv"
-        if output_format == "markdown":
-            tesseract_format = "hocr"
-            ext = ".hocr"
-        elif output_format == "tsv":
-            tesseract_format = "tsv"
-            ext = ".tsv"
-        elif output_format == "hocr":
-            tesseract_format = "hocr"
-            ext = ".hocr"
-        else:
-            tesseract_format = "text"
-            ext = ".txt"
+        match output_format:
+            case "markdown":
+                tesseract_format = "hocr"
+                ext = ".hocr"
+            case "tsv":
+                tesseract_format = "tsv"
+                ext = ".tsv"
+            case "hocr":
+                tesseract_format = "hocr"
+                ext = ".hocr"
+            case _:
+                tesseract_format = "text"
+                ext = ".txt"
         return {
             "language": language,
@@ -344,11 +347,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         if output_format == "tsv":
             return self._extract_text_from_tsv(output)
         if output_format == "hocr":
-            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
-        return ExtractionResult(
-            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-        )
+        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
         use_cache = kwargs.pop("use_cache", True)
@@ -494,9 +495,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     content += parts[11] + " "
             content = content.strip()
-        return ExtractionResult(
-            content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-        )
+        return ExtractionResult(content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     async def _process_hocr_to_markdown(
         self,
@@ -512,12 +511,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             escape_asterisks=False,
             escape_underscores=False,
             extract_metadata=False,
-            strip="meta title",
+            strip=["meta", "title"],
         )
         tables: list[TableData] = []
         if enable_table_detection:
-            soup = BeautifulSoup(hocr_content, "lxml")
+            soup = BeautifulSoup(hocr_content, "xml")
             tables = await self._extract_tables_from_hocr(
                 soup,
                 table_column_threshold,
@@ -534,12 +533,16 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         config_dict = config.to_dict()
         config_dict["custom_converters"] = all_converters
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
         try:
             markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
             markdown_content = normalize_spaces(markdown_content)
         except (ValueError, TypeError, AttributeError):
             try:
-                soup = BeautifulSoup(hocr_content, "lxml")
+                soup = BeautifulSoup(hocr_content, "xml")
                 words = soup.find_all("span", class_="ocrx_word")
                 text_parts = []
                 for word in words:
@@ -678,19 +681,25 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 escape_asterisks=False,
                 escape_underscores=False,
                 extract_metadata=False,
-                strip="meta title",
+                strip=["meta", "title"],
             )
+            config_dict = html_config.to_dict()
+            use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+            config_dict["stream_processing"] = use_streaming
+            config_dict["chunk_size"] = chunk_size
             markdown_content = html_to_markdown.convert_to_markdown(
                 hocr_content,
-                **html_config.to_dict(),
+                **config_dict,
             )
             markdown_content = normalize_spaces(markdown_content)
         except (ValueError, TypeError, AttributeError):
             try:
-                soup = BeautifulSoup(hocr_content, "lxml")
+                soup = BeautifulSoup(hocr_content, "xml")
                 words = soup.find_all("span", class_="ocrx_word")
                 text_parts = []
                 for word in words:
@@ -948,11 +957,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         if output_format == "tsv":
             return self._extract_text_from_tsv(output)
         if output_format == "hocr":
-            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})
-        return ExtractionResult(
-            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-        )
+        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
         use_cache = kwargs.pop("use_cache", True)
@@ -979,10 +986,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         ocr_cache = get_ocr_cache()
         try:
             self._validate_tesseract_version_sync()
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
-                image_path = Path(tmp_file.name)
+            with temporary_file_sync(".png") as image_path:
                 save_image.save(str(image_path), format="PNG")
-            try:
                 kwargs_with_cache = {**kwargs, "use_cache": use_cache}
                 result = self.process_file_sync(image_path, **kwargs_with_cache)
@@ -990,9 +995,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     ocr_cache.set(result, **cache_kwargs)
                 return result
-            finally:
-                if image_path.exists():
-                    image_path.unlink()
         finally:
             if use_cache:
                 ocr_cache.mark_complete(**cache_kwargs)
@@ -1092,6 +1094,55 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 "mtime": 0,
             }
+    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
+        """Convert a worker result dict to ExtractionResult."""
+        if result_dict.get("success"):
+            return ExtractionResult(
+                content=str(result_dict.get("text", "")),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        return ExtractionResult(
+            content=f"[OCR error: {result_dict.get('error', 'Unknown error')}]",
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata={},
+            chunks=[],
+        )
+    def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[TesseractConfig]) -> list[ExtractionResult]:
+        if not paths:
+            return []
+        results: list[ExtractionResult] = [
+            ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+        ] * len(paths)
+        run_config = self._prepare_tesseract_run_config(**kwargs)
+        config_dict: dict[str, Any] = {
+            **run_config["remaining_kwargs"],
+            "language": run_config["language"],
+            "psm": run_config["psm"],
+        }
+        optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
+        with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
+            future_to_idx = {
+                pool.submit(_process_image_with_tesseract, str(p), config_dict): idx for idx, p in enumerate(paths)
+            }
+            for future in as_completed(future_to_idx):
+                idx = future_to_idx[future]
+                try:
+                    result_dict = future.result()
+                    results[idx] = self._result_from_dict(result_dict)
+                except Exception as e:  # noqa: BLE001
+                    results[idx] = ExtractionResult(
+                        content=f"[OCR error: {e}]", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+                    )
+        return results
     def _build_tesseract_command(
         self,
         path: Path,

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl