PyPI - kreuzberg - Versions diffs - 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.13.0py3-none-any.whl → 3.13.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +0 -124
kreuzberg/_document_classification.py +20 -39
kreuzberg/_entity_extraction.py +0 -29
kreuzberg/_extractors/_base.py +4 -66
kreuzberg/_extractors/_email.py +0 -4
kreuzberg/_extractors/_image.py +0 -2
kreuzberg/_extractors/_pandoc.py +0 -58
kreuzberg/_extractors/_pdf.py +0 -3
kreuzberg/_extractors/_presentation.py +0 -82
kreuzberg/_extractors/_spread_sheet.py +0 -2
kreuzberg/_gmft.py +0 -61
kreuzberg/_language_detection.py +0 -14
kreuzberg/_mime_types.py +0 -17
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +110 -85
kreuzberg/_ocr/_paddleocr.py +146 -138
kreuzberg/_ocr/_table_extractor.py +0 -76
kreuzberg/_ocr/_tesseract.py +0 -206
kreuzberg/_playa.py +0 -27
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +16 -119
kreuzberg/_utils/_cache.py +0 -52
kreuzberg/_utils/_device.py +0 -56
kreuzberg/_utils/_document_cache.py +0 -73
kreuzberg/_utils/_errors.py +0 -47
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -14
kreuzberg/_utils/_process_pool.py +0 -47
kreuzberg/_utils/_quality.py +0 -17
kreuzberg/_utils/_ref.py +0 -16
kreuzberg/_utils/_serialization.py +0 -25
kreuzberg/_utils/_string.py +0 -20
kreuzberg/_utils/_sync.py +0 -76
kreuzberg/_utils/_table.py +0 -45
kreuzberg/_utils/_tmp.py +0 -9
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.13.0.dist-info/RECORD +0 -56
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -11,6 +11,16 @@ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
 from kreuzberg._utils._device import DeviceInfo, validate_device_request
+from kreuzberg._utils._ocr_cache import (
+    build_cache_kwargs,
+    cache_and_complete_async,
+    cache_and_complete_sync,
+    generate_image_hash,
+    get_file_info,
+    handle_cache_lookup_async,
+    handle_cache_lookup_sync,
+    mark_processing_complete,
+)
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -23,15 +33,21 @@ try:  # pragma: no cover
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
-try:
+if TYPE_CHECKING:
     import numpy as np
     from paddleocr import PaddleOCR
-    HAS_PADDLEOCR = True
-except ImportError:
-    HAS_PADDLEOCR = False
-    np = None  # type: ignore[assignment]
-    PaddleOCR = None
+HAS_PADDLEOCR: bool
+if not TYPE_CHECKING:
+    try:
+        import numpy as np
+        from paddleocr import PaddleOCR
+        HAS_PADDLEOCR = True
+    except ImportError:
+        HAS_PADDLEOCR = False
+        np: Any = None
+        PaddleOCR: Any = None
 PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
@@ -41,61 +57,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     _paddle_ocr: ClassVar[Any] = None
     async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        await self._init_paddle_ocr(**kwargs)
+        try:
+            await self._init_paddle_ocr(**kwargs)
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+            if image.mode != "RGB":
+                image = image.convert("RGB")
-        image_np = np.array(image)
-        try:
-            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
-            return self._process_paddle_result(result, image)
+            image_np = np.array(image)
+            use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
+            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
+            extraction_result = self._process_paddle_result(result, image)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
     async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        await self._init_paddle_ocr(**kwargs)
         try:
+            await self._init_paddle_ocr(**kwargs)
             image = await run_sync(Image.open, path)
-            return await self.process_image(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = await self.process_image(image, **kwargs)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @staticmethod
     def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
-        """Process PaddleOCR result into an ExtractionResult with metadata.
-        Args:
-            result: The raw result from PaddleOCR.
-            image: The original PIL image.
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
-        """
         text_content = ""
         confidence_sum = 0
         confidence_count = 0
@@ -155,11 +178,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @classmethod
     def _is_mkldnn_supported(cls) -> bool:
-        """Check if the current architecture supports MKL-DNN optimization.
-        Returns:
-            True if MKL-DNN is supported on this architecture.
-        """
         system = platform.system().lower()
         processor = platform.processor().lower()
         machine = platform.machine().lower()
@@ -174,15 +192,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @classmethod
     async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
-        """Initialize PaddleOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
-        Raises:
-            MissingDependencyError: If PaddleOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._paddle_ocr is not None:
             return
@@ -193,38 +202,34 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         language = cls._validate_language_code(kwargs.pop("language", "en"))
-        device_info = cls._resolve_device_config(**kwargs)
-        use_gpu = device_info.device_type == "cuda"
+        cls._resolve_device_config(**kwargs)
-        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
-        kwargs.setdefault("use_angle_cls", True)
-        kwargs["use_gpu"] = use_gpu and has_gpu_package
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
-        kwargs.setdefault("det_db_thresh", 0.3)
-        kwargs.setdefault("det_db_box_thresh", 0.5)
-        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        bool(find_spec("paddlepaddle_gpu"))
-        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
-            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+        use_angle_cls = kwargs.pop("use_angle_cls", True)
+        kwargs.setdefault("use_textline_orientation", use_angle_cls)
+        det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
+        det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
+        det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
+        kwargs.setdefault("text_det_thresh", det_db_thresh)
+        kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
+        kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
+        kwargs.pop("use_gpu", None)
+        kwargs.pop("gpu_mem", None)
+        kwargs.pop("gpu_memory_limit", None)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
+            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
     @classmethod
     def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
-        """Resolve device configuration with backward compatibility.
-        Args:
-            **kwargs: Configuration parameters including device settings.
-        Returns:
-            DeviceInfo object for the selected device.
-        Raises:
-            ValidationError: If requested device is not available and fallback is disabled.
-        """
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
@@ -269,17 +274,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @staticmethod
     def _validate_language_code(lang_code: str) -> str:
-        """Convert a language code to PaddleOCR format.
-        Args:
-            lang_code: ISO language code or language name
-        Raises:
-            ValidationError: If the language is not supported by PaddleOCR
-        Returns:
-            Language code compatible with PaddleOCR
-        """
         normalized = lang_code.lower()
         if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
             return normalized
@@ -293,61 +287,68 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         )
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        self._init_paddle_ocr_sync(**kwargs)
+        try:
+            self._init_paddle_ocr_sync(**kwargs)
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+            if image.mode != "RGB":
+                image = image.convert("RGB")
-        image_np = np.array(image)
-        try:
-            result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
-            return self._process_paddle_result(result, image)
+            image_np = np.array(image)
+            use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
+            result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
+            extraction_result = self._process_paddle_result(result, image)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
     def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        self._init_paddle_ocr_sync(**kwargs)
         try:
+            self._init_paddle_ocr_sync(**kwargs)
             image = Image.open(path)
-            return self.process_image_sync(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = self.process_image_sync(image, **kwargs)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @classmethod
     def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
-        """Synchronously initialize PaddleOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
-        Raises:
-            MissingDependencyError: If PaddleOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._paddle_ocr is not None:
             return
@@ -358,21 +359,28 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         language = cls._validate_language_code(kwargs.pop("language", "en"))
-        device_info = cls._resolve_device_config(**kwargs)
-        use_gpu = device_info.device_type == "cuda"
+        cls._resolve_device_config(**kwargs)
+        bool(find_spec("paddlepaddle_gpu"))
+        use_angle_cls = kwargs.pop("use_angle_cls", True)
+        kwargs.setdefault("use_textline_orientation", use_angle_cls)
+        det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
+        det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
+        det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
+        kwargs.setdefault("text_det_thresh", det_db_thresh)
+        kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
+        kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
-        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
-        kwargs.setdefault("use_angle_cls", True)
-        kwargs["use_gpu"] = use_gpu and has_gpu_package
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
-        kwargs.setdefault("det_db_thresh", 0.3)
-        kwargs.setdefault("det_db_box_thresh", 0.5)
-        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        kwargs.pop("use_gpu", None)
+        kwargs.pop("gpu_mem", None)
+        kwargs.pop("gpu_memory_limit", None)
-        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
-            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
+            cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e

kreuzberg/_ocr/_table_extractor.py CHANGED Viewed

@@ -13,18 +13,6 @@ if TYPE_CHECKING:
 def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
-    """Parse TSV output into structured word data.
-    Args:
-        tsv_data: Raw TSV output from Tesseract.
-        min_confidence: Minimum confidence score to include a word.
-    Returns:
-        List of word dictionaries with position and text data.
-    Raises:
-        ParsingError: If TSV data cannot be parsed.
-    """
     try:
         reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
         words: list[TSVWord] = []
@@ -62,15 +50,6 @@ def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWor
 def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
-    """Detect columns using X position clustering.
-    Args:
-        words: List of word dictionaries from TSV.
-        column_threshold: Pixel threshold for column clustering.
-    Returns:
-        Sorted list of column X positions.
-    """
     if not words:
         return []
@@ -94,15 +73,6 @@ def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[
 def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
-    """Detect rows using Y position clustering.
-    Args:
-        words: List of word dictionaries from TSV.
-        row_threshold_ratio: Row threshold as ratio of mean text height.
-    Returns:
-        Sorted list of row Y positions.
-    """
     if not words:
         return []
@@ -129,15 +99,6 @@ def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> li
 def _find_closest_index(value: float, positions: list[int]) -> int:
-    """Find index of closest position.
-    Args:
-        value: The value to match.
-        positions: List of positions to search.
-    Returns:
-        Index of the closest position.
-    """
     if not positions:
         return 0
@@ -146,14 +107,6 @@ def _find_closest_index(value: float, positions: list[int]) -> int:
 def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
-    """Remove completely empty rows and columns.
-    Args:
-        table: 2D table array.
-    Returns:
-        Cleaned table with empty rows/columns removed.
-    """
     if not table:
         return table
@@ -175,16 +128,6 @@ def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
 def reconstruct_table(
     words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
 ) -> list[list[str]]:
-    """Reconstruct table from words and detected structure.
-    Args:
-        words: List of word dictionaries from TSV.
-        column_threshold: Pixel threshold for column clustering.
-        row_threshold_ratio: Row threshold as ratio of mean text height.
-    Returns:
-        2D list representing the table structure.
-    """
     if not words:
         return []
@@ -211,14 +154,6 @@ def reconstruct_table(
 def to_markdown(table: list[list[str]]) -> str:
-    """Convert table to markdown format.
-    Args:
-        table: 2D list representing the table.
-    Returns:
-        Markdown-formatted table string.
-    """
     if not table or not table[0]:
         return ""
@@ -238,17 +173,6 @@ def to_markdown(table: list[list[str]]) -> str:
 def extract_table_from_tsv(
     tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
 ) -> str:
-    """Extract table from TSV data and convert to markdown.
-    Args:
-        tsv_data: Raw TSV output from Tesseract.
-        column_threshold: Pixel threshold for column clustering.
-        row_threshold_ratio: Row threshold as ratio of mean text height.
-        min_confidence: Minimum confidence score to include a word.
-    Returns:
-        Markdown-formatted table string, or empty string if no table detected.
-    """
     words = extract_words(tsv_data, min_confidence=min_confidence)
     if not words:
         return ""

kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.13.0py3-none-any.whl → 3.13.1py3-none-any.whl