PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +212 -292
kreuzberg/_document_classification.py +20 -47
kreuzberg/_entity_extraction.py +1 -122
kreuzberg/_extractors/_base.py +4 -71
kreuzberg/_extractors/_email.py +1 -15
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -25
kreuzberg/_extractors/_pandoc.py +10 -147
kreuzberg/_extractors/_pdf.py +38 -94
kreuzberg/_extractors/_presentation.py +0 -99
kreuzberg/_extractors/_spread_sheet.py +13 -55
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -199
kreuzberg/_language_detection.py +1 -36
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -19
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +124 -186
kreuzberg/_ocr/_paddleocr.py +154 -224
kreuzberg/_ocr/_table_extractor.py +184 -0
kreuzberg/_ocr/_tesseract.py +797 -361
kreuzberg/_playa.py +5 -31
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +588 -93
kreuzberg/_utils/_cache.py +84 -138
kreuzberg/_utils/_device.py +0 -74
kreuzberg/_utils/_document_cache.py +0 -75
kreuzberg/_utils/_errors.py +0 -50
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -16
kreuzberg/_utils/_process_pool.py +17 -64
kreuzberg/_utils/_quality.py +0 -60
kreuzberg/_utils/_ref.py +32 -0
kreuzberg/_utils/_serialization.py +0 -30
kreuzberg/_utils/_string.py +9 -59
kreuzberg/_utils/_sync.py +0 -77
kreuzberg/_utils/_table.py +49 -101
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -2,17 +2,25 @@ from __future__ import annotations
 import platform
 import warnings
-from dataclasses import dataclass
 from importlib.util import find_spec
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Final
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
+from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
+from kreuzberg._utils._device import DeviceInfo, validate_device_request
+from kreuzberg._utils._ocr_cache import (
+    build_cache_kwargs,
+    cache_and_complete_async,
+    cache_and_complete_sync,
+    generate_image_hash,
+    get_file_info,
+    handle_cache_lookup_async,
+    handle_cache_lookup_sync,
+    mark_processing_complete,
+)
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -20,154 +28,97 @@ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationErr
 if TYPE_CHECKING:
     from pathlib import Path
 try:  # pragma: no cover
     from typing import Unpack  # type: ignore[attr-defined]
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
+if TYPE_CHECKING:
+    import numpy as np
+    from paddleocr import PaddleOCR
-PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
+HAS_PADDLEOCR: bool
+if not TYPE_CHECKING:
+    try:
+        import numpy as np
+        from paddleocr import PaddleOCR
+        HAS_PADDLEOCR = True
+    except ImportError:
+        HAS_PADDLEOCR = False
+        np: Any = None
+        PaddleOCR: Any = None
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class PaddleOCRConfig:
-    """Configuration options for PaddleOCR.
-    This TypedDict provides type hints and documentation for all PaddleOCR parameters.
-    """
-    cls_image_shape: str = "3,48,192"
-    """Image shape for classification algorithm in format 'channels,height,width'."""
-    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
-    """Detection algorithm."""
-    det_db_box_thresh: float = 0.5
-    """Score threshold for detected boxes. Boxes below this value are discarded."""
-    det_db_thresh: float = 0.3
-    """Binarization threshold for DB output map."""
-    det_db_unclip_ratio: float = 2.0
-    """Expansion ratio for detected text boxes."""
-    det_east_cover_thresh: float = 0.1
-    """Score threshold for EAST output boxes."""
-    det_east_nms_thresh: float = 0.2
-    """NMS threshold for EAST model output boxes."""
-    det_east_score_thresh: float = 0.8
-    """Binarization threshold for EAST output map."""
-    det_max_side_len: int = 960
-    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
-    det_model_dir: str | None = None
-    """Directory for detection model. If None, uses default model location."""
-    drop_score: float = 0.5
-    """Filter recognition results by confidence score. Results below this are discarded."""
-    enable_mkldnn: bool = False
-    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
-    gpu_mem: int = 8000
-    """GPU memory size (in MB) to use for initialization."""
-    language: str = "en"
-    """Language to use for OCR."""
-    max_text_length: int = 25
-    """Maximum text length that the recognition algorithm can recognize."""
-    rec: bool = True
-    """Enable text recognition when using the ocr() function."""
-    rec_algorithm: Literal[
-        "CRNN",
-        "SRN",
-        "NRTR",
-        "SAR",
-        "SEED",
-        "SVTR",
-        "SVTR_LCNet",
-        "ViTSTR",
-        "ABINet",
-        "VisionLAN",
-        "SPIN",
-        "RobustScanner",
-        "RFL",
-    ] = "CRNN"
-    """Recognition algorithm."""
-    rec_image_shape: str = "3,32,320"
-    """Image shape for recognition algorithm in format 'channels,height,width'."""
-    rec_model_dir: str | None = None
-    """Directory for recognition model. If None, uses default model location."""
-    table: bool = True
-    """Whether to enable table recognition."""
-    use_angle_cls: bool = True
-    """Whether to use text orientation classification model."""
-    use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-    device: DeviceType = "auto"
-    """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
-    gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
-    fallback_to_cpu: bool = True
-    """Whether to fallback to CPU if requested device is unavailable."""
-    use_space_char: bool = True
-    """Whether to recognize spaces."""
-    use_zero_copy_run: bool = False
-    """Whether to enable zero_copy_run for inference optimization."""
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
 class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     _paddle_ocr: ClassVar[Any] = None
     async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        import numpy as np  # noqa: PLC0415
+        try:
+            await self._init_paddle_ocr(**kwargs)
-        await self._init_paddle_ocr(**kwargs)
+            if image.mode != "RGB":
+                image = image.convert("RGB")
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+            image_np = np.array(image)
+            use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
+            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
-        image_np = np.array(image)
-        try:
-            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
-            return self._process_paddle_result(result, image)
+            extraction_result = self._process_paddle_result(result, image)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
     async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        await self._init_paddle_ocr(**kwargs)
         try:
+            await self._init_paddle_ocr(**kwargs)
             image = await run_sync(Image.open, path)
-            return await self.process_image(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = await self.process_image(image, **kwargs)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @staticmethod
     def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
-        """Process PaddleOCR result into an ExtractionResult with metadata.
-        Args:
-            result: The raw result from PaddleOCR.
-            image: The original PIL image.
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
-        """
         text_content = ""
         confidence_sum = 0
         confidence_count = 0
@@ -227,11 +178,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @classmethod
     def _is_mkldnn_supported(cls) -> bool:
-        """Check if the current architecture supports MKL-DNN optimization.
-        Returns:
-            True if MKL-DNN is supported on this architecture.
-        """
         system = platform.system().lower()
         processor = platform.processor().lower()
         machine = platform.machine().lower()
@@ -246,59 +192,44 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @classmethod
     async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
-        """Initialize PaddleOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
-        Raises:
-            MissingDependencyError: If PaddleOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._paddle_ocr is not None:
             return
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
         language = cls._validate_language_code(kwargs.pop("language", "en"))
-        device_info = cls._resolve_device_config(**kwargs)
-        use_gpu = device_info.device_type == "cuda"
+        cls._resolve_device_config(**kwargs)
+        bool(find_spec("paddlepaddle_gpu"))
+        use_angle_cls = kwargs.pop("use_angle_cls", True)
+        kwargs.setdefault("use_textline_orientation", use_angle_cls)
+        det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
+        det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
+        det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
-        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
-        kwargs.setdefault("use_angle_cls", True)
-        kwargs["use_gpu"] = use_gpu and has_gpu_package
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
-        kwargs.setdefault("det_db_thresh", 0.3)
-        kwargs.setdefault("det_db_box_thresh", 0.5)
-        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        kwargs.setdefault("text_det_thresh", det_db_thresh)
+        kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
+        kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
-        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
-            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+        kwargs.pop("use_gpu", None)
+        kwargs.pop("gpu_mem", None)
+        kwargs.pop("gpu_memory_limit", None)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
+            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
     @classmethod
     def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
-        """Resolve device configuration with backward compatibility.
-        Args:
-            **kwargs: Configuration parameters including device settings.
-        Returns:
-            DeviceInfo object for the selected device.
-        Raises:
-            ValidationError: If requested device is not available and fallback is disabled.
-        """
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
@@ -343,17 +274,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
     @staticmethod
     def _validate_language_code(lang_code: str) -> str:
-        """Convert a language code to PaddleOCR format.
-        Args:
-            lang_code: ISO language code or language name
-        Raises:
-            ValidationError: If the language is not supported by PaddleOCR
-        Returns:
-            Language code compatible with PaddleOCR
-        """
         normalized = lang_code.lower()
         if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
             return normalized
@@ -367,90 +287,100 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         )
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        import numpy as np  # noqa: PLC0415
+        try:
+            self._init_paddle_ocr_sync(**kwargs)
-        self._init_paddle_ocr_sync(**kwargs)
+            if image.mode != "RGB":
+                image = image.convert("RGB")
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+            image_np = np.array(image)
+            use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
+            result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
-        image_np = np.array(image)
-        try:
-            result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
-            return self._process_paddle_result(result, image)
+            extraction_result = self._process_paddle_result(result, image)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
     def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata using PaddleOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("paddleocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        self._init_paddle_ocr_sync(**kwargs)
         try:
+            self._init_paddle_ocr_sync(**kwargs)
             image = Image.open(path)
-            return self.process_image_sync(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = self.process_image_sync(image, **kwargs)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @classmethod
     def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
-        """Synchronously initialize PaddleOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
-        Raises:
-            MissingDependencyError: If PaddleOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._paddle_ocr is not None:
             return
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
         language = cls._validate_language_code(kwargs.pop("language", "en"))
-        device_info = cls._resolve_device_config(**kwargs)
-        use_gpu = device_info.device_type == "cuda"
+        cls._resolve_device_config(**kwargs)
+        bool(find_spec("paddlepaddle_gpu"))
+        use_angle_cls = kwargs.pop("use_angle_cls", True)
+        kwargs.setdefault("use_textline_orientation", use_angle_cls)
+        det_db_thresh = kwargs.pop("det_db_thresh", 0.3)
+        det_db_box_thresh = kwargs.pop("det_db_box_thresh", 0.5)
+        det_db_unclip_ratio = kwargs.pop("det_db_unclip_ratio", 1.6)
+        kwargs.setdefault("text_det_thresh", det_db_thresh)
+        kwargs.setdefault("text_det_box_thresh", det_db_box_thresh)
+        kwargs.setdefault("text_det_unclip_ratio", det_db_unclip_ratio)
-        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
-        kwargs.setdefault("use_angle_cls", True)
-        kwargs["use_gpu"] = use_gpu and has_gpu_package
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
-        kwargs.setdefault("det_db_thresh", 0.3)
-        kwargs.setdefault("det_db_box_thresh", 0.5)
-        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        kwargs.pop("use_gpu", None)
+        kwargs.pop("gpu_mem", None)
+        kwargs.pop("gpu_memory_limit", None)
-        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
-            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
+            cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl