PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +212 -292
kreuzberg/_document_classification.py +20 -47
kreuzberg/_entity_extraction.py +1 -122
kreuzberg/_extractors/_base.py +4 -71
kreuzberg/_extractors/_email.py +1 -15
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -25
kreuzberg/_extractors/_pandoc.py +10 -147
kreuzberg/_extractors/_pdf.py +38 -94
kreuzberg/_extractors/_presentation.py +0 -99
kreuzberg/_extractors/_spread_sheet.py +13 -55
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -199
kreuzberg/_language_detection.py +1 -36
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -19
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +124 -186
kreuzberg/_ocr/_paddleocr.py +154 -224
kreuzberg/_ocr/_table_extractor.py +184 -0
kreuzberg/_ocr/_tesseract.py +797 -361
kreuzberg/_playa.py +5 -31
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +588 -93
kreuzberg/_utils/_cache.py +84 -138
kreuzberg/_utils/_device.py +0 -74
kreuzberg/_utils/_document_cache.py +0 -75
kreuzberg/_utils/_errors.py +0 -50
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -16
kreuzberg/_utils/_process_pool.py +17 -64
kreuzberg/_utils/_quality.py +0 -60
kreuzberg/_utils/_ref.py +32 -0
kreuzberg/_utils/_serialization.py +0 -30
kreuzberg/_utils/_string.py +9 -59
kreuzberg/_utils/_sync.py +0 -77
kreuzberg/_utils/_table.py +49 -101
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -1,15 +1,24 @@
 from __future__ import annotations
 import warnings
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Final
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
+from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, validate_device_request
+from kreuzberg._utils._ocr_cache import (
+    build_cache_kwargs,
+    cache_and_complete_async,
+    cache_and_complete_sync,
+    generate_image_hash,
+    get_file_info,
+    handle_cache_lookup_async,
+    handle_cache_lookup_sync,
+    mark_processing_complete,
+)
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -22,6 +31,25 @@ try:  # pragma: no cover
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
+if TYPE_CHECKING:
+    import easyocr
+    import numpy as np
+    import torch
+HAS_EASYOCR: bool
+if not TYPE_CHECKING:
+    try:
+        import easyocr
+        import numpy as np
+        import torch
+        HAS_EASYOCR = True
+    except ImportError:
+        HAS_EASYOCR = False
+        easyocr: Any = None
+        np: Any = None
+        torch: Any = None
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "abq",
@@ -110,88 +138,32 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
 }
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class EasyOCRConfig:
-    """Configuration options for EasyOCR."""
-    add_margin: float = 0.1
-    """Extend bounding boxes in all directions."""
-    adjust_contrast: float = 0.5
-    """Target contrast level for low contrast text."""
-    beam_width: int = 5
-    """Beam width for beam search in recognition."""
-    canvas_size: int = 2560
-    """Maximum image dimension for detection."""
-    contrast_ths: float = 0.1
-    """Contrast threshold for preprocessing."""
-    decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
-    """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
-    height_ths: float = 0.5
-    """Maximum difference in box height for merging."""
-    language: str | list[str] = "en"
-    """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
-    a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
-    link_threshold: float = 0.4
-    """Link confidence threshold."""
-    low_text: float = 0.4
-    """Text low-bound score."""
-    mag_ratio: float = 1.0
-    """Image magnification ratio."""
-    min_size: int = 10
-    """Minimum text box size in pixels."""
-    rotation_info: list[int] | None = None
-    """List of angles to try for detection."""
-    slope_ths: float = 0.1
-    """Maximum slope for merging text boxes."""
-    text_threshold: float = 0.7
-    """Text confidence threshold."""
-    use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-    device: DeviceType = "auto"
-    """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
-    gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
-    fallback_to_cpu: bool = True
-    """Whether to fallback to CPU if requested device is unavailable."""
-    width_ths: float = 0.5
-    """Maximum horizontal distance for merging boxes."""
-    x_ths: float = 1.0
-    """Maximum horizontal distance for paragraph merging."""
-    y_ths: float = 0.5
-    """Maximum vertical distance for paragraph merging."""
-    ycenter_ths: float = 0.5
-    """Maximum shift in y direction for merging."""
 class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata using EasyOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        import numpy as np  # noqa: PLC0415
-        await self._init_easyocr(**kwargs)
+        try:
+            await self._init_easyocr(**kwargs)
-        beam_width = kwargs.pop("beam_width")
+            beam_width = kwargs.pop("beam_width", 5)
-        kwargs.pop("language", None)
-        kwargs.pop("use_gpu", None)
-        kwargs.pop("device", None)
-        kwargs.pop("gpu_memory_limit", None)
-        kwargs.pop("fallback_to_cpu", None)
+            kwargs.pop("language", None)
+            kwargs.pop("use_gpu", None)
+            kwargs.pop("device", None)
+            kwargs.pop("gpu_memory_limit", None)
+            kwargs.pop("fallback_to_cpu", None)
-        try:
             result = await run_sync(
                 self._reader.readtext,
                 np.array(image),
@@ -199,41 +171,47 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 **kwargs,
             )
-            return self._process_easyocr_result(result, image)
+            extraction_result = self._process_easyocr_result(result, image)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
     async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata using EasyOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = await handle_cache_lookup_async(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        await self._init_easyocr(**kwargs)
         try:
+            await self._init_easyocr(**kwargs)
             image = await run_sync(Image.open, path)
-            return await self.process_image(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = await self.process_image(image, **kwargs)
+            if use_cache and cache_kwargs:
+                await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
     @staticmethod
     def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
-        """Process EasyOCR result into an ExtractionResult with metadata.
-        Args:
-            result: The raw result from EasyOCR.
-            image: The original PIL image.
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
-        """
         if not result:
             return ExtractionResult(
                 content="",
@@ -314,38 +292,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @classmethod
     def _is_gpu_available(cls) -> bool:
-        """Check if GPU is available for EasyOCR.
-        Returns:
-            bool: True if GPU support is available.
-        """
-        try:
-            import torch  # noqa: PLC0415
-            return bool(torch.cuda.is_available())
-        except ImportError:  # pragma: no cover
+        if not HAS_EASYOCR or torch is None:
             return False
+        return bool(torch.cuda.is_available())
     @classmethod
     async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
-        """Initialize EasyOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for EasyOCR including language, etc.
-        Raises:
-            MissingDependencyError: If EasyOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._reader is not None:
             return
-        try:
-            import easyocr  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_EASYOCR or easyocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
-            ) from e
+            )
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
@@ -369,17 +328,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @classmethod
     def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
-        """Resolve device configuration with backward compatibility.
-        Args:
-            **kwargs: Configuration parameters including device settings.
-        Returns:
-            DeviceInfo object for the selected device.
-        Raises:
-            ValidationError: If requested device is not available and fallback is disabled.
-        """
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
@@ -416,17 +364,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @staticmethod
     def _validate_language_code(language_codes: str | list[str]) -> list[str]:
-        """Validate and normalize provided language codes.
-        Args:
-            language_codes: The language code(s), either as a string (single or comma-separated) or a list.
-        Raises:
-            ValidationError: If any of the languages are not supported by EasyOCR
-        Returns:
-            A list with the normalized language codes.
-        """
         if isinstance(language_codes, str):
             languages = [lang.strip().lower() for lang in language_codes.split(",")]
         else:
@@ -445,80 +382,81 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata using EasyOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            image_hash = generate_image_hash(image)
+            cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If OCR processing fails.
-        """
-        import numpy as np  # noqa: PLC0415
-        self._init_easyocr_sync(**kwargs)
+        try:
+            self._init_easyocr_sync(**kwargs)
-        beam_width = kwargs.pop("beam_width")
-        kwargs.pop("language", None)
-        kwargs.pop("use_gpu", None)
-        kwargs.pop("device", None)
-        kwargs.pop("gpu_memory_limit", None)
-        kwargs.pop("fallback_to_cpu", None)
+            beam_width = kwargs.pop("beam_width", 5)
+            kwargs.pop("language", None)
+            kwargs.pop("use_gpu", None)
+            kwargs.pop("device", None)
+            kwargs.pop("gpu_memory_limit", None)
+            kwargs.pop("fallback_to_cpu", None)
-        try:
             result = self._reader.readtext(
                 np.array(image),
                 beamWidth=beam_width,
                 **kwargs,
             )
-            return self._process_easyocr_result(result, image)
+            extraction_result = self._process_easyocr_result(result, image)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
     def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata using EasyOCR.
+        use_cache = kwargs.pop("use_cache", True)
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        cache_kwargs = None
+        if use_cache:
+            file_info = get_file_info(path)
+            cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+            cached_result = handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
-        Raises:
-            OCRError: If file loading or OCR processing fails.
-        """
-        self._init_easyocr_sync(**kwargs)
         try:
+            self._init_easyocr_sync(**kwargs)
             image = Image.open(path)
-            return self.process_image_sync(image, **kwargs)
+            kwargs["use_cache"] = False
+            extraction_result = self.process_image_sync(image, **kwargs)
+            if use_cache and cache_kwargs:
+                cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
+            return extraction_result
         except Exception as e:
+            if use_cache and cache_kwargs:
+                mark_processing_complete(cache_kwargs)
             raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
     @classmethod
     def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
-        """Synchronously initialize EasyOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for EasyOCR including language, etc.
-        Raises:
-            MissingDependencyError: If EasyOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._reader is not None:
             return
-        try:
-            import easyocr  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_EASYOCR or easyocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
-            ) from e
+            )
         languages = cls._validate_language_code(kwargs.pop("language", "en"))

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl