PyPI - kreuzberg - Versions diffs - 3.1.7__tar.gz → 3.2.0__tar.gz - Mend

kreuzberg 3.1.7 (tar.gz) → 3.2.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.1.7
+Version: 3.2.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -27,8 +27,8 @@ License-File: LICENSE
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.2
 Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
-Requires-Dist: html-to-markdown>=1.3.3
-Requires-Dist: playa-pdf>=0.5.1
+Requires-Dist: html-to-markdown>=1.4.0
+Requires-Dist: playa-pdf>=0.6.1
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
 Requires-Dist: python-pptx>=1.0.2
@@ -36,7 +36,7 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < "3.12"
 Provides-Extra: all
 Requires-Dist: easyocr>=1.7.2; extra == "all"
 Requires-Dist: gmft>=0.4.1; extra == "all"
-Requires-Dist: paddleocr>=3.0.1; extra == "all"
+Requires-Dist: paddleocr>=3.0.2; extra == "all"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == "all"
 Requires-Dist: setuptools>=80.9.0; extra == "all"
@@ -47,7 +47,7 @@ Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.1; extra == "gmft"
 Provides-Extra: paddleocr
-Requires-Dist: paddleocr>=3.0.1; extra == "paddleocr"
+Requires-Dist: paddleocr>=3.0.2; extra == "paddleocr"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
 Requires-Dist: setuptools>=80.9.0; extra == "paddleocr"
 Dynamic: license-file
@@ -157,17 +157,9 @@ Kreuzberg supports multiple OCR engines:
 For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
-## Contribution
+## Contributing
-This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
-### Local Development
-- Clone the repo
-- Install the system dependencies
-- Install the full dependencies with `uv sync`
-- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
-- Make your changes and submit a PR
+We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
 ## License

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/README.md RENAMED Viewed

@@ -103,17 +103,9 @@ Kreuzberg supports multiple OCR engines:
 For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
-## Contribution
+## Contributing
-This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
-### Local Development
-- Clone the repo
-- Install the system dependencies
-- Install the full dependencies with `uv sync`
-- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
-- Make your changes and submit a PR
+We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
 ## License

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg/_extractors/_presentation.py RENAMED Viewed

@@ -202,7 +202,7 @@ class PresentationExtractor(Extractor):
             ("keywords", "keywords"),
             ("modified_by", "last_modified_by"),
             ("modified_at", "modified"),
-            ("version", "revision"),  # if version and revision are given, version overwrites ~keep
+            ("version", "revision"),  # if version and revision are given, version overwrites
             ("subject", "subject"),
             ("title", "title"),
             ("version", "version"),

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg/_ocr/_easyocr.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -8,6 +9,7 @@ from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -144,7 +146,13 @@ class EasyOCRConfig:
     text_threshold: float = 0.7
     """Text confidence threshold."""
     use_gpu: bool = False
-    """Whether to use GPU for inference."""
+    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
+    device: DeviceType = "auto"
+    """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
+    gpu_memory_limit: float | None = None
+    """Maximum GPU memory to use in GB. None for no limit."""
+    fallback_to_cpu: bool = True
+    """Whether to fallback to CPU if requested device is unavailable."""
     width_ths: float = 0.5
     """Maximum horizontal distance for merging boxes."""
     x_ths: float = 1.0
@@ -336,8 +344,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             ) from e
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
-        has_gpu = cls._is_gpu_available()
-        kwargs.setdefault("gpu", has_gpu)
+        # Handle device selection with backward compatibility
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type in ("cuda", "mps")
         kwargs.setdefault("detector", True)
         kwargs.setdefault("recognizer", True)
         kwargs.setdefault("download_enabled", True)
@@ -347,12 +358,63 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             cls._reader = await run_sync(
                 easyocr.Reader,
                 languages,
-                gpu=kwargs.get("use_gpu"),
+                gpu=use_gpu,
                 verbose=False,
             )
         except Exception as e:
             raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
+    @classmethod
+    def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
+        """Resolve the device EasyOCR should run on, honouring the deprecated ``use_gpu`` flag.
+        Args:
+            **kwargs: Configuration parameters; reads ``use_gpu``, ``device``,
+                ``gpu_memory_limit`` and ``fallback_to_cpu``.
+        Returns:
+            DeviceInfo object for the selected device.
+        Raises:
+            ValidationError: If requested device is not available and fallback is disabled.
+        """
+        use_gpu = kwargs.get("use_gpu", False)
+        device = kwargs.get("device", "auto")
+        memory_limit = kwargs.get("gpu_memory_limit")
+        fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
+        if use_gpu:
+            # ``use_gpu`` is deprecated. On its own it maps to ``device="auto"``, which is
+            # already the value of ``device`` in that case, so nothing needs reassigning;
+            # when an explicit ``device`` is also given, ``device`` wins.
+            if device == "auto":
+                message = (
+                    "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
+                    "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
+                )
+            else:
+                message = (
+                    "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
+                    "Using 'device' parameter value."
+                )
+            warnings.warn(message, DeprecationWarning, stacklevel=4)
+        # Validate and get device info
+        try:
+            return validate_device_request(
+                device,
+                "EasyOCR",
+                memory_limit=memory_limit,
+                fallback_to_cpu=fallback_to_cpu,
+            )
+        except ValidationError:
+            # An explicit CPU request without the deprecated flag must never fail:
+            # hand back a plain CPU descriptor instead of propagating the error.
+            if not use_gpu and device == "cpu":
+                return DeviceInfo(device_type="cpu", name="CPU")
+            raise
     @staticmethod
     def _validate_language_code(language_codes: str | list[str]) -> list[str]:
         """Validate and normalize provided language codes.

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg/_ocr/_paddleocr.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import platform
+import warnings
 from dataclasses import dataclass
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -10,6 +11,7 @@ from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -91,7 +93,13 @@ class PaddleOCRConfig:
     use_angle_cls: bool = True
     """Whether to use text orientation classification model."""
     use_gpu: bool = False
-    """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
+    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
+    device: DeviceType = "auto"
+    """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
+    gpu_memory_limit: float | None = None
+    """Maximum GPU memory to use in GB. None for no limit."""
+    fallback_to_cpu: bool = True
+    """Whether to fallback to CPU if requested device is unavailable."""
     use_space_char: bool = True
     """Whether to recognize spaces."""
     use_zero_copy_run: bool = False
@@ -248,19 +256,88 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             ) from e
         language = cls._validate_language_code(kwargs.pop("language", "en"))
+        # Handle device selection with backward compatibility
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type == "cuda"
         has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
         kwargs.setdefault("use_angle_cls", True)
-        kwargs.setdefault("use_gpu", has_gpu_package)
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
+        kwargs["use_gpu"] = use_gpu and has_gpu_package
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
         kwargs.setdefault("det_db_thresh", 0.3)
         kwargs.setdefault("det_db_box_thresh", 0.5)
         kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        # Set GPU memory limit if specified
+        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
+            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)  # Convert GB to MB
         try:
             cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
+    @classmethod
+    def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
+        """Resolve the device PaddleOCR should run on, honouring the deprecated ``use_gpu`` flag.
+        Args:
+            **kwargs: Configuration parameters; reads ``use_gpu``, ``device``,
+                ``gpu_memory_limit`` and ``fallback_to_cpu``.
+        Returns:
+            DeviceInfo object for the selected device.
+        Raises:
+            ValidationError: If requested device is not available and fallback is disabled.
+        """
+        use_gpu = kwargs.get("use_gpu", False)
+        device = kwargs.get("device", "auto")
+        memory_limit = kwargs.get("gpu_memory_limit")
+        fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
+        if use_gpu:
+            # ``use_gpu`` is deprecated. On its own it maps to ``device="auto"``, which is
+            # already the value of ``device`` in that case, so nothing needs reassigning;
+            # when an explicit ``device`` is also given, ``device`` wins.
+            if device == "auto":
+                message = (
+                    "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
+                    "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
+                )
+            else:
+                message = (
+                    "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
+                    "Using 'device' parameter value."
+                )
+            warnings.warn(message, DeprecationWarning, stacklevel=4)
+        # PaddlePaddle doesn't support MPS, so an explicit request is downgraded to CPU
+        if device == "mps":
+            warnings.warn(
+                "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
+                UserWarning,
+                stacklevel=4,
+            )
+            device = "cpu"
+        # Validate and get device info
+        try:
+            return validate_device_request(
+                device,
+                "PaddleOCR",
+                memory_limit=memory_limit,
+                fallback_to_cpu=fallback_to_cpu,
+            )
+        except ValidationError:
+            # A CPU request without the deprecated flag must never fail:
+            # hand back a plain CPU descriptor instead of propagating the error.
+            if not use_gpu and device == "cpu":
+                return DeviceInfo(device_type="cpu", name="CPU")
+            raise
     @staticmethod
     def _validate_language_code(lang_code: str) -> str:
         """Convert a language code to PaddleOCR format.

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg/_ocr/_tesseract.py RENAMED Viewed

@@ -264,7 +264,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             env: dict[str, Any] | None = None
             if sys.platform.startswith("linux"):
-                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
+                # we have to prevent multithreading this way otherwise we will get deadlocks
                 env = {"OMP_THREAD_LIMIT": "1"}
             result = await run_process(command, env=env)

kreuzberg-3.2.0/kreuzberg/_utils/_device.py ADDED Viewed

@@ -0,0 +1,373 @@
+"""Device detection and management utilities for GPU acceleration."""
+# ruff: noqa: BLE001
+from __future__ import annotations
+import warnings
+from dataclasses import dataclass
+from typing import Literal
+from kreuzberg.exceptions import ValidationError
+# Device selector values accepted by validate_device_request(); "auto" picks the preferred detected device.
+DeviceType = Literal["cpu", "cuda", "mps", "auto"]
+@dataclass(frozen=True)
+class DeviceInfo:
+    """Immutable description of one compute device.
+    Instances are built by the detection helpers in this module and returned from
+    validate_device_request(). Only ``device_type`` is required; every other field
+    defaults to None when the value is not known.
+    """
+    # Concrete device kind; unlike DeviceType this never holds "auto".
+    device_type: Literal["cpu", "cuda", "mps"]
+    """The type of device."""
+    # NOTE: _get_cuda_devices() sets this to the CUDA index for every CUDA device it reports.
+    device_id: int | None = None
+    """Device ID for multi-GPU systems. None for CPU or single GPU."""
+    memory_total: float | None = None
+    """Total memory in GB. None if unknown."""
+    memory_available: float | None = None
+    """Available memory in GB. None if unknown."""
+    name: str | None = None
+    """Human-readable device name."""
+def detect_available_devices() -> list[DeviceInfo]:
+    """Enumerate the compute devices usable on this machine.
+    Returns:
+        Detected accelerators first (CUDA devices, then MPS), followed by the CPU
+        entry, which is always present as the fallback.
+    """
+    accelerators: list[DeviceInfo] = []
+    # NVIDIA GPUs
+    if _is_cuda_available():
+        accelerators.extend(_get_cuda_devices())
+    # Apple Silicon GPU
+    if _is_mps_available():
+        mps = _get_mps_device()
+        if mps:
+            accelerators.append(mps)
+    # The CPU is unconditionally available and is listed after every accelerator.
+    return [*accelerators, DeviceInfo(device_type="cpu", name="CPU")]
+def get_optimal_device() -> DeviceInfo:
+    """Pick the preferred device for OCR processing.
+    Returns:
+        The first entry reported by detect_available_devices(), or a plain CPU
+        descriptor if detection reports nothing.
+    """
+    candidates = detect_available_devices()
+    if not candidates:
+        return DeviceInfo(device_type="cpu", name="CPU")
+    return candidates[0]
+def validate_device_request(
+    requested: DeviceType,
+    backend: str,
+    *,
+    memory_limit: float | None = None,
+    fallback_to_cpu: bool = True,
+) -> DeviceInfo:
+    """Turn a requested device type into a concrete detected device.
+    Args:
+        requested: The requested device type; "auto" selects the preferred device.
+        backend: Name of the OCR backend requesting the device (used in messages).
+        memory_limit: Optional memory limit in GB, checked against the chosen device.
+        fallback_to_cpu: Whether to return the CPU when the requested device is missing.
+    Returns:
+        A validated DeviceInfo object.
+    Raises:
+        ValidationError: If the requested device is not available and fallback is disabled.
+    """
+    detected = detect_available_devices()
+    if requested == "auto":
+        chosen = get_optimal_device()
+    else:
+        # First detected device of the requested type, if any.
+        chosen = next((d for d in detected if d.device_type == requested), None)
+        if chosen is None:
+            if fallback_to_cpu and requested != "cpu":
+                warnings.warn(
+                    f"Requested device '{requested}' not available for {backend}. Falling back to CPU.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                # The CPU fallback is returned as-is, without a memory-limit check.
+                for candidate in detected:
+                    if candidate.device_type == "cpu":
+                        return candidate
+            raise ValidationError(
+                f"Requested device '{requested}' is not available for {backend}",
+                context={
+                    "requested_device": requested,
+                    "backend": backend,
+                    "available_devices": [d.device_type for d in detected],
+                },
+            )
+    if memory_limit is not None:
+        _validate_memory_limit(chosen, memory_limit)
+    return chosen
+def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
+    """Look up memory figures for a device.
+    Args:
+        device: The device to query.
+    Returns:
+        Tuple of (total_memory_gb, available_memory_gb). Both are None for the CPU
+        and for any device type without a dedicated lookup.
+    """
+    kind = device.device_type
+    if kind == "cuda":
+        # A missing device_id is treated as device 0.
+        return _get_cuda_memory_info(device.device_id or 0)
+    if kind == "mps":
+        return _get_mps_memory_info()
+    return None, None
+def _is_cuda_available() -> bool:
+    """Report whether PyTorch sees a CUDA device; False when PyTorch is not installed."""
+    try:
+        import torch
+        cuda_ready = torch.cuda.is_available()
+    except ImportError:
+        cuda_ready = False
+    return cuda_ready
+def _is_mps_available() -> bool:
+    """Check if MPS (Apple Silicon) is available.
+    Returns:
+        True only when PyTorch is importable, exposes ``torch.backends.mps`` and
+        reports the backend as available; False otherwise.
+    """
+    try:
+        import torch
+        return torch.backends.mps.is_available()
+    except (ImportError, AttributeError):
+        # AttributeError: this PyTorch build has no ``torch.backends.mps`` at all.
+        # Treat that like a missing install, as cleanup_device_memory() already does.
+        return False
+def _get_cuda_devices() -> list[DeviceInfo]:
+    """Get information about available CUDA devices.
+    Returns:
+        One DeviceInfo per CUDA device, in device-index order. Empty when PyTorch
+        is not installed or reports no CUDA support.
+    """
+    devices: list[DeviceInfo] = []
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            return devices
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            total_memory = props.total_memory / (1024**3)  # Convert to GB
+            # Device ``i`` is queried explicitly. Detection must not call
+            # torch.cuda.set_device(i): that switches the process-wide current device
+            # and would leave the last GPU selected for whatever runs afterwards.
+            try:
+                allocated = torch.cuda.memory_allocated(i) / (1024**3)
+                available_memory = total_memory - allocated
+            except Exception:
+                # Fallback to total memory if we can't get allocation info
+                available_memory = total_memory
+            devices.append(
+                DeviceInfo(
+                    device_type="cuda",
+                    device_id=i,
+                    memory_total=total_memory,
+                    memory_available=available_memory,
+                    name=props.name,
+                )
+            )
+    except ImportError:
+        pass
+    return devices
+def _get_mps_device() -> DeviceInfo | None:
+    """Describe the MPS device, or return None when PyTorch or MPS is unavailable."""
+    try:
+        import torch
+        mps_ready = torch.backends.mps.is_available()
+    except ImportError:
+        return None
+    if not mps_ready:
+        return None
+    # No memory figures are filled in for MPS; only type and name are set.
+    return DeviceInfo(device_type="mps", name="Apple Silicon GPU (MPS)")
+def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
+    """Return (total_gb, available_gb) for one CUDA device, or (None, None) without CUDA/PyTorch."""
+    bytes_per_gb = 1024**3
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            return None, None
+        capacity_gb = torch.cuda.get_device_properties(device_id).total_memory / bytes_per_gb
+        try:
+            in_use_gb = torch.cuda.memory_allocated(device_id) / bytes_per_gb
+            free_gb = capacity_gb - in_use_gb
+        except Exception:
+            # Without allocation figures, report the full capacity as available.
+            free_gb = capacity_gb
+        return capacity_gb, free_gb
+    except ImportError:
+        return None, None
+def _get_mps_memory_info() -> tuple[float | None, float | None]:
+    """Get MPS memory information.
+    Returns:
+        Always ``(None, None)``: no total or available figure is reported for MPS.
+    """
+    # MPS doesn't provide detailed memory info through PyTorch
+    # We could potentially use system calls but that's platform-specific
+    return None, None
+def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
+    """Check a requested memory limit against what the device reports.
+    Args:
+        device: The device to validate.
+        memory_limit: Required memory in GB.
+    Raises:
+        ValidationError: If the limit exceeds the device's total memory. Exceeding
+            only the currently available memory produces a UserWarning instead.
+    """
+    # CPU memory validation is complex and OS-dependent, skip for now
+    if device.device_type == "cpu":
+        return
+    total_memory, available_memory = get_device_memory_info(device)
+    exceeds_capacity = total_memory is not None and memory_limit > total_memory
+    if exceeds_capacity:
+        raise ValidationError(
+            f"Requested memory limit ({memory_limit:.1f}GB) exceeds device capacity ({total_memory:.1f}GB)",
+            context={
+                "device": device.device_type,
+                "device_name": device.name,
+                "requested_memory": memory_limit,
+                "total_memory": total_memory,
+                "available_memory": available_memory,
+            },
+        )
+    exceeds_available = available_memory is not None and memory_limit > available_memory
+    if exceeds_available:
+        warnings.warn(
+            f"Requested memory limit ({memory_limit:.1f}GB) exceeds available memory "
+            f"({available_memory:.1f}GB) on {device.name or device.device_type}",
+            UserWarning,
+            stacklevel=3,
+        )
+def is_backend_gpu_compatible(backend: str) -> bool:
+    """Tell whether an OCR backend can use GPU acceleration.
+    Args:
+        backend: Name of the OCR backend (compared case-insensitively).
+    Returns:
+        True for EasyOCR and PaddleOCR, False for anything else (e.g. Tesseract).
+    """
+    normalized = backend.lower()
+    return normalized == "easyocr" or normalized == "paddleocr"
+def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0) -> int:
+    """Get recommended batch size for OCR processing.
+    Args:
+        device: The device to optimize for.
+        input_size_mb: Estimated input size per item in MB.
+    Returns:
+        Recommended batch size: 1 on CPU, 4 when the device's memory is unknown,
+        otherwise an estimate between 1 and 32. A non-positive ``input_size_mb``
+        yields 1.
+    """
+    if device.device_type == "cpu":
+        # Conservative batch size for CPU
+        return 1
+    # For GPU devices, estimate based on available memory
+    _, available_memory = get_device_memory_info(device)
+    if available_memory is None:
+        # Conservative default for unknown memory
+        return 4
+    if input_size_mb <= 0:
+        # No per-item estimate is possible, and zero would divide by zero below.
+        # Stay at the minimum, which is what negative sizes already produced.
+        return 1
+    # Reserve some memory for model and intermediate calculations:
+    # use approximately 50% of available memory for batching
+    usable_memory_mb = available_memory * 0.5 * 1024
+    # Conservative estimate: budget four times the stated input size per item
+    estimated_batch_size = max(1, int(usable_memory_mb / (input_size_mb * 4)))
+    # Cap at reasonable limits
+    return min(estimated_batch_size, 32)
+def cleanup_device_memory(device: DeviceInfo) -> None:
+    """Release cached accelerator memory for a device; a no-op for anything but CUDA and MPS.
+    Args:
+        device: The device to clean up.
+    """
+    kind = device.device_type
+    if kind == "cuda":
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except ImportError:
+            pass
+        return
+    if kind == "mps":
+        try:
+            import torch
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+        except (ImportError, AttributeError):
+            # AttributeError: this PyTorch build lacks the MPS attributes used above.
+            pass

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.1.7
+Version: 3.2.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -27,8 +27,8 @@ License-File: LICENSE
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.2
 Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
-Requires-Dist: html-to-markdown>=1.3.3
-Requires-Dist: playa-pdf>=0.5.1
+Requires-Dist: html-to-markdown>=1.4.0
+Requires-Dist: playa-pdf>=0.6.1
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
 Requires-Dist: python-pptx>=1.0.2
@@ -36,7 +36,7 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < "3.12"
 Provides-Extra: all
 Requires-Dist: easyocr>=1.7.2; extra == "all"
 Requires-Dist: gmft>=0.4.1; extra == "all"
-Requires-Dist: paddleocr>=3.0.1; extra == "all"
+Requires-Dist: paddleocr>=3.0.2; extra == "all"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == "all"
 Requires-Dist: setuptools>=80.9.0; extra == "all"
@@ -47,7 +47,7 @@ Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.1; extra == "gmft"
 Provides-Extra: paddleocr
-Requires-Dist: paddleocr>=3.0.1; extra == "paddleocr"
+Requires-Dist: paddleocr>=3.0.2; extra == "paddleocr"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
 Requires-Dist: setuptools>=80.9.0; extra == "paddleocr"
 Dynamic: license-file
@@ -157,17 +157,9 @@ Kreuzberg supports multiple OCR engines:
 For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
-## Contribution
+## Contributing
-This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
-### Local Development
-- Clone the repo
-- Install the system dependencies
-- Install the full dependencies with `uv sync`
-- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
-- Make your changes and submit a PR
+We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
 ## License

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg.egg-info/SOURCES.txt RENAMED Viewed

@@ -31,6 +31,7 @@ kreuzberg/_ocr/_easyocr.py
 kreuzberg/_ocr/_paddleocr.py
 kreuzberg/_ocr/_tesseract.py
 kreuzberg/_utils/__init__.py
+kreuzberg/_utils/_device.py
 kreuzberg/_utils/_string.py
 kreuzberg/_utils/_sync.py
 kreuzberg/_utils/_tmp.py

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/kreuzberg.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,7 @@
 anyio>=4.9.0
 charset-normalizer>=3.4.2
-html-to-markdown>=1.3.3
-playa-pdf>=0.5.1
+html-to-markdown>=1.4.0
+playa-pdf>=0.6.1
 pypdfium2==4.30.0
 python-calamine>=0.3.2
 python-pptx>=1.0.2
@@ -15,7 +15,7 @@ typing-extensions>=4.14.0
 [all]
 easyocr>=1.7.2
 gmft>=0.4.1
-paddleocr>=3.0.1
+paddleocr>=3.0.2
 paddlepaddle>=3.0.0
 semantic-text-splitter>=0.27.0
 setuptools>=80.9.0
@@ -30,6 +30,6 @@ easyocr>=1.7.2
 gmft>=0.4.1
 [paddleocr]
-paddleocr>=3.0.1
+paddleocr>=3.0.2
 paddlepaddle>=3.0.0
 setuptools>=80.9.0

{kreuzberg-3.1.7 → kreuzberg-3.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "kreuzberg"
-version = "3.1.7"
+version = "3.2.0"
 description = "A text extraction library supporting PDFs, images, office documents and more"
 readme = "README.md"
 keywords = [
@@ -40,8 +40,8 @@ dependencies = [
   "anyio>=4.9.0",
   "charset-normalizer>=3.4.2",
   "exceptiongroup>=1.2.2; python_version<'3.11'",
-  "html-to-markdown>=1.3.3",
-  "playa-pdf>=0.5.1",                                 # pinned due to breaking changes in 0.5.0
+  "html-to-markdown>=1.4.0",
+  "playa-pdf>=0.6.1",                                 # pinned due to breaking changes in 0.5.0
   "pypdfium2==4.30.0",                                # pinned due to bug in 4.30.1, until v5 is stable
   "python-calamine>=0.3.2",
   "python-pptx>=1.0.2",
@@ -54,7 +54,7 @@ optional-dependencies.all = [
   # gmft
   "gmft>=0.4.1",
   # paddle
-  "paddleocr>=3.0.1",
+  "paddleocr>=3.0.2",
   "paddlepaddle>=3.0.0",
   # chunking
   "semantic-text-splitter>=0.27.0",
@@ -70,7 +70,7 @@ optional-dependencies.gmft = [
   "gmft>=0.4.1",
 ]
 optional-dependencies.paddleocr = [
-  "paddleocr>=3.0.1",
+  "paddleocr>=3.0.2",
   "paddlepaddle>=3.0.0",
   "setuptools>=80.9.0",
 ]
@@ -79,13 +79,13 @@ urls.homepage = "https://github.com/Goldziher/kreuzberg"
 [dependency-groups]
 dev = [
   "covdefaults>=2.3.0",
-  "mypy>=1.16.0",
+  "mypy>=1.16.1",
   "pre-commit>=4.2.0",
-  "pytest>=8.4.0",
-  "pytest-cov>=6.1.1",
+  "pytest>=8.4.1",
+  "pytest-cov>=6.2.1",
   "pytest-mock>=3.14.0",
   "pytest-timeout>=2.4.0",
-  "ruff>=0.11.13",
+  "ruff>=0.12.0",
   "trio>=0.30.0",
   "uv-bump",
 ]
@@ -108,19 +108,20 @@ format.docstring-code-line-length = 120
 format.docstring-code-format = true
 lint.select = [ "ALL" ]
 lint.ignore = [
-  "ANN401", # Dynamically typed ANY for kwargs
-  "COM812", # Conflicts with formatter
-  "D100",   # Missing docstring in public module
-  "D104",   # Missing docstring in public package
-  "D107",   # Missing docstring in __init__
-  "D205",   # 1 blank line required between summary line and description
-  "E501",   # Line too long, handled by ruff format
-  "EM",     # Exception messages,
-  "FBT",    # Boolean-typed positional argument in function definition
-  "FIX",    # We allow todo and fixme comments
-  "ISC001", # Conflicts with formatter
-  "TD",     # We allow todo and fixme comments
-  "TRY",    # Try except block, rules are too strict
+  "ANN401",  # Dynamically typed ANY for kwargs
+  "COM812",  # Conflicts with formatter
+  "D100",    # Missing docstring in public module
+  "D104",    # Missing docstring in public package
+  "D107",    # Missing docstring in __init__
+  "D205",    # 1 blank line required between summary line and description
+  "E501",    # Line too long, handled by ruff format
+  "EM",      # Exception messages,
+  "FBT",     # Boolean-typed positional argument in function definition
+  "FIX",     # We allow todo and fixme comments
+  "ISC001",  # Conflicts with formatter
+  "PLC0415", # Import should be at top-level (we use conditional imports)
+  "TD",      # We allow todo and fixme comments
+  "TRY",     # Try except block, rules are too strict
 ]
 lint.per-file-ignores."tests/**/*.*" = [
   "ARG001",
@@ -128,6 +129,7 @@ lint.per-file-ignores."tests/**/*.*" = [
   "N815",
   "PD",
   "PGH003",
+  "PLC",     # Disable all PLC rules for tests
   "PLR0915",
   "PLR2004",
   "PT006",