kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

Files changed (37)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_cli_config.py +175 -0
  4. kreuzberg/_extractors/_image.py +39 -4
  5. kreuzberg/_extractors/_pandoc.py +158 -18
  6. kreuzberg/_extractors/_pdf.py +199 -19
  7. kreuzberg/_extractors/_presentation.py +1 -1
  8. kreuzberg/_extractors/_spread_sheet.py +65 -7
  9. kreuzberg/_gmft.py +222 -16
  10. kreuzberg/_mime_types.py +62 -16
  11. kreuzberg/_multiprocessing/__init__.py +6 -0
  12. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  13. kreuzberg/_multiprocessing/process_manager.py +188 -0
  14. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  15. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  16. kreuzberg/_ocr/_easyocr.py +66 -10
  17. kreuzberg/_ocr/_paddleocr.py +86 -7
  18. kreuzberg/_ocr/_tesseract.py +136 -46
  19. kreuzberg/_playa.py +43 -0
  20. kreuzberg/_utils/_cache.py +372 -0
  21. kreuzberg/_utils/_device.py +356 -0
  22. kreuzberg/_utils/_document_cache.py +220 -0
  23. kreuzberg/_utils/_errors.py +232 -0
  24. kreuzberg/_utils/_pdf_lock.py +72 -0
  25. kreuzberg/_utils/_process_pool.py +100 -0
  26. kreuzberg/_utils/_serialization.py +82 -0
  27. kreuzberg/_utils/_string.py +1 -1
  28. kreuzberg/_utils/_sync.py +21 -0
  29. kreuzberg/cli.py +338 -0
  30. kreuzberg/extraction.py +247 -36
  31. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
  32. kreuzberg-3.3.0.dist-info/RECORD +48 -0
  33. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
  34. kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
  35. kreuzberg-3.1.7.dist-info/RECORD +0 -33
  36. kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
  37. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_device.py
@@ -0,0 +1,356 @@
+ """Device detection and management utilities for GPU acceleration."""
+ # ruff: noqa: BLE001 # ~keep
+
+ from __future__ import annotations
+
+ import warnings
+ from dataclasses import dataclass
+ from typing import Literal
+
+ from kreuzberg.exceptions import ValidationError
+
+ DeviceType = Literal["cpu", "cuda", "mps", "auto"]
+
+
+ @dataclass(frozen=True)
+ class DeviceInfo:
+     """Information about a compute device."""
+
+     device_type: Literal["cpu", "cuda", "mps"]
+     """The type of device."""
+     device_id: int | None = None
+     """Device ID for multi-GPU systems. None for CPU or single GPU."""
+     memory_total: float | None = None
+     """Total memory in GB. None if unknown."""
+     memory_available: float | None = None
+     """Available memory in GB. None if unknown."""
+     name: str | None = None
+     """Human-readable device name."""
+
+
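DeviceInfo is a frozen dataclass, so detection results are immutable once built. A minimal sketch of a populated entry, with made-up values for a hypothetical 8 GB CUDA card:

from kreuzberg._utils._device import DeviceInfo

# All field values below are illustrative, not real detection output.
gpu = DeviceInfo(
    device_type="cuda",
    device_id=0,
    memory_total=8.0,      # GB
    memory_available=6.5,  # GB
    name="Example GPU",
)
# frozen=True: assigning to gpu.device_id raises dataclasses.FrozenInstanceError.
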
+ def detect_available_devices() -> list[DeviceInfo]:
+     """Detect all available compute devices.
+
+     Returns:
+         List of available devices, with the most preferred device first.
+     """
+     devices: list[DeviceInfo] = []
+
+     devices.append(
+         DeviceInfo(
+             device_type="cpu",
+             name="CPU",
+         )
+     )
+
+     if _is_cuda_available():
+         cuda_devices = _get_cuda_devices()
+         devices.extend(cuda_devices)
+
+     if _is_mps_available():
+         mps_device = _get_mps_device()
+         if mps_device:
+             devices.append(mps_device)
+
+     gpu_devices = [d for d in devices if d.device_type != "cpu"]
+     cpu_devices = [d for d in devices if d.device_type == "cpu"]
+
+     return gpu_devices + cpu_devices
+
+
+ def get_optimal_device() -> DeviceInfo:
+     """Get the optimal device for OCR processing.
+
+     Returns:
+         The best available device, preferring GPU over CPU.
+     """
+     devices = detect_available_devices()
+     return devices[0] if devices else DeviceInfo(device_type="cpu", name="CPU")
+
+
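A short usage sketch for the two public detection helpers; the GPU-first ordering falls out of the gpu_devices + cpu_devices concatenation above:

from kreuzberg._utils._device import detect_available_devices, get_optimal_device

devices = detect_available_devices()
for device in devices:
    print(device.device_type, device.name)  # any "cuda"/"mps" entries first, "cpu CPU" last

best = get_optimal_device()  # same as devices[0]; the CPU entry guarantees a non-empty list
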
+ def validate_device_request(
+     requested: DeviceType,
+     backend: str,
+     *,
+     memory_limit: float | None = None,
+     fallback_to_cpu: bool = True,
+ ) -> DeviceInfo:
+     """Validate and resolve a device request.
+
+     Args:
+         requested: The requested device type.
+         backend: Name of the OCR backend requesting the device.
+         memory_limit: Optional memory limit in GB.
+         fallback_to_cpu: Whether to fall back to CPU if the requested device is unavailable.
+
+     Returns:
+         A validated DeviceInfo object.
+
+     Raises:
+         ValidationError: If the requested device is not available and fallback is disabled.
+     """
+     available_devices = detect_available_devices()
+
+     if requested == "auto":
+         device = get_optimal_device()
+         if memory_limit is not None:
+             _validate_memory_limit(device, memory_limit)
+         return device
+
+     matching_devices = [d for d in available_devices if d.device_type == requested]
+
+     if not matching_devices:
+         if fallback_to_cpu and requested != "cpu":
+             warnings.warn(
+                 f"Requested device '{requested}' not available for {backend}. Falling back to CPU.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+             cpu_device = next((d for d in available_devices if d.device_type == "cpu"), None)
+             if cpu_device:
+                 return cpu_device
+
+         raise ValidationError(
+             f"Requested device '{requested}' is not available for {backend}",
+             context={
+                 "requested_device": requested,
+                 "backend": backend,
+                 "available_devices": [d.device_type for d in available_devices],
+             },
+         )
+
+     device = matching_devices[0]
+
+     if memory_limit is not None:
+         _validate_memory_limit(device, memory_limit)
+
+     return device
+
+
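How the three resolution paths look from the caller's side, sketched for a hypothetical CPU-only host (on a machine with working CUDA the second call would simply return the CUDA device):

from kreuzberg._utils._device import validate_device_request
from kreuzberg.exceptions import ValidationError

device = validate_device_request("auto", "easyocr")  # resolves to the optimal device

# Default fallback_to_cpu=True: emits a UserWarning and returns the CPU device.
device = validate_device_request("cuda", "easyocr")

# Fallback disabled: the same request raises instead.
try:
    validate_device_request("cuda", "easyocr", fallback_to_cpu=False)
except ValidationError:
    ...  # the error context lists the devices that were actually available
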
+ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
+     """Get memory information for a device.
+
+     Args:
+         device: The device to query.
+
+     Returns:
+         Tuple of (total_memory_gb, available_memory_gb). None values if unknown.
+     """
+     if device.device_type == "cpu":
+         return None, None
+
+     if device.device_type == "cuda":
+         return _get_cuda_memory_info(device.device_id or 0)
+
+     if device.device_type == "mps":
+         return _get_mps_memory_info()
+
+     return None, None
+
+
+ def _is_cuda_available() -> bool:
+     """Check if CUDA is available."""
+     try:
+         import torch  # type: ignore[import-not-found,unused-ignore]
+
+         return torch.cuda.is_available()
+     except ImportError:
+         return False
+
+
+ def _is_mps_available() -> bool:
+     """Check if MPS (Apple Silicon) is available."""
+     try:
+         import torch  # type: ignore[import-not-found,unused-ignore]
+
+         return torch.backends.mps.is_available()
+     except ImportError:
+         return False
+
+
+ def _get_cuda_devices() -> list[DeviceInfo]:
+     """Get information about available CUDA devices."""
+     devices: list[DeviceInfo] = []
+
+     try:
+         import torch
+
+         if not torch.cuda.is_available():
+             return devices
+
+         for i in range(torch.cuda.device_count()):
+             props = torch.cuda.get_device_properties(i)
+             total_memory = props.total_memory / (1024**3)
+
+             try:
+                 allocated = torch.cuda.memory_allocated(i) / (1024**3)
+                 available_memory = total_memory - allocated
+             except Exception:
+                 available_memory = total_memory
+
+             devices.append(
+                 DeviceInfo(
+                     device_type="cuda",
+                     device_id=i,
+                     memory_total=total_memory,
+                     memory_available=available_memory,
+                     name=props.name,
+                 )
+             )
+
+     except ImportError:
+         pass
+
+     return devices
+
+
+ def _get_mps_device() -> DeviceInfo | None:
+     """Get information about the MPS device."""
+     try:
+         import torch
+
+         if not torch.backends.mps.is_available():
+             return None
+
+         return DeviceInfo(
+             device_type="mps",
+             name="Apple Silicon GPU (MPS)",
+         )
+
+     except ImportError:
+         return None
+
+
+ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
+     """Get CUDA memory information for a specific device."""
+     try:
+         import torch
+
+         if not torch.cuda.is_available():
+             return None, None
+
+         props = torch.cuda.get_device_properties(device_id)
+         total_memory = props.total_memory / (1024**3)
+
+         try:
+             allocated = torch.cuda.memory_allocated(device_id) / (1024**3)
+             available_memory = total_memory - allocated
+         except Exception:
+             available_memory = total_memory
+
+         return total_memory, available_memory
+
+     except ImportError:
+         return None, None
+
+
+ def _get_mps_memory_info() -> tuple[float | None, float | None]:
+     """Get MPS memory information."""
+     return None, None
+
+
+ def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
+     """Validate that a device has enough memory for the requested limit.
+
+     Args:
+         device: The device to validate.
+         memory_limit: Required memory in GB.
+
+     Raises:
+         ValidationError: If the device doesn't have enough memory.
+     """
+     if device.device_type == "cpu":
+         # CPU memory validation is complex and OS-dependent, skip for now # ~keep
+         return
+
+     total_memory, available_memory = get_device_memory_info(device)
+
+     if total_memory is not None and memory_limit > total_memory:
+         raise ValidationError(
+             f"Requested memory limit ({memory_limit:.1f}GB) exceeds device capacity ({total_memory:.1f}GB)",
+             context={
+                 "device": device.device_type,
+                 "device_name": device.name,
+                 "requested_memory": memory_limit,
+                 "total_memory": total_memory,
+                 "available_memory": available_memory,
+             },
+         )
+
+     if available_memory is not None and memory_limit > available_memory:
+         warnings.warn(
+             f"Requested memory limit ({memory_limit:.1f}GB) exceeds available memory "
+             f"({available_memory:.1f}GB) on {device.name or device.device_type}",
+             UserWarning,
+             stacklevel=3,
+         )
+
+
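The memory check is two-tiered: a limit above the device's total capacity raises, while a limit that only exceeds currently available memory warns (stacklevel=3 points the warning at validate_device_request's caller). Sketched against a hypothetical 8 GB card with about 6.5 GB free:

validate_device_request("cuda", "easyocr", memory_limit=4.0)   # passes silently
validate_device_request("cuda", "easyocr", memory_limit=7.0)   # UserWarning: exceeds available memory
validate_device_request("cuda", "easyocr", memory_limit=16.0)  # ValidationError: exceeds capacity
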
+ def is_backend_gpu_compatible(backend: str) -> bool:
+     """Check if an OCR backend supports GPU acceleration.
+
+     Args:
+         backend: Name of the OCR backend.
+
+     Returns:
+         True if the backend supports GPU acceleration.
+     """
+     # EasyOCR and PaddleOCR support GPU, Tesseract does not # ~keep
+     return backend.lower() in ("easyocr", "paddleocr")
+
+
+ def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0) -> int:
+     """Get recommended batch size for OCR processing.
+
+     Args:
+         device: The device to optimize for.
+         input_size_mb: Estimated input size per item in MB.
+
+     Returns:
+         Recommended batch size.
+     """
+     if device.device_type == "cpu":
+         # Conservative batch size for CPU # ~keep
+         return 1
+
+     _, available_memory = get_device_memory_info(device)
+
+     if available_memory is None:
+         return 4
+
+     # Use approximately 50% of available memory for batching # ~keep
+     usable_memory_gb = available_memory * 0.5
+     usable_memory_mb = usable_memory_gb * 1024
+
+     # Estimate batch size (conservative) # ~keep
+     estimated_batch_size = max(1, int(usable_memory_mb / (input_size_mb * 4)))
+
+     # Cap at reasonable limits # ~keep
+     return min(estimated_batch_size, 32)
+
+
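The heuristic in numbers, assuming a GPU reporting 8 GB available and the default 10 MB per item (the 4x divisor is the function's conservative safety factor):

available_memory = 8.0                                  # GB, hypothetical
usable_memory_mb = available_memory * 0.5 * 1024        # 4096 MB usable for batching
estimated = max(1, int(usable_memory_mb / (10.0 * 4)))  # int(102.4) -> 102
batch_size = min(estimated, 32)                         # the cap wins: 32
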
+ def cleanup_device_memory(device: DeviceInfo) -> None:
+     """Clean up device memory.
+
+     Args:
+         device: The device to clean up.
+     """
+     if device.device_type == "cuda":
+         try:
+             import torch
+
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+         except ImportError:
+             pass
+
+     elif device.device_type == "mps":
+         try:
+             import torch
+
+             if torch.backends.mps.is_available():
+                 torch.mps.empty_cache()
+         except (ImportError, AttributeError):
+             pass
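Taken together, a sketch of how an OCR caller might wire these utilities up; run_ocr_batch is a hypothetical stand-in for real backend work:

from kreuzberg._utils._device import (
    cleanup_device_memory,
    get_recommended_batch_size,
    is_backend_gpu_compatible,
    validate_device_request,
)

backend = "easyocr"
requested = "auto" if is_backend_gpu_compatible(backend) else "cpu"
device = validate_device_request(requested, backend, memory_limit=4.0)
batch_size = get_recommended_batch_size(device, input_size_mb=10.0)
try:
    ...  # run_ocr_batch(images, device, batch_size) -- hypothetical
finally:
    cleanup_device_memory(device)
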
kreuzberg/_utils/_document_cache.py
@@ -0,0 +1,220 @@
+ """Document-level caching to prevent pypdfium2 issues with duplicate processing."""
+
+ from __future__ import annotations
+
+ import hashlib
+ import threading
+ import time
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import ExtractionConfig, ExtractionResult
+
+
+ class DocumentCache:
+     """Session-scoped cache for document extraction results.
+
+     Ensures each unique document is processed only once per session,
+     preventing pypdfium2 state corruption issues with repeated processing.
+     """
+
+     def __init__(self) -> None:
+         """Initialize document cache."""
+         self._cache: dict[str, ExtractionResult] = {}
+         self._processing: dict[str, threading.Event] = {}
+         self._lock = threading.Lock()
+
+         self._file_metadata: dict[str, dict[str, Any]] = {}
+
+     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
+         """Generate cache key for a file and config combination.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+
+         Returns:
+             Unique cache key string
+         """
+         path = Path(file_path).resolve()
+
+         try:
+             stat = path.stat()
+             file_info = {
+                 "path": str(path),
+                 "size": stat.st_size,
+                 "mtime": stat.st_mtime,
+             }
+         except OSError:
+             file_info = {"path": str(path), "size": 0, "mtime": 0}
+
+         config_info = {}
+         if config:
+             config_info = {
+                 "force_ocr": config.force_ocr,
+                 "ocr_backend": config.ocr_backend,
+                 "extract_tables": config.extract_tables,
+                 "chunk_content": config.chunk_content,
+                 "max_chars": config.max_chars,
+                 "max_overlap": config.max_overlap,
+             }
+
+         cache_data = {**file_info, **config_info}
+         cache_str = str(sorted(cache_data.items()))
+
+         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
+
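The key is reproducible outside the class, which makes invalidation easy to reason about: any change to the file's size or mtime, or to one of the six config fields, yields a different key. A standalone sketch with made-up stat and config values:

import hashlib

file_info = {"path": "/tmp/report.pdf", "size": 1024, "mtime": 1700000000.0}
config_info = {"force_ocr": False, "ocr_backend": "tesseract", "extract_tables": False,
               "chunk_content": False, "max_chars": 2000, "max_overlap": 100}
cache_str = str(sorted({**file_info, **config_info}.items()))
key = hashlib.sha256(cache_str.encode()).hexdigest()[:16]  # 16 hex chars = 64 bits
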
+     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
+         """Check if cached result is still valid.
+
+         Args:
+             cache_key: The cache key to validate
+             file_path: Path to the file
+
+         Returns:
+             True if cache is valid, False if invalidated
+         """
+         if cache_key not in self._file_metadata:
+             return False
+
+         path = Path(file_path)
+         try:
+             current_stat = path.stat()
+             cached_metadata = self._file_metadata[cache_key]
+
+             return bool(
+                 cached_metadata["size"] == current_stat.st_size and cached_metadata["mtime"] == current_stat.st_mtime
+             )
+         except OSError:
+             return False
+
+     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
+         """Get cached extraction result if available and valid.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+
+         Returns:
+             Cached result if available, None otherwise
+         """
+         cache_key = self._get_cache_key(file_path, config)
+
+         with self._lock:
+             if cache_key in self._cache:
+                 if self._is_cache_valid(cache_key, file_path):
+                     return self._cache[cache_key]
+
+                 self._cache.pop(cache_key, None)
+                 self._file_metadata.pop(cache_key, None)
+
+         return None
+
+     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
+         """Cache extraction result.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+             result: Extraction result to cache
+         """
+         cache_key = self._get_cache_key(file_path, config)
+         path = Path(file_path)
+
+         try:
+             stat = path.stat()
+             file_metadata = {
+                 "size": stat.st_size,
+                 "mtime": stat.st_mtime,
+                 "cached_at": time.time(),
+             }
+         except OSError:
+             file_metadata = {
+                 "size": 0,
+                 "mtime": 0,
+                 "cached_at": time.time(),
+             }
+
+         with self._lock:
+             self._cache[cache_key] = result
+             self._file_metadata[cache_key] = file_metadata
+
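The intended read-through pattern, with a hypothetical extract_file call standing in for the real extraction pipeline:

from kreuzberg._utils._document_cache import DocumentCache

cache = DocumentCache()
config = None  # or an ExtractionConfig; it participates in the cache key

result = cache.get("report.pdf", config)         # None on a cold cache
if result is None:
    result = extract_file("report.pdf", config)  # hypothetical extraction call
    cache.set("report.pdf", config, result)
# Later gets return the cached result until the file's size or mtime changes,
# at which point get() evicts the stale entry and returns None again.
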
+     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
+         """Check if file is currently being processed.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+
+         Returns:
+             True if file is currently being processed
+         """
+         cache_key = self._get_cache_key(file_path, config)
+         with self._lock:
+             return cache_key in self._processing
+
+     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
+         """Mark file as being processed and return event to wait on.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+
+         Returns:
+             Event that will be set when processing completes
+         """
+         cache_key = self._get_cache_key(file_path, config)
+
+         with self._lock:
+             if cache_key not in self._processing:
+                 self._processing[cache_key] = threading.Event()
+             return self._processing[cache_key]
+
+     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
+         """Mark file processing as complete.
+
+         Args:
+             file_path: Path to the file
+             config: Extraction configuration
+         """
+         cache_key = self._get_cache_key(file_path, config)
+
+         with self._lock:
+             if cache_key in self._processing:
+                 event = self._processing.pop(cache_key)
+                 event.set()
+
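These three methods are the building blocks for deduplicating concurrent work on one document. A simplified sketch of the intended pattern (do_extract is hypothetical, and the is_processing/mark_processing pair is not atomic, so treat this as illustrative rather than a watertight recipe):

# cache, path, and config as in the previous sketch
if cache.is_processing(path, config):
    cache.mark_processing(path, config).wait()  # block until the other worker finishes
    result = cache.get(path, config)            # normally populated by that worker
else:
    cache.mark_processing(path, config)
    try:
        result = do_extract(path, config)       # hypothetical
        cache.set(path, config, result)
    finally:
        cache.mark_complete(path, config)       # sets the Event and wakes waiters
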
+     def clear(self) -> None:
+         """Clear all cached results."""
+         with self._lock:
+             self._cache.clear()
+             self._file_metadata.clear()
+
+     def get_stats(self) -> dict[str, Any]:
+         """Get cache statistics.
+
+         Returns:
+             Dictionary with cache statistics
+         """
+         with self._lock:
+             return {
+                 "cached_documents": len(self._cache),
+                 "processing_documents": len(self._processing),
+                 "total_cache_size_mb": sum(len(result.content.encode("utf-8")) for result in self._cache.values())
+                 / 1024
+                 / 1024,
+             }
+
+
+ _document_cache = DocumentCache()
+
+
+ def get_document_cache() -> DocumentCache:
+     """Get the global document cache instance."""
+     return _document_cache
+
+
+ def clear_document_cache() -> None:
+     """Clear the global document cache."""
+     _document_cache.clear()
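The module-level helpers expose one shared instance per process:

from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache

cache = get_document_cache()  # same DocumentCache on every call
print(cache.get_stats())      # cached_documents / processing_documents / total_cache_size_mb
clear_document_cache()        # e.g. between test runs or after a bulk job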