kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,372 @@
|
|
1
|
+
"""General-purpose file-based caching layer for Kreuzberg."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import hashlib
|
6
|
+
import os
|
7
|
+
import threading
|
8
|
+
import time
|
9
|
+
from contextlib import suppress
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import Any, Generic, TypeVar
|
12
|
+
|
13
|
+
from anyio import Path as AsyncPath
|
14
|
+
|
15
|
+
from kreuzberg._types import ExtractionResult
|
16
|
+
from kreuzberg._utils._serialization import deserialize, serialize
|
17
|
+
from kreuzberg._utils._sync import run_sync
|
18
|
+
|
19
|
+
T = TypeVar("T")


class KreuzbergCache(Generic[T]):
    """File-based cache for Kreuzberg operations.

    Provides both sync and async interfaces for caching extraction results,
    OCR results, table data, and other expensive operations to disk. Entries
    are stored as msgpack files under ``cache_dir`` and evicted by age and by
    total size.
    """

    def __init__(
        self,
        cache_type: str,
        cache_dir: Path | str | None = None,
        max_cache_size_mb: float = 500.0,
        max_age_days: int = 30,
    ) -> None:
        """Initialize cache.

        Args:
            cache_type: Type of cache (e.g., 'ocr', 'tables', 'documents', 'mime')
            cache_dir: Cache directory (defaults to .kreuzberg/{cache_type} in cwd)
            max_cache_size_mb: Maximum cache size in MB (default: 500MB)
            max_age_days: Maximum age of cached results in days (default: 30 days)
        """
        if cache_dir is None:
            cache_dir = Path.cwd() / ".kreuzberg" / cache_type

        self.cache_dir = Path(cache_dir)
        self.cache_type = cache_type
        self.max_cache_size_mb = max_cache_size_mb
        self.max_age_days = max_age_days

        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # In-memory tracking of processing state (session-scoped) # ~keep
        self._processing: dict[str, threading.Event] = {}
        self._lock = threading.Lock()

    def _get_cache_key(self, **kwargs: Any) -> str:
        """Generate cache key from kwargs.

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Unique cache key string (16 hex chars of a SHA-256 digest)
        """
        # Sort for consistent hashing # ~keep
        cache_str = str(sorted(kwargs.items()))
        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]

    def _get_cache_path(self, cache_key: str) -> Path:
        """Get cache file path for key."""
        return self.cache_dir / f"{cache_key}.msgpack"

    def _is_cache_valid(self, cache_path: Path) -> bool:
        """Check if cached result exists and is younger than ``max_age_days``."""
        try:
            if not cache_path.exists():
                return False

            mtime = cache_path.stat().st_mtime
            age_days = (time.time() - mtime) / (24 * 3600)

            return age_days <= self.max_age_days
        except OSError:
            return False

    def _serialize_result(self, result: T) -> dict[str, Any]:
        """Serialize result for caching with metadata (type name + timestamp)."""
        return {"type": type(result).__name__, "data": result, "cached_at": time.time()}

    def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
        """Deserialize cached result, rehydrating ExtractionResult payloads."""
        data = cached_data["data"]

        if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
            from kreuzberg._types import ExtractionResult

            return ExtractionResult(**data)  # type: ignore[return-value]

        return data  # type: ignore[no-any-return]

    def _cleanup_cache(self) -> None:
        """Clean up old and oversized cache entries.

        Best-effort: all filesystem races (files vanishing between glob and
        stat/unlink) are tolerated and never propagate to the caller.
        """
        try:
            cache_files = list(self.cache_dir.glob("*.msgpack"))

            # First pass: drop entries older than max_age_days.
            cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
            for cache_file in cache_files[:]:
                try:
                    if cache_file.stat().st_mtime < cutoff_time:
                        cache_file.unlink(missing_ok=True)
                        cache_files.remove(cache_file)
                except OSError:  # noqa: PERF203
                    continue

            total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
                1024 * 1024
            )

            # Second pass: if still over budget, evict oldest-first until we
            # are below 80% of the size limit.
            if total_size > self.max_cache_size_mb:
                cache_files.sort(key=lambda f: f.stat().st_mtime if f.exists() else 0)

                for cache_file in cache_files:
                    try:
                        size_mb = cache_file.stat().st_size / (1024 * 1024)
                        cache_file.unlink(missing_ok=True)
                        total_size -= size_mb

                        if total_size <= self.max_cache_size_mb * 0.8:
                            break
                    except OSError:
                        continue
        except (OSError, ValueError, TypeError):
            pass

    def _should_cleanup(self, cache_key: str) -> bool:
        """Return True for ~1% of keys, deterministically.

        Uses the hex cache key itself rather than ``hash()`` so the sampling
        is stable across processes (str hashing is salted by PYTHONHASHSEED).
        """
        return int(cache_key, 16) % 100 == 0

    def get(self, **kwargs: Any) -> T | None:
        """Get cached result (sync).

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Cached result if available, None otherwise
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = self._get_cache_path(cache_key)

        if not self._is_cache_valid(cache_path):
            return None

        try:
            content = cache_path.read_bytes()
            cached_data = deserialize(content, dict)
            return self._deserialize_result(cached_data)
        except (OSError, ValueError, KeyError):
            # Corrupt or unreadable entry: discard it and treat as a miss.
            with suppress(OSError):
                cache_path.unlink(missing_ok=True)
            return None

    def set(self, result: T, **kwargs: Any) -> None:
        """Cache result (sync). Failures to write are silently ignored.

        Args:
            result: Result to cache
            **kwargs: Key-value pairs to generate cache key from
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = self._get_cache_path(cache_key)

        try:
            serialized = self._serialize_result(result)
            content = serialize(serialized)
            cache_path.write_bytes(content)

            # Amortized maintenance: run cleanup on ~1% of writes.
            if self._should_cleanup(cache_key):
                self._cleanup_cache()
        except (OSError, TypeError, ValueError):
            pass

    async def aget(self, **kwargs: Any) -> T | None:
        """Get cached result (async).

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Cached result if available, None otherwise
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = AsyncPath(self._get_cache_path(cache_key))

        if not await run_sync(self._is_cache_valid, Path(cache_path)):
            return None

        try:
            content = await cache_path.read_bytes()
            cached_data = deserialize(content, dict)
            return self._deserialize_result(cached_data)
        except (OSError, ValueError, KeyError):
            # Corrupt or unreadable entry: discard it and treat as a miss.
            with suppress(Exception):
                await cache_path.unlink(missing_ok=True)
            return None

    async def aset(self, result: T, **kwargs: Any) -> None:
        """Cache result (async). Failures to write are silently ignored.

        Args:
            result: Result to cache
            **kwargs: Key-value pairs to generate cache key from
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = AsyncPath(self._get_cache_path(cache_key))

        try:
            serialized = self._serialize_result(result)
            content = serialize(serialized)
            await cache_path.write_bytes(content)

            # Amortized maintenance: run cleanup on ~1% of writes.
            if self._should_cleanup(cache_key):
                await run_sync(self._cleanup_cache)
        except (OSError, TypeError, ValueError):
            pass

    def is_processing(self, **kwargs: Any) -> bool:
        """Check if operation is currently being processed."""
        cache_key = self._get_cache_key(**kwargs)
        with self._lock:
            return cache_key in self._processing

    def mark_processing(self, **kwargs: Any) -> threading.Event:
        """Mark operation as being processed and return event to wait on."""
        cache_key = self._get_cache_key(**kwargs)

        with self._lock:
            if cache_key not in self._processing:
                self._processing[cache_key] = threading.Event()
            return self._processing[cache_key]

    def mark_complete(self, **kwargs: Any) -> None:
        """Mark operation processing as complete."""
        cache_key = self._get_cache_key(**kwargs)

        with self._lock:
            if cache_key in self._processing:
                event = self._processing.pop(cache_key)
                event.set()

    def clear(self) -> None:
        """Clear all cached results and release pending processing events."""
        try:
            for cache_file in self.cache_dir.glob("*.msgpack"):
                cache_file.unlink(missing_ok=True)
        except OSError:
            pass

        # Wake any threads waiting on in-flight operations so they do not
        # block forever on results that were just discarded.
        with self._lock:
            for event in self._processing.values():
                event.set()
            self._processing.clear()

    def get_stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns zeroed size figures if the cache directory cannot be read.
        """
        try:
            cache_files = list(self.cache_dir.glob("*.msgpack"))
            total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists())

            return {
                "cache_type": self.cache_type,
                "cached_results": len(cache_files),
                "processing_results": len(self._processing),
                "total_cache_size_mb": total_size / 1024 / 1024,
                "avg_result_size_kb": (total_size / len(cache_files) / 1024) if cache_files else 0,
                "cache_dir": str(self.cache_dir),
                "max_cache_size_mb": self.max_cache_size_mb,
                "max_age_days": self.max_age_days,
            }
        except OSError:
            return {
                "cache_type": self.cache_type,
                "cached_results": 0,
                "processing_results": len(self._processing),
                "total_cache_size_mb": 0.0,
                "avg_result_size_kb": 0.0,
                "cache_dir": str(self.cache_dir),
                "max_cache_size_mb": self.max_cache_size_mb,
                "max_age_days": self.max_age_days,
            }
|
287
|
+
|
288
|
+
|
289
|
+
# Lazily-created, process-wide cache singletons. Each is None until the
# corresponding get_*_cache() accessor below instantiates it on first use.
_ocr_cache: KreuzbergCache[ExtractionResult] | None = None
_document_cache: KreuzbergCache[ExtractionResult] | None = None
_table_cache: KreuzbergCache[Any] | None = None
_mime_cache: KreuzbergCache[str] | None = None
|
293
|
+
|
294
|
+
|
295
|
+
def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
    """Return the process-wide OCR cache, creating it on first use.

    Honors the KREUZBERG_CACHE_DIR, KREUZBERG_OCR_CACHE_SIZE_MB and
    KREUZBERG_OCR_CACHE_AGE_DAYS environment variables.
    """
    global _ocr_cache
    if _ocr_cache is None:
        base_dir = os.environ.get("KREUZBERG_CACHE_DIR")
        _ocr_cache = KreuzbergCache[ExtractionResult](
            cache_type="ocr",
            cache_dir=Path(base_dir) / "ocr" if base_dir else None,
            max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
            max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
        )
    return _ocr_cache
|
311
|
+
|
312
|
+
|
313
|
+
def get_document_cache() -> KreuzbergCache[ExtractionResult]:
    """Return the process-wide document cache, creating it on first use.

    Honors the KREUZBERG_CACHE_DIR, KREUZBERG_DOCUMENT_CACHE_SIZE_MB and
    KREUZBERG_DOCUMENT_CACHE_AGE_DAYS environment variables.
    """
    global _document_cache
    if _document_cache is None:
        base_dir = os.environ.get("KREUZBERG_CACHE_DIR")
        _document_cache = KreuzbergCache[ExtractionResult](
            cache_type="documents",
            cache_dir=Path(base_dir) / "documents" if base_dir else None,
            max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
            max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
        )
    return _document_cache
|
329
|
+
|
330
|
+
|
331
|
+
def get_table_cache() -> KreuzbergCache[Any]:
    """Return the process-wide table cache, creating it on first use.

    Honors the KREUZBERG_CACHE_DIR, KREUZBERG_TABLE_CACHE_SIZE_MB and
    KREUZBERG_TABLE_CACHE_AGE_DAYS environment variables.
    """
    global _table_cache
    if _table_cache is None:
        base_dir = os.environ.get("KREUZBERG_CACHE_DIR")
        _table_cache = KreuzbergCache[Any](
            cache_type="tables",
            cache_dir=Path(base_dir) / "tables" if base_dir else None,
            max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
            max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
        )
    return _table_cache
|
347
|
+
|
348
|
+
|
349
|
+
def get_mime_cache() -> KreuzbergCache[str]:
    """Return the process-wide MIME-type cache, creating it on first use.

    Honors the KREUZBERG_CACHE_DIR, KREUZBERG_MIME_CACHE_SIZE_MB and
    KREUZBERG_MIME_CACHE_AGE_DAYS environment variables.
    """
    global _mime_cache
    if _mime_cache is None:
        base_dir = os.environ.get("KREUZBERG_CACHE_DIR")
        _mime_cache = KreuzbergCache[str](
            cache_type="mime",
            cache_dir=Path(base_dir) / "mime" if base_dir else None,
            max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
            max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
        )
    return _mime_cache
|
365
|
+
|
366
|
+
|
367
|
+
def clear_all_caches() -> None:
    """Clear every global cache (OCR, documents, tables, MIME)."""
    # Iterate the accessors in their definition order above.
    for accessor in (get_ocr_cache, get_document_cache, get_table_cache, get_mime_cache):
        accessor().clear()
|
kreuzberg/_utils/_device.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
"""Device detection and management utilities for GPU acceleration."""
|
2
|
-
# ruff: noqa: BLE001
|
2
|
+
# ruff: noqa: BLE001 # ~keep
|
3
3
|
|
4
4
|
from __future__ import annotations
|
5
5
|
|
@@ -36,7 +36,6 @@ def detect_available_devices() -> list[DeviceInfo]:
|
|
36
36
|
"""
|
37
37
|
devices: list[DeviceInfo] = []
|
38
38
|
|
39
|
-
# Always include CPU as fallback
|
40
39
|
devices.append(
|
41
40
|
DeviceInfo(
|
42
41
|
device_type="cpu",
|
@@ -44,18 +43,15 @@ def detect_available_devices() -> list[DeviceInfo]:
|
|
44
43
|
)
|
45
44
|
)
|
46
45
|
|
47
|
-
# Check for CUDA (NVIDIA GPUs)
|
48
46
|
if _is_cuda_available():
|
49
47
|
cuda_devices = _get_cuda_devices()
|
50
48
|
devices.extend(cuda_devices)
|
51
49
|
|
52
|
-
# Check for MPS (Apple Silicon)
|
53
50
|
if _is_mps_available():
|
54
51
|
mps_device = _get_mps_device()
|
55
52
|
if mps_device:
|
56
53
|
devices.append(mps_device)
|
57
54
|
|
58
|
-
# Reorder to put GPU devices first
|
59
55
|
gpu_devices = [d for d in devices if d.device_type != "cpu"]
|
60
56
|
cpu_devices = [d for d in devices if d.device_type == "cpu"]
|
61
57
|
|
@@ -95,14 +91,12 @@ def validate_device_request(
|
|
95
91
|
"""
|
96
92
|
available_devices = detect_available_devices()
|
97
93
|
|
98
|
-
# Handle auto device selection
|
99
94
|
if requested == "auto":
|
100
95
|
device = get_optimal_device()
|
101
96
|
if memory_limit is not None:
|
102
97
|
_validate_memory_limit(device, memory_limit)
|
103
98
|
return device
|
104
99
|
|
105
|
-
# Find requested device
|
106
100
|
matching_devices = [d for d in available_devices if d.device_type == requested]
|
107
101
|
|
108
102
|
if not matching_devices:
|
@@ -125,10 +119,8 @@ def validate_device_request(
|
|
125
119
|
},
|
126
120
|
)
|
127
121
|
|
128
|
-
# Use the first matching device (typically the best one)
|
129
122
|
device = matching_devices[0]
|
130
123
|
|
131
|
-
# Validate memory limit if specified
|
132
124
|
if memory_limit is not None:
|
133
125
|
_validate_memory_limit(device, memory_limit)
|
134
126
|
|
@@ -159,7 +151,7 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | No
|
|
159
151
|
def _is_cuda_available() -> bool:
|
160
152
|
"""Check if CUDA is available."""
|
161
153
|
try:
|
162
|
-
import torch
|
154
|
+
import torch # type: ignore[import-not-found,unused-ignore]
|
163
155
|
|
164
156
|
return torch.cuda.is_available()
|
165
157
|
except ImportError:
|
@@ -169,7 +161,7 @@ def _is_cuda_available() -> bool:
|
|
169
161
|
def _is_mps_available() -> bool:
|
170
162
|
"""Check if MPS (Apple Silicon) is available."""
|
171
163
|
try:
|
172
|
-
import torch
|
164
|
+
import torch # type: ignore[import-not-found,unused-ignore]
|
173
165
|
|
174
166
|
return torch.backends.mps.is_available()
|
175
167
|
except ImportError:
|
@@ -188,17 +180,14 @@ def _get_cuda_devices() -> list[DeviceInfo]:
|
|
188
180
|
|
189
181
|
for i in range(torch.cuda.device_count()):
|
190
182
|
props = torch.cuda.get_device_properties(i)
|
191
|
-
total_memory = props.total_memory / (1024**3)
|
183
|
+
total_memory = props.total_memory / (1024**3)
|
192
184
|
|
193
|
-
# Get available memory
|
194
185
|
torch.cuda.set_device(i)
|
195
186
|
available_memory = torch.cuda.get_device_properties(i).total_memory / (1024**3)
|
196
187
|
try:
|
197
|
-
# Try to get current memory usage
|
198
188
|
allocated = torch.cuda.memory_allocated(i) / (1024**3)
|
199
189
|
available_memory = total_memory - allocated
|
200
190
|
except Exception:
|
201
|
-
# Fallback to total memory if we can't get allocation info
|
202
191
|
available_memory = total_memory
|
203
192
|
|
204
193
|
devices.append(
|
@@ -225,7 +214,6 @@ def _get_mps_device() -> DeviceInfo | None:
|
|
225
214
|
if not torch.backends.mps.is_available():
|
226
215
|
return None
|
227
216
|
|
228
|
-
# MPS doesn't provide detailed memory info
|
229
217
|
return DeviceInfo(
|
230
218
|
device_type="mps",
|
231
219
|
name="Apple Silicon GPU (MPS)",
|
@@ -260,8 +248,6 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
|
|
260
248
|
|
261
249
|
def _get_mps_memory_info() -> tuple[float | None, float | None]:
|
262
250
|
"""Get MPS memory information."""
|
263
|
-
# MPS doesn't provide detailed memory info through PyTorch
|
264
|
-
# We could potentially use system calls but that's platform-specific
|
265
251
|
return None, None
|
266
252
|
|
267
253
|
|
@@ -276,7 +262,7 @@ def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
|
|
276
262
|
ValidationError: If the device doesn't have enough memory.
|
277
263
|
"""
|
278
264
|
if device.device_type == "cpu":
|
279
|
-
# CPU memory validation is complex and OS-dependent, skip for now
|
265
|
+
# CPU memory validation is complex and OS-dependent, skip for now # ~keep
|
280
266
|
return
|
281
267
|
|
282
268
|
total_memory, available_memory = get_device_memory_info(device)
|
@@ -311,7 +297,7 @@ def is_backend_gpu_compatible(backend: str) -> bool:
|
|
311
297
|
Returns:
|
312
298
|
True if the backend supports GPU acceleration.
|
313
299
|
"""
|
314
|
-
# EasyOCR and PaddleOCR support GPU, Tesseract does not
|
300
|
+
# EasyOCR and PaddleOCR support GPU, Tesseract does not # ~keep
|
315
301
|
return backend.lower() in ("easyocr", "paddleocr")
|
316
302
|
|
317
303
|
|
@@ -326,25 +312,22 @@ def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0)
|
|
326
312
|
Recommended batch size.
|
327
313
|
"""
|
328
314
|
if device.device_type == "cpu":
|
329
|
-
# Conservative batch size for CPU
|
315
|
+
# Conservative batch size for CPU # ~keep
|
330
316
|
return 1
|
331
317
|
|
332
|
-
# For GPU devices, estimate based on available memory
|
333
318
|
_, available_memory = get_device_memory_info(device)
|
334
319
|
|
335
320
|
if available_memory is None:
|
336
|
-
# Conservative default for unknown memory
|
337
321
|
return 4
|
338
322
|
|
339
|
-
#
|
340
|
-
# Use approximately 50% of available memory for batching
|
323
|
+
# Use approximately 50% of available memory for batching # ~keep
|
341
324
|
usable_memory_gb = available_memory * 0.5
|
342
325
|
usable_memory_mb = usable_memory_gb * 1024
|
343
326
|
|
344
|
-
# Estimate batch size (conservative)
|
327
|
+
# Estimate batch size (conservative) # ~keep
|
345
328
|
estimated_batch_size = max(1, int(usable_memory_mb / (input_size_mb * 4)))
|
346
329
|
|
347
|
-
# Cap at reasonable limits
|
330
|
+
# Cap at reasonable limits # ~keep
|
348
331
|
return min(estimated_batch_size, 32)
|
349
332
|
|
350
333
|
|