kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
CHANGED
@@ -1,5 +1,3 @@
-"""Document-level caching to prevent pypdfium2 issues with duplicate processing."""
-
 from __future__ import annotations
 
 import hashlib
@@ -13,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -28,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -67,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -91,15 +64,6 @@ class DocumentCache:
         return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -113,13 +77,6 @@ class DocumentCache:
         return None
 
     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
         cache_key = self._get_cache_key(file_path, config)
         path = Path(file_path)
 
@@ -142,29 +99,11 @@ class DocumentCache:
         self._file_metadata[cache_key] = file_metadata
 
     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
        cache_key = self._get_cache_key(file_path, config)
        with self._lock:
            return cache_key in self._processing
 
     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -173,12 +112,6 @@ class DocumentCache:
         return self._processing[cache_key]
 
     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -187,17 +120,11 @@ class DocumentCache:
         event.set()
 
     def clear(self) -> None:
-        """Clear all cached results."""
         with self._lock:
             self._cache.clear()
             self._file_metadata.clear()
 
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
         with self._lock:
             return {
                 "cached_documents": len(self._cache),
@@ -212,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
     return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
    _document_cache.clear()
kreuzberg/_utils/_errors.py
CHANGED
@@ -1,5 +1,3 @@
-"""Enhanced error handling utilities."""
-
 from __future__ import annotations
 
 import platform
@@ -12,7 +10,6 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
-# Define error keywords as frozensets for O(1) membership testing
 _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
 _TRANSIENT_ERROR_PATTERNS = frozenset(
     {
@@ -56,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
     context: dict[str, Any] = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "operation": operation,
@@ -107,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
     transient_types = (
         OSError,
         PermissionError,
@@ -131,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
     error_str = str(error).lower()
     return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
     if attempt >= max_attempts:
         return False
 
@@ -164,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
     __slots__ = ("failed", "successful", "total_count")
 
     def __init__(self) -> None:
-        """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
         self.failed: list[tuple[int, dict[str, Any]]] = []
         self.total_count: int = 0
 
     def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
         self.successful.append((index, result))
 
     def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
         error_info = {
             "error": {
                 "type": type(error).__name__,
@@ -191,30 +146,25 @@ class BatchExtractionResult:
 
     @property
     def success_count(self) -> int:
-        """Number of successful operations."""
         return len(self.successful)
 
     @property
     def failure_count(self) -> int:
-        """Number of failed operations."""
         return len(self.failed)
 
     @property
     def success_rate(self) -> float:
-        """Success rate as a percentage."""
         if self.total_count == 0:
             return 0.0
         return (self.success_count / self.total_count) * 100
 
     def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
         results = [None] * self.total_count
         for index, result in self.successful:
            results[index] = result
         return results
 
     def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
         return {
             "total": self.total_count,
             "successful": self.success_count,
kreuzberg/_utils/_ocr_cache.py
ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
kreuzberg/_utils/_pdf_lock.py
CHANGED
@@ -1,5 +1,3 @@
-"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
-
 from __future__ import annotations
 
 import hashlib
@@ -21,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
     path_str = str(Path(file_path).resolve())
     return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
     file_key = _get_file_key(file_path)
 
     with _FILE_LOCKS_LOCK:
@@ -41,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
     with _PYPDFIUM_LOCK:
         yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
     lock = _get_file_lock(file_path)
     with lock:
         yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
     def wrapper(*args: Any, **kwargs: Any) -> Any:
         with pypdfium_lock():
             return func(*args, **kwargs)
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -1,5 +1,3 @@
-"""Process pool utilities for CPU-intensive operations."""
-
 from __future__ import annotations
 
 import io
@@ -13,6 +11,8 @@ import psutil
 import pypdfium2
 from typing_extensions import Self
 
+from kreuzberg._utils._ref import Ref
+
 if TYPE_CHECKING:
     import types
     from collections.abc import Callable, Generator
@@ -20,47 +20,45 @@ if TYPE_CHECKING:
 T = TypeVar("T")
 
 
-_PROCESS_POOL: ProcessPoolExecutor | None = None
 _POOL_SIZE = max(1, mp.cpu_count() - 1)
 
 
-def _init_process_pool() -> ProcessPoolExecutor:
-    global _PROCESS_POOL
-    if _PROCESS_POOL is None:
-        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
-    return _PROCESS_POOL
-
+def _create_process_pool() -> ProcessPoolExecutor:
+    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
+
+
+_process_pool_ref = Ref("process_pool", _create_process_pool)
+
+
+def _get_process_pool() -> ProcessPoolExecutor:
+    return _process_pool_ref.get()
 
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-
-    pool = _init_process_pool()
+    pool = _get_process_pool()
     try:
         yield pool
     except Exception:  # noqa: BLE001
         shutdown_process_pool()
-        pool = _init_process_pool()
+        pool = _get_process_pool()
         yield pool
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
     with process_pool() as pool:
         future = pool.submit(func, *args, **kwargs)
         return future.result()
 
 
 def shutdown_process_pool() -> None:
-    global _PROCESS_POOL
-
-    if _PROCESS_POOL is not None:
-        _PROCESS_POOL.shutdown(wait=True)
-        _PROCESS_POOL = None
+    if _process_pool_ref.is_initialized():
+        pool = _process_pool_ref.get()
+        pool.shutdown(wait=True)
+        _process_pool_ref.clear()
 
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
-    """Worker function for extracting PDF text in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +78,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
-    """Worker function for converting PDF to images in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
         self.max_processes = max_processes or mp.cpu_count()
 
         if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
         self._active_tasks = 0
 
     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
         task_memory_bytes = task_memory_mb * 1024**2
         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
         return min(self.max_processes, memory_based_limit)
 
     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
             if self._executor is not None:
                 self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
         *args: Any,
         task_memory_mb: float = 100,
     ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
         workers = self.get_optimal_workers(task_memory_mb)
         self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
         task_memory_mb: float = 100,
         max_concurrent: int | None = None,
     ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
         if not arg_batches:
             return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
         return results
 
     def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
         memory = psutil.virtual_memory()
         cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
         }
 
     def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
         if self._executor is not None:
             self._executor.shutdown(wait=wait)
             self._executor = None
 
     def __enter__(self) -> Self:
-        """Context manager entry."""
         return self
 
     def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Context manager exit."""
         self.shutdown()
 
     async def __aenter__(self) -> Self:
-        """Async context manager entry."""
         return self
 
     async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Async context manager exit."""
         self.shutdown()