kreuzberg-3.13.0-py3-none-any.whl → kreuzberg-3.13.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +2 -2
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
- kreuzberg-3.13.2.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
CHANGED
@@ -11,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -26,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -65,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -89,15 +64,6 @@ class DocumentCache:
             return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -111,13 +77,6 @@ class DocumentCache:
             return None
 
     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
         cache_key = self._get_cache_key(file_path, config)
         path = Path(file_path)
 
@@ -140,29 +99,11 @@ class DocumentCache:
         self._file_metadata[cache_key] = file_metadata
 
     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
         cache_key = self._get_cache_key(file_path, config)
         with self._lock:
             return cache_key in self._processing
 
     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -171,12 +112,6 @@ class DocumentCache:
         return self._processing[cache_key]
 
     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -185,17 +120,11 @@ class DocumentCache:
             event.set()
 
     def clear(self) -> None:
-        """Clear all cached results."""
         with self._lock:
             self._cache.clear()
             self._file_metadata.clear()
 
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
         with self._lock:
             return {
                 "cached_documents": len(self._cache),
@@ -210,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
     return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
    _document_cache.clear()
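The hunks above only remove docstrings; the DocumentCache API is unchanged. For orientation, here is a minimal sketch of the check/wait/compute pattern the class supports, assuming a hypothetical `run_extraction` callable in place of the real extractor:

```python
from kreuzberg._utils._document_cache import get_document_cache

def extract_once(file_path, config, run_extraction):
    cache = get_document_cache()

    cached = cache.get(file_path, config)
    if cached is not None:
        return cached

    if cache.is_processing(file_path, config):
        # Another thread is already extracting this document: wait on its
        # event, then re-check the cache.
        cache.mark_processing(file_path, config).wait()
        cached = cache.get(file_path, config)
        if cached is not None:
            return cached

    cache.mark_processing(file_path, config)
    try:
        result = run_extraction(file_path, config)  # hypothetical extractor
        cache.set(file_path, config, result)
        return result
    finally:
        cache.mark_complete(file_path, config)  # releases any waiters
```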
kreuzberg/_utils/_errors.py
CHANGED
@@ -53,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
     context: dict[str, Any] = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "operation": operation,
@@ -104,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
     transient_types = (
         OSError,
         PermissionError,
@@ -128,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
     error_str = str(error).lower()
     return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
     if attempt >= max_attempts:
         return False
 
@@ -161,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
     __slots__ = ("failed", "successful", "total_count")
 
     def __init__(self) -> None:
-        """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
         self.failed: list[tuple[int, dict[str, Any]]] = []
         self.total_count: int = 0
 
     def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
         self.successful.append((index, result))
 
     def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
         error_info = {
             "error": {
                 "type": type(error).__name__,
@@ -188,30 +146,25 @@ class BatchExtractionResult:
 
     @property
     def success_count(self) -> int:
-        """Number of successful operations."""
         return len(self.successful)
 
     @property
     def failure_count(self) -> int:
-        """Number of failed operations."""
         return len(self.failed)
 
     @property
     def success_rate(self) -> float:
-        """Success rate as a percentage."""
         if self.total_count == 0:
             return 0.0
         return (self.success_count / self.total_count) * 100
 
     def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
         results = [None] * self.total_count
         for index, result in self.successful:
             results[index] = result
         return results
 
     def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
         return {
             "total": self.total_count,
             "successful": self.success_count,
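The retry helpers and BatchExtractionResult shown above compose naturally. A hedged sketch, assuming an `extract` callable and an illustrative backoff delay (neither is part of the library):

```python
import time

from kreuzberg._utils._errors import (
    BatchExtractionResult,
    create_error_context,
    should_retry,
)

def run_batch(paths, extract):
    batch = BatchExtractionResult()
    batch.total_count = len(paths)

    for index, path in enumerate(paths):
        attempt = 1
        while True:
            try:
                batch.add_success(index, extract(path))
                break
            except Exception as error:  # noqa: BLE001
                if should_retry(error, attempt):
                    attempt += 1
                    time.sleep(0.5 * attempt)  # illustrative backoff
                    continue
                context = create_error_context("extract_file", file_path=path, error=error)
                batch.add_failure(index, error, context)
                break

    return batch.get_summary(), batch.get_ordered_results()
```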
kreuzberg/_utils/_ocr_cache.py
ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
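The new module implies a fixed call sequence: build the cache key, try the lookup (which may block on another worker, and which marks this worker as processing on a miss), then store and release. A sync sketch, with `run_ocr` standing in for a backend's real processing call:

```python
from kreuzberg._utils._ocr_cache import (
    build_cache_kwargs,
    cache_and_complete_sync,
    generate_image_hash,
    handle_cache_lookup_sync,
    mark_processing_complete,
)

def ocr_image_cached(image, backend_name, config_dict, run_ocr, use_cache=True):
    cache_kwargs = build_cache_kwargs(
        backend_name,
        config_dict,
        image_hash=generate_image_hash(image),
    )

    cached = handle_cache_lookup_sync(cache_kwargs)
    if cached is not None:
        return cached  # cache hit, or another worker finished while we waited

    # A miss leaves this worker marked as processing, so waiters must always
    # be released, even when OCR fails.
    try:
        result = run_ocr(image)  # hypothetical backend call
    except Exception:
        mark_processing_complete(cache_kwargs)
        raise
    cache_and_complete_sync(result, cache_kwargs, use_cache)
    return result
```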
kreuzberg/_utils/_pdf_lock.py
CHANGED
@@ -19,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
     path_str = str(Path(file_path).resolve())
     return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
     file_key = _get_file_key(file_path)
 
     with _FILE_LOCKS_LOCK:
@@ -39,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
     with _PYPDFIUM_LOCK:
         yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
     lock = _get_file_lock(file_path)
     with lock:
         yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
     def wrapper(*args: Any, **kwargs: Any) -> Any:
         with pypdfium_lock():
             return func(*args, **kwargs)
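The removed docstrings explained the intent: a global lock because pypdfium2 is not safe under concurrent use, plus a per-file lock so different files can still proceed in parallel. An illustrative caller follows; the pypdfium2 usage here is an assumption about call sites, not part of this diff:

```python
import pypdfium2 as pdfium

from kreuzberg._utils._pdf_lock import pypdfium_file_lock

def page_count(pdf_path: str) -> int:
    # Per-file lock: other files can be processed concurrently.
    with pypdfium_file_lock(pdf_path):
        document = pdfium.PdfDocument(pdf_path)
        try:
            return len(document)  # number of pages
        finally:
            document.close()
```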
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -36,7 +36,6 @@ def _get_process_pool() -> ProcessPoolExecutor:
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the process pool."""
     pool = _get_process_pool()
     try:
         yield pool
@@ -47,14 +46,12 @@ def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
     with process_pool() as pool:
         future = pool.submit(func, *args, **kwargs)
         return future.result()
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the process pool."""
     if _process_pool_ref.is_initialized():
         pool = _process_pool_ref.get()
         pool.shutdown(wait=True)
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
         self.max_processes = max_processes or mp.cpu_count()
 
         if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
         self._active_tasks = 0
 
     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
         task_memory_bytes = task_memory_mb * 1024**2
         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
         return min(self.max_processes, memory_based_limit)
 
     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
             if self._executor is not None:
                 self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
         *args: Any,
         task_memory_mb: float = 100,
     ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
         workers = self.get_optimal_workers(task_memory_mb)
         self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
         task_memory_mb: float = 100,
         max_concurrent: int | None = None,
     ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
         if not arg_batches:
             return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
         return results
 
     def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
         memory = psutil.virtual_memory()
         cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
         }
 
     def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
         if self._executor is not None:
             self._executor.shutdown(wait=wait)
             self._executor = None
 
     def __enter__(self) -> Self:
-        """Context manager entry."""
         return self
 
     def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Context manager exit."""
         self.shutdown()
 
     async def __aenter__(self) -> Self:
-        """Async context manager entry."""
         return self
 
     async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Async context manager exit."""
     self.shutdown()
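Only signatures survive the docstring removal, but they are enough for a small sketch of ProcessPoolManager that uses just the surface visible in this diff:

```python
from kreuzberg._utils._process_pool import ProcessPoolManager

with ProcessPoolManager(max_processes=4, memory_limit_gb=2.0) as manager:
    # Worker count is capped by both the CPU budget and estimated per-task
    # memory: 2 GB / 200 MB would allow 10 workers, so the CPU cap of 4 wins.
    workers = manager.get_optimal_workers(task_memory_mb=200)
    print(workers, manager.get_system_info())
```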
kreuzberg/_utils/_quality.py
CHANGED
@@ -44,15 +44,6 @@ _NAVIGATION_PATTERNS = {
 
 
 def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
-    """Calculate overall quality score for extracted text.
-
-    Args:
-        text: The extracted text content
-        metadata: Optional metadata for additional scoring
-
-    Returns:
-        Quality score between 0.0 and 1.0
-    """
     if not text or not text.strip():
         return 0.0
 
@@ -79,14 +70,6 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
 
 
 def clean_extracted_text(text: str) -> str:
-    """Clean extracted text by removing artifacts and improving quality.
-
-    Args:
-        text: The raw extracted text
-
-    Returns:
-        Cleaned text with artifacts removed
-    """
     if not text:
         return text
 
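Both helpers keep their signatures, so a short hedged example is possible; the 0.5 threshold below is an assumption, not a library default:

```python
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

raw = "Page 1 of 10\nIntroduction\nThis report covers the quarterly results..."
cleaned = clean_extracted_text(raw)
score = calculate_quality_score(cleaned, metadata={"source": "pdf"})
if score < 0.5:  # illustrative threshold
    print(f"low-quality extraction ({score:.2f}); consider an OCR fallback")
```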
kreuzberg/_utils/_ref.py
CHANGED
@@ -9,40 +9,24 @@ T = TypeVar("T")
 
 
 class Ref(Generic[T]):
-    """A reference container that manages singleton instances without global variables.
-
-    This provides a clean alternative to global variables by using a registry pattern
-    with type safety.
-    """
-
     _instances: ClassVar[dict[str, Any]] = {}
 
     def __init__(self, name: str, factory: Callable[[], T]) -> None:
-        """Initialize a reference container.
-
-        Args:
-            name: Unique name for this reference
-            factory: Factory function to create the instance when needed
-        """
         self.name = name
         self.factory = factory
 
     def get(self) -> T:
-        """Get the singleton instance, creating it if it doesn't exist."""
         if self.name not in self._instances:
             self._instances[self.name] = self.factory()
         return cast("T", self._instances[self.name])
 
     def clear(self) -> None:
-        """Clear the singleton instance."""
         if self.name in self._instances:
             del self._instances[self.name]
 
     def is_initialized(self) -> bool:
-        """Check if the singleton instance exists."""
         return self.name in self._instances
 
     @classmethod
     def clear_all(cls) -> None:
-        """Clear all singleton instances."""
         cls._instances.clear()
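The Ref container's behavior is fully visible above: lazy creation through a factory, one instance per name. A minimal sketch with a hypothetical `Settings` class:

```python
from kreuzberg._utils._ref import Ref

class Settings:
    def __init__(self) -> None:
        self.debug = False

_settings_ref = Ref("settings", Settings)  # factory runs lazily, at most once

settings = _settings_ref.get()          # created on first access
assert _settings_ref.get() is settings  # same instance afterwards
assert _settings_ref.is_initialized()
_settings_ref.clear()                   # next get() rebuilds via the factory
```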
kreuzberg/_utils/_serialization.py
CHANGED
@@ -22,7 +22,6 @@ _DICT_METHOD_NAMES = (
 
 
 def encode_hook(obj: Any) -> Any:
-    """Custom encoder for complex objects."""
     if callable(obj):
         return None
 
@@ -44,18 +43,6 @@ def encode_hook(obj: Any) -> Any:
 
 
 def deserialize(value: str | bytes, target_type: type[T]) -> T:
-    """Deserialize bytes/string to target type.
-
-    Args:
-        value: Serialized data
-        target_type: Type to deserialize to
-
-    Returns:
-        Deserialized object
-
-    Raises:
-        ValueError: If deserialization fails
-    """
     try:
         return decode(cast("bytes", value), type=target_type, strict=False)
     except MsgspecError as e:
@@ -63,18 +50,6 @@ def deserialize(value: str | bytes, target_type: type[T]) -> T:
 
 
 def serialize(value: Any, **kwargs: Any) -> bytes:
-    """Serialize value to bytes.
-
-    Args:
-        value: Object to serialize
-        **kwargs: Additional data to merge with value if it's a dict
-
-    Returns:
-        Serialized bytes
-
-    Raises:
-        ValueError: If serialization fails
-    """
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
 
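A round-trip sketch for the two helpers, assuming they wrap msgspec encode/decode as the visible `decode(...)` call suggests; the exact wire format is not shown in this diff:

```python
from kreuzberg._utils._serialization import deserialize, serialize

payload = serialize({"x": 1}, y=2)  # extra kwargs merge into a dict value
data = deserialize(payload, dict)   # decode back into the target type
assert data == {"x": 1, "y": 2}
```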