kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
@@ -0,0 +1,220 @@
+"""Document-level caching to prevent pypdfium2 issues with duplicate processing."""
+
+from __future__ import annotations
+
+import hashlib
+import threading
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from kreuzberg._types import ExtractionConfig, ExtractionResult
+
+
+class DocumentCache:
+    """Session-scoped cache for document extraction results.
+
+    Ensures each unique document is processed only once per session,
+    preventing pypdfium2 state corruption issues with repeated processing.
+    """
+
+    def __init__(self) -> None:
+        """Initialize document cache."""
+        self._cache: dict[str, ExtractionResult] = {}
+        self._processing: dict[str, threading.Event] = {}
+        self._lock = threading.Lock()
+
+        self._file_metadata: dict[str, dict[str, Any]] = {}
+
+    def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
+        """Generate cache key for a file and config combination.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Unique cache key string
+        """
+        path = Path(file_path).resolve()
+
+        try:
+            stat = path.stat()
+            file_info = {
+                "path": str(path),
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+            }
+        except OSError:
+            file_info = {"path": str(path), "size": 0, "mtime": 0}
+
+        config_info = {}
+        if config:
+            config_info = {
+                "force_ocr": config.force_ocr,
+                "ocr_backend": config.ocr_backend,
+                "extract_tables": config.extract_tables,
+                "chunk_content": config.chunk_content,
+                "max_chars": config.max_chars,
+                "max_overlap": config.max_overlap,
+            }
+
+        cache_data = {**file_info, **config_info}
+        cache_str = str(sorted(cache_data.items()))
+
+        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
+
+    def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
+        """Check if cached result is still valid.
+
+        Args:
+            cache_key: The cache key to validate
+            file_path: Path to the file
+
+        Returns:
+            True if cache is valid, False if invalidated
+        """
+        if cache_key not in self._file_metadata:
+            return False
+
+        path = Path(file_path)
+        try:
+            current_stat = path.stat()
+            cached_metadata = self._file_metadata[cache_key]
+
+            return bool(
+                cached_metadata["size"] == current_stat.st_size and cached_metadata["mtime"] == current_stat.st_mtime
+            )
+        except OSError:
+            return False
+
+    def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
+        """Get cached extraction result if available and valid.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Cached result if available, None otherwise
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key in self._cache:
+                if self._is_cache_valid(cache_key, file_path):
+                    return self._cache[cache_key]
+
+                self._cache.pop(cache_key, None)
+                self._file_metadata.pop(cache_key, None)
+
+            return None
+
+    def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
+        """Cache extraction result.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+            result: Extraction result to cache
+        """
+        cache_key = self._get_cache_key(file_path, config)
+        path = Path(file_path)
+
+        try:
+            stat = path.stat()
+            file_metadata = {
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+                "cached_at": time.time(),
+            }
+        except OSError:
+            file_metadata = {
+                "size": 0,
+                "mtime": 0,
+                "cached_at": time.time(),
+            }
+
+        with self._lock:
+            self._cache[cache_key] = result
+            self._file_metadata[cache_key] = file_metadata
+
+    def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
+        """Check if file is currently being processed.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            True if file is currently being processed
+        """
+        cache_key = self._get_cache_key(file_path, config)
+        with self._lock:
+            return cache_key in self._processing
+
+    def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
+        """Mark file as being processed and return event to wait on.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Event that will be set when processing completes
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key not in self._processing:
+                self._processing[cache_key] = threading.Event()
+            return self._processing[cache_key]
+
+    def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
+        """Mark file processing as complete.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key in self._processing:
+                event = self._processing.pop(cache_key)
+                event.set()
+
+    def clear(self) -> None:
+        """Clear all cached results."""
+        with self._lock:
+            self._cache.clear()
+            self._file_metadata.clear()
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dictionary with cache statistics
+        """
+        with self._lock:
+            return {
+                "cached_documents": len(self._cache),
+                "processing_documents": len(self._processing),
+                "total_cache_size_mb": sum(len(result.content.encode("utf-8")) for result in self._cache.values())
+                / 1024
+                / 1024,
+            }
+
+
+_document_cache = DocumentCache()
+
+
+def get_document_cache() -> DocumentCache:
+    """Get the global document cache instance."""
+    return _document_cache
+
+
+def clear_document_cache() -> None:
+    """Clear the global document cache."""
+    _document_cache.clear()
kreuzberg/_utils/_errors.py
@@ -0,0 +1,232 @@
+"""Enhanced error handling utilities."""
+
+from __future__ import annotations
+
+import platform
+import traceback
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+import psutil
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def create_error_context(
+    *,
+    operation: str,
+    file_path: Path | str | None = None,
+    error: Exception | None = None,
+    **extra: Any,
+) -> dict[str, Any]:
+    """Create comprehensive error context.
+
+    Args:
+        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
+        file_path: The file being processed, if applicable
+        error: The original exception, if any
+        **extra: Additional context fields
+
+    Returns:
+        Dictionary with error context including system info
+    """
+    context: dict[str, Any] = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "operation": operation,
+    }
+
+    if file_path:
+        from pathlib import Path
+
+        path = Path(file_path) if isinstance(file_path, str) else file_path
+        context["file"] = {
+            "path": str(path),
+            "name": path.name,
+            "exists": path.exists(),
+            "size": path.stat().st_size if path.exists() else None,
+        }
+
+    if error:
+        context["error"] = {
+            "type": type(error).__name__,
+            "message": str(error),
+            "traceback": traceback.format_exception_only(type(error), error),
+        }
+
+    if (
+        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
+        if error
+        else False
+    ):
+        try:
+            mem = psutil.virtual_memory()
+            context["system"] = {
+                "memory_available_mb": mem.available / 1024 / 1024,
+                "memory_percent": mem.percent,
+                "cpu_percent": psutil.cpu_percent(interval=0.1),
+                "process_count": len(psutil.pids()),
+                "platform": platform.platform(),
+            }
+        except Exception:  # noqa: BLE001
+            pass
+
+    context.update(extra)
+
+    return context
+
+
+def is_transient_error(error: Exception) -> bool:
+    """Check if an error is likely transient and worth retrying.
+
+    Args:
+        error: The exception to check
+
+    Returns:
+        True if the error is likely transient
+    """
+    transient_types = (
+        OSError,
+        PermissionError,
+        TimeoutError,
+        ConnectionError,
+        BrokenPipeError,
+    )
+
+    if isinstance(error, transient_types):
+        return True
+
+    transient_patterns = [
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    ]
+
+    error_str = str(error).lower()
+    return any(pattern in error_str for pattern in transient_patterns)
+
+
+def is_resource_error(error: Exception) -> bool:
+    """Check if an error is related to system resources.
+
+    Args:
+        error: The exception to check
+
+    Returns:
+        True if the error is resource-related
+    """
+    resource_patterns = [
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    ]
+
+    error_str = str(error).lower()
+    return any(pattern in error_str for pattern in resource_patterns)
+
+
+def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
+    """Determine if an operation should be retried.
+
+    Args:
+        error: The exception that occurred
+        attempt: Current attempt number (1-based)
+        max_attempts: Maximum number of attempts
+
+    Returns:
+        True if the operation should be retried
+    """
+    if attempt >= max_attempts:
+        return False
+
+    from kreuzberg.exceptions import ValidationError
+
+    if isinstance(error, ValidationError):
+        return False
+
+    return is_transient_error(error)
+
+
+class BatchExtractionResult:
+    """Result container for batch operations with partial success support."""
+
+    def __init__(self) -> None:
+        """Initialize batch result container."""
+        self.successful: list[tuple[int, Any]] = []
+        self.failed: list[tuple[int, dict[str, Any]]] = []
+        self.total_count: int = 0
+
+    def add_success(self, index: int, result: Any) -> None:
+        """Add a successful result."""
+        self.successful.append((index, result))
+
+    def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
+        """Add a failed result with context."""
+        error_info = {
+            "error": {
+                "type": type(error).__name__,
+                "message": str(error),
+            },
+            "context": context,
+        }
+        self.failed.append((index, error_info))
+
+    @property
+    def success_count(self) -> int:
+        """Number of successful operations."""
+        return len(self.successful)
+
+    @property
+    def failure_count(self) -> int:
+        """Number of failed operations."""
+        return len(self.failed)
+
+    @property
+    def success_rate(self) -> float:
+        """Success rate as a percentage."""
+        if self.total_count == 0:
+            return 0.0
+        return (self.success_count / self.total_count) * 100
+
+    def get_ordered_results(self) -> list[Any | None]:
+        """Get results in original order with None for failures."""
+        results = [None] * self.total_count
+        for index, result in self.successful:
+            results[index] = result
+        return results
+
+    def get_summary(self) -> dict[str, Any]:
+        """Get summary of batch operation."""
+        return {
+            "total": self.total_count,
+            "successful": self.success_count,
+            "failed": self.failure_count,
+            "success_rate": f"{self.success_rate:.1f}%",
+            "failures": [
+                {
+                    "index": idx,
+                    "error": info["error"]["type"],
+                    "message": info["error"]["message"],
+                }
+                for idx, info in self.failed
+            ],
+        }
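
A short sketch of how these helpers compose into a retry loop (illustrative only, not from the diff; `do_extract` is a hypothetical operation that may fail transiently):

```python
import time

from kreuzberg._utils._errors import create_error_context, should_retry


def extract_with_retry(path: str, max_attempts: int = 3):
    """Retry a flaky operation, attaching structured context on final failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return do_extract(path)  # hypothetical operation that may fail transiently
        except Exception as exc:
            if not should_retry(exc, attempt, max_attempts):
                # should_retry returns False for ValidationError or once attempts are exhausted
                context = create_error_context(operation="extract_file", file_path=path, error=exc)
                raise RuntimeError(f"extraction failed: {context}") from exc
            time.sleep(0.5 * attempt)  # simple linear backoff between attempts
```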
kreuzberg/_utils/_pdf_lock.py
@@ -0,0 +1,72 @@
+"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
+
+from __future__ import annotations
+
+import hashlib
+import threading
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from weakref import WeakValueDictionary
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+_PYPDFIUM_LOCK = threading.RLock()
+
+
+_FILE_LOCKS_CACHE = WeakValueDictionary[str, threading.RLock]()
+_FILE_LOCKS_LOCK = threading.Lock()
+
+
+def _get_file_key(file_path: Path | str) -> str:
+    """Get a consistent key for a file path."""
+    path_str = str(Path(file_path).resolve())
+    return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
+
+
+def _get_file_lock(file_path: Path | str) -> threading.RLock:
+    """Get or create a lock for a specific file."""
+    file_key = _get_file_key(file_path)
+
+    with _FILE_LOCKS_LOCK:
+        if file_key in _FILE_LOCKS_CACHE:
+            return _FILE_LOCKS_CACHE[file_key]
+
+        lock = threading.RLock()
+        _FILE_LOCKS_CACHE[file_key] = lock
+        return lock
+
+
+@contextmanager
+def pypdfium_lock() -> Generator[None, None, None]:
+    """Context manager for thread-safe pypdfium2 operations.
+
+    This prevents segmentation faults on macOS where pypdfium2
+    is not fork-safe when used concurrently.
+    """
+    with _PYPDFIUM_LOCK:
+        yield
+
+
+@contextmanager
+def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
+    """Context manager for per-file pypdfium2 operations.
+
+    This allows concurrent processing of different files while
+    preventing segfaults. Document caching handles same-file issues.
+    """
+    lock = _get_file_lock(file_path)
+    with lock:
+        yield
+
+
+def with_pypdfium_lock(func: Any) -> Any:
+    """Decorator to wrap functions with pypdfium2 lock."""
+
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        with pypdfium_lock():
+            return func(*args, **kwargs)
+
+    return wrapper
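
A minimal sketch of the intended call pattern (not from the diff), assuming pypdfium2 is installed; the path argument is whatever PDF the caller wants to open:

```python
import pypdfium2

from kreuzberg._utils._pdf_lock import pypdfium_file_lock


def page_count(pdf_path: str) -> int:
    """Open the document under its per-file lock so concurrent threads cannot trip over pypdfium2 state."""
    with pypdfium_file_lock(pdf_path):
        pdf = pypdfium2.PdfDocument(pdf_path)
        try:
            return len(pdf)
        finally:
            pdf.close()
```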
kreuzberg/_utils/_process_pool.py
@@ -0,0 +1,100 @@
+"""Process pool utilities for CPU-intensive operations."""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any, Callable, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+T = TypeVar("T")
+
+
+_PROCESS_POOL: ProcessPoolExecutor | None = None
+_POOL_SIZE = max(1, mp.cpu_count() - 1)
+
+
+def _init_process_pool() -> ProcessPoolExecutor:
+    """Initialize the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is None:
+        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
+    return _PROCESS_POOL
+
+
+@contextmanager
+def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
+    """Get the global process pool."""
+    pool = _init_process_pool()
+    try:
+        yield pool
+    except Exception:  # noqa: BLE001
+        shutdown_process_pool()
+        pool = _init_process_pool()
+        yield pool
+
+
+def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
+    """Submit a function to the process pool and wait for result."""
+    with process_pool() as pool:
+        future = pool.submit(func, *args, **kwargs)
+        return future.result()
+
+
+def shutdown_process_pool() -> None:
+    """Shutdown the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is not None:
+        _PROCESS_POOL.shutdown(wait=True)
+        _PROCESS_POOL = None
+
+
+def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
+    """Worker function for extracting PDF text in a separate process."""
+    import pypdfium2
+
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        text_parts = []
+        for page in pdf:
+            text_page = page.get_textpage()
+            text = text_page.get_text_range()
+            text_parts.append(text)
+            text_page.close()
+            page.close()
+        return (pdf_path, "".join(text_parts))
+    except Exception as e:  # noqa: BLE001
+        return (pdf_path, f"ERROR: {e}")
+    finally:
+        if pdf:
+            pdf.close()
+
+
+def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
+    """Worker function for converting PDF to images in a separate process."""
+    import io
+
+    import pypdfium2
+
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        image_bytes = []
+        for page in pdf:
+            bitmap = page.render(scale=scale)
+            pil_image = bitmap.to_pil()
+            img_bytes = io.BytesIO()
+            pil_image.save(img_bytes, format="PNG")
+            image_bytes.append(img_bytes.getvalue())
+            bitmap.close()
+            page.close()
+        return (pdf_path, image_bytes)
+    except Exception:  # noqa: BLE001
+        return (pdf_path, [])
+    finally:
+        if pdf:
+            pdf.close()
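
Illustrative use of the pool helper together with one of the workers above (a sketch, not from the diff; `example.pdf` is a placeholder path, and the worker is module-private, so production callers would likely go through a public wrapper instead):

```python
from kreuzberg._utils._process_pool import _extract_pdf_text_worker, submit_to_process_pool

# Run the CPU-bound text extraction in a separate process and block for the result.
path, text = submit_to_process_pool(_extract_pdf_text_worker, "example.pdf")
if text.startswith("ERROR:"):
    print(f"extraction failed for {path}: {text}")
else:
    print(f"extracted {len(text)} characters from {path}")
```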
kreuzberg/_utils/_serialization.py
@@ -0,0 +1,82 @@
+"""Fast serialization utilities using msgspec."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, is_dataclass
+from enum import Enum
+from typing import Any, TypeVar, cast
+
+from msgspec import MsgspecError
+from msgspec.msgpack import decode, encode
+
+T = TypeVar("T")
+
+
+def encode_hook(obj: Any) -> Any:
+    """Custom encoder for complex objects."""
+    if callable(obj):
+        return None
+
+    if isinstance(obj, Exception):
+        return {"message": str(obj), "type": type(obj).__name__}
+
+    for key in (
+        "to_dict",
+        "as_dict",
+        "dict",
+        "model_dump",
+        "json",
+        "to_list",
+        "tolist",
+    ):
+        if hasattr(obj, key) and callable(getattr(obj, key)):
+            return getattr(obj, key)()
+
+    if is_dataclass(obj) and not isinstance(obj, type):
+        return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
+
+    if hasattr(obj, "save") and hasattr(obj, "format"):
+        return None
+
+    raise TypeError(f"Unsupported type: {type(obj)!r}")
+
+
+def deserialize(value: str | bytes, target_type: type[T]) -> T:
+    """Deserialize bytes/string to target type.
+
+    Args:
+        value: Serialized data
+        target_type: Type to deserialize to
+
+    Returns:
+        Deserialized object
+
+    Raises:
+        ValueError: If deserialization fails
+    """
+    try:
+        return decode(cast("bytes", value), type=target_type, strict=False)
+    except MsgspecError as e:
+        raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
+
+
+def serialize(value: Any, **kwargs: Any) -> bytes:
+    """Serialize value to bytes.
+
+    Args:
+        value: Object to serialize
+        **kwargs: Additional data to merge with value if it's a dict
+
+    Returns:
+        Serialized bytes
+
+    Raises:
+        ValueError: If serialization fails
+    """
+    if isinstance(value, dict) and kwargs:
+        value = value | kwargs
+
+    try:
+        return encode(value, enc_hook=encode_hook)
+    except (MsgspecError, TypeError) as e:
+        raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e
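
A round-trip sketch of the two helpers (not from the diff; `CacheEntry` is a hypothetical example type, not part of the package):

```python
from dataclasses import dataclass

from kreuzberg._utils._serialization import deserialize, serialize


@dataclass
class CacheEntry:  # hypothetical example type
    path: str
    size: int


entry = CacheEntry(path="/tmp/example.pdf", size=1024)
payload = serialize(entry)                   # dataclass -> msgpack bytes
restored = deserialize(payload, CacheEntry)  # bytes -> CacheEntry
assert restored == entry
```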