PyPI - kreuzberg - Versions diffs - 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl - Mend

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

kreuzberg/__init__.py +3 -0
kreuzberg/__main__.py +8 -0
kreuzberg/_cli_config.py +175 -0
kreuzberg/_extractors/_image.py +39 -4
kreuzberg/_extractors/_pandoc.py +158 -18
kreuzberg/_extractors/_pdf.py +199 -19
kreuzberg/_extractors/_presentation.py +1 -1
kreuzberg/_extractors/_spread_sheet.py +65 -7
kreuzberg/_gmft.py +222 -16
kreuzberg/_mime_types.py +62 -16
kreuzberg/_multiprocessing/__init__.py +6 -0
kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
kreuzberg/_multiprocessing/process_manager.py +188 -0
kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
kreuzberg/_ocr/_easyocr.py +66 -10
kreuzberg/_ocr/_paddleocr.py +86 -7
kreuzberg/_ocr/_tesseract.py +136 -46
kreuzberg/_playa.py +43 -0
kreuzberg/_utils/_cache.py +372 -0
kreuzberg/_utils/_device.py +356 -0
kreuzberg/_utils/_document_cache.py +220 -0
kreuzberg/_utils/_errors.py +232 -0
kreuzberg/_utils/_pdf_lock.py +72 -0
kreuzberg/_utils/_process_pool.py +100 -0
kreuzberg/_utils/_serialization.py +82 -0
kreuzberg/_utils/_string.py +1 -1
kreuzberg/_utils/_sync.py +21 -0
kreuzberg/cli.py +338 -0
kreuzberg/extraction.py +247 -36
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
kreuzberg-3.3.0.dist-info/RECORD +48 -0
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
kreuzberg-3.1.7.dist-info/RECORD +0 -33
kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_errors.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""Enhanced error handling utilities."""
+from __future__ import annotations
+import platform
+import traceback
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+import psutil
+if TYPE_CHECKING:
+    from pathlib import Path
+def create_error_context(
+    *,
+    operation: str,
+    file_path: Path | str | None = None,
+    error: Exception | None = None,
+    **extra: Any,
+) -> dict[str, Any]:
+    """Create comprehensive error context.
+    Args:
+        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
+        file_path: The file being processed, if applicable
+        error: The original exception, if any
+        **extra: Additional context fields
+    Returns:
+        Dictionary with error context including system info
+    """
+    context: dict[str, Any] = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "operation": operation,
+    }
+    if file_path:
+        from pathlib import Path
+        path = Path(file_path) if isinstance(file_path, str) else file_path
+        context["file"] = {
+            "path": str(path),
+            "name": path.name,
+            "exists": path.exists(),
+            "size": path.stat().st_size if path.exists() else None,
+        }
+    if error:
+        context["error"] = {
+            "type": type(error).__name__,
+            "message": str(error),
+            "traceback": traceback.format_exception_only(type(error), error),
+        }
+    if (
+        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
+        if error
+        else False
+    ):
+        try:
+            mem = psutil.virtual_memory()
+            context["system"] = {
+                "memory_available_mb": mem.available / 1024 / 1024,
+                "memory_percent": mem.percent,
+                "cpu_percent": psutil.cpu_percent(interval=0.1),
+                "process_count": len(psutil.pids()),
+                "platform": platform.platform(),
+            }
+        except Exception:  # noqa: BLE001
+            pass
+    context.update(extra)
+    return context
+def is_transient_error(error: Exception) -> bool:
+    """Tell whether an error looks temporary enough to justify a retry.
+    Args:
+        error: The exception to check
+    Returns:
+        True if the error is likely transient
+    """
+    # OS-level failures, including the permission, timeout, connection and
+    # broken-pipe variants, count as transient regardless of their message.
+    if isinstance(error, (OSError, PermissionError, TimeoutError, ConnectionError, BrokenPipeError)):
+        return True
+    # Otherwise fall back to a case-insensitive scan of the message text.
+    message = str(error).lower()
+    for marker in (
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    ):
+        if marker in message:
+            return True
+    return False
+def is_resource_error(error: Exception) -> bool:
+    """Tell whether an error message points at a system-resource problem.
+    Args:
+        error: The exception to check
+    Returns:
+        True if the error is resource-related
+    """
+    # Only the message is inspected (case-insensitively); the type is ignored.
+    message = str(error).lower()
+    for marker in (
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    ):
+        if marker in message:
+            return True
+    return False
+def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
+    """Decide whether a failed operation deserves another attempt.
+    Args:
+        error: The exception that occurred
+        attempt: Current attempt number (1-based)
+        max_attempts: Maximum number of attempts
+    Returns:
+        True if the operation should be retried
+    """
+    # Out of attempts: stop before looking at the error at all.
+    if attempt >= max_attempts:
+        return False
+    from kreuzberg.exceptions import ValidationError
+    # Validation failures are never retried; anything else is retried only
+    # when it looks transient.
+    return not isinstance(error, ValidationError) and is_transient_error(error)
+class BatchExtractionResult:
+    """Result container for batch operations with partial success support."""
+    def __init__(self) -> None:
+        """Initialize batch result container."""
+        self.successful: list[tuple[int, Any]] = []
+        self.failed: list[tuple[int, dict[str, Any]]] = []
+        self.total_count: int = 0
+    def add_success(self, index: int, result: Any) -> None:
+        """Add a successful result."""
+        self.successful.append((index, result))
+    def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
+        """Add a failed result with context."""
+        error_info = {
+            "error": {
+                "type": type(error).__name__,
+                "message": str(error),
+            },
+            "context": context,
+        }
+        self.failed.append((index, error_info))
+    @property
+    def success_count(self) -> int:
+        """Number of successful operations."""
+        return len(self.successful)
+    @property
+    def failure_count(self) -> int:
+        """Number of failed operations."""
+        return len(self.failed)
+    @property
+    def success_rate(self) -> float:
+        """Success rate as a percentage."""
+        if self.total_count == 0:
+            return 0.0
+        return (self.success_count / self.total_count) * 100
+    def get_ordered_results(self) -> list[Any | None]:
+        """Get results in original order with None for failures."""
+        results = [None] * self.total_count
+        for index, result in self.successful:
+            results[index] = result
+        return results
+    def get_summary(self) -> dict[str, Any]:
+        """Get summary of batch operation."""
+        return {
+            "total": self.total_count,
+            "successful": self.success_count,
+            "failed": self.failure_count,
+            "success_rate": f"{self.success_rate:.1f}%",
+            "failures": [
+                {
+                    "index": idx,
+                    "error": info["error"]["type"],
+                    "message": info["error"]["message"],
+                }
+                for idx, info in self.failed
+            ],
+        }

kreuzberg/_utils/_pdf_lock.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
+from __future__ import annotations
+import hashlib
+import threading
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from weakref import WeakValueDictionary
+if TYPE_CHECKING:
+    from collections.abc import Generator
+# Re-entrant lock shared by every caller of pypdfium_lock().
+_PYPDFIUM_LOCK = threading.RLock()
+# Per-file locks keyed by the hash from _get_file_key(); values are weakly
+# referenced, so an entry goes away once its lock is no longer referenced elsewhere.
+_FILE_LOCKS_CACHE = WeakValueDictionary[str, threading.RLock]()
+# Guards lookups and insertions in _FILE_LOCKS_CACHE.
+_FILE_LOCKS_LOCK = threading.Lock()
+def _get_file_key(file_path: Path | str) -> str:
+    """Get a consistent key for a file path."""
+    path_str = str(Path(file_path).resolve())
+    return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
+def _get_file_lock(file_path: Path | str) -> threading.RLock:
+    """Get or create a lock for a specific file."""
+    file_key = _get_file_key(file_path)
+    with _FILE_LOCKS_LOCK:
+        if file_key in _FILE_LOCKS_CACHE:
+            return _FILE_LOCKS_CACHE[file_key]
+        lock = threading.RLock()
+        _FILE_LOCKS_CACHE[file_key] = lock
+        return lock
+@contextmanager
+def pypdfium_lock() -> Generator[None, None, None]:
+    """Context manager for thread-safe pypdfium2 operations.
+    This prevents segmentation faults on macOS where pypdfium2
+    is not fork-safe when used concurrently.
+    """
+    with _PYPDFIUM_LOCK:
+        yield
+@contextmanager
+def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
+    """Context manager for per-file pypdfium2 operations.
+    This allows concurrent processing of different files while
+    preventing segfaults. Document caching handles same-file issues.
+    """
+    lock = _get_file_lock(file_path)
+    with lock:
+        yield
+def with_pypdfium_lock(func: Any) -> Any:
+    """Decorator to wrap functions with pypdfium2 lock."""
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        with pypdfium_lock():
+            return func(*args, **kwargs)
+    return wrapper

kreuzberg/_utils/_process_pool.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Process pool utilities for CPU-intensive operations."""
+from __future__ import annotations
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any, Callable, TypeVar
+if TYPE_CHECKING:
+    from collections.abc import Generator
+# Return-type variable used by submit_to_process_pool().
+T = TypeVar("T")
+# Shared executor, created on first use; None until _init_process_pool()
+# runs and again after shutdown_process_pool().
+_PROCESS_POOL: ProcessPoolExecutor | None = None
+# Worker count: one fewer than the CPU count, but never below one.
+_POOL_SIZE = max(1, mp.cpu_count() - 1)
+def _init_process_pool() -> ProcessPoolExecutor:
+    """Initialize the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is None:
+        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
+    return _PROCESS_POOL
+@contextmanager
+def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
+    """Get the global process pool."""
+    pool = _init_process_pool()
+    try:
+        yield pool
+    except Exception:  # noqa: BLE001
+        shutdown_process_pool()
+        pool = _init_process_pool()
+        yield pool
+def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
+    """Submit a function to the process pool and wait for result."""
+    with process_pool() as pool:
+        future = pool.submit(func, *args, **kwargs)
+        return future.result()
+def shutdown_process_pool() -> None:
+    """Shutdown the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is not None:
+        _PROCESS_POOL.shutdown(wait=True)
+        _PROCESS_POOL = None
+def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
+    """Worker function for extracting PDF text in a separate process."""
+    import pypdfium2
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        text_parts = []
+        for page in pdf:
+            text_page = page.get_textpage()
+            text = text_page.get_text_range()
+            text_parts.append(text)
+            text_page.close()
+            page.close()
+        return (pdf_path, "".join(text_parts))
+    except Exception as e:  # noqa: BLE001
+        return (pdf_path, f"ERROR: {e}")
+    finally:
+        if pdf:
+            pdf.close()
+def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
+    """Worker function for converting PDF to images in a separate process."""
+    import io
+    import pypdfium2
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        image_bytes = []
+        for page in pdf:
+            bitmap = page.render(scale=scale)
+            pil_image = bitmap.to_pil()
+            img_bytes = io.BytesIO()
+            pil_image.save(img_bytes, format="PNG")
+            image_bytes.append(img_bytes.getvalue())
+            bitmap.close()
+            page.close()
+        return (pdf_path, image_bytes)
+    except Exception:  # noqa: BLE001
+        return (pdf_path, [])
+    finally:
+        if pdf:
+            pdf.close()

kreuzberg/_utils/_serialization.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Fast serialization utilities using msgspec."""
+from __future__ import annotations
+from dataclasses import asdict, is_dataclass
+from enum import Enum
+from typing import Any, TypeVar, cast
+from msgspec import MsgspecError
+from msgspec.msgpack import decode, encode
+T = TypeVar("T")
+def encode_hook(obj: Any) -> Any:
+    """Custom encoder for complex objects."""
+    if callable(obj):
+        return None
+    if isinstance(obj, Exception):
+        return {"message": str(obj), "type": type(obj).__name__}
+    for key in (
+        "to_dict",
+        "as_dict",
+        "dict",
+        "model_dump",
+        "json",
+        "to_list",
+        "tolist",
+    ):
+        if hasattr(obj, key) and callable(getattr(obj, key)):
+            return getattr(obj, key)()
+    if is_dataclass(obj) and not isinstance(obj, type):
+        return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
+    if hasattr(obj, "save") and hasattr(obj, "format"):
+        return None
+    raise TypeError(f"Unsupported type: {type(obj)!r}")
+def deserialize(value: str | bytes, target_type: type[T]) -> T:
+    """Deserialize bytes/string to target type.
+    Args:
+        value: Serialized data
+        target_type: Type to deserialize to
+    Returns:
+        Deserialized object
+    Raises:
+        ValueError: If deserialization fails
+    """
+    try:
+        return decode(cast("bytes", value), type=target_type, strict=False)
+    except MsgspecError as e:
+        raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
+def serialize(value: Any, **kwargs: Any) -> bytes:
+    """Serialize value to bytes.
+    Args:
+        value: Object to serialize
+        **kwargs: Additional data to merge with value if it's a dict
+    Returns:
+        Serialized bytes
+    Raises:
+        ValueError: If serialization fails
+    """
+    if isinstance(value, dict) and kwargs:
+        value = value | kwargs
+    try:
+        return encode(value, enc_hook=encode_hook)
+    except (MsgspecError, TypeError) as e:
+        raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e

kreuzberg/_utils/_string.py CHANGED Viewed

@@ -20,7 +20,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
-    for enc in [e for e in encodings if e]:  # pragma: no cover
+    for enc in [e for e in encodings if e]:
         with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)

kreuzberg/_utils/_sync.py CHANGED Viewed

@@ -119,3 +119,24 @@ def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs:
         T: The return value of the executed function, resolved if asynchronous.
     """
     return cast("T", fn(*args, **kwargs) if not iscoroutinefunction(fn) else anyio.run(partial(fn, **kwargs), *args))
+def run_sync_only(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Runs a function, but only if it's synchronous. Raises error if async.
+    This is used for pure sync code paths where we cannot handle async functions.
+    Args:
+        fn: The function to be executed, must be synchronous.
+        *args: Positional arguments to be passed to the function.
+        **kwargs: Keyword arguments to be passed to the function.
+    Returns:
+        T: The return value of the executed function.
+    Raises:
+        RuntimeError: If the function is asynchronous.
+    """
+    if iscoroutinefunction(fn):
+        raise RuntimeError(f"Cannot run async function {fn.__name__} in sync-only context")
+    return cast("T", fn(*args, **kwargs))

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl