kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mime_types.py CHANGED
@@ -161,27 +161,48 @@ def validate_mime_type(
     Returns:
         The validated MIME type.
     """
-    if file_path and check_file_exists:
-        path = Path(file_path)
-        if not path.exists():
-            raise ValidationError("The file does not exist", context={"file_path": str(path)})
+    if mime_type:
+        return _validate_explicit_mime_type(mime_type)
+
+    if file_path:
+        from kreuzberg._utils._cache import get_mime_cache
 
-    if not mime_type:
-        if not file_path:
-            raise ValidationError(
-                "Could not determine mime type.",
-            )
         path = Path(file_path)
 
-        ext = path.suffix.lower()
-        mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
+        try:
+            stat = path.stat() if check_file_exists else None
+            file_info = {
+                "path": str(path.resolve()),
+                "size": stat.st_size if stat else 0,
+                "mtime": stat.st_mtime if stat else 0,
+                "check_file_exists": check_file_exists,
+            }
+        except OSError:
+            file_info = {
+                "path": str(path),
+                "size": 0,
+                "mtime": 0,
+                "check_file_exists": check_file_exists,
+            }
+
+        cache_kwargs = {"file_info": str(sorted(file_info.items())), "detector": "mime_type"}
+
+        mime_cache = get_mime_cache()
+        cached_result = mime_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+        detected_mime_type = _detect_mime_type_uncached(file_path, check_file_exists)
+
+        mime_cache.set(detected_mime_type, **cache_kwargs)
+
+        return detected_mime_type
+
+    return _detect_mime_type_uncached(file_path, check_file_exists)
 
-    if not mime_type:  # pragma: no cover
-        raise ValidationError(
-            "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
-            context={"input_file": str(path), "extension": ext},
-        )
 
+def _validate_explicit_mime_type(mime_type: str) -> str:
+    """Validate an explicitly provided MIME type."""
     if mime_type in SUPPORTED_MIME_TYPES:
         return mime_type
 
@@ -193,3 +214,28 @@ def validate_mime_type(
             f"Unsupported mime type: {mime_type}",
             context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
         )
+
+
+def _detect_mime_type_uncached(file_path: PathLike[str] | str | None = None, check_file_exists: bool = True) -> str:
+    """Detect MIME type without caching (internal function)."""
+    if file_path and check_file_exists:
+        path = Path(file_path)
+        if not path.exists():
+            raise ValidationError("The file does not exist", context={"file_path": str(path)})
+
+    if not file_path:
+        raise ValidationError(
+            "Could not determine mime type.",
+        )
+
+    path = Path(file_path)
+    ext = path.suffix.lower()
+    mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
+
+    if not mime_type:  # pragma: no cover
+        raise ValidationError(
+            "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
+            context={"input_file": str(path), "extension": ext},
+        )
+
+    return _validate_explicit_mime_type(mime_type)
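Taken together, the refactor gives validate_mime_type three paths: an explicit MIME type is validated directly, a path-based lookup goes through the new MIME cache keyed on resolved path, size, and mtime, and everything else falls back to uncached detection. A minimal usage sketch against the signature shown above (the file name is illustrative):

    from kreuzberg._mime_types import validate_mime_type

    # Explicit MIME type: validated against SUPPORTED_MIME_TYPES, no file access.
    mime = validate_mime_type(mime_type="application/pdf")

    # Path-based: stat() feeds the cache key, so a second call on an
    # unchanged file is served from the cache instead of re-detecting.
    mime = validate_mime_type(file_path="report.docx")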
kreuzberg/_multiprocessing/__init__.py ADDED
@@ -0,0 +1,6 @@
+"""Multiprocessing utilities for kreuzberg."""
+
+from .process_manager import ProcessPoolManager
+from .tesseract_pool import TesseractProcessPool
+
+__all__ = ["ProcessPoolManager", "TesseractProcessPool"]
kreuzberg/_multiprocessing/gmft_isolated.py ADDED
@@ -0,0 +1,332 @@
+"""Isolated GMFT table extraction to handle segmentation faults."""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import pickle
+import queue
+import signal
+import traceback
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from os import PathLike
+
+    from kreuzberg._gmft import GMFTConfig
+    from kreuzberg._types import TableData
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        from kreuzberg._gmft import GMFTConfig
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(  # type: ignore[no-untyped-call]
+            config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+        )
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables):
+                import io
+
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_pickle": pickle.dumps(data_frame),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    from kreuzberg._gmft import GMFTConfig
+    from kreuzberg._types import TableData
+    from kreuzberg.exceptions import ParsingError
+
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+                import pickle
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+
+    from kreuzberg._gmft import GMFTConfig
+    from kreuzberg._types import TableData
+    from kreuzberg.exceptions import ParsingError
+
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+                import pickle
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
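Both entry points share the same contract: GMFT inference runs in a spawn-context child process, and a crash there (including a segmentation fault) surfaces to the caller as a ParsingError instead of taking down the interpreter. A minimal caller sketch, assuming a local sample.pdf and the default configuration:

    import anyio

    from kreuzberg._multiprocessing.gmft_isolated import (
        extract_tables_isolated,
        extract_tables_isolated_async,
    )

    # Synchronous: polls the child for up to `timeout` seconds.
    tables = extract_tables_isolated("sample.pdf", timeout=120.0)
    print(f"extracted {len(tables)} tables")

    # Asynchronous: identical semantics, but waits with anyio.sleep so the
    # event loop stays responsive while the child process works.
    async def main() -> None:
        tables = await extract_tables_isolated_async("sample.pdf", timeout=120.0)
        print(f"extracted {len(tables)} tables")

    anyio.run(main)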
kreuzberg/_multiprocessing/process_manager.py ADDED
@@ -0,0 +1,188 @@
+"""Process pool manager for resource-aware multiprocessing."""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
+from typing import TYPE_CHECKING, Any, Callable, TypeVar
+
+import anyio
+import psutil
+from typing_extensions import Self
+
+if TYPE_CHECKING:
+    import types
+
+T = TypeVar("T")
+
+
+class ProcessPoolManager:
+    """Resource-aware process pool manager for CPU-intensive tasks."""
+
+    def __init__(
+        self,
+        max_processes: int | None = None,
+        memory_limit_gb: float | None = None,
+    ) -> None:
+        """Initialize the process pool manager.
+
+        Args:
+            max_processes: Maximum number of processes. Defaults to CPU count.
+            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
+        """
+        self.max_processes = max_processes or mp.cpu_count()
+
+        if memory_limit_gb is None:
+            available_memory = psutil.virtual_memory().available
+            self.memory_limit_bytes = int(available_memory * 0.75)  # Use 75% of available  # ~keep
+        else:
+            self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
+
+        self._executor: ProcessPoolExecutor | None = None
+        self._active_tasks = 0
+
+    def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
+        """Calculate optimal number of workers based on memory constraints.
+
+        Args:
+            task_memory_mb: Estimated memory usage per task in MB.
+
+        Returns:
+            Optimal number of workers.
+        """
+        task_memory_bytes = task_memory_mb * 1024**2
+        memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
+
+        return min(self.max_processes, memory_based_limit)
+
+    def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
+        """Ensure process pool executor is initialized."""
+        if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
+            if self._executor is not None:
+                self._executor.shutdown(wait=False)
+
+            workers = max_workers or self.max_processes
+            self._executor = ProcessPoolExecutor(max_workers=workers)
+
+        return self._executor
+
+    async def submit_task(
+        self,
+        func: Callable[..., T],
+        *args: Any,
+        task_memory_mb: float = 100,
+    ) -> T:
+        """Submit a task to the process pool.
+
+        Args:
+            func: Function to execute.
+            *args: Positional arguments for the function.
+            task_memory_mb: Estimated memory usage in MB.
+
+        Returns:
+            Result of the function execution.
+        """
+        workers = self.get_optimal_workers(task_memory_mb)
+        self._ensure_executor(workers)
+
+        self._active_tasks += 1
+
+        try:
+            return await anyio.to_thread.run_sync(func, *args)
+        finally:
+            self._active_tasks -= 1
+
+    async def submit_batch(
+        self,
+        func: Callable[..., T],
+        arg_batches: list[tuple[Any, ...]],
+        task_memory_mb: float = 100,
+        max_concurrent: int | None = None,
+    ) -> list[T]:
+        """Submit a batch of tasks to the process pool.
+
+        Args:
+            func: Function to execute.
+            arg_batches: List of argument tuples for each task.
+            task_memory_mb: Estimated memory usage per task in MB.
+            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
+
+        Returns:
+            List of results in the same order as input.
+        """
+        if not arg_batches:
+            return []
+
+        workers = self.get_optimal_workers(task_memory_mb)
+        max_concurrent = max_concurrent or workers
+
+        self._ensure_executor(workers)
+
+        semaphore = anyio.CapacityLimiter(max_concurrent)
+
+        async def submit_single(args: tuple[Any, ...]) -> T:
+            async with semaphore:
+                self._active_tasks += 1
+                try:
+                    return await anyio.to_thread.run_sync(func, *args)
+                finally:
+                    self._active_tasks -= 1
+
+        async with anyio.create_task_group() as tg:
+            results: list[T] = [None] * len(arg_batches)  # type: ignore[list-item]
+
+            async def run_task(idx: int, args: tuple[Any, ...]) -> None:
+                results[idx] = await submit_single(args)
+
+            for idx, args in enumerate(arg_batches):
+                tg.start_soon(run_task, idx, args)
+
+        return results
+
+    def get_system_info(self) -> dict[str, Any]:
+        """Get current system resource information."""
+        memory = psutil.virtual_memory()
+        cpu_percent = psutil.cpu_percent(interval=1)
+
+        return {
+            "cpu_count": mp.cpu_count(),
+            "cpu_percent": cpu_percent,
+            "memory_total": memory.total,
+            "memory_available": memory.available,
+            "memory_percent": memory.percent,
+            "active_tasks": self._active_tasks,
+            "max_processes": self.max_processes,
+            "memory_limit": self.memory_limit_bytes,
+        }
+
+    def shutdown(self, wait: bool = True) -> None:
+        """Shutdown the process pool."""
+        if self._executor is not None:
+            self._executor.shutdown(wait=wait)
+            self._executor = None
+
+    def __enter__(self) -> Self:
+        """Context manager entry."""
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: types.TracebackType | None,
+    ) -> None:
+        """Context manager exit."""
+        self.shutdown()
+
+    async def __aenter__(self) -> Self:
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: types.TracebackType | None,
+    ) -> None:
+        """Async context manager exit."""
+        self.shutdown()
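Worker sizing is the heart of the manager: get_optimal_workers caps the pool at min(max_processes, memory_limit_bytes / task_memory_bytes), so a 2 GB budget with 100 MB tasks allows int(2048 / 100) = 20 memory-bound workers, and on most machines the CPU count binds first. A minimal sketch of the batch API (square is an illustrative stand-in for a real CPU-bound task):

    import anyio

    from kreuzberg._multiprocessing import ProcessPoolManager


    def square(n: int) -> int:
        # Stand-in for a CPU-bound task.
        return n * n


    async def main() -> None:
        async with ProcessPoolManager(memory_limit_gb=2.0) as pool:
            # Results come back in input order regardless of completion order.
            results = await pool.submit_batch(square, [(i,) for i in range(10)])
            print(results)


    anyio.run(main)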