PyPI - kreuzberg - Versions diffs - 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl - Mend

kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

kreuzberg/__init__.py +4 -0
kreuzberg/_api/main.py +22 -1
kreuzberg/_config.py +404 -0
kreuzberg/_entity_extraction.py +4 -5
kreuzberg/_extractors/_base.py +3 -5
kreuzberg/_extractors/_image.py +18 -32
kreuzberg/_extractors/_pandoc.py +3 -14
kreuzberg/_extractors/_pdf.py +39 -57
kreuzberg/_extractors/_spread_sheet.py +2 -3
kreuzberg/_extractors/_structured.py +10 -7
kreuzberg/_gmft.py +314 -10
kreuzberg/_language_detection.py +1 -1
kreuzberg/_mcp/server.py +58 -8
kreuzberg/_ocr/__init__.py +1 -22
kreuzberg/_ocr/_base.py +59 -0
kreuzberg/_ocr/_easyocr.py +92 -1
kreuzberg/_ocr/_paddleocr.py +90 -1
kreuzberg/_ocr/_tesseract.py +556 -5
kreuzberg/_playa.py +2 -3
kreuzberg/_types.py +46 -24
kreuzberg/_utils/_cache.py +35 -4
kreuzberg/_utils/_device.py +10 -20
kreuzberg/_utils/_errors.py +44 -45
kreuzberg/_utils/_process_pool.py +2 -6
kreuzberg/_utils/_quality.py +7 -11
kreuzberg/_utils/_serialization.py +21 -16
kreuzberg/_utils/_string.py +22 -12
kreuzberg/_utils/_table.py +3 -4
kreuzberg/cli.py +4 -5
kreuzberg/exceptions.py +10 -0
kreuzberg/extraction.py +6 -24
kreuzberg-3.8.2.dist-info/METADATA +265 -0
kreuzberg-3.8.2.dist-info/RECORD +53 -0
kreuzberg/_cli_config.py +0 -175
kreuzberg/_multiprocessing/__init__.py +0 -5
kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
kreuzberg/_ocr/_pool.py +0 -357
kreuzberg/_ocr/_sync.py +0 -566
kreuzberg-3.8.0.dist-info/METADATA +0 -313
kreuzberg-3.8.0.dist-info/RECORD +0 -57
{kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
{kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_multiprocessing/gmft_isolated.py DELETED Viewed

@@ -1,330 +0,0 @@
-"""Isolated GMFT table extraction to handle segmentation faults."""
-from __future__ import annotations
-import multiprocessing as mp
-import pickle
-import queue
-import signal
-import traceback
-from typing import TYPE_CHECKING, Any
-if TYPE_CHECKING:
-    from os import PathLike
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-def _extract_tables_in_process(
-    file_path: str | PathLike[str],
-    config_dict: dict[str, Any],
-    result_queue: queue.Queue[tuple[bool, Any]],
-) -> None:
-    """Extract tables in an isolated process to handle potential segfaults.
-    Args:
-        file_path: Path to the PDF file
-        config_dict: Serialized GMFTConfig as a dict
-        result_queue: Queue to put results or errors
-    """
-    signal.signal(signal.SIGINT, signal.SIG_IGN)
-    try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
-        from kreuzberg._gmft import GMFTConfig
-        config = GMFTConfig(**config_dict)
-        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
-            config=TATRFormatConfig(
-                verbosity=config.verbosity,
-                formatter_base_threshold=config.formatter_base_threshold,
-                cell_required_confidence=config.cell_required_confidence,
-                remove_null_rows=config.remove_null_rows,
-                enable_multi_header=config.enable_multi_header,
-                semantic_spanning_cells=config.semantic_spanning_cells,
-                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
-                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
-                large_table_threshold=config.large_table_threshold,
-                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
-                large_table_maximum_rows=config.large_table_maximum_rows,
-                force_large_table_assumption=config.force_large_table_assumption,
-            )
-        )
-        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
-        doc = PyPDFium2Document(str(file_path))
-        cropped_tables = []
-        dataframes = []
-        try:
-            for page in doc:
-                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
-            for cropped_table in cropped_tables:
-                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
-                dataframes.append(formatted_table.df())
-            results = []
-            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
-                import io
-                img_bytes = io.BytesIO()
-                cropped_image = cropped_table.image()
-                cropped_image.save(img_bytes, format="PNG")
-                img_bytes.seek(0)
-                results.append(
-                    {
-                        "cropped_image_bytes": img_bytes.getvalue(),
-                        "page_number": cropped_table.page.page_number,
-                        "text": data_frame.to_markdown(),
-                        "df_pickle": pickle.dumps(data_frame),
-                    }
-                )
-            result_queue.put((True, results))
-        finally:
-            doc.close()  # type: ignore[no-untyped-call]
-    except Exception as e:  # noqa: BLE001
-        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
-        result_queue.put((False, error_info))
-def extract_tables_isolated(
-    file_path: str | PathLike[str],
-    config: GMFTConfig | None = None,
-    timeout: float = 300.0,
-) -> list[TableData]:
-    """Extract tables using an isolated process to handle segfaults.
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-    Returns:
-        List of extracted tables
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-    from kreuzberg.exceptions import ParsingError
-    config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
-    ctx = mp.get_context("spawn")
-    result_queue = ctx.Queue()
-    process = ctx.Process(
-        target=_extract_tables_in_process,
-        args=(str(file_path), config_dict, result_queue),
-    )
-    process.start()
-    try:
-        # Wait for result with timeout, checking for process death  # ~keep
-        import time
-        start_time = time.time()
-        while True:
-            try:
-                success, result = result_queue.get_nowait()
-                break
-            except queue.Empty:
-                if time.time() - start_time > timeout:
-                    raise
-                if not process.is_alive():
-                    # Process died without putting result  # ~keep
-                    if process.exitcode == -signal.SIGSEGV:
-                        raise ParsingError(
-                            "GMFT process crashed with segmentation fault",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
-                        ) from None
-                    raise ParsingError(
-                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                        context={
-                            "file_path": str(file_path),
-                            "exit_code": process.exitcode,
-                        },
-                    ) from None
-                time.sleep(0.1)
-        if success:
-            tables = []
-            for table_dict in result:
-                import io
-                import pickle
-                from PIL import Image
-                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
-                tables.append(
-                    TableData(
-                        cropped_image=img,
-                        page_number=table_dict["page_number"],
-                        text=table_dict["text"],
-                        df=df,
-                    )
-                )
-            return tables
-        error_info = result
-        raise ParsingError(
-            f"GMFT table extraction failed: {error_info['error']}",
-            context={
-                "file_path": str(file_path),
-                "error_type": error_info["type"],
-                "traceback": error_info["traceback"],
-            },
-        )
-    except queue.Empty as e:
-        raise ParsingError(
-            "GMFT table extraction timed out",
-            context={
-                "file_path": str(file_path),
-                "timeout": timeout,
-            },
-        ) from e
-    finally:
-        if process.is_alive():
-            process.terminate()
-            process.join(timeout=5)
-            if process.is_alive():
-                process.kill()
-                process.join()
-async def extract_tables_isolated_async(
-    file_path: str | PathLike[str],
-    config: GMFTConfig | None = None,
-    timeout: float = 300.0,
-) -> list[TableData]:
-    """Async version of extract_tables_isolated using asyncio.
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-    Returns:
-        List of extracted tables
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
-    import anyio
-    from kreuzberg._gmft import GMFTConfig
-    from kreuzberg._types import TableData
-    from kreuzberg.exceptions import ParsingError
-    config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
-    ctx = mp.get_context("spawn")
-    result_queue = ctx.Queue()
-    process = ctx.Process(
-        target=_extract_tables_in_process,
-        args=(str(file_path), config_dict, result_queue),
-    )
-    process.start()
-    try:
-        async def wait_for_result() -> tuple[bool, Any]:
-            while True:
-                try:
-                    return result_queue.get_nowait()  # type: ignore[no-any-return]
-                except queue.Empty:  # noqa: PERF203
-                    await anyio.sleep(0.1)
-                    if not process.is_alive():
-                        # Process died without putting result  # ~keep
-                        if process.exitcode == -signal.SIGSEGV:
-                            raise ParsingError(
-                                "GMFT process crashed with segmentation fault",
-                                context={
-                                    "file_path": str(file_path),
-                                    "exit_code": process.exitcode,
-                                },
-                            ) from None
-                        raise ParsingError(
-                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
-                        ) from None
-        with anyio.fail_after(timeout):
-            success, result = await wait_for_result()
-        if success:
-            tables = []
-            for table_dict in result:
-                import io
-                import pickle
-                from PIL import Image
-                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                df = pickle.loads(table_dict["df_pickle"])  # noqa: S301
-                tables.append(
-                    TableData(
-                        cropped_image=img,
-                        page_number=table_dict["page_number"],
-                        text=table_dict["text"],
-                        df=df,
-                    )
-                )
-            return tables
-        error_info = result
-        raise ParsingError(
-            f"GMFT table extraction failed: {error_info['error']}",
-            context={
-                "file_path": str(file_path),
-                "error_type": error_info["type"],
-                "traceback": error_info["traceback"],
-            },
-        )
-    except TimeoutError as e:
-        raise ParsingError(
-            "GMFT table extraction timed out",
-            context={
-                "file_path": str(file_path),
-                "timeout": timeout,
-            },
-        ) from e
-    finally:
-        if process.is_alive():
-            process.terminate()
-            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
-            if process.is_alive():
-                process.kill()
-                await anyio.to_thread.run_sync(process.join)

kreuzberg/_ocr/_pool.py DELETED Viewed

@@ -1,357 +0,0 @@
-"""Process pools for parallel OCR processing."""
-from __future__ import annotations
-from pathlib import Path
-from typing import TYPE_CHECKING, Any
-from PIL import Image
-from typing_extensions import Self
-from kreuzberg._ocr._tesseract import TesseractConfig
-from kreuzberg._types import ExtractionResult
-from kreuzberg._utils._process_pool import ProcessPoolManager
-if TYPE_CHECKING:
-    import types
-def _process_image_with_tesseract(
-    image_path: str,
-    config_dict: dict[str, Any],
-) -> dict[str, Any]:
-    """Process a single image with Tesseract in a separate process.
-    This function is designed to be pickled and executed in a subprocess.
-    It uses direct tesseract command execution to avoid async complications.
-    Args:
-        image_path: Path to the image file.
-        config_dict: Tesseract configuration as dictionary.
-    Returns:
-        OCR result as dictionary.
-    """
-    try:
-        import os
-        import subprocess
-        import tempfile
-        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
-            output_base = tmp_file.name.replace(".txt", "")
-        try:
-            language = config_dict.get("language", "eng")
-            psm = config_dict.get("psm", 3)
-            command = [
-                "tesseract",
-                image_path,
-                output_base,
-                "-l",
-                language,
-                "--psm",
-                str(psm),
-                "--oem",
-                "1",
-                "--loglevel",
-                "OFF",
-            ]
-            boolean_options = [
-                "classify_use_pre_adapted_templates",
-                "language_model_ngram_on",
-                "tessedit_dont_blkrej_good_wds",
-                "tessedit_dont_rowrej_good_wds",
-                "tessedit_enable_dict_correction",
-                "tessedit_use_primary_params_model",
-                "textord_space_size_is_variable",
-                "thresholding_method",
-            ]
-            for option in boolean_options:
-                if option in config_dict:
-                    value = 1 if config_dict[option] else 0
-                    command.extend(["-c", f"{option}={value}"])
-            env = os.environ.copy()
-            env["OMP_THREAD_LIMIT"] = "1"
-            result = subprocess.run(
-                command,
-                check=False,
-                env=env,
-                capture_output=True,
-                text=True,
-                timeout=30,
-            )
-            if result.returncode != 0:
-                raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
-            output_file = output_base + ".txt"
-            with Path(output_file).open(encoding="utf-8") as f:
-                text = f.read()
-            from kreuzberg._utils._string import normalize_spaces
-            text = normalize_spaces(text)
-            return {
-                "success": True,
-                "text": text,
-                "confidence": None,
-                "error": None,
-            }
-        finally:
-            for ext in [".txt"]:
-                temp_file = output_base + ext
-                temp_path = Path(temp_file)
-                if temp_path.exists():
-                    temp_path.unlink()
-    except Exception as e:  # noqa: BLE001
-        return {
-            "success": False,
-            "text": "",
-            "confidence": None,
-            "error": str(e),
-        }
-def _process_image_bytes_with_tesseract(
-    image_bytes: bytes,
-    config_dict: dict[str, Any],
-) -> dict[str, Any]:
-    """Process image bytes with Tesseract in a separate process.
-    Args:
-        image_bytes: Image data as bytes.
-        config_dict: Tesseract configuration as dictionary.
-    Returns:
-        OCR result as dictionary.
-    """
-    try:
-        import io
-        import tempfile
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
-            with Image.open(io.BytesIO(image_bytes)) as image:
-                image.save(tmp_image.name, format="PNG")
-            image_path = tmp_image.name
-        try:
-            return _process_image_with_tesseract(image_path, config_dict)
-        finally:
-            image_file = Path(image_path)
-            if image_file.exists():
-                image_file.unlink()
-    except Exception as e:  # noqa: BLE001
-        return {
-            "success": False,
-            "text": "",
-            "confidence": None,
-            "error": str(e),
-        }
-class TesseractProcessPool:
-    """Process pool for parallel Tesseract OCR processing."""
-    def __init__(
-        self,
-        config: TesseractConfig | None = None,
-        max_processes: int | None = None,
-        memory_limit_gb: float | None = None,
-    ) -> None:
-        """Initialize the Tesseract process pool.
-        Args:
-            config: Default Tesseract configuration.
-            max_processes: Maximum number of processes.
-            memory_limit_gb: Memory limit in GB.
-        """
-        self.config = config or TesseractConfig()
-        self.process_manager = ProcessPoolManager(
-            max_processes=max_processes,
-            memory_limit_gb=memory_limit_gb,
-        )
-    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
-        """Convert TesseractConfig to dictionary for pickling."""
-        cfg = config or self.config
-        config_dict = {}
-        for field_name in cfg.__dataclass_fields__:
-            value = getattr(cfg, field_name)
-            if hasattr(value, "value"):
-                config_dict[field_name] = value.value
-            else:
-                config_dict[field_name] = value
-        return config_dict
-    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
-        """Convert result dictionary back to OCRResult."""
-        if not result_dict["success"]:
-            from kreuzberg.exceptions import OCRError
-            raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
-        from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
-        return ExtractionResult(
-            content=result_dict["text"],
-            mime_type=PLAIN_TEXT_MIME_TYPE,
-            metadata={"confidence": result_dict["confidence"]} if result_dict["confidence"] else {},  # type: ignore[typeddict-unknown-key]
-            chunks=[],
-        )
-    async def process_image(
-        self,
-        image_path: str | Path,
-        config: TesseractConfig | None = None,
-    ) -> ExtractionResult:
-        """Process a single image file with Tesseract.
-        Args:
-            image_path: Path to the image file.
-            config: Tesseract configuration (uses default if None).
-        Returns:
-            OCR result.
-        """
-        config_dict = self._config_to_dict(config)
-        task_memory_mb = 80
-        result_dict = await self.process_manager.submit_task(
-            _process_image_with_tesseract,
-            str(image_path),
-            config_dict,
-            task_memory_mb=task_memory_mb,
-        )
-        return self._result_from_dict(result_dict)
-    async def process_image_bytes(
-        self,
-        image_bytes: bytes,
-        config: TesseractConfig | None = None,
-    ) -> ExtractionResult:
-        """Process image bytes with Tesseract.
-        Args:
-            image_bytes: Image data as bytes.
-            config: Tesseract configuration (uses default if None).
-        Returns:
-            OCR result.
-        """
-        config_dict = self._config_to_dict(config)
-        image_size_mb = len(image_bytes) / 1024 / 1024
-        task_memory_mb = max(80, image_size_mb * 2 + 50)
-        result_dict = await self.process_manager.submit_task(
-            _process_image_bytes_with_tesseract,
-            image_bytes,
-            config_dict,
-            task_memory_mb=task_memory_mb,
-        )
-        return self._result_from_dict(result_dict)
-    async def process_batch_images(
-        self,
-        image_paths: list[str | Path],
-        config: TesseractConfig | None = None,
-        max_concurrent: int | None = None,
-    ) -> list[ExtractionResult]:
-        """Process a batch of images in parallel.
-        Args:
-            image_paths: List of image file paths.
-            config: Tesseract configuration (uses default if None).
-            max_concurrent: Maximum concurrent processes.
-        Returns:
-            List of OCR results in the same order as input.
-        """
-        if not image_paths:
-            return []
-        config_dict = self._config_to_dict(config)
-        arg_batches = [(str(path), config_dict) for path in image_paths]
-        task_memory_mb = 80
-        result_dicts = await self.process_manager.submit_batch(
-            _process_image_with_tesseract,
-            arg_batches,
-            task_memory_mb=task_memory_mb,
-            max_concurrent=max_concurrent,
-        )
-        return [self._result_from_dict(result_dict) for result_dict in result_dicts]
-    async def process_batch_bytes(
-        self,
-        image_bytes_list: list[bytes],
-        config: TesseractConfig | None = None,
-        max_concurrent: int | None = None,
-    ) -> list[ExtractionResult]:
-        """Process a batch of image bytes in parallel.
-        Args:
-            image_bytes_list: List of image data as bytes.
-            config: Tesseract configuration (uses default if None).
-            max_concurrent: Maximum concurrent processes.
-        Returns:
-            List of OCR results in the same order as input.
-        """
-        if not image_bytes_list:
-            return []
-        config_dict = self._config_to_dict(config)
-        arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]
-        avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
-        task_memory_mb = max(80, avg_image_size_mb * 2 + 50)
-        result_dicts = await self.process_manager.submit_batch(
-            _process_image_bytes_with_tesseract,
-            arg_batches,
-            task_memory_mb=task_memory_mb,
-            max_concurrent=max_concurrent,
-        )
-        return [self._result_from_dict(result_dict) for result_dict in result_dicts]
-    def get_system_info(self) -> dict[str, Any]:
-        """Get system information from the process manager."""
-        return self.process_manager.get_system_info()
-    def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
-        self.process_manager.shutdown(wait=wait)
-    async def __aenter__(self) -> Self:
-        """Async context manager entry."""
-        return self
-    async def __aexit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc_val: BaseException | None,
-        exc_tb: types.TracebackType | None,
-    ) -> None:
-        """Async context manager exit."""
-        self.shutdown()

kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl

kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl