kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py
ADDED
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class StructuredDataExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+        JSON_MIME_TYPE,
+        "text/json",
+        YAML_MIME_TYPE,
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        TOML_MIME_TYPE,
+        "text/toml",
+    }
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        text_content = safe_decode(content)
+
+        try:
+            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                data = json.loads(text_content)
+            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                try:
+                    import tomllib  # type: ignore[import-not-found]
+                except ImportError:
+                    try:
+                        import tomli as tomllib  # type: ignore[import-not-found]
+                    except ImportError:
+                        return ExtractionResult(
+                            content=normalize_spaces(text_content),
+                            mime_type=PLAIN_TEXT_MIME_TYPE,
+                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                            chunks=[],
+                        )
+                data = tomllib.loads(text_content)
+            else:
+                try:
+                    import yaml
+
+                    data = yaml.safe_load(text_content)
+                except ImportError:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "PyYAML not available, returning raw text"},
+                        chunks=[],
+                    )
+
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            if isinstance(data, dict):
+                text_parts.extend(self._extract_from_dict(data, metadata))
+            elif isinstance(data, list):
+                text_parts.extend(self._extract_from_list(data, metadata))
+            else:
+                text_parts.append(str(data))
+
+            combined_text = "\n".join(text_parts) if text_parts else text_content
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+            return ExtractionResult(
+                content=normalize_spaces(text_content),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={"parse_error": str(e)},
+                chunks=[],
+            )
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+
+    def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for key, value in data.items():
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, str) and value.strip():
+                text_parts.append(f"{full_key}: {value}")
+
+                if any(
+                    text_field in key.lower()
+                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                ):
+                    metadata[full_key] = value
+
+            elif isinstance(value, (int, float, bool)):
+                text_parts.append(f"{full_key}: {value}")
+
+            elif isinstance(value, dict):
+                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+            elif isinstance(value, list):
+                text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+            elif value is not None:
+                text_parts.append(f"{full_key}: {value!s}")
+
+        return text_parts
+
+    def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for i, item in enumerate(data):
+            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+            if isinstance(item, str) and item.strip():
+                text_parts.append(f"{item_key}: {item}")
+
+            elif isinstance(item, dict):
+                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+            elif isinstance(item, list):
+                text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+            elif item is not None:
+                text_parts.append(f"{item_key}: {item!s}")
+
+        return text_parts
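For orientation, a minimal usage sketch of the new extractor. The constructor call is an assumption (this diff does not show how the `Extractor` base class is built; a `mime_type` plus `ExtractionConfig` is assumed here), so treat the setup lines as illustrative:

import json

from kreuzberg import ExtractionConfig  # assumed top-level export
from kreuzberg._extractors._structured import StructuredDataExtractor

extractor = StructuredDataExtractor(mime_type="application/json", config=ExtractionConfig())

# Nested keys are flattened to dotted paths; keys containing "title", "name",
# "description", etc. are also copied into the result metadata.
payload = json.dumps({"title": "Q3 Report", "stats": {"pages": 12}}).encode()
result = extractor.extract_bytes_sync(payload)
print(result.content)   # expected (subject to whitespace normalization): "title: Q3 Report" / "stats.pages: 12"
print(result.metadata)  # expected to include {"title": "Q3 Report"}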
kreuzberg/_gmft.py
CHANGED
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal
 
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 
 if TYPE_CHECKING:
     from os import PathLike
@@ -196,9 +201,7 @@ async def extract_tables(  # noqa: PLR0915
 
     try:
         if use_isolated_process:
-
-
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)
 
             await table_cache.aset(result, **cache_kwargs)
 
@@ -210,7 +213,7 @@ async def extract_tables(  # noqa: PLR0915
         from gmft.formatters.tatr import TATRFormatConfig
         from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
            config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +229,7 @@ async def extract_tables(  # noqa: PLR0915
                 force_large_table_assumption=config.force_large_table_assumption,
             )
         )
-        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
             config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
         )
         doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +250,7 @@ async def extract_tables(  # noqa: PLR0915
                     text=data_frame.to_markdown(),
                     df=data_frame,
                 )
-                for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
             ]
 
             await table_cache.aset(result, **cache_kwargs)
@@ -314,9 +317,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]
 
     if use_isolated_process:
-
-
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)
 
        table_cache.set(result, **cache_kwargs)
 
@@ -365,7 +366,7 @@ def extract_tables_sync(
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         table_cache.set(result, **cache_kwargs)
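Both comprehensions now pass `strict=False` explicitly, pinning the historical truncating behavior of `zip()` while satisfying lint rules that require the keyword. A one-line illustration:

# strict=False (the historical default) truncates to the shorter input;
# strict=True would raise ValueError on a length mismatch (Python 3.10+).
pairs = list(zip([1, 2, 3], ["a", "b"], strict=False))  # [(1, 'a'), (2, 'b')]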
@@ -378,3 +379,309 @@ def extract_tables_sync(
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                import io
+
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
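The pattern behind these helpers generalizes: run crash-prone native code in a spawned child process and treat a missing result as a crash. A stripped-down sketch under that assumption (illustrative names only, not part of kreuzberg's API):

import multiprocessing as mp
import queue
import time
from typing import Any


def _risky_work(result_queue: Any) -> None:
    # Native code that may segfault would run here; report success via the queue.
    result_queue.put((True, "ok"))


def run_isolated(timeout: float = 30.0) -> Any:
    ctx = mp.get_context("spawn")  # fresh interpreter: no inherited locks or state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_risky_work, args=(result_queue,))
    process.start()
    deadline = time.time() + timeout
    try:
        while True:
            try:
                success, payload = result_queue.get_nowait()
                break
            except queue.Empty:
                if time.time() > deadline:
                    raise TimeoutError("child did not respond in time") from None
                if not process.is_alive():
                    # Child died without reporting; a negative exitcode is the signal number.
                    raise RuntimeError(f"child died with exit code {process.exitcode}")
                time.sleep(0.1)
        if not success:
            raise RuntimeError(f"child failed: {payload}")
        return payload
    finally:
        if process.is_alive():
            process.terminate()
            process.join(timeout=5)


if __name__ == "__main__":
    print(run_isolated())  # -> "ok"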
kreuzberg/_language_detection.py
ADDED
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+
+try:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+    from fast_langdetect import detect, detect_multilingual
+
+    HAS_FAST_LANGDETECT = True
+except ImportError:
+    HAS_FAST_LANGDETECT = False
+    detect = None
+    detect_multilingual = None
+    FastLangDetectConfig = None
+
+_CACHE_SIZE = 128
+
+
+@dataclass(frozen=True)
+class LanguageDetectionConfig:
+    """Configuration for language detection.
+
+    Attributes:
+        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+            Defaults to True for better memory efficiency.
+        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+        multilingual: If True, uses multilingual detection to handle mixed-language text.
+            If False, uses single language detection. Defaults to False.
+        cache_dir: Custom directory for model cache. If None, uses system default.
+        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+    """
+
+    low_memory: bool = True
+    top_k: int = 3
+    multilingual: bool = False
+    cache_dir: str | None = None
+    allow_fallback: bool = True
+
+
+def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+    """Create FastLangDetectConfig from our config."""
+    if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+        return None
+
+    kwargs: dict[str, Any] = {
+        "allow_fallback": config.allow_fallback,
+    }
+    if config.cache_dir is not None:
+        kwargs["cache_dir"] = config.cache_dir
+
+    return FastLangDetectConfig(**kwargs)
+
+
+@lru_cache(maxsize=_CACHE_SIZE)
+def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+    """Detect the most probable languages in the given text using fast-langdetect.
+
+    Args:
+        text: The text to analyze.
+        config: Configuration for language detection. If None, uses defaults.
+
+    Returns:
+        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+        or None if detection fails.
+
+    Raises:
+        MissingDependencyError: If fast-langdetect is not installed.
+    """
+    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+        )
+
+    if config is None:
+        config = LanguageDetectionConfig()
+
+    try:
+        if config.multilingual:
+            results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+
+            return [result["lang"].lower() for result in results if result.get("lang")]
+
+        result = detect(text, low_memory=config.low_memory)
+        if result and result.get("lang"):
+            return [result["lang"].lower()]
+        return None
+    except Exception:  # noqa: BLE001
+        return None
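A quick usage sketch for the new helper, using only names defined in this file (requires the optional `langdetect` dependency group; the exact codes returned depend on the fast-langdetect model):

from kreuzberg._language_detection import LanguageDetectionConfig, detect_languages

config = LanguageDetectionConfig(multilingual=True, top_k=2)
langs = detect_languages("Das ist ein Test. This is a test.", config)
print(langs)  # e.g. ['de', 'en']

Because `LanguageDetectionConfig` is a frozen (hashable) dataclass, it can be passed to the `lru_cache`-wrapped `detect_languages`, so repeated calls with the same text and config hit the cache.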