PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_config.py +248 -204
kreuzberg/_document_classification.py +0 -8
kreuzberg/_entity_extraction.py +1 -93
kreuzberg/_extractors/_base.py +0 -5
kreuzberg/_extractors/_email.py +1 -11
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -23
kreuzberg/_extractors/_pandoc.py +10 -89
kreuzberg/_extractors/_pdf.py +39 -92
kreuzberg/_extractors/_presentation.py +0 -17
kreuzberg/_extractors/_spread_sheet.py +13 -53
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -138
kreuzberg/_language_detection.py +1 -22
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -2
kreuzberg/_ocr/_easyocr.py +21 -108
kreuzberg/_ocr/_paddleocr.py +16 -94
kreuzberg/_ocr/_table_extractor.py +260 -0
kreuzberg/_ocr/_tesseract.py +906 -264
kreuzberg/_playa.py +5 -4
kreuzberg/_types.py +638 -40
kreuzberg/_utils/_cache.py +88 -90
kreuzberg/_utils/_device.py +0 -18
kreuzberg/_utils/_document_cache.py +0 -2
kreuzberg/_utils/_errors.py +0 -3
kreuzberg/_utils/_pdf_lock.py +0 -2
kreuzberg/_utils/_process_pool.py +19 -19
kreuzberg/_utils/_quality.py +0 -43
kreuzberg/_utils/_ref.py +48 -0
kreuzberg/_utils/_serialization.py +0 -5
kreuzberg/_utils/_string.py +9 -39
kreuzberg/_utils/_sync.py +0 -1
kreuzberg/_utils/_table.py +50 -57
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
kreuzberg-3.13.0.dist-info/RECORD +56 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_gmft.py CHANGED Viewed

@@ -7,16 +7,17 @@ import queue
 import signal
 import time
 import traceback
-from dataclasses import dataclass, field
 from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any
 import anyio
 import msgspec
+import pandas as pd
 from PIL import Image
-from kreuzberg._types import TableData
+from kreuzberg._types import GMFTConfig, TableData
+from kreuzberg._utils._cache import get_table_cache
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError
@@ -27,117 +28,6 @@ if TYPE_CHECKING:
     from pandas import DataFrame
-@dataclass(unsafe_hash=True, slots=True)
-class GMFTConfig:
-    """Configuration options for GMFT.
-    This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
-    """
-    verbosity: int = 0
-    """
-    Verbosity level for logging.
-    0: errors only
-    1: print warnings
-    2: print warnings and info
-    3: print warnings, info, and debug
-    """
-    formatter_base_threshold: float = 0.3
-    """
-    Base threshold for the confidence demanded of a table feature (row/column).
-    Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
-    """
-    cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
-        default_factory=lambda: {
-            0: 0.3,
-            1: 0.3,
-            2: 0.3,
-            3: 0.3,
-            4: 0.5,
-            5: 0.5,
-            6: 99,
-        },
-        hash=False,
-    )
-    """
-    Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
-    But low confidences may be better than too high confidence (see formatter_base_threshold)
-    """
-    detector_base_threshold: float = 0.9
-    """Minimum confidence score required for a table"""
-    remove_null_rows: bool = True
-    """
-    Flag to remove rows with no text.
-    """
-    enable_multi_header: bool = False
-    """
-    Enable multi-indices in the dataframe.
-    If false, then multiple headers will be merged column-wise.
-    """
-    semantic_spanning_cells: bool = False
-    """
-    [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
-    """
-    semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
-    """
-    [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
-    Possible values: 'algorithm', 'deep', None.
-    'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
-    'deep': merges headers according to the spanning cells detected by the Table Transformer.
-    None: headers are not duplicated.
-    """
-    large_table_if_n_rows_removed: int = 8
-    """
-    If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
-    """
-    large_table_threshold: int = 10
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
-    """
-    large_table_row_overlap_threshold: float = 0.2
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
-    """
-    large_table_maximum_rows: int = 1000
-    """
-    Maximum number of rows allowed for a large table.
-    """
-    force_large_table_assumption: bool | None = None
-    """
-    Force the large table assumption to be applied, regardless of the number of rows and overlap.
-    """
-    total_overlap_reject_threshold: float = 0.9
-    """
-    Reject if total overlap is > 90% of table area.
-    """
-    total_overlap_warn_threshold: float = 0.1
-    """
-    Warn if total overlap is > 10% of table area.
-    """
-    nms_warn_threshold: int = 5
-    """
-    Warn if non maxima suppression removes > 5 rows.
-    """
-    iob_reject_threshold: float = 0.05
-    """
-    Reject if iob between textbox and cell is < 5%.
-    """
-    iob_warn_threshold: float = 0.5
-    """
-    Warn if iob between textbox and cell is < 50%.
-    """
 async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
@@ -158,8 +48,6 @@ async def extract_tables(
     Returns:
         A list of table data dictionaries.
     """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -211,15 +99,15 @@ async def extract_tables(
             return result
         try:
-            from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415  # noqa: PLC0415
+            from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415
                 AutoTableDetector,
                 AutoTableFormatter,
             )
             from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
-            from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415  # noqa: PLC0415
-            from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415  # noqa: PLC0415
+            from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+            from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
-            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
+            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
                 config=TATRFormatConfig(
                     verbosity=config.verbosity,
                     formatter_base_threshold=config.formatter_base_threshold,
@@ -235,7 +123,7 @@ async def extract_tables(
                     force_large_table_assumption=config.force_large_table_assumption,
                 )
             )
-            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
+            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
                 config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
             )
             doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -287,8 +175,6 @@ def extract_tables_sync(
     Returns:
         A list of table data dictionaries.
     """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -542,7 +428,6 @@ def _extract_tables_isolated(
             tables = []
             for table_dict in result:
                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                import pandas as pd  # noqa: PLC0415
                 if table_dict["df_csv"] is None:
                     df = pd.DataFrame(columns=table_dict["df_columns"])
@@ -620,38 +505,29 @@ async def _extract_tables_isolated_async(
     try:
-        async def wait_for_result() -> tuple[bool, Any]:
+        def get_result_sync() -> tuple[bool, Any]:
             while True:
                 try:
-                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                    return result_queue.get(timeout=0.1)  # type: ignore[no-any-return]
                 except queue.Empty:  # noqa: PERF203
-                    await anyio.sleep(0.1)
                     if not process.is_alive():
-                        # Process died without putting result  # ~keep
                         if process.exitcode == -signal.SIGSEGV:
                             raise ParsingError(
                                 "GMFT process crashed with segmentation fault",
-                                context={
-                                    "file_path": str(file_path),
-                                    "exit_code": process.exitcode,
-                                },
+                                context={"file_path": str(file_path), "exit_code": process.exitcode},
                             ) from None
                         raise ParsingError(
                             f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
+                            context={"file_path": str(file_path), "exit_code": process.exitcode},
                         ) from None
         with anyio.fail_after(timeout):
-            success, result = await wait_for_result()
+            success, result = await anyio.to_thread.run_sync(get_result_sync)
         if success:
             tables = []
             for table_dict in result:
                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                import pandas as pd  # noqa: PLC0415
                 if table_dict["df_csv"] is None:
                     df = pd.DataFrame(columns=table_dict["df_columns"])

kreuzberg/_language_detection.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
+from kreuzberg._types import LanguageDetectionConfig
 from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
@@ -23,27 +23,6 @@ except ImportError:  # pragma: no cover
 _CACHE_SIZE = 128
-@dataclass(frozen=True, slots=True)
-class LanguageDetectionConfig:
-    """Configuration for language detection.
-    Attributes:
-        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
-            Defaults to True for better memory efficiency.
-        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
-        multilingual: If True, uses multilingual detection to handle mixed-language text.
-            If False, uses single language detection. Defaults to False.
-        cache_dir: Custom directory for model cache. If None, uses system default.
-        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
-    """
-    low_memory: bool = True
-    top_k: int = 3
-    multilingual: bool = False
-    cache_dir: str | None = None
-    allow_fallback: bool = True
 def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
     """Create FastLangDetectConfig from our config."""
     if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:

kreuzberg/_mcp/__init__.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""MCP server for Kreuzberg text extraction."""
 from .server import mcp
 __all__ = ["mcp"]

kreuzberg/_mcp/server.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""Kreuzberg MCP server implementation."""
 from __future__ import annotations
 import base64
@@ -10,11 +8,10 @@ import msgspec
 from mcp.server import FastMCP
 from mcp.types import TextContent
-from kreuzberg._config import try_discover_config
+from kreuzberg._config import discover_config
 from kreuzberg._types import ExtractionConfig, OcrBackendType
 from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
-# Create the MCP server
 mcp = FastMCP("Kreuzberg Text Extraction")
@@ -27,14 +24,11 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     Returns:
         ExtractionConfig instance.
     """
-    # Try to discover configuration from files
-    base_config = try_discover_config()
+    base_config = discover_config()
     if base_config is None:
-        # No config file found, use defaults
         return ExtractionConfig(**kwargs)
-    # Merge discovered config with tool parameters (tool params take precedence)
     config_dict: dict[str, Any] = {
         "force_ocr": base_config.force_ocr,
         "chunk_content": base_config.chunk_content,
@@ -50,7 +44,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
         "gmft_config": base_config.gmft_config,
     }
-    # Override with provided parameters
     config_dict = config_dict | kwargs
     return ExtractionConfig(**config_dict)
@@ -189,7 +182,7 @@ def get_default_config() -> str:
 @mcp.resource("config://discovered")
 def get_discovered_config() -> str:
     """Get the discovered configuration from config files."""
-    config = try_discover_config()
+    config = discover_config()
     if config is None:
         return "No configuration file found"
     return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)

kreuzberg/_mime_types.py CHANGED Viewed

@@ -4,6 +4,7 @@ from mimetypes import guess_type
 from pathlib import Path
 from typing import TYPE_CHECKING, Final
+from kreuzberg._utils._cache import get_mime_cache
 from kreuzberg.exceptions import ValidationError
 if TYPE_CHECKING:  # pragma: no cover
@@ -191,8 +192,6 @@ def validate_mime_type(
         return _validate_explicit_mime_type(mime_type)
     if file_path:
-        from kreuzberg._utils._cache import get_mime_cache  # noqa: PLC0415
         path = Path(file_path)
         try:

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -1,15 +1,14 @@
 from __future__ import annotations
 import warnings
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Final
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
+from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -22,6 +21,18 @@ try:  # pragma: no cover
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
+try:
+    import easyocr
+    import numpy as np
+    import torch
+    HAS_EASYOCR = True
+except ImportError:
+    HAS_EASYOCR = False
+    easyocr = None
+    np = None  # type: ignore[assignment]
+    torch = None  # type: ignore[assignment]
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "abq",
@@ -110,59 +121,6 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
 }
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class EasyOCRConfig:
-    """Configuration options for EasyOCR."""
-    add_margin: float = 0.1
-    """Extend bounding boxes in all directions."""
-    adjust_contrast: float = 0.5
-    """Target contrast level for low contrast text."""
-    beam_width: int = 5
-    """Beam width for beam search in recognition."""
-    canvas_size: int = 2560
-    """Maximum image dimension for detection."""
-    contrast_ths: float = 0.1
-    """Contrast threshold for preprocessing."""
-    decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
-    """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
-    height_ths: float = 0.5
-    """Maximum difference in box height for merging."""
-    language: str | list[str] = "en"
-    """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
-    a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
-    link_threshold: float = 0.4
-    """Link confidence threshold."""
-    low_text: float = 0.4
-    """Text low-bound score."""
-    mag_ratio: float = 1.0
-    """Image magnification ratio."""
-    min_size: int = 10
-    """Minimum text box size in pixels."""
-    rotation_info: list[int] | None = None
-    """List of angles to try for detection."""
-    slope_ths: float = 0.1
-    """Maximum slope for merging text boxes."""
-    text_threshold: float = 0.7
-    """Text confidence threshold."""
-    use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-    device: DeviceType = "auto"
-    """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
-    gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
-    fallback_to_cpu: bool = True
-    """Whether to fallback to CPU if requested device is unavailable."""
-    width_ths: float = 0.5
-    """Maximum horizontal distance for merging boxes."""
-    x_ths: float = 1.0
-    """Maximum horizontal distance for paragraph merging."""
-    y_ths: float = 0.5
-    """Maximum vertical distance for paragraph merging."""
-    ycenter_ths: float = 0.5
-    """Maximum shift in y direction for merging."""
 class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
@@ -179,8 +137,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
         await self._init_easyocr(**kwargs)
         beam_width = kwargs.pop("beam_width")
@@ -225,15 +181,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @staticmethod
     def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
-        """Process EasyOCR result into an ExtractionResult with metadata.
-        Args:
-            result: The raw result from EasyOCR.
-            image: The original PIL image.
-        Returns:
-            ExtractionResult: The extraction result containing text content, mime type, and metadata.
-        """
         if not result:
             return ExtractionResult(
                 content="",
@@ -314,38 +261,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @classmethod
     def _is_gpu_available(cls) -> bool:
-        """Check if GPU is available for EasyOCR.
-        Returns:
-            bool: True if GPU support is available.
-        """
-        try:
-            import torch  # noqa: PLC0415
-            return bool(torch.cuda.is_available())
-        except ImportError:  # pragma: no cover
+        if not HAS_EASYOCR or torch is None:
             return False
+        return bool(torch.cuda.is_available())
     @classmethod
     async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
-        """Initialize EasyOCR with the provided configuration.
-        Args:
-            **kwargs: Configuration parameters for EasyOCR including language, etc.
-        Raises:
-            MissingDependencyError: If EasyOCR is not installed.
-            OCRError: If initialization fails.
-        """
         if cls._reader is not None:
             return
-        try:
-            import easyocr  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_EASYOCR or easyocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
-            ) from e
+            )
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
@@ -369,17 +297,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @classmethod
     def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
-        """Resolve device configuration with backward compatibility.
-        Args:
-            **kwargs: Configuration parameters including device settings.
-        Returns:
-            DeviceInfo object for the selected device.
-        Raises:
-            ValidationError: If requested device is not available and fallback is disabled.
-        """
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
@@ -457,8 +374,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
         self._init_easyocr_sync(**kwargs)
         beam_width = kwargs.pop("beam_width")
@@ -513,12 +428,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
-        try:
-            import easyocr  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_EASYOCR or easyocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
-            ) from e
+            )
         languages = cls._validate_language_code(kwargs.pop("language", "en"))

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl