PyPI - kreuzberg - Versions diffs - 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl - Mend

kreuzberg 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

kreuzberg/_api/main.py +82 -18
kreuzberg/_config.py +0 -1
kreuzberg/_extractors/_image.py +20 -2
kreuzberg/_extractors/_pdf.py +21 -1
kreuzberg/_extractors/_spread_sheet.py +0 -1
kreuzberg/_gmft.py +79 -33
kreuzberg/_mcp/server.py +0 -76
kreuzberg/_ocr/_base.py +1 -2
kreuzberg/_ocr/_paddleocr.py +39 -13
kreuzberg/_ocr/_tesseract.py +2 -3
kreuzberg/_registry.py +26 -0
kreuzberg/_types.py +64 -1
kreuzberg/_utils/_cache.py +34 -12
kreuzberg/_utils/_image_preprocessing.py +346 -0
kreuzberg/_utils/_ocr_cache.py +2 -5
kreuzberg/_utils/_process_pool.py +3 -3
kreuzberg/_utils/_table.py +4 -1
kreuzberg/cli.py +19 -2
kreuzberg/extraction.py +4 -4
{kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/METADATA +4 -4
{kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/RECORD +24 -23
{kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import os
 import platform
 import warnings
 from importlib.util import find_spec
@@ -36,18 +37,31 @@ except ImportError:  # pragma: no cover
 if TYPE_CHECKING:
     import numpy as np
     from paddleocr import PaddleOCR
+else:
+    np: Any = None
+    PaddleOCR: Any = None
+HAS_PADDLEOCR: bool = False
+def _import_paddleocr() -> tuple[Any, Any]:
+    global HAS_PADDLEOCR, np, PaddleOCR
+    if HAS_PADDLEOCR:
+        return np, PaddleOCR
-HAS_PADDLEOCR: bool
-if not TYPE_CHECKING:
     try:
-        import numpy as np
-        from paddleocr import PaddleOCR
+        os.environ.setdefault("HUB_DATASET_ENDPOINT", "https://modelscope.cn/api/v1/datasets")
+        import numpy as _np  # noqa: PLC0415, ICN001
+        from paddleocr import PaddleOCR as _PaddleOCR  # noqa: PLC0415
+        np = _np
+        PaddleOCR = _PaddleOCR
         HAS_PADDLEOCR = True
+        return np, PaddleOCR
     except ImportError:
-        HAS_PADDLEOCR = False
-        np: Any = None
-        PaddleOCR: Any = None
+        return None, None
 PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
@@ -74,7 +88,12 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             if image.mode != "RGB":
                 image = image.convert("RGB")
-            image_np = np.array(image)
+            _np, _ = _import_paddleocr()
+            if _np is None:
+                raise MissingDependencyError.create_for_package(
+                    dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+                )
+            image_np = _np.array(image)
             use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
             result = await run_sync(self._paddle_ocr.ocr, image_np, cls=use_textline_orientation)
@@ -195,7 +214,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
-        if not HAS_PADDLEOCR or PaddleOCR is None:
+        _np, _paddle_ocr = _import_paddleocr()
+        if _paddle_ocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             )
@@ -224,7 +244,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, **kwargs)
+            cls._paddle_ocr = await run_sync(_paddle_ocr, lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
@@ -304,7 +324,12 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             if image.mode != "RGB":
                 image = image.convert("RGB")
-            image_np = np.array(image)
+            _np, _ = _import_paddleocr()
+            if _np is None:
+                raise MissingDependencyError.create_for_package(
+                    dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+                )
+            image_np = _np.array(image)
             use_textline_orientation = kwargs.get("use_textline_orientation", kwargs.get("use_angle_cls", True))
             result = self._paddle_ocr.ocr(image_np, cls=use_textline_orientation)
@@ -352,7 +377,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
-        if not HAS_PADDLEOCR or PaddleOCR is None:
+        _np, _paddle_ocr = _import_paddleocr()
+        if _paddle_ocr is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             )
@@ -381,6 +407,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported())
         try:
-            cls._paddle_ocr = PaddleOCR(lang=language, **kwargs)
+            cls._paddle_ocr = _paddle_ocr(lang=language, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -28,6 +28,7 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
+from kreuzberg._utils._process_pool import ProcessPoolManager
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file
@@ -467,7 +468,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             last_para = -1
             for line_key in sorted(lines.keys()):
-                page_num, block_num, par_num, line_num = line_key
+                _page_num, block_num, par_num, _line_num = line_key
                 if block_num != last_block:
                     if text_parts:  # ~keep
@@ -1297,8 +1298,6 @@ class TesseractProcessPool:
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        from kreuzberg._utils._process_pool import ProcessPoolManager  # noqa: PLC0415
         self.config = config or TesseractConfig()
         self.process_manager = ProcessPoolManager(
             max_processes=max_processes,

kreuzberg/_registry.py CHANGED Viewed

@@ -28,6 +28,13 @@ if TYPE_CHECKING:
 class ExtractorRegistry:
+    """Registry for managing document extractors.
+    This class maintains a registry of extractors for different file types and provides
+    functionality to get the appropriate extractor for a given MIME type, as well as
+    add or remove custom extractors.
+    """
     _default_extractors: ClassVar[list[type[Extractor]]] = [
         PDFExtractor,
         OfficeDocumentExtractor,
@@ -51,6 +58,15 @@ class ExtractorRegistry:
     @classmethod
     @lru_cache
     def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
+        """Get an appropriate extractor for the given MIME type.
+        Args:
+            mime_type: The MIME type to find an extractor for.
+            config: The extraction configuration.
+        Returns:
+            An extractor instance if one supports the MIME type, None otherwise.
+        """
         extractors: list[type[Extractor]] = [
             *cls._registered_extractors,
             *cls._default_extractors,
@@ -64,11 +80,21 @@ class ExtractorRegistry:
     @classmethod
     def add_extractor(cls, extractor: type[Extractor]) -> None:
+        """Add a custom extractor to the registry.
+        Args:
+            extractor: The extractor class to add to the registry.
+        """
         cls._registered_extractors.append(extractor)
         cls.get_extractor.cache_clear()
     @classmethod
     def remove_extractor(cls, extractor: type[Extractor]) -> None:
+        """Remove a custom extractor from the registry.
+        Args:
+            extractor: The extractor class to remove from the registry.
+        """
         try:
             cls._registered_extractors.remove(extractor)
             cls.get_extractor.cache_clear()

kreuzberg/_types.py CHANGED Viewed

@@ -4,7 +4,7 @@ import sys
 from collections.abc import Awaitable, Callable, Iterable, Mapping
 from dataclasses import asdict, dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Literal, TypedDict
+from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
 import msgspec
@@ -508,6 +508,35 @@ class TableData(TypedDict):
     """The table text as a markdown string."""
+class ImagePreprocessingMetadata(NamedTuple):
+    """Metadata about image preprocessing operations for OCR."""
+    original_dimensions: tuple[int, int]
+    """Original image dimensions (width, height) in pixels."""
+    original_dpi: tuple[float, float]
+    """Original image DPI (horizontal, vertical)."""
+    target_dpi: int
+    """Target DPI that was requested."""
+    scale_factor: float
+    """Scale factor applied to the image."""
+    auto_adjusted: bool
+    """Whether DPI was automatically adjusted due to size constraints."""
+    final_dpi: int | None = None
+    """Final DPI used after processing."""
+    new_dimensions: tuple[int, int] | None = None
+    """New image dimensions after processing (width, height) in pixels."""
+    resample_method: str | None = None
+    """Resampling method used (LANCZOS, BICUBIC, etc.)."""
+    skipped_resize: bool = False
+    """Whether resizing was skipped (no change needed)."""
+    dimension_clamped: bool = False
+    """Whether image was clamped to maximum dimension constraints."""
+    calculated_dpi: int | None = None
+    """DPI calculated during auto-adjustment."""
+    resize_error: str | None = None
+    """Error message if resizing failed."""
 class Metadata(TypedDict, total=False):
     authors: NotRequired[list[str]]
     """List of document authors."""
@@ -587,6 +616,8 @@ class Metadata(TypedDict, total=False):
     """Summary of table extraction results."""
     quality_score: NotRequired[float]
     """Quality score for extracted content (0.0-1.0)."""
+    image_preprocessing: NotRequired[ImagePreprocessingMetadata]
+    """Metadata about image preprocessing operations (DPI adjustments, scaling, etc.)."""
     source_format: NotRequired[str]
     """Source format of the extracted content."""
     error: NotRequired[str]
@@ -632,6 +663,7 @@ _VALID_METADATA_KEYS = {
     "table_count",
     "tables_summary",
     "quality_score",
+    "image_preprocessing",
 }
@@ -775,6 +807,16 @@ class ExtractionConfig(ConfigDict):
     """Configuration for HTML to Markdown conversion. If None, uses default settings."""
     use_cache: bool = True
     """Whether to use caching for extraction results. Set to False to disable all caching."""
+    target_dpi: int = 150
+    """Target DPI for OCR processing. Images and PDF pages will be scaled to this DPI for optimal OCR results."""
+    max_image_dimension: int = 25000
+    """Maximum allowed pixel dimension (width or height) for processed images to prevent memory issues."""
+    auto_adjust_dpi: bool = True
+    """Whether to automatically adjust DPI based on image dimensions to stay within max_image_dimension limits."""
+    min_dpi: int = 72
+    """Minimum DPI threshold when auto-adjusting DPI."""
+    max_dpi: int = 600
+    """Maximum DPI threshold when auto-adjusting DPI."""
     def __post_init__(self) -> None:
         if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -797,6 +839,27 @@ class ExtractionConfig(ConfigDict):
                 context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
             )
+        # Validate DPI configuration
+        if self.target_dpi <= 0:
+            raise ValidationError("target_dpi must be positive", context={"target_dpi": self.target_dpi})
+        if self.min_dpi <= 0:
+            raise ValidationError("min_dpi must be positive", context={"min_dpi": self.min_dpi})
+        if self.max_dpi <= 0:
+            raise ValidationError("max_dpi must be positive", context={"max_dpi": self.max_dpi})
+        if self.min_dpi >= self.max_dpi:
+            raise ValidationError(
+                "min_dpi must be less than max_dpi", context={"min_dpi": self.min_dpi, "max_dpi": self.max_dpi}
+            )
+        if self.max_image_dimension <= 0:
+            raise ValidationError(
+                "max_image_dimension must be positive", context={"max_image_dimension": self.max_image_dimension}
+            )
+        if not (self.min_dpi <= self.target_dpi <= self.max_dpi):
+            raise ValidationError(
+                "target_dpi must be between min_dpi and max_dpi",
+                context={"target_dpi": self.target_dpi, "min_dpi": self.min_dpi, "max_dpi": self.max_dpi},
+            )
     def get_config_dict(self) -> dict[str, Any]:
         if self.ocr_backend is None:
             return {"use_cache": self.use_cache}

kreuzberg/_utils/_cache.py CHANGED Viewed

@@ -1,14 +1,16 @@
 from __future__ import annotations
 import hashlib
+import io
 import os
 import threading
 import time
 from contextlib import suppress
 from io import StringIO
 from pathlib import Path
-from typing import Any, Generic, TypeVar
+from typing import Any, Generic, TypeVar, cast
+import polars as pl
 from anyio import Path as AsyncPath
 from kreuzberg._types import ExtractionResult
@@ -79,10 +81,18 @@ class KreuzbergCache(Generic[T]):
             for item in result:
                 if isinstance(item, dict) and "df" in item:
                     serialized_item = {k: v for k, v in item.items() if k != "df"}
-                    if hasattr(item["df"], "to_csv"):
-                        serialized_item["df_csv"] = item["df"].to_csv(index=False)
+                    if item["df"] is not None:
+                        buffer = io.BytesIO()
+                        if hasattr(item["df"], "write_parquet"):
+                            item["df"].write_parquet(buffer)
+                            serialized_item["df_parquet"] = buffer.getvalue()
+                        elif hasattr(item["df"], "write_csv"):
+                            item["df"].write_csv(buffer)
+                            serialized_item["df_parquet"] = buffer.getvalue()
+                        else:
+                            serialized_item["df_parquet"] = None
                     else:
-                        serialized_item["df_csv"] = str(item["df"])
+                        serialized_item["df_parquet"] = None
                     serialized_data.append(serialized_item)
                 else:
                     serialized_data.append(item)
@@ -94,22 +104,34 @@ class KreuzbergCache(Generic[T]):
         data = cached_data["data"]
         if cached_data.get("type") == "TableDataList" and isinstance(data, list):
-            import pandas as pd  # noqa: PLC0415
             deserialized_data = []
             for item in data:
-                if isinstance(item, dict) and "df_csv" in item:
-                    deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
-                    deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                if isinstance(item, dict) and ("df_parquet" in item or "df_csv" in item):
+                    deserialized_item = {k: v for k, v in item.items() if k not in ("df_parquet", "df_csv")}
+                    if "df_parquet" in item:
+                        if item["df_parquet"] is None:
+                            deserialized_item["df"] = pl.DataFrame()
+                        else:
+                            buffer = io.BytesIO(item["df_parquet"])
+                            try:
+                                deserialized_item["df"] = pl.read_parquet(buffer)
+                            except Exception:  # noqa: BLE001
+                                deserialized_item["df"] = pl.DataFrame()
+                    elif "df_csv" in item:
+                        if item["df_csv"] is None or item["df_csv"] == "" or item["df_csv"] == "\n":
+                            deserialized_item["df"] = pl.DataFrame()
+                        else:
+                            deserialized_item["df"] = pl.read_csv(StringIO(item["df_csv"]))
                     deserialized_data.append(deserialized_item)
                 else:
                     deserialized_data.append(item)
-            return deserialized_data  # type: ignore[return-value]
+            return cast("T", deserialized_data)
         if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
-            return ExtractionResult(**data)  # type: ignore[return-value]
+            return cast("T", ExtractionResult(**data))
-        return data  # type: ignore[no-any-return]
+        return cast("T", data)
     def _cleanup_cache(self) -> None:
         try:

kreuzberg 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl

kreuzberg 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl