PyPI - kreuzberg - Versions diffs - 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl - Mend

kreuzberg 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

kreuzberg/__init__.py +4 -1
kreuzberg/_extractors/__init__.py +0 -0
kreuzberg/_extractors/_base.py +92 -0
kreuzberg/_extractors/_html.py +34 -0
kreuzberg/_extractors/_image.py +74 -0
kreuzberg/_extractors/_pandoc.py +613 -0
kreuzberg/_extractors/_pdf.py +171 -0
kreuzberg/_extractors/_presentation.py +233 -0
kreuzberg/_extractors/_spread_sheet.py +125 -0
kreuzberg/_gmft.py +174 -0
kreuzberg/_ocr/__init__.py +17 -0
kreuzberg/_ocr/_base.py +54 -0
kreuzberg/_ocr/_easyocr.py +376 -0
kreuzberg/_ocr/_paddleocr.py +283 -0
kreuzberg/_ocr/_tesseract.py +342 -0
kreuzberg/_types.py +31 -4
kreuzberg/_utils/__init__.py +0 -0
kreuzberg/_utils/_string.py +39 -0
kreuzberg/_utils/_sync.py +121 -0
kreuzberg/_utils/_tmp.py +37 -0
{kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/METADATA +14 -19
kreuzberg-3.1.0.dist-info/RECORD +33 -0
{kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/WHEEL +1 -1
kreuzberg-3.0.0.dist-info/RECORD +0 -15
{kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/licenses/LICENSE +0 -0
{kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/top_level.txt +0 -0

kreuzberg/_gmft.py ADDED Viewed

@@ -0,0 +1,174 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+from kreuzberg._types import TableData
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError
+if TYPE_CHECKING:
+    from os import PathLike
+    from gmft.detectors.base import CroppedTable
+    from pandas import DataFrame
+@dataclass(unsafe_hash=True)
+class GMFTConfig:
+    """Configuration options for GMFT table extraction.
+    The fields are forwarded by ``extract_tables``: ``detector_base_threshold`` is passed to the
+    table detector configuration, every other field is passed to the table formatter configuration.
+    """
+    # unsafe_hash=True makes the dataclass generate __hash__ although instances remain mutable.
+    verbosity: int = 0
+    """
+    Verbosity level for logging.
+    0: errors only
+    1: print warnings
+    2: print warnings and info
+    3: print warnings, info, and debug
+    """
+    formatter_base_threshold: float = 0.3
+    """
+    Base threshold for the confidence demanded of a table feature (row/column).
+    Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
+    """
+    # The dict default is created per instance through default_factory; hash=False keeps this
+    # (unhashable) dict out of the generated __hash__.
+    cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
+        default_factory=lambda: {
+            0: 0.3,
+            1: 0.3,
+            2: 0.3,
+            3: 0.3,
+            4: 0.5,
+            5: 0.5,
+            # NOTE(review): 99 is far above the other thresholds - presumably makes label 6
+            # unreachable; confirm against TATRFormattedTable.id2label.
+            6: 99,
+        },
+        hash=False,
+    )
+    """
+    Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
+    But low confidences may be better than too high confidence (see formatter_base_threshold)
+    """
+    detector_base_threshold: float = 0.9
+    """Minimum confidence score required for a table"""
+    remove_null_rows: bool = True
+    """
+    Flag to remove rows with no text.
+    """
+    enable_multi_header: bool = False
+    """
+    Enable multi-indices in the dataframe.
+    If false, then multiple headers will be merged column-wise.
+    """
+    semantic_spanning_cells: bool = False
+    """
+    [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
+    """
+    semantic_hierarchical_left_fill: str | None = "algorithm"
+    """
+    [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
+    Possible values: 'algorithm', 'deep', None.
+    'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
+    'deep': merges headers according to the spanning cells detected by the Table Transformer.
+    None: headers are not duplicated.
+    """
+    large_table_if_n_rows_removed: int = 8
+    """
+    If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
+    """
+    large_table_threshold: int = 10
+    """
+    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
+    """
+    large_table_row_overlap_threshold: float = 0.2
+    """
+    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
+    """
+    large_table_maximum_rows: int = 1000
+    """
+    Maximum number of rows allowed for a large table.
+    """
+    force_large_table_assumption: bool | None = None
+    """
+    Force the large table assumption to be applied, regardless of the number of rows and overlap.
+    """
+async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+    """Detect and extract all tables of a PDF file using GMFT.
+    Every page of the document is scanned for tables first; each detected table is then
+    formatted into a pandas DataFrame and wrapped into a ``TableData`` entry.
+    Args:
+        file_path: The path to the PDF file.
+        config: Optional GMFT settings; a default ``GMFTConfig`` is used when omitted.
+    Raises:
+        MissingDependencyError: Raised when the required dependencies are not installed.
+    Returns:
+        One ``TableData`` per detected table, holding the cropped image, the page number,
+        the markdown rendering of the table and the DataFrame itself.
+    """
+    try:
+        # GMFT is an optional dependency, hence the function-scope imports.
+        from gmft.auto import AutoTableDetector, AutoTableFormatter
+        from gmft.detectors.tatr import TATRDetectorConfig
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        settings = config or GMFTConfig()
+        format_config = TATRFormatConfig(
+            verbosity=settings.verbosity,
+            formatter_base_threshold=settings.formatter_base_threshold,
+            cell_required_confidence=settings.cell_required_confidence,
+            remove_null_rows=settings.remove_null_rows,
+            enable_multi_header=settings.enable_multi_header,
+            semantic_spanning_cells=settings.semantic_spanning_cells,
+            semantic_hierarchical_left_fill=settings.semantic_hierarchical_left_fill,
+            large_table_if_n_rows_removed=settings.large_table_if_n_rows_removed,
+            large_table_threshold=settings.large_table_threshold,
+            large_table_row_overlap_threshold=settings.large_table_row_overlap_threshold,
+            large_table_maximum_rows=settings.large_table_maximum_rows,
+            force_large_table_assumption=settings.force_large_table_assumption,
+        )
+        table_formatter = AutoTableFormatter(config=format_config)
+        detector_config = TATRDetectorConfig(detector_base_threshold=settings.detector_base_threshold)
+        table_detector = AutoTableDetector(config=detector_config)
+        document = await run_sync(PyPDFium2Document, str(file_path))
+        try:
+            # Pass 1: collect the cropped tables of every page.
+            detected_tables: list[CroppedTable] = []
+            for page in document:
+                detected_tables.extend(await run_sync(table_detector.extract, page))
+            # Pass 2: turn every cropped table into a DataFrame (same order as detected_tables).
+            frames: list[DataFrame] = []
+            for table in detected_tables:
+                formatted = await run_sync(table_formatter.extract, table)
+                frames.append(await run_sync(formatted.df))
+            # Results are assembled before the document is closed in the finally clause.
+            results: list[TableData] = []
+            for frame, table in zip(frames, detected_tables):
+                results.append(
+                    TableData(
+                        cropped_image=table.image(),
+                        page_number=table.page.page_number,
+                        text=frame.to_markdown(),
+                        df=frame,
+                    )
+                )
+            return results
+        finally:
+            await run_sync(document.close)
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="gmft", functionality="table extraction", package_name="gmft"
+        ) from e

kreuzberg/_ocr/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+from functools import lru_cache
+from typing import Any
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._ocr._easyocr import EasyOCRBackend
+from kreuzberg._ocr._paddleocr import PaddleBackend
+from kreuzberg._ocr._tesseract import TesseractBackend
+from kreuzberg._types import OcrBackendType
+@lru_cache
+def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:
+    """Return the OCR backend instance for the given backend name.
+    The result is memoized by ``lru_cache``, so repeated calls with the same name return the same instance.
+    Args:
+        backend: Name of the backend; "easyocr" and "paddleocr" select their backends, any other value selects Tesseract.
+    Returns:
+        The backend instance.
+    """
+    backend_classes: dict[str, type[OCRBackend[Any]]] = {
+        "easyocr": EasyOCRBackend,
+        "paddleocr": PaddleBackend,
+    }
+    # Unknown names fall back to Tesseract.
+    return backend_classes.get(backend, TesseractBackend)()

kreuzberg/_ocr/_base.py ADDED Viewed

@@ -0,0 +1,54 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Generic, TypeVar
+from PIL.Image import Image
+from kreuzberg._types import ExtractionResult
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+T = TypeVar("T")
+class OCRBackend(ABC, Generic[T]):
+    """Abstract base class for Optical Character Recognition (OCR) backends.
+    Concrete backends implement the two asynchronous entry points declared here:
+    one for in-memory images and one for files on disk. The type parameter ``T``
+    describes the keyword arguments accepted by the concrete backend.
+    """
+    @abstractmethod
+    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Extract text and metadata from an in-memory image.
+        Args:
+            image: The PIL image to run OCR on.
+            **kwargs: Backend specific options.
+        Returns:
+            The extraction result object
+        """
+        ...
+    @abstractmethod
+    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Extract text and metadata from a file.
+        Args:
+            path: Location of the file to run OCR on.
+            **kwargs: Backend specific options.
+        Returns:
+            The extraction result object
+        """
+        ...
+    def __hash__(self) -> int:
+        """Hash by class name, which makes backend instances usable with caching helpers."""
+        # All instances of one backend class share the same hash value.
+        class_name = type(self).__name__
+        return hash(class_name)

kreuzberg/_ocr/_easyocr.py ADDED Viewed

@@ -0,0 +1,376 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+if TYPE_CHECKING:
+    from pathlib import Path
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+# Language codes accepted for the EasyOCR backend. `EasyOCRBackend._validate_language_code`
+# lower-cases the requested codes and raises a ValidationError for any code missing from this set.
+EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
+    "abq",
+    "ady",
+    "af",
+    "ang",
+    "ar",
+    "as",
+    "ava",
+    "az",
+    "be",
+    "bg",
+    "bh",
+    "bho",
+    "bn",
+    "bs",
+    "ch_sim",
+    "ch_tra",
+    "che",
+    "cs",
+    "cy",
+    "da",
+    "dar",
+    "de",
+    "en",
+    "es",
+    "et",
+    "fa",
+    "fr",
+    "ga",
+    "gom",
+    "hi",
+    "hr",
+    "hu",
+    "id",
+    "inh",  # codespell:ignore
+    "is",
+    "it",
+    "ja",
+    "kbd",
+    "kn",
+    "ko",
+    "ku",
+    "la",
+    "lbe",
+    "lez",
+    "lt",
+    "lv",
+    "mah",
+    "mai",
+    "mi",
+    "mn",
+    "mr",
+    "ms",
+    "mt",
+    "ne",
+    "new",
+    "nl",
+    "no",
+    "oc",
+    "pi",
+    "pl",
+    "pt",
+    "ro",
+    "ru",
+    "rs_cyrillic",
+    "rs_latin",
+    "sck",
+    "sk",
+    "sl",
+    "sq",
+    "sv",
+    "sw",
+    "ta",
+    "tab",
+    "te",  # codespell:ignore
+    "th",
+    "tjk",
+    "tl",
+    "tr",
+    "ug",
+    "uk",
+    "ur",
+    "uz",
+    "vi",
+}
+@dataclass(unsafe_hash=True, frozen=True)
+class EasyOCRConfig:
+    """Configuration options for EasyOCR.
+    The fields mirror the keyword arguments accepted by ``EasyOCRBackend.process_image`` and
+    ``EasyOCRBackend.process_file``. Instances are immutable (``frozen=True``).
+    """
+    add_margin: float = 0.1
+    """Extend bounding boxes in all directions."""
+    adjust_contrast: float = 0.5
+    """Target contrast level for low contrast text."""
+    # Handed to EasyOCR under the name `beamWidth` by EasyOCRBackend.process_image.
+    beam_width: int = 5
+    """Beam width for beam search in recognition."""
+    canvas_size: int = 2560
+    """Maximum image dimension for detection."""
+    contrast_ths: float = 0.1
+    """Contrast threshold for preprocessing."""
+    decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
+    """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
+    height_ths: float = 0.5
+    """Maximum difference in box height for merging."""
+    # Checked case-insensitively against EASYOCR_SUPPORTED_LANGUAGE_CODES when the reader is created.
+    # NOTE(review): a list value is not hashable, so hashing such an instance presumably fails - confirm.
+    language: str | list[str] = "en"
+    """Language or languages to use for OCR."""
+    link_threshold: float = 0.4
+    """Link confidence threshold."""
+    low_text: float = 0.4
+    """Text low-bound score."""
+    mag_ratio: float = 1.0
+    """Image magnification ratio."""
+    min_size: int = 10
+    """Minimum text box size in pixels."""
+    rotation_info: list[int] | None = None
+    """List of angles to try for detection."""
+    slope_ths: float = 0.1
+    """Maximum slope for merging text boxes."""
+    text_threshold: float = 0.7
+    """Text confidence threshold."""
+    # Used for the `gpu` argument of `easyocr.Reader` when the shared reader is created.
+    use_gpu: bool = False
+    """Whether to use GPU for inference."""
+    width_ths: float = 0.5
+    """Maximum horizontal distance for merging boxes."""
+    x_ths: float = 1.0
+    """Maximum horizontal distance for paragraph merging."""
+    y_ths: float = 0.5
+    """Maximum vertical distance for paragraph merging."""
+    ycenter_ths: float = 0.5
+    """Maximum shift in y direction for merging."""
+class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
+    """OCR backend that delegates text recognition to EasyOCR.
+    A single ``easyocr.Reader`` is created lazily on first use and stored on the class,
+    so it is shared by every instance of this backend.
+    """
+    # Shared reader. Once set it is reused as-is, i.e. the language / GPU options of the
+    # call that created it stay in effect for all later calls.
+    _reader: ClassVar[Any] = None
+    async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Asynchronously process an image and extract its text and metadata using EasyOCR.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            OCRError: If OCR processing fails.
+        """
+        # Kept outside the try block so that dependency / validation errors are not re-wrapped.
+        await self._init_easyocr(**kwargs)
+        # These two options configure the reader itself; `readtext` does not accept them.
+        kwargs.pop("language", None)
+        kwargs.pop("use_gpu", None)
+        # EasyOCR names this option `beamWidth`; the fallback equals EasyOCRConfig.beam_width.
+        beam_width = kwargs.pop("beam_width", 5)
+        try:
+            # numpy is a dependency of EasyOCR, so it is importable once the reader exists.
+            import numpy as np
+            # EasyOCR takes a file path, encoded image bytes or a numpy array. Raw pixel bytes
+            # from `Image.tobytes()` are none of these, so the image is handed over as an array.
+            # Modes without a direct 1/3/4-channel uint8 layout are converted to RGB first.
+            source = image if image.mode in ("L", "RGB", "RGBA") else image.convert("RGB")
+            result = await run_sync(
+                self._reader.readtext,
+                np.array(source),
+                beamWidth=beam_width,
+                **kwargs,
+            )
+            return self._process_easyocr_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
+    async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Asynchronously process a file and extract its text and metadata using EasyOCR.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            OCRError: If file loading or OCR processing fails.
+        """
+        # Initialised before the try block so that dependency / validation errors are not
+        # swallowed into the generic OCRError below.
+        await self._init_easyocr(**kwargs)
+        try:
+            image = await run_sync(Image.open, path)
+            return await self.process_image(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
+    @staticmethod
+    def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+        """Process EasyOCR result into an ExtractionResult with metadata.
+        Two result shapes are handled: items of length two, unpacked as ``(text, confidence)``,
+        and items of length three, unpacked as ``(box, text, confidence)``. In the second case the
+        boxes are used to group the text fragments into lines.
+        Args:
+            result: The raw result from EasyOCR.
+            image: The original PIL image.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        """
+        metadata = Metadata(width=image.width, height=image.height)
+        if not result:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=metadata,
+                chunks=[],
+            )
+        expected_tuple_length = 2
+        if all(len(item) == expected_tuple_length for item in result):
+            # One non-empty text fragment per output line.
+            text_content = "".join(text + "\n" for text, _confidence in result if text)
+            return ExtractionResult(
+                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+            )
+        # Order the boxes top to bottom using the sum of the y values of corner 0 and corner 2.
+        sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
+        line_groups: list[list[Any]] = []
+        current_line: list[Any] = []
+        prev_y_center: float | None = None
+        # A vertical jump larger than this (in pixels) between consecutive boxes starts a new line.
+        line_height_threshold = 20
+        for item in sorted_results:
+            box, _text, _confidence = item
+            y_center = sum(point[1] for point in box) / 4
+            if prev_y_center is None or abs(y_center - prev_y_center) > line_height_threshold:
+                if current_line:
+                    line_groups.append(current_line)
+                current_line = [item]
+            else:
+                current_line.append(item)
+            prev_y_center = y_center
+        if current_line:
+            line_groups.append(current_line)
+        rendered_lines: list[str] = []
+        for line in line_groups:
+            # Within a line, order the fragments left to right by the x value of corner 0.
+            line_sorted = sorted(line, key=lambda x: x[0][0][0])
+            words = [text for _box, text, _confidence in line_sorted if text]
+            rendered_lines.append("".join(word + " " for word in words) + "\n")
+        text_content = "".join(rendered_lines)
+        return ExtractionResult(
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+        )
+    @classmethod
+    def _is_gpu_available(cls) -> bool:
+        """Check if GPU is available for EasyOCR.
+        Returns:
+            bool: True if torch is importable and reports CUDA as available, False otherwise.
+        """
+        try:
+            import torch
+            return torch.cuda.is_available()
+        except ImportError:
+            return False
+    @classmethod
+    async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
+        """Initialize EasyOCR with the provided configuration.
+        Does nothing when the shared reader already exists.
+        Args:
+            **kwargs: Configuration parameters for EasyOCR including language, etc.
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            ValidationError: If a requested language is not supported by EasyOCR.
+            OCRError: If initialization fails.
+        """
+        if cls._reader is not None:
+            return
+        try:
+            import easyocr
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
+            ) from e
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+        # An explicit `use_gpu` value is passed through unchanged; only when the option is
+        # absent is GPU availability detected, instead of handing `gpu=None` to EasyOCR.
+        use_gpu = kwargs.get("use_gpu")
+        if use_gpu is None:
+            use_gpu = cls._is_gpu_available()
+        try:
+            cls._reader = await run_sync(
+                easyocr.Reader,
+                languages,
+                gpu=use_gpu,
+                verbose=False,
+            )
+        except Exception as e:
+            raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
+    @staticmethod
+    def _validate_language_code(language_codes: str | list[str]) -> list[str]:
+        """Validate and normalize the provided language code(s).
+        Args:
+            language_codes: A single language code or a list of language codes.
+        Raises:
+            ValidationError: If the language is not supported by EasyOCR
+        Returns:
+            A list with the lower-cased language codes.
+        """
+        if not isinstance(language_codes, list):
+            languages = [language_codes.lower()]
+        else:
+            languages = [lang.lower() for lang in language_codes]
+        unsupported = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
+        if not unsupported:
+            return languages
+        raise ValidationError(
+            "The provided language codes are not supported by EasyOCR",
+            context={
+                "language_code": ",".join(unsupported),
+                "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )

kreuzberg 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

kreuzberg 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl