PyPI - kreuzberg - Versions diffs - 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl - Mend

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

kreuzberg/__init__.py +16 -2
kreuzberg/_chunker.py +51 -0
kreuzberg/_constants.py +2 -3
kreuzberg/_extractors/__init__.py +0 -0
kreuzberg/_extractors/_base.py +92 -0
kreuzberg/_extractors/_html.py +34 -0
kreuzberg/_extractors/_image.py +74 -0
kreuzberg/_extractors/_pandoc.py +613 -0
kreuzberg/_extractors/_pdf.py +163 -0
kreuzberg/_extractors/_presentation.py +233 -0
kreuzberg/_extractors/_spread_sheet.py +125 -0
kreuzberg/_mime_types.py +19 -26
kreuzberg/_ocr/__init__.py +17 -0
kreuzberg/_ocr/_base.py +54 -0
kreuzberg/_ocr/_easyocr.py +376 -0
kreuzberg/_ocr/_paddleocr.py +291 -0
kreuzberg/_ocr/_tesseract.py +342 -0
kreuzberg/_playa.py +276 -0
kreuzberg/_registry.py +108 -0
kreuzberg/_types.py +133 -36
kreuzberg/_utils/__init__.py +0 -0
kreuzberg/{_string.py → _utils/_string.py} +0 -2
kreuzberg/_utils/_sync.py +121 -0
kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
kreuzberg/exceptions.py +25 -0
kreuzberg/extraction.py +114 -227
kreuzberg-3.0.1.dist-info/METADATA +178 -0
kreuzberg-3.0.1.dist-info/RECORD +32 -0
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
kreuzberg/_html.py +0 -31
kreuzberg/_pandoc.py +0 -366
kreuzberg/_pdf.py +0 -190
kreuzberg/_pptx.py +0 -88
kreuzberg/_sync.py +0 -74
kreuzberg/_tesseract.py +0 -231
kreuzberg/_xlsx.py +0 -88
kreuzberg-2.1.2.dist-info/METADATA +0 -446
kreuzberg-2.1.2.dist-info/RECORD +0 -21
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0

kreuzberg/_ocr/_easyocr.py ADDED Viewed

@@ -0,0 +1,376 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+if TYPE_CHECKING:
+    from pathlib import Path
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+# Language codes accepted by the EasyOCR backend, grouped by first letter.
+# Kept as a whitespace-separated table so the whole list fits on one screen.
+EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = set(
+    (
+        "abq ady af ang ar as ava az "
+        "be bg bh bho bn bs "
+        "ch_sim ch_tra che cs cy "
+        "da dar de "
+        "en es et "
+        "fa fr "
+        "ga gom "
+        "hi hr hu "
+        "id inh is it "  # codespell:ignore
+        "ja "
+        "kbd kn ko ku "
+        "la lbe lez lt lv "
+        "mah mai mi mn mr ms mt "
+        "ne new nl no "
+        "oc "
+        "pi pl pt "
+        "ro ru rs_cyrillic rs_latin "
+        "sck sk sl sq sv sw "
+        "ta tab te th tjk tl tr "  # codespell:ignore
+        "ug uk ur uz "
+        "vi"
+    ).split()
+)
+@dataclass(unsafe_hash=True, frozen=True)
+class EasyOCRConfig:
+    """Configuration options for EasyOCR.
+    Instances are immutable (``frozen=True``) and get a field-based ``__hash__``
+    (``unsafe_hash=True``). Apart from ``language``, ``use_gpu`` and ``beam_width``,
+    the options are forwarded unchanged as keyword arguments to the reader's
+    ``readtext`` call by ``EasyOCRBackend.process_image``.
+    """
+    add_margin: float = 0.1
+    """Extend bounding boxes in all directions."""
+    adjust_contrast: float = 0.5
+    """Target contrast level for low contrast text."""
+    # Forwarded to ``readtext`` under the name ``beamWidth``.
+    beam_width: int = 5
+    """Beam width for beam search in recognition."""
+    canvas_size: int = 2560
+    """Maximum image dimension for detection."""
+    contrast_ths: float = 0.1
+    """Contrast threshold for preprocessing."""
+    decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
+    """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
+    height_ths: float = 0.5
+    """Maximum difference in box height for merging."""
+    # Lower-cased and checked against EASYOCR_SUPPORTED_LANGUAGE_CODES when the reader is created.
+    # NOTE(review): a list value is unhashable, so hash() on such an instance raises
+    # TypeError even though unsafe_hash=True is set.
+    language: str | list[str] = "en"
+    """Language or languages to use for OCR."""
+    link_threshold: float = 0.4
+    """Link confidence threshold."""
+    low_text: float = 0.4
+    """Text low-bound score."""
+    mag_ratio: float = 1.0
+    """Image magnification ratio."""
+    min_size: int = 10
+    """Minimum text box size in pixels."""
+    # NOTE(review): same hashing caveat as ``language`` when a list is supplied.
+    rotation_info: list[int] | None = None
+    """List of angles to try for detection."""
+    slope_ths: float = 0.1
+    """Maximum slope for merging text boxes."""
+    text_threshold: float = 0.7
+    """Text confidence threshold."""
+    # Passed to ``easyocr.Reader`` as ``gpu`` when the reader is first created.
+    use_gpu: bool = False
+    """Whether to use GPU for inference."""
+    width_ths: float = 0.5
+    """Maximum horizontal distance for merging boxes."""
+    x_ths: float = 1.0
+    """Maximum horizontal distance for paragraph merging."""
+    y_ths: float = 0.5
+    """Maximum vertical distance for paragraph merging."""
+    ycenter_ths: float = 0.5
+    """Maximum shift in y direction for merging."""
+class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
+    """OCR backend implemented on top of the EasyOCR library.
+    The ``easyocr.Reader`` is created lazily on first use and cached on the class, so the
+    language and GPU settings of the first call are reused by every later call.
+    """
+    _reader: ClassVar[Any] = None
+    async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Asynchronously process an image and extract its text and metadata using EasyOCR.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        await self._init_easyocr(**kwargs)
+        # Imported lazily, after the dependency check above: numpy is a dependency of easyocr.
+        import numpy as np
+        # ``language`` and ``use_gpu`` configure the Reader, not ``readtext``; forwarding them
+        # would make ``readtext`` fail with an unexpected keyword argument.
+        kwargs.pop("language", None)
+        kwargs.pop("use_gpu", None)
+        # Fall back to the EasyOCRConfig default instead of raising KeyError when omitted.
+        beam_width = kwargs.pop("beam_width", 5)
+        try:
+            # EasyOCR decodes ``bytes`` input as an *encoded* image file; the raw pixel buffer
+            # returned by ``Image.tobytes()`` cannot be decoded, so pass a numpy array instead.
+            result = await run_sync(
+                self._reader.readtext,
+                np.array(image),
+                beamWidth=beam_width,
+                **kwargs,
+            )
+            return self._process_easyocr_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
+    async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Asynchronously process a file and extract its text and metadata using EasyOCR.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        await self._init_easyocr(**kwargs)
+        try:
+            image = await run_sync(Image.open, path)
+            # Release the underlying file handle as soon as OCR has finished.
+            with image:
+                return await self.process_image(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
+    @staticmethod
+    def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+        """Process EasyOCR result into an ExtractionResult with metadata.
+        Two result shapes are handled: ``(text, confidence)`` pairs, which are emitted one per
+        line in the given order, and ``(box, text, confidence)`` triples, which are grouped into
+        lines by vertical position and ordered left to right within each line.
+        Args:
+            result: The raw result from EasyOCR.
+            image: The original PIL image.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        """
+        metadata = Metadata(width=image.width, height=image.height)
+        if not result:
+            return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])
+        expected_tuple_length = 2
+        if all(len(item) == expected_tuple_length for item in result):
+            text_content = "".join(f"{text}\n" for text, _ in result if text)
+            return ExtractionResult(
+                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+            )
+        # Order boxes top to bottom using the y coordinates of their first and third corners.
+        sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
+        line_groups: list[list[Any]] = []
+        current_line: list[Any] = []
+        prev_y_center: float | None = None
+        # Boxes whose vertical centres differ by more than this start a new line.
+        line_height_threshold = 20
+        for item in sorted_results:
+            box, _, _ = item
+            y_center = sum(point[1] for point in box) / 4
+            if prev_y_center is None or abs(y_center - prev_y_center) > line_height_threshold:
+                if current_line:
+                    line_groups.append(current_line)
+                current_line = [item]
+            else:
+                current_line.append(item)
+            prev_y_center = y_center
+        if current_line:
+            line_groups.append(current_line)
+        parts: list[str] = []
+        for line in line_groups:
+            # Within a line, read boxes left to right by the x coordinate of their first corner.
+            parts.extend(f"{text} " for _, text, _ in sorted(line, key=lambda x: x[0][0][0]) if text)
+            parts.append("\n")
+        return ExtractionResult(
+            content=normalize_spaces("".join(parts)), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+        )
+    @classmethod
+    def _is_gpu_available(cls) -> bool:
+        """Check if GPU is available for EasyOCR.
+        Returns:
+            bool: True if torch is importable and reports CUDA as available.
+        """
+        try:
+            import torch
+            return torch.cuda.is_available()
+        except ImportError:
+            return False
+    @classmethod
+    async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
+        """Initialize EasyOCR with the provided configuration.
+        The reader is created once and cached on the class; later calls return immediately,
+        whatever configuration they carry.
+        Args:
+            **kwargs: Configuration parameters for EasyOCR including language, etc.
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            OCRError: If initialization fails.
+            ValidationError: If a requested language is not supported by EasyOCR.
+        """
+        if cls._reader is not None:
+            return
+        try:
+            import easyocr
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
+            ) from e
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+        # An explicit ``use_gpu`` wins; when it is omitted, use the GPU only if CUDA is available.
+        use_gpu = kwargs.get("use_gpu")
+        if use_gpu is None:
+            use_gpu = cls._is_gpu_available()
+        try:
+            cls._reader = await run_sync(
+                easyocr.Reader,
+                languages,
+                gpu=use_gpu,
+                verbose=False,
+            )
+        except Exception as e:
+            raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
+    @staticmethod
+    def _validate_language_code(language_codes: str | list[str]) -> list[str]:
+        """Validate and normalize the provided language code or codes.
+        Args:
+            language_codes: A single language code or a list of language codes.
+        Raises:
+            ValidationError: If any language is not supported by EasyOCR
+        Returns:
+            A list with the lower-cased language codes.
+        """
+        if isinstance(language_codes, list):
+            languages = [lang.lower() for lang in language_codes]
+        else:
+            languages = [language_codes.lower()]
+        unsupported = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
+        if not unsupported:
+            return languages
+        raise ValidationError(
+            "The provided language codes are not supported by EasyOCR",
+            context={
+                "language_code": ",".join(unsupported),
+                "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )

kreuzberg/_ocr/_paddleocr.py ADDED Viewed

@@ -0,0 +1,291 @@
+from __future__ import annotations
+import platform
+import sys
+from dataclasses import dataclass
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+if TYPE_CHECKING:
+    from pathlib import Path
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+# Language identifiers accepted by the PaddleOCR backend.
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
+    "ch",
+    "en",
+    "french",
+    "german",
+    "japan",
+    "korean",
+}
+@dataclass(unsafe_hash=True, frozen=True)
+class PaddleOCRConfig:
+    """Configuration options for PaddleOCR.
+    This frozen, hashable dataclass provides type hints and documentation for the
+    PaddleOCR parameters exposed by this backend.
+    """
+    cls_image_shape: str = "3,48,192"
+    """Image shape for classification algorithm in format 'channels,height,width'."""
+    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
+    """Detection algorithm."""
+    det_db_box_thresh: float = 0.5
+    """Score threshold for detected boxes. Boxes below this value are discarded."""
+    det_db_thresh: float = 0.3
+    """Binarization threshold for DB output map."""
+    # NOTE(review): PaddleBackend._init_paddle_ocr falls back to 1.6, not 2.0, when this
+    # key is absent from its keyword arguments - confirm which default is intended.
+    det_db_unclip_ratio: float = 2.0
+    """Expansion ratio for detected text boxes."""
+    det_east_cover_thresh: float = 0.1
+    """Score threshold for EAST output boxes."""
+    det_east_nms_thresh: float = 0.2
+    """NMS threshold for EAST model output boxes."""
+    det_east_score_thresh: float = 0.8
+    """Binarization threshold for EAST output map."""
+    det_max_side_len: int = 960
+    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
+    drop_score: float = 0.5
+    """Filter recognition results by confidence score. Results below this are discarded."""
+    enable_mkldnn: bool = False
+    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
+    gpu_mem: int = 8000
+    """GPU memory size (in MB) to use for initialization."""
+    # Lower-cased, checked against PADDLEOCR_SUPPORTED_LANGUAGE_CODES and passed to
+    # PaddleOCR as ``lang`` when the engine is created.
+    language: str = "en"
+    """Language to use for OCR."""
+    max_text_length: int = 25
+    """Maximum text length that the recognition algorithm can recognize."""
+    rec: bool = True
+    """Enable text recognition when using the ocr() function."""
+    rec_algorithm: Literal[
+        "CRNN",
+        "SRN",
+        "NRTR",
+        "SAR",
+        "SEED",
+        "SVTR",
+        "SVTR_LCNet",
+        "ViTSTR",
+        "ABINet",
+        "VisionLAN",
+        "SPIN",
+        "RobustScanner",
+        "RFL",
+    ] = "CRNN"
+    """Recognition algorithm."""
+    rec_image_shape: str = "3,32,320"
+    """Image shape for recognition algorithm in format 'channels,height,width'."""
+    table: bool = True
+    """Whether to enable table recognition."""
+    # Also forwarded as the ``cls`` argument of every ``ocr()`` call in process_image.
+    use_angle_cls: bool = True
+    """Whether to use text orientation classification model."""
+    use_gpu: bool = False
+    """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
+    use_space_char: bool = True
+    """Whether to recognize spaces."""
+    use_zero_copy_run: bool = False
+    """Whether to enable zero_copy_run for inference optimization."""
+class PaddleBackend(OCRBackend[PaddleOCRConfig]):
+    """OCR backend implemented on top of the PaddleOCR library.
+    The ``PaddleOCR`` engine is created lazily on first use and cached on the class, so the
+    configuration of the first call is reused by every later call.
+    """
+    _paddle_ocr: ClassVar[Any] = None
+    async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process an image and extract its text and metadata using PaddleOCR.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        await self._init_paddle_ocr(**kwargs)
+        # Imported after the dependency check above so that a missing PaddleOCR installation is
+        # reported as MissingDependencyError instead of a bare ImportError for numpy.
+        import numpy as np
+        image_np = np.array(image)
+        try:
+            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
+            return self._process_paddle_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
+    async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process a file and extract its text and metadata using PaddleOCR.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        await self._init_paddle_ocr(**kwargs)
+        try:
+            image = await run_sync(Image.open, path)
+            # Release the underlying file handle as soon as OCR has finished.
+            with image:
+                return await self.process_image(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
+    @staticmethod
+    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+        """Process PaddleOCR result into an ExtractionResult with metadata.
+        For every non-empty page, boxes are grouped into lines by vertical position and
+        ordered left to right within each line.
+        Args:
+            result: The raw result from PaddleOCR.
+            image: The original PIL image.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        """
+        # Boxes whose vertical centres differ by more than this start a new line.
+        min_box_distance = 20
+        parts: list[str] = []
+        for page_result in result:
+            if not page_result:
+                continue
+            # Order boxes top to bottom by the y coordinate of their first corner.
+            sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
+            line_groups: list[list[Any]] = []
+            current_line: list[Any] = []
+            prev_y: float | None = None
+            for box in sorted_boxes:
+                box_points, (_, _) = box
+                current_y = sum(point[1] for point in box_points) / 4
+                if prev_y is None or abs(current_y - prev_y) > min_box_distance:
+                    if current_line:
+                        line_groups.append(current_line)
+                    current_line = [box]
+                else:
+                    current_line.append(box)
+                prev_y = current_y
+            if current_line:
+                line_groups.append(current_line)
+            for line in line_groups:
+                # Within a line, read boxes left to right by the x coordinate of their first corner.
+                for _, (text, _) in sorted(line, key=lambda x: x[0][0][0]):
+                    if text:
+                        parts.append(f"{text} ")
+                parts.append("\n")
+        width, height = image.size
+        metadata = Metadata(
+            width=width,
+            height=height,
+        )
+        return ExtractionResult(
+            content=normalize_spaces("".join(parts)), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+        )
+    @classmethod
+    def _is_mkldnn_supported(cls) -> bool:
+        """Check if the current architecture supports MKL-DNN optimization.
+        Returns:
+            True if MKL-DNN is supported on this architecture.
+        """
+        system = platform.system().lower()
+        processor = platform.processor().lower()
+        machine = platform.machine().lower()
+        if system in ("linux", "windows"):
+            # "x86" also matches "x86_64", so no separate check is needed for it.
+            return "intel" in processor or "x86" in machine or "amd64" in machine
+        if system == "darwin":
+            return machine == "x86_64"
+        return False
+    @classmethod
+    async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
+        """Initialize PaddleOCR with the provided configuration.
+        The engine is created once and cached on the class; later calls return immediately,
+        whatever configuration they carry.
+        Args:
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Raises:
+            MissingDependencyError: If PaddleOCR is not installed.
+            OCRError: If initialization fails.
+            ValidationError: If the python version is too high or the language is not supported.
+        """
+        if cls._paddle_ocr is not None:
+            return
+        if sys.version_info >= (3, 13):  # pragma: no cover
+            raise ValidationError(
+                "PaddleOCR is only available in python 3.12 and below. Please downgrade your Python or switch to a different OCR backend.",
+                context={"issue": "https://github.com/PaddlePaddle/Paddle/issues/71616"},
+            )
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+            ) from e
+        language = cls._validate_language_code(kwargs.pop("language", "en"))
+        # NOTE(review): this probes for an importable module named ``paddlepaddle_gpu``; verify
+        # that the paddlepaddle-gpu distribution really exposes a module under that name.
+        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
+        kwargs.setdefault("use_angle_cls", True)
+        kwargs.setdefault("use_gpu", has_gpu_package)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
+        kwargs.setdefault("det_db_thresh", 0.3)
+        kwargs.setdefault("det_db_box_thresh", 0.5)
+        # NOTE(review): PaddleOCRConfig declares 2.0 as the default for this option.
+        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        try:
+            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
+    @staticmethod
+    def _validate_language_code(lang_code: str) -> str:
+        """Validate a language code against the codes supported by PaddleOCR.
+        Args:
+            lang_code: Language code to validate; matching is case-insensitive.
+        Raises:
+            ValidationError: If the language is not supported by PaddleOCR
+        Returns:
+            The lower-cased language code.
+        """
+        normalized = lang_code.lower()
+        if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
+            return normalized
+        raise ValidationError(
+            "The provided language code is not supported by PaddleOCR",
+            context={
+                "language_code": lang_code,
+                "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl