PyPI - kreuzberg - Versions diffs - 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl - Mend

kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

kreuzberg/_entity_extraction.py +1 -2
kreuzberg/_extractors/_base.py +39 -1
kreuzberg/_extractors/_email.py +149 -0
kreuzberg/_extractors/_html.py +15 -3
kreuzberg/_extractors/_image.py +21 -36
kreuzberg/_extractors/_pandoc.py +3 -14
kreuzberg/_extractors/_pdf.py +81 -48
kreuzberg/_extractors/_presentation.py +62 -10
kreuzberg/_extractors/_spread_sheet.py +179 -4
kreuzberg/_extractors/_structured.py +148 -0
kreuzberg/_gmft.py +314 -7
kreuzberg/_mime_types.py +27 -1
kreuzberg/_ocr/__init__.py +10 -1
kreuzberg/_ocr/_base.py +59 -0
kreuzberg/_ocr/_easyocr.py +91 -0
kreuzberg/_ocr/_paddleocr.py +89 -0
kreuzberg/_ocr/_tesseract.py +564 -4
kreuzberg/_registry.py +4 -0
kreuzberg/_types.py +131 -0
kreuzberg/_utils/_cache.py +52 -4
kreuzberg/_utils/_errors.py +3 -7
kreuzberg/_utils/_process_pool.py +180 -7
kreuzberg/_utils/_quality.py +237 -0
kreuzberg/_utils/_serialization.py +4 -2
kreuzberg/_utils/_string.py +153 -10
kreuzberg/_utils/_sync.py +5 -2
kreuzberg/_utils/_table.py +261 -0
kreuzberg/cli.py +1 -2
kreuzberg/extraction.py +4 -22
{kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
kreuzberg-3.8.1.dist-info/RECORD +53 -0
kreuzberg/_multiprocessing/__init__.py +0 -6
kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
kreuzberg/_multiprocessing/process_manager.py +0 -189
kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
kreuzberg-3.7.0.dist-info/RECORD +0 -56
{kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_gmft.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from __future__ import annotations
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 if TYPE_CHECKING:
     from os import PathLike
@@ -196,9 +201,7 @@ async def extract_tables(  # noqa: PLR0915
     try:
         if use_isolated_process:
-            from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)
             await table_cache.aset(result, **cache_kwargs)
@@ -314,9 +317,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]
     if use_isolated_process:
-        from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)
         table_cache.set(result, **cache_kwargs)
@@ -378,3 +379,309 @@ def extract_tables_sync(
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        config = GMFTConfig(**config_dict)
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                import io
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+            result_queue.put((True, results))
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+    Returns:
+        List of extracted tables
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+    process.start()
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+                time.sleep(0.1)
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+                from PIL import Image
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+            return tables
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+    Returns:
+        List of extracted tables
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+    process.start()
+    try:
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+                from PIL import Image
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+            return tables
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)

kreuzberg/_mime_types.py CHANGED Viewed

@@ -17,6 +17,12 @@ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+EML_MIME_TYPE: Final = "message/rfc822"
+MSG_MIME_TYPE: Final = "application/vnd.ms-outlook"
+JSON_MIME_TYPE: Final = "application/json"
+YAML_MIME_TYPE: Final = "application/x-yaml"
+TOML_MIME_TYPE: Final = "application/toml"
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
 EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -127,6 +133,12 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
     ".org": "text/x-org",
     ".epub": "application/epub+zip",
     ".rtf": "application/rtf",
+    ".eml": EML_MIME_TYPE,
+    ".msg": MSG_MIME_TYPE,
+    ".json": JSON_MIME_TYPE,
+    ".yaml": YAML_MIME_TYPE,
+    ".yml": YAML_MIME_TYPE,
+    ".toml": TOML_MIME_TYPE,
     ".odt": "application/vnd.oasis.opendocument.text",
     ".docx": DOCX_MIME_TYPE,
     ".bib": "application/x-bibtex",
@@ -139,7 +151,21 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
     | SPREADSHEET_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+    | {
+        PDF_MIME_TYPE,
+        POWER_POINT_MIME_TYPE,
+        HTML_MIME_TYPE,
+        EML_MIME_TYPE,
+        MSG_MIME_TYPE,
+        JSON_MIME_TYPE,
+        YAML_MIME_TYPE,
+        TOML_MIME_TYPE,
+        "text/json",
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        "text/toml",
+    }
 )

kreuzberg/_ocr/__init__.py CHANGED Viewed

@@ -4,9 +4,18 @@ from typing import Any
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._easyocr import EasyOCRBackend
 from kreuzberg._ocr._paddleocr import PaddleBackend
-from kreuzberg._ocr._tesseract import TesseractBackend
+from kreuzberg._ocr._tesseract import TesseractBackend, TesseractProcessPool
 from kreuzberg._types import OcrBackendType
+__all__ = [
+    "EasyOCRBackend",
+    "OCRBackend",
+    "PaddleBackend",
+    "TesseractBackend",
+    "TesseractProcessPool",
+    "get_ocr_backend",
+]
 @lru_cache
 def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:

kreuzberg/_ocr/_base.py CHANGED Viewed

@@ -49,6 +49,65 @@ class OCRBackend(ABC, Generic[T]):
         """
         ...
+    @abstractmethod
+    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Any kwargs related to the given backend
+        Returns:
+            The extraction result object
+        """
+        ...
+    @abstractmethod
+    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Any kwargs related to the given backend
+        Returns:
+            The extraction result object
+        """
+        ...
+    def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
+        """Synchronously process a batch of files and extract their text and metadata.
+        Default implementation processes files sequentially. Backends can override
+        for more efficient batch processing.
+        Args:
+            paths: List of Path objects representing files to be processed.
+            **kwargs: Any kwargs related to the given backend
+        Returns:
+            List of extraction result objects in the same order as input paths
+        """
+        return [self.process_file_sync(path, **kwargs) for path in paths]
+    async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
+        """Asynchronously process a batch of files and extract their text and metadata.
+        Default implementation processes files concurrently. Backends can override
+        for more efficient batch processing.
+        Args:
+            paths: List of Path objects representing files to be processed.
+            **kwargs: Any kwargs related to the given backend
+        Returns:
+            List of extraction result objects in the same order as input paths
+        """
+        from kreuzberg._utils._sync import run_taskgroup
+        tasks = [self.process_file(path, **kwargs) for path in paths]
+        return await run_taskgroup(*tasks)
     def __hash__(self) -> int:
         """Hash function for allowing caching."""
         return hash(type(self).__name__)

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -4,6 +4,7 @@ import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+import numpy as np
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -440,3 +441,93 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             )
         return languages
+    def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata using EasyOCR.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        self._init_easyocr_sync(**kwargs)
+        beam_width = kwargs.pop("beam_width")
+        kwargs.pop("language", None)
+        kwargs.pop("use_gpu", None)
+        try:
+            result = self._reader.readtext(
+                np.array(image),
+                beamWidth=beam_width,
+                **kwargs,
+            )
+            return self._process_easyocr_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
+    def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata using EasyOCR.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        self._init_easyocr_sync(**kwargs)
+        try:
+            image = Image.open(path)
+            return self.process_image_sync(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
+    @classmethod
+    def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
+        """Synchronously initialize EasyOCR with the provided configuration.
+        Args:
+            **kwargs: Configuration parameters for EasyOCR including language, etc.
+        Raises:
+            MissingDependencyError: If EasyOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._reader is not None:
+            return
+        try:
+            import easyocr
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
+            ) from e
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type in ("cuda", "mps")
+        kwargs.setdefault("detector", True)
+        kwargs.setdefault("recognizer", True)
+        kwargs.setdefault("download_enabled", True)
+        kwargs.setdefault("recog_network", "standard")
+        try:
+            cls._reader = easyocr.Reader(
+                languages,
+                gpu=use_gpu,
+                verbose=False,
+            )
+        except Exception as e:
+            raise OCRError(f"Failed to initialize EasyOCR: {e}") from e

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -4,8 +4,10 @@ import platform
 import warnings
 from dataclasses import dataclass
 from importlib.util import find_spec
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+import numpy as np
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -364,3 +366,90 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
             },
         )
+    def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata using PaddleOCR.
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        self._init_paddle_ocr_sync(**kwargs)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        image_np = np.array(image)
+        try:
+            result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
+            return self._process_paddle_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
+    def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata using PaddleOCR.
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        self._init_paddle_ocr_sync(**kwargs)
+        try:
+            image = Image.open(path)
+            return self.process_image_sync(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
+    @classmethod
+    def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
+        """Synchronously initialize PaddleOCR with the provided configuration.
+        Args:
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+        Raises:
+            MissingDependencyError: If PaddleOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._paddle_ocr is not None:
+            return
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+            ) from e
+        language = cls._validate_language_code(kwargs.pop("language", "en"))
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type == "cuda"
+        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
+        kwargs.setdefault("use_angle_cls", True)
+        kwargs["use_gpu"] = use_gpu and has_gpu_package
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
+        kwargs.setdefault("det_db_thresh", 0.3)
+        kwargs.setdefault("det_db_box_thresh", 0.5)
+        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
+            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
+        try:
+            cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e

kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl