PyPI - kreuzberg - Versions diffs - 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl - Mend

kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

kreuzberg/__init__.py +6 -1
kreuzberg/_entity_extraction.py +239 -0
kreuzberg/_extractors/_image.py +21 -1
kreuzberg/_extractors/_pdf.py +44 -14
kreuzberg/_extractors/_spread_sheet.py +2 -2
kreuzberg/_gmft.py +4 -4
kreuzberg/_language_detection.py +95 -0
kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
kreuzberg/_multiprocessing/process_manager.py +2 -1
kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
kreuzberg/_ocr/_easyocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +7 -3
kreuzberg/_types.py +46 -4
kreuzberg/_utils/_device.py +2 -2
kreuzberg/_utils/_process_pool.py +2 -2
kreuzberg/_utils/_sync.py +1 -5
kreuzberg/_utils/_tmp.py +2 -2
kreuzberg/extraction.py +39 -12
{kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/METADATA +12 -4
{kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/RECORD +24 -20
{kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_multiprocessing/sync_easyocr.py ADDED Viewed

@@ -0,0 +1,235 @@
+"""Pure synchronous EasyOCR without any async overhead."""
+from __future__ import annotations
+import tempfile
+from pathlib import Path
+from typing import Any
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg.exceptions import MissingDependencyError, OCRError
+def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
+    """Get an EasyOCR Reader instance with the given configuration."""
+    try:
+        import easyocr
+    except ImportError as e:
+        raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
+    gpu = False
+    if hasattr(config, "device"):
+        if config.device and config.device.lower() != "cpu":
+            gpu = True
+    elif hasattr(config, "use_gpu"):
+        gpu = config.use_gpu
+    language = config.language if hasattr(config, "language") else "en"
+    if isinstance(language, str):
+        lang_list = [lang.strip().lower() for lang in language.split(",")]
+    else:
+        lang_list = [lang.lower() for lang in language]
+    kwargs = {
+        "lang_list": lang_list,
+        "gpu": gpu,
+        "model_storage_directory": getattr(config, "model_storage_directory", None),
+        "user_network_directory": getattr(config, "user_network_directory", None),
+        "recog_network": getattr(config, "recog_network", None),
+        "detector": getattr(config, "detector", None),
+        "recognizer": getattr(config, "recognizer", None),
+        "verbose": False,
+        "quantize": getattr(config, "quantize", None),
+        "cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
+    }
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    return easyocr.Reader(**kwargs)
+def process_image_sync_pure(
+    image_path: str | Path,
+    config: EasyOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process an image with EasyOCR using pure sync implementation.
+    This bypasses all async overhead and calls EasyOCR directly.
+    Args:
+        image_path: Path to the image file.
+        config: EasyOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    cfg = config or EasyOCRConfig()
+    try:
+        reader = _get_easyocr_instance(cfg)
+        readtext_kwargs = {
+            "decoder": cfg.decoder,
+            "beamWidth": cfg.beam_width,
+            "batch_size": getattr(cfg, "batch_size", 1),
+            "workers": getattr(cfg, "workers", 0),
+            "allowlist": getattr(cfg, "allowlist", None),
+            "blocklist": getattr(cfg, "blocklist", None),
+            "detail": getattr(cfg, "detail", 1),
+            "rotation_info": cfg.rotation_info,
+            "paragraph": getattr(cfg, "paragraph", False),
+            "min_size": cfg.min_size,
+            "text_threshold": cfg.text_threshold,
+            "low_text": cfg.low_text,
+            "link_threshold": cfg.link_threshold,
+            "canvas_size": cfg.canvas_size,
+            "mag_ratio": cfg.mag_ratio,
+            "slope_ths": cfg.slope_ths,
+            "ycenter_ths": cfg.ycenter_ths,
+            "height_ths": cfg.height_ths,
+            "width_ths": cfg.width_ths,
+            "add_margin": cfg.add_margin,
+            "x_ths": cfg.x_ths,
+            "y_ths": cfg.y_ths,
+        }
+        readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
+        results = reader.readtext(str(image_path), **readtext_kwargs)
+        if not results:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        texts = []
+        confidences = []
+        detail_value = getattr(cfg, "detail", 1)
+        if detail_value:
+            for result in results:
+                min_result_length = 2
+                max_confidence_index = 2
+                if len(result) >= min_result_length:
+                    _bbox, text = result[0], result[1]
+                    confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
+                    texts.append(text)
+                    confidences.append(confidence)
+        else:
+            texts = results
+            confidences = [1.0] * len(texts)
+        content = "\n".join(texts)
+        content = normalize_spaces(content)
+        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+        metadata = {"confidence": avg_confidence} if confidences else {}
+        return ExtractionResult(
+            content=content,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata=metadata,  # type: ignore[arg-type]
+            chunks=[],
+        )
+    except Exception as e:
+        raise OCRError(f"EasyOCR processing failed: {e}") from e
+def process_image_bytes_sync_pure(
+    image_bytes: bytes,
+    config: EasyOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process image bytes with EasyOCR using pure sync implementation.
+    Args:
+        image_bytes: Image data as bytes.
+        config: EasyOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    import io
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            image.save(tmp_image.name, format="PNG")
+        image_path = tmp_image.name
+    try:
+        return process_image_sync_pure(image_path, config)
+    finally:
+        image_file = Path(image_path)
+        if image_file.exists():
+            image_file.unlink()
+def process_batch_images_sync_pure(
+    image_paths: list[str | Path],
+    config: EasyOCRConfig | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images sequentially with pure sync implementation.
+    Args:
+        image_paths: List of image file paths.
+        config: EasyOCR configuration.
+    Returns:
+        List of extraction results.
+    """
+    results = []
+    for image_path in image_paths:
+        result = process_image_sync_pure(image_path, config)
+        results.append(result)
+    return results
+def process_batch_images_threaded(
+    image_paths: list[str | Path],
+    config: EasyOCRConfig | None = None,
+    max_workers: int | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images using threading.
+    Args:
+        image_paths: List of image file paths.
+        config: EasyOCR configuration.
+        max_workers: Maximum number of threads.
+    Returns:
+        List of extraction results in same order as input.
+    """
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    if max_workers is None:
+        max_workers = min(len(image_paths), mp.cpu_count())
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_index = {
+            executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
+        }
+        results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index = future_to_index[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:  # noqa: BLE001
+                results[index] = ExtractionResult(
+                    content=f"Error: {e}",
+                    mime_type=PLAIN_TEXT_MIME_TYPE,
+                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
+                    chunks=[],
+                )
+    return results

kreuzberg/_multiprocessing/sync_paddleocr.py ADDED Viewed

@@ -0,0 +1,199 @@
+"""Pure synchronous PaddleOCR without any async overhead."""
+from __future__ import annotations
+import tempfile
+from pathlib import Path
+from typing import Any
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg.exceptions import MissingDependencyError, OCRError
+def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
+    """Get a PaddleOCR instance with the given configuration."""
+    try:
+        import paddleocr
+    except ImportError as e:
+        raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e
+    if hasattr(config, "device"):
+        if config.device and config.device.lower() != "cpu":
+            pass
+    elif hasattr(config, "use_gpu"):
+        pass
+    kwargs = {
+        "lang": config.language,
+        "use_textline_orientation": config.use_angle_cls,
+    }
+    if hasattr(config, "det_db_thresh"):
+        kwargs["text_det_thresh"] = config.det_db_thresh
+    if hasattr(config, "det_db_box_thresh"):
+        kwargs["text_det_box_thresh"] = config.det_db_box_thresh
+    if hasattr(config, "det_db_unclip_ratio"):
+        kwargs["text_det_unclip_ratio"] = config.det_db_unclip_ratio
+    if hasattr(config, "det_max_side_len"):
+        kwargs["text_det_limit_side_len"] = config.det_max_side_len
+    if hasattr(config, "drop_score"):
+        kwargs["text_rec_score_thresh"] = config.drop_score
+    return paddleocr.PaddleOCR(**kwargs)
+def process_image_sync_pure(
+    image_path: str | Path,
+    config: PaddleOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process an image with PaddleOCR using pure sync implementation.
+    This bypasses all async overhead and calls PaddleOCR directly.
+    Args:
+        image_path: Path to the image file.
+        config: PaddleOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    cfg = config or PaddleOCRConfig()
+    try:
+        ocr_instance = _get_paddleocr_instance(cfg)
+        results = ocr_instance.ocr(str(image_path))
+        if not results or not results[0]:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        ocr_result = results[0]
+        result_data = ocr_result.json["res"]
+        texts = result_data.get("rec_texts", [])
+        scores = result_data.get("rec_scores", [])
+        if not texts:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        content = "\n".join(texts)
+        content = normalize_spaces(content)
+        avg_confidence = sum(scores) / len(scores) if scores else 0.0
+        metadata = {"confidence": avg_confidence} if scores else {}
+        return ExtractionResult(
+            content=content,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata=metadata,  # type: ignore[arg-type]
+            chunks=[],
+        )
+    except Exception as e:
+        raise OCRError(f"PaddleOCR processing failed: {e}") from e
+def process_image_bytes_sync_pure(
+    image_bytes: bytes,
+    config: PaddleOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process image bytes with PaddleOCR using pure sync implementation.
+    Args:
+        image_bytes: Image data as bytes.
+        config: PaddleOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    import io
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            image.save(tmp_image.name, format="PNG")
+        image_path = tmp_image.name
+    try:
+        return process_image_sync_pure(image_path, config)
+    finally:
+        image_file = Path(image_path)
+        if image_file.exists():
+            image_file.unlink()
+def process_batch_images_sync_pure(
+    image_paths: list[str | Path],
+    config: PaddleOCRConfig | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images sequentially with pure sync implementation.
+    Args:
+        image_paths: List of image file paths.
+        config: PaddleOCR configuration.
+    Returns:
+        List of extraction results.
+    """
+    results = []
+    for image_path in image_paths:
+        result = process_image_sync_pure(image_path, config)
+        results.append(result)
+    return results
+def process_batch_images_threaded(
+    image_paths: list[str | Path],
+    config: PaddleOCRConfig | None = None,
+    max_workers: int | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images using threading.
+    Args:
+        image_paths: List of image file paths.
+        config: PaddleOCR configuration.
+        max_workers: Maximum number of threads.
+    Returns:
+        List of extraction results in same order as input.
+    """
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    if max_workers is None:
+        max_workers = min(len(image_paths), mp.cpu_count())
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_index = {
+            executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
+        }
+        results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index = future_to_index[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:  # noqa: BLE001
+                results[index] = ExtractionResult(
+                    content=f"Error: {e}",
+                    mime_type=PLAIN_TEXT_MIME_TYPE,
+                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
+                    chunks=[],
+                )
+    return results

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -319,7 +319,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         try:
             import torch
-            return torch.cuda.is_available()
+            return bool(torch.cuda.is_available())
         except ImportError:
             return False

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -202,9 +202,11 @@ class TesseractConfig:
             -   'deu' for German
             -    multiple languages combined with '+', e.g. 'eng+deu')
     """
-    language_model_ngram_on: bool = True
-    """Enable or disable the use of n-gram-based language models for improved text recognition."""
-    psm: PSMMode = PSMMode.AUTO
+    language_model_ngram_on: bool = False
+    """Enable or disable the use of n-gram-based language models for improved text recognition.
+    Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
+    psm: PSMMode = PSMMode.AUTO_ONLY
     """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
     tessedit_dont_blkrej_good_wds: bool = True
     """If True, prevents block rejection of words identified as good, improving text output quality."""
@@ -212,6 +214,8 @@ class TesseractConfig:
     """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
     tessedit_enable_dict_correction: bool = True
     """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
+    tessedit_char_whitelist: str = ""
+    """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
     tessedit_use_primary_params_model: bool = True
     """If True, forces the use of the primary parameters model for text recognition."""
     textord_space_size_is_variable: bool = True

kreuzberg/_types.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
 import sys
-from collections.abc import Awaitable
+from collections.abc import Awaitable, Callable
 from dataclasses import asdict, dataclass, field
-from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
+from typing import TYPE_CHECKING, Any, Literal, TypedDict
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
 from kreuzberg.exceptions import ValidationError
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
     from pandas import DataFrame
     from PIL.Image import Image
+    from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
     from kreuzberg._gmft import GMFTConfig
+    from kreuzberg._language_detection import LanguageDetectionConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
     from kreuzberg._ocr._paddleocr import PaddleOCRConfig
     from kreuzberg._ocr._tesseract import TesseractConfig
@@ -99,6 +101,20 @@ class Metadata(TypedDict, total=False):
     """Width of the document page/slide/image, if applicable."""
+@dataclass(frozen=True)
+class Entity:
+    """Represents an extracted entity with type, text, and position.
+    The dataclass is declared with ``frozen=True``, so instances are immutable after construction.
+    """
+    type: str
+    """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
+    text: str
+    """Extracted text"""
+    start: int
+    """Start character offset in the content"""
+    end: int
+    """End character offset in the content"""
 @dataclass
 class ExtractionResult:
     """The result of a file extraction."""
@@ -113,14 +129,20 @@ class ExtractionResult:
     """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    entities: list[Entity] | None = None
+    """Extracted entities, if entity extraction is enabled."""
+    keywords: list[tuple[str, float]] | None = None
+    """Extracted keywords and their scores, if keyword extraction is enabled."""
+    detected_languages: list[str] | None = None
+    """Languages detected in the extracted content, if language detection is enabled."""
     def to_dict(self) -> dict[str, Any]:
         """Converts the ExtractionResult to a dictionary."""
         return asdict(self)
-PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
-ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
+PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
+ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 @dataclass(unsafe_hash=True)
@@ -157,8 +179,28 @@ class ExtractionConfig:
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None
     """Validation hooks to call after processing is done and before post-processing and result return."""
+    extract_entities: bool = False
+    """Whether to extract named entities from the content."""
+    extract_keywords: bool = False
+    """Whether to extract keywords from the content."""
+    keyword_count: int = 10
+    """Number of keywords to extract if extract_keywords is True."""
+    custom_entity_patterns: frozenset[tuple[str, str]] | None = None
+    """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
+    auto_detect_language: bool = False
+    """Whether to automatically detect language and configure OCR accordingly."""
+    language_detection_config: LanguageDetectionConfig | None = None
+    """Configuration for language detection. If None, uses default settings."""
+    spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
+    """Configuration for spaCy entity extraction. If None, uses default settings."""
     def __post_init__(self) -> None:
+        if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
+            object.__setattr__(self, "custom_entity_patterns", frozenset(self.custom_entity_patterns.items()))
+        if self.post_processing_hooks is not None and isinstance(self.post_processing_hooks, list):
+            object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
+        if self.validators is not None and isinstance(self.validators, list):
+            object.__setattr__(self, "validators", tuple(self.validators))
         from kreuzberg._ocr._easyocr import EasyOCRConfig
         from kreuzberg._ocr._paddleocr import PaddleOCRConfig
         from kreuzberg._ocr._tesseract import TesseractConfig

kreuzberg/_utils/_device.py CHANGED Viewed

@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
     try:
         import torch  # type: ignore[import-not-found,unused-ignore]
-        return torch.cuda.is_available()
+        return bool(torch.cuda.is_available())
     except ImportError:
         return False
@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
     try:
         import torch  # type: ignore[import-not-found,unused-ignore]
-        return torch.backends.mps.is_available()
+        return bool(torch.backends.mps.is_available())
     except ImportError:
         return False

kreuzberg/_utils/_process_pool.py CHANGED Viewed

@@ -5,10 +5,10 @@ from __future__ import annotations
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 if TYPE_CHECKING:
-    from collections.abc import Generator
+    from collections.abc import Callable, Generator
 T = TypeVar("T")

kreuzberg/_utils/_sync.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-import sys
 from functools import partial
 from inspect import isawaitable, iscoroutinefunction
 from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Awaitable, Callable
-if sys.version_info >= (3, 10):
-    from typing import ParamSpec
-else:  # pragma: no cover
-    from typing_extensions import ParamSpec
+from typing import ParamSpec
 T = TypeVar("T")
 P = ParamSpec("P")

kreuzberg/_utils/_tmp.py CHANGED Viewed

@@ -3,14 +3,14 @@ from __future__ import annotations
 from contextlib import suppress
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING
 from anyio import Path as AsyncPath
 from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Coroutine
+    from collections.abc import Callable, Coroutine
 async def create_temp_file(

kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl

kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl