kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +6 -1
- kreuzberg/_entity_extraction.py +239 -0
- kreuzberg/_extractors/_image.py +21 -1
- kreuzberg/_extractors/_pdf.py +44 -14
- kreuzberg/_extractors/_spread_sheet.py +2 -2
- kreuzberg/_gmft.py +4 -4
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +7 -3
- kreuzberg/_types.py +46 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_process_pool.py +2 -2
- kreuzberg/_utils/_sync.py +1 -5
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/extraction.py +39 -12
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/METADATA +12 -4
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/RECORD +24 -20
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,13 +1,15 @@
 from importlib.metadata import version
 
+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -24,17 +26,20 @@ __version__ = version("kreuzberg")
 
 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",
kreuzberg/_entity_extraction.py
ADDED
@@ -0,0 +1,239 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg._types import Entity
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class SpacyEntityExtractionConfig:
+    """Configuration for spaCy-based entity extraction."""
+
+    model_cache_dir: str | Path | None = None
+    """Directory to cache spaCy models. If None, uses spaCy's default."""
+
+    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
+    """Mapping of language codes to spaCy model names.
+
+    If None, uses default mappings:
+    - en: en_core_web_sm
+    - de: de_core_news_sm
+    - fr: fr_core_news_sm
+    - es: es_core_news_sm
+    - pt: pt_core_news_sm
+    - it: it_core_news_sm
+    - nl: nl_core_news_sm
+    - zh: zh_core_web_sm
+    - ja: ja_core_news_sm
+    """
+
+    fallback_to_multilingual: bool = True
+    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
+
+    max_doc_length: int = 1000000
+    """Maximum document length for spaCy processing."""
+
+    batch_size: int = 1000
+    """Batch size for processing multiple texts."""
+
+    def __post_init__(self) -> None:
+        if self.language_models is None:
+            object.__setattr__(self, "language_models", self._get_default_language_models())
+
+        if isinstance(self.language_models, dict):
+            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
+
+    @staticmethod
+    def _get_default_language_models() -> dict[str, str]:
+        """Get default language model mappings based on available spaCy models."""
+        return {
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+            "es": "es_core_news_sm",
+            "pt": "pt_core_news_sm",
+            "it": "it_core_news_sm",
+            "nl": "nl_core_news_sm",
+            "zh": "zh_core_web_sm",
+            "ja": "ja_core_news_sm",
+            "ko": "ko_core_news_sm",
+            "ru": "ru_core_news_sm",
+            "pl": "pl_core_news_sm",
+            "ro": "ro_core_news_sm",
+            "el": "el_core_news_sm",
+            "da": "da_core_news_sm",
+            "fi": "fi_core_news_sm",
+            "nb": "nb_core_news_sm",
+            "sv": "sv_core_news_sm",
+            "ca": "ca_core_news_sm",
+            "hr": "hr_core_news_sm",
+            "lt": "lt_core_news_sm",
+            "mk": "mk_core_news_sm",
+            "sl": "sl_core_news_sm",
+            "uk": "uk_core_news_sm",
+        }
+
+    def get_model_for_language(self, language_code: str) -> str | None:
+        """Get the appropriate spaCy model for a language code."""
+        if not self.language_models:
+            return None
+
+        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
+
+        if language_code in models_dict:
+            return models_dict[language_code]
+
+        base_lang = language_code.split("-")[0].lower()
+        if base_lang in models_dict:
+            return models_dict[base_lang]
+
+        return None
+
+    def get_fallback_model(self) -> str | None:
+        """Get fallback multilingual model if enabled."""
+        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
+
+
+def extract_entities(
+    text: str,
+    entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
+    custom_patterns: frozenset[tuple[str, str]] | None = None,
+    languages: list[str] | None = None,
+    spacy_config: SpacyEntityExtractionConfig | None = None,
+) -> list[Entity]:
+    """Extract entities from text using custom regex patterns and/or a NER model.
+
+    Args:
+        text: The input text to extract entities from.
+        entity_types: List of entity types to extract using the NER model.
+        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
+        languages: List of detected languages to choose appropriate spaCy models.
+        spacy_config: Configuration for spaCy entity extraction.
+
+    Returns:
+        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
+
+    Raises:
+        MissingDependencyError: If `spacy` is not installed.
+    """
+    entities: list[Entity] = []
+    if custom_patterns:
+        custom_patterns_dict = dict(custom_patterns)
+        for ent_type, pattern in custom_patterns_dict.items():
+            entities.extend(
+                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                for match in re.finditer(pattern, text)
+            )
+
+    if spacy_config is None:
+        spacy_config = SpacyEntityExtractionConfig()
+
+    try:
+        import spacy  # noqa: F401
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="spacy",
+            dependency_group="entity-extraction",
+            functionality="Entity Extraction",
+        ) from e
+
+    model_name = _select_spacy_model(languages, spacy_config)
+    if not model_name:
+        return entities
+
+    nlp = _load_spacy_model(model_name, spacy_config)
+    if not nlp:
+        return entities
+
+    if len(text) > spacy_config.max_doc_length:
+        text = text[: spacy_config.max_doc_length]
+
+    doc = nlp(text)
+
+    entity_type_mapping = {etype.upper() for etype in entity_types}
+
+    entities.extend(
+        Entity(
+            type=ent.label_,
+            text=ent.text,
+            start=ent.start_char,
+            end=ent.end_char,
+        )
+        for ent in doc.ents
+        if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
+    )
+
+    return entities
+
+
+@lru_cache(maxsize=32)
+def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
+    """Load a spaCy model with caching."""
+    try:
+        import spacy
+
+        if spacy_config.model_cache_dir:
+            import os
+
+            os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
+
+        nlp = spacy.load(model_name)
+
+        nlp.max_length = spacy_config.max_doc_length
+
+        return nlp
+    except (OSError, ImportError):
+        return None
+
+
+def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
+    """Select the best spaCy model based on detected languages."""
+    if not languages:
+        return spacy_config.get_model_for_language("en")
+
+    for lang in languages:
+        model_name = spacy_config.get_model_for_language(lang)
+        if model_name:
+            return model_name
+
+    return spacy_config.get_fallback_model()
+
+
+def extract_keywords(
+    text: str,
+    keyword_count: int = 10,
+) -> list[tuple[str, float]]:
+    """Extract keywords from text using the KeyBERT model.
+
+    Args:
+        text: The input text to extract keywords from.
+        keyword_count: Number of top keywords to return. Defaults to 10.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
+
+    Raises:
+        MissingDependencyError: If `keybert` is not installed.
+    """
+    try:
+        from keybert import KeyBERT
+
+        kw_model = KeyBERT()
+        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
+        return [(kw, float(score)) for kw, score in keywords]
+    except (RuntimeError, OSError, ValueError):
+        return []
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="keybert",
+            dependency_group="entity-extraction",
+            functionality="Keyword Extraction",
+        ) from e
kreuzberg/_extractors/_image.py
CHANGED
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-        from kreuzberg._ocr._tesseract import TesseractConfig
         from kreuzberg._types import ExtractionResult
 
         if self.config.ocr_backend == "tesseract":
             from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
 
             if isinstance(self.config.ocr_config, TesseractConfig):
                 config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
                 return results[0]
             return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
 
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+
+            return paddle_process(path, paddle_config)
+
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+
+            return easy_process(path, easy_config)
+
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
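The new branches are easiest to exercise through kreuzberg's public sync entry point; a sketch, assuming the paddleocr extra is installed and a hypothetical scan.png exists:

from kreuzberg import ExtractionConfig, PaddleOCRConfig, extract_file_sync

# With ocr_backend="paddleocr", sync image extraction now dispatches to the
# new sync_paddleocr path instead of raising NotImplementedError.
config = ExtractionConfig(ocr_backend="paddleocr", ocr_config=PaddleOCRConfig())
result = extract_file_sync("scan.png", config=config)
print(result.content)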
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
             images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@
                 os.close(fd)
                 image_paths.append(temp_path)
 
-
-            from kreuzberg._ocr._tesseract import TesseractConfig
-
-            if isinstance(self.config.ocr_config, TesseractConfig):
-                config = self.config.ocr_config
-            else:
-                config = TesseractConfig()
-            results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-            text_parts = [r.content for r in results]
-            return "\n\n".join(text_parts)
-
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            return self._process_pdf_images_with_ocr(image_paths)
 
         finally:
             for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        if self.config.ocr_backend == "tesseract":
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
+
+            tesseract_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
+            text_parts = [r.content for r in results]
+            return "\n\n".join(text_parts)
+
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+
+            text_parts = []
+            for image_path in image_paths:
+                result = paddle_process(Path(image_path), paddle_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+
+            text_parts = []
+            for image_path in image_paths:
+                result = easy_process(Path(image_path), easy_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+
+        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -6,7 +6,7 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
 
-CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 
 
 class SpreadSheetExtractor(Extractor):
kreuzberg/_gmft.py
CHANGED
@@ -210,7 +210,7 @@ async def extract_tables(  # noqa: PLR0915
     from gmft.formatters.tatr import TATRFormatConfig
     from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-    formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+    formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
         config=TATRFormatConfig(
             verbosity=config.verbosity,
             formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +226,7 @@ async def extract_tables(  # noqa: PLR0915
             force_large_table_assumption=config.force_large_table_assumption,
         )
     )
-    detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+    detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
         config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
     )
     doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +247,7 @@ async def extract_tables(  # noqa: PLR0915
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         await table_cache.aset(result, **cache_kwargs)
@@ -365,7 +365,7 @@ def extract_tables_sync(
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         table_cache.set(result, **cache_kwargs)
kreuzberg/_language_detection.py
ADDED
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+
+try:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+    from fast_langdetect import detect, detect_multilingual
+
+    HAS_FAST_LANGDETECT = True
+except ImportError:
+    HAS_FAST_LANGDETECT = False
+    detect = None
+    detect_multilingual = None
+    FastLangDetectConfig = None
+
+_CACHE_SIZE = 128
+
+
+@dataclass(frozen=True)
+class LanguageDetectionConfig:
+    """Configuration for language detection.
+
+    Attributes:
+        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+            Defaults to True for better memory efficiency.
+        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+        multilingual: If True, uses multilingual detection to handle mixed-language text.
+            If False, uses single language detection. Defaults to False.
+        cache_dir: Custom directory for model cache. If None, uses system default.
+        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+    """
+
+    low_memory: bool = True
+    top_k: int = 3
+    multilingual: bool = False
+    cache_dir: str | None = None
+    allow_fallback: bool = True
+
+
+def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+    """Create FastLangDetectConfig from our config."""
+    if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+        return None
+
+    kwargs: dict[str, Any] = {
+        "allow_fallback": config.allow_fallback,
+    }
+    if config.cache_dir is not None:
+        kwargs["cache_dir"] = config.cache_dir
+
+    return FastLangDetectConfig(**kwargs)
+
+
+@lru_cache(maxsize=_CACHE_SIZE)
+def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+    """Detect the most probable languages in the given text using fast-langdetect.
+
+    Args:
+        text: The text to analyze.
+        config: Configuration for language detection. If None, uses defaults.
+
+    Returns:
+        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+        or None if detection fails.
+
+    Raises:
+        MissingDependencyError: If fast-langdetect is not installed.
+    """
+    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+        )
+
+    if config is None:
+        config = LanguageDetectionConfig()
+
+    try:
+        if config.multilingual:
+            results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+
+            return [result["lang"].lower() for result in results if result.get("lang")]
+
+        result = detect(text, low_memory=config.low_memory)
+        if result and result.get("lang"):
+            return [result["lang"].lower()]
+        return None
+    except Exception:  # noqa: BLE001
+        return None
kreuzberg/_multiprocessing/gmft_isolated.py
CHANGED
@@ -56,9 +56,7 @@ def _extract_tables_in_process(
             force_large_table_assumption=config.force_large_table_assumption,
         )
     )
-    detector = AutoTableDetector(  # type: ignore[no-untyped-call]
-        config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
-    )
+    detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
 
     doc = PyPDFium2Document(str(file_path))
     cropped_tables = []
@@ -73,7 +71,7 @@ def _extract_tables_in_process(
         dataframes.append(formatted_table.df())
 
     results = []
-    for data_frame, cropped_table in zip(dataframes, cropped_tables):
+    for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
         import io
 
         img_bytes = io.BytesIO()
kreuzberg/_multiprocessing/process_manager.py
CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
 import psutil
@@ -12,6 +12,7 @@ from typing_extensions import Self
 
 if TYPE_CHECKING:
     import types
+    from collections.abc import Callable
 
 T = TypeVar("T")
 
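Callable moves from a runtime typing import to a type-check-only collections.abc import, the convention enforced by lint rules in ruff's TC/UP families. A generic sketch of the pattern, not kreuzberg's code:

from __future__ import annotations

from typing import TYPE_CHECKING, TypeVar

if TYPE_CHECKING:
    # Imported only while type checking; no runtime cost or import-cycle risk.
    from collections.abc import Callable

T = TypeVar("T")


def submit(fn: Callable[[], T]) -> T:
    # With `from __future__ import annotations`, the annotation is never
    # evaluated at runtime, so the guarded import suffices.
    return fn()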