PyPI - kreuzberg - Versions diffs - 3.4.1__py3-none-any.whl → 3.5.0__py3-none-any.whl - Mend

kreuzberg-3.4.1-py3-none-any.whl → kreuzberg-3.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

kreuzberg/__init__.py +2 -0
kreuzberg/_extractors/_image.py +21 -1
kreuzberg/_extractors/_pdf.py +44 -14
kreuzberg/_extractors/_spread_sheet.py +2 -2
kreuzberg/_gmft.py +4 -4
kreuzberg/_language_detection.py +95 -0
kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
kreuzberg/_multiprocessing/process_manager.py +2 -1
kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
kreuzberg/_ocr/_easyocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +7 -3
kreuzberg/_types.py +11 -4
kreuzberg/_utils/_device.py +2 -2
kreuzberg/_utils/_process_pool.py +2 -2
kreuzberg/_utils/_sync.py +1 -5
kreuzberg/_utils/_tmp.py +2 -2
kreuzberg/extraction.py +10 -0
{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/METADATA +4 -3
{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/RECORD +23 -20
{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from importlib.metadata import version
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
@@ -29,6 +30,7 @@ __all__ = [
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",

kreuzberg/_extractors/_image.py CHANGED Viewed

@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
-        from kreuzberg._ocr._tesseract import TesseractConfig
         from kreuzberg._types import ExtractionResult
         if self.config.ocr_backend == "tesseract":
             from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
             if isinstance(self.config.ocr_config, TesseractConfig):
                 config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
                 return results[0]
             return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            return paddle_process(path, paddle_config)
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            return easy_process(path, easy_config)
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
     def _get_extension_from_mime_type(self, mime_type: str) -> str:

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
             images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
                     os.close(fd)
                     image_paths.append(temp_path)
-                if self.config.ocr_backend == "tesseract":
-                    from kreuzberg._ocr._tesseract import TesseractConfig
-                    if isinstance(self.config.ocr_config, TesseractConfig):
-                        config = self.config.ocr_config
-                    else:
-                        config = TesseractConfig()
-                    results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-                    text_parts = [r.content for r in results]
-                    return "\n\n".join(text_parts)
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+                return self._process_pdf_images_with_ocr(image_paths)
             finally:
                 for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        if self.config.ocr_backend == "tesseract":
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
+            tesseract_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
+            text_parts = [r.content for r in results]
+            return "\n\n".join(text_parts)
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            text_parts = []
+            for image_path in image_paths:
+                result = paddle_process(Path(image_path), paddle_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            text_parts = []
+            for image_path in image_paths:
+                result = easy_process(Path(image_path), easy_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

kreuzberg/_extractors/_spread_sheet.py CHANGED Viewed

@@ -6,7 +6,7 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
-CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 class SpreadSheetExtractor(Extractor):

kreuzberg/_gmft.py CHANGED Viewed

@@ -210,7 +210,7 @@ async def extract_tables(  # noqa: PLR0915
             from gmft.formatters.tatr import TATRFormatConfig
             from gmft.pdf_bindings.pdfium import PyPDFium2Document
-            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
                 config=TATRFormatConfig(
                     verbosity=config.verbosity,
                     formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +226,7 @@ async def extract_tables(  # noqa: PLR0915
                     force_large_table_assumption=config.force_large_table_assumption,
                 )
             )
-            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
                 config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
             )
             doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +247,7 @@ async def extract_tables(  # noqa: PLR0915
                         text=data_frame.to_markdown(),
                         df=data_frame,
                     )
-                    for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                    for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
                 ]
                 await table_cache.aset(result, **cache_kwargs)
@@ -365,7 +365,7 @@ def extract_tables_sync(
                     text=data_frame.to_markdown(),
                     df=data_frame,
                 )
-                for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
             ]
             table_cache.set(result, **cache_kwargs)

kreuzberg/_language_detection.py ADDED Viewed

@@ -0,0 +1,95 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+from kreuzberg.exceptions import MissingDependencyError
+if TYPE_CHECKING:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+try:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+    from fast_langdetect import detect, detect_multilingual
+    HAS_FAST_LANGDETECT = True
+except ImportError:
+    HAS_FAST_LANGDETECT = False
+    detect = None
+    detect_multilingual = None
+    FastLangDetectConfig = None
+_CACHE_SIZE = 128
+@dataclass(frozen=True)
+class LanguageDetectionConfig:
+    """Configuration for language detection.
+    Attributes:
+        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+            Defaults to True for better memory efficiency.
+        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+        multilingual: If True, uses multilingual detection to handle mixed-language text.
+            If False, uses single language detection. Defaults to False.
+        cache_dir: Custom directory for model cache. If None, uses system default.
+        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+    """
+    low_memory: bool = True
+    top_k: int = 3
+    multilingual: bool = False
+    cache_dir: str | None = None
+    allow_fallback: bool = True
+def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+    """Create FastLangDetectConfig from our config."""
+    if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+        return None
+    kwargs: dict[str, Any] = {
+        "allow_fallback": config.allow_fallback,
+    }
+    if config.cache_dir is not None:
+        kwargs["cache_dir"] = config.cache_dir
+    return FastLangDetectConfig(**kwargs)
+@lru_cache(maxsize=_CACHE_SIZE)
+def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+    """Detect the most probable languages in the given text using fast-langdetect.
+    Args:
+        text: The text to analyze.
+        config: Configuration for language detection. If None, uses defaults.
+    Returns:
+        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+        or None if detection fails.
+    Raises:
+        MissingDependencyError: If fast-langdetect is not installed.
+    """
+    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+        )
+    if config is None:
+        config = LanguageDetectionConfig()
+    try:
+        if config.multilingual:
+            results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+            return [result["lang"].lower() for result in results if result.get("lang")]
+        result = detect(text, low_memory=config.low_memory)
+        if result and result.get("lang"):
+            return [result["lang"].lower()]
+        return None
+    except Exception:  # noqa: BLE001
+        return None

kreuzberg/_multiprocessing/gmft_isolated.py CHANGED Viewed

@@ -56,9 +56,7 @@ def _extract_tables_in_process(
                 force_large_table_assumption=config.force_large_table_assumption,
             )
         )
-        detector = AutoTableDetector(  # type: ignore[no-untyped-call]
-            config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
-        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
         doc = PyPDFium2Document(str(file_path))
         cropped_tables = []
@@ -73,7 +71,7 @@ def _extract_tables_in_process(
                 dataframes.append(formatted_table.df())
             results = []
-            for data_frame, cropped_table in zip(dataframes, cropped_tables):
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
                 import io
                 img_bytes = io.BytesIO()

kreuzberg/_multiprocessing/process_manager.py CHANGED Viewed

@@ -4,7 +4,7 @@ from __future__ import annotations
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 import anyio
 import psutil
@@ -12,6 +12,7 @@ from typing_extensions import Self
 if TYPE_CHECKING:
     import types
+    from collections.abc import Callable
 T = TypeVar("T")

kreuzberg/_multiprocessing/sync_easyocr.py ADDED Viewed

@@ -0,0 +1,235 @@
+"""Pure synchronous EasyOCR without any async overhead."""
+from __future__ import annotations
+import tempfile
+from pathlib import Path
+from typing import Any
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg.exceptions import MissingDependencyError, OCRError
+def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
+    """Get an EasyOCR Reader instance with the given configuration."""
+    try:
+        import easyocr
+    except ImportError as e:
+        raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
+    gpu = False
+    if hasattr(config, "device"):
+        if config.device and config.device.lower() != "cpu":
+            gpu = True
+    elif hasattr(config, "use_gpu"):
+        gpu = config.use_gpu
+    language = config.language if hasattr(config, "language") else "en"
+    if isinstance(language, str):
+        lang_list = [lang.strip().lower() for lang in language.split(",")]
+    else:
+        lang_list = [lang.lower() for lang in language]
+    kwargs = {
+        "lang_list": lang_list,
+        "gpu": gpu,
+        "model_storage_directory": getattr(config, "model_storage_directory", None),
+        "user_network_directory": getattr(config, "user_network_directory", None),
+        "recog_network": getattr(config, "recog_network", None),
+        "detector": getattr(config, "detector", None),
+        "recognizer": getattr(config, "recognizer", None),
+        "verbose": False,
+        "quantize": getattr(config, "quantize", None),
+        "cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
+    }
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    return easyocr.Reader(**kwargs)
+def process_image_sync_pure(
+    image_path: str | Path,
+    config: EasyOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process an image with EasyOCR using pure sync implementation.
+    This bypasses all async overhead and calls EasyOCR directly.
+    Args:
+        image_path: Path to the image file.
+        config: EasyOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    cfg = config or EasyOCRConfig()
+    try:
+        reader = _get_easyocr_instance(cfg)
+        readtext_kwargs = {
+            "decoder": cfg.decoder,
+            "beamWidth": cfg.beam_width,
+            "batch_size": getattr(cfg, "batch_size", 1),
+            "workers": getattr(cfg, "workers", 0),
+            "allowlist": getattr(cfg, "allowlist", None),
+            "blocklist": getattr(cfg, "blocklist", None),
+            "detail": getattr(cfg, "detail", 1),
+            "rotation_info": cfg.rotation_info,
+            "paragraph": getattr(cfg, "paragraph", False),
+            "min_size": cfg.min_size,
+            "text_threshold": cfg.text_threshold,
+            "low_text": cfg.low_text,
+            "link_threshold": cfg.link_threshold,
+            "canvas_size": cfg.canvas_size,
+            "mag_ratio": cfg.mag_ratio,
+            "slope_ths": cfg.slope_ths,
+            "ycenter_ths": cfg.ycenter_ths,
+            "height_ths": cfg.height_ths,
+            "width_ths": cfg.width_ths,
+            "add_margin": cfg.add_margin,
+            "x_ths": cfg.x_ths,
+            "y_ths": cfg.y_ths,
+        }
+        readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
+        results = reader.readtext(str(image_path), **readtext_kwargs)
+        if not results:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        texts = []
+        confidences = []
+        detail_value = getattr(cfg, "detail", 1)
+        if detail_value:
+            for result in results:
+                min_result_length = 2
+                max_confidence_index = 2
+                if len(result) >= min_result_length:
+                    _bbox, text = result[0], result[1]
+                    confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
+                    texts.append(text)
+                    confidences.append(confidence)
+        else:
+            texts = results
+            confidences = [1.0] * len(texts)
+        content = "\n".join(texts)
+        content = normalize_spaces(content)
+        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+        metadata = {"confidence": avg_confidence} if confidences else {}
+        return ExtractionResult(
+            content=content,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata=metadata,  # type: ignore[arg-type]
+            chunks=[],
+        )
+    except Exception as e:
+        raise OCRError(f"EasyOCR processing failed: {e}") from e
+def process_image_bytes_sync_pure(
+    image_bytes: bytes,
+    config: EasyOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process image bytes with EasyOCR using pure sync implementation.
+    Args:
+        image_bytes: Image data as bytes.
+        config: EasyOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    import io
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            image.save(tmp_image.name, format="PNG")
+        image_path = tmp_image.name
+    try:
+        return process_image_sync_pure(image_path, config)
+    finally:
+        image_file = Path(image_path)
+        if image_file.exists():
+            image_file.unlink()
+def process_batch_images_sync_pure(
+    image_paths: list[str | Path],
+    config: EasyOCRConfig | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images sequentially with pure sync implementation.
+    Args:
+        image_paths: List of image file paths.
+        config: EasyOCR configuration.
+    Returns:
+        List of extraction results.
+    """
+    results = []
+    for image_path in image_paths:
+        result = process_image_sync_pure(image_path, config)
+        results.append(result)
+    return results
+def process_batch_images_threaded(
+    image_paths: list[str | Path],
+    config: EasyOCRConfig | None = None,
+    max_workers: int | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images using threading.
+    Args:
+        image_paths: List of image file paths.
+        config: EasyOCR configuration.
+        max_workers: Maximum number of threads.
+    Returns:
+        List of extraction results in same order as input.
+    """
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    if max_workers is None:
+        max_workers = min(len(image_paths), mp.cpu_count())
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_index = {
+            executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
+        }
+        results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index = future_to_index[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:  # noqa: BLE001
+                results[index] = ExtractionResult(
+                    content=f"Error: {e}",
+                    mime_type=PLAIN_TEXT_MIME_TYPE,
+                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
+                    chunks=[],
+                )
+    return results

kreuzberg/_multiprocessing/sync_paddleocr.py ADDED Viewed

@@ -0,0 +1,199 @@
+"""Pure synchronous PaddleOCR without any async overhead."""
+from __future__ import annotations
+import tempfile
+from pathlib import Path
+from typing import Any
+from PIL import Image
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg.exceptions import MissingDependencyError, OCRError
+def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
+    """Get a PaddleOCR instance with the given configuration."""
+    try:
+        import paddleocr
+    except ImportError as e:
+        raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e
+    if hasattr(config, "device"):
+        if config.device and config.device.lower() != "cpu":
+            pass
+    elif hasattr(config, "use_gpu"):
+        pass
+    kwargs = {
+        "lang": config.language,
+        "use_textline_orientation": config.use_angle_cls,
+    }
+    if hasattr(config, "det_db_thresh"):
+        kwargs["text_det_thresh"] = config.det_db_thresh
+    if hasattr(config, "det_db_box_thresh"):
+        kwargs["text_det_box_thresh"] = config.det_db_box_thresh
+    if hasattr(config, "det_db_unclip_ratio"):
+        kwargs["text_det_unclip_ratio"] = config.det_db_unclip_ratio
+    if hasattr(config, "det_max_side_len"):
+        kwargs["text_det_limit_side_len"] = config.det_max_side_len
+    if hasattr(config, "drop_score"):
+        kwargs["text_rec_score_thresh"] = config.drop_score
+    return paddleocr.PaddleOCR(**kwargs)
+def process_image_sync_pure(
+    image_path: str | Path,
+    config: PaddleOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process an image with PaddleOCR using pure sync implementation.
+    This bypasses all async overhead and calls PaddleOCR directly.
+    Args:
+        image_path: Path to the image file.
+        config: PaddleOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    cfg = config or PaddleOCRConfig()
+    try:
+        ocr_instance = _get_paddleocr_instance(cfg)
+        results = ocr_instance.ocr(str(image_path))
+        if not results or not results[0]:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        ocr_result = results[0]
+        result_data = ocr_result.json["res"]
+        texts = result_data.get("rec_texts", [])
+        scores = result_data.get("rec_scores", [])
+        if not texts:
+            return ExtractionResult(
+                content="",
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={},
+                chunks=[],
+            )
+        content = "\n".join(texts)
+        content = normalize_spaces(content)
+        avg_confidence = sum(scores) / len(scores) if scores else 0.0
+        metadata = {"confidence": avg_confidence} if scores else {}
+        return ExtractionResult(
+            content=content,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata=metadata,  # type: ignore[arg-type]
+            chunks=[],
+        )
+    except Exception as e:
+        raise OCRError(f"PaddleOCR processing failed: {e}") from e
+def process_image_bytes_sync_pure(
+    image_bytes: bytes,
+    config: PaddleOCRConfig | None = None,
+) -> ExtractionResult:
+    """Process image bytes with PaddleOCR using pure sync implementation.
+    Args:
+        image_bytes: Image data as bytes.
+        config: PaddleOCR configuration.
+    Returns:
+        Extraction result.
+    """
+    import io
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            image.save(tmp_image.name, format="PNG")
+        image_path = tmp_image.name
+    try:
+        return process_image_sync_pure(image_path, config)
+    finally:
+        image_file = Path(image_path)
+        if image_file.exists():
+            image_file.unlink()
+def process_batch_images_sync_pure(
+    image_paths: list[str | Path],
+    config: PaddleOCRConfig | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images sequentially with pure sync implementation.
+    Args:
+        image_paths: List of image file paths.
+        config: PaddleOCR configuration.
+    Returns:
+        List of extraction results.
+    """
+    results = []
+    for image_path in image_paths:
+        result = process_image_sync_pure(image_path, config)
+        results.append(result)
+    return results
+def process_batch_images_threaded(
+    image_paths: list[str | Path],
+    config: PaddleOCRConfig | None = None,
+    max_workers: int | None = None,
+) -> list[ExtractionResult]:
+    """Process a batch of images using threading.
+    Args:
+        image_paths: List of image file paths.
+        config: PaddleOCR configuration.
+        max_workers: Maximum number of threads.
+    Returns:
+        List of extraction results in same order as input.
+    """
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    if max_workers is None:
+        max_workers = min(len(image_paths), mp.cpu_count())
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_index = {
+            executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
+        }
+        results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index = future_to_index[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:  # noqa: BLE001
+                results[index] = ExtractionResult(
+                    content=f"Error: {e}",
+                    mime_type=PLAIN_TEXT_MIME_TYPE,
+                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
+                    chunks=[],
+                )
+    return results

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -319,7 +319,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         try:
             import torch
-            return torch.cuda.is_available()
+            return bool(torch.cuda.is_available())
         except ImportError:
             return False

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -202,9 +202,11 @@ class TesseractConfig:
             -   'deu' for German
             -    multiple languages combined with '+', e.g. 'eng+deu')
     """
-    language_model_ngram_on: bool = True
-    """Enable or disable the use of n-gram-based language models for improved text recognition."""
-    psm: PSMMode = PSMMode.AUTO
+    language_model_ngram_on: bool = False
+    """Enable or disable the use of n-gram-based language models for improved text recognition.
+    Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
+    psm: PSMMode = PSMMode.AUTO_ONLY
     """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
     tessedit_dont_blkrej_good_wds: bool = True
     """If True, prevents block rejection of words identified as good, improving text output quality."""
@@ -212,6 +214,8 @@ class TesseractConfig:
     """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
     tessedit_enable_dict_correction: bool = True
     """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
+    tessedit_char_whitelist: str = ""
+    """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
     tessedit_use_primary_params_model: bool = True
     """If True, forces the use of the primary parameters model for text recognition."""
     textord_space_size_is_variable: bool = True

kreuzberg/_types.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
 import sys
-from collections.abc import Awaitable
+from collections.abc import Awaitable, Callable
 from dataclasses import asdict, dataclass, field
-from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
+from typing import TYPE_CHECKING, Any, Literal, TypedDict
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
 from kreuzberg.exceptions import ValidationError
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
     from PIL.Image import Image
     from kreuzberg._gmft import GMFTConfig
+    from kreuzberg._language_detection import LanguageDetectionConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
     from kreuzberg._ocr._paddleocr import PaddleOCRConfig
     from kreuzberg._ocr._tesseract import TesseractConfig
@@ -113,14 +114,16 @@ class ExtractionResult:
     """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    detected_languages: list[str] | None = None
+    """Languages detected in the extracted content, if language detection is enabled."""
     def to_dict(self) -> dict[str, Any]:
         """Converts the ExtractionResult to a dictionary."""
         return asdict(self)
-PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
-ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
+PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
+ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 @dataclass(unsafe_hash=True)
@@ -157,6 +160,10 @@ class ExtractionConfig:
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None
     """Validation hooks to call after processing is done and before post-processing and result return."""
+    auto_detect_language: bool = False
+    """Whether to automatically detect language and configure OCR accordingly."""
+    language_detection_config: LanguageDetectionConfig | None = None
+    """Configuration for language detection. If None, uses default settings."""
     def __post_init__(self) -> None:
         from kreuzberg._ocr._easyocr import EasyOCRConfig

kreuzberg/_utils/_device.py CHANGED Viewed

@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
     try:
         import torch  # type: ignore[import-not-found,unused-ignore]
-        return torch.cuda.is_available()
+        return bool(torch.cuda.is_available())
     except ImportError:
         return False
@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
     try:
         import torch  # type: ignore[import-not-found,unused-ignore]
-        return torch.backends.mps.is_available()
+        return bool(torch.backends.mps.is_available())
     except ImportError:
         return False

kreuzberg/_utils/_process_pool.py CHANGED Viewed

@@ -5,10 +5,10 @@ from __future__ import annotations
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 if TYPE_CHECKING:
-    from collections.abc import Generator
+    from collections.abc import Callable, Generator
 T = TypeVar("T")

kreuzberg/_utils/_sync.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-import sys
 from functools import partial
 from inspect import isawaitable, iscoroutinefunction
 from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Awaitable, Callable
-if sys.version_info >= (3, 10):
-    from typing import ParamSpec
-else:  # pragma: no cover
-    from typing_extensions import ParamSpec
+from typing import ParamSpec
 T = TypeVar("T")
 P = ParamSpec("P")

kreuzberg/_utils/_tmp.py CHANGED Viewed

@@ -3,14 +3,14 @@ from __future__ import annotations
 from contextlib import suppress
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING
 from anyio import Path as AsyncPath
 from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Coroutine
+    from collections.abc import Callable, Coroutine
 async def create_temp_file(

kreuzberg/extraction.py CHANGED Viewed

@@ -28,6 +28,11 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
     for validator in config.validators or []:
         await run_maybe_sync(validator, result)
+    if config.auto_detect_language and result.content:
+        from kreuzberg._language_detection import detect_languages
+        result.detected_languages = detect_languages(result.content, config.language_detection_config)
     if config.chunk_content:
         result.chunks = _handle_chunk_content(
             mime_type=result.mime_type,
@@ -45,6 +50,11 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
     for validator in config.validators or []:
         run_sync_only(validator, result)
+    if config.auto_detect_language and result.content:
+        from kreuzberg._language_detection import detect_languages
+        result.detected_languages = detect_languages(result.content, config.language_detection_config)
     if config.chunk_content:
         result.chunks = _handle_chunk_content(
             mime_type=result.mime_type,

{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.4.1
+Version: 3.5.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -12,7 +12,6 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -22,7 +21,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing :: General
 Classifier: Topic :: Utilities
 Classifier: Typing :: Typed
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.2
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
@@ -57,6 +56,8 @@ Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Provides-Extra: langdetect
+Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'

{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/RECORD RENAMED Viewed

@@ -1,50 +1,53 @@
-kreuzberg/__init__.py,sha256=5GP2j8PI3P_ZNSEhLpm8iqseY3i4nye6iUmVGUnfzno,1311
+kreuzberg/__init__.py,sha256=zZ_puArNdw0pQk93BV99fXCxzkHFKXB9kINn8-6-y24,1408
 kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
 kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
 kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
 kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
-kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
+kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
+kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
 kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
 kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
 kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
-kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
+kreuzberg/_types.py,sha256=Tnl9yP56dn8ziBZk1sorNk1ZHZbJYMjSoqh7xxImFHs,8092
 kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
 kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
-kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
+kreuzberg/extraction.py,sha256=Jz0f31Mm90mBkWwn0L3vn3z7-irdwNIzMHWByIj5d_I,17005
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
 kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
-kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
+kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
 kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
-kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
+kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
 kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
-kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
+kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
 kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
-kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
-kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
+kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
+kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
+kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
+kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
 kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
 kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
 kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
 kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
-kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
+kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
 kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
-kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
+kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
 kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
-kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
+kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
 kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
 kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
 kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
-kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
+kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
 kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
 kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
-kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
-kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
-kreuzberg-3.4.1.dist-info/METADATA,sha256=g3DwLXNiDzvPDBApPnDp3BeZ4SbVN0NTrEzN9cyKy34,8751
-kreuzberg-3.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kreuzberg-3.4.1.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
-kreuzberg-3.4.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.4.1.dist-info/RECORD,,
+kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
+kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
+kreuzberg-3.5.0.dist-info/METADATA,sha256=jJXbwUuTXevmry2VVg1H8d6rEzebILJyN7q7kJ0M9mQ,8790
+kreuzberg-3.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.5.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
+kreuzberg-3.5.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.5.0.dist-info/RECORD,,

{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{kreuzberg-3.4.1.dist-info → kreuzberg-3.5.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

kreuzberg 3.4.1__py3-none-any.whl → 3.5.0__py3-none-any.whl

kreuzberg 3.4.1__py3-none-any.whl → 3.5.0__py3-none-any.whl