kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_document_classification.py
CHANGED
@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
 
+import polars as pl
+
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
@@ -40,21 +42,8 @@ DOCUMENT_CLASSIFIERS = {
 
 
 def _get_translated_text(result: ExtractionResult) -> str:
-    """Translate extracted text to English using Google Translate API.
-
-    Args:
-        result: ExtractionResult containing the text to be translated
-
-    Returns:
-        str: The translated text in lowercase English
-
-    Raises:
-        MissingDependencyError: If the deep-translator package is not installed
-    """
-    # Combine content with metadata for classification
     text_to_classify = result.content
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -68,21 +57,10 @@ def _get_translated_text(result: ExtractionResult) -> str:
     try:
         return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         return text_to_classify.lower()
 
 
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
-    """Classifies the document type based on keywords and patterns.
-
-    Args:
-        result: The extraction result containing the content.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
@@ -111,33 +89,20 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
 def classify_document_from_layout(
     result: ExtractionResult, config: ExtractionConfig
 ) -> tuple[str | None, float | None]:
-    """Classifies the document type based on layout information from OCR.
-
-    Args:
-        result: The extraction result containing the layout data.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
-    if result.layout is None or result.layout.
+    if result.layout is None or result.layout.is_empty():
         return None, None
 
     layout_df = result.layout
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
 
-
-    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    layout_text = " ".join(layout_df["text"].cast(str).to_list())
 
-    # Translate layout text directly for classification
     text_to_classify = layout_text
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -146,20 +111,29 @@ def classify_document_from_layout(
 
         translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         translated_text = text_to_classify.lower()
 
-    layout_df
+    layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
 
-
+    try:
+        layout_df = layout_df.with_columns(
+            [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
+        )
+
+        page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
+        if page_height_val is None:
+            page_height_val = 0.0
+        page_height = float(page_height_val)
+    except Exception:  # noqa: BLE001
+        page_height = 1000.0
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
 
     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
         for pattern in patterns:
-            found_words = layout_df
-            if not found_words.
+            found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
+            if not found_words.is_empty():
                 scores[doc_type] += 1.0
-                word_top = found_words
+                word_top = found_words[0, "top"]
                 if word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
 
@@ -183,8 +157,7 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
-    elif result.layout is not None and not result.layout.
-        # Use layout-based classification if layout data is available
+    elif result.layout is not None and not result.layout.is_empty():
         result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
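Note: the hunks above migrate kreuzberg/_document_classification.py from pandas to polars for the OCR layout table: .empty becomes .is_empty(), .astype(str).tolist() becomes .cast(str).to_list(), boolean-mask indexing becomes .filter(...), and positional access becomes df[0, "top"]. Below is a minimal, self-contained sketch of those polars calls on made-up layout data; it is not kreuzberg code.

import polars as pl

# Made-up OCR layout rows: word text plus bounding-box coordinates.
layout_df = pl.DataFrame(
    {
        "text": ["INVOICE", "Total", "42.00"],
        "top": [10.0, 500.0, 520.0],
        "height": [12.0, 10.0, 10.0],
    }
)

# The translated page text is attached as a constant column, as in the new code.
layout_df = layout_df.with_columns(pl.lit("invoice total 42.00").alias("translated_text"))

# Page height estimate: max top plus max height.
page_height = float(layout_df.select(pl.col("top").max() + pl.col("height").max()).item())

# Keyword check: keep rows whose translated text contains the pattern.
found_words = layout_df.filter(layout_df["translated_text"].str.contains("invoice"))
if not found_words.is_empty():
    word_top = found_words[0, "top"]  # positional indexing replaces pandas .iloc
    print(word_top < page_height * 0.3)  # True: the matching row sits in the top 30% of the page

Because translated_text is a constant column, a pattern either matches every row or none; the first matching row's top coordinate is then compared against the estimated page height.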
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,105 +2,14 @@ from __future__ import annotations
 
 import os
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
-from kreuzberg._types import Entity
+from kreuzberg._types import Entity, SpacyEntityExtractionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from pathlib import Path
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class SpacyEntityExtractionConfig:
-    """Configuration for spaCy-based entity extraction."""
-
-    model_cache_dir: str | Path | None = None
-    """Directory to cache spaCy models. If None, uses spaCy's default."""
-
-    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
-    """Mapping of language codes to spaCy model names.
-
-    If None, uses default mappings:
-    - en: en_core_web_sm
-    - de: de_core_news_sm
-    - fr: fr_core_news_sm
-    - es: es_core_news_sm
-    - pt: pt_core_news_sm
-    - it: it_core_news_sm
-    - nl: nl_core_news_sm
-    - zh: zh_core_web_sm
-    - ja: ja_core_news_sm
-    """
-
-    fallback_to_multilingual: bool = True
-    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
-
-    max_doc_length: int = 1000000
-    """Maximum document length for spaCy processing."""
-
-    batch_size: int = 1000
-    """Batch size for processing multiple texts."""
-
-    def __post_init__(self) -> None:
-        if self.language_models is None:
-            object.__setattr__(self, "language_models", self._get_default_language_models())
-
-        if isinstance(self.language_models, dict):
-            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
-
-    @staticmethod
-    def _get_default_language_models() -> dict[str, str]:
-        """Get default language model mappings based on available spaCy models."""
-        return {
-            "en": "en_core_web_sm",
-            "de": "de_core_news_sm",
-            "fr": "fr_core_news_sm",
-            "es": "es_core_news_sm",
-            "pt": "pt_core_news_sm",
-            "it": "it_core_news_sm",
-            "nl": "nl_core_news_sm",
-            "zh": "zh_core_web_sm",
-            "ja": "ja_core_news_sm",
-            "ko": "ko_core_news_sm",
-            "ru": "ru_core_news_sm",
-            "pl": "pl_core_news_sm",
-            "ro": "ro_core_news_sm",
-            "el": "el_core_news_sm",
-            "da": "da_core_news_sm",
-            "fi": "fi_core_news_sm",
-            "nb": "nb_core_news_sm",
-            "sv": "sv_core_news_sm",
-            "ca": "ca_core_news_sm",
-            "hr": "hr_core_news_sm",
-            "lt": "lt_core_news_sm",
-            "mk": "mk_core_news_sm",
-            "sl": "sl_core_news_sm",
-            "uk": "uk_core_news_sm",
-        }
-
-    def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
-        if not self.language_models:
-            return None
-
-        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
-
-        if language_code in models_dict:
-            return models_dict[language_code]
-
-        base_lang = language_code.split("-")[0].lower()
-        if base_lang in models_dict:
-            return models_dict[base_lang]
-
-        return None
-
-    def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
-        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
 def extract_entities(
@@ -110,24 +19,8 @@ def extract_entities(
     languages: list[str] | None = None,
     spacy_config: SpacyEntityExtractionConfig | None = None,
 ) -> list[Entity]:
-    """Extract entities from text using custom regex patterns and/or a NER model.
-
-    Args:
-        text: The input text to extract entities from.
-        entity_types: List of entity types to extract using the NER model.
-        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
-        languages: List of detected languages to choose appropriate spaCy models.
-        spacy_config: Configuration for spaCy entity extraction.
-
-    Returns:
-        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
-
-    Raises:
-        MissingDependencyError: If `spacy` is not installed.
-    """
     entities: list[Entity] = []
     if custom_patterns:
-        # Direct iteration over frozenset - no need to convert to dict
         for ent_type, pattern in custom_patterns:
             entities.extend(
                 Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
@@ -177,7 +70,6 @@ def extract_entities(
 
 @lru_cache(maxsize=32)
 def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
-    """Load a spaCy model with caching."""
     try:
         import spacy  # noqa: PLC0415
 
@@ -194,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
 
 
 def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
-    """Select the best spaCy model based on detected languages."""
     if not languages:
         return spacy_config.get_model_for_language("en")
 
@@ -210,18 +101,6 @@ def extract_keywords(
     text: str,
     keyword_count: int = 10,
 ) -> list[tuple[str, float]]:
-    """Extract keywords from text using the KeyBERT model.
-
-    Args:
-        text: The input text to extract keywords from.
-        keyword_count: Number of top keywords to return. Defaults to 10.
-
-    Returns:
-        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
-
-    Raises:
-        MissingDependencyError: If `keybert` is not installed.
-    """
     try:
         from keybert import KeyBERT  # noqa: PLC0415
 
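Note: SpacyEntityExtractionConfig now lives in kreuzberg._types and is only imported by this module; the regex branch of extract_entities is otherwise unchanged. The following self-contained sketch shows that custom-pattern idea; the Entity dataclass and the e-mail pattern are illustrative stand-ins, not kreuzberg's actual definitions.

from __future__ import annotations

import re
from dataclasses import dataclass


@dataclass(frozen=True)
class Entity:  # stand-in for kreuzberg._types.Entity
    type: str
    text: str
    start: int
    end: int


def extract_with_patterns(text: str, custom_patterns: tuple[tuple[str, str], ...]) -> list[Entity]:
    # Each (entity type, regex) pair contributes one Entity per non-overlapping match.
    entities: list[Entity] = []
    for ent_type, pattern in custom_patterns:
        entities.extend(
            Entity(type=ent_type, text=m.group(), start=m.start(), end=m.end())
            for m in re.finditer(pattern, text)
        )
    return entities


print(extract_with_patterns("Contact: jane@example.com", (("EMAIL", r"[\w.+-]+@[\w-]+\.[\w.]+"),)))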
kreuzberg/_extractors/_base.py
CHANGED
@@ -13,20 +13,6 @@ if TYPE_CHECKING:
 
 
 class Extractor(ABC):
-    """Abstract base class for file content extraction.
-
-    This class provides the interface for different types of content extractors.
-    Subclasses are expected to implement the methods for extracting content
-    either asynchronously or synchronously and determining the supported MIME types.
-
-    Attributes:
-        SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
-
-    Args:
-        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
-        config: Configuration options for the extraction process.
-    """
-
     __slots__ = ("config", "mime_type")
 
     SUPPORTED_MIME_TYPES: ClassVar[set[str]]
@@ -36,89 +22,36 @@ class Extractor(ABC):
         self.config = config
 
     @abstractmethod
-    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_path_async(self, path: Path) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_path_sync(self, path: Path) -> ExtractionResult: ...
 
     @classmethod
     def supports_mimetype(cls, mime_type: str) -> bool:
-        """Verify whether the extractor supports the given MIME type.
-
-        Args:
-            mime_type: The MIME type to check (e.g., "application/pdf").
-
-        Returns:
-            bool: True if the MIME type is supported, False otherwise.
-        """
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
 
     def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
-        """Apply quality post-processing to extraction result if enabled.
-
-        Args:
-            result: The raw extraction result
-
-        Returns:
-            Enhanced extraction result with quality improvements (if enabled)
-        """
-        # Only apply quality processing if enabled in config
         if not self.config.enable_quality_processing:
             return result
 
         if not result.content:
             return result
 
-        # Clean the content
         cleaned_content = clean_extracted_text(result.content)
 
-        # Calculate quality score
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
-        # Add quality metadata
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
-        # Return enhanced result
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
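Note: the Extractor ABC keeps its behavior but loses its docstrings, leaving bare ... stubs. The sketch below mirrors the slimmed-down interface with a hypothetical PlainTextExtractor subclass; it covers only the two sync methods and a simplified ExtractionResult, whereas the real class also declares the async variants and a config-aware constructor.

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar


@dataclass
class ExtractionResult:  # simplified stand-in for kreuzberg._types.ExtractionResult
    content: str
    mime_type: str
    metadata: dict = field(default_factory=dict)
    chunks: list = field(default_factory=list)


class Extractor(ABC):
    SUPPORTED_MIME_TYPES: ClassVar[set[str]]

    @abstractmethod
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...

    @abstractmethod
    def extract_path_sync(self, path: Path) -> ExtractionResult: ...

    @classmethod
    def supports_mimetype(cls, mime_type: str) -> bool:
        # Exact match or prefix match against the declared MIME types.
        return mime_type in cls.SUPPORTED_MIME_TYPES or any(
            mime_type.startswith(supported) for supported in cls.SUPPORTED_MIME_TYPES
        )


class PlainTextExtractor(Extractor):  # hypothetical example subclass
    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"text/plain"}

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        return ExtractionResult(content=content.decode("utf-8", errors="replace"), mime_type="text/plain")

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        return self.extract_bytes_sync(path.read_bytes())


print(PlainTextExtractor.supports_mimetype("text/plain"))  # True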
kreuzberg/_extractors/_email.py
CHANGED
@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
 
-# Import optional dependencies at module level with proper error handling
 try:
     import mailparse
 except ImportError:  # pragma: no cover
@@ -27,7 +26,6 @@ try:
 except ImportError:  # pragma: no cover
     html2text = None
 
-# Compile regex pattern once at module level
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
 
 
@@ -44,8 +42,6 @@ class EmailExtractor(Extractor):
     def _extract_email_headers(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email headers."""
-        # Use single dict access where possible to avoid repeated lookups
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -59,9 +55,7 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
-            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
-                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
                 metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
@@ -69,7 +63,6 @@ class EmailExtractor(Extractor):
             else:
                 metadata["email_to"] = str(to_info)
 
-            # For display, format all recipients
             to_formatted = self._format_email_field(to_info)
             text_parts.append(f"To: {to_formatted}")
 
@@ -91,7 +84,6 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-        """Format email field (to, cc, bcc) for display."""
         if isinstance(field, list):
             emails = []
             for item in field:
@@ -107,23 +99,20 @@ class EmailExtractor(Extractor):
         return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
-        """Extract and process email body content."""
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
-            return
+            return
 
         html_content = parsed_email.get("html")
         if html_content:
             if html2text is not None:
-                # Use html2text if available (faster path)
                 h = html2text.HTML2Text()
                 h.ignore_links = True
                 h.ignore_images = True
                 converted_text = h.handle(html_content)
                 text_parts.append(f"\n{converted_text}")
             else:
-                # Fallback: strip HTML tags and unescape entities
                 clean_html = _HTML_TAG_PATTERN.sub("", html_content)
                 clean_html = unescape(clean_html)
                 text_parts.append(f"\n{clean_html}")
@@ -131,7 +120,6 @@ class EmailExtractor(Extractor):
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email attachments info."""
         if parsed_email.get("attachments"):
             attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
             metadata["attachments"] = attachment_names
@@ -148,12 +136,10 @@ class EmailExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}
 
-        # Extract headers, body, and attachments
         self._extract_email_headers(parsed_email, text_parts, metadata)
         self._extract_email_body(parsed_email, text_parts)
         self._extract_email_attachments(parsed_email, text_parts, metadata)
 
-        # Join efficiently
         combined_text = "\n".join(text_parts)
 
         return ExtractionResult(
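Note: only comments and docstrings are removed from the e-mail extractor; its logic is unchanged. The HTML fallback it keeps (used when html2text is not installed) amounts to a tag-stripping regex followed by entity unescaping, sketched here on made-up input.

import re
from html import unescape

_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")

html_content = "<p>Hello <b>world</b> &amp; friends</p>"
clean_html = _HTML_TAG_PATTERN.sub("", html_content)  # drop the tags
clean_html = unescape(clean_html)  # turn &amp; back into &
print(clean_html)  # Hello world & friends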
kreuzberg/_extractors/_html.py
CHANGED
@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
-
-
-
-
-
-
-
-
-        # Skip normalize_spaces since quality processing will handle whitespace
+        config = self.config.html_to_markdown_config if self.config else None
+        if config is None:
+            config = HTMLToMarkdownConfig()
+
+        config_dict = config.to_dict()
+
+        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
 
-        # Apply quality processing which includes normalization
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
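Note: extract_bytes_sync now resolves an HTMLToMarkdownConfig (falling back to a default instance), flattens it with to_dict(), and spreads the result into html_to_markdown.convert_to_markdown together with the decoded bytes. A minimal sketch of that call, assuming only the convert_to_markdown entry point visible in this diff and omitting the optional config keyword arguments:

import html_to_markdown

raw = b"<h1>Title</h1><p>Some <em>body</em> text.</p>"
# Decode the raw bytes, then convert; kreuzberg additionally passes **config.to_dict() here.
markdown = html_to_markdown.convert_to_markdown(raw.decode("utf-8"))
print(markdown)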
kreuzberg/_extractors/_image.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
-from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -65,7 +61,6 @@ class ImageExtractor(Extractor):
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -79,30 +74,11 @@ class ImageExtractor(Extractor):
             Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
         backend = get_ocr_backend(self.config.ocr_backend)
-
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        result = backend.process_file_sync(path, **self.config.get_config_dict())
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
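Note: the per-backend match/case (which rebuilt a TesseractConfig, PaddleOCRConfig, or EasyOCRConfig and spread it with asdict) collapses into a single call that spreads self.config.get_config_dict(). A stand-in sketch of that pattern; the dataclasses and field names below are illustrative, not kreuzberg's actual configuration classes.

from dataclasses import asdict, dataclass


@dataclass(frozen=True)
class DemoTesseractConfig:  # hypothetical OCR backend config
    language: str = "eng"
    psm: int = 3


@dataclass
class DemoExtractionConfig:  # hypothetical extraction config
    ocr_backend: str = "tesseract"
    ocr_config: object | None = None

    def get_config_dict(self) -> dict:
        # Use the explicit OCR config when given, otherwise the backend's defaults.
        if self.ocr_config is not None:
            return asdict(self.ocr_config)
        if self.ocr_backend == "tesseract":
            return asdict(DemoTesseractConfig())
        return {}


config = DemoExtractionConfig()
print(config.get_config_dict())  # {'language': 'eng', 'psm': 3}

The extractor can then call backend.process_file_sync(path, **config.get_config_dict()) without knowing which OCR backend is selected.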