PyPI - kreuzberg - Versions diffs - 3.18.0__tar.gz → 3.20.1__tar.gz - Mend

kreuzberg 3.18.0__tar.gz → 3.20.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (362) hide show

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,11 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.3
 Name: kreuzberg
-Version: 3.18.0
+Version: 3.20.1
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
-Project-URL: documentation, https://kreuzberg.dev
-Project-URL: homepage, https://github.com/Goldziher/kreuzberg
+Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
+Author: Na'aman Hirschfeld
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
-License-File: LICENSE
-Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing :: General
 Classifier: Typing :: Typed
-Requires-Python: >=3.10
 Requires-Dist: anyio>=4.11.0
 Requires-Dist: chardetng-py>=0.3.5
-Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.16.0
+Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
+Requires-Dist: html-to-markdown>=2.1.0
 Requires-Dist: langcodes>=3.5.0
-Requires-Dist: mcp>=1.15.0
+Requires-Dist: mcp>=1.17.0
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: numpy>=2.0.0
 Requires-Dist: playa-pdf>=0.7.0
-Requires-Dist: polars>=1.33.1
+Requires-Dist: polars>=1.34.0
 Requires-Dist: psutil>=7.1.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
+Requires-Dist: transformers>=4.55.0
+Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
+Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
+Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
+Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
+Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
+Requires-Dist: click>=8.3.0 ; extra == 'cli'
+Requires-Dist: rich>=14.2.0 ; extra == 'cli'
+Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
+Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
+Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
+Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
+Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
+Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
+Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
+Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
+Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
+Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
+Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
+Requires-Python: >=3.10
+Project-URL: documentation, https://kreuzberg.dev
+Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Provides-Extra: additional-extensions
-Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
-Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
 Provides-Extra: all
-Requires-Dist: click>=8.2.1; extra == 'all'
-Requires-Dist: deep-translator>=1.11.4; extra == 'all'
-Requires-Dist: easyocr>=1.7.2; extra == 'all'
-Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
-Requires-Dist: gmft>=0.4.2; extra == 'all'
-Requires-Dist: keybert>=0.9.0; extra == 'all'
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
-Requires-Dist: mailparse>=1.0.15; extra == 'all'
-Requires-Dist: paddleocr>=3.2.0; extra == 'all'
-Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
-Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
-Requires-Dist: rich>=14.1.0; extra == 'all'
-Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
-Requires-Dist: setuptools>=80.9.0; extra == 'all'
-Requires-Dist: spacy>=3.8.7; extra == 'all'
-Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
 Provides-Extra: cli
-Requires-Dist: click>=8.2.1; extra == 'cli'
-Requires-Dist: rich>=14.1.0; extra == 'cli'
-Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: crypto
-Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
 Provides-Extra: document-classification
-Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
 Provides-Extra: easyocr
-Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction
-Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
-Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
-Requires-Dist: gmft>=0.4.2; extra == 'gmft'
 Provides-Extra: langdetect
-Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
 Provides-Extra: paddleocr
-Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
-Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
-Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
 Description-Content-Type: text/markdown
 # Kreuzberg

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_api/main.py RENAMED Viewed

@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
     Environment Variables:
         KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
     """
-    default_size = 1024 * 1024 * 1024  # 1GB
+    default_size = 1024 * 1024 * 1024
     try:
         size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
-        # Return default if negative
         return size if size >= 0 else default_size
     except ValueError:
         return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload(  # noqa: PLR0913
     """
     static_config = discover_config_cached()
+    if not data:
+        raise ValidationError("No files provided for extraction", context={"file_count": 0})
     min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
     max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_entity_extraction.py RENAMED Viewed

@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
     try:
         nlp = spacy.load(model_name)
     except OSError:
-        # Try to download the model automatically
         async def install_model() -> tuple[bool, str | None]:
             """Install model and return success status and error message."""
-            # First try spaCy's built-in download
             try:
                 success = await install_spacy_model_with_spacy(model_name)
                 if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
             else:
                 spacy_error = "spaCy download failed"
-            # If spaCy download failed and uv is available, try uv as fallback
             if is_uv_available():
                 try:
                     result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
             return False, spacy_error
-        # Run the async installation in a sync context
         try:
             success, error_details = anyio.run(install_model)
-        except (OSError, RuntimeError) as e:
-            success, error_details = False, str(e)
+        except SystemExit as e:
+            success, error_details = False, f"spaCy CLI exit code: {e.code}"
         if not success:
-            # Generate appropriate error message based on available tools
             if is_uv_available():
                 model_url = get_spacy_model_url(model_name)
                 manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
         kw_model = KeyBERT()
         keywords = kw_model.extract_keywords(text, top_n=keyword_count)
         return [(kw, float(score)) for kw, score in keywords]
-    except (RuntimeError, OSError, ValueError):
+    except ValueError:
         return []
     except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(

kreuzberg-3.20.1/kreuzberg/_error_handling.py ADDED Viewed

@@ -0,0 +1,182 @@
+"""Type-safe error handling utilities for extraction pipeline."""
+from __future__ import annotations
+import traceback
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from collections.abc import Callable
+from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
+from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
+def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
+    """Determine if an exception should bubble up or be handled gracefully.
+    Args:
+        exception: The exception to classify
+        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
+    Returns:
+        True if the exception should bubble up, False if it should be handled gracefully
+    """
+    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
+        return True
+    if isinstance(exception, MissingDependencyError):
+        return True
+    if isinstance(exception, ValidationError):
+        if context == "batch_processing":
+            return False
+        return context != "optional_feature"
+    if isinstance(exception, KreuzbergError) and context == "optional_feature":
+        return False
+    if context == "batch_processing":
+        return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
+    return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
+class FeatureProcessingError:
+    """Type-safe processing error for extraction features."""
+    def __init__(self, feature: str, error: Exception) -> None:
+        self._feature = feature
+        self._error = error
+        self._traceback = traceback.format_exc()
+    @property
+    def feature(self) -> str:
+        return self._feature
+    @property
+    def error_type(self) -> str:
+        return type(self._error).__name__
+    @property
+    def error_message(self) -> str:
+        return str(self._error)
+    @property
+    def traceback(self) -> str:
+        return self._traceback
+    def to_dict(self) -> ProcessingErrorDict:
+        return {
+            "feature": self.feature,
+            "error_type": self.error_type,
+            "error_message": self.error_message,
+            "traceback": self.traceback,
+        }
+def safe_feature_execution(
+    feature_name: str,
+    execution_func: Callable[[], Any],
+    default_value: Any,
+    result: ExtractionResult,
+    context: ErrorContextType = "optional_feature",
+) -> Any:
+    """Safely execute a feature extraction function with proper error handling.
+    Args:
+        feature_name: Name of the feature being executed
+        execution_func: Function to execute that may raise exceptions
+        default_value: Default value to return if execution fails
+        result: ExtractionResult to update with error information
+        context: The context for exception handling decisions
+    Returns:
+        Either the successful result or the default value
+    """
+    try:
+        return execution_func()
+    except Exception as e:
+        if should_exception_bubble_up(e, context):
+            raise
+        _add_processing_error(result, FeatureProcessingError(feature_name, e))
+        return default_value
+def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
+    """Add a processing error to the result metadata in a type-safe way."""
+    if result.metadata is None:
+        result.metadata = {}
+    if "processing_errors" not in result.metadata:
+        result.metadata["processing_errors"] = []
+    errors_list = result.metadata["processing_errors"]
+    if isinstance(errors_list, list):
+        errors_list.append(error.to_dict())
+    else:
+        result.metadata["processing_errors"] = [error.to_dict()]
+def preserve_result_with_errors(
+    result: ExtractionResult,
+    errors: list[FeatureProcessingError],
+) -> ExtractionResult:
+    """Preserve a successful extraction result while adding error information.
+    This is used when core extraction succeeds but optional features fail.
+    Args:
+        result: The successful extraction result
+        errors: List of errors that occurred during optional processing
+    Returns:
+        The result with error information added to metadata
+    """
+    for error in errors:
+        _add_processing_error(result, error)
+    return result
+def create_error_result(
+    content: str,
+    mime_type: str,
+    errors: list[FeatureProcessingError],
+    **metadata_kwargs: Any,
+) -> ExtractionResult:
+    """Create an error result with proper type safety.
+    Args:
+        content: Error content to include
+        mime_type: MIME type of the result
+        errors: List of errors that occurred
+        **metadata_kwargs: Additional metadata to include
+    Returns:
+        An ExtractionResult with error information
+    """
+    metadata: Metadata = {
+        "error": f"Multiple processing errors occurred: {len(errors)} errors",
+        "error_context": {
+            "error_count": len(errors),
+            "errors": [error.to_dict() for error in errors],
+            **metadata_kwargs,
+        },
+        "processing_errors": [error.to_dict() for error in errors],
+    }
+    return ExtractionResult(
+        content=content,
+        chunks=[],
+        mime_type=mime_type,
+        metadata=metadata,
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_base.py RENAMED Viewed

@@ -230,13 +230,13 @@ class Extractor(ABC):
                 confidence_score=None,
                 processing_time=duration,
             )
-        except (OSError, ValueError) as e:  # pragma: no cover
+        except ValueError as e:  # pragma: no cover
             return ImageOCRResult(
                 image=target,
                 ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
                 skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
             )
-        except (RuntimeError, TypeError) as e:  # pragma: no cover
+        except TypeError as e:  # pragma: no cover
             return ImageOCRResult(
                 image=target,
                 ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),

kreuzberg-3.20.1/kreuzberg/_extractors/_html.py ADDED Viewed

@@ -0,0 +1,138 @@
+from __future__ import annotations
+import logging
+from typing import TYPE_CHECKING, Any, ClassVar
+from anyio import Path as AsyncPath
+from html_to_markdown import HtmlToMarkdownError
+from html_to_markdown._html_to_markdown import (
+    InlineImageConfig,
+    convert_with_inline_images,
+)
+from html_to_markdown._html_to_markdown import (
+    convert as rust_convert,
+)
+from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
+from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
+from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._string import safe_decode
+from kreuzberg._utils._sync import run_maybe_async, run_sync
+if TYPE_CHECKING:
+    from pathlib import Path
+logger = logging.getLogger(__name__)
+class HTMLExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        extraction_config = self.config
+        html_content = safe_decode(content)
+        if extraction_config and extraction_config.html_to_markdown_config is not None:
+            html_config = extraction_config.html_to_markdown_config
+        else:
+            html_config = HTMLToMarkdownConfig()
+        conversion_options, _ = html_config.to_options()
+        extract_inline_images = bool(extraction_config and extraction_config.extract_images)
+        run_ocr_on_images = bool(
+            extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
+        )
+        inline_image_config = None
+        if extract_inline_images:
+            inline_image_config = InlineImageConfig(
+                max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
+                filename_prefix=None,
+                capture_svg=True,
+                infer_dimensions=True,
+            )
+        try:
+            if extract_inline_images:
+                markdown, images_payload, warnings = convert_with_inline_images(
+                    html_content,
+                    options=conversion_options,
+                    image_config=inline_image_config,
+                )
+            else:
+                markdown = rust_convert(
+                    html_content,
+                    conversion_options,
+                )
+                images_payload = []
+                warnings = []
+        except (HtmlToMarkdownError, ValueError) as exc:
+            logger.exception("Failed to convert HTML to Markdown: %s", exc)
+            markdown = ""
+            images_payload = []
+            warnings = []
+        for warning in warnings:
+            self._log_inline_warning(warning)
+        extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
+        inline_images = [self._build_extracted_image(image) for image in images_payload]
+        if inline_images:
+            extraction_result.images = inline_images
+            if run_ocr_on_images:
+                extraction_result.image_ocr_results = run_maybe_async(
+                    self._process_images_with_ocr,
+                    inline_images,
+                )
+        return self._apply_quality_processing(extraction_result)
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+    @staticmethod
+    def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
+        dimensions_value = image.get("dimensions")
+        dimensions = tuple(dimensions_value) if dimensions_value else None
+        return ExtractedImage(
+            data=image["data"],
+            format=image["format"],
+            filename=image.get("filename"),
+            description=image.get("description"),
+            dimensions=dimensions,
+        )
+    @staticmethod
+    def _log_inline_warning(warning: Any) -> None:
+        if isinstance(warning, dict):
+            index = warning.get("index")
+            message = warning.get("message")
+            if index is not None and message:
+                logger.warning("Inline image %s: %s", index, message)
+            elif message:
+                logger.warning("Inline image warning: %s", message)
+            else:
+                logger.warning("Inline image warning received with no message")
+            return
+        message = getattr(warning, "message", None)
+        index = getattr(warning, "index", None)
+        if message and index is not None:
+            logger.warning("Inline image %s: %s", index, message)
+        elif message:
+            logger.warning("Inline image warning: %s", message)
+        else:
+            logger.warning("Inline image warning received with no message")

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -6,7 +6,6 @@ import logging
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict
 from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import (
-    EasyOCRConfig,
     ExtractedImage,
     ExtractionResult,
     ImageOCRResult,
     Metadata,
     OcrBackendType,
-    PaddleOCRConfig,
-    TesseractConfig,
 )
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content_bytes = path.read_bytes()
+        result: ExtractionResult | None = None
         document: Document | None = None
         if self.config.extract_images or self.config.extract_tables:
             document = self._parse_with_password_attempts(content_bytes)
-        try:
-            text = self._extract_pdf_searchable_text_sync(path)
-        except ParsingError:
-            text = ""
+        if not self.config.force_ocr:
+            try:
+                content = self._extract_pdf_searchable_text_sync(path)
+                if self._validate_extracted_text(content):
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+            except ParsingError:
+                pass
-        if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
-            text = self._extract_pdf_with_ocr_sync(path)
+        if not result and self.config.ocr_backend is not None:
+            result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
+        if not result:
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+        metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
+        result.metadata = metadata
-        tables = []
         if self.config.extract_tables:
             # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
                 tables = extract_tables_sync(path)
+                result.tables = tables
             except ImportError:  # pragma: no cover
-                tables = []
-        if not self.config.force_ocr and self._validate_extracted_text(text):
-            text = self._extract_with_playa_sync(path, fallback_text=text)
-        text = normalize_spaces(text)
-        result = ExtractionResult(
-            content=text,
-            mime_type=PLAIN_TEXT_MIME_TYPE,
-            metadata={},
-            tables=list(tables),
-        )
+                result.tables = []
-        if tables:
-            table_summary = generate_table_summary(tables)
-            result.metadata = result.metadata | {
-                "table_count": table_summary["table_count"],
-                "tables_summary": f"Document contains {table_summary['table_count']} tables "
-                f"across {table_summary['pages_with_tables']} pages with "
-                f"{table_summary['total_rows']} total rows",
-            }
+            if result.tables:
+                table_summary = generate_table_summary(result.tables)
+                result.metadata = result.metadata | {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
         if self.config.extract_images and document:
             images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
-    def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
+    def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
         temp_files: list[Path] = []
         try:
             with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
                         with pdf_resources_sync(bitmap, page):
                             pil_image.close()
-            return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
+            content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
+            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
                 with contextlib.suppress(OSError):
                     p.unlink()
-    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        backend = get_ocr_backend(self.config.ocr_backend)
+    def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
+        backend = get_ocr_backend(ocr_backend)
         paths = [Path(p) for p in image_paths]
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        results = backend.process_batch_sync(paths, **self.config.get_config_dict())
         return "\n\n".join(result.content for result in results)

{kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_structured.py RENAMED Viewed

@@ -14,7 +14,7 @@ else:  # pragma: no cover
 try:
     import yaml
 except ImportError:  # pragma: no cover
-    yaml = None
+    yaml = None  # type: ignore[assignment]
 from anyio import Path as AsyncPath

kreuzberg 3.18.0__tar.gz → 3.20.1__tar.gz

kreuzberg 3.18.0__tar.gz → 3.20.1__tar.gz