kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff reflects the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 import base64
+import binascii
 import json
+from pathlib import Path
 from typing import Any
 
 import msgspec
@@ -9,34 +11,170 @@ from mcp.server import FastMCP
 from mcp.types import TextContent
 
 from kreuzberg._config import discover_config
-from kreuzberg._types import ExtractionConfig, OcrBackendType
-from kreuzberg.
+from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
+from kreuzberg.exceptions import ValidationError
+from kreuzberg.extraction import (
+    batch_extract_bytes_sync,
+    batch_extract_file_sync,
+    extract_bytes_sync,
+    extract_file_sync,
+)
 
 mcp = FastMCP("Kreuzberg Text Extraction")
 
+MAX_BATCH_SIZE = 100
+
+
+def _validate_file_path(file_path: str) -> Path:
+    """Validate file path to prevent path traversal attacks.
+
+    Args:
+        file_path: The file path to validate
+
+    Returns:
+        Path: The validated Path object
+
+    Raises:
+        ValidationError: If path traversal is detected or path is invalid
+    """
+    try:
+        path = Path(file_path).resolve()
+    except (OSError, ValueError) as e:
+        raise ValidationError(
+            f"Invalid file path: {file_path}",
+            context={"file_path": file_path, "error": str(e)},
+        ) from e
+
+    if ".." in file_path and not file_path.startswith("/"):
+        raise ValidationError(
+            "Path traversal detected in file path",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    if not path.exists():
+        raise ValidationError(
+            f"File not found: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    if not path.is_file():
+        raise ValidationError(
+            f"Path is not a file: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    return path
+
+
+def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
+    """Validate file path and add context for batch operations."""
+    try:
+        return _validate_file_path(file_path)
+    except ValidationError as e:
+        e.context = e.context or {}
+        e.context["batch_index"] = index
+        e.context["total_files"] = total
+        raise
+
+
+def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
+    """Validate and decode base64 content with proper error handling.
+
+    Args:
+        content_base64: The base64 string to validate and decode
+        context_info: Additional context information for error reporting
+
+    Returns:
+        bytes: The decoded content
+
+    Raises:
+        ValidationError: If the base64 content is invalid
+    """
+    if not content_base64:
+        raise ValidationError(
+            "Base64 content cannot be empty",
+            context={"context": context_info},
+        )
+
+    if not content_base64.strip():
+        raise ValidationError(
+            "Base64 content cannot be whitespace only",
+            context={"content_preview": content_base64[:50], "context": context_info},
+        )
+
+    try:
+        content_bytes = base64.b64decode(content_base64, validate=True)
+    except (ValueError, binascii.Error) as e:
+        error_type = type(e).__name__
+        raise ValidationError(
+            f"Invalid base64 content: {error_type}: {e}",
+            context={
+                "error_type": error_type,
+                "error": str(e),
+                "content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
+                "context": context_info,
+            },
+        ) from e
+
+    return content_bytes
+
 
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()
 
+    tesseract_lang = kwargs.pop("tesseract_lang", None)
+    tesseract_psm = kwargs.pop("tesseract_psm", None)
+    tesseract_output_format = kwargs.pop("tesseract_output_format", None)
+    enable_table_detection = kwargs.pop("enable_table_detection", None)
+
     if base_config is None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        config_dict = kwargs
+    else:
+        config_dict = {
+            "force_ocr": base_config.force_ocr,
+            "chunk_content": base_config.chunk_content,
+            "extract_tables": base_config.extract_tables,
+            "extract_entities": base_config.extract_entities,
+            "extract_keywords": base_config.extract_keywords,
+            "ocr_backend": base_config.ocr_backend,
+            "max_chars": base_config.max_chars,
+            "max_overlap": base_config.max_overlap,
+            "keyword_count": base_config.keyword_count,
+            "auto_detect_language": base_config.auto_detect_language,
+            "ocr_config": base_config.ocr_config,
+            "gmft_config": base_config.gmft_config,
+        }
+        config_dict = config_dict | kwargs
+
+    ocr_backend = config_dict.get("ocr_backend")
+    if ocr_backend == "tesseract" and (
+        tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
+    ):
+        tesseract_config_dict = {}
+
+        if tesseract_lang:
+            tesseract_config_dict["language"] = tesseract_lang
+        if tesseract_psm is not None:
+            try:
+                tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
+            except ValueError as e:
+                raise ValidationError(
+                    f"Invalid PSM mode value: {tesseract_psm}",
+                    context={"psm_value": tesseract_psm, "error": str(e)},
+                ) from e
+        if tesseract_output_format:
+            tesseract_config_dict["output_format"] = tesseract_output_format
+        if enable_table_detection:
+            tesseract_config_dict["enable_table_detection"] = True
+
+        if tesseract_config_dict:
+            existing_ocr_config = config_dict.get("ocr_config")
+            if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
+                existing_dict = existing_ocr_config.to_dict()
+                merged_dict = existing_dict | tesseract_config_dict
+                config_dict["ocr_config"] = TesseractConfig(**merged_dict)
+            else:
+                config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
 
     return ExtractionConfig(**config_dict)
 
@@ -55,7 +193,12 @@ def extract_document(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
@@ -67,9 +210,13 @@ def extract_document(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
 
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.to_dict(include_none=True)
 
 
@@ -87,8 +234,12 @@ def extract_bytes(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    content_bytes =
+    content_bytes = _validate_base64_content(content_base64, "extract_bytes")
 
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -101,19 +252,165 @@ def extract_bytes(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
 
     result = extract_bytes_sync(content_bytes, mime_type, config)
     return result.to_dict(include_none=True)
 
 
+@mcp.tool()
+def batch_extract_document(  # noqa: PLR0913
+    file_paths: list[str],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    if len(file_paths) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
+        )
+
+    if not file_paths:
+        raise ValidationError(
+            "File paths list cannot be empty",
+            context={"file_paths": file_paths},
+        )
+
+    validated_paths = []
+    for i, file_path in enumerate(file_paths):
+        validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
+        validated_paths.append(str(validated_path))
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+
+    results = batch_extract_file_sync(validated_paths, config)
+    return [result.to_dict(include_none=True) for result in results]
+
+
+@mcp.tool()
+def batch_extract_bytes(  # noqa: PLR0913
+    content_items: list[dict[str, str]],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    if not content_items:
+        raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
+
+    if not isinstance(content_items, list):
+        raise ValidationError(
+            "content_items must be a list", context={"content_items_type": type(content_items).__name__}
+        )
+
+    if len(content_items) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
+        )
+
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+
+    contents = []
+    for i, item in enumerate(content_items):
+        if not isinstance(item, dict):
+            raise ValidationError(
+                f"Item at index {i} must be a dictionary",
+                context={"item_index": i, "item_type": type(item).__name__, "item": item},
+            )
+
+        if "content_base64" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'content_base64'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+
+        if "mime_type" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'mime_type'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+
+        content_base64 = item["content_base64"]
+        mime_type = item["mime_type"]
+
+        try:
+            content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
+        except ValidationError as e:
+            e.context = e.context or {}
+            e.context["item_index"] = i
+            e.context["total_items"] = len(content_items)
+            raise
+
+        contents.append((content_bytes, mime_type))
+
+    results = batch_extract_bytes_sync(contents, config)
+    return [result.to_dict(include_none=True) for result in results]
+
+
 @mcp.tool()
 def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.content
 
 
@@ -151,7 +448,8 @@ def get_supported_formats() -> str:
 
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-
+    validated_path = _validate_file_path(file_path)
+    result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
 
     return [
         TextContent(
@@ -163,12 +461,13 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
+    validated_path = _validate_file_path(file_path)
    config = _create_config_with_overrides(
         extract_entities=True,
         extract_keywords=True,
         extract_tables=True,
     )
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), None, config)
 
     content = f"Document Content:\n{result.content}\n\n"
 
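The new batch tools take base64-encoded payloads and validate them server-side (_validate_base64_content, _validate_file_path) before any extraction runs. Below is a minimal sketch, standard library only, of how a caller might assemble the content_items list that batch_extract_bytes expects; the file names, the suffix-to-MIME mapping, and the build_content_items helper are hypothetical, and the tool itself would be invoked through an MCP client rather than called directly.

import base64
from pathlib import Path


def build_content_items(paths: list[str]) -> list[dict[str, str]]:
    """Encode local files into the payload shape batch_extract_bytes expects."""
    # assumed suffix-to-MIME mapping, only for this sketch
    mime_by_suffix = {".pdf": "application/pdf", ".png": "image/png"}
    items: list[dict[str, str]] = []
    for raw in paths:
        path = Path(raw)
        items.append(
            {
                # each item needs both keys; the tool raises ValidationError otherwise
                "content_base64": base64.b64encode(path.read_bytes()).decode("ascii"),
                "mime_type": mime_by_suffix.get(path.suffix.lower(), "application/octet-stream"),
            }
        )
    return items


if __name__ == "__main__":
    # hypothetical local files; an MCP client would pass this list as the
    # content_items argument of the batch_extract_bytes tool
    payload = build_content_items(["invoice.pdf", "scan.png"])
    print(len(payload), "items prepared")

Note that batches are capped at MAX_BATCH_SIZE (100) items, so a caller handling larger workloads would need to chunk its input before invoking the tool.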
kreuzberg/_mime_types.py
CHANGED
@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
     "image/x-tiff",
 }
 
+IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
+    {
+        "jpg",
+        "jpeg",
+        "png",
+        "gif",
+        "bmp",
+        "tiff",
+        "tif",
+        "webp",
+        "jp2",
+        "jpx",
+        "jpm",
+        "mj2",
+        "pnm",
+        "pbm",
+        "pgm",
+        "ppm",
+    }
+)
+
+IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
+    "image/bmp": "bmp",
+    "image/x-bmp": "bmp",
+    "image/x-ms-bmp": "bmp",
+    "image/gif": "gif",
+    "image/jpeg": "jpg",
+    "image/pjpeg": "jpg",
+    "image/png": "png",
+    "image/tiff": "tiff",
+    "image/x-tiff": "tiff",
+    "image/jp2": "jp2",
+    "image/jpx": "jpx",
+    "image/jpm": "jpm",
+    "image/mj2": "mj2",
+    "image/webp": "webp",
+    "image/x-portable-anymap": "pnm",
+    "image/x-portable-bitmap": "pbm",
+    "image/x-portable-graymap": "pgm",
+    "image/x-portable-pixmap": "ppm",
+}
+
 PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
     "application/csl+json",
     "application/docbook+xml",
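One plausible use of a MIME-to-extension table like the new IMAGE_MIME_TO_EXT is picking a file suffix when spilling image bytes to a temporary file before handing them to an OCR backend. The sketch below illustrates that under this assumption, with a trimmed local copy of the mapping rather than an import of the private kreuzberg._mime_types module.

import tempfile

# excerpt of the mapping added in 3.16.0, trimmed for the sketch
IMAGE_MIME_TO_EXT = {
    "image/jpeg": "jpg",
    "image/png": "png",
    "image/tiff": "tiff",
    "image/webp": "webp",
}


def write_image_to_temp(data: bytes, mime_type: str) -> str:
    """Persist image bytes with a suffix derived from the MIME type."""
    ext = IMAGE_MIME_TO_EXT.get(mime_type, "bin")  # fall back to a generic suffix
    with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp:
        tmp.write(data)
        return tmp.name


if __name__ == "__main__":
    print(write_image_to_temp(b"\x89PNG\r\n\x1a\n", "image/png"))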
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,39 @@ except ImportError:  # pragma: no cover
 
 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    # If easyocr is already set (either real module or mock), return it
+    if easyocr is not None:
+        return easyocr, torch
+
+    # If explicitly disabled for testing
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None
 
-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415
 
+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
+        return easyocr, torch
     except ImportError:
-
-        easyocr: Any = None
-        np: Any = None
-        torch: Any = None
+        return None, None
 
 
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
 
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -239,7 +261,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             )
 
             return ExtractionResult(
-                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
             )
 
         # Group text boxes by lines based on Y coordinate  # ~keep
@@ -287,12 +309,13 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         )
 
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
 
     @classmethod
     def _is_gpu_available(cls) -> bool:
-
+        # Use the module-level torch variable directly to respect patches
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             cls._reader = await run_sync(
-
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages
 
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
            )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")
 
         try:
-            cls._reader =
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,
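The EasyOCR changes replace the import-time dependency check with a lazy loader (_import_easyocr) plus per-call numpy imports, so importing the module no longer drags in the heavy optional packages. Below is a generic sketch of that lazy-import pattern, not the package's exact code; the recognize function and the RuntimeError stand in for the real backend API and its MissingDependencyError.

from __future__ import annotations

import importlib
from typing import Any

_backend: Any = None  # cached module once the first import succeeds


def _load_optional(name: str) -> Any:
    """Import an optional dependency on first use and cache it; return None if missing."""
    global _backend
    if _backend is not None:
        return _backend
    try:
        _backend = importlib.import_module(name)
    except ImportError:
        return None
    return _backend


def recognize(image_bytes: bytes) -> str:
    backend = _load_optional("easyocr")  # resolved only when OCR is actually requested
    if backend is None:
        # mirrors the MissingDependencyError raised by the real backend
        raise RuntimeError("EasyOCR is not installed: pip install 'kreuzberg[easyocr]'")
    return "..."  # the real call would build a Reader and run readtext()

The module-level cache also keeps the behaviour test-friendly: patching the module variable short-circuits the loader, which is the same reason the real _import_easyocr returns the already-set module first.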
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         )
 
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
 
     @classmethod