PyPI - kreuzberg - Versions diffs - 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl - Mend

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

kreuzberg/__init__.py +6 -0
kreuzberg/_api/_config_cache.py +247 -0
kreuzberg/_api/main.py +156 -30
kreuzberg/_chunker.py +7 -6
kreuzberg/_constants.py +2 -0
kreuzberg/_document_classification.py +4 -6
kreuzberg/_entity_extraction.py +9 -4
kreuzberg/_extractors/_base.py +269 -3
kreuzberg/_extractors/_email.py +95 -27
kreuzberg/_extractors/_html.py +85 -7
kreuzberg/_extractors/_image.py +23 -22
kreuzberg/_extractors/_pandoc.py +106 -75
kreuzberg/_extractors/_pdf.py +209 -99
kreuzberg/_extractors/_presentation.py +72 -8
kreuzberg/_extractors/_spread_sheet.py +25 -30
kreuzberg/_mcp/server.py +345 -25
kreuzberg/_mime_types.py +42 -0
kreuzberg/_ocr/_easyocr.py +2 -2
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +74 -34
kreuzberg/_types.py +182 -23
kreuzberg/_utils/_cache.py +10 -4
kreuzberg/_utils/_device.py +2 -4
kreuzberg/_utils/_image_preprocessing.py +12 -39
kreuzberg/_utils/_process_pool.py +29 -8
kreuzberg/_utils/_quality.py +7 -2
kreuzberg/_utils/_resource_managers.py +65 -0
kreuzberg/_utils/_sync.py +36 -6
kreuzberg/_utils/_tmp.py +37 -1
kreuzberg/cli.py +34 -20
kreuzberg/extraction.py +43 -27
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
kreuzberg-3.15.0.dist-info/RECORD +60 -0
kreuzberg-3.14.0.dist-info/RECORD +0 -58
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_spread_sheet.py CHANGED Viewed

@@ -2,13 +2,10 @@ from __future__ import annotations
 import contextlib
 import csv
-import os
 import sys
-import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import polars as pl
 from anyio import Path as AsyncPath
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._table import enhance_table_markdown
-from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
+if TYPE_CHECKING:
+    from pathlib import Path
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        xlsx_path, unlink = await create_temp_file(file_extension)
-        await AsyncPath(xlsx_path).write_bytes(content)
-        try:
+        async with temporary_file(file_extension, content) as xlsx_path:
             return await self.extract_path_async(xlsx_path)
-        finally:
-            await unlink()
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         try:
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        fd, temp_path = tempfile.mkstemp(suffix=file_extension)
-        try:
-            with os.fdopen(fd, "wb") as f:
-                f.write(content)
-            return self.extract_path_sync(Path(temp_path))
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
+        with temporary_file_sync(file_extension, content) as temp_path:
+            return self.extract_path_sync(temp_path)
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         try:
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):
     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (datetime, date, time)):
-            return value.isoformat()
-        if isinstance(value, timedelta):
-            return f"{value.total_seconds()} seconds"
-        return str(value)
+        match value:
+            case None:
+                return ""
+            case bool():
+                return str(value).lower()
+            case datetime() | date() | time():
+                return value.isoformat()
+            case timedelta():
+                return f"{value.total_seconds()} seconds"
+            case _:
+                return str(value)
     async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
         values = workbook.get_sheet_by_name(sheet_name).to_python()
@@ -207,7 +197,12 @@ class SpreadSheetExtractor(Extractor):
             if not data or not any(row for row in data):
                 return f"## {sheet_name}\n\n*Empty sheet*"
-            df = pl.DataFrame(data)
+            # Normalize row lengths to avoid polars ShapeError
+            if data:
+                max_cols = max(len(row) if row else 0 for row in data)
+                data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
+            df = pl.DataFrame(data, strict=False)
             df = df.filter(~pl.all_horizontal(pl.all().is_null()))
             df = df.select([col for col in df.columns if not df[col].is_null().all()])

kreuzberg/_mcp/server.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from __future__ import annotations
 import base64
+import binascii
 import json
+from pathlib import Path
 from typing import Any
 import msgspec
@@ -9,34 +11,178 @@ from mcp.server import FastMCP
 from mcp.types import TextContent
 from kreuzberg._config import discover_config
-from kreuzberg._types import ExtractionConfig, OcrBackendType
-from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
+from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
+from kreuzberg.exceptions import ValidationError
+from kreuzberg.extraction import (
+    batch_extract_bytes_sync,
+    batch_extract_file_sync,
+    extract_bytes_sync,
+    extract_file_sync,
+)
 mcp = FastMCP("Kreuzberg Text Extraction")
+# Security and performance limits
+MAX_BATCH_SIZE = 100
+def _validate_file_path(file_path: str) -> Path:
+    """Validate file path to prevent path traversal attacks.
+    Args:
+        file_path: The file path to validate
+    Returns:
+        Path: The validated Path object
+    Raises:
+        ValidationError: If path traversal is detected or path is invalid
+    """
+    try:
+        path = Path(file_path).resolve()
+    except (OSError, ValueError) as e:
+        raise ValidationError(
+            f"Invalid file path: {file_path}",
+            context={"file_path": file_path, "error": str(e)},
+        ) from e
+    # Check for path traversal attempts
+    if ".." in file_path and not file_path.startswith("/"):
+        raise ValidationError(
+            "Path traversal detected in file path",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+    if not path.exists():
+        raise ValidationError(
+            f"File not found: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+    if not path.is_file():
+        raise ValidationError(
+            f"Path is not a file: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+    return path
+def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
+    """Validate file path and add context for batch operations."""
+    try:
+        return _validate_file_path(file_path)
+    except ValidationError as e:
+        # Add context about which file in the batch failed
+        e.context = e.context or {}
+        e.context["batch_index"] = index
+        e.context["total_files"] = total
+        raise
+def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
+    """Validate and decode base64 content with proper error handling.
+    Args:
+        content_base64: The base64 string to validate and decode
+        context_info: Additional context information for error reporting
+    Returns:
+        bytes: The decoded content
+    Raises:
+        ValidationError: If the base64 content is invalid
+    """
+    if not content_base64:
+        raise ValidationError(
+            "Base64 content cannot be empty",
+            context={"context": context_info},
+        )
+    # Check for whitespace-only content
+    if not content_base64.strip():
+        raise ValidationError(
+            "Base64 content cannot be whitespace only",
+            context={"content_preview": content_base64[:50], "context": context_info},
+        )
+    try:
+        content_bytes = base64.b64decode(content_base64, validate=True)
+    except (ValueError, binascii.Error) as e:
+        error_type = type(e).__name__
+        raise ValidationError(
+            f"Invalid base64 content: {error_type}: {e}",
+            context={
+                "error_type": error_type,
+                "error": str(e),
+                "content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
+                "context": context_info,
+            },
+        ) from e
+    return content_bytes
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()
+    # Extract Tesseract-specific parameters from kwargs first
+    tesseract_lang = kwargs.pop("tesseract_lang", None)
+    tesseract_psm = kwargs.pop("tesseract_psm", None)
+    tesseract_output_format = kwargs.pop("tesseract_output_format", None)
+    enable_table_detection = kwargs.pop("enable_table_detection", None)
     if base_config is None:
-        return ExtractionConfig(**kwargs)
-    config_dict: dict[str, Any] = {
-        "force_ocr": base_config.force_ocr,
-        "chunk_content": base_config.chunk_content,
-        "extract_tables": base_config.extract_tables,
-        "extract_entities": base_config.extract_entities,
-        "extract_keywords": base_config.extract_keywords,
-        "ocr_backend": base_config.ocr_backend,
-        "max_chars": base_config.max_chars,
-        "max_overlap": base_config.max_overlap,
-        "keyword_count": base_config.keyword_count,
-        "auto_detect_language": base_config.auto_detect_language,
-        "ocr_config": base_config.ocr_config,
-        "gmft_config": base_config.gmft_config,
-    }
-    config_dict = config_dict | kwargs
+        config_dict = kwargs
+    else:
+        config_dict = {
+            "force_ocr": base_config.force_ocr,
+            "chunk_content": base_config.chunk_content,
+            "extract_tables": base_config.extract_tables,
+            "extract_entities": base_config.extract_entities,
+            "extract_keywords": base_config.extract_keywords,
+            "ocr_backend": base_config.ocr_backend,
+            "max_chars": base_config.max_chars,
+            "max_overlap": base_config.max_overlap,
+            "keyword_count": base_config.keyword_count,
+            "auto_detect_language": base_config.auto_detect_language,
+            "ocr_config": base_config.ocr_config,
+            "gmft_config": base_config.gmft_config,
+        }
+        config_dict = config_dict | kwargs
+    # Handle Tesseract OCR configuration
+    ocr_backend = config_dict.get("ocr_backend")
+    if ocr_backend == "tesseract" and (
+        tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
+    ):
+        tesseract_config_dict = {}
+        if tesseract_lang:
+            tesseract_config_dict["language"] = tesseract_lang
+        if tesseract_psm is not None:
+            try:
+                tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
+            except ValueError as e:
+                raise ValidationError(
+                    f"Invalid PSM mode value: {tesseract_psm}",
+                    context={"psm_value": tesseract_psm, "error": str(e)},
+                ) from e
+        if tesseract_output_format:
+            tesseract_config_dict["output_format"] = tesseract_output_format
+        if enable_table_detection:
+            tesseract_config_dict["enable_table_detection"] = True
+        if tesseract_config_dict:
+            # Merge with existing tesseract config if present
+            existing_ocr_config = config_dict.get("ocr_config")
+            if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
+                # Convert existing config to dict, merge, and recreate
+                existing_dict = existing_ocr_config.to_dict()
+                merged_dict = existing_dict | tesseract_config_dict
+                config_dict["ocr_config"] = TesseractConfig(**merged_dict)
+            else:
+                config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
     return ExtractionConfig(**config_dict)
@@ -55,7 +201,13 @@ def extract_document(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
+    # Validate file path for security
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
@@ -67,9 +219,13 @@ def extract_document(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
-    result = extract_file_sync(file_path, mime_type, config)
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.to_dict(include_none=True)
@@ -87,8 +243,12 @@ def extract_bytes(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    content_bytes = base64.b64decode(content_base64)
+    content_bytes = _validate_base64_content(content_base64, "extract_bytes")
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -101,19 +261,175 @@ def extract_bytes(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
     result = extract_bytes_sync(content_bytes, mime_type, config)
     return result.to_dict(include_none=True)
+@mcp.tool()
+def batch_extract_document(  # noqa: PLR0913
+    file_paths: list[str],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    # Validate batch size
+    if len(file_paths) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
+        )
+    if not file_paths:
+        raise ValidationError(
+            "File paths list cannot be empty",
+            context={"file_paths": file_paths},
+        )
+    # Validate all file paths for security
+    validated_paths = []
+    for i, file_path in enumerate(file_paths):
+        validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
+        validated_paths.append(str(validated_path))
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+    results = batch_extract_file_sync(validated_paths, config)
+    return [result.to_dict(include_none=True) for result in results]
+@mcp.tool()
+def batch_extract_bytes(  # noqa: PLR0913
+    content_items: list[dict[str, str]],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    # Validate input
+    if not content_items:
+        raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
+    if not isinstance(content_items, list):
+        raise ValidationError(
+            "content_items must be a list", context={"content_items_type": type(content_items).__name__}
+        )
+    # Validate batch size
+    if len(content_items) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
+        )
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+    # Convert list of dicts to list of tuples (bytes, mime_type)
+    contents = []
+    for i, item in enumerate(content_items):
+        # Validate item structure
+        if not isinstance(item, dict):
+            raise ValidationError(
+                f"Item at index {i} must be a dictionary",
+                context={"item_index": i, "item_type": type(item).__name__, "item": item},
+            )
+        # Check for required keys
+        if "content_base64" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'content_base64'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+        if "mime_type" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'mime_type'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+        content_base64 = item["content_base64"]
+        mime_type = item["mime_type"]
+        # Validate base64 content
+        try:
+            content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
+        except ValidationError as e:
+            # Add batch-specific context
+            e.context = e.context or {}
+            e.context["item_index"] = i
+            e.context["total_items"] = len(content_items)
+            raise
+        contents.append((content_bytes, mime_type))
+    results = batch_extract_bytes_sync(contents, config)
+    return [result.to_dict(include_none=True) for result in results]
 @mcp.tool()
 def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
+    # Validate file path for security
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
-    result = extract_file_sync(file_path, mime_type, config)
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.content
@@ -151,7 +467,9 @@ def get_supported_formats() -> str:
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    result = extract_file_sync(file_path, None, _create_config_with_overrides())
+    # Validate file path for security
+    validated_path = _validate_file_path(file_path)
+    result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
     return [
         TextContent(
@@ -163,12 +481,14 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
+    # Validate file path for security
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         extract_entities=True,
         extract_keywords=True,
         extract_tables=True,
     )
-    result = extract_file_sync(file_path, None, config)
+    result = extract_file_sync(str(validated_path), None, config)
     content = f"Document Content:\n{result.content}\n\n"

kreuzberg/_mime_types.py CHANGED Viewed

@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
     "image/x-tiff",
 }
+IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
+    {
+        "jpg",
+        "jpeg",
+        "png",
+        "gif",
+        "bmp",
+        "tiff",
+        "tif",
+        "webp",
+        "jp2",
+        "jpx",
+        "jpm",
+        "mj2",
+        "pnm",
+        "pbm",
+        "pgm",
+        "ppm",
+    }
+)
+IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
+    "image/bmp": "bmp",
+    "image/x-bmp": "bmp",
+    "image/x-ms-bmp": "bmp",
+    "image/gif": "gif",
+    "image/jpeg": "jpg",
+    "image/pjpeg": "jpg",
+    "image/png": "png",
+    "image/tiff": "tiff",
+    "image/x-tiff": "tiff",
+    "image/jp2": "jp2",
+    "image/jpx": "jpx",
+    "image/jpm": "jpm",
+    "image/mj2": "mj2",
+    "image/webp": "webp",
+    "image/x-portable-anymap": "pnm",
+    "image/x-portable-bitmap": "pbm",
+    "image/x-portable-graymap": "pgm",
+    "image/x-portable-pixmap": "ppm",
+}
 PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
     "application/csl+json",
     "application/docbook+xml",

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -239,7 +239,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             )
             return ExtractionResult(
-                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
             )
         # Group text boxes by lines based on Y coordinate  # ~keep
@@ -287,7 +287,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         )
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
     @classmethod

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         )
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
     @classmethod

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl