kreuzberg-3.13.2-py3-none-any.whl → kreuzberg-3.14.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +117 -15
- kreuzberg/_config.py +3 -0
- kreuzberg/_extractors/_image.py +20 -2
- kreuzberg/_extractors/_pdf.py +21 -1
- kreuzberg/_extractors/_spread_sheet.py +16 -2
- kreuzberg/_gmft.py +79 -33
- kreuzberg/_mcp/server.py +0 -76
- kreuzberg/_ocr/_base.py +1 -2
- kreuzberg/_ocr/_paddleocr.py +39 -13
- kreuzberg/_ocr/_tesseract.py +16 -6
- kreuzberg/_registry.py +26 -0
- kreuzberg/_types.py +64 -1
- kreuzberg/_utils/_cache.py +34 -12
- kreuzberg/_utils/_image_preprocessing.py +346 -0
- kreuzberg/_utils/_ocr_cache.py +2 -5
- kreuzberg/_utils/_process_pool.py +3 -3
- kreuzberg/_utils/_table.py +4 -1
- kreuzberg/cli.py +19 -2
- kreuzberg/extraction.py +4 -4
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/METADATA +10 -10
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/licenses/LICENSE +0 -0
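The largest addition is the new `kreuzberg/_utils/_image_preprocessing.py` module, reproduced in full below. As orientation, here is a minimal sketch of how its public `normalize_image_dpi` helper might be wired up; the `ExtractionConfig` keyword arguments and the metadata attribute names are assumptions inferred from the fields the new module reads and writes, not a documented API.

```python
from PIL import Image

from kreuzberg._types import ExtractionConfig
from kreuzberg._utils._image_preprocessing import normalize_image_dpi

# Assumed constructor kwargs: the diff only shows these as attributes on the config.
config = ExtractionConfig(target_dpi=150, auto_adjust_dpi=True, max_image_dimension=4096)

with Image.open("scan.png") as image:  # any local raster image
    normalized, meta = normalize_image_dpi(image, config)
    # The metadata fields mirror the keyword arguments used to build
    # ImagePreprocessingMetadata in the new module.
    print(meta.final_dpi, meta.scale_factor, meta.new_dimensions)
```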
kreuzberg/_utils/_image_preprocessing.py
ADDED
@@ -0,0 +1,346 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from PIL import Image
+
+from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
+
+if TYPE_CHECKING:
+    from PIL.Image import Image as PILImage
+
+
+def calculate_optimal_dpi(
+    page_width: float,
+    page_height: float,
+    target_dpi: int,
+    max_dimension: int,
+    min_dpi: int = 72,
+    max_dpi: int = 600,
+) -> int:
+    """Calculate optimal DPI based on page dimensions and constraints.
+
+    Args:
+        page_width: Page width in points (1/72 inch)
+        page_height: Page height in points (1/72 inch)
+        target_dpi: Desired target DPI
+        max_dimension: Maximum allowed pixel dimension
+        min_dpi: Minimum DPI threshold
+        max_dpi: Maximum DPI threshold
+
+    Returns:
+        Optimal DPI value that keeps image within max_dimension
+    """
+    # Convert points to inches (72 points = 1 inch)
+    width_inches = page_width / 72.0
+    height_inches = page_height / 72.0
+
+    # Calculate pixel dimensions at target DPI
+    target_width_pixels = int(width_inches * target_dpi)
+    target_height_pixels = int(height_inches * target_dpi)
+
+    # Check if target DPI results in oversized image
+    max_pixel_dimension = max(target_width_pixels, target_height_pixels)
+
+    if max_pixel_dimension <= max_dimension:
+        # Target DPI is fine, clamp to min/max bounds
+        return max(min_dpi, min(target_dpi, max_dpi))
+
+    # Calculate maximum DPI that keeps within dimension constraints
+    max_dpi_for_width = max_dimension / width_inches if width_inches > 0 else max_dpi
+    max_dpi_for_height = max_dimension / height_inches if height_inches > 0 else max_dpi
+    constrained_dpi = int(min(max_dpi_for_width, max_dpi_for_height))
+
+    # Clamp to min/max bounds
+    return max(min_dpi, min(constrained_dpi, max_dpi))
+
+
+def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
+    """Extract DPI information from image."""
+    current_dpi_info = image.info.get("dpi", (72.0, 72.0))
+    if isinstance(current_dpi_info, (list, tuple)):
+        original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
+        current_dpi = float(current_dpi_info[0])  # Use horizontal DPI
+    else:
+        current_dpi = float(current_dpi_info)
+        original_dpi = (current_dpi, current_dpi)
+    return original_dpi, current_dpi
+
+
+def _should_skip_processing(
+    original_width: int,
+    original_height: int,
+    current_dpi: float,
+    config: ExtractionConfig,
+) -> bool:
+    """Check if processing should be skipped."""
+    max_current_dimension = max(original_width, original_height)
+    current_matches_target = abs(current_dpi - config.target_dpi) < 1.0
+    return not config.auto_adjust_dpi and current_matches_target and max_current_dimension <= config.max_image_dimension
+
+
+def _calculate_target_dpi(
+    original_width: int,
+    original_height: int,
+    current_dpi: float,
+    config: ExtractionConfig,
+) -> tuple[int, bool, int | None]:
+    """Calculate target DPI and whether it was auto-adjusted."""
+    calculated_dpi = None
+    if config.auto_adjust_dpi:
+        # Convert pixel dimensions to approximate point dimensions
+        # This is an approximation since we don't know the actual physical size
+        approx_width_points = original_width * 72.0 / current_dpi
+        approx_height_points = original_height * 72.0 / current_dpi
+
+        optimal_dpi = calculate_optimal_dpi(
+            approx_width_points,
+            approx_height_points,
+            config.target_dpi,
+            config.max_image_dimension,
+            config.min_dpi,
+            config.max_dpi,
+        )
+        calculated_dpi = optimal_dpi
+        auto_adjusted = optimal_dpi != config.target_dpi
+        target_dpi = optimal_dpi
+    else:
+        auto_adjusted = False
+        target_dpi = config.target_dpi
+
+    return target_dpi, auto_adjusted, calculated_dpi
+
+
+def normalize_image_dpi(
+    image: PILImage,
+    config: ExtractionConfig,
+) -> tuple[PILImage, ImagePreprocessingMetadata]:
+    """Normalize image DPI and dimensions for optimal OCR processing.
+
+    Args:
+        image: PIL Image to normalize
+        config: ExtractionConfig containing DPI settings
+
+    Returns:
+        Tuple of (normalized_image, ImagePreprocessingMetadata)
+
+    Note:
+        If auto_adjust_dpi is False, uses target_dpi directly.
+        If True, calculates optimal DPI based on image dimensions and constraints.
+    """
+    original_width, original_height = image.size
+    original_dpi, current_dpi = _extract_image_dpi(image)
+
+    # If no auto-adjustment and current DPI matches target and within limits, skip processing
+    if _should_skip_processing(original_width, original_height, current_dpi, config):
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=1.0,
+            auto_adjusted=False,
+            final_dpi=config.target_dpi,
+            skipped_resize=True,
+        )
+
+    # Calculate target DPI
+    target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
+        original_width, original_height, current_dpi, config
+    )
+
+    # Calculate scale factor based on DPI ratio
+    scale_factor = target_dpi / current_dpi
+
+    # If scale factor is very close to 1.0, skip resizing
+    if abs(scale_factor - 1.0) < 0.05:
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            calculated_dpi=calculated_dpi,
+            skipped_resize=True,
+        )
+
+    # Calculate new dimensions
+    new_width = int(original_width * scale_factor)
+    new_height = int(original_height * scale_factor)
+
+    # Ensure we don't exceed max_dimension (safety check)
+    dimension_clamped = False
+    max_new_dimension = max(new_width, new_height)
+    if max_new_dimension > config.max_image_dimension:
+        dimension_scale = config.max_image_dimension / max_new_dimension
+        new_width = int(new_width * dimension_scale)
+        new_height = int(new_height * dimension_scale)
+        scale_factor *= dimension_scale
+        dimension_clamped = True
+
+    # Resize image
+    try:
+        # Use LANCZOS for high-quality downscaling, BICUBIC for upscaling
+        # Handle different PIL versions
+        try:
+            # Modern PIL version
+            if scale_factor < 1.0:
+                resample_method = Image.Resampling.LANCZOS
+                resample_name = "LANCZOS"
+            else:
+                resample_method = Image.Resampling.BICUBIC
+                resample_name = "BICUBIC"
+        except AttributeError:
+            # Older PIL version
+            if scale_factor < 1.0:
+                resample_method = getattr(Image, "LANCZOS", 1)  # type: ignore[arg-type]
+                resample_name = "LANCZOS"
+            else:
+                resample_method = getattr(Image, "BICUBIC", 3)  # type: ignore[arg-type]
+                resample_name = "BICUBIC"
+
+        normalized_image = image.resize((new_width, new_height), resample_method)
+
+        # Update DPI info in the new image
+        normalized_image.info["dpi"] = (target_dpi, target_dpi)
+
+        return normalized_image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            new_dimensions=(new_width, new_height),
+            resample_method=resample_name,
+            dimension_clamped=dimension_clamped,
+            calculated_dpi=calculated_dpi,
+        )
+
+    except OSError as e:
+        # If resizing fails, return original image with error info
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            calculated_dpi=calculated_dpi,
+            resize_error=str(e),
+        )
+
+
+def get_dpi_adjustment_heuristics(
+    width: float,
+    height: float,
+    current_dpi: int,
+    target_dpi: int,
+    max_dimension: int,
+    content_type: str = "document",
+) -> dict[str, Any]:
+    """Get smart DPI adjustment recommendations based on content analysis.
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        current_dpi: Current DPI setting
+        target_dpi: Desired target DPI
+        max_dimension: Maximum allowed dimension
+        content_type: Type of content ("document", "photo", "mixed")
+
+    Returns:
+        Dictionary with adjustment recommendations and rationale
+    """
+    recommendations: list[str] = []
+    heuristics = {
+        "recommended_dpi": target_dpi,
+        "content_analysis": {},
+        "performance_impact": "medium",
+        "quality_impact": "medium",
+        "recommendations": recommendations,
+    }
+
+    # Calculate aspect ratio and size analysis
+    aspect_ratio = width / height if height > 0 else 1.0
+    total_pixels = width * height
+    megapixels = total_pixels / 1_000_000
+
+    heuristics["content_analysis"] = {
+        "aspect_ratio": aspect_ratio,
+        "megapixels": megapixels,
+        "is_portrait": aspect_ratio < 0.8,
+        "is_landscape": aspect_ratio > 1.2,
+        "is_large": max(width, height) > max_dimension * 0.8,
+    }
+
+    # Document-specific heuristics
+    if content_type == "document":
+        if aspect_ratio > 2.0 or aspect_ratio < 0.5:
+            # Very wide or very tall documents (like forms, receipts)
+            recommendations.append("Consider higher DPI for narrow documents")
+            if target_dpi < 200:
+                heuristics["recommended_dpi"] = min(200, target_dpi * 1.3)
+
+        if megapixels > 50:  # Very large document
+            recommendations.append("Large document detected - consider DPI reduction")
+            heuristics["performance_impact"] = "high"
+            if target_dpi > 150:
+                heuristics["recommended_dpi"] = max(120, target_dpi * 0.8)
+
+    # Memory usage estimation
+    estimated_memory_mb = (width * height * 3) / (1024 * 1024)  # RGB bytes
+    if estimated_memory_mb > 200:
+        heuristics["performance_impact"] = "high"
+        recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")
+
+    # Quality vs performance tradeoffs
+    scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
+    if scale_factor < 0.7:
+        heuristics["quality_impact"] = "high"
+        recommendations.append("Significant downscaling may reduce OCR accuracy")
+    elif scale_factor > 1.5:
+        heuristics["performance_impact"] = "high"
+        recommendations.append("Upscaling will increase processing time")
+
+    return heuristics
+
+
+def estimate_processing_time(
+    width: int,
+    height: int,
+    ocr_backend: str = "tesseract",
+) -> dict[str, float | str]:
+    """Estimate processing time based on image dimensions and OCR backend.
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        ocr_backend: OCR backend name
+
+    Returns:
+        Dictionary with time estimates in seconds
+    """
+    total_pixels = width * height
+    megapixels = total_pixels / 1_000_000
+
+    # Base processing times per megapixel (rough estimates)
+    base_times = {
+        "tesseract": 2.5,  # seconds per megapixel
+        "easyocr": 4.0,  # slower due to deep learning
+        "paddleocr": 3.5,  # moderate speed
+    }
+
+    base_time = base_times.get(ocr_backend, 3.0)
+
+    # Non-linear scaling for very large images
+    scaling_factor = 1.0 + (megapixels - 10) * 0.1 if megapixels > 10 else 1.0
+
+    estimated_time = base_time * megapixels * scaling_factor
+
+    return {
+        "estimated_seconds": estimated_time,
+        "megapixels": megapixels,
+        "backend": ocr_backend,
+        "scaling_factor": scaling_factor,
+    }
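For orientation, a small worked example of the clamping implemented by `calculate_optimal_dpi` above (an A4 page is 595 × 842 points; the numbers follow the function's own arithmetic):

```python
from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi

# A4 page (595 x 842 pt) at a requested 300 DPI with a 4000 px cap:
# 842 pt / 72 = 11.69 in; 11.69 in * 300 DPI ≈ 3508 px <= 4000, so 300 is kept.
print(calculate_optimal_dpi(595, 842, target_dpi=300, max_dimension=4000))  # 300

# Same page with a 2000 px cap: 2000 px / 11.69 in ≈ 171 DPI, clamped to [72, 600].
print(calculate_optimal_dpi(595, 842, target_dpi=300, max_dimension=2000))  # 171
```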
kreuzberg/_utils/_ocr_cache.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import hashlib
 import io
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 import anyio
@@ -9,17 +10,13 @@ import anyio
 from kreuzberg._utils._cache import get_ocr_cache
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from PIL.Image import Image as PILImage
 
     from kreuzberg._types import ExtractionResult
 
 
 def get_file_info(path: Path) -> dict[str, Any]:
-
-
-    path_obj = PathType(path) if not isinstance(path, PathType) else path
+    path_obj = path if isinstance(path, Path) else Path(path)
 
     try:
         stat = path_obj.stat()

kreuzberg/_utils/_process_pool.py
CHANGED
@@ -4,7 +4,7 @@ import io
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 import anyio
 import psutil
@@ -173,7 +173,7 @@ class ProcessPoolManager:
                self._active_tasks -= 1
 
        async with anyio.create_task_group() as tg:
-            results: list[T] = [None] * len(arg_batches)
+            results: list[T | None] = [None] * len(arg_batches)
 
            async def run_task(idx: int, args: tuple[Any, ...]) -> None:
                results[idx] = await submit_single(args)
@@ -181,7 +181,7 @@ class ProcessPoolManager:
            for idx, args in enumerate(arg_batches):
                tg.start_soon(run_task, idx, args)
 
-        return cast("list[T]", results) if False else None  # placeholder
+        return cast("list[T]", results)
 
    def get_system_info(self) -> dict[str, Any]:
        memory = psutil.virtual_memory()
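The two typed-list changes above (and the matching ones in `extraction.py` further down) follow the same idiom: pre-allocate a None-filled list so results can be written back in their original order, then `cast` once every slot is guaranteed to be filled. A generic sketch of the pattern outside kreuzberg:

```python
from typing import TypeVar, cast

T = TypeVar("T")


def fill_in_order(values: list[T]) -> list[T]:
    # Placeholders keep the annotation honest (list[T | None]) while slots are empty;
    # the cast tells the type checker the Nones are gone once every index is written.
    results: list[T | None] = [None] * len(values)
    for i, v in enumerate(values):
        results[i] = v
    return cast("list[T]", results)


print(fill_in_order(["a", "b", "c"]))  # ['a', 'b', 'c']
```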
kreuzberg/_utils/_table.py
CHANGED
@@ -89,6 +89,8 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
                 formatted_row.append(str(int(value)))
             else:
                 formatted_row.append(f"{value:.2f}")
+        elif isinstance(value, bool):
+            formatted_row.append(str(value).lower())
         else:
             clean_value = str(value).strip().replace("|", "\\|")
             formatted_row.append(clean_value)
@@ -201,7 +203,8 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
 
     total_cells = df.height * df.width
     if total_cells > 0:
-
+        null_counts = df.null_count()
+        empty_cells = sum(null_counts.row(0))
         info["empty_cells"] = empty_cells
         info["data_density"] = (total_cells - empty_cells) / total_cells
 
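The new empty-cell computation relies on polars (a pinned dependency of this release): `DataFrame.null_count()` returns a one-row frame of per-column null counts, and `row(0)` flattens it into a tuple that can be summed. A small self-contained check of that behaviour:

```python
import polars as pl

df = pl.DataFrame({"a": [1, None, 3], "b": [None, None, "x"]})

null_counts = df.null_count()          # one-row DataFrame: per-column null counts
empty_cells = sum(null_counts.row(0))  # flatten to a tuple and total them
total_cells = df.height * df.width

print(empty_cells, total_cells, (total_cells - empty_cells) / total_cells)  # 3 6 0.5
```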
kreuzberg/cli.py
CHANGED
@@ -62,7 +62,20 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
         if show_metadata:
             output_data["metadata"] = result.metadata
         if result.tables:
-
+            json_tables = []
+            for table in result.tables:
+                json_table = {
+                    "page_number": table.get("page_number"),
+                    "text": table.get("text"),
+                }
+                if "df" in table and table["df"] is not None:
+                    df = table["df"]
+                    if hasattr(df, "write_csv"):
+                        json_table["data_csv"] = df.write_csv()
+                    elif hasattr(df, "to_csv"):
+                        json_table["data_csv"] = df.to_csv(index=False)
+                json_tables.append(json_table)
+            output_data["tables"] = json_tables
         if result.chunks:
             output_data["chunks"] = result.chunks
         return json.dumps(output_data, indent=2, ensure_ascii=False)
@@ -77,7 +90,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
         output_parts.append("\n\n--- TABLES ---")
         for i, table in enumerate(result.tables):
             output_parts.append(f"\nTable {i + 1}:")
-
+            json_table = {
+                "page_number": table.get("page_number"),
+                "text": table.get("text"),
+            }
+            output_parts.append(json.dumps(json_table, indent=2, ensure_ascii=False))
 
     return "\n".join(output_parts)
 
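The CLI change serializes each table's dataframe to CSV through duck typing, so it works whether the `df` slot holds a polars frame (`write_csv`) or a pandas frame (`to_csv`). The same dispatch in isolation, as a sketch with a hypothetical helper name:

```python
from typing import Any

import polars as pl


def dataframe_to_csv(df: Any) -> str | None:
    """Return a CSV string for a polars or pandas dataframe, else None."""
    if hasattr(df, "write_csv"):
        return df.write_csv()          # polars: returns CSV text when no path is given
    if hasattr(df, "to_csv"):
        return df.to_csv(index=False)  # pandas: returns CSV text when no buffer is given
    return None


print(dataframe_to_csv(pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})))
```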
kreuzberg/extraction.py
CHANGED
@@ -426,12 +426,12 @@ def batch_extract_file_sync(
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
 
-        results: list[ExtractionResult] = [None] * len(file_paths)
+        results: list[ExtractionResult | None] = [None] * len(file_paths)
         for future in as_completed(future_to_index):
             index, result = future.result()
             results[index] = result
 
-        return results
+        return cast("list[ExtractionResult]", results)
 
 
 def batch_extract_bytes_sync(
@@ -479,9 +479,9 @@ def batch_extract_bytes_sync(
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
 
-        results: list[ExtractionResult] = [None] * len(contents)
+        results: list[ExtractionResult | None] = [None] * len(contents)
         for future in as_completed(future_to_index):
             index, result = future.result()
             results[index] = result
 
-        return results
+        return cast("list[ExtractionResult]", results)

{kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.13.2
+Version: 3.14.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,15 +31,15 @@ Requires-Python: >=3.10
 Requires-Dist: anyio>=4.10.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.
-Requires-Dist: mcp>=1.
+Requires-Dist: html-to-markdown[lxml]>=1.11.0
+Requires-Dist: mcp>=1.14.0
 Requires-Dist: msgspec>=0.18.0
-Requires-Dist: numpy>=
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: playa-pdf>=0.7.0
-Requires-Dist: polars>=1.33.
+Requires-Dist: polars>=1.33.1
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
-Requires-Dist: python-calamine>=0.5.
+Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions
@@ -55,17 +55,17 @@ Requires-Dist: keybert>=0.9.0; extra == 'all'
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.2.0; extra == 'all'
-Requires-Dist: paddlepaddle>=3.
+Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
 Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
-Requires-Dist: semantic-text-splitter>=0.
+Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.
+Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
 Provides-Extra: cli
 Requires-Dist: click>=8.2.1; extra == 'cli'
 Requires-Dist: rich>=14.1.0; extra == 'cli'
@@ -85,7 +85,7 @@ Provides-Extra: langdetect
 Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
-Requires-Dist: paddlepaddle>=3.
+Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
 Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
 Description-Content-Type: text/markdown
 

{kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/RECORD
CHANGED
@@ -1,57 +1,58 @@
 kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
 kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
 kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
-kreuzberg/_config.py,sha256=
+kreuzberg/_config.py,sha256=2LI5z9gXniqO4afrMmbZfMdhlT2701O5OlGKkrMo-bM,12385
 kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
 kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
 kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
-kreuzberg/_gmft.py,sha256=
+kreuzberg/_gmft.py,sha256=a7KDXbZM0PxyFpAIjM0xMRvxzoMo4fTQuGlFNa8uXBU,20502
 kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
 kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
 kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
-kreuzberg/_registry.py,sha256=
-kreuzberg/_types.py,sha256=
-kreuzberg/cli.py,sha256=
+kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
+kreuzberg/_types.py,sha256=yw8ZzCgwp8T4byh00gdSlABDtRwro6H1pemQsO5IZMQ,39132
+kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
 kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
-kreuzberg/extraction.py,sha256=
+kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_api/main.py,sha256=
+kreuzberg/_api/main.py,sha256=bZLaQpW8eoTFGvCGJgFodALy4rDfe9kuY1oj9OKPQpU,10792
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
 kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
 kreuzberg/_extractors/_html.py,sha256=NyQKChNLvaSUC_5x1qTYlIQGwL4lEbgUF7BgH9ejEVY,1583
-kreuzberg/_extractors/_image.py,sha256=
+kreuzberg/_extractors/_image.py,sha256=lFPoxAf7_Zbx-1t8W4vU2bhHauiNGOAFbZxr_2gNUsw,3991
 kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
-kreuzberg/_extractors/_pdf.py,sha256=
+kreuzberg/_extractors/_pdf.py,sha256=naJ_AgtAgtGIjAqiU4_G7lgftKWhUjZDLVILSG2AyVc,18757
 kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
-kreuzberg/_extractors/_spread_sheet.py,sha256=
+kreuzberg/_extractors/_spread_sheet.py,sha256=eBAx_OwoyRqMzmD4Z07UlOBwcXckymgvj_0o7di6thA,12715
 kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
 kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
-kreuzberg/_mcp/server.py,sha256=
+kreuzberg/_mcp/server.py,sha256=YPMJp6xnZ3DC32NEdX5Gqf3vwxsHZxXxUxZ6jghpv6I,5688
 kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
-kreuzberg/_ocr/_base.py,sha256=
+kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
 kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
-kreuzberg/_ocr/_paddleocr.py,sha256=
+kreuzberg/_ocr/_paddleocr.py,sha256=hfc6Zi2eSUYTVVF9y9D1P2_pLiLXPfFRoJ6QDJ6oZag,15017
 kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
-kreuzberg/_ocr/_tesseract.py,sha256=
+kreuzberg/_ocr/_tesseract.py,sha256=QEKK_PDZnNiZRgpklOgMXB-cObJy6C-HuxL6Gza5Z3c,49136
 kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_utils/_cache.py,sha256=
+kreuzberg/_utils/_cache.py,sha256=qeyI6rJOQlKtdHjJeOjUxx31eItak_drrNn8Cf8HbN8,13956
 kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
 kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
 kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
-kreuzberg/_utils/
+kreuzberg/_utils/_image_preprocessing.py,sha256=2u0A28M07F9XlYebTG5salOUVEE3YT3m8fiR8Z2ZM8E,12326
+kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
 kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
-kreuzberg/_utils/_process_pool.py,sha256=
+kreuzberg/_utils/_process_pool.py,sha256=7p8Co1w-Tvh2MUdxMcPMpvOikumrb0nN2ApQVytV-_c,6726
 kreuzberg/_utils/_quality.py,sha256=f7NbyZysyJQD8jKCNWhogvluU9A7GdEYhMsDBeMbGAA,5412
 kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
 kreuzberg/_utils/_serialization.py,sha256=97iIgdcxdbym-BEvy0J6HAduBCUXyCGwhuEHCT_l7I4,1513
 kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
 kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
-kreuzberg/_utils/_table.py,sha256=
+kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
 kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.14.0.dist-info/METADATA,sha256=68rRivXnf8n_F9lqekOydDOd8sehWpHpbbKzRup7XDc,12127
+kreuzberg-3.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.14.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.14.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.14.0.dist-info/RECORD,,
|