PyPI - kreuzberg - Versions diffs - 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl - Mend

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

kreuzberg/__init__.py +6 -0
kreuzberg/_api/_config_cache.py +247 -0
kreuzberg/_api/main.py +156 -30
kreuzberg/_chunker.py +7 -6
kreuzberg/_constants.py +2 -0
kreuzberg/_document_classification.py +4 -6
kreuzberg/_entity_extraction.py +9 -4
kreuzberg/_extractors/_base.py +269 -3
kreuzberg/_extractors/_email.py +95 -27
kreuzberg/_extractors/_html.py +85 -7
kreuzberg/_extractors/_image.py +23 -22
kreuzberg/_extractors/_pandoc.py +106 -75
kreuzberg/_extractors/_pdf.py +209 -99
kreuzberg/_extractors/_presentation.py +72 -8
kreuzberg/_extractors/_spread_sheet.py +25 -30
kreuzberg/_mcp/server.py +345 -25
kreuzberg/_mime_types.py +42 -0
kreuzberg/_ocr/_easyocr.py +2 -2
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +74 -34
kreuzberg/_types.py +182 -23
kreuzberg/_utils/_cache.py +10 -4
kreuzberg/_utils/_device.py +2 -4
kreuzberg/_utils/_image_preprocessing.py +12 -39
kreuzberg/_utils/_process_pool.py +29 -8
kreuzberg/_utils/_quality.py +7 -2
kreuzberg/_utils/_resource_managers.py +65 -0
kreuzberg/_utils/_sync.py +36 -6
kreuzberg/_utils/_tmp.py +37 -1
kreuzberg/cli.py +34 -20
kreuzberg/extraction.py +43 -27
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
kreuzberg-3.15.0.dist-info/RECORD +60 -0
kreuzberg-3.14.0.dist-info/RECORD +0 -58
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_image_preprocessing.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any
 from PIL import Image
+from kreuzberg._constants import PDF_POINTS_PER_INCH
 from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
 if TYPE_CHECKING:
@@ -31,36 +32,30 @@ def calculate_optimal_dpi(
     Returns:
         Optimal DPI value that keeps image within max_dimension
     """
-    # Convert points to inches (72 points = 1 inch)
-    width_inches = page_width / 72.0
-    height_inches = page_height / 72.0
+    width_inches = page_width / PDF_POINTS_PER_INCH
+    height_inches = page_height / PDF_POINTS_PER_INCH
-    # Calculate pixel dimensions at target DPI
     target_width_pixels = int(width_inches * target_dpi)
     target_height_pixels = int(height_inches * target_dpi)
-    # Check if target DPI results in oversized image
     max_pixel_dimension = max(target_width_pixels, target_height_pixels)
     if max_pixel_dimension <= max_dimension:
-        # Target DPI is fine, clamp to min/max bounds
         return max(min_dpi, min(target_dpi, max_dpi))
-    # Calculate maximum DPI that keeps within dimension constraints
     max_dpi_for_width = max_dimension / width_inches if width_inches > 0 else max_dpi
     max_dpi_for_height = max_dimension / height_inches if height_inches > 0 else max_dpi
     constrained_dpi = int(min(max_dpi_for_width, max_dpi_for_height))
-    # Clamp to min/max bounds
     return max(min_dpi, min(constrained_dpi, max_dpi))
 def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
     """Extract DPI information from image."""
-    current_dpi_info = image.info.get("dpi", (72.0, 72.0))
+    current_dpi_info = image.info.get("dpi", (PDF_POINTS_PER_INCH, PDF_POINTS_PER_INCH))
     if isinstance(current_dpi_info, (list, tuple)):
         original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
-        current_dpi = float(current_dpi_info[0])  # Use horizontal DPI
+        current_dpi = float(current_dpi_info[0])
     else:
         current_dpi = float(current_dpi_info)
         original_dpi = (current_dpi, current_dpi)
@@ -88,10 +83,8 @@ def _calculate_target_dpi(
     """Calculate target DPI and whether it was auto-adjusted."""
     calculated_dpi = None
     if config.auto_adjust_dpi:
-        # Convert pixel dimensions to approximate point dimensions
-        # This is an approximation since we don't know the actual physical size
-        approx_width_points = original_width * 72.0 / current_dpi
-        approx_height_points = original_height * 72.0 / current_dpi
+        approx_width_points = original_width * PDF_POINTS_PER_INCH / current_dpi
+        approx_height_points = original_height * PDF_POINTS_PER_INCH / current_dpi
         optimal_dpi = calculate_optimal_dpi(
             approx_width_points,
@@ -131,7 +124,6 @@ def normalize_image_dpi(
     original_width, original_height = image.size
     original_dpi, current_dpi = _extract_image_dpi(image)
-    # If no auto-adjustment and current DPI matches target and within limits, skip processing
     if _should_skip_processing(original_width, original_height, current_dpi, config):
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
@@ -143,15 +135,12 @@ def normalize_image_dpi(
             skipped_resize=True,
         )
-    # Calculate target DPI
     target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
         original_width, original_height, current_dpi, config
     )
-    # Calculate scale factor based on DPI ratio
     scale_factor = target_dpi / current_dpi
-    # If scale factor is very close to 1.0, skip resizing
     if abs(scale_factor - 1.0) < 0.05:
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
@@ -164,11 +153,9 @@ def normalize_image_dpi(
             skipped_resize=True,
         )
-    # Calculate new dimensions
     new_width = int(original_width * scale_factor)
     new_height = int(original_height * scale_factor)
-    # Ensure we don't exceed max_dimension (safety check)
     dimension_clamped = False
     max_new_dimension = max(new_width, new_height)
     if max_new_dimension > config.max_image_dimension:
@@ -178,12 +165,8 @@ def normalize_image_dpi(
         scale_factor *= dimension_scale
         dimension_clamped = True
-    # Resize image
     try:
-        # Use LANCZOS for high-quality downscaling, BICUBIC for upscaling
-        # Handle different PIL versions
         try:
-            # Modern PIL version
             if scale_factor < 1.0:
                 resample_method = Image.Resampling.LANCZOS
                 resample_name = "LANCZOS"
@@ -191,7 +174,6 @@ def normalize_image_dpi(
                 resample_method = Image.Resampling.BICUBIC
                 resample_name = "BICUBIC"
         except AttributeError:
-            # Older PIL version
             if scale_factor < 1.0:
                 resample_method = getattr(Image, "LANCZOS", 1)  # type: ignore[arg-type]
                 resample_name = "LANCZOS"
@@ -201,7 +183,6 @@ def normalize_image_dpi(
         normalized_image = image.resize((new_width, new_height), resample_method)
-        # Update DPI info in the new image
         normalized_image.info["dpi"] = (target_dpi, target_dpi)
         return normalized_image, ImagePreprocessingMetadata(
@@ -218,7 +199,6 @@ def normalize_image_dpi(
         )
     except OSError as e:
-        # If resizing fails, return original image with error info
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
             original_dpi=original_dpi,
@@ -261,7 +241,6 @@ def get_dpi_adjustment_heuristics(
         "recommendations": recommendations,
     }
-    # Calculate aspect ratio and size analysis
     aspect_ratio = width / height if height > 0 else 1.0
     total_pixels = width * height
     megapixels = total_pixels / 1_000_000
@@ -274,27 +253,23 @@ def get_dpi_adjustment_heuristics(
         "is_large": max(width, height) > max_dimension * 0.8,
     }
-    # Document-specific heuristics
     if content_type == "document":
         if aspect_ratio > 2.0 or aspect_ratio < 0.5:
-            # Very wide or very tall documents (like forms, receipts)
             recommendations.append("Consider higher DPI for narrow documents")
             if target_dpi < 200:
                 heuristics["recommended_dpi"] = min(200, target_dpi * 1.3)
-        if megapixels > 50:  # Very large document
+        if megapixels > 50:
             recommendations.append("Large document detected - consider DPI reduction")
             heuristics["performance_impact"] = "high"
             if target_dpi > 150:
                 heuristics["recommended_dpi"] = max(120, target_dpi * 0.8)
-    # Memory usage estimation
-    estimated_memory_mb = (width * height * 3) / (1024 * 1024)  # RGB bytes
+    estimated_memory_mb = (width * height * 3) / (1024 * 1024)
     if estimated_memory_mb > 200:
         heuristics["performance_impact"] = "high"
         recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")
-    # Quality vs performance tradeoffs
     scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
     if scale_factor < 0.7:
         heuristics["quality_impact"] = "high"
@@ -324,16 +299,14 @@ def estimate_processing_time(
     total_pixels = width * height
     megapixels = total_pixels / 1_000_000
-    # Base processing times per megapixel (rough estimates)
     base_times = {
-        "tesseract": 2.5,  # seconds per megapixel
-        "easyocr": 4.0,  # slower due to deep learning
-        "paddleocr": 3.5,  # moderate speed
+        "tesseract": 2.5,
+        "easyocr": 4.0,
+        "paddleocr": 3.5,
     }
     base_time = base_times.get(ocr_backend, 3.0)
-    # Non-linear scaling for very large images
     scaling_factor = 1.0 + (megapixels - 10) * 0.1 if megapixels > 10 else 1.0
     estimated_time = base_time * megapixels * scaling_factor

kreuzberg/_utils/_process_pool.py CHANGED Viewed

@@ -19,15 +19,9 @@ if TYPE_CHECKING:
 T = TypeVar("T")
+_POOL_SIZE = mp.cpu_count()
-_POOL_SIZE = max(1, mp.cpu_count() - 1)
-def _create_process_pool() -> ProcessPoolExecutor:
-    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
-_process_pool_ref = Ref("process_pool", _create_process_pool)
+_process_pool_ref = Ref("process_pool", lambda: ProcessPoolExecutor(max_workers=_POOL_SIZE))
 def _get_process_pool() -> ProcessPoolExecutor:
@@ -51,6 +45,33 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) ->
         return future.result()
+def get_optimal_worker_count(num_tasks: int, cpu_intensive: bool = True) -> int:
+    """Calculate optimal worker count based on workload.
+    Optimized based on benchmarking results:
+    - For 1 task: Use 1 worker (avoid overhead)
+    - For 2-3 tasks: Use num_tasks workers
+    - For 4+ tasks: Use all CPU cores for CPU-intensive work
+    """
+    cpu_count = mp.cpu_count()
+    if num_tasks == 1:
+        return 1
+    if num_tasks <= 3:
+        return min(num_tasks, cpu_count)
+    if cpu_intensive:
+        return cpu_count
+    return min(cpu_count * 2, max(cpu_count, num_tasks))
+def warmup_process_pool() -> None:
+    """Warm up the process pool to reduce initialization overhead."""
+    with process_pool() as pool:
+        futures = [pool.submit(lambda: None) for _ in range(_POOL_SIZE)]
+        for future in futures:
+            future.result()
 def shutdown_process_pool() -> None:
     if _process_pool_ref.is_initialized():
         pool = _process_pool_ref.get()

kreuzberg/_utils/_quality.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import re
 from functools import reduce
+from itertools import chain
 from typing import Any
 _OCR_ARTIFACTS = {
@@ -97,7 +98,9 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
+    script_chars = sum(
+        len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _SCRIPT_PATTERNS.values())
+    )
     return min(1.0, script_chars / total_chars)
@@ -106,7 +109,9 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
+    nav_chars = sum(
+        len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _NAVIGATION_PATTERNS.values())
+    )
     return min(1.0, nav_chars / total_chars)

kreuzberg/_utils/_resource_managers.py ADDED Viewed

@@ -0,0 +1,65 @@
+from __future__ import annotations
+import contextlib
+from typing import TYPE_CHECKING
+import pypdfium2
+from kreuzberg._utils._pdf_lock import pypdfium_file_lock
+from kreuzberg._utils._sync import run_sync
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import AsyncGenerator, Generator
+    from pathlib import Path
+@contextlib.asynccontextmanager
+async def pdf_document(file_path: Path) -> AsyncGenerator[pypdfium2.PdfDocument, None]:
+    """Async context manager for PyPDFium document resources."""
+    document = None
+    try:
+        with pypdfium_file_lock(file_path):
+            document = await run_sync(pypdfium2.PdfDocument, str(file_path))
+            yield document
+    finally:
+        if document:
+            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
+                await run_sync(document.close)
+@contextlib.contextmanager
+def pdf_document_sync(file_path: Path) -> Generator[pypdfium2.PdfDocument, None, None]:
+    """Sync context manager for PyPDFium document resources."""
+    document = None
+    try:
+        with pypdfium_file_lock(file_path):
+            document = pypdfium2.PdfDocument(str(file_path))
+            yield document
+    finally:
+        if document:
+            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
+                document.close()
+@contextlib.contextmanager
+def pdf_resources_sync(*resources: object) -> Generator[None, None, None]:
+    """Context manager for multiple PDF resources (pages, textpages, bitmaps)."""
+    try:
+        yield
+    finally:
+        for resource in resources:
+            with contextlib.suppress(Exception):
+                if hasattr(resource, "close"):
+                    resource.close()
+@contextlib.contextmanager
+def image_resources(*images: object) -> Generator[None, None, None]:
+    """Context manager for PIL Image resources."""
+    try:
+        yield
+    finally:
+        for image in images:
+            with contextlib.suppress(Exception):
+                if hasattr(image, "close"):
+                    image.close()

kreuzberg/_utils/_sync.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import asyncio
 from functools import partial
 from inspect import isawaitable, iscoroutinefunction
 from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -37,14 +38,43 @@ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
     return results
-async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
-    results: list[Any] = []
+async def run_taskgroup_batched(
+    *async_tasks: Awaitable[Any],
+    batch_size: int,
+    use_semaphore: bool = True,
+) -> list[Any]:
+    """Run async tasks with controlled concurrency.
-    for i in range(0, len(async_tasks), batch_size):
-        batch = async_tasks[i : i + batch_size]
-        results.extend(await run_taskgroup(*batch))
+    Args:
+        async_tasks: Tasks to execute
+        batch_size: Maximum concurrent tasks
+        use_semaphore: Use semaphore for concurrency control instead of sequential batches
-    return results
+    Returns:
+        List of results in the same order as input tasks
+    """
+    if not async_tasks:
+        return []
+    if len(async_tasks) <= batch_size or not use_semaphore:
+        results: list[Any] = []
+        for i in range(0, len(async_tasks), batch_size):
+            batch = async_tasks[i : i + batch_size]
+            results.extend(await run_taskgroup(*batch))
+        return results
+    semaphore = asyncio.Semaphore(batch_size)
+    async def run_with_semaphore(task: Awaitable[Any], index: int) -> tuple[int, Any]:
+        async with semaphore:
+            result = await task
+            return (index, result)
+    indexed_tasks = [run_with_semaphore(task, i) for i, task in enumerate(async_tasks)]
+    indexed_results = await asyncio.gather(*indexed_tasks)
+    indexed_results.sort(key=lambda x: x[0])
+    return [result for _, result in indexed_results]
 async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:

kreuzberg/_utils/_tmp.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from __future__ import annotations
+import contextlib
+import os
+import tempfile
 from contextlib import suppress
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -10,7 +13,7 @@ from anyio import Path as AsyncPath
 from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Callable, Coroutine
+    from collections.abc import AsyncGenerator, Callable, Coroutine, Generator
 async def create_temp_file(
@@ -26,3 +29,36 @@ async def create_temp_file(
             await AsyncPath(file.name).unlink(missing_ok=True)
     return Path(file.name), unlink
+@contextlib.asynccontextmanager
+async def temporary_file(extension: str, content: bytes | None = None) -> AsyncGenerator[Path, None]:
+    """Async context manager for temporary files with automatic cleanup."""
+    file_path, unlink = await create_temp_file(extension, content)
+    try:
+        yield file_path
+    finally:
+        await unlink()
+@contextlib.contextmanager
+def temporary_file_sync(extension: str, content: bytes | None = None) -> Generator[Path, None, None]:
+    """Sync context manager for temporary files with automatic cleanup."""
+    fd, temp_path = tempfile.mkstemp(suffix=extension)
+    try:
+        if content:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+        else:
+            os.close(fd)
+        yield Path(temp_path)
+    finally:
+        with suppress(OSError, PermissionError):
+            Path(temp_path).unlink()
+@contextlib.contextmanager
+def temporary_directory() -> Generator[Path, None, None]:
+    """Context manager for temporary directories with automatic cleanup."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        yield Path(temp_dir)

kreuzberg/cli.py CHANGED Viewed

@@ -122,32 +122,37 @@ def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
         "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
         "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
         "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
+        "extract_entities": params["extract_entities"] if params["extract_entities"] else None,
+        "extract_keywords": params["extract_keywords"] if params["extract_keywords"] else None,
+        "auto_detect_language": params["auto_detect_language"] if params["auto_detect_language"] else None,
+        "keyword_count": params["keyword_count"] if params["keyword_count"] != 10 else None,
         "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
         "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
         "ocr_backend": params["ocr_backend"],
     }
     ocr_backend = params["ocr_backend"]
-    if ocr_backend == "tesseract" and (
-        params["tesseract_lang"]
-        or params["tesseract_psm"] is not None
-        or params["tesseract_output_format"]
-        or params["enable_table_detection"]
-    ):
-        tesseract_config = {}
-        if params["tesseract_lang"]:
-            tesseract_config["language"] = params["tesseract_lang"]
-        if params["tesseract_psm"] is not None:
-            tesseract_config["psm"] = params["tesseract_psm"]
-        if params["tesseract_output_format"]:
-            tesseract_config["output_format"] = params["tesseract_output_format"]
-        if params["enable_table_detection"]:
-            tesseract_config["enable_table_detection"] = True
-        cli_args["tesseract_config"] = tesseract_config
-    elif ocr_backend == "easyocr" and params["easyocr_languages"]:
-        cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
-    elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
-        cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
+    match ocr_backend:
+        case "tesseract" if (
+            params["tesseract_lang"]
+            or params["tesseract_psm"] is not None
+            or params["tesseract_output_format"]
+            or params["enable_table_detection"]
+        ):
+            tesseract_config = {}
+            if params["tesseract_lang"]:
+                tesseract_config["language"] = params["tesseract_lang"]
+            if params["tesseract_psm"] is not None:
+                tesseract_config["psm"] = params["tesseract_psm"]
+            if params["tesseract_output_format"]:
+                tesseract_config["output_format"] = params["tesseract_output_format"]
+            if params["enable_table_detection"]:
+                tesseract_config["enable_table_detection"] = True
+            cli_args["tesseract_config"] = tesseract_config
+        case "easyocr" if params["easyocr_languages"]:
+            cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
+        case "paddleocr" if params["paddleocr_languages"]:
+            cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
     return cli_args
@@ -250,6 +255,9 @@ def cli(ctx: click.Context) -> None:
 @click.option("--force-ocr", is_flag=True, help="Force OCR processing")
 @click.option("--chunk-content", is_flag=True, help="Enable content chunking")
 @click.option("--extract-tables", is_flag=True, help="Enable table extraction")
+@click.option("--extract-entities", is_flag=True, help="Enable entity extraction")
+@click.option("--extract-keywords", is_flag=True, help="Enable keyword extraction")
+@click.option("--auto-detect-language", is_flag=True, help="Enable automatic language detection")
 @click.option(
     "--max-chars",
     type=int,
@@ -262,6 +270,12 @@ def cli(ctx: click.Context) -> None:
     default=DEFAULT_MAX_OVERLAP,
     help=f"Maximum overlap between chunks (default: {DEFAULT_MAX_OVERLAP})",
 )
+@click.option(
+    "--keyword-count",
+    type=int,
+    default=10,
+    help="Number of keywords to extract (default: 10)",
+)
 @click.option(
     "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
 )

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl