PyPI - kreuzberg - Versions diffs - 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl - Mend

kreuzberg 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

kreuzberg/__init__.py +4 -0
kreuzberg/_api/main.py +22 -1
kreuzberg/_chunker.py +3 -3
kreuzberg/_config.py +404 -0
kreuzberg/_document_classification.py +156 -0
kreuzberg/_entity_extraction.py +6 -6
kreuzberg/_extractors/_image.py +4 -3
kreuzberg/_extractors/_pdf.py +40 -29
kreuzberg/_extractors/_spread_sheet.py +6 -8
kreuzberg/_extractors/_structured.py +34 -25
kreuzberg/_gmft.py +33 -42
kreuzberg/_language_detection.py +1 -1
kreuzberg/_mcp/server.py +58 -8
kreuzberg/_mime_types.py +1 -1
kreuzberg/_ocr/_base.py +1 -1
kreuzberg/_ocr/_easyocr.py +5 -5
kreuzberg/_ocr/_paddleocr.py +4 -4
kreuzberg/_ocr/_tesseract.py +12 -21
kreuzberg/_playa.py +2 -3
kreuzberg/_types.py +65 -27
kreuzberg/_utils/_cache.py +14 -17
kreuzberg/_utils/_device.py +17 -27
kreuzberg/_utils/_errors.py +41 -38
kreuzberg/_utils/_quality.py +7 -11
kreuzberg/_utils/_serialization.py +21 -16
kreuzberg/_utils/_string.py +22 -12
kreuzberg/_utils/_table.py +3 -4
kreuzberg/cli.py +5 -5
kreuzberg/exceptions.py +10 -0
kreuzberg/extraction.py +20 -11
kreuzberg-3.9.0.dist-info/METADATA +269 -0
kreuzberg-3.9.0.dist-info/RECORD +54 -0
kreuzberg/_cli_config.py +0 -175
kreuzberg-3.8.1.dist-info/METADATA +0 -301
kreuzberg-3.8.1.dist-info/RECORD +0 -53
{kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_device.py CHANGED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import warnings
 from dataclasses import dataclass
+from itertools import chain
 from typing import Literal
 from kreuzberg.exceptions import ValidationError
@@ -12,7 +13,7 @@ from kreuzberg.exceptions import ValidationError
 DeviceType = Literal["cpu", "cuda", "mps", "auto"]
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class DeviceInfo:
     """Information about a compute device."""
@@ -34,28 +35,17 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-    devices: list[DeviceInfo] = []
-    devices.append(
-        DeviceInfo(
-            device_type="cpu",
-            name="CPU",
-        )
-    )
-    if _is_cuda_available():
-        cuda_devices = _get_cuda_devices()
-        devices.extend(cuda_devices)
+    # Build device lists efficiently using generators
+    cpu_device = DeviceInfo(device_type="cpu", name="CPU")
-    if _is_mps_available():
-        mps_device = _get_mps_device()
-        if mps_device:
-            devices.append(mps_device)
+    cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
-    gpu_devices = [d for d in devices if d.device_type != "cpu"]
-    cpu_devices = [d for d in devices if d.device_type == "cpu"]
+    mps_device = _get_mps_device() if _is_mps_available() else None
+    mps_devices = [mps_device] if mps_device else []
-    return gpu_devices + cpu_devices
+    # Return GPU devices first, then CPU using itertools.chain
+    gpu_devices = list(chain(cuda_devices, mps_devices))
+    return [*gpu_devices, cpu_device]
 def get_optimal_device() -> DeviceInfo:
@@ -151,7 +141,7 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | No
 def _is_cuda_available() -> bool:
     """Check if CUDA is available."""
     try:
-        import torch  # type: ignore[import-not-found,unused-ignore]
+        import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
         return bool(torch.cuda.is_available())
     except ImportError:
@@ -161,7 +151,7 @@ def _is_cuda_available() -> bool:
 def _is_mps_available() -> bool:
     """Check if MPS (Apple Silicon) is available."""
     try:
-        import torch  # type: ignore[import-not-found,unused-ignore]
+        import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
         return bool(torch.backends.mps.is_available())
     except ImportError:
@@ -173,7 +163,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
     devices: list[DeviceInfo] = []
     try:
-        import torch
+        import torch  # noqa: PLC0415
         if not torch.cuda.is_available():
             return devices
@@ -209,7 +199,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
 def _get_mps_device() -> DeviceInfo | None:
     """Get information about the MPS device."""
     try:
-        import torch
+        import torch  # noqa: PLC0415
         if not torch.backends.mps.is_available():
             return None
@@ -226,7 +216,7 @@ def _get_mps_device() -> DeviceInfo | None:
 def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
     """Get CUDA memory information for a specific device."""
     try:
-        import torch
+        import torch  # noqa: PLC0415
         if not torch.cuda.is_available():
             return None, None
@@ -339,7 +329,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
     """
     if device.device_type == "cuda":
         try:
-            import torch
+            import torch  # noqa: PLC0415
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
@@ -348,7 +338,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
     elif device.device_type == "mps":
         try:
-            import torch
+            import torch  # noqa: PLC0415
             if torch.backends.mps.is_available():
                 torch.mps.empty_cache()

kreuzberg/_utils/_errors.py CHANGED Viewed

@@ -12,6 +12,42 @@ import psutil
 from kreuzberg.exceptions import ValidationError
+# Define error keywords as frozensets for O(1) membership testing
+_SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
+_TRANSIENT_ERROR_PATTERNS = frozenset(
+    {
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    }
+)
+_RESOURCE_ERROR_PATTERNS = frozenset(
+    {
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    }
+)
 def create_error_context(
     *,
@@ -52,11 +88,7 @@ def create_error_context(
             "traceback": traceback.format_exception_only(type(error), error),
         }
-    if (
-        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
-        if error
-        else False
-    ):
+    if error and any(keyword in str(error).lower() for keyword in _SYSTEM_ERROR_KEYWORDS):
         try:
             mem = psutil.virtual_memory()
             context["system"] = {
@@ -94,25 +126,8 @@ def is_transient_error(error: Exception) -> bool:
     if isinstance(error, transient_types):
         return True
-    transient_patterns = [
-        "temporary",
-        "locked",
-        "in use",
-        "access denied",
-        "permission",
-        "timeout",
-        "connection",
-        "network",
-        "too many open files",
-        "cannot allocate memory",
-        "resource temporarily unavailable",
-        "broken pipe",
-        "subprocess",
-        "signal",
-    ]
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in transient_patterns)
+    return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)
 def is_resource_error(error: Exception) -> bool:
@@ -124,22 +139,8 @@ def is_resource_error(error: Exception) -> bool:
     Returns:
         True if the error is resource-related
     """
-    resource_patterns = [
-        "memory",
-        "out of memory",
-        "cannot allocate",
-        "too many open files",
-        "file descriptor",
-        "resource",
-        "exhausted",
-        "limit",
-        "cpu",
-        "thread",
-        "process",
-    ]
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in resource_patterns)
+    return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
@@ -165,6 +166,8 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 class BatchExtractionResult:
     """Result container for batch operations with partial success support."""
+    __slots__ = ("failed", "successful", "total_count")
     def __init__(self) -> None:
         """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []

kreuzberg/_utils/_quality.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import re
+from functools import reduce
 from typing import Any
 # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
-    # Remove script and style content
-    for pattern in _SCRIPT_PATTERNS.values():
-        text = pattern.sub(" ", text)
+    # Remove script and style content using functools.reduce for single pass
+    text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
     # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    script_chars = 0
-    for pattern in _SCRIPT_PATTERNS.values():
-        matches = pattern.findall(text)
-        script_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
     return min(1.0, script_chars / total_chars)
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    nav_chars = 0
-    for pattern in _NAVIGATION_PATTERNS.values():
-        matches = pattern.findall(text)
-        nav_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
     return min(1.0, nav_chars / total_chars)

kreuzberg/_utils/_serialization.py CHANGED Viewed

@@ -2,16 +2,28 @@
 from __future__ import annotations
-from dataclasses import asdict, is_dataclass
-from enum import Enum
+from dataclasses import is_dataclass
 from typing import Any, TypeVar, cast
+import msgspec
 from msgspec import MsgspecError
 from msgspec.msgpack import decode, encode
 T = TypeVar("T")
+# Define dict method names in priority order
+_DICT_METHOD_NAMES = (
+    "to_dict",
+    "as_dict",
+    "dict",
+    "model_dump",
+    "json",
+    "to_list",
+    "tolist",
+)
 def encode_hook(obj: Any) -> Any:
     """Custom encoder for complex objects."""
     if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
     if isinstance(obj, Exception):
         return {"message": str(obj), "type": type(obj).__name__}
-    for key in (
-        "to_dict",
-        "as_dict",
-        "dict",
-        "model_dump",
-        "json",
-        "to_list",
-        "tolist",
-    ):
-        if hasattr(obj, key):
-            method = getattr(obj, key)  # Cache the attribute lookup
-            if callable(method):
-                return method()
+    # Check for dict-like methods more efficiently using any() with generator
+    for attr_name in _DICT_METHOD_NAMES:
+        method = getattr(obj, attr_name, None)
+        if method is not None and callable(method):
+            return method()
     if is_dataclass(obj) and not isinstance(obj, type):
-        return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
+        # Use msgspec.to_builtins for more efficient conversion
+        return msgspec.to_builtins(obj)
     if hasattr(obj, "save") and hasattr(obj, "format"):
         return None

kreuzberg/_utils/_string.py CHANGED Viewed

@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
 @lru_cache(maxsize=128)
 def _get_encoding_cache_key(data_hash: str, size: int) -> str:
     """Generate cache key for encoding detection."""
+    # Use string interpolation which is faster than format strings for simple cases
     return f"{data_hash}:{size}"
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
     if not text:
         return 0.0
-    # Check for common encoding problems
-    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
-    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     total_chars = len(text)
     if total_chars == 0:
         return 0.0
+    # Check for common encoding problems - compile patterns once
+    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     # Penalize replacement and control characters
     penalty = (replacement_count + control_count * 2) / total_chars
-    # Bonus for readable character ranges
+    # Bonus for readable character ranges - more efficient counting
+    # Use generator expression with early termination
     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
     readability_score = readable_chars / total_chars
     # Check for suspicious Cyrillic that might be misencoded Hebrew
     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
-    if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
-        penalty += 0.3  # Heavy penalty for likely mojibake
+    if cyrillic_matches:
+        # Calculate total length more efficiently
+        cyrillic_length = sum(len(match) for match in cyrillic_matches)
+        if cyrillic_length > total_chars * 0.1:
+            penalty += 0.3  # Heavy penalty for likely mojibake
     return max(0.0, min(1.0, readability_score - penalty))
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
     # Split by double newlines to preserve paragraph breaks
     paragraphs = text.split("\n\n")
-    normalized_paragraphs = []
+    result_paragraphs = []
     for paragraph in paragraphs:
         # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
         # Clean up multiple newlines within paragraph (keep single newlines)
         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
-        # Strip and filter empty lines efficiently
-        lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+        # Process lines efficiently - manual loop avoids double strip() calls
+        lines = []
+        for line in cleaned.split("\n"):
+            stripped_line = line.strip()
+            if stripped_line:
+                lines.append(stripped_line)
         if lines:
-            normalized_paragraphs.append("\n".join(lines))
+            result_paragraphs.append("\n".join(lines))
-    return "\n\n".join(normalized_paragraphs)
+    return "\n\n".join(result_paragraphs)

kreuzberg/_utils/_table.py CHANGED Viewed

@@ -3,7 +3,6 @@
 from __future__ import annotations
 import csv
-from io import StringIO
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
     if "df" not in table or table["df"] is None:
         return ""
-    output = StringIO()
-    table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
-    return output.getvalue().strip()
+    # Use pandas to_csv() direct string return instead of StringIO
+    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
+    return str(csv_output).strip()
 def export_table_to_tsv(table: TableData) -> str:

kreuzberg/cli.py CHANGED Viewed

@@ -18,7 +18,7 @@ except ImportError as e:
     ) from e
 from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
-from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
+from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
 from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
 DEFAULT_MAX_CHARACTERS = 4000
@@ -92,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     if config:
         file_config = load_config_from_file(config)
     else:
-        default_config = find_default_config()
+        default_config = find_config_file()
         if default_config:
             try:
                 file_config = load_config_from_file(default_config)
@@ -160,7 +160,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
             progress.add_task("Extracting text...", total=None)
             try:
-                import magic  # type: ignore[import-not-found]
+                import magic  # type: ignore[import-not-found]  # noqa: PLC0415
                 mime_type = magic.from_buffer(input_bytes, mime=True)
             except ImportError:
@@ -260,7 +260,7 @@ def cli(ctx: click.Context) -> None:
 @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
 @click.pass_context
 def extract(  # noqa: PLR0913
-    ctx: click.Context,  # noqa: ARG001
+    _: click.Context,
     file: Path | None,
     output: Path | None,
     force_ocr: bool,
@@ -314,7 +314,7 @@ def extract(  # noqa: PLR0913
 def config(config: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or find_default_config()
+        config_path = config or find_config_file()
         if config_path:
             file_config = load_config_from_file(config_path)

kreuzberg/exceptions.py CHANGED Viewed

@@ -7,6 +7,8 @@ from typing import Any
 class KreuzbergError(Exception):
     """Base exception for all Kreuzberg errors."""
+    __slots__ = ("context",)
     context: Any
     """The context of the error."""
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
 class ParsingError(KreuzbergError):
     """Raised when a parsing error occurs."""
+    __slots__ = ()
 class ValidationError(KreuzbergError):
     """Raised when a validation error occurs."""
+    __slots__ = ()
 class MissingDependencyError(KreuzbergError):
     """Raised when a dependency is missing."""
+    __slots__ = ()
     @classmethod
     def create_for_package(
         cls, *, dependency_group: str, functionality: str, package_name: str
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
 class OCRError(KreuzbergError):
     """Raised when an OCR error occurs."""
+    __slots__ = ()

kreuzberg/extraction.py CHANGED Viewed

@@ -7,15 +7,15 @@ from typing import TYPE_CHECKING, Any, Final, cast
 import anyio
-from kreuzberg import ExtractionResult
 from kreuzberg._chunker import get_chunker
+from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
 )
 from kreuzberg._registry import ExtractorRegistry
-from kreuzberg._types import ExtractionConfig
+from kreuzberg._types import ExtractionConfig, ExtractionResult
 from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
@@ -30,7 +30,9 @@ if TYPE_CHECKING:
 DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
-def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+def _validate_and_post_process_helper(
+    result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+) -> ExtractionResult:
     if config.chunk_content:
         result.chunks = _handle_chunk_content(
             mime_type=result.mime_type,
@@ -62,14 +64,19 @@ def _validate_and_post_process_helper(result: ExtractionResult, config: Extracti
             config=config.language_detection_config,
         )
+    if config.auto_detect_document_type:
+        result = auto_detect_document_type(result, config, file_path=file_path)
     return result
-async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+async def _validate_and_post_process_async(
+    result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+) -> ExtractionResult:
     for validator in config.validators or []:
         await run_maybe_sync(validator, result)
-    result = _validate_and_post_process_helper(result, config)
+    result = _validate_and_post_process_helper(result, config, file_path)
     for post_processor in config.post_processing_hooks or []:
         result = await run_maybe_sync(post_processor, result)
@@ -77,11 +84,13 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
     return result
-def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+def _validate_and_post_process_sync(
+    result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+) -> ExtractionResult:
     for validator in config.validators or []:
         run_sync_only(validator, result)
-    result = _validate_and_post_process_helper(result, config)
+    result = _validate_and_post_process_helper(result, config, file_path)
     for post_processor in config.post_processing_hooks or []:
         result = run_sync_only(post_processor, result)
@@ -172,7 +181,7 @@ async def extract_file(
                 metadata={},
             )
-        result = await _validate_and_post_process_async(result=result, config=config)
+        result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
         cache.set(path, config, result)
@@ -357,7 +366,7 @@ def extract_file_sync(
                 metadata={},
             )
-        result = _validate_and_post_process_sync(result=result, config=config)
+        result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
         cache.set(path, config, result)
@@ -460,8 +469,8 @@ def batch_extract_bytes_sync(
             return (index, error_result)
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        indexed_contents = list(enumerate(contents))
-        future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
+        # Avoid creating intermediate list, use enumerate directly
+        future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
         results: list[ExtractionResult] = [None] * len(contents)  # type: ignore[list-item]
         for future in as_completed(future_to_index):

kreuzberg 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl

kreuzberg 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl