kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +3 -3
- kreuzberg/_extractors/_pdf.py +22 -19
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +8 -11
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +2 -7
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +15 -17
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +41 -38
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +3 -3
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +2 -2
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg-3.8.1.dist-info/METADATA +0 -301
- kreuzberg-3.8.1.dist-info/RECORD +0 -53
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import io
 import multiprocessing as mp
 import os
 import queue
@@ -9,6 +10,8 @@ from dataclasses import dataclass, field
 from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal
 
+import msgspec
+
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError
@@ -20,7 +23,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class GMFTConfig:
     """Configuration options for GMFT.
 
@@ -178,7 +181,7 @@ async def extract_tables( # noqa: PLR0915
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
@@ -308,7 +311,7 @@ def extract_tables_sync(
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
@@ -435,8 +438,6 @@ def _extract_tables_in_process(
 
     results = []
     for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
-        import io
-
        img_bytes = io.BytesIO()
        cropped_image = cropped_table.image()
        cropped_image.save(img_bytes, format="PNG")
@@ -480,7 +481,7 @@ def _extract_tables_isolated(
         RuntimeError: If extraction fails or times out
     """
     config = config or GMFTConfig()
-    config_dict =
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -528,8 +529,6 @@ def _extract_tables_isolated(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
             from PIL import Image
 
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
@@ -596,7 +595,7 @@ async def _extract_tables_isolated_async(
     import anyio
 
     config = config or GMFTConfig()
-    config_dict =
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -640,8 +639,6 @@ async def _extract_tables_isolated_async(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
             from PIL import Image
 
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
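The substantive change here is serializing `GMFTConfig` with `msgspec.to_builtins` before hashing it into a table-cache key. A minimal sketch of that pattern, using an illustrative stand-in dataclass rather than the real `GMFTConfig`:

```python
from dataclasses import dataclass

import msgspec


@dataclass(unsafe_hash=True, slots=True)
class DemoConfig:
    # Illustrative fields, not the actual GMFTConfig schema.
    detection_threshold: float = 0.9
    enable_multi_header: bool = False


config = DemoConfig()

# msgspec.to_builtins turns the dataclass into plain dicts/scalars, so the
# cache key depends only on field values, not on object identity or repr.
config_dict = msgspec.to_builtins(config)
cache_key_part = str(sorted(config_dict.items()))
print(cache_key_part)
# [('detection_threshold', 0.9), ('enable_multi_header', False)]
```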
kreuzberg/_language_detection.py
CHANGED
kreuzberg/_mcp/server.py
CHANGED
@@ -3,11 +3,14 @@
 from __future__ import annotations
 
 import base64
+import json
 from typing import Any
 
+import msgspec
 from mcp.server import FastMCP
 from mcp.types import TextContent
 
+from kreuzberg._config import try_discover_config
 from kreuzberg._types import ExtractionConfig, OcrBackendType
 from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
 
@@ -15,6 +18,44 @@ from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
 mcp = FastMCP("Kreuzberg Text Extraction")
 
 
+def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
+    """Create ExtractionConfig with discovered config as base and tool parameters as overrides.
+
+    Args:
+        **kwargs: Tool parameters to override defaults/discovered config.
+
+    Returns:
+        ExtractionConfig instance.
+    """
+    # Try to discover configuration from files
+    base_config = try_discover_config()
+
+    if base_config is None:
+        # No config file found, use defaults
+        return ExtractionConfig(**kwargs)
+
+    # Merge discovered config with tool parameters (tool params take precedence)
+    config_dict: dict[str, Any] = {
+        "force_ocr": base_config.force_ocr,
+        "chunk_content": base_config.chunk_content,
+        "extract_tables": base_config.extract_tables,
+        "extract_entities": base_config.extract_entities,
+        "extract_keywords": base_config.extract_keywords,
+        "ocr_backend": base_config.ocr_backend,
+        "max_chars": base_config.max_chars,
+        "max_overlap": base_config.max_overlap,
+        "keyword_count": base_config.keyword_count,
+        "auto_detect_language": base_config.auto_detect_language,
+        "ocr_config": base_config.ocr_config,
+        "gmft_config": base_config.gmft_config,
+    }
+
+    # Override with provided parameters
+    config_dict.update(kwargs)
+
+    return ExtractionConfig(**config_dict)
+
+
 @mcp.tool()
 def extract_document( # noqa: PLR0913
     file_path: str,
@@ -49,7 +90,7 @@ def extract_document( # noqa: PLR0913
     Returns:
         Extracted content with metadata, tables, chunks, entities, and keywords
     """
-    config =
+    config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
         extract_tables=extract_tables,
@@ -63,7 +104,7 @@ def extract_document( # noqa: PLR0913
     )
 
     result = extract_file_sync(file_path, mime_type, config)
-    return result.to_dict()
+    return result.to_dict(include_none=True)
 
 
 @mcp.tool()
@@ -102,7 +143,7 @@ def extract_bytes( # noqa: PLR0913
     """
     content_bytes = base64.b64decode(content_base64)
 
-    config =
+    config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
         extract_tables=extract_tables,
@@ -116,7 +157,7 @@ def extract_bytes( # noqa: PLR0913
     )
 
     result = extract_bytes_sync(content_bytes, mime_type, config)
-    return result.to_dict()
+    return result.to_dict(include_none=True)
 
 
 @mcp.tool()
@@ -133,7 +174,7 @@ def extract_simple(
     Returns:
         Extracted text content as a string
     """
-    config =
+    config = _create_config_with_overrides()
     result = extract_file_sync(file_path, mime_type, config)
     return result.content
 
@@ -142,7 +183,16 @@ def extract_simple(
 def get_default_config() -> str:
     """Get the default extraction configuration."""
     config = ExtractionConfig()
-    return
+    return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
+
+
+@mcp.resource("config://discovered")
+def get_discovered_config() -> str:
+    """Get the discovered configuration from config files."""
+    config = try_discover_config()
+    if config is None:
+        return "No configuration file found"
+    return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
 
 
 @mcp.resource("config://available-backends")
@@ -175,7 +225,7 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
     Returns:
         Extracted content with summarization prompt
     """
-    result = extract_file_sync(file_path, None,
+    result = extract_file_sync(file_path, None, _create_config_with_overrides())
 
     return [
         TextContent(
@@ -195,7 +245,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
     Returns:
         Extracted content with structured analysis prompt
     """
-    config =
+    config = _create_config_with_overrides(
         extract_entities=True,
         extract_keywords=True,
         extract_tables=True,
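The new `_create_config_with_overrides` helper layers explicit MCP tool parameters on top of a configuration discovered by `try_discover_config`. A simplified sketch of that precedence, with plain dicts standing in for the real `ExtractionConfig`:

```python
from typing import Any

# Stand-in for a config discovered from a file; values are illustrative only.
discovered = {"force_ocr": False, "ocr_backend": "tesseract", "keyword_count": 10}


def merge_config(base: dict[str, Any] | None, **overrides: Any) -> dict[str, Any]:
    """Start from the discovered config (if any), then let call-site kwargs win."""
    merged = dict(base) if base else {}
    merged.update(overrides)
    return merged


print(merge_config(discovered, force_ocr=True))
# {'force_ocr': True, 'ocr_backend': 'tesseract', 'keyword_count': 10}
```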
kreuzberg/_ocr/_easyocr.py
CHANGED
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -31,7 +31,7 @@ except ImportError: # pragma: no cover
 PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
 
 
-@dataclass(unsafe_hash=True, frozen=True)
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
 class PaddleOCRConfig:
     """Configuration options for PaddleOCR.
 
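Several config dataclasses in this release gain `slots=True` (here `PaddleOCRConfig`; the same flag appears on `TesseractConfig`, `GMFTConfig`, and the types in `_types.py`). A minimal sketch of what that flag changes, independent of kreuzberg:

```python
from dataclasses import dataclass


@dataclass(frozen=True, slots=True)
class SlottedConfig:
    # Illustrative fields only.
    language: str = "en"
    use_gpu: bool = False


cfg = SlottedConfig()

# slots=True (Python 3.10+) generates __slots__, so instances carry no __dict__:
# smaller per-instance memory and slightly faster attribute access.
assert not hasattr(cfg, "__dict__")

# Undeclared attributes are rejected instead of silently creating new ones
# (object.__setattr__ is used here only to bypass the frozen= protection).
try:
    object.__setattr__(cfg, "typo_field", 1)
except AttributeError as exc:
    print(f"rejected: {exc}")
```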
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import hashlib
+import io
 import os
 import re
 import subprocess
@@ -192,7 +193,7 @@ class PSMMode(Enum):
     """Treat the image as a single character."""
 
 
-@dataclass(unsafe_hash=True, frozen=True)
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
 class TesseractConfig:
     """Configuration options for Tesseract OCR engine."""
 
@@ -235,8 +236,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         image: Image,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-        import io
-
         from kreuzberg._utils._cache import get_ocr_cache
 
         image_buffer = io.BytesIO()
@@ -424,8 +423,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         Returns:
             The extraction result object
         """
-        import io
-
         from kreuzberg._utils._cache import get_ocr_cache
 
         image_buffer = io.BytesIO()
@@ -774,8 +771,6 @@ def _process_image_bytes_with_tesseract(
         OCR result as dictionary.
     """
     try:
-        import io
-
         from PIL import Image
 
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
kreuzberg/_playa.py
CHANGED
@@ -114,9 +114,8 @@ def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> Non
     if keywords := pdf_info.get("keywords"):
         if isinstance(keywords, (str, bytes)):
             kw_str = decode_text(keywords)
-
-
-            result["keywords"] = [k for k in kw_list if k]
+            # Combine multiple operations into a single comprehension
+            result["keywords"] = [k.strip() for part in kw_str.replace(";", ",").split(",") if (k := part.strip())]
         elif isinstance(keywords, list):
             result["keywords"] = [decode_text(k) for k in keywords]
 
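The new comprehension normalizes a PDF keyword string in one pass: semicolons become commas, the string is split, and empty pieces are dropped via the walrus-bound `k`. A quick worked example of the same expression in isolation:

```python
# The expression from _extract_keyword_metadata, applied to a sample string.
kw_str = "invoices; OCR, , tables ;  kreuzberg"
keywords = [k.strip() for part in kw_str.replace(";", ",").split(",") if (k := part.strip())]
print(keywords)
# ['invoices', 'OCR', 'tables', 'kreuzberg']
```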
kreuzberg/_types.py
CHANGED
@@ -5,7 +5,10 @@ from collections.abc import Awaitable, Callable
 from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Literal, TypedDict
 
+import msgspec
+
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
+from kreuzberg._utils._table import export_table_to_csv, export_table_to_tsv, extract_table_structure_info
 from kreuzberg.exceptions import ValidationError
 
 if sys.version_info < (3, 11): # pragma: no cover
@@ -191,7 +194,7 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
     return normalized
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class Entity:
     """Represents an extracted entity with type, text, and position."""
 
@@ -205,7 +208,7 @@ class Entity:
     """End character offset in the content"""
 
 
-@dataclass
+@dataclass(slots=True)
 class ExtractionResult:
     """The result of a file extraction."""
 
@@ -226,9 +229,29 @@ class ExtractionResult:
     detected_languages: list[str] | None = None
     """Languages detected in the extracted content, if language detection is enabled."""
 
-    def to_dict(self) -> dict[str, Any]:
-        """Converts the ExtractionResult to a dictionary.
-
+    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
+        """Converts the ExtractionResult to a dictionary.
+
+        Args:
+            include_none: If True, include fields with None values.
+                If False (default), exclude None values.
+
+        Returns:
+            Dictionary representation of the ExtractionResult.
+        """
+        # Use msgspec.to_builtins for efficient conversion
+        # The builtin_types parameter allows DataFrames to pass through
+        result = msgspec.to_builtins(
+            self,
+            builtin_types=(type(None),),  # Allow None to pass through
+            order="deterministic",  # Ensure consistent output
+        )
+
+        if include_none:
+            return result  # type: ignore[no-any-return]
+
+        # Remove None values to match expected behavior
+        return {k: v for k, v in result.items() if v is not None}
 
     def export_tables_to_csv(self) -> list[str]:
         """Export all tables to CSV format.
@@ -239,8 +262,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_csv
-
         return [export_table_to_csv(table) for table in self.tables]
 
     def export_tables_to_tsv(self) -> list[str]:
@@ -252,8 +273,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_tsv
-
         return [export_table_to_tsv(table) for table in self.tables]
 
     def get_table_summaries(self) -> list[dict[str, Any]]:
@@ -265,8 +284,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import extract_table_structure_info
-
         return [extract_table_structure_info(table) for table in self.tables]
 
 
@@ -274,7 +291,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class ExtractionConfig:
     """Represents configuration settings for an extraction process.
 
@@ -355,18 +372,23 @@ class ExtractionConfig:
         Returns:
             A dict of the OCR configuration or an empty dict if no backend is provided.
         """
-        if self.ocr_backend is
-
-            return asdict(self.ocr_config)
-        if self.ocr_backend == "tesseract":
-            from kreuzberg._ocr._tesseract import TesseractConfig
+        if self.ocr_backend is None:
+            return {}
 
-
-
-
+        if self.ocr_config is not None:
+            # Use asdict for OCR configs to preserve enum objects correctly
+            return asdict(self.ocr_config)
 
-
-
+        # Lazy load and cache default configs instead of creating new instances
+        if self.ocr_backend == "tesseract":
+            from kreuzberg._ocr._tesseract import TesseractConfig
 
-            return asdict(
-
+            return asdict(TesseractConfig())
+        if self.ocr_backend == "easyocr":
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            return asdict(EasyOCRConfig())
+        # paddleocr
+        from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+        return asdict(PaddleOCRConfig())
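`ExtractionResult.to_dict` now delegates to `msgspec.to_builtins` and filters out `None` values unless `include_none=True`. A small standalone sketch of that behavior with a toy dataclass, not the real `ExtractionResult`:

```python
from dataclasses import dataclass
from typing import Any

import msgspec


@dataclass(slots=True)
class DemoResult:
    content: str = "hello"
    mime_type: str = "text/plain"
    detected_languages: list[str] | None = None


def to_dict(obj: Any, include_none: bool = False) -> dict[str, Any]:
    # Same options as the new to_dict in the diff above.
    result = msgspec.to_builtins(obj, builtin_types=(type(None),), order="deterministic")
    if include_none:
        return result
    return {k: v for k, v in result.items() if v is not None}


print(to_dict(DemoResult()))
# {'content': 'hello', 'mime_type': 'text/plain'}  (detected_languages dropped)
print(to_dict(DemoResult(), include_none=True))
# includes 'detected_languages': None
```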
kreuzberg/_utils/_cache.py
CHANGED
@@ -64,11 +64,10 @@ class KreuzbergCache(Generic[T]):
         Returns:
             Unique cache key string
         """
-        # Use more efficient string building for cache key
         if not kwargs:
             return "empty"
 
-        # Build key
+        # Build cache key using list + join (faster than StringIO)
         parts = []
         for key in sorted(kwargs):
             value = kwargs[key]
@@ -81,6 +80,7 @@ class KreuzbergCache(Generic[T]):
                 parts.append(f"{key}={type(value).__name__}:{value!s}")
 
         cache_str = "&".join(parts)
+        # SHA256 is secure and fast enough for cache keys
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _get_cache_path(self, cache_key: str) -> Path:
@@ -107,15 +107,14 @@ class KreuzbergCache(Generic[T]):
             serialized_data = []
             for item in result:
                 if isinstance(item, dict) and "df" in item:
-                    #
-
+                    # Build new dict without unnecessary copy
+                    serialized_item = {k: v for k, v in item.items() if k != "df"}
                     if hasattr(item["df"], "to_csv"):
-
+                        serialized_item["df_csv"] = item["df"].to_csv(index=False)
                     else:
                         # Fallback for non-DataFrame objects
-
-
-                    serialized_data.append(item_copy)
+                        serialized_item["df_csv"] = str(item["df"])
+                    serialized_data.append(serialized_item)
                 else:
                     serialized_data.append(item)
             return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
@@ -127,18 +126,17 @@ class KreuzbergCache(Generic[T]):
             data = cached_data["data"]
 
             if cached_data.get("type") == "TableDataList" and isinstance(data, list):
+                from io import StringIO
+
+                import pandas as pd
+
                 deserialized_data = []
                 for item in data:
                     if isinstance(item, dict) and "df_csv" in item:
-                        #
-
-
-
-                        import pandas as pd
-
-                        item_copy["df"] = pd.read_csv(StringIO(item["df_csv"]))
-                        del item_copy["df_csv"]
-                        deserialized_data.append(item_copy)
+                        # Build new dict without unnecessary copy
+                        deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
+                        deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                        deserialized_data.append(deserialized_item)
                     else:
                         deserialized_data.append(item)
             return deserialized_data  # type: ignore[return-value]
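The table cache swaps each DataFrame for its CSV text on write and rebuilds it on read, now via new dicts instead of copy-and-mutate. A standalone sketch of the round trip (illustrative keys, pandas only):

```python
from io import StringIO

import pandas as pd

# Serialize: replace the DataFrame with its CSV text so the item is easy to persist.
item = {"page_number": 1, "df": pd.DataFrame({"col": [1, 2]})}
serialized = {k: v for k, v in item.items() if k != "df"}
serialized["df_csv"] = item["df"].to_csv(index=False)

# Deserialize: rebuild the DataFrame from the cached CSV text.
restored = {k: v for k, v in serialized.items() if k != "df_csv"}
restored["df"] = pd.read_csv(StringIO(serialized["df_csv"]))

assert restored["df"].equals(item["df"])
```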
kreuzberg/_utils/_device.py
CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
 
 import warnings
 from dataclasses import dataclass
+from itertools import chain
 from typing import Literal
 
 from kreuzberg.exceptions import ValidationError
@@ -12,7 +13,7 @@ from kreuzberg.exceptions import ValidationError
 DeviceType = Literal["cpu", "cuda", "mps", "auto"]
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class DeviceInfo:
     """Information about a compute device."""
 
@@ -34,28 +35,17 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-
-
-    devices.append(
-        DeviceInfo(
-            device_type="cpu",
-            name="CPU",
-        )
-    )
-
-    if _is_cuda_available():
-        cuda_devices = _get_cuda_devices()
-        devices.extend(cuda_devices)
+    # Build device lists efficiently using generators
+    cpu_device = DeviceInfo(device_type="cpu", name="CPU")
 
-    if
-        mps_device = _get_mps_device()
-        if mps_device:
-            devices.append(mps_device)
+    cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
 
-
-
+    mps_device = _get_mps_device() if _is_mps_available() else None
+    mps_devices = [mps_device] if mps_device else []
 
-
+    # Return GPU devices first, then CPU using itertools.chain
+    gpu_devices = list(chain(cuda_devices, mps_devices))
+    return [*gpu_devices, cpu_device]
 
 
 def get_optimal_device() -> DeviceInfo:
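`detect_available_devices` now builds per-backend lists and concatenates them GPU-first with `itertools.chain`, keeping CPU as the trailing fallback. A minimal sketch of that ordering with stubbed availability inputs (the stubs are not kreuzberg's detection helpers):

```python
from dataclasses import dataclass
from itertools import chain


@dataclass(frozen=True, slots=True)
class DeviceInfo:
    device_type: str
    name: str


def detect(cuda_names: list[str], mps_available: bool) -> list[DeviceInfo]:
    """GPU devices first, CPU always last as the fallback."""
    cpu_device = DeviceInfo(device_type="cpu", name="CPU")
    cuda_devices = [DeviceInfo(device_type="cuda", name=n) for n in cuda_names]
    mps_devices = [DeviceInfo(device_type="mps", name="Apple MPS")] if mps_available else []
    gpu_devices = list(chain(cuda_devices, mps_devices))
    return [*gpu_devices, cpu_device]


print([d.device_type for d in detect(["NVIDIA A100"], mps_available=False)])
# ['cuda', 'cpu']
```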
kreuzberg/_utils/_errors.py
CHANGED
@@ -12,6 +12,42 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
+# Define error keywords as frozensets for O(1) membership testing
+_SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
+_TRANSIENT_ERROR_PATTERNS = frozenset(
+    {
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    }
+)
+_RESOURCE_ERROR_PATTERNS = frozenset(
+    {
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    }
+)
+
 
 def create_error_context(
     *,
@@ -52,11 +88,7 @@ def create_error_context(
         "traceback": traceback.format_exception_only(type(error), error),
     }
 
-    if (
-        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
-        if error
-        else False
-    ):
+    if error and any(keyword in str(error).lower() for keyword in _SYSTEM_ERROR_KEYWORDS):
         try:
             mem = psutil.virtual_memory()
             context["system"] = {
@@ -94,25 +126,8 @@ def is_transient_error(error: Exception) -> bool:
     if isinstance(error, transient_types):
         return True
 
-    transient_patterns = [
-        "temporary",
-        "locked",
-        "in use",
-        "access denied",
-        "permission",
-        "timeout",
-        "connection",
-        "network",
-        "too many open files",
-        "cannot allocate memory",
-        "resource temporarily unavailable",
-        "broken pipe",
-        "subprocess",
-        "signal",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)
 
 
 def is_resource_error(error: Exception) -> bool:
@@ -124,22 +139,8 @@ def is_resource_error(error: Exception) -> bool:
     Returns:
         True if the error is resource-related
     """
-    resource_patterns = [
-        "memory",
-        "out of memory",
-        "cannot allocate",
-        "too many open files",
-        "file descriptor",
-        "resource",
-        "exhausted",
-        "limit",
-        "cpu",
-        "thread",
-        "process",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
@@ -165,6 +166,8 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 class BatchExtractionResult:
     """Result container for batch operations with partial success support."""
 
+    __slots__ = ("failed", "successful", "total_count")
+
    def __init__(self) -> None:
        """Initialize batch result container."""
        self.successful: list[tuple[int, Any]] = []
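The transient/resource keyword lists become module-level frozensets shared by the classifier helpers; the check itself remains a substring scan over the error message. A trimmed-down sketch of that pattern with a subset of the transient keywords:

```python
# A reduced version of the transient-error check from _errors.py.
_TRANSIENT_ERROR_PATTERNS = frozenset({"temporary", "timeout", "broken pipe", "locked"})


def is_transient_error(error: Exception) -> bool:
    """Return True when the error message looks retryable."""
    error_str = str(error).lower()
    return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)


print(is_transient_error(OSError("database is locked")))        # True ("locked")
print(is_transient_error(ValueError("invalid configuration")))  # False
```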