kreuzberg-3.14.0-py3-none-any.whl → kreuzberg-3.15.0-py3-none-any.whl

This diff compares two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (37)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +156 -30
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +182 -23
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.0.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -8,6 +8,7 @@ import re
  import subprocess
  import sys
  import tempfile
+ from concurrent.futures import ProcessPoolExecutor, as_completed
  from io import StringIO
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, ClassVar, Final
@@ -28,10 +29,10 @@ from kreuzberg._ocr._base import OCRBackend
  from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
  from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
  from kreuzberg._utils._cache import get_ocr_cache
- from kreuzberg._utils._process_pool import ProcessPoolManager
+ from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
  from kreuzberg._utils._string import normalize_spaces
  from kreuzberg._utils._sync import run_sync
- from kreuzberg._utils._tmp import create_temp_file
+ from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError

  if TYPE_CHECKING:
@@ -257,18 +258,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  if enable_table_detection and output_format == "text":
  output_format = "tsv"

- if output_format == "markdown":
- tesseract_format = "hocr"
- ext = ".hocr"
- elif output_format == "tsv":
- tesseract_format = "tsv"
- ext = ".tsv"
- elif output_format == "hocr":
- tesseract_format = "hocr"
- ext = ".hocr"
- else:
- tesseract_format = "text"
- ext = ".txt"
+ match output_format:
+ case "markdown":
+ tesseract_format = "hocr"
+ ext = ".hocr"
+ case "tsv":
+ tesseract_format = "tsv"
+ ext = ".tsv"
+ case "hocr":
+ tesseract_format = "hocr"
+ ext = ".hocr"
+ case _:
+ tesseract_format = "text"
+ ext = ".txt"

  return {
  "language": language,
@@ -344,11 +346,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  if output_format == "tsv":
  return self._extract_text_from_tsv(output)
  if output_format == "hocr":
- return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})

- return ExtractionResult(
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
- )
+ return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

  async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
  use_cache = kwargs.pop("use_cache", True)
@@ -494,9 +494,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  content += parts[11] + " "
  content = content.strip()

- return ExtractionResult(
- content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
- )
+ return ExtractionResult(content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

  async def _process_hocr_to_markdown(
  self,
@@ -517,7 +515,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  tables: list[TableData] = []
  if enable_table_detection:
- soup = BeautifulSoup(hocr_content, "lxml")
+ soup = BeautifulSoup(hocr_content, "xml")
  tables = await self._extract_tables_from_hocr(
  soup,
  table_column_threshold,
@@ -539,7 +537,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  markdown_content = normalize_spaces(markdown_content)
  except (ValueError, TypeError, AttributeError):
  try:
- soup = BeautifulSoup(hocr_content, "lxml")
+ soup = BeautifulSoup(hocr_content, "xml")
  words = soup.find_all("span", class_="ocrx_word")
  text_parts = []
  for word in words:
@@ -690,7 +688,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  except (ValueError, TypeError, AttributeError):
  try:
- soup = BeautifulSoup(hocr_content, "lxml")
+ soup = BeautifulSoup(hocr_content, "xml")
  words = soup.find_all("span", class_="ocrx_word")
  text_parts = []
  for word in words:
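
hOCR is XHTML, so these fallback branches now parse it with BeautifulSoup's "xml" feature (which requires lxml) instead of the lenient "lxml" HTML parser. A minimal sketch of the same word-extraction pattern, with a hypothetical inline hOCR fragment:

from bs4 import BeautifulSoup

# Hypothetical hOCR fragment; real input comes from tesseract's hocr output.
hocr_content = (
    '<div class="ocr_page">'
    '<span class="ocrx_word" title="bbox 10 10 60 30">Hello</span> '
    '<span class="ocrx_word" title="bbox 70 10 140 30">world</span>'
    "</div>"
)

soup = BeautifulSoup(hocr_content, "xml")  # the "xml" feature is backed by lxml
words = soup.find_all("span", class_="ocrx_word")
text_parts = [word.get_text() for word in words]
print(" ".join(text_parts))  # -> Hello world
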
@@ -948,11 +946,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  if output_format == "tsv":
  return self._extract_text_from_tsv(output)
  if output_format == "hocr":
- return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={})

- return ExtractionResult(
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
- )
+ return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

  def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
  use_cache = kwargs.pop("use_cache", True)
@@ -979,10 +975,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  ocr_cache = get_ocr_cache()
  try:
  self._validate_tesseract_version_sync()
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
- image_path = Path(tmp_file.name)
+ with temporary_file_sync(".png") as image_path:
  save_image.save(str(image_path), format="PNG")
- try:
  kwargs_with_cache = {**kwargs, "use_cache": use_cache}
  result = self.process_file_sync(image_path, **kwargs_with_cache)

@@ -990,9 +984,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  ocr_cache.set(result, **cache_kwargs)

  return result
- finally:
- if image_path.exists():
- image_path.unlink()
  finally:
  if use_cache:
  ocr_cache.mark_complete(**cache_kwargs)
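
process_image_sync now hands temp-file lifetime to temporary_file_sync from kreuzberg/_utils/_tmp.py, replacing the manual NamedTemporaryFile/unlink bookkeeping removed above. That helper's body is not shown in this diff; a context manager along these lines would match the call site (a sketch, not the library's code):

import os
import tempfile
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def temporary_file_sync(extension: str) -> Iterator[Path]:
    """Yield a temporary file path with the given suffix and remove it on exit."""
    fd, name = tempfile.mkstemp(suffix=extension)
    os.close(fd)  # the caller writes to the path, not the descriptor
    path = Path(name)
    try:
        yield path
    finally:
        path.unlink(missing_ok=True)
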
@@ -1092,6 +1083,55 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  "mtime": 0,
  }

+ def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
+ """Convert a worker result dict to ExtractionResult."""
+ if result_dict.get("success"):
+ return ExtractionResult(
+ content=str(result_dict.get("text", "")),
+ mime_type=PLAIN_TEXT_MIME_TYPE,
+ metadata={},
+ chunks=[],
+ )
+ return ExtractionResult(
+ content=f"[OCR error: {result_dict.get('error', 'Unknown error')}]",
+ mime_type=PLAIN_TEXT_MIME_TYPE,
+ metadata={},
+ chunks=[],
+ )
+
+ def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[TesseractConfig]) -> list[ExtractionResult]:
+ if not paths:
+ return []
+
+ results: list[ExtractionResult] = [
+ ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+ ] * len(paths)
+
+ run_config = self._prepare_tesseract_run_config(**kwargs)
+ config_dict: dict[str, Any] = {
+ **run_config["remaining_kwargs"],
+ "language": run_config["language"],
+ "psm": run_config["psm"],
+ }
+
+ optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
+
+ with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
+ future_to_idx = {
+ pool.submit(_process_image_with_tesseract, str(p), config_dict): idx for idx, p in enumerate(paths)
+ }
+ for future in as_completed(future_to_idx):
+ idx = future_to_idx[future]
+ try:
+ result_dict = future.result()
+ results[idx] = self._result_from_dict(result_dict)
+ except Exception as e: # noqa: BLE001
+ results[idx] = ExtractionResult(
+ content=f"[OCR error: {e}]", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+ )
+
+ return results
+
  def _build_tesseract_command(
  self,
  path: Path,
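
The new process_batch_sync fans the input paths out to a ProcessPoolExecutor sized by get_optimal_worker_count and writes results back by index, so output order matches input order and per-image failures surface as "[OCR error: ...]" placeholder results instead of exceptions. A usage sketch (module path taken from the file list above; the keyword arguments flow through TesseractConfig and are assumed here):

from pathlib import Path

from kreuzberg._ocr._tesseract import TesseractBackend

backend = TesseractBackend()
pages = [Path("page-001.png"), Path("page-002.png"), Path("page-003.png")]

# Results come back in the same order as `pages`.
results = backend.process_batch_sync(pages, language="eng")
for page, result in zip(pages, results):
    print(page.name, result.mime_type, len(result.content))
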
kreuzberg/_types.py CHANGED
@@ -4,6 +4,7 @@ import sys
  from collections.abc import Awaitable, Callable, Iterable, Mapping
  from dataclasses import asdict, dataclass, field
  from enum import Enum
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict

  import msgspec
@@ -25,8 +26,6 @@ else: # pragma: no cover
  from typing import NotRequired

  if TYPE_CHECKING:
- from pathlib import Path
-
  from PIL.Image import Image
  from polars import DataFrame

@@ -165,6 +164,12 @@ class EasyOCRConfig(ConfigDict):
  ycenter_ths: float = 0.5
  """Maximum shift in y direction for merging."""

+ def __post_init__(self) -> None:
+ if isinstance(self.language, list):
+ object.__setattr__(self, "language", tuple(self.language))
+ if isinstance(self.rotation_info, list):
+ object.__setattr__(self, "rotation_info", tuple(self.rotation_info))
+

  @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class PaddleOCRConfig(ConfigDict):
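
EasyOCRConfig's new __post_init__ coerces list-valued language and rotation_info arguments to tuples, which keeps the frozen config hashable when callers pass plain lists. A small sketch under that assumption:

from kreuzberg._types import EasyOCRConfig

# Lists are converted to tuples in __post_init__, so the frozen config stays hashable.
cfg = EasyOCRConfig(language=["en", "de"], rotation_info=[0, 90, 180, 270])
assert cfg.language == ("en", "de")
assert cfg.rotation_info == (0, 90, 180, 270)
assert isinstance(hash(cfg), int)
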
@@ -349,7 +354,52 @@ class GMFTConfig(ConfigDict):
  """


- @dataclass(frozen=True, slots=True)
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
+ class ImageOCRConfig(ConfigDict):
+ """Configuration for OCR processing of extracted images."""
+
+ enabled: bool = False
+ """Whether to perform OCR on extracted images."""
+ backend: OcrBackendType | None = None
+ """OCR backend for image OCR. Falls back to main ocr_backend when None."""
+ backend_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
+ """Backend-specific configuration for image OCR."""
+ min_dimensions: tuple[int, int] = (50, 50)
+ """Minimum (width, height) in pixels for image OCR eligibility."""
+ max_dimensions: tuple[int, int] = (10000, 10000)
+ """Maximum (width, height) in pixels for image OCR eligibility."""
+ allowed_formats: frozenset[str] = frozenset(
+ {
+ "jpg",
+ "jpeg",
+ "png",
+ "gif",
+ "bmp",
+ "tiff",
+ "tif",
+ "webp",
+ "jp2",
+ "jpx",
+ "jpm",
+ "mj2",
+ "pnm",
+ "pbm",
+ "pgm",
+ "ppm",
+ }
+ )
+ """Allowed image formats for OCR processing (lowercase, without dot)."""
+ batch_size: int = 4
+ """Number of images to process in parallel for OCR."""
+ timeout_seconds: int = 30
+ """Maximum time in seconds for OCR processing per image."""
+
+ def __post_init__(self) -> None:
+ if isinstance(self.allowed_formats, list):
+ object.__setattr__(self, "allowed_formats", frozenset(self.allowed_formats))
+
+
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class LanguageDetectionConfig(ConfigDict):
  low_memory: bool = True
  """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
@@ -391,6 +441,9 @@ class SpacyEntityExtractionConfig(ConfigDict):
  """Batch size for processing multiple texts."""

  def __post_init__(self) -> None:
+ if isinstance(self.model_cache_dir, Path):
+ object.__setattr__(self, "model_cache_dir", str(self.model_cache_dir))
+
  if self.language_models is None:
  object.__setattr__(self, "language_models", self._get_default_language_models())

@@ -622,6 +675,8 @@ class Metadata(TypedDict, total=False):
  """Source format of the extracted content."""
  error: NotRequired[str]
  """Error message if extraction failed."""
+ error_context: NotRequired[dict[str, Any]]
+ """Error context information for debugging."""


  _VALID_METADATA_KEYS = {
@@ -664,6 +719,9 @@ _VALID_METADATA_KEYS = {
  "tables_summary",
  "quality_score",
  "image_preprocessing",
+ "source_format",
+ "error",
+ "error_context",
  }


@@ -679,7 +737,7 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
  return normalized


- @dataclass(frozen=True, slots=True)
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class Entity:
  type: str
  """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
@@ -691,18 +749,44 @@ class Entity:
  """End character offset in the content"""


+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
+ class ExtractedImage:
+ data: bytes
+ format: str
+ filename: str | None = None
+ page_number: int | None = None
+ dimensions: tuple[int, int] | None = None
+ colorspace: str | None = None
+ bits_per_component: int | None = None
+ is_mask: bool = False
+ description: str | None = None
+
+
+ @dataclass(slots=True)
+ class ImageOCRResult:
+ image: ExtractedImage
+ ocr_result: ExtractionResult
+ confidence_score: float | None = None
+ processing_time: float | None = None
+ skipped_reason: str | None = None
+
+
  @dataclass(slots=True)
  class ExtractionResult:
  content: str
  """The extracted content."""
  mime_type: str
  """The mime type of the extracted content. Is either text/plain or text/markdown."""
- metadata: Metadata
+ metadata: Metadata = field(default_factory=lambda: Metadata())
  """The metadata of the content."""
  tables: list[TableData] = field(default_factory=list)
  """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
  chunks: list[str] = field(default_factory=list)
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+ images: list[ExtractedImage] = field(default_factory=list)
+ """Extracted images. Empty list if 'extract_images' is not enabled."""
+ image_ocr_results: list[ImageOCRResult] = field(default_factory=list)
+ """OCR results from extracted images. Empty list if disabled or none processed."""
  entities: list[Entity] | None = None
  """Extracted entities, if entity extraction is enabled."""
  keywords: list[tuple[str, float]] | None = None
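
With metadata now defaulting to an empty Metadata dict and images/image_ocr_results joining the other defaulted list fields, a bare result needs only content and mime_type. A quick sketch:

from kreuzberg._types import ExtractionResult

result = ExtractionResult(content="hello", mime_type="text/plain")

assert result.metadata == {}
assert result.chunks == []
assert result.images == []
assert result.image_ocr_results == []
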
@@ -751,7 +835,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]


- @dataclass(unsafe_hash=True, slots=True)
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class ExtractionConfig(ConfigDict):
  force_ocr: bool = False
  """Whether to force OCR."""
@@ -761,6 +845,41 @@ class ExtractionConfig(ConfigDict):
  """Whether to extract tables from the content. This requires the 'gmft' dependency."""
  extract_tables_from_ocr: bool = False
  """Extract tables from OCR output using TSV format (Tesseract only)."""
+ extract_images: bool = False
+ """Whether to extract images from documents."""
+ deduplicate_images: bool = True
+ """Whether to remove duplicate images using CRC32 checksums."""
+ image_ocr_config: ImageOCRConfig | None = None
+ """Configuration for OCR processing of extracted images."""
+ ocr_extracted_images: bool = False
+ """Deprecated: Use image_ocr_config.enabled instead."""
+ image_ocr_backend: OcrBackendType | None = None
+ """Deprecated: Use image_ocr_config.backend instead."""
+ image_ocr_min_dimensions: tuple[int, int] = (50, 50)
+ """Deprecated: Use image_ocr_config.min_dimensions instead."""
+ image_ocr_max_dimensions: tuple[int, int] = (10000, 10000)
+ """Deprecated: Use image_ocr_config.max_dimensions instead."""
+ image_ocr_formats: frozenset[str] = frozenset(
+ {
+ "jpg",
+ "jpeg",
+ "png",
+ "gif",
+ "bmp",
+ "tiff",
+ "tif",
+ "webp",
+ "jp2",
+ "jpx",
+ "jpm",
+ "mj2",
+ "pnm",
+ "pbm",
+ "pgm",
+ "ppm",
+ }
+ )
+ """Deprecated: Use image_ocr_config.allowed_formats instead."""
  max_chars: int = DEFAULT_MAX_CHARACTERS
  """The size of each chunk in characters."""
  max_overlap: int = DEFAULT_MAX_OVERLAP
@@ -826,6 +945,51 @@ class ExtractionConfig(ConfigDict):
  if self.validators is not None and isinstance(self.validators, list):
  object.__setattr__(self, "validators", tuple(self.validators))

+ if isinstance(self.pdf_password, list):
+ object.__setattr__(self, "pdf_password", tuple(self.pdf_password))
+
+ if isinstance(self.image_ocr_formats, list):
+ object.__setattr__(self, "image_ocr_formats", frozenset(self.image_ocr_formats))
+
+ if self.image_ocr_config is None and (
+ self.ocr_extracted_images
+ or self.image_ocr_backend is not None
+ or self.image_ocr_min_dimensions != (50, 50)
+ or self.image_ocr_max_dimensions != (10000, 10000)
+ or self.image_ocr_formats
+ != frozenset(
+ {
+ "jpg",
+ "jpeg",
+ "png",
+ "gif",
+ "bmp",
+ "tiff",
+ "tif",
+ "webp",
+ "jp2",
+ "jpx",
+ "jpm",
+ "mj2",
+ "pnm",
+ "pbm",
+ "pgm",
+ "ppm",
+ }
+ )
+ ):
+ object.__setattr__(
+ self,
+ "image_ocr_config",
+ ImageOCRConfig(
+ enabled=self.ocr_extracted_images,
+ backend=self.image_ocr_backend,
+ min_dimensions=self.image_ocr_min_dimensions,
+ max_dimensions=self.image_ocr_max_dimensions,
+ allowed_formats=self.image_ocr_formats,
+ ),
+ )
+
  if self.ocr_backend is None and self.ocr_config is not None:
  raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")

@@ -839,7 +1003,6 @@ class ExtractionConfig(ConfigDict):
  context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
  )

- # Validate DPI configuration
  if self.target_dpi <= 0:
  raise ValidationError("target_dpi must be positive", context={"target_dpi": self.target_dpi})
  if self.min_dpi <= 0:
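
Per the __post_init__ above, a config built with only the deprecated flat image-OCR fields still ends up with an equivalent nested image_ocr_config, so existing call sites keep working. A sketch of that mapping:

from kreuzberg._types import ExtractionConfig

legacy = ExtractionConfig(
    extract_images=True,
    ocr_extracted_images=True,            # deprecated: image_ocr_config.enabled
    image_ocr_backend="tesseract",        # deprecated: image_ocr_config.backend
    image_ocr_min_dimensions=(100, 100),  # deprecated: image_ocr_config.min_dimensions
)

assert legacy.image_ocr_config is not None
assert legacy.image_ocr_config.enabled is True
assert legacy.image_ocr_config.backend == "tesseract"
assert legacy.image_ocr_config.min_dimensions == (100, 100)
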
@@ -861,27 +1024,22 @@ class ExtractionConfig(ConfigDict):
  )

  def get_config_dict(self) -> dict[str, Any]:
- if self.ocr_backend is None:
- return {"use_cache": self.use_cache}
-
- if self.ocr_config is not None:
- config_dict = asdict(self.ocr_config)
- config_dict["use_cache"] = self.use_cache
- return config_dict
-
  match self.ocr_backend:
- case "tesseract":
- config_dict = asdict(TesseractConfig())
+ case None:
+ return {"use_cache": self.use_cache}
+ case _ if self.ocr_config is not None:
+ config_dict = asdict(self.ocr_config)
  config_dict["use_cache"] = self.use_cache
  return config_dict
+ case "tesseract":
+ config_dict = asdict(TesseractConfig())
  case "easyocr":
  config_dict = asdict(EasyOCRConfig())
- config_dict["use_cache"] = self.use_cache
- return config_dict
  case _:
  config_dict = asdict(PaddleOCRConfig())
- config_dict["use_cache"] = self.use_cache
- return config_dict
+
+ config_dict["use_cache"] = self.use_cache
+ return config_dict

  def to_dict(self, include_none: bool = False) -> dict[str, Any]:
  result = msgspec.to_builtins(
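
The consolidated match keeps the same observable behaviour: no backend returns only the cache flag, an explicit ocr_config is serialized as-is, and otherwise the backend's default config is used, with use_cache appended on every path. A sketch:

from kreuzberg._types import ExtractionConfig, TesseractConfig

# No OCR backend configured: only the cache flag is returned.
assert ExtractionConfig(ocr_backend=None).get_config_dict() == {"use_cache": True}

# Explicit backend config: its fields are serialized via asdict(), plus use_cache.
cfg = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(language="deu"))
config_dict = cfg.get_config_dict()
assert config_dict["language"] == "deu"
assert config_dict["use_cache"] is True
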
@@ -900,7 +1058,7 @@ class ExtractionConfig(ConfigDict):
  return {k: v for k, v in result.items() if v is not None}


- @dataclass(frozen=True)
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class HTMLToMarkdownConfig:
  stream_processing: bool = False
  """Enable streaming mode for processing large HTML documents."""
@@ -968,4 +1126,5 @@ class HTMLToMarkdownConfig:
  """Remove form elements from HTML."""

  def to_dict(self) -> dict[str, Any]:
- return {key: value for key, value in self.__dict__.items() if value is not None}
+ result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
+ return {k: v for k, v in result.items() if v is not None}
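
HTMLToMarkdownConfig.to_dict now serializes through msgspec.to_builtins with deterministic ordering rather than reading __dict__, which slots=True no longer provides, and it still drops None values. A sketch:

from kreuzberg._types import HTMLToMarkdownConfig

config = HTMLToMarkdownConfig(stream_processing=True)
d = config.to_dict()

assert d["stream_processing"] is True
assert all(value is not None for value in d.values())
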
kreuzberg/_utils/_cache.py CHANGED
@@ -20,6 +20,8 @@ from kreuzberg._utils._sync import run_sync

  T = TypeVar("T")

+ CACHE_CLEANUP_FREQUENCY = 100
+

  class KreuzbergCache(Generic[T]):
  def __init__(
@@ -136,16 +138,20 @@ class KreuzbergCache(Generic[T]):
  def _cleanup_cache(self) -> None:
  try:
  cache_files = list(self.cache_dir.glob("*.msgpack"))
-
  cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
- for cache_file in cache_files[:]:
+
+ remaining_files = []
+ for cache_file in cache_files:
  try:
  if cache_file.stat().st_mtime < cutoff_time:
  cache_file.unlink(missing_ok=True)
- cache_files.remove(cache_file)
+ else:
+ remaining_files.append(cache_file)
  except OSError: # noqa: PERF203
  continue

+ cache_files = remaining_files
+
  total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
  1024 * 1024
  )
@@ -191,7 +197,7 @@
  content = serialize(serialized)
  cache_path.write_bytes(content)

- if hash(cache_key) % 100 == 0:
+ if hash(cache_key) % CACHE_CLEANUP_FREQUENCY == 0:
  self._cleanup_cache()
  except (OSError, TypeError, ValueError):
  pass
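
Cleanup still runs on roughly one write in CACHE_CLEANUP_FREQUENCY, but the expiry scan now collects surviving files in a single pass instead of mutating the list while iterating over a copy. A standalone sketch of that pattern (illustrative, not the library's exact code):

import time
from pathlib import Path


def cleanup_msgpack_cache(cache_dir: Path, max_age_days: float) -> float:
    """Remove expired *.msgpack entries and return the surviving size in MiB."""
    cutoff_time = time.time() - max_age_days * 24 * 3600
    remaining_files: list[Path] = []
    for cache_file in cache_dir.glob("*.msgpack"):
        try:
            if cache_file.stat().st_mtime < cutoff_time:
                cache_file.unlink(missing_ok=True)
            else:
                remaining_files.append(cache_file)
        except OSError:
            continue
    return sum(f.stat().st_size for f in remaining_files if f.exists()) / (1024 * 1024)
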
kreuzberg/_utils/_device.py CHANGED
@@ -12,7 +12,7 @@ from kreuzberg.exceptions import ValidationError
  DeviceType = Literal["cpu", "cuda", "mps", "auto"]


- @dataclass(frozen=True, slots=True)
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class DeviceInfo:
  device_type: Literal["cpu", "cuda", "mps"]
  """The type of device."""
@@ -30,12 +30,10 @@ def detect_available_devices() -> list[DeviceInfo]:
  cpu_device = DeviceInfo(device_type="cpu", name="CPU")

  cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
-
  mps_device = _get_mps_device() if _is_mps_available() else None
  mps_devices = [mps_device] if mps_device else []

- gpu_devices = list(chain(cuda_devices, mps_devices))
- return [*gpu_devices, cpu_device]
+ return list(chain(cuda_devices, mps_devices, [cpu_device]))


  def get_optimal_device() -> DeviceInfo:
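
After the flattening, detect_available_devices still lists CUDA devices first, then MPS, with the CPU fallback appended last, so the first entry is the most capable device found. A sketch assuming that ordering (module path taken from the file list above):

from kreuzberg._utils._device import detect_available_devices

devices = detect_available_devices()
preferred = devices[0]  # CUDA if present, else MPS, else the CPU fallback
print(preferred.device_type, preferred.name)
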