PyPI - kreuzberg - Versions diffs - 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl - Mend

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

kreuzberg/__init__.py +10 -0
kreuzberg/_api/_config_cache.py +247 -0
kreuzberg/_api/main.py +74 -45
kreuzberg/_chunker.py +7 -6
kreuzberg/_config.py +11 -1
kreuzberg/_constants.py +2 -0
kreuzberg/_document_classification.py +5 -7
kreuzberg/_entity_extraction.py +9 -4
kreuzberg/_extractors/_base.py +269 -3
kreuzberg/_extractors/_email.py +101 -27
kreuzberg/_extractors/_html.py +112 -7
kreuzberg/_extractors/_image.py +23 -22
kreuzberg/_extractors/_pandoc.py +106 -75
kreuzberg/_extractors/_pdf.py +208 -99
kreuzberg/_extractors/_presentation.py +76 -8
kreuzberg/_extractors/_spread_sheet.py +24 -30
kreuzberg/_extractors/_structured.py +83 -15
kreuzberg/_gmft.py +5 -0
kreuzberg/_mcp/server.py +324 -25
kreuzberg/_mime_types.py +42 -0
kreuzberg/_ocr/_easyocr.py +53 -21
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +88 -37
kreuzberg/_types.py +291 -61
kreuzberg/_utils/_cache.py +10 -4
kreuzberg/_utils/_device.py +2 -4
kreuzberg/_utils/_html_streaming.py +20 -0
kreuzberg/_utils/_image_preprocessing.py +12 -39
kreuzberg/_utils/_process_pool.py +29 -8
kreuzberg/_utils/_quality.py +7 -2
kreuzberg/_utils/_resource_managers.py +65 -0
kreuzberg/_utils/_serialization.py +13 -6
kreuzberg/_utils/_sync.py +39 -10
kreuzberg/_utils/_tmp.py +37 -1
kreuzberg/cli.py +34 -20
kreuzberg/extraction.py +44 -28
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
kreuzberg-3.16.0.dist-info/RECORD +61 -0
kreuzberg-3.14.1.dist-info/RECORD +0 -58
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_types.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from __future__ import annotations
 import sys
-from collections.abc import Awaitable, Callable, Iterable, Mapping
+from collections.abc import Awaitable, Callable, Mapping
 from dataclasses import asdict, dataclass, field
 from enum import Enum
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
 import msgspec
@@ -25,8 +26,6 @@ else:  # pragma: no cover
     from typing import NotRequired
 if TYPE_CHECKING:
-    from pathlib import Path
     from PIL.Image import Image
     from polars import DataFrame
@@ -165,6 +164,12 @@ class EasyOCRConfig(ConfigDict):
     ycenter_ths: float = 0.5
     """Maximum shift in y direction for merging."""
+    def __post_init__(self) -> None:
+        if isinstance(self.language, list):
+            object.__setattr__(self, "language", tuple(self.language))
+        if isinstance(self.rotation_info, list):
+            object.__setattr__(self, "rotation_info", tuple(self.rotation_info))
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class PaddleOCRConfig(ConfigDict):
@@ -349,6 +354,51 @@ class GMFTConfig(ConfigDict):
     """
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class ImageOCRConfig(ConfigDict):
+    """Configuration for OCR processing of extracted images."""
+    enabled: bool = False
+    """Whether to perform OCR on extracted images."""
+    backend: OcrBackendType | None = None
+    """OCR backend for image OCR. Falls back to main ocr_backend when None."""
+    backend_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
+    """Backend-specific configuration for image OCR."""
+    min_dimensions: tuple[int, int] = (50, 50)
+    """Minimum (width, height) in pixels for image OCR eligibility."""
+    max_dimensions: tuple[int, int] = (10000, 10000)
+    """Maximum (width, height) in pixels for image OCR eligibility."""
+    allowed_formats: frozenset[str] = frozenset(
+        {
+            "jpg",
+            "jpeg",
+            "png",
+            "gif",
+            "bmp",
+            "tiff",
+            "tif",
+            "webp",
+            "jp2",
+            "jpx",
+            "jpm",
+            "mj2",
+            "pnm",
+            "pbm",
+            "pgm",
+            "ppm",
+        }
+    )
+    """Allowed image formats for OCR processing (lowercase, without dot)."""
+    batch_size: int = 4
+    """Number of images to process in parallel for OCR."""
+    timeout_seconds: int = 30
+    """Maximum time in seconds for OCR processing per image."""
+    def __post_init__(self) -> None:
+        if isinstance(self.allowed_formats, list):
+            object.__setattr__(self, "allowed_formats", frozenset(self.allowed_formats))
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class LanguageDetectionConfig(ConfigDict):
     low_memory: bool = True
@@ -391,6 +441,9 @@ class SpacyEntityExtractionConfig(ConfigDict):
     """Batch size for processing multiple texts."""
     def __post_init__(self) -> None:
+        if isinstance(self.model_cache_dir, Path):
+            object.__setattr__(self, "model_cache_dir", str(self.model_cache_dir))
         if self.language_models is None:
             object.__setattr__(self, "language_models", self._get_default_language_models())
@@ -538,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
 class Metadata(TypedDict, total=False):
+    abstract: NotRequired[str]
+    """Document abstract or summary."""
     authors: NotRequired[list[str]]
     """List of document authors."""
     categories: NotRequired[list[str]]
@@ -622,9 +677,28 @@ class Metadata(TypedDict, total=False):
     """Source format of the extracted content."""
     error: NotRequired[str]
     """Error message if extraction failed."""
+    error_context: NotRequired[dict[str, Any]]
+    """Error context information for debugging."""
+    json_schema: NotRequired[dict[str, Any]]
+    """JSON schema information extracted from structured data."""
+    notes: NotRequired[list[str]]
+    """Notes or additional information extracted from documents."""
+    note: NotRequired[str]
+    """Single note or annotation."""
+    name: NotRequired[str]
+    """Name field from structured data."""
+    body: NotRequired[str]
+    """Body text content."""
+    text: NotRequired[str]
+    """Generic text content."""
+    message: NotRequired[str]
+    """Message or communication content."""
+    attributes: NotRequired[dict[str, Any]]
+    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
 _VALID_METADATA_KEYS = {
+    "abstract",
     "authors",
     "categories",
     "citations",
@@ -664,6 +738,17 @@ _VALID_METADATA_KEYS = {
     "tables_summary",
     "quality_score",
     "image_preprocessing",
+    "source_format",
+    "error",
+    "error_context",
+    "json_schema",
+    "notes",
+    "note",
+    "name",
+    "body",
+    "text",
+    "message",
+    "attributes",
 }
@@ -672,14 +757,34 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
         return {}
     normalized: Metadata = {}
+    attributes: dict[str, Any] = {}
     for key, value in data.items():
-        if key in _VALID_METADATA_KEYS and value is not None:
-            normalized[key] = value  # type: ignore[literal-required]
+        if value is not None:
+            if key in _VALID_METADATA_KEYS:
+                normalized[key] = value  # type: ignore[literal-required]
+            elif "." in key and key.split(".")[-1] in {
+                "title",
+                "name",
+                "subject",
+                "description",
+                "content",
+                "body",
+                "text",
+                "message",
+                "note",
+                "abstract",
+                "summary",
+            }:
+                attributes[key] = value
+    if attributes:
+        normalized["attributes"] = attributes
     return normalized
-@dataclass(frozen=True, slots=True)
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
 class Entity:
     type: str
     """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
@@ -691,18 +796,44 @@ class Entity:
     """End character offset in the content"""
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class ExtractedImage:
+    data: bytes
+    format: str
+    filename: str | None = None
+    page_number: int | None = None
+    dimensions: tuple[int, int] | None = None
+    colorspace: str | None = None
+    bits_per_component: int | None = None
+    is_mask: bool = False
+    description: str | None = None
+@dataclass(slots=True)
+class ImageOCRResult:
+    image: ExtractedImage
+    ocr_result: ExtractionResult
+    confidence_score: float | None = None
+    processing_time: float | None = None
+    skipped_reason: str | None = None
 @dataclass(slots=True)
 class ExtractionResult:
     content: str
     """The extracted content."""
     mime_type: str
     """The mime type of the extracted content. Is either text/plain or text/markdown."""
-    metadata: Metadata
+    metadata: Metadata = field(default_factory=lambda: Metadata())
     """The metadata of the content."""
     tables: list[TableData] = field(default_factory=list)
     """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    images: list[ExtractedImage] = field(default_factory=list)
+    """Extracted images. Empty list if 'extract_images' is not enabled."""
+    image_ocr_results: list[ImageOCRResult] = field(default_factory=list)
+    """OCR results from extracted images. Empty list if disabled or none processed."""
     entities: list[Entity] | None = None
     """Extracted entities, if entity extraction is enabled."""
     keywords: list[tuple[str, float]] | None = None
@@ -751,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class JSONExtractionConfig(ConfigDict):
+    extract_schema: bool = False
+    """Extract and include JSON schema information in metadata."""
+    custom_text_field_patterns: frozenset[str] | None = None
+    """Custom patterns to identify text fields beyond default keywords."""
+    max_depth: int = 10
+    """Maximum nesting depth to process in JSON structures."""
+    array_item_limit: int = 1000
+    """Maximum number of array items to process to prevent memory issues."""
+    include_type_info: bool = False
+    """Include data type information in extracted content."""
+    flatten_nested_objects: bool = True
+    """Flatten nested objects using dot notation for better text extraction."""
+    def __post_init__(self) -> None:
+        if self.max_depth <= 0:
+            raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
+        if self.array_item_limit <= 0:
+            raise ValidationError(
+                "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
+            )
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class ExtractionConfig(ConfigDict):
     force_ocr: bool = False
@@ -761,6 +916,41 @@ class ExtractionConfig(ConfigDict):
     """Whether to extract tables from the content. This requires the 'gmft' dependency."""
     extract_tables_from_ocr: bool = False
     """Extract tables from OCR output using TSV format (Tesseract only)."""
+    extract_images: bool = False
+    """Whether to extract images from documents."""
+    deduplicate_images: bool = True
+    """Whether to remove duplicate images using CRC32 checksums."""
+    image_ocr_config: ImageOCRConfig | None = None
+    """Configuration for OCR processing of extracted images."""
+    ocr_extracted_images: bool = False
+    """Deprecated: Use image_ocr_config.enabled instead."""
+    image_ocr_backend: OcrBackendType | None = None
+    """Deprecated: Use image_ocr_config.backend instead."""
+    image_ocr_min_dimensions: tuple[int, int] = (50, 50)
+    """Deprecated: Use image_ocr_config.min_dimensions instead."""
+    image_ocr_max_dimensions: tuple[int, int] = (10000, 10000)
+    """Deprecated: Use image_ocr_config.max_dimensions instead."""
+    image_ocr_formats: frozenset[str] = frozenset(
+        {
+            "jpg",
+            "jpeg",
+            "png",
+            "gif",
+            "bmp",
+            "tiff",
+            "tif",
+            "webp",
+            "jp2",
+            "jpx",
+            "jpm",
+            "mj2",
+            "pnm",
+            "pbm",
+            "pgm",
+            "ppm",
+        }
+    )
+    """Deprecated: Use image_ocr_config.allowed_formats instead."""
     max_chars: int = DEFAULT_MAX_CHARACTERS
     """The size of each chunk in characters."""
     max_overlap: int = DEFAULT_MAX_OVERLAP
@@ -805,6 +995,8 @@ class ExtractionConfig(ConfigDict):
     """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
     html_to_markdown_config: HTMLToMarkdownConfig | None = None
     """Configuration for HTML to Markdown conversion. If None, uses default settings."""
+    json_config: JSONExtractionConfig | None = None
+    """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
     use_cache: bool = True
     """Whether to use caching for extraction results. Set to False to disable all caching."""
     target_dpi: int = 150
@@ -826,6 +1018,51 @@ class ExtractionConfig(ConfigDict):
         if self.validators is not None and isinstance(self.validators, list):
             object.__setattr__(self, "validators", tuple(self.validators))
+        if isinstance(self.pdf_password, list):
+            object.__setattr__(self, "pdf_password", tuple(self.pdf_password))
+        if isinstance(self.image_ocr_formats, list):
+            object.__setattr__(self, "image_ocr_formats", frozenset(self.image_ocr_formats))
+        if self.image_ocr_config is None and (
+            self.ocr_extracted_images
+            or self.image_ocr_backend is not None
+            or self.image_ocr_min_dimensions != (50, 50)
+            or self.image_ocr_max_dimensions != (10000, 10000)
+            or self.image_ocr_formats
+            != frozenset(
+                {
+                    "jpg",
+                    "jpeg",
+                    "png",
+                    "gif",
+                    "bmp",
+                    "tiff",
+                    "tif",
+                    "webp",
+                    "jp2",
+                    "jpx",
+                    "jpm",
+                    "mj2",
+                    "pnm",
+                    "pbm",
+                    "pgm",
+                    "ppm",
+                }
+            )
+        ):
+            object.__setattr__(
+                self,
+                "image_ocr_config",
+                ImageOCRConfig(
+                    enabled=self.ocr_extracted_images,
+                    backend=self.image_ocr_backend,
+                    min_dimensions=self.image_ocr_min_dimensions,
+                    max_dimensions=self.image_ocr_max_dimensions,
+                    allowed_formats=self.image_ocr_formats,
+                ),
+            )
         if self.ocr_backend is None and self.ocr_config is not None:
             raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
@@ -839,7 +1076,6 @@ class ExtractionConfig(ConfigDict):
                 context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
             )
-        # Validate DPI configuration
         if self.target_dpi <= 0:
             raise ValidationError("target_dpi must be positive", context={"target_dpi": self.target_dpi})
         if self.min_dpi <= 0:
@@ -861,27 +1097,22 @@ class ExtractionConfig(ConfigDict):
             )
     def get_config_dict(self) -> dict[str, Any]:
-        if self.ocr_backend is None:
-            return {"use_cache": self.use_cache}
-        if self.ocr_config is not None:
-            config_dict = asdict(self.ocr_config)
-            config_dict["use_cache"] = self.use_cache
-            return config_dict
         match self.ocr_backend:
-            case "tesseract":
-                config_dict = asdict(TesseractConfig())
+            case None:
+                return {"use_cache": self.use_cache}
+            case _ if self.ocr_config is not None:
+                config_dict = asdict(self.ocr_config)
                 config_dict["use_cache"] = self.use_cache
                 return config_dict
+            case "tesseract":
+                config_dict = asdict(TesseractConfig())
             case "easyocr":
                 config_dict = asdict(EasyOCRConfig())
-                config_dict["use_cache"] = self.use_cache
-                return config_dict
             case _:
                 config_dict = asdict(PaddleOCRConfig())
-                config_dict["use_cache"] = self.use_cache
-                return config_dict
+        config_dict["use_cache"] = self.use_cache
+        return config_dict
     def to_dict(self, include_none: bool = False) -> dict[str, Any]:
         result = msgspec.to_builtins(
@@ -900,72 +1131,71 @@ class ExtractionConfig(ConfigDict):
         return {k: v for k, v in result.items() if v is not None}
-@dataclass(frozen=True)
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
 class HTMLToMarkdownConfig:
-    stream_processing: bool = False
-    """Enable streaming mode for processing large HTML documents."""
-    chunk_size: int = 1024
-    """Size of chunks when stream_processing is enabled."""
-    chunk_callback: Callable[[str], None] | None = None
-    """Callback function invoked for each chunk during stream processing."""
-    progress_callback: Callable[[int, int], None] | None = None
-    """Callback function for progress updates (current, total)."""
-    parser: str | None = "lxml"
-    """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
     autolinks: bool = True
-    """Convert URLs to clickable links automatically."""
+    """Automatically convert valid URLs to Markdown links."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
     bullets: str = "*+-"
     """Characters to use for unordered list bullets."""
     code_language: str = ""
-    """Default language for code blocks."""
+    """Default language identifier for fenced code blocks."""
     code_language_callback: Callable[[Any], str] | None = None
-    """Callback to determine code language dynamically."""
-    convert: str | Iterable[str] | None = None
-    """HTML tags to convert. If None, all supported tags are converted."""
+    """Function to dynamically determine code block language."""
+    convert: list[str] | None = None
+    """List of HTML tags to convert (None = all supported tags)."""
     convert_as_inline: bool = False
-    """Convert block elements as inline elements."""
-    custom_converters: Mapping[Any, Any] | None = None
-    """Custom converters for specific HTML elements."""
+    """Treat content as inline elements only."""
+    custom_converters: Mapping[str, Callable[..., str]] | None = None
+    """Mapping of HTML tag names to custom converter functions."""
     default_title: bool = False
-    """Use a default title if none is found."""
+    """Use default titles for elements like links."""
     escape_asterisks: bool = True
-    """Escape asterisks in text to prevent unintended emphasis."""
+    """Escape * characters to prevent unintended formatting."""
     escape_misc: bool = True
-    """Escape miscellaneous characters that have special meaning in Markdown."""
+    """Escape miscellaneous characters to prevent Markdown conflicts."""
     escape_underscores: bool = True
-    """Escape underscores in text to prevent unintended emphasis."""
+    """Escape _ characters to prevent unintended formatting."""
     extract_metadata: bool = True
-    """Extract metadata from HTML head section."""
+    """Extract document metadata as comment header."""
     heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
     """Style for markdown headings."""
     highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
     """Style for highlighting text."""
-    keep_inline_images_in: Iterable[str] | None = None
-    """HTML tags where inline images should be preserved."""
+    keep_inline_images_in: list[str] | None = None
+    """Tags where inline images should be preserved."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation to use for lists."""
+    list_indent_width: int = 4
+    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
     newline_style: Literal["spaces", "backslash"] = "spaces"
     """Style for line breaks in markdown."""
-    strip: str | Iterable[str] | None = None
-    """HTML tags to strip completely from output."""
+    preprocess_html: bool = False
+    """Enable HTML preprocessing to clean messy HTML."""
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
+    """Preprocessing level for cleaning HTML."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    remove_navigation: bool = True
+    """Remove navigation elements during preprocessing."""
+    strip: list[str] | None = None
+    """List of HTML tags to remove from output."""
     strip_newlines: bool = False
-    """Strip newlines from the output."""
+    """Remove newlines from HTML input before processing."""
     strong_em_symbol: Literal["*", "_"] = "*"
     """Symbol to use for strong/emphasis formatting."""
     sub_symbol: str = ""
     """Symbol to use for subscript text."""
     sup_symbol: str = ""
     """Symbol to use for superscript text."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """Whitespace handling mode."""
     wrap: bool = False
     """Enable text wrapping."""
     wrap_width: int = 80
-    """Width for text wrapping when wrap is True."""
-    preprocess_html: bool = True
-    """Enable HTML preprocessing to clean up the input."""
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
-    """Preprocessing level for cleaning HTML."""
-    remove_navigation: bool = True
-    """Remove navigation elements from HTML."""
-    remove_forms: bool = True
-    """Remove form elements from HTML."""
+    """Width for text wrapping."""
     def to_dict(self) -> dict[str, Any]:
-        return {key: value for key, value in self.__dict__.items() if value is not None}
+        result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
+        return {k: v for k, v in result.items() if v is not None}

kreuzberg/_utils/_cache.py CHANGED Viewed

@@ -20,6 +20,8 @@ from kreuzberg._utils._sync import run_sync
 T = TypeVar("T")
+CACHE_CLEANUP_FREQUENCY = 100
 class KreuzbergCache(Generic[T]):
     def __init__(
@@ -136,16 +138,20 @@ class KreuzbergCache(Generic[T]):
     def _cleanup_cache(self) -> None:
         try:
             cache_files = list(self.cache_dir.glob("*.msgpack"))
             cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
-            for cache_file in cache_files[:]:
+            remaining_files = []
+            for cache_file in cache_files:
                 try:
                     if cache_file.stat().st_mtime < cutoff_time:
                         cache_file.unlink(missing_ok=True)
-                        cache_files.remove(cache_file)
+                    else:
+                        remaining_files.append(cache_file)
                 except OSError:  # noqa: PERF203
                     continue
+            cache_files = remaining_files
             total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
                 1024 * 1024
             )
@@ -191,7 +197,7 @@ class KreuzbergCache(Generic[T]):
             content = serialize(serialized)
             cache_path.write_bytes(content)
-            if hash(cache_key) % 100 == 0:
+            if hash(cache_key) % CACHE_CLEANUP_FREQUENCY == 0:
                 self._cleanup_cache()
         except (OSError, TypeError, ValueError):
             pass

kreuzberg/_utils/_device.py CHANGED Viewed

@@ -12,7 +12,7 @@ from kreuzberg.exceptions import ValidationError
 DeviceType = Literal["cpu", "cuda", "mps", "auto"]
-@dataclass(frozen=True, slots=True)
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
 class DeviceInfo:
     device_type: Literal["cpu", "cuda", "mps"]
     """The type of device."""
@@ -30,12 +30,10 @@ def detect_available_devices() -> list[DeviceInfo]:
     cpu_device = DeviceInfo(device_type="cpu", name="CPU")
     cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
     mps_device = _get_mps_device() if _is_mps_available() else None
     mps_devices = [mps_device] if mps_device else []
-    gpu_devices = list(chain(cuda_devices, mps_devices))
-    return [*gpu_devices, cpu_device]
+    return list(chain(cuda_devices, mps_devices, [cpu_device]))
 def get_optimal_device() -> DeviceInfo:

kreuzberg/_utils/_html_streaming.py ADDED Viewed

@@ -0,0 +1,20 @@
+from __future__ import annotations
+_STREAMING_THRESHOLD_KB = 10
+_LARGE_FILE_THRESHOLD_MB = 1
+_DEFAULT_CHUNK_SIZE = 2048
+_LARGE_FILE_CHUNK_SIZE = 4096
+_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
+_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
+def should_use_streaming(content_size: int) -> tuple[bool, int]:
+    if content_size < 0:
+        return False, _DEFAULT_CHUNK_SIZE
+    if content_size > _STREAMING_THRESHOLD_BYTES:
+        if content_size > _LARGE_FILE_THRESHOLD_BYTES:
+            return True, _LARGE_FILE_CHUNK_SIZE
+        return True, _DEFAULT_CHUNK_SIZE
+    return False, _DEFAULT_CHUNK_SIZE

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl