kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +11 -1
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pdf.py +2 -3
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +0 -21
- kreuzberg/_ocr/_easyocr.py +51 -19
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/_types.py +111 -40
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +12 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py
CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (
 
 mcp = FastMCP("Kreuzberg Text Extraction")
 
-# Security and performance limits
 MAX_BATCH_SIZE = 100
 
 
@@ -46,7 +45,6 @@ def _validate_file_path(file_path: str) -> Path:
             context={"file_path": file_path, "error": str(e)},
         ) from e
 
-    # Check for path traversal attempts
    if ".." in file_path and not file_path.startswith("/"):
        raise ValidationError(
            "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
     try:
         return _validate_file_path(file_path)
     except ValidationError as e:
-        # Add context about which file in the batch failed
         e.context = e.context or {}
         e.context["batch_index"] = index
         e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
             context={"context": context_info},
         )
 
-    # Check for whitespace-only content
     if not content_base64.strip():
         raise ValidationError(
             "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()
 
-    # Extract Tesseract-specific parameters from kwargs first
     tesseract_lang = kwargs.pop("tesseract_lang", None)
     tesseract_psm = kwargs.pop("tesseract_psm", None)
     tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
     config_dict = config_dict | kwargs
 
-    # Handle Tesseract OCR configuration
     ocr_backend = config_dict.get("ocr_backend")
     if ocr_backend == "tesseract" and (
         tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
             tesseract_config_dict["enable_table_detection"] = True
 
         if tesseract_config_dict:
-            # Merge with existing tesseract config if present
             existing_ocr_config = config_dict.get("ocr_config")
             if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
-                # Convert existing config to dict, merge, and recreate
                 existing_dict = existing_ocr_config.to_dict()
                 merged_dict = existing_dict | tesseract_config_dict
                 config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate batch size
     if len(file_paths) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document(  # noqa: PLR0913
             context={"file_paths": file_paths},
         )
 
-    # Validate all file paths for security
     validated_paths = []
     for i, file_path in enumerate(file_paths):
         validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate input
     if not content_items:
         raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
 
@@ -355,7 +343,6 @@ def batch_extract_bytes(  # noqa: PLR0913
             "content_items must be a list", context={"content_items_type": type(content_items).__name__}
         )
 
-    # Validate batch size
     if len(content_items) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes(  # noqa: PLR0913
         enable_table_detection=enable_table_detection,
     )
 
-    # Convert list of dicts to list of tuples (bytes, mime_type)
     contents = []
     for i, item in enumerate(content_items):
-        # Validate item structure
         if not isinstance(item, dict):
             raise ValidationError(
                 f"Item at index {i} must be a dictionary",
                 context={"item_index": i, "item_type": type(item).__name__, "item": item},
             )
 
-        # Check for required keys
         if "content_base64" not in item:
             raise ValidationError(
                 f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes(  # noqa: PLR0913
         content_base64 = item["content_base64"]
         mime_type = item["mime_type"]
 
-        # Validate base64 content
         try:
             content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
         except ValidationError as e:
-            # Add batch-specific context
             e.context = e.context or {}
             e.context["item_index"] = i
             e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
     result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:
 
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
 
@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         extract_entities=True,
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,39 @@ except ImportError:  # pragma: no cover
 
 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    # If easyocr is already set (either real module or mock), return it
+    if easyocr is not None:
+        return easyocr, torch
+
+    # If explicitly disabled for testing
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None
 
-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415
 
+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
+        return easyocr, torch
     except ImportError:
-
-        easyocr: Any = None
-        np: Any = None
-        torch: Any = None
+        return None, None
 
 
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
 
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -292,7 +314,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
     @classmethod
     def _is_gpu_available(cls) -> bool:
-        if torch is None:
+        # Use the module-level torch variable directly to respect patches
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-        if not HAS_EASYOCR:
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             cls._reader = await run_sync(
-                easyocr.Reader,
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages
 
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-        if not HAS_EASYOCR:
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")
 
         try:
-            cls._reader = easyocr.Reader(
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,
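The net effect of these hunks: `easyocr`, `torch`, and `numpy` are no longer imported at module load. A cached `_import_easyocr()` helper resolves the modules on first use (so tests can patch the module-level names), `numpy` is imported inside the two `process_image*` methods, and reader initialization now validates the language code before attempting the import. A condensed, self-contained restatement of the lazy-import pattern follows; it deliberately omits the `not HAS_EASYOCR` early-exit the real helper uses for test disabling, and is a sketch rather than the shipped code:

```python
from typing import Any

easyocr: Any = None  # module-level cache; tests may patch these names directly
torch: Any = None
HAS_EASYOCR: bool = False


def _import_easyocr() -> tuple[Any, Any]:
    """Resolve easyocr/torch once and cache them at module level."""
    global HAS_EASYOCR, easyocr, torch
    if easyocr is not None:  # already imported, or patched by a test
        return easyocr, torch
    try:
        import easyocr as _easyocr

        try:
            import torch as _torch
        except ImportError:
            _torch = None  # GPU probing will simply report False
        easyocr, torch = _easyocr, _torch
        HAS_EASYOCR = True
        return easyocr, torch
    except ImportError:
        return None, None


module, _ = _import_easyocr()
print("easyocr available:", module is not None)
```

Callers then follow the pattern visible in the reader-initialization hunks: request the module, and raise `MissingDependencyError` only when OCR is actually used.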
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
@@ -510,7 +511,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             escape_asterisks=False,
             escape_underscores=False,
             extract_metadata=False,
-            strip="meta title",
+            strip=["meta", "title"],
         )
 
         tables: list[TableData] = []
@@ -532,6 +533,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         config_dict = config.to_dict()
         config_dict["custom_converters"] = all_converters
 
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         try:
             markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
             markdown_content = normalize_spaces(markdown_content)
@@ -676,12 +681,18 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             escape_asterisks=False,
             escape_underscores=False,
             extract_metadata=False,
-            strip="meta title",
+            strip=["meta", "title"],
         )
 
+        config_dict = html_config.to_dict()
+
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         markdown_content = html_to_markdown.convert_to_markdown(
             hocr_content,
-            **html_config.to_dict(),
+            **config_dict,
         )
 
         markdown_content = normalize_spaces(markdown_content)
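Two changes here: `strip` now receives a list of tag names instead of the old space-delimited string, and streaming is decided per call from the hOCR payload size via the new `should_use_streaming` helper (shown in full further down). A sketch of the resulting call path; the HTML input is a stand-in for Tesseract's hOCR output, and the option names are the ones visible in the hunks:

```python
import html_to_markdown

from kreuzberg._utils._html_streaming import should_use_streaming

hocr_content = "<html><body><p>hello world</p></body></html>"  # stand-in for real hOCR

config_dict = {
    "escape_asterisks": False,
    "escape_underscores": False,
    "extract_metadata": False,
    "strip": ["meta", "title"],  # a list of tags now, not "meta title"
}

# Streaming kicks in automatically for large payloads (see _html_streaming.py below)
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
config_dict["stream_processing"] = use_streaming
config_dict["chunk_size"] = chunk_size

markdown = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
print(markdown)
```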
kreuzberg/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import sys
-from collections.abc import Awaitable, Callable,
+from collections.abc import Awaitable, Callable, Mapping
 from dataclasses import asdict, dataclass, field
 from enum import Enum
 from pathlib import Path
@@ -591,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
 
 
 class Metadata(TypedDict, total=False):
+    abstract: NotRequired[str]
+    """Document abstract or summary."""
     authors: NotRequired[list[str]]
     """List of document authors."""
     categories: NotRequired[list[str]]
@@ -677,9 +679,26 @@ class Metadata(TypedDict, total=False):
     """Error message if extraction failed."""
     error_context: NotRequired[dict[str, Any]]
     """Error context information for debugging."""
+    json_schema: NotRequired[dict[str, Any]]
+    """JSON schema information extracted from structured data."""
+    notes: NotRequired[list[str]]
+    """Notes or additional information extracted from documents."""
+    note: NotRequired[str]
+    """Single note or annotation."""
+    name: NotRequired[str]
+    """Name field from structured data."""
+    body: NotRequired[str]
+    """Body text content."""
+    text: NotRequired[str]
+    """Generic text content."""
+    message: NotRequired[str]
+    """Message or communication content."""
+    attributes: NotRequired[dict[str, Any]]
+    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
 
 
 _VALID_METADATA_KEYS = {
+    "abstract",
     "authors",
     "categories",
     "citations",
@@ -722,6 +741,14 @@ _VALID_METADATA_KEYS = {
     "source_format",
     "error",
     "error_context",
+    "json_schema",
+    "notes",
+    "note",
+    "name",
+    "body",
+    "text",
+    "message",
+    "attributes",
 }
 
 
@@ -730,9 +757,29 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
         return {}
 
     normalized: Metadata = {}
+    attributes: dict[str, Any] = {}
+
     for key, value in data.items():
-        if key in _VALID_METADATA_KEYS and value is not None:
-            normalized[key] = value  # type: ignore[literal-required]
+        if value is not None:
+            if key in _VALID_METADATA_KEYS:
+                normalized[key] = value  # type: ignore[literal-required]
+            elif "." in key and key.split(".")[-1] in {
+                "title",
+                "name",
+                "subject",
+                "description",
+                "content",
+                "body",
+                "text",
+                "message",
+                "note",
+                "abstract",
+                "summary",
+            }:
+                attributes[key] = value
+
+    if attributes:
+        normalized["attributes"] = attributes
 
     return normalized
 
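The rewritten `normalize_metadata` keeps its old behavior for known keys but now also collects dotted keys whose last segment looks like a text field into an `attributes` sub-dict. A sketch of the resulting behavior, with invented inputs:

```python
from kreuzberg._types import normalize_metadata

raw = {
    "authors": ["Ada Lovelace"],  # known key: kept at the top level
    "dc.title": "Annual Report",  # dotted key, recognized suffix "title": routed to "attributes"
    "custom.checksum": "abc123",  # dotted key, unrecognized suffix: dropped
    "subject": None,              # None values are always dropped
}

normalized = normalize_metadata(raw)
# {"authors": ["Ada Lovelace"], "attributes": {"dc.title": "Annual Report"}}
print(normalized)
```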
@@ -835,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 
 
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class JSONExtractionConfig(ConfigDict):
+    extract_schema: bool = False
+    """Extract and include JSON schema information in metadata."""
+    custom_text_field_patterns: frozenset[str] | None = None
+    """Custom patterns to identify text fields beyond default keywords."""
+    max_depth: int = 10
+    """Maximum nesting depth to process in JSON structures."""
+    array_item_limit: int = 1000
+    """Maximum number of array items to process to prevent memory issues."""
+    include_type_info: bool = False
+    """Include data type information in extracted content."""
+    flatten_nested_objects: bool = True
+    """Flatten nested objects using dot notation for better text extraction."""
+
+    def __post_init__(self) -> None:
+        if self.max_depth <= 0:
+            raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
+        if self.array_item_limit <= 0:
+            raise ValidationError(
+                "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
+            )
+
+
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class ExtractionConfig(ConfigDict):
     force_ocr: bool = False
@@ -924,6 +995,8 @@ class ExtractionConfig(ConfigDict):
     """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
     html_to_markdown_config: HTMLToMarkdownConfig | None = None
     """Configuration for HTML to Markdown conversion. If None, uses default settings."""
+    json_config: JSONExtractionConfig | None = None
+    """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
     use_cache: bool = True
     """Whether to use caching for extraction results. Set to False to disable all caching."""
     target_dpi: int = 150
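`ExtractionConfig` gains a `json_config` slot for the new frozen `JSONExtractionConfig`, whose `__post_init__` rejects non-positive limits. A usage sketch, assuming both classes are importable from `kreuzberg._types` where this hunk defines them:

```python
from kreuzberg._types import ExtractionConfig, JSONExtractionConfig

config = ExtractionConfig(
    json_config=JSONExtractionConfig(
        extract_schema=True,          # also surface a "json_schema" metadata entry
        max_depth=5,                  # must be positive; validated in __post_init__
        flatten_nested_objects=True,  # nested objects become dotted keys
    )
)

try:
    JSONExtractionConfig(array_item_limit=0)
except Exception as e:  # kreuzberg raises its ValidationError here
    print(e)  # "array_item_limit must be positive"
```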
@@ -1060,70 +1133,68 @@ class ExtractionConfig(ConfigDict):
 
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class HTMLToMarkdownConfig:
-    stream_processing: bool = False
-    """Enable streaming mode for processing large HTML documents."""
-    chunk_size: int = 1024
-    """Size of chunks when stream_processing is enabled."""
-    chunk_callback: Callable[[str], None] | None = None
-    """Callback function invoked for each chunk during stream processing."""
-    progress_callback: Callable[[int, int], None] | None = None
-    """Callback function for progress updates (current, total)."""
-    parser: str | None = "lxml"
-    """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
     autolinks: bool = True
-    """
+    """Automatically convert valid URLs to Markdown links."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
     bullets: str = "*+-"
     """Characters to use for unordered list bullets."""
     code_language: str = ""
-    """Default language for code blocks."""
+    """Default language identifier for fenced code blocks."""
     code_language_callback: Callable[[Any], str] | None = None
-    """
-    convert:
-    """HTML tags to convert
+    """Function to dynamically determine code block language."""
+    convert: list[str] | None = None
+    """List of HTML tags to convert (None = all supported tags)."""
     convert_as_inline: bool = False
-    """
-    custom_converters: Mapping[
-    """
+    """Treat content as inline elements only."""
+    custom_converters: Mapping[str, Callable[..., str]] | None = None
+    """Mapping of HTML tag names to custom converter functions."""
     default_title: bool = False
-    """Use
+    """Use default titles for elements like links."""
     escape_asterisks: bool = True
-    """Escape
+    """Escape * characters to prevent unintended formatting."""
     escape_misc: bool = True
-    """Escape miscellaneous characters
+    """Escape miscellaneous characters to prevent Markdown conflicts."""
     escape_underscores: bool = True
-    """Escape
+    """Escape _ characters to prevent unintended formatting."""
     extract_metadata: bool = True
-    """Extract metadata
+    """Extract document metadata as comment header."""
     heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
     """Style for markdown headings."""
     highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
     """Style for highlighting text."""
-    keep_inline_images_in:
-    """
+    keep_inline_images_in: list[str] | None = None
+    """Tags where inline images should be preserved."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation to use for lists."""
+    list_indent_width: int = 4
+    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
     newline_style: Literal["spaces", "backslash"] = "spaces"
     """Style for line breaks in markdown."""
-
-    """HTML
+    preprocess_html: bool = False
+    """Enable HTML preprocessing to clean messy HTML."""
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
+    """Preprocessing level for cleaning HTML."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    remove_navigation: bool = True
+    """Remove navigation elements during preprocessing."""
+    strip: list[str] | None = None
+    """List of HTML tags to remove from output."""
     strip_newlines: bool = False
-    """
+    """Remove newlines from HTML input before processing."""
     strong_em_symbol: Literal["*", "_"] = "*"
     """Symbol to use for strong/emphasis formatting."""
     sub_symbol: str = ""
     """Symbol to use for subscript text."""
     sup_symbol: str = ""
     """Symbol to use for superscript text."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """Whitespace handling mode."""
     wrap: bool = False
     """Enable text wrapping."""
     wrap_width: int = 80
-    """Width for text wrapping
-    preprocess_html: bool = True
-    """Enable HTML preprocessing to clean up the input."""
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
-    """Preprocessing level for cleaning HTML."""
-    remove_navigation: bool = True
-    """Remove navigation elements from HTML."""
-    remove_forms: bool = True
-    """Remove form elements from HTML."""
+    """Width for text wrapping."""
 
     def to_dict(self) -> dict[str, Any]:
         result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
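Beyond the reorganized docstrings, note the behavioral bits in this hunk: `stream_processing`, `chunk_size`, the two callbacks, and `parser` are no longer dataclass fields (streaming is now decided per call, as in the Tesseract hunks above); `preprocess_html` flips its default from True to False; `preprocessing_preset` drops from "aggressive" to "standard"; and new knobs (`br_in_tables`, `list_indent_type`, `list_indent_width`, `whitespace_mode`, `strip`) appear. A configuration sketch, with field names taken from the hunk and values purely illustrative:

```python
from kreuzberg._types import ExtractionConfig, HTMLToMarkdownConfig

html_config = HTMLToMarkdownConfig(
    heading_style="atx",
    list_indent_width=2,      # e.g. for Discord/Slack-flavored output
    strip=["meta", "title"],  # list of tags to drop from the output
    preprocess_html=True,     # opt back in; the default is now False
)

config = ExtractionConfig(html_to_markdown_config=html_config)
```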
kreuzberg/_utils/_html_streaming.py
ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+_STREAMING_THRESHOLD_KB = 10
+_LARGE_FILE_THRESHOLD_MB = 1
+_DEFAULT_CHUNK_SIZE = 2048
+_LARGE_FILE_CHUNK_SIZE = 4096
+
+_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
+_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
+
+
+def should_use_streaming(content_size: int) -> tuple[bool, int]:
+    if content_size < 0:
+        return False, _DEFAULT_CHUNK_SIZE
+
+    if content_size > _STREAMING_THRESHOLD_BYTES:
+        if content_size > _LARGE_FILE_THRESHOLD_BYTES:
+            return True, _LARGE_FILE_CHUNK_SIZE
+        return True, _DEFAULT_CHUNK_SIZE
+    return False, _DEFAULT_CHUNK_SIZE
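The thresholds make the policy easy to read off: payloads over 10 KB stream with 2 KB chunks, and payloads over 1 MB stream with 4 KB chunks. A quick worked check against the code above:

```python
from kreuzberg._utils._html_streaming import should_use_streaming

print(should_use_streaming(4 * 1024))         # (False, 2048): under the 10 KB threshold
print(should_use_streaming(200 * 1024))       # (True, 2048): streaming, default chunk size
print(should_use_streaming(2 * 1024 * 1024))  # (True, 4096): large file, bigger chunks
print(should_use_streaming(-1))               # (False, 2048): negative sizes never stream
```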
kreuzberg/_utils/_serialization.py
CHANGED
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
 from dataclasses import is_dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar
 
 import msgspec
 from msgspec import MsgspecError
-from msgspec.msgpack import decode, encode
 
 T = TypeVar("T")
 
@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
     raise TypeError(f"Unsupported type: {type(obj)!r}")
 
 
-def deserialize(value: str | bytes, target_type: type[T]) -> T:
+def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
+    decoder = msgspec.json.decode if json else msgspec.msgpack.decode
+
+    if json:
+        data = value.encode() if isinstance(value, str) else value
+    else:
+        data = value.encode() if isinstance(value, str) else value
+
     try:
-        return
+        return decoder(data, type=target_type, strict=False)
     except MsgspecError as e:
         raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
 
 
-def serialize(value: Any, **kwargs: Any) -> bytes:
+def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
 
+    encoder = msgspec.json.encode if json else msgspec.msgpack.encode
     try:
-        return
+        return encoder(value, enc_hook=encode_hook)
     except (MsgspecError, TypeError) as e:
         raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e