PyPI - kreuzberg - Versions diffs - 3.13.0__tar.gz → 3.13.2__tar.gz - Mend

kreuzberg 3.13.0__tar.gz → 3.13.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233) hide show

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.13.0
+Version: 3.13.2
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -34,11 +34,12 @@ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
 Requires-Dist: html-to-markdown[lxml]>=1.9.1
 Requires-Dist: mcp>=1.13.0
 Requires-Dist: msgspec>=0.18.0
+Requires-Dist: numpy>=1.24.0
 Requires-Dist: playa-pdf>=0.7.0
 Requires-Dist: polars>=1.33.0
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
-Requires-Dist: python-calamine>=0.3.2
+Requires-Dist: python-calamine>=0.5.2
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions

kreuzberg-3.13.2/docker-logs/docker-info.txt ADDED Viewed

@@ -0,0 +1,60 @@
+Client: Docker Engine - Community
+ Version:    28.0.4
+ Context:    default
+ Debug Mode: false
+ Plugins:
+  buildx: Docker Buildx (Docker Inc.)
+    Version:  v0.27.0
+    Path:     /usr/libexec/docker/cli-plugins/docker-buildx
+  compose: Docker Compose (Docker Inc.)
+    Version:  v2.38.2
+    Path:     /usr/libexec/docker/cli-plugins/docker-compose
+Server:
+ Containers: 1
+  Running: 1
+  Paused: 0
+  Stopped: 0
+ Images: 2
+ Server Version: 28.0.4
+ Storage Driver: overlay2
+  Backing Filesystem: extfs
+  Supports d_type: true
+  Using metacopy: false
+  Native Overlay Diff: false
+  userxattr: false
+ Logging Driver: json-file
+ Cgroup Driver: systemd
+ Cgroup Version: 2
+ Plugins:
+  Volume: local
+  Network: bridge host ipvlan macvlan null overlay
+  Log: awslogs fluentd gcplogs gelf journald json-file local splunk syslog
+ Swarm: inactive
+ Runtimes: io.containerd.runc.v2 runc
+ Default Runtime: runc
+ Init Binary: docker-init
+ containerd version: 05044ec0a9a75232cad458027ca83437aae3f4da
+ runc version: v1.2.5-0-g59923ef
+ init version: de40ad0
+ Security Options:
+  apparmor
+  seccomp
+   Profile: builtin
+  cgroupns
+ Kernel Version: 6.11.0-1018-azure
+ Operating System: Ubuntu 24.04.3 LTS
+ OSType: linux
+ Architecture: x86_64
+ CPUs: 4
+ Total Memory: 15.62GiB
+ Name: pkrvm7jw40e0xgp
+ ID: 33a18c03-7dc8-4ab9-bfe1-99342b7c1aaf
+ Docker Root Dir: /var/lib/docker
+ Debug Mode: false
+ Username: githubactions
+ Experimental: false
+ Insecure Registries:
+  ::1/128
+  127.0.0.0/8
+ Live Restore Enabled: false

kreuzberg-3.13.2/docker-logs/docker-version.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Client: Docker Engine - Community
+ Version:           28.0.4
+ API version:       1.48
+ Go version:        go1.23.7
+ Git commit:        b8034c0
+ Built:             Tue Mar 25 15:07:16 2025
+ OS/Arch:           linux/amd64
+ Context:           default
+Server: Docker Engine - Community
+ Engine:
+  Version:          28.0.4
+  API version:      1.48 (minimum version 1.24)
+  Go version:       go1.23.7
+  Git commit:       6430e49
+  Built:            Tue Mar 25 15:07:16 2025
+  OS/Arch:          linux/amd64
+  Experimental:     false
+ containerd:
+  Version:          1.7.27
+  GitCommit:        05044ec0a9a75232cad458027ca83437aae3f4da
+ runc:
+  Version:          1.2.5
+  GitCommit:        v1.2.5-0-g59923ef
+ docker-init:
+  Version:          0.19.0
+  GitCommit:        de40ad0

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_chunker.py RENAMED Viewed

@@ -17,21 +17,6 @@ def get_chunker(
     max_characters: int = DEFAULT_MAX_CHARACTERS,
     overlap_characters: int = DEFAULT_MAX_OVERLAP,
 ) -> MarkdownSplitter | TextSplitter:
-    """Creates and returns a Chunker object configured with the given maximum
-    characters per chunk and overlap between chunks.
-    Args:
-        mime_type: The mime type of the content.
-        max_characters: Maximum number of characters allowed in each chunk.
-        overlap_characters: Number of characters overlapping between two consecutive chunks.
-    Raises:
-        MissingDependencyError: if semantic-text-splitter is not installed.
-    Returns:
-        Chunker: A Chunker object configured with the specified maximum
-            characters and overlap.
-    """
     key = (max_characters, overlap_characters, mime_type)
     if key not in _chunkers:
         try:

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_config.py RENAMED Viewed

@@ -148,17 +148,6 @@ def _create_ocr_config(
 def load_config_from_file(config_path: Path) -> dict[str, Any]:
-    """Load configuration from a TOML file.
-    Args:
-        config_path: Path to the configuration file.
-    Returns:
-        Dictionary containing the loaded configuration.
-    Raises:
-        ValidationError: If the file cannot be read or parsed.
-    """
     try:
         with config_path.open("rb") as f:
             data = tomllib.load(f)
@@ -177,15 +166,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
-    """Merge two configuration dictionaries recursively.
-    Args:
-        base: Base configuration dictionary.
-        override: Configuration dictionary to override base values.
-    Returns:
-        Merged configuration dictionary.
-    """
     result = base.copy()
     for key, value in override.items():
         if isinstance(value, dict) and key in result and isinstance(result[key], dict):
@@ -198,18 +178,6 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
 def parse_ocr_backend_config(
     config_dict: dict[str, Any], backend: OcrBackendType
 ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
-    """Parse OCR backend-specific configuration.
-    Args:
-        config_dict: Configuration dictionary.
-        backend: The OCR backend type.
-    Returns:
-        Backend-specific configuration object or None.
-    Raises:
-        ValidationError: If the backend configuration is invalid.
-    """
     if backend not in config_dict:
         return None
@@ -230,17 +198,6 @@ def parse_ocr_backend_config(
 def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
-    """Build ExtractionConfig from a configuration dictionary.
-    Args:
-        config_dict: Configuration dictionary from TOML file.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the configuration is invalid.
-    """
     extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
     ocr_backend = extraction_config.get("ocr_backend")
@@ -288,18 +245,6 @@ def build_extraction_config(
     file_config: dict[str, Any],
     cli_args: MutableMapping[str, Any],
 ) -> ExtractionConfig:
-    """Build ExtractionConfig from file config and CLI arguments.
-    Args:
-        file_config: Configuration loaded from file.
-        cli_args: CLI arguments.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the combined configuration is invalid.
-    """
     config_dict: dict[str, Any] = {}
     _merge_file_config(config_dict, file_config)
@@ -321,21 +266,6 @@ def build_extraction_config(
 def find_config_file(start_path: Path | None = None) -> Path | None:
-    """Find configuration file by searching up the directory tree.
-    Searches for configuration files in the following order:
-    1. kreuzberg.toml
-    2. pyproject.toml (with [tool.kreuzberg] section)
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        Path to the configuration file or None if not found.
-    Raises:
-        ValidationError: If a config file exists but cannot be read or has invalid TOML.
-    """
     current = start_path or Path.cwd()
     while current != current.parent:
@@ -366,17 +296,6 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
 def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
-    """Load the default configuration from discovered config file.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance or None if no configuration found.
-    Raises:
-        ValidationError: If configuration file exists but contains invalid configuration.
-    """
     config_path = find_config_file(start_path)
     if not config_path:
         return None
@@ -388,34 +307,12 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
 def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
-    """Load configuration from a specific file path.
-    Args:
-        config_path: Path to the configuration file.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the file cannot be read, parsed, or is invalid.
-    """
     path = Path(config_path)
     config_dict = load_config_from_file(path)
     return build_extraction_config_from_dict(config_dict)
 def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
-    """Load configuration by discovering config files in the directory tree.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If no configuration file is found or if the file is invalid.
-    """
     search_path = Path(start_path) if start_path else None
     config_path = find_config_file(search_path)
@@ -436,19 +333,6 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
 def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
-    """Discover and load configuration, returning None if no config file found.
-    If a config file is found, attempts to load it. Any errors during loading will bubble up.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance or None if no configuration file found.
-    Raises:
-        ValidationError: If a configuration file exists but is invalid.
-    """
     search_path = Path(start_path) if start_path else None
     config_path = find_config_file(search_path)
@@ -462,12 +346,4 @@ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig |
 def find_default_config() -> Path | None:
-    """Find the default configuration file (pyproject.toml).
-    Returns:
-        Path to the configuration file or None if not found.
-    Note:
-        This function is deprecated. Use find_config_file() instead.
-    """
     return find_config_file()

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_document_classification.py RENAMED Viewed

@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
+import polars as pl
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
@@ -40,17 +42,6 @@ DOCUMENT_CLASSIFIERS = {
 def _get_translated_text(result: ExtractionResult) -> str:
-    """Translate extracted text to English using Google Translate API.
-    Args:
-        result: ExtractionResult containing the text to be translated
-    Returns:
-        str: The translated text in lowercase English
-    Raises:
-        MissingDependencyError: If the deep-translator package is not installed
-    """
     text_to_classify = result.content
     if result.metadata:
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
@@ -70,16 +61,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
-    """Classifies the document type based on keywords and patterns.
-    Args:
-        result: The extraction result containing the content.
-        config: The extraction configuration.
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
@@ -108,27 +89,17 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
 def classify_document_from_layout(
     result: ExtractionResult, config: ExtractionConfig
 ) -> tuple[str | None, float | None]:
-    """Classifies the document type based on layout information from OCR.
-    Args:
-        result: The extraction result containing the layout data.
-        config: The extraction configuration.
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
-    if result.layout is None or result.layout.empty:
+    if result.layout is None or result.layout.is_empty():
         return None, None
     layout_df = result.layout
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
-    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    layout_text = " ".join(layout_df["text"].cast(str).to_list())
     text_to_classify = layout_text
     if result.metadata:
@@ -142,17 +113,27 @@ def classify_document_from_layout(
     except Exception:  # noqa: BLE001
         translated_text = text_to_classify.lower()
-    layout_df["translated_text"] = translated_text
+    layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
-    page_height = layout_df["top"].max() + layout_df["height"].max()
+    try:
+        layout_df = layout_df.with_columns(
+            [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
+        )
+        page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
+        if page_height_val is None:
+            page_height_val = 0.0
+        page_height = float(page_height_val)
+    except Exception:  # noqa: BLE001
+        page_height = 1000.0
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
         for pattern in patterns:
-            found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
-            if not found_words.empty:
+            found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
+            if not found_words.is_empty():
                 scores[doc_type] += 1.0
-                word_top = found_words.iloc[0]["top"]
+                word_top = found_words[0, "top"]
                 if word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
@@ -176,7 +157,7 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
-    elif result.layout is not None and not result.layout.empty:
+    elif result.layout is not None and not result.layout.is_empty():
         result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_entity_extraction.py RENAMED Viewed

@@ -19,21 +19,6 @@ def extract_entities(
     languages: list[str] | None = None,
     spacy_config: SpacyEntityExtractionConfig | None = None,
 ) -> list[Entity]:
-    """Extract entities from text using custom regex patterns and/or a NER model.
-    Args:
-        text: The input text to extract entities from.
-        entity_types: List of entity types to extract using the NER model.
-        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
-        languages: List of detected languages to choose appropriate spaCy models.
-        spacy_config: Configuration for spaCy entity extraction.
-    Returns:
-        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
-    Raises:
-        MissingDependencyError: If `spacy` is not installed.
-    """
     entities: list[Entity] = []
     if custom_patterns:
         for ent_type, pattern in custom_patterns:
@@ -85,7 +70,6 @@ def extract_entities(
 @lru_cache(maxsize=32)
 def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
-    """Load a spaCy model with caching."""
     try:
         import spacy  # noqa: PLC0415
@@ -102,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
 def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
-    """Select the best spaCy model based on detected languages."""
     if not languages:
         return spacy_config.get_model_for_language("en")
@@ -118,18 +101,6 @@ def extract_keywords(
     text: str,
     keyword_count: int = 10,
 ) -> list[tuple[str, float]]:
-    """Extract keywords from text using the KeyBERT model.
-    Args:
-        text: The input text to extract keywords from.
-        keyword_count: Number of top keywords to return. Defaults to 10.
-    Returns:
-        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
-    Raises:
-        MissingDependencyError: If `keybert` is not installed.
-    """
     try:
         from keybert import KeyBERT  # noqa: PLC0415

kreuzberg-3.13.2/kreuzberg/_extractors/_base.py ADDED Viewed

@@ -0,0 +1,62 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, ClassVar
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+if TYPE_CHECKING:
+    from pathlib import Path
+    from kreuzberg._types import ExtractionConfig
+class Extractor(ABC):
+    __slots__ = ("config", "mime_type")
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]]
+    def __init__(self, mime_type: str, config: ExtractionConfig) -> None:
+        self.mime_type = mime_type
+        self.config = config
+    @abstractmethod
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
+    @abstractmethod
+    async def extract_path_async(self, path: Path) -> ExtractionResult: ...
+    @abstractmethod
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
+    @abstractmethod
+    def extract_path_sync(self, path: Path) -> ExtractionResult: ...
+    @classmethod
+    def supports_mimetype(cls, mime_type: str) -> bool:
+        return mime_type in cls.SUPPORTED_MIME_TYPES or any(
+            mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
+        )
+    def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
+        if not self.config.enable_quality_processing:
+            return result
+        if not result.content:
+            return result
+        cleaned_content = clean_extracted_text(result.content)
+        quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
+        enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
+        return ExtractionResult(
+            content=cleaned_content,
+            mime_type=result.mime_type,
+            metadata=normalize_metadata(enhanced_metadata),
+            chunks=result.chunks,
+            detected_languages=result.detected_languages,
+            tables=result.tables,
+        )

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_email.py RENAMED Viewed

@@ -42,7 +42,6 @@ class EmailExtractor(Extractor):
     def _extract_email_headers(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email headers."""
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -85,7 +84,6 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
     def _format_email_field(self, field: Any) -> str:
-        """Format email field (to, cc, bcc) for display."""
         if isinstance(field, list):
             emails = []
             for item in field:
@@ -101,7 +99,6 @@ class EmailExtractor(Extractor):
         return str(field)
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
-        """Extract and process email body content."""
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
@@ -123,7 +120,6 @@ class EmailExtractor(Extractor):
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email attachments info."""
         if parsed_email.get("attachments"):
             attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
             metadata["attachments"] = attachment_names

{kreuzberg-3.13.0 → kreuzberg-3.13.2}/kreuzberg/_extractors/_image.py RENAMED Viewed

@@ -61,7 +61,6 @@ class ImageExtractor(Extractor):
         return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
@@ -75,7 +74,6 @@ class ImageExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")

kreuzberg 3.13.0__tar.gz → 3.13.2__tar.gz

kreuzberg 3.13.0__tar.gz → 3.13.2__tar.gz