kreuzberg 3.11.3__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +55 -77
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.3.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_entity_extraction.py
@@ -2,105 +2,14 @@ from __future__ import annotations
 
 import os
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
-from kreuzberg._types import Entity
+from kreuzberg._types import Entity, SpacyEntityExtractionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from pathlib import Path
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class SpacyEntityExtractionConfig:
-    """Configuration for spaCy-based entity extraction."""
-
-    model_cache_dir: str | Path | None = None
-    """Directory to cache spaCy models. If None, uses spaCy's default."""
-
-    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
-    """Mapping of language codes to spaCy model names.
-
-    If None, uses default mappings:
-    - en: en_core_web_sm
-    - de: de_core_news_sm
-    - fr: fr_core_news_sm
-    - es: es_core_news_sm
-    - pt: pt_core_news_sm
-    - it: it_core_news_sm
-    - nl: nl_core_news_sm
-    - zh: zh_core_web_sm
-    - ja: ja_core_news_sm
-    """
-
-    fallback_to_multilingual: bool = True
-    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
-
-    max_doc_length: int = 1000000
-    """Maximum document length for spaCy processing."""
-
-    batch_size: int = 1000
-    """Batch size for processing multiple texts."""
-
-    def __post_init__(self) -> None:
-        if self.language_models is None:
-            object.__setattr__(self, "language_models", self._get_default_language_models())
-
-        if isinstance(self.language_models, dict):
-            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
-
-    @staticmethod
-    def _get_default_language_models() -> dict[str, str]:
-        """Get default language model mappings based on available spaCy models."""
-        return {
-            "en": "en_core_web_sm",
-            "de": "de_core_news_sm",
-            "fr": "fr_core_news_sm",
-            "es": "es_core_news_sm",
-            "pt": "pt_core_news_sm",
-            "it": "it_core_news_sm",
-            "nl": "nl_core_news_sm",
-            "zh": "zh_core_web_sm",
-            "ja": "ja_core_news_sm",
-            "ko": "ko_core_news_sm",
-            "ru": "ru_core_news_sm",
-            "pl": "pl_core_news_sm",
-            "ro": "ro_core_news_sm",
-            "el": "el_core_news_sm",
-            "da": "da_core_news_sm",
-            "fi": "fi_core_news_sm",
-            "nb": "nb_core_news_sm",
-            "sv": "sv_core_news_sm",
-            "ca": "ca_core_news_sm",
-            "hr": "hr_core_news_sm",
-            "lt": "lt_core_news_sm",
-            "mk": "mk_core_news_sm",
-            "sl": "sl_core_news_sm",
-            "uk": "uk_core_news_sm",
-        }
-
-    def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
-        if not self.language_models:
-            return None
-
-        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
-
-        if language_code in models_dict:
-            return models_dict[language_code]
-
-        base_lang = language_code.split("-")[0].lower()
-        if base_lang in models_dict:
-            return models_dict[base_lang]
-
-        return None
-
-    def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
-        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
 def extract_entities(
@@ -127,7 +36,6 @@ def extract_entities(
     """
     entities: list[Entity] = []
    if custom_patterns:
-        # Direct iteration over frozenset - no need to convert to dict
        for ent_type, pattern in custom_patterns:
            entities.extend(
                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
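
Note: SpacyEntityExtractionConfig is not deleted in this release; it moves out of this module into kreuzberg._types and is re-imported above. Assuming the relocated class keeps the API shown in the removed lines, its lookup behavior works like this:

    from kreuzberg._types import SpacyEntityExtractionConfig

    cfg = SpacyEntityExtractionConfig()   # defaults filled in by __post_init__
    cfg.get_model_for_language("en")      # "en_core_web_sm" (exact match)
    cfg.get_model_for_language("pt-BR")   # "pt_core_news_sm" (base-language fallback)
    cfg.get_model_for_language("tlh")     # None (no mapping)
    cfg.get_fallback_model()              # "xx_ent_wiki_sm" (fallback_to_multilingual=True)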

kreuzberg/_extractors/_base.py
@@ -102,23 +102,18 @@ class Extractor(ABC):
         Returns:
             Enhanced extraction result with quality improvements (if enabled)
         """
-        # Only apply quality processing if enabled in config
         if not self.config.enable_quality_processing:
             return result
 
         if not result.content:
             return result
 
-        # Clean the content
         cleaned_content = clean_extracted_text(result.content)
 
-        # Calculate quality score
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
-        # Add quality metadata
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
-        # Return enhanced result
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
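
Note: the metadata merge above relies on dict union being right-biased, so a freshly computed quality_score always replaces any stale value carried in the incoming metadata. For example:

    metadata = {"title": "Report", "quality_score": 0.2}
    enhanced = dict(metadata) | {"quality_score": 0.87}
    # {'title': 'Report', 'quality_score': 0.87}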

kreuzberg/_extractors/_email.py
@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
 
-# Import optional dependencies at module level with proper error handling
 try:
     import mailparse
 except ImportError:  # pragma: no cover
@@ -27,7 +26,6 @@ try:
 except ImportError:  # pragma: no cover
     html2text = None
 
-# Compile regex pattern once at module level
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
 
 
@@ -45,7 +43,6 @@ class EmailExtractor(Extractor):
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
         """Extract and process email headers."""
-        # Use single dict access where possible to avoid repeated lookups
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -59,9 +56,7 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
-            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
-                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
                 metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
@@ -69,7 +64,6 @@ class EmailExtractor(Extractor):
             else:
                 metadata["email_to"] = str(to_info)
 
-            # For display, format all recipients
             to_formatted = self._format_email_field(to_info)
             text_parts.append(f"To: {to_formatted}")
 
@@ -111,19 +105,17 @@ class EmailExtractor(Extractor):
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
-            return  # If we have text, prefer it over HTML
+            return
 
         html_content = parsed_email.get("html")
         if html_content:
             if html2text is not None:
-                # Use html2text if available (faster path)
                 h = html2text.HTML2Text()
                 h.ignore_links = True
                 h.ignore_images = True
                 converted_text = h.handle(html_content)
                 text_parts.append(f"\n{converted_text}")
             else:
-                # Fallback: strip HTML tags and unescape entities
                 clean_html = _HTML_TAG_PATTERN.sub("", html_content)
                 clean_html = unescape(clean_html)
                 text_parts.append(f"\n{clean_html}")
@@ -148,12 +140,10 @@ class EmailExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}
 
-        # Extract headers, body, and attachments
         self._extract_email_headers(parsed_email, text_parts, metadata)
         self._extract_email_body(parsed_email, text_parts)
         self._extract_email_attachments(parsed_email, text_parts, metadata)
 
-        # Join efficiently
         combined_text = "\n".join(text_parts)
 
         return ExtractionResult(
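
Note: the else branch in _extract_email_body is the degraded path for installs without html2text: strip tags with the module-level regex, then unescape entities. In isolation (the sample string is illustrative):

    import re
    from html import unescape

    _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")

    html_content = "<p>Status: 5 &gt; 3 &amp; 2 &lt; 4</p>"
    clean_html = unescape(_HTML_TAG_PATTERN.sub("", html_content))
    # "Status: 5 > 3 & 2 < 4"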

kreuzberg/_extractors/_html.py
@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        # Use html-to-markdown with script/nav removal for better quality
-        result = html_to_markdown.convert_to_markdown(
-            safe_decode(content),
-            preprocess_html=True,
-            preprocessing_preset="aggressive",
-            remove_navigation=True,
-            remove_forms=True,
-        )
-
-        # Skip normalize_spaces since quality processing will handle whitespace
+        config = self.config.html_to_markdown_config if self.config else None
+        if config is None:
+            config = HTMLToMarkdownConfig()
+
+        config_dict = config.to_dict()
+
+        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
 
-        # Apply quality processing which includes normalization
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
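
Note: HTMLToMarkdownConfig is defined in kreuzberg._types and its fields are not shown in this diff. Assuming they mirror the keyword arguments the old code hardcoded (the field names below are an assumption, not confirmed here), the new flow is equivalent to:

    # Hypothetical field names mirroring the removed kwargs;
    # to_dict() expands the config back into keyword arguments.
    config = HTMLToMarkdownConfig(
        preprocess_html=True,
        preprocessing_preset="aggressive",
        remove_navigation=True,
        remove_forms=True,
    )
    result = html_to_markdown.convert_to_markdown(safe_decode(content), **config.to_dict())

The practical change is that callers can now override these options per extraction instead of always getting the hardcoded "aggressive" preset.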

kreuzberg/_extractors/_image.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
-from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -84,25 +80,7 @@ class ImageExtractor(Extractor):
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
         backend = get_ocr_backend(self.config.ocr_backend)
-
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        result = backend.process_file_sync(path, **self.config.get_config_dict())
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
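
Note: get_config_dict() is defined on the extraction config in kreuzberg._types; its body is not part of this hunk. A plausible sketch of what it centralizes, reconstructed from the removed match/case (an assumption, not the actual implementation):

    from dataclasses import asdict
    from typing import Any

    from kreuzberg._ocr._easyocr import EasyOCRConfig
    from kreuzberg._ocr._paddleocr import PaddleOCRConfig
    from kreuzberg._ocr._tesseract import TesseractConfig

    def get_config_dict(self) -> dict[str, Any]:
        # Use the caller-supplied config when its type matches the backend,
        # otherwise fall back to that backend's defaults.
        default_configs = {
            "tesseract": TesseractConfig,
            "paddleocr": PaddleOCRConfig,
            "easyocr": EasyOCRConfig,
        }
        config_cls = default_configs[self.ocr_backend]
        config = self.ocr_config if isinstance(self.ocr_config, config_cls) else config_cls()
        return asdict(config)

Either way, the extractor itself no longer needs per-backend imports or branching.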

kreuzberg/_extractors/_pandoc.py
@@ -244,18 +244,13 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
 
     async def _validate_pandoc_version(self) -> None:
-        """Validate that the installed Pandoc version meets the minimum requirement.
-
-        Raises:
-            MissingDependencyError: If Pandoc is not installed or version is too low
-        """
         try:
             if self._checked_version:
                 return
 
             command = ["pandoc", "--version"]
             result = await run_process(command)
-            stdout = result.stdout.decode()
+            stdout = result.stdout.decode("utf-8")
 
             version_match = re.search(
                 r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
@@ -299,14 +294,6 @@ class PandocExtractor(Extractor):
 
     @staticmethod
     def _get_pandoc_key(key: str) -> str | None:
-        """Map Pandoc metadata keys to our standard metadata keys.
-
-        Args:
-            key: The key from Pandoc metadata
-
-        Returns:
-            The mapped key name for our system, or None if not mapped
-        """
         if key == "abstract":
             return "summary"
 
@@ -325,17 +312,6 @@ class PandocExtractor(Extractor):
         return key
 
     def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
-        """Get Pandoc format type from MIME type.
-
-        Args:
-            mime_type: The MIME type to look up
-
-        Returns:
-            The corresponding Pandoc type
-
-        Raises:
-            ValidationError: If mime_type is not supported
-        """
         if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
             return pandoc_type
 
@@ -349,17 +325,6 @@ class PandocExtractor(Extractor):
         raise ValidationError(f"Unsupported mime type: {mime_type}")
 
     async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
-        """Extract metadata from a file using Pandoc.
-
-        Args:
-            input_file: The file to extract metadata from
-
-        Returns:
-            The extracted metadata
-
-        Raises:
-            ParsingError: If metadata extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         metadata_file, unlink = await create_temp_file(".json")
         try:
@@ -389,17 +354,6 @@ class PandocExtractor(Extractor):
             await unlink()
 
     async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
-        """Extract text content from a file using Pandoc.
-
-        Args:
-            input_file: The file to extract content from
-
-        Returns:
-            The extracted text content
-
-        Raises:
-            ParsingError: If content extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         output_path, unlink = await create_temp_file(".md")
         try:
@@ -431,14 +385,6 @@ class PandocExtractor(Extractor):
             await unlink()
 
     def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
-        """Extract structured metadata from Pandoc JSON metadata.
-
-        Args:
-            raw_meta: The raw metadata from Pandoc
-
-        Returns:
-            Structured metadata
-        """
         meta: Metadata = {}
 
         if (
@@ -485,16 +431,6 @@ class PandocExtractor(Extractor):
         return meta
 
     def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
-        """Extract text from an inline node in a document structure.
-
-        Args:
-            node: The node to extract text from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         if node_type := node.get(type_field):
             if node_type == "Str":
                 return node.get(content_field)
@@ -505,29 +441,11 @@ class PandocExtractor(Extractor):
         return None
 
     def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
-        """Extract text from a list of inline nodes.
-
-        Args:
-            nodes: The list of nodes to extract text from
-
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         texts = [text for node in nodes if (text := self._extract_inline_text(node))]
         result = "".join(texts).strip()
         return result if result else None
 
     def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
-        """Extract a metadata value from a node.
-
-        Args:
-            node: The node to extract metadata from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-
-        Returns:
-            The extracted metadata value or None if no metadata could be extracted
-        """
         if not isinstance(node, dict) or type_field not in node:
             return None
 
@@ -577,12 +495,17 @@ class PandocExtractor(Extractor):
         return None
 
     def _validate_pandoc_version_sync(self) -> None:
-        """Synchronous version of _validate_pandoc_version."""
         try:
             if self._checked_version:
                 return
 
-            result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
+            result = subprocess.run(
+                ["pandoc", "--version"],  # noqa: S607
+                capture_output=True,
+                text=True,
+                check=False,
+                encoding="utf-8",
+            )
 
             if result.returncode != 0:
                 raise MissingDependencyError(
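
Note: the explicit encoding="utf-8" added to these subprocess.run calls (and the .decode("utf-8") in the async variant) pins how Pandoc's output is decoded. With text=True and no encoding, Python falls back to the locale's preferred encoding, which on Windows is often cp1252 and can mangle UTF-8 output:

    import locale

    # What text=True uses when no encoding is given:
    locale.getpreferredencoding(False)  # e.g. "cp1252" on many Windows setups, "UTF-8" on most Unix
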
@@ -621,7 +544,6 @@ class PandocExtractor(Extractor):
             ) from e
 
     def _extract_metadata_sync(self, path: Path) -> Metadata:
-        """Synchronous version of _handle_extract_metadata."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -638,7 +560,7 @@ class PandocExtractor(Extractor):
             str(metadata_file),
         ]
 
-        result = subprocess.run(command, capture_output=True, text=True, check=False)
+        result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -655,7 +577,6 @@ class PandocExtractor(Extractor):
             Path(metadata_file).unlink()
 
     def _extract_file_sync(self, path: Path) -> str:
-        """Synchronous version of _handle_extract_file."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
@@ -673,7 +594,7 @@ class PandocExtractor(Extractor):
             str(output_path),
         ]
 
-        result = subprocess.run(command, capture_output=True, text=True, check=False)
+        result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})