PyPI - kreuzberg - Versions diffs - 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl - Mend

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +0 -124
kreuzberg/_document_classification.py +20 -39
kreuzberg/_entity_extraction.py +0 -29
kreuzberg/_extractors/_base.py +4 -66
kreuzberg/_extractors/_email.py +0 -4
kreuzberg/_extractors/_image.py +0 -2
kreuzberg/_extractors/_pandoc.py +0 -58
kreuzberg/_extractors/_pdf.py +0 -3
kreuzberg/_extractors/_presentation.py +0 -82
kreuzberg/_extractors/_spread_sheet.py +0 -2
kreuzberg/_gmft.py +0 -61
kreuzberg/_language_detection.py +0 -14
kreuzberg/_mime_types.py +0 -17
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +110 -85
kreuzberg/_ocr/_paddleocr.py +146 -138
kreuzberg/_ocr/_table_extractor.py +0 -76
kreuzberg/_ocr/_tesseract.py +0 -206
kreuzberg/_playa.py +0 -27
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +16 -119
kreuzberg/_utils/_cache.py +0 -52
kreuzberg/_utils/_device.py +0 -56
kreuzberg/_utils/_document_cache.py +0 -73
kreuzberg/_utils/_errors.py +0 -47
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -14
kreuzberg/_utils/_process_pool.py +0 -47
kreuzberg/_utils/_quality.py +0 -17
kreuzberg/_utils/_ref.py +0 -16
kreuzberg/_utils/_serialization.py +0 -25
kreuzberg/_utils/_string.py +0 -20
kreuzberg/_utils/_sync.py +0 -76
kreuzberg/_utils/_table.py +0 -45
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +2 -2
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
kreuzberg-3.13.2.dist-info/RECORD +57 -0
kreuzberg-3.13.0.dist-info/RECORD +0 -56
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_chunker.py CHANGED Viewed

@@ -17,21 +17,6 @@ def get_chunker(
     max_characters: int = DEFAULT_MAX_CHARACTERS,
     overlap_characters: int = DEFAULT_MAX_OVERLAP,
 ) -> MarkdownSplitter | TextSplitter:
-    """Creates and returns a Chunker object configured with the given maximum
-    characters per chunk and overlap between chunks.
-    Args:
-        mime_type: The mime type of the content.
-        max_characters: Maximum number of characters allowed in each chunk.
-        overlap_characters: Number of characters overlapping between two consecutive chunks.
-    Raises:
-        MissingDependencyError: if semantic-text-splitter is not installed.
-    Returns:
-        Chunker: A Chunker object configured with the specified maximum
-            characters and overlap.
-    """
     key = (max_characters, overlap_characters, mime_type)
     if key not in _chunkers:
         try:

kreuzberg/_config.py CHANGED Viewed

@@ -148,17 +148,6 @@ def _create_ocr_config(
 def load_config_from_file(config_path: Path) -> dict[str, Any]:
-    """Load configuration from a TOML file.
-    Args:
-        config_path: Path to the configuration file.
-    Returns:
-        Dictionary containing the loaded configuration.
-    Raises:
-        ValidationError: If the file cannot be read or parsed.
-    """
     try:
         with config_path.open("rb") as f:
             data = tomllib.load(f)
@@ -177,15 +166,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
-    """Merge two configuration dictionaries recursively.
-    Args:
-        base: Base configuration dictionary.
-        override: Configuration dictionary to override base values.
-    Returns:
-        Merged configuration dictionary.
-    """
     result = base.copy()
     for key, value in override.items():
         if isinstance(value, dict) and key in result and isinstance(result[key], dict):
@@ -198,18 +178,6 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
 def parse_ocr_backend_config(
     config_dict: dict[str, Any], backend: OcrBackendType
 ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
-    """Parse OCR backend-specific configuration.
-    Args:
-        config_dict: Configuration dictionary.
-        backend: The OCR backend type.
-    Returns:
-        Backend-specific configuration object or None.
-    Raises:
-        ValidationError: If the backend configuration is invalid.
-    """
     if backend not in config_dict:
         return None
@@ -230,17 +198,6 @@ def parse_ocr_backend_config(
 def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
-    """Build ExtractionConfig from a configuration dictionary.
-    Args:
-        config_dict: Configuration dictionary from TOML file.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the configuration is invalid.
-    """
     extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
     ocr_backend = extraction_config.get("ocr_backend")
@@ -288,18 +245,6 @@ def build_extraction_config(
     file_config: dict[str, Any],
     cli_args: MutableMapping[str, Any],
 ) -> ExtractionConfig:
-    """Build ExtractionConfig from file config and CLI arguments.
-    Args:
-        file_config: Configuration loaded from file.
-        cli_args: CLI arguments.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the combined configuration is invalid.
-    """
     config_dict: dict[str, Any] = {}
     _merge_file_config(config_dict, file_config)
@@ -321,21 +266,6 @@ def build_extraction_config(
 def find_config_file(start_path: Path | None = None) -> Path | None:
-    """Find configuration file by searching up the directory tree.
-    Searches for configuration files in the following order:
-    1. kreuzberg.toml
-    2. pyproject.toml (with [tool.kreuzberg] section)
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        Path to the configuration file or None if not found.
-    Raises:
-        ValidationError: If a config file exists but cannot be read or has invalid TOML.
-    """
     current = start_path or Path.cwd()
     while current != current.parent:
@@ -366,17 +296,6 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
 def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
-    """Load the default configuration from discovered config file.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance or None if no configuration found.
-    Raises:
-        ValidationError: If configuration file exists but contains invalid configuration.
-    """
     config_path = find_config_file(start_path)
     if not config_path:
         return None
@@ -388,34 +307,12 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
 def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
-    """Load configuration from a specific file path.
-    Args:
-        config_path: Path to the configuration file.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If the file cannot be read, parsed, or is invalid.
-    """
     path = Path(config_path)
     config_dict = load_config_from_file(path)
     return build_extraction_config_from_dict(config_dict)
 def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
-    """Load configuration by discovering config files in the directory tree.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance.
-    Raises:
-        ValidationError: If no configuration file is found or if the file is invalid.
-    """
     search_path = Path(start_path) if start_path else None
     config_path = find_config_file(search_path)
@@ -436,19 +333,6 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
 def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
-    """Discover and load configuration, returning None if no config file found.
-    If a config file is found, attempts to load it. Any errors during loading will bubble up.
-    Args:
-        start_path: Directory to start searching from. Defaults to current working directory.
-    Returns:
-        ExtractionConfig instance or None if no configuration file found.
-    Raises:
-        ValidationError: If a configuration file exists but is invalid.
-    """
     search_path = Path(start_path) if start_path else None
     config_path = find_config_file(search_path)
@@ -462,12 +346,4 @@ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig |
 def find_default_config() -> Path | None:
-    """Find the default configuration file (pyproject.toml).
-    Returns:
-        Path to the configuration file or None if not found.
-    Note:
-        This function is deprecated. Use find_config_file() instead.
-    """
     return find_config_file()

kreuzberg/_document_classification.py CHANGED Viewed

@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
+import polars as pl
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
@@ -40,17 +42,6 @@ DOCUMENT_CLASSIFIERS = {
 def _get_translated_text(result: ExtractionResult) -> str:
-    """Translate extracted text to English using Google Translate API.
-    Args:
-        result: ExtractionResult containing the text to be translated
-    Returns:
-        str: The translated text in lowercase English
-    Raises:
-        MissingDependencyError: If the deep-translator package is not installed
-    """
     text_to_classify = result.content
     if result.metadata:
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
@@ -70,16 +61,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
-    """Classifies the document type based on keywords and patterns.
-    Args:
-        result: The extraction result containing the content.
-        config: The extraction configuration.
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
@@ -108,27 +89,17 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
 def classify_document_from_layout(
     result: ExtractionResult, config: ExtractionConfig
 ) -> tuple[str | None, float | None]:
-    """Classifies the document type based on layout information from OCR.
-    Args:
-        result: The extraction result containing the layout data.
-        config: The extraction configuration.
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
-    if result.layout is None or result.layout.empty:
+    if result.layout is None or result.layout.is_empty():
         return None, None
     layout_df = result.layout
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
-    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    layout_text = " ".join(layout_df["text"].cast(str).to_list())
     text_to_classify = layout_text
     if result.metadata:
@@ -142,17 +113,27 @@ def classify_document_from_layout(
     except Exception:  # noqa: BLE001
         translated_text = text_to_classify.lower()
-    layout_df["translated_text"] = translated_text
+    layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
-    page_height = layout_df["top"].max() + layout_df["height"].max()
+    try:
+        layout_df = layout_df.with_columns(
+            [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
+        )
+        page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
+        if page_height_val is None:
+            page_height_val = 0.0
+        page_height = float(page_height_val)
+    except Exception:  # noqa: BLE001
+        page_height = 1000.0
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
         for pattern in patterns:
-            found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
-            if not found_words.empty:
+            found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
+            if not found_words.is_empty():
                 scores[doc_type] += 1.0
-                word_top = found_words.iloc[0]["top"]
+                word_top = found_words[0, "top"]
                 if word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
@@ -176,7 +157,7 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
-    elif result.layout is not None and not result.layout.empty:
+    elif result.layout is not None and not result.layout.is_empty():
         result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)

kreuzberg/_entity_extraction.py CHANGED Viewed

@@ -19,21 +19,6 @@ def extract_entities(
     languages: list[str] | None = None,
     spacy_config: SpacyEntityExtractionConfig | None = None,
 ) -> list[Entity]:
-    """Extract entities from text using custom regex patterns and/or a NER model.
-    Args:
-        text: The input text to extract entities from.
-        entity_types: List of entity types to extract using the NER model.
-        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
-        languages: List of detected languages to choose appropriate spaCy models.
-        spacy_config: Configuration for spaCy entity extraction.
-    Returns:
-        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
-    Raises:
-        MissingDependencyError: If `spacy` is not installed.
-    """
     entities: list[Entity] = []
     if custom_patterns:
         for ent_type, pattern in custom_patterns:
@@ -85,7 +70,6 @@ def extract_entities(
 @lru_cache(maxsize=32)
 def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
-    """Load a spaCy model with caching."""
     try:
         import spacy  # noqa: PLC0415
@@ -102,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
 def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
-    """Select the best spaCy model based on detected languages."""
     if not languages:
         return spacy_config.get_model_for_language("en")
@@ -118,18 +101,6 @@ def extract_keywords(
     text: str,
     keyword_count: int = 10,
 ) -> list[tuple[str, float]]:
-    """Extract keywords from text using the KeyBERT model.
-    Args:
-        text: The input text to extract keywords from.
-        keyword_count: Number of top keywords to return. Defaults to 10.
-    Returns:
-        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
-    Raises:
-        MissingDependencyError: If `keybert` is not installed.
-    """
     try:
         from keybert import KeyBERT  # noqa: PLC0415

kreuzberg/_extractors/_base.py CHANGED Viewed

@@ -13,20 +13,6 @@ if TYPE_CHECKING:
 class Extractor(ABC):
-    """Abstract base class for file content extraction.
-    This class provides the interface for different types of content extractors.
-    Subclasses are expected to implement the methods for extracting content
-    either asynchronously or synchronously and determining the supported MIME types.
-    Attributes:
-        SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
-    Args:
-        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
-        config: Configuration options for the extraction process.
-    """
     __slots__ = ("config", "mime_type")
     SUPPORTED_MIME_TYPES: ClassVar[set[str]]
@@ -36,72 +22,24 @@ class Extractor(ABC):
         self.config = config
     @abstractmethod
-    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from a byte stream.
-        Args:
-            content: The byte content to extract.
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
     @abstractmethod
-    async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a file located at the specified path.
-        Args:
-            path: The path to the file to process.
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_path_async(self, path: Path) -> ExtractionResult: ...
     @abstractmethod
-    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from a byte stream.
-        Args:
-            content: The byte content to extract.
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
     @abstractmethod
-    def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a file located at the specified path.
-        Args:
-            path: The path to the file to process.
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_path_sync(self, path: Path) -> ExtractionResult: ...
     @classmethod
     def supports_mimetype(cls, mime_type: str) -> bool:
-        """Verify whether the extractor supports the given MIME type.
-        Args:
-            mime_type: The MIME type to check (e.g., "application/pdf").
-        Returns:
-            bool: True if the MIME type is supported, False otherwise.
-        """
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
     def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
-        """Apply quality post-processing to extraction result if enabled.
-        Args:
-            result: The raw extraction result
-        Returns:
-            Enhanced extraction result with quality improvements (if enabled)
-        """
         if not self.config.enable_quality_processing:
             return result

kreuzberg/_extractors/_email.py CHANGED Viewed

@@ -42,7 +42,6 @@ class EmailExtractor(Extractor):
     def _extract_email_headers(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email headers."""
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -85,7 +84,6 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
     def _format_email_field(self, field: Any) -> str:
-        """Format email field (to, cc, bcc) for display."""
         if isinstance(field, list):
             emails = []
             for item in field:
@@ -101,7 +99,6 @@ class EmailExtractor(Extractor):
         return str(field)
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
-        """Extract and process email body content."""
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
@@ -123,7 +120,6 @@ class EmailExtractor(Extractor):
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email attachments info."""
         if parsed_email.get("attachments"):
             attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
             metadata["attachments"] = attachment_names

kreuzberg/_extractors/_image.py CHANGED Viewed

@@ -61,7 +61,6 @@ class ImageExtractor(Extractor):
         return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
@@ -75,7 +74,6 @@ class ImageExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -84,8 +84,6 @@ NodeType = Literal[
 class PandocExtractor(Extractor):
-    """Extractor for documents supported by Pandoc."""
     _checked_version: bool = False
     MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
     }
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Extract text and metadata from bytes content using Pandoc.
-        Args:
-            content: The content bytes to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         input_file, unlink = await create_temp_file(f".{extension}")
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
             await unlink()
     async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Extract text and metadata from a file using Pandoc.
-        Args:
-            path: The path to the file to process.
-        Raises:
-            ParsingError: If the file data could not be extracted.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         await self._validate_pandoc_version()
         self._get_pandoc_type_from_mime_type(self.mime_type)
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes.
-        Args:
-            content: The content bytes to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path.
-        Args:
-            path: The path to the file to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        Raises:
-            ParsingError: When file processing fails.
-        """
         self._validate_pandoc_version_sync()
         self._get_pandoc_type_from_mime_type(self.mime_type)
@@ -612,8 +572,6 @@ class PandocExtractor(Extractor):
 class MarkdownExtractor(PandocExtractor):
-    """Extractor for Markdown-based document formats."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/x-markdown",
         "text/x-commonmark",
@@ -625,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
 class OfficeDocumentExtractor(PandocExtractor):
-    """Extractor for Office document formats (Word, ODT)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         "application/vnd.oasis.opendocument.text",
@@ -634,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
 class EbookExtractor(PandocExtractor):
-    """Extractor for e-book formats (EPUB, FB2)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/epub+zip",
         "application/x-fictionbook+xml",
@@ -643,8 +597,6 @@ class EbookExtractor(PandocExtractor):
 class StructuredTextExtractor(PandocExtractor):
-    """Extractor for structured text formats (RST, Org, etc.)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/x-rst",
         "text/x-org",
@@ -654,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
 class LaTeXExtractor(PandocExtractor):
-    """Extractor for LaTeX and Typst documents."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/x-latex",
         "application/x-typst",
@@ -663,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
 class BibliographyExtractor(PandocExtractor):
-    """Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/x-bibtex",
         "application/x-biblatex",
@@ -675,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
 class XMLBasedExtractor(PandocExtractor):
-    """Extractor for XML-based document formats (DocBook, JATS, OPML)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/docbook+xml",
         "application/x-jats+xml",
@@ -685,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
 class TabularDataExtractor(PandocExtractor):
-    """Extractor for tabular data formats (CSV, TSV)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/csv",
         "text/tab-separated-values",
@@ -694,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
 class MiscFormatExtractor(PandocExtractor):
-    """Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/rtf",
         "text/troff",

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl