PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +212 -292
kreuzberg/_document_classification.py +20 -47
kreuzberg/_entity_extraction.py +1 -122
kreuzberg/_extractors/_base.py +4 -71
kreuzberg/_extractors/_email.py +1 -15
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -25
kreuzberg/_extractors/_pandoc.py +10 -147
kreuzberg/_extractors/_pdf.py +38 -94
kreuzberg/_extractors/_presentation.py +0 -99
kreuzberg/_extractors/_spread_sheet.py +13 -55
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -199
kreuzberg/_language_detection.py +1 -36
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -19
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +124 -186
kreuzberg/_ocr/_paddleocr.py +154 -224
kreuzberg/_ocr/_table_extractor.py +184 -0
kreuzberg/_ocr/_tesseract.py +797 -361
kreuzberg/_playa.py +5 -31
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +588 -93
kreuzberg/_utils/_cache.py +84 -138
kreuzberg/_utils/_device.py +0 -74
kreuzberg/_utils/_document_cache.py +0 -75
kreuzberg/_utils/_errors.py +0 -50
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -16
kreuzberg/_utils/_process_pool.py +17 -64
kreuzberg/_utils/_quality.py +0 -60
kreuzberg/_utils/_ref.py +32 -0
kreuzberg/_utils/_serialization.py +0 -30
kreuzberg/_utils/_string.py +9 -59
kreuzberg/_utils/_sync.py +0 -77
kreuzberg/_utils/_table.py +49 -101
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_presentation.py CHANGED Viewed

@@ -1,12 +1,3 @@
-"""This module provides functions to extract textual content from files.
-It includes vendored code:
-- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
-    See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
-    Refer to the markitdown repository for it's license (MIT).
-"""
 from __future__ import annotations
 import re
@@ -30,99 +21,27 @@ if TYPE_CHECKING:  # pragma: no cover
     from kreuzberg._types import Metadata
-# Pre-compiled regex patterns for performance
 _NON_WORD_PATTERN = re.compile(r"\W")
 class PresentationExtractor(Extractor):
-    """Extractor for PowerPoint (.pptx) files.
-    This extractor processes PowerPoint presentations and converts their content into Markdown format.
-    It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
-    of the presentation in a readable text format.
-    The extractor provides both synchronous and asynchronous methods for processing files either
-    from disk or from bytes in memory.
-    """
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from PowerPoint file bytes.
-        Args:
-            content: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         return self._extract_pptx(content)
     async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a PowerPoint file on disk.
-        Args:
-            path: Path to the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         content = await AsyncPath(path).read_bytes()
         return self._extract_pptx(content)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from PowerPoint file bytes.
-        Args:
-            content: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         return self._extract_pptx(content)
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a PowerPoint file on disk.
-        Args:
-            path: Path to the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         content = Path(path).read_bytes()
         return self._extract_pptx(content)
     def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
-        """Process PowerPoint file contents and convert to Markdown.
-        This method handles the core logic of extracting content from a PowerPoint file.
-        It processes:
-        - Slide titles and content
-        - Images (with alt text if available)
-        - Tables (converted to HTML format)
-        - Text frames
-        - Slide notes
-        Args:
-            file_contents: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        Notes:
-            The extraction preserves the following elements:
-            - Slide numbers (as HTML comments)
-            - Images (converted to Markdown image syntax with alt text)
-            - Tables (converted to HTML table syntax)
-            - Text content (with titles properly formatted)
-            - Slide notes (under a dedicated section for each slide)
-        """
         md_content = ""
         presentation = pptx.Presentation(BytesIO(file_contents))
@@ -191,33 +110,20 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
-        """Extract metadata from a presentation instance.
-        Args:
-            presentation: A `Presentation` object representing the PowerPoint file.
-        Returns:
-            PresentationMetadata: Object containing presentation-specific metadata fields.
-        """
         metadata: Metadata = {}
-        # Extract core properties
         PresentationExtractor._extract_core_properties(presentation, metadata)
-        # Extract fonts used in presentation
         fonts = PresentationExtractor._extract_fonts(presentation)
         if fonts:
             metadata["fonts"] = list(fonts)
-        # Add structural information
         PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
         return metadata
     @staticmethod
     def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
-        """Extract core document properties from presentation."""
-        # Property mapping for core metadata
         property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
@@ -236,7 +142,6 @@ class PresentationExtractor(Extractor):
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]
-        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]
@@ -245,7 +150,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_fonts(presentation: Presentation) -> set[str]:
-        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -260,12 +164,10 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
-        """Add structural information about the presentation."""
         slide_count = len(presentation.slides)
         if slide_count == 0:
             return
-        # Build description
         structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
         slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
@@ -274,7 +176,6 @@ class PresentationExtractor(Extractor):
         metadata["description"] = structure_info
-        # Build summary if not already present
         if "summary" not in metadata:
             summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
             if slides_with_notes > 0:

kreuzberg/_extractors/_spread_sheet.py CHANGED Viewed

@@ -10,15 +10,17 @@ from io import StringIO
 from pathlib import Path
 from typing import Any
+import polars as pl
 from anyio import Path as AsyncPath
 from PIL import Image
 from python_calamine import CalamineWorkbook
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
+from kreuzberg._utils._table import enhance_table_markdown
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
@@ -70,7 +72,6 @@ class SpreadSheetExtractor(Extractor):
             ) from e
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
         try:
@@ -83,7 +84,6 @@ class SpreadSheetExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         try:
             workbook = CalamineWorkbook.from_path(str(path))
             results = []
@@ -108,14 +108,6 @@ class SpreadSheetExtractor(Extractor):
     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        """Convert a cell value to string representation.
-        Args:
-            value: The cell value to convert.
-        Returns:
-            String representation of the cell value.
-        """
         if value is None:
             return ""
         if isinstance(value, bool):
@@ -139,7 +131,7 @@ class SpreadSheetExtractor(Extractor):
         csv_buffer.close()
         csv_path, unlink = await create_temp_file(".csv")
-        await AsyncPath(csv_path).write_text(csv_data)
+        await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
         csv_reader = csv.reader(StringIO(csv_data))
         rows = list(csv_reader)
@@ -162,7 +154,6 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
     def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Synchronous version of _convert_sheet_to_text."""
         values = workbook.get_sheet_by_name(sheet_name).to_python()
         csv_buffer = StringIO()
@@ -195,82 +186,57 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
     def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Enhanced sheet processing with better table structure preservation."""
         try:
-            # pandas is optional dependency
-            import pandas as pd  # noqa: PLC0415
-            from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
             sheet = workbook.get_sheet_by_name(sheet_name)
             data = sheet.to_python()
             if not data or not any(row for row in data):
                 return f"## {sheet_name}\n\n*Empty sheet*"
-            # Convert to DataFrame
-            df = pd.DataFrame(data)
+            df = pl.DataFrame(data)
-            # Clean up empty rows and columns
-            df = df.dropna(how="all").dropna(axis=1, how="all")
+            df = df.filter(~pl.all_horizontal(pl.all().is_null()))
+            df = df.select([col for col in df.columns if not df[col].is_null().all()])
-            if df.empty:
+            if df.is_empty():
                 return f"## {sheet_name}\n\n*No data*"
-            # Create a mock TableData for enhanced formatting
-            from kreuzberg._types import TableData  # noqa: PLC0415
-            # Create a 1x1 transparent image as placeholder
             placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
             mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
             enhanced_markdown = enhance_table_markdown(mock_table)
             return f"## {sheet_name}\n\n{enhanced_markdown}"
-        except (ImportError, AttributeError, ValueError):
-            # Fallback to original method if pandas/table enhancement fails
+        except (AttributeError, ValueError):
             return self._convert_sheet_to_text_sync(workbook, sheet_name)
     @staticmethod
     def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
-        """Extract metadata from spreadsheet using python-calamine.
-        Args:
-            workbook: CalamineWorkbook instance
-        Returns:
-            Metadata dict using existing metadata keys where possible
-        """
         metadata: Metadata = {}
-        # Extract basic document properties
         SpreadSheetExtractor._extract_document_properties(workbook, metadata)
-        # Add structural information
         SpreadSheetExtractor._add_structure_info(workbook, metadata)
-        # Analyze content complexity
         SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
         return metadata
     @staticmethod
     def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Extract basic document properties from workbook."""
         with contextlib.suppress(AttributeError, Exception):
             if not (hasattr(workbook, "metadata") and workbook.metadata):
                 return
             props = workbook.metadata
-            # Basic properties mapping
             property_mapping = {
                 "title": "title",
-                "author": "authors",  # Convert to list
+                "author": "authors",
                 "subject": "subject",
                 "comments": "comments",
-                "keywords": "keywords",  # Process separately
-                "category": "categories",  # Convert to list
+                "keywords": "keywords",
+                "category": "categories",
                 "company": "organization",
                 "manager": "modified_by",
             }
@@ -286,12 +252,10 @@ class SpreadSheetExtractor(Extractor):
                     else:
                         metadata[meta_key] = value  # type: ignore[literal-required]
-            # Handle dates separately
             SpreadSheetExtractor._extract_date_properties(props, metadata)
     @staticmethod
     def _extract_date_properties(props: Any, metadata: Metadata) -> None:
-        """Extract and format date properties."""
         date_mapping = {"created": "created_at", "modified": "modified_at"}
         for prop_name, meta_key in date_mapping.items():
@@ -304,14 +268,12 @@ class SpreadSheetExtractor(Extractor):
     @staticmethod
     def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Add structural information about the spreadsheet."""
         if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
             return
         sheet_count = len(workbook.sheet_names)
         structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
-        # Don't list too many sheet names (magic number made constant)
         max_sheet_names_to_list = 5
         if sheet_count <= max_sheet_names_to_list:
             structure_info += f": {', '.join(workbook.sheet_names)}"
@@ -320,12 +282,10 @@ class SpreadSheetExtractor(Extractor):
     @staticmethod
     def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Analyze spreadsheet content for complexity indicators."""
         with contextlib.suppress(Exception):
             has_formulas = False
             total_cells = 0
-            # Check only first few sheets for performance
             max_sheets_to_check = 3
             max_rows_to_check = 50
@@ -335,17 +295,15 @@ class SpreadSheetExtractor(Extractor):
                     data = sheet.to_python()
                     for row in data[:max_rows_to_check]:
-                        if not row:  # Skip empty rows
+                        if not row:
                             continue
                         total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
-                        # Check for formulas (simple heuristic)
                         if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
                             has_formulas = True
                             break
-            # Build summary
             summary_parts = []
             if total_cells > 0:
                 summary_parts.append(f"Contains {total_cells}+ data cells")

kreuzberg/_extractors/_structured.py CHANGED Viewed

@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path
-# Define text field keywords as a set for O(1) membership testing
 _TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
             text_parts: list[str] = []
             metadata: dict[str, Any] = {}
-            # Use match statement for cleaner code and avoid multiple isinstance calls
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
                 chunks=[],
             )
-        except (json.JSONDecodeError, ValueError, TypeError) as e:
+        except (ValueError, TypeError) as e:
             return ExtractionResult(
                 content=normalize_spaces(text_content),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
             if isinstance(value, str) and value.strip():
                 text_parts.append(f"{full_key}: {value}")
-                # Check if key contains any text field keywords efficiently
                 key_lower = key.lower()
                 if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                     metadata[full_key] = value

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl