kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 import contextlib
+import os
 import re
+import subprocess
 import sys
+import tempfile
 from json import JSONDecodeError, loads
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
         Returns:
             ExtractionResult with the extracted text and metadata.
         """
-        import os
-        import tempfile
-        from pathlib import Path
-
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
 
     def _validate_pandoc_version_sync(self) -> None:
         """Synchronous version of _validate_pandoc_version."""
-        import subprocess
-
         try:
             if self._checked_version:
                 return
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
 
     def _extract_metadata_sync(self, path: Path) -> Metadata:
         """Synchronous version of _handle_extract_metadata."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
 
     def _extract_file_sync(self, path: Path) -> str:
         """Synchronous version of _handle_extract_file."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
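The pandoc changes are a straight import hoist: `os`, `subprocess`, and `tempfile` move from the bodies of the sync helpers to module scope, so the lookup cost is paid once at import time. The temp-file handling those helpers keep follows the usual mkstemp pattern; here is a minimal standalone sketch of that pattern (the pandoc invocation and file names are illustrative, not copied from the package):

```python
import os
import subprocess
import tempfile

# Create a named temp file, close our descriptor right away (pandoc opens
# the path itself), and always unlink in the finally block.
fd, output_path = tempfile.mkstemp(suffix=".md")
os.close(fd)
try:
    subprocess.run(["pandoc", "input.docx", "-o", output_path], check=True)
    with open(output_path, encoding="utf-8") as f:
        markdown = f.read()
finally:
    os.unlink(output_path)
```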
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import contextlib
+import os
+import tempfile
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -10,15 +12,21 @@ from typing import TYPE_CHECKING, ClassVar, cast
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
+from playa import parse
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg.
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
+from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
+from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
@@ -63,17 +71,30 @@ class PDFExtractor(Extractor):
             result.metadata = await extract_pdf_metadata(content_bytes)
 
         if self.config.extract_tables:
-
-
-
+            # GMFT is optional dependency
+            try:
+                from kreuzberg._gmft import extract_tables
 
-
+                result.tables = await extract_tables(path, self.config.gmft_config)
+            except ImportError:
+                result.tables = []
+
+        # Enhance metadata with table information
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
@@ -81,8 +102,6 @@ class PDFExtractor(Extractor):
 
         result = self.extract_path_sync(Path(temp_path))
 
-        from kreuzberg._playa import extract_pdf_metadata_sync
-
         metadata = extract_pdf_metadata_sync(content)
         result.metadata = metadata
 
@@ -100,16 +119,21 @@
 
         tables = []
         if self.config.extract_tables:
+            # GMFT is optional dependency
            try:
                 from kreuzberg._gmft import extract_tables_sync
 
                 tables = extract_tables_sync(path)
             except ImportError:
-
+                tables = []
+
+        # Use playa for better text structure preservation when not using OCR
+        if not self.config.force_ocr and self._validate_extracted_text(text):
+            text = self._extract_with_playa_sync(path, fallback_text=text)
 
         text = normalize_spaces(text)
 
-
+        result = ExtractionResult(
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
@@ -117,6 +141,21 @@
             chunks=[],
         )
 
+        # Enhance metadata with table information
+        if tables:
+            table_summary = generate_table_summary(tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        # Apply quality processing
+        return self._apply_quality_processing(result)
+
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.
 
@@ -155,8 +194,6 @@
         Returns:
             A list of Pillow Images.
         """
-        from kreuzberg._utils._errors import create_error_context, should_retry
-
         document: pypdfium2.PdfDocument | None = None
         last_error = None
 
@@ -228,8 +265,6 @@
         Returns:
             The extracted text.
         """
-        from kreuzberg._utils._errors import create_error_context
-
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
@@ -283,7 +318,7 @@
         text_parts = []
         for page in pdf:
             text_page = page.get_textpage()
-            text = text_page.
+            text = text_page.get_text_bounded()
             text_parts.append(text)
             text_page.close()
             page.close()
@@ -299,8 +334,6 @@
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
            images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -311,9 +344,6 @@
                     bitmap.close()
                     page.close()
 
-            import os
-            import tempfile
-
             image_paths = []
             temp_files = []
 
@@ -325,18 +355,7 @@
                 os.close(fd)
                 image_paths.append(temp_path)
 
-
-            from kreuzberg._ocr._tesseract import TesseractConfig
-
-            if isinstance(self.config.ocr_config, TesseractConfig):
-                config = self.config.ocr_config
-            else:
-                config = TesseractConfig()
-            results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-            text_parts = [r.content for r in results]
-            return "\n\n".join(text_parts)
-
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            return self._process_pdf_images_with_ocr(image_paths)
 
         finally:
             for _, temp_path in temp_files:
@@ -349,3 +368,47 @@
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        backend = get_ocr_backend(self.config.ocr_backend)
+        paths = [Path(p) for p in image_paths]
+
+        if self.config.ocr_backend == "tesseract":
+            config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = backend.process_batch_sync(paths, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+        text_parts = [r.content for r in results]
+        return "\n\n".join(text_parts)
+
+    def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
+        """Extract text using playa for better structure preservation."""
+        with contextlib.suppress(Exception):
+            content = path.read_bytes()
+            document = parse(content, max_workers=1)
+
+            text_parts = []
+            for page in document.pages:
+                # Extract text while preserving structure
+                page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+
+            if text_parts:
+                return "\n\n".join(text_parts)
+
+        return fallback_text
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -30,6 +30,9 @@ if TYPE_CHECKING:  # pragma: no cover
 
     from kreuzberg._types import Metadata
 
+# Pre-compiled regex patterns for performance
+_NON_WORD_PATTERN = re.compile(r"\W")
+
 
 class PresentationExtractor(Extractor):
     """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
 
-                filename =
+                filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
                 md_content += f"\n\n"
 
             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
@@ -162,7 +165,10 @@
                 md_content += "\n" + html_table + "\n"
 
             elif shape.has_text_frame:
-
+                if shape == title:
+                    md_content += "# " + shape.text.lstrip() + "\n"
+                else:
+                    md_content += shape.text + "\n"
 
         md_content = md_content.strip()
         if slide.has_notes_slide:
@@ -174,13 +180,15 @@
 
         md_content = md_content.strip()
 
-
+        result = ExtractionResult(
             content=normalize_spaces(md_content),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata=self._extract_presentation_metadata(presentation),
             chunks=[],
         )
 
+        return self._apply_quality_processing(result)
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@
         """
         metadata: Metadata = {}
 
-
+        # Extract core properties
+        PresentationExtractor._extract_core_properties(presentation, metadata)
+
+        # Extract fonts used in presentation
+        fonts = PresentationExtractor._extract_fonts(presentation)
+        if fonts:
+            metadata["fonts"] = list(fonts)
+
+        # Add structural information
+        PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
+
+        return metadata
+
+    @staticmethod
+    def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
+        """Extract core document properties from presentation."""
+        # Property mapping for core metadata
+        property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
             ("status", "content_status"),
@@ -205,17 +230,22 @@
             ("version", "revision"),
             ("subject", "subject"),
             ("title", "title"),
-
-
+        ]
+
+        for metadata_key, core_property_key in property_mapping:
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]
 
+        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]
 
         if presentation.core_properties.category:
             metadata["categories"] = [presentation.core_properties.category]
 
+    @staticmethod
+    def _extract_fonts(presentation: Presentation) -> set[str]:
+        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -226,8 +256,30 @@
                     for run in paragraph.runs:
                         if hasattr(run, "font") and run.font.name:
                             fonts.add(run.font.name)
+        return fonts
 
-
-
-
-
+    @staticmethod
+    def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
+        """Add structural information about the presentation."""
+        slide_count = len(presentation.slides)
+        if slide_count == 0:
+            return
+
+        # Build description
+        structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
+
+        slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
+        if slides_with_notes > 0:
+            structure_info += f", {slides_with_notes} with notes"
+
+        metadata["description"] = structure_info
+
+        # Build summary if not already present
+        if "summary" not in metadata:
+            summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
+            if slides_with_notes > 0:
+                summary_parts.append(f"{slides_with_notes} slides have notes")
+            if fonts:
+                summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
+
+            metadata["summary"] = f"{'. '.join(summary_parts)}."
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -6,14 +6,14 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
 
-CellValue =
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 
 
 class SpreadSheetExtractor(Extractor):
@@ -45,9 +45,14 @@
         try:
             results: list[str] = await run_taskgroup(*tasks)
 
-
-                content="\n\n".join(results),
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)
 
-
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@
         result = "\n".join(markdown_lines)
 
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."