PyPI - kreuzberg - Versions diffs - 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl - Mend

kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

kreuzberg/_extractors/_base.py +40 -0
kreuzberg/_extractors/_email.py +149 -0
kreuzberg/_extractors/_html.py +15 -3
kreuzberg/_extractors/_image.py +17 -18
kreuzberg/_extractors/_pdf.py +68 -14
kreuzberg/_extractors/_presentation.py +62 -10
kreuzberg/_extractors/_spread_sheet.py +179 -4
kreuzberg/_extractors/_structured.py +148 -0
kreuzberg/_gmft.py +2 -2
kreuzberg/_mcp/__init__.py +5 -0
kreuzberg/_mcp/server.py +227 -0
kreuzberg/_mime_types.py +27 -1
kreuzberg/_multiprocessing/__init__.py +2 -3
kreuzberg/_ocr/__init__.py +30 -0
kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
kreuzberg/_ocr/_sync.py +566 -0
kreuzberg/_ocr/_tesseract.py +6 -2
kreuzberg/_registry.py +4 -0
kreuzberg/_types.py +131 -0
kreuzberg/_utils/_cache.py +17 -2
kreuzberg/_utils/_process_pool.py +178 -1
kreuzberg/_utils/_quality.py +237 -0
kreuzberg/_utils/_serialization.py +4 -2
kreuzberg/_utils/_string.py +153 -10
kreuzberg/_utils/_sync.py +5 -2
kreuzberg/_utils/_table.py +261 -0
{kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
kreuzberg-3.8.0.dist-info/RECORD +57 -0
{kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
kreuzberg/_multiprocessing/process_manager.py +0 -189
kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
kreuzberg-3.6.2.dist-info/RECORD +0 -54
{kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_base.py CHANGED Viewed

@@ -90,3 +90,43 @@ class Extractor(ABC):
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
+    def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
+        """Apply quality post-processing to extraction result if enabled.
+        Args:
+            result: The raw extraction result
+        Returns:
+            Enhanced extraction result with quality improvements (if enabled)
+        """
+        # Only apply quality processing if enabled in config
+        if not self.config.enable_quality_processing:
+            return result
+        from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+        if not result.content:
+            return result
+        # Clean the content
+        cleaned_content = clean_extracted_text(result.content)
+        # Calculate quality score
+        quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
+        # Add quality metadata
+        enhanced_metadata = dict(result.metadata) if result.metadata else {}
+        enhanced_metadata["quality_score"] = quality_score
+        # Return enhanced result
+        from kreuzberg._types import ExtractionResult, normalize_metadata
+        return ExtractionResult(
+            content=cleaned_content,
+            mime_type=result.mime_type,
+            metadata=normalize_metadata(enhanced_metadata),
+            chunks=result.chunks,
+            detected_languages=result.detected_languages,
+            tables=result.tables,
+        )

kreuzberg/_extractors/_email.py ADDED Viewed

@@ -0,0 +1,149 @@
+from __future__ import annotations
+import re
+from html import unescape
+from typing import TYPE_CHECKING, Any, ClassVar
+from anyio import Path as AsyncPath
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError
+if TYPE_CHECKING:
+    from pathlib import Path
+# Import optional dependencies at module level with proper error handling
+try:
+    import mailparse
+except ImportError:
+    mailparse = None
+try:
+    import html2text  # type: ignore[import-not-found]
+except ImportError:
+    html2text = None
+# Compile regex pattern once at module level
+_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+class EmailExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+    def _extract_email_headers(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email headers."""
+        # Use single dict access where possible to avoid repeated lookups
+        subject = parsed_email.get("subject")
+        if subject:
+            metadata["subject"] = subject
+            text_parts.append(f"Subject: {subject}")
+        from_info = parsed_email.get("from")
+        if from_info:
+            from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
+            metadata["email_from"] = from_email
+            text_parts.append(f"From: {from_email}")
+        to_info = parsed_email.get("to")
+        if to_info:
+            if isinstance(to_info, list) and to_info:
+                to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+            elif isinstance(to_info, dict):
+                to_email = to_info.get("email", "")
+            else:
+                to_email = str(to_info)
+            metadata["email_to"] = to_email
+            text_parts.append(f"To: {to_email}")
+        date = parsed_email.get("date")
+        if date:
+            metadata["date"] = date
+            text_parts.append(f"Date: {date}")
+        cc = parsed_email.get("cc")
+        if cc:
+            metadata["email_cc"] = cc
+            text_parts.append(f"CC: {cc}")
+        bcc = parsed_email.get("bcc")
+        if bcc:
+            metadata["email_bcc"] = bcc
+            text_parts.append(f"BCC: {bcc}")
+    def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
+        """Extract and process email body content."""
+        text_content = parsed_email.get("text")
+        if text_content:
+            text_parts.append(f"\n{text_content}")
+            return  # If we have text, prefer it over HTML
+        html_content = parsed_email.get("html")
+        if html_content:
+            if html2text is not None:
+                # Use html2text if available (faster path)
+                h = html2text.HTML2Text()
+                h.ignore_links = True
+                h.ignore_images = True
+                converted_text = h.handle(html_content)
+                text_parts.append(f"\n{converted_text}")
+            else:
+                # Fallback: strip HTML tags and unescape entities
+                clean_html = _HTML_TAG_PATTERN.sub("", html_content)
+                clean_html = unescape(clean_html)
+                text_parts.append(f"\n{clean_html}")
+    def _extract_email_attachments(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email attachments info."""
+        if parsed_email.get("attachments"):
+            attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
+            metadata["attachments"] = attachment_names
+            if attachment_names:
+                text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        if mailparse is None:
+            msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
+            raise MissingDependencyError(msg)
+        try:
+            parsed_email = mailparse.EmailDecode.load(content)
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+            # Extract headers, body, and attachments
+            self._extract_email_headers(parsed_email, text_parts, metadata)
+            self._extract_email_body(parsed_email, text_parts)
+            self._extract_email_attachments(parsed_email, text_parts, metadata)
+            # Join efficiently
+            combined_text = "\n".join(text_parts)
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+        except Exception as e:
+            msg = f"Failed to parse email content: {e}"
+            raise RuntimeError(msg) from e
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)

kreuzberg/_extractors/_html.py CHANGED Viewed

@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractionResult
-from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        result = html_to_markdown.convert_to_markdown(safe_decode(content))
-        return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        # Use html-to-markdown with script/nav removal for better quality
+        result = html_to_markdown.convert_to_markdown(
+            safe_decode(content),
+            preprocess_html=True,
+            preprocessing_preset="aggressive",
+            remove_navigation=True,
+            remove_forms=True,
+        )
+        # Skip normalize_spaces since quality processing will handle whitespace
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        # Apply quality processing which includes normalization
+        return self._apply_quality_processing(extraction_result)
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()

kreuzberg/_extractors/_image.py CHANGED Viewed

@@ -1,5 +1,9 @@
 from __future__ import annotations
+import contextlib
+import os
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 from anyio import Path as AsyncPath
@@ -7,17 +11,13 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractionResult
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
-    from kreuzberg._types import ExtractionResult
-import contextlib
-from pathlib import Path
 class ImageExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -56,13 +56,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
-        return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
+        result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
+        return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
@@ -80,10 +78,8 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
-        from kreuzberg._types import ExtractionResult
         if self.config.ocr_backend == "tesseract":
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._sync import process_batch_images_sync
             from kreuzberg._ocr._tesseract import TesseractConfig
             if isinstance(self.config.ocr_config, TesseractConfig):
@@ -91,30 +87,33 @@ class ImageExtractor(Extractor):
             else:
                 config = TesseractConfig()
-            results = process_batch_images_sync_pure([str(path)], config)
+            results = process_batch_images_sync([str(path)], config, backend="tesseract")
             if results:
-                return results[0]
+                result = results[0]
+                return self._apply_quality_processing(result)
             return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
         if self.config.ocr_backend == "paddleocr":
-            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
             from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
             )
-            return paddle_process(path, paddle_config)
+            result = paddle_process(path, paddle_config)
+            return self._apply_quality_processing(result)
         if self.config.ocr_backend == "easyocr":
-            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
             from kreuzberg._ocr._easyocr import EasyOCRConfig
+            from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
             )
-            return easy_process(path, easy_config)
+            result = easy_process(path, easy_config)
+            return self._apply_quality_processing(result)
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -10,15 +10,17 @@ from typing import TYPE_CHECKING, ClassVar, cast
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
+from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._playa import extract_pdf_metadata
+from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
+from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
@@ -63,11 +65,27 @@ class PDFExtractor(Extractor):
         result.metadata = await extract_pdf_metadata(content_bytes)
         if self.config.extract_tables:
-            from kreuzberg._gmft import extract_tables
-            result.tables = await extract_tables(path, self.config.gmft_config)
+            # GMFT is optional dependency
+            try:
+                from kreuzberg._gmft import extract_tables
-        return result
+                result.tables = await extract_tables(path, self.config.gmft_config)
+            except ImportError:
+                result.tables = []
+            # Enhance metadata with table information
+            if result.tables:
+                table_summary = generate_table_summary(result.tables)
+                result.metadata.update(
+                    {
+                        "table_count": table_summary["table_count"],
+                        "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                        f"across {table_summary['pages_with_tables']} pages with "
+                        f"{table_summary['total_rows']} total rows",
+                    }
+                )
+        return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
@@ -81,8 +99,6 @@ class PDFExtractor(Extractor):
             result = self.extract_path_sync(Path(temp_path))
-            from kreuzberg._playa import extract_pdf_metadata_sync
             metadata = extract_pdf_metadata_sync(content)
             result.metadata = metadata
@@ -100,16 +116,21 @@ class PDFExtractor(Extractor):
         tables = []
         if self.config.extract_tables:
+            # GMFT is optional dependency
             try:
                 from kreuzberg._gmft import extract_tables_sync
                 tables = extract_tables_sync(path)
             except ImportError:
-                pass
+                tables = []
+        # Use playa for better text structure preservation when not using OCR
+        if not self.config.force_ocr and self._validate_extracted_text(text):
+            text = self._extract_with_playa_sync(path, fallback_text=text)
         text = normalize_spaces(text)
-        return ExtractionResult(
+        result = ExtractionResult(
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
@@ -117,6 +138,21 @@ class PDFExtractor(Extractor):
             chunks=[],
         )
+        # Enhance metadata with table information
+        if tables:
+            table_summary = generate_table_summary(tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+        # Apply quality processing
+        return self._apply_quality_processing(result)
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.
@@ -283,7 +319,7 @@ class PDFExtractor(Extractor):
                 text_parts = []
                 for page in pdf:
                     text_page = page.get_textpage()
-                    text = text_page.get_text_range()
+                    text = text_page.get_text_bounded()
                     text_parts.append(text)
                     text_page.close()
                     page.close()
@@ -340,19 +376,19 @@ class PDFExtractor(Extractor):
     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
         """Process PDF images with the configured OCR backend."""
         if self.config.ocr_backend == "tesseract":
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._sync import process_batch_images_sync
             from kreuzberg._ocr._tesseract import TesseractConfig
             tesseract_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
             )
-            results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
+            results = process_batch_images_sync([str(p) for p in image_paths], tesseract_config, backend="tesseract")
             text_parts = [r.content for r in results]
             return "\n\n".join(text_parts)
         if self.config.ocr_backend == "paddleocr":
-            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
             from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
@@ -365,8 +401,8 @@ class PDFExtractor(Extractor):
             return "\n\n".join(text_parts)
         if self.config.ocr_backend == "easyocr":
-            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
             from kreuzberg._ocr._easyocr import EasyOCRConfig
+            from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
@@ -379,3 +415,21 @@ class PDFExtractor(Extractor):
             return "\n\n".join(text_parts)
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+    def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
+        """Extract text using playa for better structure preservation."""
+        with contextlib.suppress(Exception):
+            content = path.read_bytes()
+            document = parse(content, max_workers=1)
+            text_parts = []
+            for page in document.pages:
+                # Extract text while preserving structure
+                page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+            if text_parts:
+                return "\n\n".join(text_parts)
+        return fallback_text

kreuzberg/_extractors/_presentation.py CHANGED Viewed

@@ -30,6 +30,9 @@ if TYPE_CHECKING:  # pragma: no cover
     from kreuzberg._types import Metadata
+# Pre-compiled regex patterns for performance
+_NON_WORD_PATTERN = re.compile(r"\W")
 class PresentationExtractor(Extractor):
     """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
                     with suppress(AttributeError):
                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
-                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
                     md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
                 elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
@@ -162,7 +165,10 @@ class PresentationExtractor(Extractor):
                     md_content += "\n" + html_table + "\n"
                 elif shape.has_text_frame:
-                    md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + "\n"
+                    else:
+                        md_content += shape.text + "\n"
             md_content = md_content.strip()
             if slide.has_notes_slide:
@@ -174,13 +180,15 @@ class PresentationExtractor(Extractor):
                 md_content = md_content.strip()
-        return ExtractionResult(
+        result = ExtractionResult(
             content=normalize_spaces(md_content),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata=self._extract_presentation_metadata(presentation),
             chunks=[],
         )
+        return self._apply_quality_processing(result)
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@ class PresentationExtractor(Extractor):
         """
         metadata: Metadata = {}
-        for metadata_key, core_property_key in [
+        # Extract core properties
+        PresentationExtractor._extract_core_properties(presentation, metadata)
+        # Extract fonts used in presentation
+        fonts = PresentationExtractor._extract_fonts(presentation)
+        if fonts:
+            metadata["fonts"] = list(fonts)
+        # Add structural information
+        PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
+        return metadata
+    @staticmethod
+    def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
+        """Extract core document properties from presentation."""
+        # Property mapping for core metadata
+        property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
             ("status", "content_status"),
@@ -205,17 +230,22 @@ class PresentationExtractor(Extractor):
             ("version", "revision"),
             ("subject", "subject"),
             ("title", "title"),
-            ("version", "version"),
-        ]:
+        ]
+        for metadata_key, core_property_key in property_mapping:
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]
+        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]
         if presentation.core_properties.category:
             metadata["categories"] = [presentation.core_properties.category]
+    @staticmethod
+    def _extract_fonts(presentation: Presentation) -> set[str]:
+        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -226,8 +256,30 @@ class PresentationExtractor(Extractor):
                     for run in paragraph.runs:
                         if hasattr(run, "font") and run.font.name:
                             fonts.add(run.font.name)
+        return fonts
-        if fonts:
-            metadata["fonts"] = list(fonts)
-        return metadata
+    @staticmethod
+    def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
+        """Add structural information about the presentation."""
+        slide_count = len(presentation.slides)
+        if slide_count == 0:
+            return
+        # Build description
+        structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
+        slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
+        if slides_with_notes > 0:
+            structure_info += f", {slides_with_notes} with notes"
+        metadata["description"] = structure_info
+        # Build summary if not already present
+        if "summary" not in metadata:
+            summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
+            if slides_with_notes > 0:
+                summary_parts.append(f"{slides_with_notes} slides have notes")
+            if fonts:
+                summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
+            metadata["summary"] = f"{'. '.join(summary_parts)}."

kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl