kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -18,11 +18,8 @@ from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
+from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
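Note: the OCR configuration classes (TesseractConfig, PaddleOCRConfig, EasyOCRConfig) are now imported from kreuzberg._types rather than the per-backend kreuzberg._ocr._* modules. A minimal sketch of the updated import path, taken directly from the + line above:

    # Config classes are consolidated in kreuzberg._types as of 3.13.0.
    from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

    config = TesseractConfig()  # defaults; pass fields to customize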
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
             if self._validate_extracted_text(content):
                 result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
         except ParsingError:
-            # If searchable text extraction fails, continue to OCR or empty result
             pass

         if not result and self.config.ocr_backend is not None:
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
         result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)

         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
             except ImportError:  # pragma: no cover
                 result.tables = []

-        # Enhance metadata with table information
         if result.tables:
             table_summary = generate_table_summary(result.tables)
             result.metadata = result.metadata | {
@@ -126,7 +121,7 @@ class PDFExtractor(Extractor):

         tables = []
         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
@@ -134,7 +129,6 @@ class PDFExtractor(Extractor):
             except ImportError:
                 tables = []

-        # Use playa for better text structure preservation when not using OCR
         if not self.config.force_ocr and self._validate_extracted_text(text):
             text = self._extract_with_playa_sync(path, fallback_text=text)
@@ -148,7 +142,6 @@ class PDFExtractor(Extractor):
             chunks=[],
         )

-        # Enhance metadata with table information
         if tables:
            table_summary = generate_table_summary(tables)
            result.metadata = result.metadata | {
@@ -158,25 +151,9 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }

-        # Apply quality processing
         return self._apply_quality_processing(result)

     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
-        """Check if text extracted from PDF is valid or corrupted.
-
-        This checks for indicators of corrupted PDF text extraction:
-        1. Empty or whitespace-only text
-        2. High concentration of control characters and null bytes
-        3. High concentration of Unicode replacement characters
-
-        Args:
-            text: The extracted text to validate
-            corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
-                characters (default: 0.05 or 5%)
-
-        Returns:
-            True if the text appears valid, False if it seems corrupted
-        """
         if not text or not text.strip():
             return False

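The deleted docstring described the heuristic the retained code still implements: reject empty or whitespace-only text, and reject text where control characters, null bytes, or Unicode replacement characters exceed corruption_threshold (default 5%) of all characters. A standalone sketch of that ratio check; the exact character class is an assumption, since the compiled pattern sits outside this hunk:

    import re

    # Assumed corruption pattern: C0 control chars, null bytes, U+FFFD.
    _CORRUPTION = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffd]")

    def looks_valid(text: str, threshold: float = 0.05) -> bool:
        if not text or not text.strip():
            return False
        return len(_CORRUPTION.findall(text)) / len(text) < threshold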
@@ -188,17 +165,6 @@ class PDFExtractor(Extractor):
         return (len(corruption_matches) / len(text)) < corruption_threshold

     async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
-        """Convert a PDF file to images.
-
-        Args:
-            input_file: The path to the PDF file.
-
-        Raises:
-            ParsingError: If the PDF file could not be converted to images.
-
-        Returns:
-            A list of Pillow Images.
-        """
         document: pypdfium2.PdfDocument | None = None
         last_error = None
@@ -206,7 +172,7 @@ class PDFExtractor(Extractor):
            try:
                with pypdfium_file_lock(input_file):
                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-                    return [page.render(scale=
+                    return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
            except pypdfium2.PdfiumError as e:  # noqa: PERF203
                last_error = e
                if not should_retry(e, attempt + 1):
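pypdfium2 treats scale as a multiple of the PDF's native 72 DPI, so scale=200 / 72 renders pages at roughly 200 DPI. A minimal standalone sketch of the same call, using only public pypdfium2 APIs:

    import pypdfium2

    pdf = pypdfium2.PdfDocument("sample.pdf")  # illustrative path
    # scale is relative to 72 DPI; 200 / 72 targets ~200 DPI output.
    images = [page.render(scale=200 / 72).to_pil() for page in pdf]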
@@ -238,39 +204,18 @@ class PDFExtractor(Extractor):
         ) from last_error

     async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
-        """Extract text from a scanned PDF file using OCR.
-
-        Args:
-            input_file: The path to the PDF file.
-            ocr_backend: The OCR backend to use.
-
-        Returns:
-            The extraction result with text content and metadata.
-        """
         images = await self._convert_pdf_to_images(input_file)
         backend = get_ocr_backend(ocr_backend)
         ocr_results = await run_taskgroup_batched(
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-        # Use list comprehension and join for efficient string building
         content = "\n".join(result.content for result in ocr_results)

         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])

     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
-        """Extract text from a searchable PDF file using pypdfium2.
-
-        Args:
-            input_file: The path to the PDF file.
-
-        Raises:
-            ParsingError: If the text could not be extracted from the PDF file.
-
-        Returns:
-            The extracted text.
-        """
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
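run_taskgroup_batched caps OCR concurrency by running at most cpu_count() page tasks at a time. A hypothetical helper with the same contract (kreuzberg's own implementation appears to live in kreuzberg._utils._sync and is anyio-based; this asyncio sketch only illustrates the batching idea):

    import asyncio

    async def run_batched(coros, batch_size):
        # Run awaitables in fixed-size batches to bound concurrency.
        results = []
        for i in range(0, len(coros), batch_size):
            results.extend(await asyncio.gather(*coros[i : i + batch_size]))
        return results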
@@ -318,7 +263,6 @@ class PDFExtractor(Extractor):
                 await run_sync(document.close)

     def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
-        """Extract searchable text from PDF using pypdfium2 (sync version)."""
         pdf = None
         try:
             with pypdfium_file_lock(path):
@@ -339,7 +283,6 @@ class PDFExtractor(Extractor):
                 pdf.close()

     def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
-        """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
             images = []
@@ -352,23 +295,7 @@ class PDFExtractor(Extractor):
                     bitmap.close()
                     page.close()

-
-            temp_files = []
-
-            try:
-                for i, img in enumerate(images):
-                    fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
-                    temp_files.append((fd, temp_path))
-                    img.save(temp_path, format="PNG")
-                    os.close(fd)
-                    image_paths.append(temp_path)
-
-                return self._process_pdf_images_with_ocr(image_paths)
-
-            finally:
-                for _, temp_path in temp_files:
-                    with contextlib.suppress(OSError):
-                        Path(temp_path).unlink()
+            return self._process_pdf_images_with_ocr_direct(images)

         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -378,7 +305,6 @@ class PDFExtractor(Extractor):
                 pdf.close()

     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        """Process PDF images with the configured OCR backend."""
         backend = get_ocr_backend(self.config.ocr_backend)
         paths = [Path(p) for p in image_paths]

@@ -401,18 +327,48 @@ class PDFExtractor(Extractor):
             case _:
                 raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

-
+        return "\n\n".join(result.content for result in results)
+
+    def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+        """Process PIL images directly without temp files."""
+        backend = get_ocr_backend(self.config.ocr_backend)
+
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(config))
+                    results.append(result)
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(paddle_config))
+                    results.append(result)
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(easy_config))
+                    results.append(result)
+            case _:
+                raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
+
         return "\n\n".join(result.content for result in results)

     def _parse_with_password_attempts(self, content: bytes) -> Document:
-        """Parse PDF with password attempts."""
-        # Normalize password to list
         if isinstance(self.config.pdf_password, str):
             passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
         else:
             passwords = list(self.config.pdf_password)

-        # Try each password in sequence
         last_exception = None
         for password in passwords:
             try:
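The new _process_pdf_images_with_ocr_direct hands PIL images straight to the backend's process_image_sync, replacing the old path that wrote each page to a temporary PNG and deleted it afterwards. A rough usage sketch under the same assumptions as the hunk (backend registry and config classes as imported above; asdict is dataclasses.asdict):

    from dataclasses import asdict

    from kreuzberg._ocr import get_ocr_backend
    from kreuzberg._types import TesseractConfig

    backend = get_ocr_backend("tesseract")
    # images: PIL.Image.Image pages rendered from the PDF
    text = "\n\n".join(
        backend.process_image_sync(img, **asdict(TesseractConfig())).content
        for img in images
    )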
@@ -421,21 +377,17 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, raise the last exception
         if last_exception:
             raise last_exception from None

-        # Fallback to no password
         return parse(content, max_workers=1, password="")

     def _get_passwords_to_try(self) -> list[str]:
-        """Get list of passwords to try in sequence."""
         if isinstance(self.config.pdf_password, str):
             return [self.config.pdf_password] if self.config.pdf_password else [""]
         return list(self.config.pdf_password) if self.config.pdf_password else [""]

     async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts."""
         passwords = self._get_passwords_to_try()

         last_exception = None
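As the removed comments spelled out, pdf_password may be a single string or an iterable of candidates; each is tried in order, with an empty password as the final fallback. A hypothetical configuration sketch (the pdf_password field name comes from this diff; the surrounding ExtractionConfig usage is assumed):

    from kreuzberg import ExtractionConfig

    # One password, or several candidates tried in sequence:
    config = ExtractionConfig(pdf_password="hunter2")
    config = ExtractionConfig(pdf_password=["hunter2", "fallback-pass"])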
@@ -446,7 +398,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, try with empty password as fallback
         try:
             return await extract_pdf_metadata(content, password="")
         except Exception:
@@ -455,7 +406,6 @@ class PDFExtractor(Extractor):
             raise

     def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts (sync version)."""
         passwords = self._get_passwords_to_try()

         last_exception = None
@@ -466,7 +416,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, try with empty password as fallback
         try:
             return extract_pdf_metadata_sync(content, password="")
         except Exception:
@@ -475,12 +424,10 @@ class PDFExtractor(Extractor):
             raise

     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
-        """Extract text using playa for better structure preservation."""
         with contextlib.suppress(Exception):
             content = path.read_bytes()
             document = self._parse_with_password_attempts(content)

-            # Extract text while preserving structure
             pages_text = []
             for page in document.pages:
                 page_text = page.extract_text()
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -1,12 +1,3 @@
-"""This module provides functions to extract textual content from files.
-
-It includes vendored code:
-
-- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
-  See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
-  Refer to the markitdown repository for it's license (MIT).
-"""
-
 from __future__ import annotations

 import re
@@ -30,7 +21,6 @@ if TYPE_CHECKING:  # pragma: no cover

     from kreuzberg._types import Metadata

-# Pre-compiled regex patterns for performance
 _NON_WORD_PATTERN = re.compile(r"\W")

@@ -201,15 +191,12 @@ class PresentationExtractor(Extractor):
         """
         metadata: Metadata = {}

-        # Extract core properties
         PresentationExtractor._extract_core_properties(presentation, metadata)

-        # Extract fonts used in presentation
         fonts = PresentationExtractor._extract_fonts(presentation)
         if fonts:
             metadata["fonts"] = list(fonts)

-        # Add structural information
         PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)

         return metadata
@@ -217,7 +204,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
         """Extract core document properties from presentation."""
-        # Property mapping for core metadata
         property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
@@ -236,7 +222,6 @@ class PresentationExtractor(Extractor):
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]

-        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]

@@ -265,7 +250,6 @@ class PresentationExtractor(Extractor):
         if slide_count == 0:
             return

-        # Build description
         structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"

         slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
@@ -274,7 +258,6 @@ class PresentationExtractor(Extractor):

         metadata["description"] = structure_info

-        # Build summary if not already present
         if "summary" not in metadata:
             summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
             if slides_with_notes > 0:
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -10,15 +10,17 @@ from io import StringIO
 from pathlib import Path
 from typing import Any

+import polars as pl
 from anyio import Path as AsyncPath
 from PIL import Image
 from python_calamine import CalamineWorkbook

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
+from kreuzberg._utils._table import enhance_table_markdown
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError

@@ -108,14 +110,6 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        """Convert a cell value to string representation.
-
-        Args:
-            value: The cell value to convert.
-
-        Returns:
-            String representation of the cell value.
-        """
         if value is None:
             return ""
         if isinstance(value, bool):
@@ -139,7 +133,7 @@ class SpreadSheetExtractor(Extractor):
         csv_buffer.close()

         csv_path, unlink = await create_temp_file(".csv")
-        await AsyncPath(csv_path).write_text(csv_data)
+        await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")

         csv_reader = csv.reader(StringIO(csv_data))
         rows = list(csv_reader)
@@ -162,7 +156,6 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"

     def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Synchronous version of _convert_sheet_to_text."""
         values = workbook.get_sheet_by_name(sheet_name).to_python()

         csv_buffer = StringIO()
@@ -195,82 +188,57 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"

     def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Enhanced sheet processing with better table structure preservation."""
         try:
-            # pandas is optional dependency
-            import pandas as pd  # noqa: PLC0415
-
-            from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
-
             sheet = workbook.get_sheet_by_name(sheet_name)
             data = sheet.to_python()

             if not data or not any(row for row in data):
                 return f"## {sheet_name}\n\n*Empty sheet*"

-
-            df = pd.DataFrame(data)
+            df = pl.DataFrame(data)

-
-            df = df.
+            df = df.filter(~pl.all_horizontal(pl.all().is_null()))
+            df = df.select([col for col in df.columns if not df[col].is_null().all()])

-            if df.
+            if df.is_empty():
                 return f"## {sheet_name}\n\n*No data*"

-            # Create a mock TableData for enhanced formatting
-            from kreuzberg._types import TableData  # noqa: PLC0415
-
-            # Create a 1x1 transparent image as placeholder
             placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
             mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}

             enhanced_markdown = enhance_table_markdown(mock_table)
             return f"## {sheet_name}\n\n{enhanced_markdown}"

-        except (
-            # Fallback to original method if pandas/table enhancement fails
+        except (AttributeError, ValueError):
             return self._convert_sheet_to_text_sync(workbook, sheet_name)

     @staticmethod
     def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
-        """Extract metadata from spreadsheet using python-calamine.
-
-        Args:
-            workbook: CalamineWorkbook instance
-
-        Returns:
-            Metadata dict using existing metadata keys where possible
-        """
         metadata: Metadata = {}

-        # Extract basic document properties
         SpreadSheetExtractor._extract_document_properties(workbook, metadata)

-        # Add structural information
         SpreadSheetExtractor._add_structure_info(workbook, metadata)

-        # Analyze content complexity
         SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)

         return metadata

     @staticmethod
     def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Extract basic document properties from workbook."""
         with contextlib.suppress(AttributeError, Exception):
             if not (hasattr(workbook, "metadata") and workbook.metadata):
                 return

             props = workbook.metadata

-            # Basic properties mapping
             property_mapping = {
                 "title": "title",
-                "author": "authors",
+                "author": "authors",
                 "subject": "subject",
                 "comments": "comments",
-                "keywords": "keywords",
-                "category": "categories",
+                "keywords": "keywords",
+                "category": "categories",
                 "company": "organization",
                 "manager": "modified_by",
             }
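The pandas cleanup was rewritten with polars expressions: pl.all_horizontal(pl.all().is_null()) marks rows whose every column is null, the negated filter drops them, and the select keeps only columns with at least one non-null value. A self-contained sketch of the same idiom (the sample frame is illustrative):

    import polars as pl

    df = pl.DataFrame({"a": [1, None, None], "b": ["x", None, "y"]})
    # Drop rows where every column is null (the middle row here).
    df = df.filter(~pl.all_horizontal(pl.all().is_null()))
    # Drop columns that are entirely null (none in this sample).
    df = df.select([c for c in df.columns if not df[c].is_null().all()])
    print(df.shape)  # (2, 2)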
@@ -286,12 +254,10 @@ class SpreadSheetExtractor(Extractor):
                 else:
                     metadata[meta_key] = value  # type: ignore[literal-required]

-            # Handle dates separately
             SpreadSheetExtractor._extract_date_properties(props, metadata)

     @staticmethod
     def _extract_date_properties(props: Any, metadata: Metadata) -> None:
-        """Extract and format date properties."""
         date_mapping = {"created": "created_at", "modified": "modified_at"}

         for prop_name, meta_key in date_mapping.items():
@@ -304,14 +270,12 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Add structural information about the spreadsheet."""
         if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
             return

         sheet_count = len(workbook.sheet_names)
         structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"

-        # Don't list too many sheet names (magic number made constant)
         max_sheet_names_to_list = 5
         if sheet_count <= max_sheet_names_to_list:
             structure_info += f": {', '.join(workbook.sheet_names)}"
@@ -320,12 +284,10 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Analyze spreadsheet content for complexity indicators."""
         with contextlib.suppress(Exception):
             has_formulas = False
             total_cells = 0

-            # Check only first few sheets for performance
             max_sheets_to_check = 3
             max_rows_to_check = 50

@@ -335,17 +297,15 @@ class SpreadSheetExtractor(Extractor):
                 data = sheet.to_python()

                 for row in data[:max_rows_to_check]:
-                    if not row:
+                    if not row:
                         continue

                     total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())

-                    # Check for formulas (simple heuristic)
                     if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
                         has_formulas = True
                         break

-            # Build summary
             summary_parts = []
             if total_cells > 0:
                 summary_parts.append(f"Contains {total_cells}+ data cells")
kreuzberg/_extractors/_structured.py
CHANGED
@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path

-# Define text field keywords as a set for O(1) membership testing
 _TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})

|
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
|
|
79
78
|
text_parts: list[str] = []
|
80
79
|
metadata: dict[str, Any] = {}
|
81
80
|
|
82
|
-
# Use match statement for cleaner code and avoid multiple isinstance calls
|
83
81
|
if isinstance(data, dict):
|
84
82
|
text_parts = self._extract_from_dict(data, metadata)
|
85
83
|
elif isinstance(data, list):
|
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
                 chunks=[],
             )

-        except (
+        except (ValueError, TypeError) as e:
             return ExtractionResult(
                 content=normalize_spaces(text_content),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
            if isinstance(value, str) and value.strip():
                text_parts.append(f"{full_key}: {value}")

-           # Check if key contains any text field keywords efficiently
            key_lower = key.lower()
            if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                metadata[full_key] = value