PyPI - kreuzberg - Versions diffs - 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl - Mend

kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

kreuzberg/__init__.py +6 -0
kreuzberg/_api/main.py +0 -53
kreuzberg/_config.py +17 -8
kreuzberg/_document_classification.py +1 -1
kreuzberg/_extractors/_base.py +0 -46
kreuzberg/_extractors/_email.py +16 -10
kreuzberg/_extractors/_html.py +39 -12
kreuzberg/_extractors/_pandoc.py +2 -2
kreuzberg/_extractors/_pdf.py +6 -7
kreuzberg/_extractors/_presentation.py +4 -0
kreuzberg/_extractors/_spread_sheet.py +0 -1
kreuzberg/_extractors/_structured.py +83 -15
kreuzberg/_gmft.py +7 -2
kreuzberg/_mcp/server.py +1 -22
kreuzberg/_mime_types.py +1 -1
kreuzberg/_ocr/_easyocr.py +47 -20
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +27 -26
kreuzberg/_token_reduction/__init__.py +11 -0
kreuzberg/_token_reduction/_reducer.py +439 -0
kreuzberg/_token_reduction/_stopwords.py +116 -0
kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
kreuzberg/_types.py +146 -43
kreuzberg/_utils/_html_streaming.py +20 -0
kreuzberg/_utils/_image_preprocessing.py +1 -1
kreuzberg/_utils/_ref.py +14 -6
kreuzberg/_utils/_serialization.py +13 -6
kreuzberg/_utils/_sync.py +15 -16
kreuzberg/exceptions.py +0 -1
kreuzberg/extraction.py +27 -11
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
kreuzberg-3.17.0.dist-info/RECORD +128 -0
kreuzberg-3.15.0.dist-info/RECORD +0 -60
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -8,8 +8,10 @@ from ._types import (
     ExtractionConfig,
     ExtractionResult,
     GMFTConfig,
+    HTMLToMarkdownConfig,
     ImageOCRConfig,
     ImageOCRResult,
+    JSONExtractionConfig,
     LanguageDetectionConfig,
     Metadata,
     PaddleOCRConfig,
@@ -17,6 +19,7 @@ from ._types import (
     SpacyEntityExtractionConfig,
     TableData,
     TesseractConfig,
+    TokenReductionConfig,
 )
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
@@ -40,8 +43,10 @@ __all__ = [
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
+    "HTMLToMarkdownConfig",
     "ImageOCRConfig",
     "ImageOCRResult",
+    "JSONExtractionConfig",
     "KreuzbergError",
     "LanguageDetectionConfig",
     "Metadata",
@@ -53,6 +58,7 @@ __all__ = [
     "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
+    "TokenReductionConfig",
     "ValidationError",
     "__version__",
     "batch_extract_bytes",

kreuzberg/_api/main.py CHANGED Viewed

@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
 from kreuzberg import (
     EasyOCRConfig,
-    ExtractedImage,
     ExtractionConfig,
     ExtractionResult,
-    ImageOCRResult,
     KreuzbergError,
     MissingDependencyError,
     PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
-class ExtractedImageDict(TypedDict):
-    """TypedDict for extracted image JSON representation."""
-    data: str
-    format: str
-    filename: str | None
-    page_number: int | None
-    dimensions: tuple[int, int] | None
-    colorspace: str | None
-    bits_per_component: int | None
-    is_mask: bool
-    description: str | None
-class ImageOCRResultDict(TypedDict):
-    """TypedDict for image OCR result JSON representation."""
-    image: ExtractedImageDict
-    ocr_result: Any
-    confidence_score: float | None
-    processing_time: float | None
-    skipped_reason: str | None
 class HealthResponse(TypedDict):
     """Response model for health check endpoint."""
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
     return f"data:image/png;base64,{img_str}"
-def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
-    encoded_data = base64.b64encode(obj.data).decode()
-    return ExtractedImageDict(
-        data=f"data:image/{obj.format};base64,{encoded_data}",
-        format=obj.format,
-        filename=obj.filename,
-        page_number=obj.page_number,
-        dimensions=obj.dimensions,
-        colorspace=obj.colorspace,
-        bits_per_component=obj.bits_per_component,
-        is_mask=obj.is_mask,
-        description=obj.description,
-    )
-def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
-    return ImageOCRResultDict(
-        image=_extracted_image_encoder(obj.image),
-        ocr_result=obj.ocr_result,
-        confidence_score=obj.confidence_score,
-        processing_time=obj.processing_time,
-        skipped_reason=obj.skipped_reason,
-    )
 openapi_config = OpenAPIConfig(
     title="Kreuzberg API",
     version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
 type_encoders = {
     pl.DataFrame: _polars_dataframe_encoder,
     Image.Image: _pil_image_encoder,
-    ExtractedImage: _extracted_image_encoder,
-    ImageOCRResult: _image_ocr_result_encoder,
 }
 app = Litestar(

kreuzberg/_config.py CHANGED Viewed

@@ -69,12 +69,21 @@ def _build_ocr_config_from_cli(
     try:
         match ocr_backend:
             case "tesseract":
-                return TesseractConfig(**backend_args)
+                processed_args = backend_args.copy()
+                if "psm" in processed_args and isinstance(processed_args["psm"], int):
+                    try:
+                        processed_args["psm"] = PSMMode(processed_args["psm"])
+                    except ValueError as e:  # pragma: no cover
+                        raise ValidationError(
+                            f"Invalid PSM mode value: {processed_args['psm']}",
+                            context={"psm_value": processed_args["psm"], "error": str(e)},
+                        ) from e
+                return TesseractConfig(**processed_args)
             case "easyocr":
                 return EasyOCRConfig(**backend_args)
             case "paddleocr":
                 return PaddleOCRConfig(**backend_args)
-            case _:
+            case _:  # pragma: no cover
                 return None
     except (TypeError, ValueError) as e:
         raise ValidationError(
@@ -112,7 +121,7 @@ def _configure_gmft(
     try:
         if cli_args.get("gmft_config"):
             gmft_config = GMFTConfig(**cli_args["gmft_config"])
-        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
+        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):  # pragma: no cover
             gmft_config = GMFTConfig(**file_config["gmft"])
     except (TypeError, ValueError) as e:
         raise ValidationError(
@@ -120,7 +129,7 @@ def _configure_gmft(
             context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
         ) from e
-    if gmft_config:
+    if gmft_config:  # pragma: no cover
         config_dict["gmft_config"] = gmft_config
@@ -151,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     try:
         with config_path.open("rb") as f:
             data = tomllib.load(f)
-    except FileNotFoundError as e:
+    except FileNotFoundError as e:  # pragma: no cover
         raise ValidationError(f"Configuration file not found: {config_path}") from e
     except tomllib.TOMLDecodeError as e:
         raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
@@ -237,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
     try:
         return ExtractionConfig(**extraction_config)
-    except (TypeError, ValueError) as e:
+    except (TypeError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid extraction configuration: {e}",
             context={"config": extraction_config, "error": str(e)},
@@ -261,7 +270,7 @@ def build_extraction_config(
     try:
         return ExtractionConfig(**config_dict)
-    except (TypeError, ValueError) as e:
+    except (TypeError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid extraction configuration: {e}",
             context={"config": config_dict, "error": str(e)},
@@ -283,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
                     data = tomllib.load(f)
                 if "tool" in data and "kreuzberg" in data["tool"]:
                     return pyproject_toml
-            except OSError as e:
+            except OSError as e:  # pragma: no cover
                 raise ValidationError(
                     f"Failed to read pyproject.toml: {e}",
                     context={"file": str(pyproject_toml), "error": str(e)},

kreuzberg/_document_classification.py CHANGED Viewed

@@ -132,7 +132,7 @@ def classify_document_from_layout(
             if not found_words.is_empty():
                 scores[doc_type] += 1.0
                 word_top = found_words[0, "top"]
-                if word_top < page_height * 0.3:
+                if word_top is not None and word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
     total_score = sum(scores.values())

kreuzberg/_extractors/_base.py CHANGED Viewed

@@ -96,7 +96,6 @@ class Extractor(ABC):
         )
     def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
-        """Filter images based on memory safety limits."""
         if not images:
             return []
@@ -142,17 +141,6 @@ class Extractor(ABC):
     _HASH_SAMPLE_SIZE = 512
     def _compute_image_hash(self, img: ExtractedImage) -> int:
-        """Compute hash for image deduplication using progressive hashing.
-        For small images (<1KB), hash the entire content.
-        For larger images, use size + first/last bytes for quick comparison.
-        Args:
-            img: Image to hash
-        Returns:
-            Hash value for deduplication
-        """
         data_len = len(img.data)
         if data_len < self._SMALL_IMAGE_THRESHOLD:
@@ -189,14 +177,6 @@ class Extractor(ABC):
         return unique_images
     def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
-        """Prepare OCR configuration for the specified backend.
-        Args:
-            backend_name: Name of the OCR backend
-        Returns:
-            Configuration dictionary for the backend
-        """
         default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
         config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
@@ -222,14 +202,6 @@ class Extractor(ABC):
         return cfg
     def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
-        """Validate if an image is suitable for OCR processing.
-        Args:
-            img: Image to validate
-        Returns:
-            Reason for skipping if invalid, None if valid
-        """
         fmt = img.format.lower()
         if fmt not in self.config.image_ocr_formats:
             return f"Unsupported format: {img.format}"
@@ -247,16 +219,6 @@ class Extractor(ABC):
         return None
     async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
-        """Process a single image with OCR.
-        Args:
-            target: Image to process
-            backend: OCR backend instance
-            cfg: Configuration for the backend
-        Returns:
-            OCR result for the image
-        """
         try:
             start = time.time()
             pil_img = Image.open(io.BytesIO(target.data))
@@ -284,14 +246,6 @@ class Extractor(ABC):
     async def _process_images_with_ocr(
         self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
     ) -> list[ImageOCRResult]:
-        """Process multiple images with OCR.
-        Args:
-            images: Tuple or list of images to process
-        Returns:
-            List of OCR results
-        """
         if not images or not self.config.ocr_extracted_images:
             return []

kreuzberg/_extractors/_email.py CHANGED Viewed

@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
     html2text = None
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
 class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
     def _format_email_field(self, field: Any) -> str:
         match field:
             case list():
-                return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
             case dict():
                 return str(field.get("email", ""))
             case _:
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
                 cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
                 clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
                 clean_html = unescape(clean_html)
-                clean_html = (
-                    clean_html.replace("\u201c", '"')
-                    .replace("\u201d", '"')
-                    .replace("\u2019", "'")
-                    .replace("\u2018", "'")
-                )
+                clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+                clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
                 text_parts.append(clean_html)
     def _extract_email_attachments(
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
         for att in attachments:
             name_val: str = "unknown"
             if isinstance(att, dict):
-                n = att.get("name")
+                n = att.get("name") or att.get("filename")
                 if isinstance(n, str) and n:
                     name_val = n
             names.append(name_val)
-        metadata["attachments"] = names
         if names:
+            metadata["attachments"] = names
             text_parts.append("Attachments: " + ", ".join(names))
     def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
             if not isinstance(mime, str) or not mime.startswith("image/"):
                 continue
-            name = att.get("name") if isinstance(att.get("name"), str) else None
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
             data = att.get("data") or att.get("content") or att.get("payload")
             raw: bytes | None = None
             if isinstance(data, (bytes, bytearray)):

kreuzberg/_extractors/_html.py CHANGED Viewed

@@ -1,16 +1,20 @@
 from __future__ import annotations
 import base64
+import binascii
+import io
 import logging
 from typing import TYPE_CHECKING, ClassVar
 import html_to_markdown
 from anyio import Path as AsyncPath
 from bs4 import BeautifulSoup
+from PIL import Image
 from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_async, run_sync
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
         config_dict = config.to_dict()
         html_content = safe_decode(content)
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
         result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
@@ -89,6 +98,13 @@ class HTMLExtractor(Extractor):
                         )
                         continue
+                    dimensions = None
+                    try:
+                        with Image.open(io.BytesIO(image_data)) as pil_img:
+                            dimensions = pil_img.size
+                    except (OSError, ValueError) as e:  # pragma: no cover
+                        logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
                     alt_val = img.get("alt")  # type: ignore[union-attr]
                     desc = alt_val if isinstance(alt_val, str) else None
                     images.append(
@@ -97,25 +113,36 @@ class HTMLExtractor(Extractor):
                             format=format_name,
                             filename=f"embedded_image_{len(images) + 1}.{format_name}",
                             description=desc,
+                            dimensions=dimensions,
                         )
                     )
-                except Exception as e:  # noqa: BLE001
+                except (ValueError, binascii.Error) as e:
                     logger.warning("Failed to extract base64 image: %s", e)
-        for svg in soup.find_all("svg"):
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
             try:
-                svg_content = str(svg).encode("utf-8")
-                title_or_aria = svg.get("title") or svg.get("aria-label")  # type: ignore[union-attr]
+                svg_content = str(svg_element).encode("utf-8")
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
                 desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
-                images.append(
-                    ExtractedImage(
-                        data=svg_content,
-                        format="svg",
-                        filename=f"inline_svg_{len(images) + 1}.svg",
-                        description=desc_svg,
-                    )
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
                 )
-            except Exception as e:  # noqa: BLE001, PERF203
+            except (UnicodeEncodeError, AttributeError) as e:
                 logger.warning("Failed to extract SVG: %s", e)
+                return None
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)
         return images

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
                 "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
             )
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
                 "Please install it on your system and make sure its available in $PATH."
             )
-        except (subprocess.SubprocessError, FileNotFoundError) as e:
+        except (subprocess.SubprocessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Pandoc version 2 or above is a required system dependency. "
                 "Please install it on your system and make sure its available in $PATH."

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-import asyncio
 import contextlib
 import io
 import logging
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
 from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
@@ -154,7 +153,7 @@ class PDFExtractor(Extractor):
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
                 tables = extract_tables_sync(path)
-            except ImportError:
+            except ImportError:  # pragma: no cover
                 tables = []
         if not self.config.force_ocr and self._validate_extracted_text(text):
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
                 img_counter += 1
         if tasks:
-            results = await asyncio.gather(*tasks)
+            results = await run_taskgroup(*tasks)
             return [img for img in results if img is not None]
         return []
@@ -501,7 +500,7 @@ class PDFExtractor(Extractor):
             except (ValueError, TypeError, KeyError, RuntimeError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
-            except OSError as e:
+            except OSError as e:  # pragma: no cover
                 raise ParsingError(f"Failed to parse PDF: {e}") from e
         if last_exception:
@@ -521,7 +520,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return await extract_pdf_metadata(content, password=password)
-            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203  # pragma: no cover
                 last_exception = e
                 continue
@@ -539,7 +538,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return extract_pdf_metadata_sync(content, password=password)
-            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203  # pragma: no cover
                 last_exception = e
                 continue

kreuzberg/_extractors/_presentation.py CHANGED Viewed

@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     try:
                         image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                         filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
                         images.append(
@@ -162,6 +164,8 @@ class PresentationExtractor(Extractor):
             if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                 try:
                     image = shape.image
+                    if not image.blob or not isinstance(image.blob, bytes):
+                        continue
                     filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
                     images.append(
                         ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)

kreuzberg/_extractors/_spread_sheet.py CHANGED Viewed

@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
             if not data or not any(row for row in data):
                 return f"## {sheet_name}\n\n*Empty sheet*"
-            # Normalize row lengths to avoid polars ShapeError
             if data:
                 max_cols = max(len(row) if row else 0 for row in data)
                 data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]

kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl