kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +11 -1
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pdf.py +2 -3
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +0 -21
- kreuzberg/_ocr/_easyocr.py +51 -19
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/_types.py +111 -40
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +12 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -8,8 +8,10 @@ from ._types import (
     ExtractionConfig,
     ExtractionResult,
     GMFTConfig,
+    HTMLToMarkdownConfig,
     ImageOCRConfig,
     ImageOCRResult,
+    JSONExtractionConfig,
     LanguageDetectionConfig,
     Metadata,
     PaddleOCRConfig,
@@ -40,8 +42,10 @@ __all__ = [
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
+    "HTMLToMarkdownConfig",
     "ImageOCRConfig",
     "ImageOCRResult",
+    "JSONExtractionConfig",
     "KreuzbergError",
     "LanguageDetectionConfig",
     "Metadata",
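Both names come from `kreuzberg._types`: `JSONExtractionConfig` appears to be introduced in this release (the `_types.py` diff is +111/−40, not shown in full), while `HTMLToMarkdownConfig` was previously importable only from `_types`. A minimal sketch of how they plug into an extraction call; `json_config` matches the attribute read in `_extractors/_structured.py` below, whereas `html_to_markdown_config` is an assumed `ExtractionConfig` field name not confirmed by this diff:

```python
from kreuzberg import ExtractionConfig, HTMLToMarkdownConfig, JSONExtractionConfig

# json_config is consumed by StructuredDataExtractor (see _structured.py below);
# html_to_markdown_config is an assumed field name, not shown in this diff.
config = ExtractionConfig(
    json_config=JSONExtractionConfig(extract_schema=True),
    html_to_markdown_config=HTMLToMarkdownConfig(),
)
```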
kreuzberg/_api/main.py
CHANGED
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
 
 from kreuzberg import (
     EasyOCRConfig,
-    ExtractedImage,
     ExtractionConfig,
     ExtractionResult,
-    ImageOCRResult,
     KreuzbergError,
     MissingDependencyError,
     PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
 
 
-class ExtractedImageDict(TypedDict):
-    """TypedDict for extracted image JSON representation."""
-
-    data: str
-    format: str
-    filename: str | None
-    page_number: int | None
-    dimensions: tuple[int, int] | None
-    colorspace: str | None
-    bits_per_component: int | None
-    is_mask: bool
-    description: str | None
-
-
-class ImageOCRResultDict(TypedDict):
-    """TypedDict for image OCR result JSON representation."""
-
-    image: ExtractedImageDict
-    ocr_result: Any
-    confidence_score: float | None
-    processing_time: float | None
-    skipped_reason: str | None
-
-
 class HealthResponse(TypedDict):
     """Response model for health check endpoint."""
 
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
     return f"data:image/png;base64,{img_str}"
 
 
-def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
-    encoded_data = base64.b64encode(obj.data).decode()
-    return ExtractedImageDict(
-        data=f"data:image/{obj.format};base64,{encoded_data}",
-        format=obj.format,
-        filename=obj.filename,
-        page_number=obj.page_number,
-        dimensions=obj.dimensions,
-        colorspace=obj.colorspace,
-        bits_per_component=obj.bits_per_component,
-        is_mask=obj.is_mask,
-        description=obj.description,
-    )
-
-
-def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
-    return ImageOCRResultDict(
-        image=_extracted_image_encoder(obj.image),
-        ocr_result=obj.ocr_result,
-        confidence_score=obj.confidence_score,
-        processing_time=obj.processing_time,
-        skipped_reason=obj.skipped_reason,
-    )
-
-
 openapi_config = OpenAPIConfig(
     title="Kreuzberg API",
     version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
 type_encoders = {
     pl.DataFrame: _polars_dataframe_encoder,
     Image.Image: _pil_image_encoder,
-    ExtractedImage: _extracted_image_encoder,
-    ImageOCRResult: _image_ocr_result_encoder,
 }
 
 app = Litestar(
kreuzberg/_config.py
CHANGED
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
     try:
         match ocr_backend:
             case "tesseract":
-                return TesseractConfig(**backend_args)
+                # Handle PSM mode conversion from int to enum
+                processed_args = backend_args.copy()
+                if "psm" in processed_args and isinstance(processed_args["psm"], int):
+                    try:
+                        processed_args["psm"] = PSMMode(processed_args["psm"])
+                    except ValueError as e:
+                        raise ValidationError(
+                            f"Invalid PSM mode value: {processed_args['psm']}",
+                            context={"psm_value": processed_args["psm"], "error": str(e)},
+                        ) from e
+                return TesseractConfig(**processed_args)
             case "easyocr":
                 return EasyOCRConfig(**backend_args)
             case "paddleocr":
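The new branch coerces a raw integer `psm` from the CLI into the `PSMMode` enum before building the config. The same coercion in isolation (the value 6 is illustrative, and `PSMMode` is assumed to be importable from the package root):

```python
from kreuzberg import PSMMode, TesseractConfig

backend_args = {"psm": 6}  # CLI/TOML values arrive as plain ints
backend_args["psm"] = PSMMode(backend_args["psm"])  # ValueError for unknown modes
config = TesseractConfig(**backend_args)
```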
kreuzberg/_document_classification.py
CHANGED
@@ -132,7 +132,7 @@ def classify_document_from_layout(
         if not found_words.is_empty():
             scores[doc_type] += 1.0
             word_top = found_words[0, "top"]
-            if word_top < page_height * 0.3:
+            if word_top is not None and word_top < page_height * 0.3:
                 scores[doc_type] += 0.5
 
     total_score = sum(scores.values())
kreuzberg/_extractors/_email.py
CHANGED
@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
     html2text = None
 
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
 
 
 class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
     def _format_email_field(self, field: Any) -> str:
         match field:
             case list():
-                return ", ".join(str(item) for item in field)
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
             case dict():
                 return str(field.get("email", ""))
             case _:
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
         cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
         clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
         clean_html = unescape(clean_html)
-        clean_html = (
-            clean_html.replace("\u201c", '"')
-            .replace("\u201d", '"')
-            .replace("\u2019", "'")
-            .replace("\u2018", "'")
-        )
+        clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+        clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
         text_parts.append(clean_html)
 
     def _extract_email_attachments(
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
         for att in attachments:
             name_val: str = "unknown"
             if isinstance(att, dict):
-                n = att.get("name")
+                n = att.get("name") or att.get("filename")
                 if isinstance(n, str) and n:
                     name_val = n
             names.append(name_val)
-        metadata["attachments"] = names
         if names:
+            metadata["attachments"] = names
             text_parts.append("Attachments: " + ", ".join(names))
 
     def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
             if not isinstance(mime, str) or not mime.startswith("image/"):
                 continue
 
-            name = att.get("name")
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
             data = att.get("data") or att.get("content") or att.get("payload")
             raw: bytes | None = None
             if isinstance(data, (bytes, bytearray)):
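The two precompiled patterns replace a chain of `str.replace` calls with one regex pass per quote class. A quick self-contained check of the behaviour:

```python
import re

_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")

text = "\u201cquoted\u201d and \u2018quoted\u2019"
text = _UNICODE_QUOTES_PATTERN.sub('"', text)   # curly double quotes -> "
text = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", text)  # curly single quotes -> '
assert text == "\"quoted\" and 'quoted'"
```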
kreuzberg/_extractors/_html.py
CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 import base64
+import binascii
+import io
 import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
 from bs4 import BeautifulSoup
+from PIL import Image
 
 from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_async, run_sync
 
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
         config_dict = config.to_dict()
 
         html_content = safe_decode(content)
+
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
 
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
@@ -89,6 +98,13 @@ class HTMLExtractor(Extractor):
                     )
                     continue
 
+                dimensions = None
+                try:
+                    with Image.open(io.BytesIO(image_data)) as pil_img:
+                        dimensions = pil_img.size
+                except (OSError, ValueError) as e:
+                    logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
+
                 alt_val = img.get("alt")  # type: ignore[union-attr]
                 desc = alt_val if isinstance(alt_val, str) else None
                 images.append(
@@ -97,25 +113,36 @@ class HTMLExtractor(Extractor):
                         format=format_name,
                         filename=f"embedded_image_{len(images) + 1}.{format_name}",
                         description=desc,
+                        dimensions=dimensions,
                     )
                 )
-            except Exception as e:
+            except (ValueError, binascii.Error) as e:
                 logger.warning("Failed to extract base64 image: %s", e)
 
-        for svg in soup.find_all("svg"):
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
             try:
-                svg_content = str(svg).encode("utf-8")
-                title_or_aria = svg.get("title") or svg.get("aria-label")
+                svg_content = str(svg_element).encode("utf-8")
+
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
                 desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
-                images.append(
-                    ExtractedImage(
-                        data=svg_content,
-                        format="svg",
-                        filename=f"inline_svg_{len(images) + 1}.svg",
-                        description=desc_svg,
-                    )
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
                 )
-            except Exception as e:
+            except (UnicodeEncodeError, AttributeError) as e:
                 logger.warning("Failed to extract SVG: %s", e)
+                return None
+
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)
 
         return images
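`should_use_streaming` comes from the new `kreuzberg/_utils/_html_streaming.py` (+20 lines, not shown in full). The call site only reveals its shape: it takes the input size in bytes and returns a `(use_streaming, chunk_size)` tuple that is forwarded to `html_to_markdown`. A hedged sketch under that assumption:

```python
from kreuzberg._utils._html_streaming import should_use_streaming

content = b"<html><body>" + b"<p>x</p>" * 100_000 + b"</body></html>"

# Assumed behaviour: small inputs return (False, ...), large ones (True, chunk_size).
use_streaming, chunk_size = should_use_streaming(len(content))
config_dict = {"stream_processing": use_streaming, "chunk_size": chunk_size}
```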
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import asyncio
 import contextlib
 import io
 import logging
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
 from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
                     img_counter += 1
 
         if tasks:
-            results = await asyncio.gather(*tasks)
+            results = await run_taskgroup(*tasks)
            return [img for img in results if img is not None]
 
        return []
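`run_taskgroup` replaces the dropped `asyncio.gather` call. From the call site it appears to be gather-like: it awaits the supplied coroutines and returns their results as a list. A sketch under that assumption:

```python
import asyncio

from kreuzberg._utils._sync import run_taskgroup


async def demo() -> None:
    async def work(i: int) -> int:
        await asyncio.sleep(0)
        return i * 2

    # Assumed gather-like semantics; result ordering is not confirmed by the diff.
    results = await run_taskgroup(work(1), work(2), work(3))
    print(results)


asyncio.run(demo())
```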
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     try:
                         image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                         filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
 
                         images.append(
@@ -162,6 +164,8 @@ class PresentationExtractor(Extractor):
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     try:
                         image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                         filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
                         images.append(
                             ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
         if not data or not any(row for row in data):
             return f"## {sheet_name}\n\n*Empty sheet*"
 
-        # Normalize row lengths to avoid polars ShapeError
         if data:
             max_cols = max(len(row) if row else 0 for row in data)
             data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None
 
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }
 
+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)
 
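When `extract_schema` is enabled, `_extract_json_schema` records a type outline of the parsed JSON that later lands in `metadata["json_schema"]`. Tracing the logic above on a small input (assuming the default `max_depth` and `array_item_limit` are not hit):

```python
data = {"title": "Report", "tags": ["a", "b"], "meta": {"pages": 3}}

# Expected result of _extract_json_schema(data) per the code above:
schema = {
    "type": "dict",
    "properties": {
        "title": {"type": "str"},
        "tags": {"type": "list", "items": {"type": "str"}, "length": 2},
        "meta": {"type": "dict", "properties": {"pages": {"type": "int"}}},
    },
}
```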
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)
 
-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}
 
+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]
 
-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))
 
             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):
 
         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key
 
             if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
                 key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value
 
             elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
             elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))
 
             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))
 
             elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")
 
         return text_parts
 
@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
 
             if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")
 
             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))
 
             elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")
 
         return text_parts
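Taken together, the `include_type_info` and `flatten_nested_objects` options change how `_extract_from_dict` flattens a document into text lines. Sketched for a small input:

```python
doc = {"name": "Ada", "age": 36, "address": {"city": "London"}}

# include_type_info=True, flatten_nested_objects=True (per the branches above):
#   name (string): Ada
#   age (int): 36
#   address.city (string): London
#
# With flatten_nested_objects=False the last line becomes:
#   address: [nested object with 1 properties]
```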
kreuzberg/_gmft.py
CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
    from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
    from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)
 
     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
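GMFT's `cell_required_confidence` appears to be keyed by integer confidence classes, but `config_dict` crosses a process boundary in serialized form, where integer dict keys become strings; the new block restores them. In isolation:

```python
# After JSON-style serialization across the process boundary, int keys arrive as str.
cell_config = {"0": 0.3, "1": 0.3, "2": 0.3}
restored = {int(k): v for k, v in cell_config.items()}
assert restored == {0: 0.3, 1: 0.3, 2: 0.3}
```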