kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -2,13 +2,10 @@ from __future__ import annotations

 import contextlib
 import csv
-import os
 import sys
-import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import polars as pl
 from anyio import Path as AsyncPath
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._table import enhance_table_markdown
-from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError

+if TYPE_CHECKING:
+    from pathlib import Path
+
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

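The `pathlib.Path` import moves under a `TYPE_CHECKING` guard because it is only needed for annotations; combined with `from __future__ import annotations`, annotations stay unevaluated strings and the import is skipped at runtime. A minimal standalone sketch of the pattern (not kreuzberg code):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from pathlib import Path


def describe(path: Path) -> str:  # annotation is a string, never evaluated
    return str(path)
```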
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):

     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        xlsx_path, unlink = await create_temp_file(file_extension)
-        await AsyncPath(xlsx_path).write_bytes(content)
-        try:
+        async with temporary_file(file_extension, content) as xlsx_path:
             return await self.extract_path_async(xlsx_path)
-        finally:
-            await unlink()

     async def extract_path_async(self, path: Path) -> ExtractionResult:
         try:
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        fd, temp_path = tempfile.mkstemp(suffix=file_extension)
-
-        try:
-            with os.fdopen(fd, "wb") as f:
-                f.write(content)
-
-            return self.extract_path_sync(Path(temp_path))
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
+        with temporary_file_sync(file_extension, content) as temp_path:
+            return self.extract_path_sync(temp_path)

     def extract_path_sync(self, path: Path) -> ExtractionResult:
         try:
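Both code paths now delegate the temp-file lifecycle to context managers from `kreuzberg._utils._tmp` (whose implementation is not shown in this diff). A plausible sketch of the sync variant, assuming it wraps the same mkstemp-and-cleanup sequence the old inline code performed:

```python
import contextlib
import os
import tempfile
from collections.abc import Iterator
from pathlib import Path


@contextlib.contextmanager
def temporary_file_sync(extension: str, content: bytes) -> Iterator[Path]:
    # Hypothetical reconstruction: write the payload to a named temp file,
    # yield its path, and always unlink it afterwards.
    fd, temp_path = tempfile.mkstemp(suffix=extension)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(content)
        yield Path(temp_path)
    finally:
        with contextlib.suppress(OSError):
            Path(temp_path).unlink()
```

Centralizing the try/finally in one context manager removes the duplicated cleanup logic from every extractor that needs a scratch file.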
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (datetime, date, time)):
-            return value.isoformat()
-        if isinstance(value, timedelta):
-            return f"{value.total_seconds()} seconds"
-        return str(value)
+        match value:
+            case None:
+                return ""
+            case bool():
+                return str(value).lower()
+            case datetime() | date() | time():
+                return value.isoformat()
+            case timedelta():
+                return f"{value.total_seconds()} seconds"
+            case _:
+                return str(value)

     async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
         values = workbook.get_sheet_by_name(sheet_name).to_python()
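The converter now dispatches via structural pattern matching; note that `case bool()` must come before the catch-all so `True` renders as `true` rather than `True`. Expected behavior for a few representative cells (sample inputs, assuming the module path shown in this diff):

```python
from datetime import date, timedelta

from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor

# Each assertion shows input -> returned string, per the match arms above.
assert SpreadSheetExtractor._convert_cell_to_str(None) == ""
assert SpreadSheetExtractor._convert_cell_to_str(True) == "true"
assert SpreadSheetExtractor._convert_cell_to_str(date(2024, 1, 2)) == "2024-01-02"
assert SpreadSheetExtractor._convert_cell_to_str(timedelta(minutes=2)) == "120.0 seconds"
assert SpreadSheetExtractor._convert_cell_to_str(3.5) == "3.5"
```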
@@ -207,7 +197,11 @@ class SpreadSheetExtractor(Extractor):
         if not data or not any(row for row in data):
             return f"## {sheet_name}\n\n*Empty sheet*"

-        df = pl.DataFrame(data)
+        if data:
+            max_cols = max(len(row) if row else 0 for row in data)
+            data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
+
+        df = pl.DataFrame(data, strict=False)

         df = df.filter(~pl.all_horizontal(pl.all().is_null()))
         df = df.select([col for col in df.columns if not df[col].is_null().all()])
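Calamine sheets can yield ragged rows, and `pl.DataFrame` rejects inner sequences of unequal length; the new padding normalizes widths before construction. A standalone sketch of the effect (assuming polars is installed):

```python
import polars as pl

rows = [["a", "b", "c"], ["d"], []]  # ragged rows, as a sparse sheet may produce
max_cols = max(len(row) if row else 0 for row in rows)
padded = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in rows]

# All inner lists are now length 3, so construction succeeds;
# strict=False lets mixed-type columns coerce instead of raising.
df = pl.DataFrame(padded, strict=False)
print(df.shape)  # (3, 3)
```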
kreuzberg/_extractors/_structured.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar

@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None

+
 from anyio import Path as AsyncPath

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync

@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }

+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)

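With `extract_schema=True` on `JSONExtractionConfig`, the extractor stores a type outline of the document under `metadata["json_schema"]`. A hand-evaluated trace of the method above on a small document (assuming the default `max_depth` and `array_item_limit` are not hit):

```python
data = {"title": "Report", "tags": ["a", "b"]}

# _extract_json_schema(data) walks the structure and returns:
# {
#     "type": "dict",
#     "properties": {
#         "title": {"type": "str"},
#         "tags": {"type": "list", "items": {"type": "str"}, "length": 2},
#     },
# }
```

Only the first element of a list is sampled for `items`, which keeps the walk cheap on large homogeneous arrays.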
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)

-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}

+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]

-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))

             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):

         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key

             if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

                 key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value

             elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

             elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))

             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))

             elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")

         return text_parts

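With `include_type_info` enabled, flattened keys gain a type tag, and with `flatten_nested_objects=False` a dict collapses to a one-line summary instead of being recursed into. Illustrative output for a hypothetical document with both flags set:

```python
# Hypothetical input; flag names come from JSONExtractionConfig as used above.
data = {"name": "Ada", "age": 36, "address": {"city": "London"}}

# The flattened text parts would be:
#   name (string): Ada
#   age (int): 36
#   address: [nested object with 1 properties]
# (the f-string does not pluralize "properties")
```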
@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"

             if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")

             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))

             elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")

         return text_parts
kreuzberg/_gmft.py CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
     from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
     from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)

     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
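`config_dict` crosses a process boundary, and serialization formats like JSON stringify integer dict keys, so `cell_required_confidence` can arrive with keys `"0"`, `"1"`, ... instead of `0`, `1`, ...; the new guard coerces them back before `GMFTConfig` is built. The hunk does not show which serializer kreuzberg uses for this hop; JSON below is purely to demonstrate the key-type loss:

```python
import json

original = {0: 0.3, 1: 0.5}  # int keys, as the table-confidence mapping expects
round_tripped = json.loads(json.dumps(original))
print(round_tripped)         # {'0': 0.3, '1': 0.5} - keys became str

restored = {int(k): v for k, v in round_tripped.items()}
print(restored == original)  # True
```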