PyPI - kreuzberg - Versions diffs - 3.10.0__tar.gz → 3.11.0__tar.gz - Mend

kreuzberg 3.10.0__tar.gz → 3.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (239) hide show

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/.github/workflows/ci.yaml RENAMED Viewed

@@ -51,12 +51,103 @@ jobs:
       - name: Execute Pre-Commit
         run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
+  # Coverage job runs first, only on Python 3.13 Ubuntu
+  coverage:
+    needs: validate
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Install Python
+        uses: actions/setup-python@v5
+        id: setup-python
+        with:
+          python-version: "3.13"
+      - name: Cache Python Dependencies
+        id: python-cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/uv
+            .venv
+          key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
+          restore-keys: |
+            python-dependencies-ubuntu-latest-3.13-
+      - name: Install Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            uv sync --all-packages --all-extras --dev
+          shell: bash
+      - name: Install System Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
+          shell: bash
+      - name: Run Tests with Coverage
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          retry_wait_seconds: 10
+          command: |
+            uv run coverage erase
+            uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
+            uv run coverage report --precision=2
+          shell: bash
+      - name: Upload Coverage to DeepSource
+        if: always() && github.event_name == 'push'
+        env:
+          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
+        run: |
+          # Install DeepSource CLI
+          curl -fsSL https://deepsource.io/cli | sh
+          # Upload coverage report
+          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
+      - name: Upload Coverage Artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report-${{ github.sha }}
+          path: |
+            coverage.lcov
+            .coverage
+          retention-days: 7
+  # Full test matrix runs only after coverage succeeds
   test:
+    needs: coverage
+    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macOS-latest, windows-latest ]
-        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
-    runs-on: ${{ matrix.os }}
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python: ["3.10", "3.11", "3.12", "3.13"]
+        exclude:
+          # Skip Python 3.13 on macOS for now due to compatibility issues
+          - os: macos-latest
+            python: "3.13"
     timeout-minutes: 30
     steps:
       - name: Checkout
@@ -146,52 +237,12 @@ jobs:
             pandoc --version
           shell: pwsh
-      - name: Clean Coverage Data
-        run: |
-          rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
-        shell: bash
-      - name: Run Tests with Coverage
-        run: |
-          uv run coverage erase
-          uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
-      - name: Upload Coverage Artifacts
-        if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-report
-          path: coverage.lcov
-          retention-days: 1
-  upload-coverage:
-    needs: test
-    runs-on: ubuntu-latest
-    if: github.event_name == 'push' || github.event_name == 'pull_request'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
-      - name: Download Coverage Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: coverage-report
-          path: .
-      - name: Install DeepSource CLI
+      - name: Run Tests (without coverage)
         uses: nick-fields/retry@v3
         with:
-          timeout_minutes: 3
+          timeout_minutes: 15
           max_attempts: 3
           retry_wait_seconds: 10
           command: |
-            curl -fsSL https://deepsource.io/cli | sh
+            uv run pytest -s -vvv --reruns 2 --reruns-delay 1
           shell: bash
-      - name: Upload Coverage to DeepSource
-        env:
-          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
-        run: |
-          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/.pre-commit-config.yaml RENAMED Viewed

@@ -53,7 +53,7 @@ repos:
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.5
+    rev: v0.12.7
     hooks:
       - id: ruff
         args: ["--fix", "--unsafe-fixes"]

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.10.0
+Version: 3.11.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
 Requires-Dist: html-to-markdown[lxml]>=1.9.0
-Requires-Dist: mcp>=1.12.2
+Requires-Dist: mcp>=1.12.3
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: playa-pdf>=0.6.4
 Requires-Dist: psutil>=7.0.0
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: deep-translator>=1.11.4; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
 Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: pandas>=2.3.1; extra == 'all'
 Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
-Provides-Extra: auto-classify-document-type
-Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
-Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: crypto
 Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
+Provides-Extra: document-classification
+Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
+Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/benchmark_baseline.py RENAMED Viewed

@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
     return results  # type: ignore[return-value]
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     baseline_results = asyncio.run(run_baseline_benchmark())
     baseline_file = Path("baseline_results.json")

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/end_to_end_benchmark.py RENAMED Viewed

@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 REPRODUCIBLE CACHE BENCHMARK")
     print("Testing msgpack implementation with statistical rigor...")
     print()

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Core benchmark implementations comparing sync vs async performance."""
+# mypy: disable-error-code=unused-ignore
 from __future__ import annotations

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/benchmarks/statistical_benchmark.py RENAMED Viewed

@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 STATISTICAL CACHE BENCHMARK")
     print("Testing msgpack implementation with proper error analysis...")
     print()

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/contributing.md RENAMED Viewed

@@ -34,7 +34,7 @@ All commands run through `uv run`:
 # Testing
 uv run pytest                      # Run all tests
 uv run pytest tests/foo_test.py    # Run specific test
-uv run pytest --cov                # With coverage (must be ≥95%)
+uv run pytest --cov                # With coverage (must be ≥85%)
 # Code quality
 uv run ruff format                 # Format code

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/getting-started/installation.md RENAMED Viewed

@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm  # Spanish
     spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
+### Document Classification
+For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
+```shell
+pip install "kreuzberg[document-classification]"
+```
+This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
 ### All Optional Dependencies
 To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
 This is equivalent to:
 ```shell
-pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
+pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
 ```

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/index.md RENAMED Viewed

@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
 ### Engineering Principles
-- **Test Coverage**: 95%+ coverage with comprehensive test suites
+- **Test Coverage**: Comprehensive test suites ensuring code reliability
 - **API Design**: True async/await implementation alongside synchronous APIs
 - **Error Handling**: Consistent exception hierarchy with detailed context
 - **Type Safety**: Full type annotations for enhanced developer experience

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/docs/user-guide/document-classification.md RENAMED Viewed

@@ -2,9 +2,17 @@
 Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
+## Installation
+Document classification requires the `document-classification` extra to be installed:
+```bash
+pip install "kreuzberg[document-classification]"
+```
 ## Enabling Document Classification
-To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
+Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
 ```python
 from kreuzberg import ExtractionConfig, extract_file

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_api/main.py RENAMED Viewed

@@ -30,7 +30,7 @@ try:
         HTTP_422_UNPROCESSABLE_ENTITY,
         HTTP_500_INTERNAL_SERVER_ERROR,
     )
-except ImportError as e:
+except ImportError as e:  # pragma: no cover
     raise MissingDependencyError.create_for_package(
         dependency_group="litestar",
         functionality="Litestar API and docker container",

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_chunker.py RENAMED Viewed

@@ -43,7 +43,7 @@ def get_chunker(
                 from semantic_text_splitter import TextSplitter  # noqa: PLC0415
                 _chunkers[key] = TextSplitter(max_characters, overlap_characters)
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
             ) from e

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_config.py RENAMED Viewed

@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     import tomli as tomllib  # type: ignore[import-not-found]
 from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
     if config_path.name == "kreuzberg.toml":
         return data  # type: ignore[no-any-return]
-    return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # For other files, check if they have [tool.kreuzberg] section
+    if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
+        return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # Otherwise assume root-level configuration
+    return data  # type: ignore[no-any-return]
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -91,19 +97,21 @@ def parse_ocr_backend_config(
     if not isinstance(backend_config, dict):
         return None
-    if backend == "tesseract":
-        # Convert psm integer to PSMMode enum if needed
-        processed_config = backend_config.copy()
-        if "psm" in processed_config and isinstance(processed_config["psm"], int):
-            from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
-            processed_config["psm"] = PSMMode(processed_config["psm"])
-        return TesseractConfig(**processed_config)
-    if backend == "easyocr":
-        return EasyOCRConfig(**backend_config)
-    if backend == "paddleocr":
-        return PaddleOCRConfig(**backend_config)
-    return None
+    match backend:
+        case "tesseract":
+            # Convert psm integer to PSMMode enum if needed
+            processed_config = backend_config.copy()
+            if "psm" in processed_config and isinstance(processed_config["psm"], int):
+                from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
+                processed_config["psm"] = PSMMode(processed_config["psm"])
+            return TesseractConfig(**processed_config)
+        case "easyocr":
+            return EasyOCRConfig(**backend_config)
+        case "paddleocr":
+            return PaddleOCRConfig(**backend_config)
+        case _:
+            return None
 def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -129,12 +137,25 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
         "extract_keywords",
         "auto_detect_language",
         "enable_quality_processing",
+        "auto_detect_document_type",
+        "document_type_confidence_threshold",
+        "document_classification_mode",
+        "keyword_count",
+    }
+    extraction_config = extraction_config | {
+        field: config_dict[field] for field in basic_fields if field in config_dict
     }
-    extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
     # Handle OCR backend configuration
     ocr_backend = extraction_config.get("ocr_backend")
     if ocr_backend and ocr_backend != "none":
+        # Validate OCR backend
+        valid_backends = {"tesseract", "easyocr", "paddleocr"}
+        if ocr_backend not in valid_backends:
+            raise ValidationError(
+                f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
+                context={"provided": ocr_backend, "valid": sorted(valid_backends)},
+            )
         ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
         if ocr_config:
             extraction_config["ocr_config"] = ocr_config
@@ -286,6 +307,10 @@ _CONFIG_FIELDS = [
     "extract_keywords",
     "auto_detect_language",
     "enable_quality_processing",
+    "auto_detect_document_type",
+    "document_type_confidence_threshold",
+    "document_classification_mode",
+    "keyword_count",
 ]

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_document_classification.py RENAMED Viewed

@@ -4,13 +4,12 @@ import re
 from typing import TYPE_CHECKING
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
-    from kreuzberg._types import ExtractionConfig, ExtractionResult
 DOCUMENT_CLASSIFIERS = {
     "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
     Raises:
         MissingDependencyError: If the deep-translator package is not installed
     """
+    # Combine content with metadata for classification
+    text_to_classify = result.content
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
     try:
         from deep_translator import GoogleTranslator  # noqa: PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError(
-            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
+            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
         ) from e
-    return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
+    try:
+        return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        return text_to_classify.lower()
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
+    if not config.auto_detect_document_type:
+        return None, None
     translated_text = _get_translated_text(result)
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
@@ -108,7 +121,8 @@ def classify_document_from_layout(
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
-    translated_text = _get_translated_text(result)
+    if not config.auto_detect_document_type:
+        return None, None
     if result.layout is None or result.layout.empty:
         return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
+    # Use layout text for classification, not the content
+    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    # Translate layout text directly for classification
+    text_to_classify = layout_text
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
+    try:
+        from deep_translator import GoogleTranslator  # noqa: PLC0415
+        translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        translated_text = text_to_classify.lower()
     layout_df["translated_text"] = translated_text
     page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
+    elif result.layout is not None and not result.layout.empty:
+        # Use layout-based classification if layout data is available
+        result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
     return result

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_entity_extraction.py RENAMED Viewed

@@ -139,7 +139,7 @@ def extract_entities(
     try:
         import spacy  # noqa: F401, PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="spacy",
             dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
         return [(kw, float(score)) for kw, score in keywords]
     except (RuntimeError, OSError, ValueError):
         return []
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="keybert",
             dependency_group="entity-extraction",

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_base.py RENAMED Viewed

@@ -116,8 +116,7 @@ class Extractor(ABC):
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
         # Add quality metadata
-        enhanced_metadata = dict(result.metadata) if result.metadata else {}
-        enhanced_metadata["quality_score"] = quality_score
+        enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
         # Return enhanced result
         return ExtractionResult(

{kreuzberg-3.10.0 → kreuzberg-3.11.0}/kreuzberg/_extractors/_email.py RENAMED Viewed

@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 # Import optional dependencies at module level with proper error handling
 try:
     import mailparse
-except ImportError:
+except ImportError:  # pragma: no cover
     mailparse = None
 try:
     import html2text  # type: ignore[import-not-found]
-except ImportError:
+except ImportError:  # pragma: no cover
     html2text = None
 # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
         to_info = parsed_email.get("to")
         if to_info:
+            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
+                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+                metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
-                to_email = to_info.get("email", "")
+                metadata["email_to"] = to_info.get("email", "")
             else:
-                to_email = str(to_info)
-            metadata["email_to"] = to_email
-            text_parts.append(f"To: {to_email}")
+                metadata["email_to"] = str(to_info)
+            # For display, format all recipients
+            to_formatted = self._format_email_field(to_info)
+            text_parts.append(f"To: {to_formatted}")
         date = parsed_email.get("date")
         if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
         cc = parsed_email.get("cc")
         if cc:
             metadata["email_cc"] = cc
-            text_parts.append(f"CC: {cc}")
+            cc_formatted = self._format_email_field(cc)
+            text_parts.append(f"CC: {cc_formatted}")
         bcc = parsed_email.get("bcc")
         if bcc:
             metadata["email_bcc"] = bcc
-            text_parts.append(f"BCC: {bcc}")
+            bcc_formatted = self._format_email_field(bcc)
+            text_parts.append(f"BCC: {bcc_formatted}")
+    def _format_email_field(self, field: Any) -> str:
+        """Format email field (to, cc, bcc) for display."""
+        if isinstance(field, list):
+            emails = []
+            for item in field:
+                if isinstance(item, dict):
+                    email = item.get("email", "")
+                    if email:
+                        emails.append(email)
+                else:
+                    emails.append(str(item))
+            return ", ".join(emails)
+        if isinstance(field, dict):
+            return str(field.get("email", ""))
+        return str(field)
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         """Extract and process email body content."""

kreuzberg 3.10.0__tar.gz → 3.11.0__tar.gz

kreuzberg 3.10.0__tar.gz → 3.11.0__tar.gz