PyPI - kreuzberg - Versions diffs - 3.10.0__tar.gz → 3.10.1__tar.gz - Mend

kreuzberg 3.10.0__tar.gz → 3.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (239) hide show

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/.github/workflows/ci.yaml RENAMED Viewed

@@ -51,12 +51,103 @@ jobs:
       - name: Execute Pre-Commit
         run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
+  # Coverage job runs first, only on Python 3.13 Ubuntu
+  coverage:
+    needs: validate
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Install Python
+        uses: actions/setup-python@v5
+        id: setup-python
+        with:
+          python-version: "3.13"
+      - name: Cache Python Dependencies
+        id: python-cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/uv
+            .venv
+          key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
+          restore-keys: |
+            python-dependencies-ubuntu-latest-3.13-
+      - name: Install Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            uv sync --all-packages --all-extras --dev
+          shell: bash
+      - name: Install System Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
+          shell: bash
+      - name: Run Tests with Coverage
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          retry_wait_seconds: 10
+          command: |
+            uv run coverage erase
+            uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
+            uv run coverage report --precision=2
+          shell: bash
+      - name: Upload Coverage to DeepSource
+        if: always() && github.event_name == 'push'
+        env:
+          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
+        run: |
+          # Install DeepSource CLI
+          curl -fsSL https://deepsource.io/cli | sh
+          # Upload coverage report
+          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
+      - name: Upload Coverage Artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report-${{ github.sha }}
+          path: |
+            coverage.lcov
+            .coverage
+          retention-days: 7
+  # Full test matrix runs only after coverage succeeds
   test:
+    needs: coverage
+    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macOS-latest, windows-latest ]
-        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
-    runs-on: ${{ matrix.os }}
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python: ["3.10", "3.11", "3.12", "3.13"]
+        exclude:
+          # Skip Python 3.13 on macOS for now due to compatibility issues
+          - os: macos-latest
+            python: "3.13"
     timeout-minutes: 30
     steps:
       - name: Checkout
@@ -146,52 +237,12 @@ jobs:
             pandoc --version
           shell: pwsh
-      - name: Clean Coverage Data
-        run: |
-          rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
-        shell: bash
-      - name: Run Tests with Coverage
-        run: |
-          uv run coverage erase
-          uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
-      - name: Upload Coverage Artifacts
-        if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-report
-          path: coverage.lcov
-          retention-days: 1
-  upload-coverage:
-    needs: test
-    runs-on: ubuntu-latest
-    if: github.event_name == 'push' || github.event_name == 'pull_request'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
-      - name: Download Coverage Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: coverage-report
-          path: .
-      - name: Install DeepSource CLI
+      - name: Run Tests (without coverage)
         uses: nick-fields/retry@v3
         with:
-          timeout_minutes: 3
+          timeout_minutes: 15
           max_attempts: 3
           retry_wait_seconds: 10
           command: |
-            curl -fsSL https://deepsource.io/cli | sh
+            uv run pytest -s -vvv --reruns 2 --reruns-delay 1
           shell: bash
-      - name: Upload Coverage to DeepSource
-        env:
-          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
-        run: |
-          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -53,7 +53,7 @@ repos:
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.5
+    rev: v0.12.7
     hooks:
       - id: ruff
         args: ["--fix", "--unsafe-fixes"]

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.10.0
+Version: 3.10.1
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py RENAMED Viewed

@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
     return results  # type: ignore[return-value]
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     baseline_results = asyncio.run(run_baseline_benchmark())
     baseline_file = Path("baseline_results.json")

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py RENAMED Viewed

@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 REPRODUCIBLE CACHE BENCHMARK")
     print("Testing msgpack implementation with statistical rigor...")
     print()

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Core benchmark implementations comparing sync vs async performance."""
+# mypy: disable-error-code=unused-ignore
 from __future__ import annotations

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py RENAMED Viewed

@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 STATISTICAL CACHE BENCHMARK")
     print("Testing msgpack implementation with proper error analysis...")
     print()

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_api/main.py RENAMED Viewed

@@ -30,7 +30,7 @@ try:
         HTTP_422_UNPROCESSABLE_ENTITY,
         HTTP_500_INTERNAL_SERVER_ERROR,
     )
-except ImportError as e:
+except ImportError as e:  # pragma: no cover
     raise MissingDependencyError.create_for_package(
         dependency_group="litestar",
         functionality="Litestar API and docker container",

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_chunker.py RENAMED Viewed

@@ -43,7 +43,7 @@ def get_chunker(
                 from semantic_text_splitter import TextSplitter  # noqa: PLC0415
                 _chunkers[key] = TextSplitter(max_characters, overlap_characters)
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
             ) from e

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_config.py RENAMED Viewed

@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     import tomli as tomllib  # type: ignore[import-not-found]
 from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
     if config_path.name == "kreuzberg.toml":
         return data  # type: ignore[no-any-return]
-    return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # For other files, check if they have [tool.kreuzberg] section
+    if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
+        return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # Otherwise assume root-level configuration
+    return data  # type: ignore[no-any-return]
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
         "extract_keywords",
         "auto_detect_language",
         "enable_quality_processing",
+        "auto_detect_document_type",
+        "document_type_confidence_threshold",
+        "document_classification_mode",
+        "keyword_count",
     }
     extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
     # Handle OCR backend configuration
     ocr_backend = extraction_config.get("ocr_backend")
     if ocr_backend and ocr_backend != "none":
+        # Validate OCR backend
+        valid_backends = {"tesseract", "easyocr", "paddleocr"}
+        if ocr_backend not in valid_backends:
+            raise ValidationError(
+                f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
+                context={"provided": ocr_backend, "valid": sorted(valid_backends)},
+            )
         ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
         if ocr_config:
             extraction_config["ocr_config"] = ocr_config
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
     "extract_keywords",
     "auto_detect_language",
     "enable_quality_processing",
+    "auto_detect_document_type",
+    "document_type_confidence_threshold",
+    "document_classification_mode",
+    "keyword_count",
 ]

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py RENAMED Viewed

@@ -4,13 +4,12 @@ import re
 from typing import TYPE_CHECKING
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
-    from kreuzberg._types import ExtractionConfig, ExtractionResult
 DOCUMENT_CLASSIFIERS = {
     "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
     Raises:
         MissingDependencyError: If the deep-translator package is not installed
     """
+    # Combine content with metadata for classification
+    text_to_classify = result.content
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
     try:
         from deep_translator import GoogleTranslator  # noqa: PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError(
             "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
         ) from e
-    return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
+    try:
+        return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        return text_to_classify.lower()
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
+    if not config.auto_detect_document_type:
+        return None, None
     translated_text = _get_translated_text(result)
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
@@ -108,7 +121,8 @@ def classify_document_from_layout(
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
-    translated_text = _get_translated_text(result)
+    if not config.auto_detect_document_type:
+        return None, None
     if result.layout is None or result.layout.empty:
         return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
+    # Use layout text for classification, not the content
+    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    # Translate layout text directly for classification
+    text_to_classify = layout_text
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
+    try:
+        from deep_translator import GoogleTranslator  # noqa: PLC0415
+        translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        translated_text = text_to_classify.lower()
     layout_df["translated_text"] = translated_text
     page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
+    elif result.layout is not None and not result.layout.empty:
+        # Use layout-based classification if layout data is available
+        result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
     return result

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py RENAMED Viewed

@@ -139,7 +139,7 @@ def extract_entities(
     try:
         import spacy  # noqa: F401, PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="spacy",
             dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
         return [(kw, float(score)) for kw, score in keywords]
     except (RuntimeError, OSError, ValueError):
         return []
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="keybert",
             dependency_group="entity-extraction",

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py RENAMED Viewed

@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 # Import optional dependencies at module level with proper error handling
 try:
     import mailparse
-except ImportError:
+except ImportError:  # pragma: no cover
     mailparse = None
 try:
     import html2text  # type: ignore[import-not-found]
-except ImportError:
+except ImportError:  # pragma: no cover
     html2text = None
 # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
         to_info = parsed_email.get("to")
         if to_info:
+            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
+                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+                metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
-                to_email = to_info.get("email", "")
+                metadata["email_to"] = to_info.get("email", "")
             else:
-                to_email = str(to_info)
-            metadata["email_to"] = to_email
-            text_parts.append(f"To: {to_email}")
+                metadata["email_to"] = str(to_info)
+            # For display, format all recipients
+            to_formatted = self._format_email_field(to_info)
+            text_parts.append(f"To: {to_formatted}")
         date = parsed_email.get("date")
         if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
         cc = parsed_email.get("cc")
         if cc:
             metadata["email_cc"] = cc
-            text_parts.append(f"CC: {cc}")
+            cc_formatted = self._format_email_field(cc)
+            text_parts.append(f"CC: {cc_formatted}")
         bcc = parsed_email.get("bcc")
         if bcc:
             metadata["email_bcc"] = bcc
-            text_parts.append(f"BCC: {bcc}")
+            bcc_formatted = self._format_email_field(bcc)
+            text_parts.append(f"BCC: {bcc_formatted}")
+    def _format_email_field(self, field: Any) -> str:
+        """Format email field (to, cc, bcc) for display."""
+        if isinstance(field, list):
+            emails = []
+            for item in field:
+                if isinstance(item, dict):
+                    email = item.get("email", "")
+                    if email:
+                        emails.append(email)
+                else:
+                    emails.append(str(item))
+            return ", ".join(emails)
+        if isinstance(field, dict):
+            return str(field.get("email", ""))
+        return str(field)
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         """Extract and process email body content."""

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -82,7 +82,7 @@ class PDFExtractor(Extractor):
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
                 result.tables = await extract_tables(path, self.config.gmft_config)
-            except ImportError:
+            except ImportError:  # pragma: no cover
                 result.tables = []
             # Enhance metadata with table information

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_extractors/_structured.py RENAMED Viewed

@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     try:
         import tomli as tomllib  # type: ignore[import-not-found]
-    except ImportError:
+    except ImportError:  # pragma: no cover
         tomllib = None
 try:
     import yaml
-except ImportError:
+except ImportError:  # pragma: no cover
     yaml = None
 from anyio import Path as AsyncPath

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_gmft.py RENAMED Viewed

@@ -265,7 +265,7 @@ async def extract_tables(
             finally:
                 await run_sync(doc.close)
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="gmft", functionality="table extraction", package_name="gmft"
             ) from e
@@ -379,7 +379,7 @@ def extract_tables_sync(
         finally:
             doc.close()  # type: ignore[no-untyped-call]
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_language_detection.py RENAMED Viewed

@@ -14,7 +14,7 @@ try:
     from fast_langdetect import detect, detect_multilingual
     HAS_FAST_LANGDETECT = True
-except ImportError:
+except ImportError:  # pragma: no cover
     HAS_FAST_LANGDETECT = False
     detect = None
     detect_multilingual = None

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_mcp/server.py RENAMED Viewed

@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
     return [TextContent(type="text", text=content)]
-def main() -> None:
+def main() -> None:  # pragma: no cover
     """Main entry point for the MCP server."""
     mcp.run()

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_base.py RENAMED Viewed

@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
         Returns:
             List of extraction result objects in the same order as input paths
         """
-        return [self.process_file_sync(path, **kwargs) for path in paths]
+        return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover
     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
         """Asynchronously process a batch of files and extract their text and metadata.
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
         tasks = [self.process_file(path, **kwargs) for path in paths]
-        return await run_taskgroup(*tasks)
+        return await run_taskgroup(*tasks)  # pragma: no cover
     def __hash__(self) -> int:
         """Hash function for allowing caching."""
-        return hash(type(self).__name__)
+        return hash(type(self).__name__)  # pragma: no cover

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_easyocr.py RENAMED Viewed

@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             import torch  # noqa: PLC0415
             return bool(torch.cuda.is_available())
-        except ImportError:
+        except ImportError:  # pragma: no cover
             return False
     @classmethod
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         try:
             import easyocr  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             ) from e
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         try:
             import easyocr  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             ) from e

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_ocr/_paddleocr.py RENAMED Viewed

@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         try:
             from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             ) from e
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         try:
             from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             ) from e

{kreuzberg-3.10.0 → kreuzberg-3.10.1}/kreuzberg/_playa.py RENAMED Viewed

@@ -143,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
             minute = date_str[10:12]
             second = date_str[12:14]
             time_part = f"T{hour}:{minute}:{second}"
-        return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y%m%d%H%M%S").isoformat()  # noqa: DTZ007
+        if time_part:
+            return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat()  # noqa: DTZ007
+        return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat()  # noqa: DTZ007
     return date_str

kreuzberg 3.10.0__tar.gz → 3.10.1__tar.gz

kreuzberg 3.10.0__tar.gz → 3.10.1__tar.gz