PyPI - kreuzberg - Versions diffs - 3.10.1__tar.gz → 3.11.1__tar.gz - Mend

kreuzberg 3.10.1__tar.gz → 3.11.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221) hide show

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/ci.yaml RENAMED Viewed

@@ -15,7 +15,7 @@ jobs:
     timeout-minutes: 10
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - name: Install uv
         uses: astral-sh/setup-uv@v6
@@ -58,7 +58,7 @@ jobs:
     timeout-minutes: 20
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - name: Install uv
         uses: astral-sh/setup-uv@v6
@@ -151,7 +151,7 @@ jobs:
     timeout-minutes: 30
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - name: Install uv
         uses: astral-sh/setup-uv@v6

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/docs.yml RENAMED Viewed

@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/pr-title.yaml RENAMED Viewed

@@ -15,6 +15,6 @@ jobs:
     name: Validate PR title
     runs-on: ubuntu-latest
     steps:
-      - uses: amannn/action-semantic-pull-request@v5
+      - uses: amannn/action-semantic-pull-request@v6
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/publish-docker.yml RENAMED Viewed

@@ -46,7 +46,7 @@ jobs:
           df -h
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           ref: ${{ github.ref }}

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.github/workflows/release.yaml RENAMED Viewed

@@ -13,7 +13,7 @@ jobs:
       contents: read
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - name: Install uv
         uses: astral-sh/setup-uv@v6

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -5,13 +5,15 @@ repos:
       - id: commitlint
         stages: [commit-msg]
         additional_dependencies: ["@commitlint/config-conventional"]
-  - repo: https://github.com/Goldziher/ai-rulez
-    rev: v1.1.4
-    hooks:
-      - id: ai-rulez-validate
-      - id: ai-rulez-generate
+  # Temporarily disabled - ai-rulez Go build failing in CI
+  # TODO: Re-enable once ai-rulez v1.4.4+ Python migration is stable
+  # - repo: https://github.com/Goldziher/ai-rulez
+  #   rev: v1.4.3
+  #   hooks:
+  #     - id: ai-rulez-validate
+  #     - id: ai-rulez-generate
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: name-tests-test
         args:
@@ -53,7 +55,7 @@ repos:
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.7
+    rev: v0.12.8
     hooks:
       - id: ruff
         args: ["--fix", "--unsafe-fixes"]

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.10.1
+Version: 3.11.1
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing :: General
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
-Requires-Dist: anyio>=4.9.0
+Requires-Dist: anyio>=4.10.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
 Requires-Dist: html-to-markdown[lxml]>=1.9.0
-Requires-Dist: mcp>=1.12.2
+Requires-Dist: mcp>=1.12.4
 Requires-Dist: msgspec>=0.18.0
-Requires-Dist: playa-pdf>=0.6.4
+Requires-Dist: playa-pdf>=0.7.0
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
@@ -45,25 +45,24 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: deep-translator>=1.11.4; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
 Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
 Requires-Dist: keybert>=0.9.0; extra == 'all'
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
-Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
+Requires-Dist: pandas>=2.3.1; extra == 'all'
+Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
-Provides-Extra: auto-classify-document-type
-Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
-Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -71,7 +70,10 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
 Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: crypto
-Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
+Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
+Provides-Extra: document-classification
+Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
+Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/contributing.md RENAMED Viewed

@@ -34,7 +34,7 @@ All commands run through `uv run`:
 # Testing
 uv run pytest                      # Run all tests
 uv run pytest tests/foo_test.py    # Run specific test
-uv run pytest --cov                # With coverage (must be ≥95%)
+uv run pytest --cov                # With coverage (must be ≥85%)
 # Code quality
 uv run ruff format                 # Format code

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/examples/extraction-examples.md RENAMED Viewed

@@ -132,15 +132,15 @@ async def extract_tables_from_pdf():
     # Process extracted tables
     print(f"Found {len(result.tables)} tables")
     for i, table in enumerate(result.tables):
-        print(f"Table {i+1} on page {table.page_number}:")
-        print(table.text)  # Markdown formatted table
+        print(f"Table {i+1} on page {table['page_number']}:")
+        print(table["text"])  # Markdown formatted table
         # Work with the pandas DataFrame
-        df = table.df
+        df = table["df"]
         print(f"Table shape: {df.shape}")
         # The cropped table image is also available
-        # table.cropped_image.save(f"table_{i+1}.png")
+        # table['cropped_image'].save(f"table_{i+1}.png")
     # With custom GMFT configuration
     custom_config = ExtractionConfig(

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/getting-started/installation.md RENAMED Viewed

@@ -134,6 +134,16 @@ python -m spacy download es_core_news_sm  # Spanish
     spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
+### Document Classification
+For automatic document type detection (invoice, contract, receipt, etc.), install the document classification extra:
+```shell
+pip install "kreuzberg[document-classification]"
+```
+This feature uses Google Translate for multi-language support and requires explicit opt-in by setting `auto_detect_document_type=True` in your configuration.
 ### All Optional Dependencies
 To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -145,5 +155,5 @@ pip install "kreuzberg[all]"
 This is equivalent to:
 ```shell
-pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
+pip install "kreuzberg[chunking,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
 ```

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/index.md RENAMED Viewed

@@ -22,7 +22,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
 ### Engineering Principles
-- **Test Coverage**: 95%+ coverage with comprehensive test suites
+- **Test Coverage**: Comprehensive test suites ensuring code reliability
 - **API Design**: True async/await implementation alongside synchronous APIs
 - **Error Handling**: Consistent exception hierarchy with detailed context
 - **Type Safety**: Full type annotations for enhanced developer experience

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/document-classification.md RENAMED Viewed

@@ -2,9 +2,17 @@
 Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
+## Installation
+Document classification requires the `document-classification` extra to be installed:
+```bash
+pip install "kreuzberg[document-classification]"
+```
 ## Enabling Document Classification
-To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
+Document classification is disabled by default. To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
 ```python
 from kreuzberg import ExtractionConfig, extract_file

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/docs/user-guide/extraction-configuration.md RENAMED Viewed

@@ -237,10 +237,10 @@ result = await extract_file("document_with_tables.pdf", config=config)
 # Access extracted tables
 for i, table in enumerate(result.tables):
-    print(f"Table {i+1} on page {table.page_number}:")
-    print(table.text)  # Markdown formatted table text
+    print(f"Table {i+1} on page {table['page_number']}:")
+    print(table["text"])  # Markdown formatted table text
     # You can also access the pandas DataFrame directly
-    df = table.df
+    df = table["df"]
     print(df.shape)  # (rows, columns)
 ```

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_config.py RENAMED Viewed

@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
     if not isinstance(backend_config, dict):
         return None
-    if backend == "tesseract":
-        # Convert psm integer to PSMMode enum if needed
-        processed_config = backend_config.copy()
-        if "psm" in processed_config and isinstance(processed_config["psm"], int):
-            from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
-            processed_config["psm"] = PSMMode(processed_config["psm"])
-        return TesseractConfig(**processed_config)
-    if backend == "easyocr":
-        return EasyOCRConfig(**backend_config)
-    if backend == "paddleocr":
-        return PaddleOCRConfig(**backend_config)
-    return None
+    match backend:
+        case "tesseract":
+            # Convert psm integer to PSMMode enum if needed
+            processed_config = backend_config.copy()
+            if "psm" in processed_config and isinstance(processed_config["psm"], int):
+                from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
+                processed_config["psm"] = PSMMode(processed_config["psm"])
+            return TesseractConfig(**processed_config)
+        case "easyocr":
+            return EasyOCRConfig(**backend_config)
+        case "paddleocr":
+            return PaddleOCRConfig(**backend_config)
+        case _:
+            return None
 def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
         "document_classification_mode",
         "keyword_count",
     }
-    extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
+    extraction_config = extraction_config | {
+        field: config_dict[field] for field in basic_fields if field in config_dict
+    }
     # Handle OCR backend configuration
     ocr_backend = extraction_config.get("ocr_backend")

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_document_classification.py RENAMED Viewed

@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
         from deep_translator import GoogleTranslator  # noqa: PLC0415
     except ImportError as e:  # pragma: no cover
         raise MissingDependencyError(
-            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
+            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
         ) from e
     try:

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_base.py RENAMED Viewed

@@ -116,8 +116,7 @@ class Extractor(ABC):
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
         # Add quality metadata
-        enhanced_metadata = dict(result.metadata) if result.metadata else {}
-        enhanced_metadata["quality_score"] = quality_score
+        enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
         # Return enhanced result
         return ExtractionResult(

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_image.py RENAMED Viewed

@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
         backend = get_ocr_backend(self.config.ocr_backend)
-        if self.config.ocr_backend == "tesseract":
-            config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-            )
-            result = backend.process_file_sync(path, **asdict(config))
-        elif self.config.ocr_backend == "paddleocr":
-            paddle_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-            )
-            result = backend.process_file_sync(path, **asdict(paddle_config))
-        elif self.config.ocr_backend == "easyocr":
-            easy_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-            )
-            result = backend.process_file_sync(path, **asdict(easy_config))
-        else:
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(config))
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(paddle_config))
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(easy_config))
+            case _:
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
         return self._apply_quality_processing(result)
     def _get_extension_from_mime_type(self, mime_type: str) -> str:

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
             # Enhance metadata with table information
             if result.tables:
                 table_summary = generate_table_summary(result.tables)
-                result.metadata.update(
-                    {
-                        "table_count": table_summary["table_count"],
-                        "tables_summary": f"Document contains {table_summary['table_count']} tables "
-                        f"across {table_summary['pages_with_tables']} pages with "
-                        f"{table_summary['total_rows']} total rows",
-                    }
-                )
+                result.metadata = result.metadata | {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
         return self._apply_quality_processing(result)
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
         # Enhance metadata with table information
         if tables:
             table_summary = generate_table_summary(tables)
-            result.metadata.update(
-                {
-                    "table_count": table_summary["table_count"],
-                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
-                    f"across {table_summary['pages_with_tables']} pages with "
-                    f"{table_summary['total_rows']} total rows",
-                }
-            )
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }
         # Apply quality processing
         return self._apply_quality_processing(result)
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
         backend = get_ocr_backend(self.config.ocr_backend)
         paths = [Path(p) for p in image_paths]
-        if self.config.ocr_backend == "tesseract":
-            config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(config))
-        elif self.config.ocr_backend == "paddleocr":
-            paddle_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(paddle_config))
-        elif self.config.ocr_backend == "easyocr":
-            easy_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(easy_config))
-        else:
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(config))
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(paddle_config))
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(easy_config))
+            case _:
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
         # Use list comprehension and join for efficient string building
         return "\n\n".join(result.content for result in results)

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_mcp/server.py RENAMED Viewed

@@ -51,7 +51,7 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
     # Override with provided parameters
-    config_dict.update(kwargs)
+    config_dict = config_dict | kwargs
     return ExtractionConfig(**config_dict)

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_easyocr.py RENAMED Viewed

@@ -4,7 +4,6 @@ import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
-import numpy as np
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.pop("language", None)
         kwargs.pop("use_gpu", None)
+        kwargs.pop("device", None)
+        kwargs.pop("gpu_memory_limit", None)
+        kwargs.pop("fallback_to_cpu", None)
         try:
             result = await run_sync(
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
+        import numpy as np  # noqa: PLC0415
         self._init_easyocr_sync(**kwargs)
         beam_width = kwargs.pop("beam_width")
         kwargs.pop("language", None)
         kwargs.pop("use_gpu", None)
+        kwargs.pop("device", None)
+        kwargs.pop("gpu_memory_limit", None)
+        kwargs.pop("fallback_to_cpu", None)
         try:
             result = self._reader.readtext(

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_ocr/_paddleocr.py RENAMED Viewed

@@ -7,7 +7,6 @@ from importlib.util import find_spec
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
-import numpy as np
 from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
+        import numpy as np  # noqa: PLC0415
         self._init_paddle_ocr_sync(**kwargs)
         if image.mode != "RGB":

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/kreuzberg/_types.py RENAMED Viewed

@@ -349,7 +349,7 @@ class ExtractionConfig:
     """Configuration for language detection. If None, uses default settings."""
     spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
     """Configuration for spaCy entity extraction. If None, uses default settings."""
-    auto_detect_document_type: bool = True
+    auto_detect_document_type: bool = False
     """Whether to automatically detect the document type."""
     document_type_confidence_threshold: float = 0.5
     """Confidence threshold for document type detection."""
@@ -398,15 +398,16 @@ class ExtractionConfig:
             return asdict(self.ocr_config)
         # Lazy load and cache default configs instead of creating new instances
-        if self.ocr_backend == "tesseract":
-            from kreuzberg._ocr._tesseract import TesseractConfig  # noqa: PLC0415
+        match self.ocr_backend:
+            case "tesseract":
+                from kreuzberg._ocr._tesseract import TesseractConfig  # noqa: PLC0415
-            return asdict(TesseractConfig())
-        if self.ocr_backend == "easyocr":
-            from kreuzberg._ocr._easyocr import EasyOCRConfig  # noqa: PLC0415
+                return asdict(TesseractConfig())
+            case "easyocr":
+                from kreuzberg._ocr._easyocr import EasyOCRConfig  # noqa: PLC0415
-            return asdict(EasyOCRConfig())
-        # paddleocr
-        from kreuzberg._ocr._paddleocr import PaddleOCRConfig  # noqa: PLC0415
+                return asdict(EasyOCRConfig())
+            case _:  # paddleocr or any other backend
+                from kreuzberg._ocr._paddleocr import PaddleOCRConfig  # noqa: PLC0415
-        return asdict(PaddleOCRConfig())
+                return asdict(PaddleOCRConfig())

{kreuzberg-3.10.1 → kreuzberg-3.11.1}/mkdocs.yaml RENAMED Viewed

@@ -158,4 +158,3 @@ nav:
       - Custom Hooks: advanced/custom-hooks.md
       - Custom Extractors: advanced/custom-extractors.md
   - Contributing: contributing.md
-  - Changelog: changelog.md

kreuzberg 3.10.1__tar.gz → 3.11.1__tar.gz

kreuzberg 3.10.1__tar.gz → 3.11.1__tar.gz