PyPI - kreuzberg - Versions diffs - 3.8.2__tar.gz → 3.9.1__tar.gz - Mend

kreuzberg 3.8.2__tar.gz → 3.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221) hide show

kreuzberg-3.9.1/.deepsource.toml ADDED Viewed

@@ -0,0 +1,54 @@
+version = 1
+test_patterns = ["tests/**"]
+exclude_patterns = [
+    # Virtual environments
+    ".venv/**",
+    "venv/**",
+    # Build and distribution artifacts
+    "dist/**",
+    "build/**",
+    "*.egg-info/**",
+    # Documentation
+    "docs/**",
+    "site/**",
+    # Cache directories
+    "**/__pycache__/**",
+    ".pytest_cache/**",
+    ".mypy_cache/**",
+    ".ruff_cache/**",
+    ".coverage",
+    "htmlcov/**",
+    # Benchmarks and performance tests
+    "benchmarks/**",
+    # IDE and editor files
+    ".idea/**",
+    ".vscode/**",
+    # Version control
+    ".git/**",
+    # Temporary and generated files
+    "*.pyc",
+    ".DS_Store",
+    "*.swp",
+    "*.swo",
+]
+[[analyzers]]
+name = "test-coverage"
+[[analyzers]]
+name = "python"
+[analyzers.meta]
+runtime_version = "3.x.x"
+[[transformers]]
+name = "ruff"

kreuzberg-3.9.1/.github/workflows/ci.yaml ADDED Viewed

@@ -0,0 +1,197 @@
+name: CI
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+      - feat/smart-multiprocessing
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"
+      - name: Install Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
+              echo "Removing existing .venv directory on Windows"
+              rm -rf .venv
+            fi
+            uv sync --all-packages --all-extras --dev
+          shell: bash
+      - name: Load Cached Pre-Commit Dependencies
+        id: cached-pre-commit-dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pre-commit/
+          key: pre-commit|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
+      - name: Execute Pre-Commit
+        run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
+  test:
+    strategy:
+      matrix:
+        os: [ ubuntu-latest, macOS-latest, windows-latest ]
+        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Install Python
+        uses: actions/setup-python@v5
+        id: setup-python
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Cache Python Dependencies
+        id: python-cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/uv
+            .venv
+          key: python-dependencies-${{ matrix.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('uv.lock') }}
+          restore-keys: |
+            python-dependencies-${{ matrix.os }}-${{ matrix.python }}-
+      - name: Install Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            if [[ "${{ runner.os }}" == "Windows" ]] && [[ -d ".venv" ]]; then
+              echo "Removing existing .venv directory on Windows"
+              rm -rf .venv
+            fi
+            uv sync --all-packages --all-extras --dev
+          shell: bash
+      - name: Cache Test Artifacts
+        uses: actions/cache@v4
+        with:
+          path: .pytest_cache/
+          key: pytest-cache-${{ matrix.os }}-${{ matrix.python }}
+      - name: Cache and Install Homebrew (macOS)
+        if: runner.os == 'macOS'
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 10
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            # Using the underlying homebrew commands instead of the action
+            brew update || true
+            brew install tesseract tesseract-lang pandoc || brew upgrade tesseract tesseract-lang pandoc || true
+            brew list tesseract tesseract-lang pandoc
+          shell: bash
+      - name: Cache and Install APT Packages (Linux)
+        if: runner.os == 'Linux'
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
+          shell: bash
+      - name: Install System Dependencies (Windows)
+        if: runner.os == 'Windows'
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 10
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            choco install -y tesseract pandoc --no-progress
+            Write-Output "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+            Write-Output "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+            $env:PATH = "C:\Program Files\Tesseract-OCR;C:\Program Files\Pandoc;" + $env:PATH
+            tesseract --version
+            pandoc --version
+          shell: pwsh
+      - name: Clean Coverage Data
+        run: |
+          rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
+        shell: bash
+      - name: Run Tests with Coverage
+        run: |
+          uv run coverage erase
+          uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
+      - name: Upload Coverage Artifacts
+        if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report
+          path: coverage.lcov
+          retention-days: 1
+  upload-coverage:
+    needs: test
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+      - name: Download Coverage Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: coverage-report
+          path: .
+      - name: Install DeepSource CLI
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 3
+          max_attempts: 3
+          retry_wait_seconds: 10
+          command: |
+            curl -fsSL https://deepsource.io/cli | sh
+          shell: bash
+      - name: Upload Coverage to DeepSource
+        env:
+          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
+        run: |
+          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/.gitignore RENAMED Viewed

@@ -1,5 +1,6 @@
 *$py.class
 *.Cache
+.clause/
 *.cscfg
 *.egg-info/
 *.log
@@ -9,6 +10,8 @@
 *temp/
 .coverage
 .coverage*
+coverage.lcov
+htmlcov/
 .cursorrules
 .dist/
 .DS_store

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -53,7 +53,7 @@ repos:
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+    rev: v0.12.5
     hooks:
       - id: ruff
         args: ["--fix", "--unsafe-fixes"]

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.8.2
+Version: 3.9.1
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 License-File: LICENSE
-Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
+Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
 Requires-Dist: anyio>=4.9.0
-Requires-Dist: chardetng-py>=0.3.4
+Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.8.0
-Requires-Dist: mcp>=1.11.0
+Requires-Dist: html-to-markdown[lxml]>=1.9.0
+Requires-Dist: mcp>=1.12.2
 Requires-Dist: msgspec>=0.18.0
-Requires-Dist: playa-pdf>=0.6.1
+Requires-Dist: playa-pdf>=0.6.4
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
@@ -53,18 +53,21 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
-Requires-Dist: rich>=14.0.0; extra == 'all'
+Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
+Provides-Extra: auto-classify-document-type
+Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
+Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
 Requires-Dist: click>=8.2.1; extra == 'cli'
-Requires-Dist: rich>=14.0.0; extra == 'cli'
+Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
@@ -88,7 +91,7 @@ Description-Content-Type: text/markdown
 [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
 [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
+[![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
 **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
@@ -103,6 +106,7 @@ Description-Content-Type: text/markdown
 - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
 - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
 - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
+- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
 ### Technical Architecture
@@ -126,14 +130,14 @@ Kreuzberg leverages established open source technologies:
 ### Extract Text with CLI
 ```bash
-# Extract text from any file to markdown
-uvx kreuzberg extract document.pdf > output.md
+# Extract text from any file to text format
+uvx kreuzberg extract document.pdf > output.txt
 # With all features (OCR, table extraction, etc.)
-uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
+uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
 # Extract with rich metadata
-uvx kreuzberg extract report.pdf --show-metadata --format json
+uvx kreuzberg extract report.pdf --show-metadata --output-format json
 ```
 ### Python Usage

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/README.md RENAMED Viewed

@@ -5,7 +5,7 @@
 [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
 [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
+[![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
 **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
@@ -20,6 +20,7 @@
 - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
 - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
 - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
+- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
 ### Technical Architecture
@@ -43,14 +44,14 @@ Kreuzberg leverages established open source technologies:
 ### Extract Text with CLI
 ```bash
-# Extract text from any file to markdown
-uvx kreuzberg extract document.pdf > output.md
+# Extract text from any file to text format
+uvx kreuzberg extract document.pdf > output.txt
 # With all features (OCR, table extraction, etc.)
-uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
+uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
 # Extract with rich metadata
-uvx kreuzberg extract report.pdf --show-metadata --format json
+uvx kreuzberg extract report.pdf --show-metadata --output-format json
 ```
 ### Python Usage

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/end_to_end_benchmark.py RENAMED Viewed

@@ -43,7 +43,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
     print(f"Tables: {len(cold_result.tables)}")
     print(f"Chunks: {len(cold_result.chunks)}")
-    from kreuzberg._utils._cache import (
+    from kreuzberg._utils._cache import (  # noqa: PLC0415
         get_ocr_cache,
         get_table_cache,
         get_mime_cache,

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/benchmarks/statistical_benchmark.py RENAMED Viewed

@@ -130,7 +130,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
         f"  Cache consistency: {'✅ STABLE' if warm_clean_stdev / warm_clean_mean < 0.1 else '⚠️ VARIABLE'}"
     )
-    from kreuzberg._utils._cache import (
+    from kreuzberg._utils._cache import (  # noqa: PLC0415
         get_ocr_cache,
         get_table_cache,
         get_mime_cache,

kreuzberg-3.9.1/docs/changelog.md ADDED Viewed

@@ -0,0 +1,49 @@
+# Changelog
+All notable changes to Kreuzberg will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [3.9.0] - 2025-01-17
+### Added
+- Automatic Document Type Detection (#88) - A new feature for classifying documents into categories (contract, form, invoice, receipt, report)
+    - Integration with Google Translate for multi-language support
+    - New optional dependency group `auto-classify-document-type` with `deep-translator` and `pandas`
+    - Comprehensive tests and documentation
+- DeepSource integration for code quality analysis
+### Fixed
+- PDF extraction handling when no OCR backend is available
+- Entity extraction test updated to use frozenset of tuples
+- Config handling for dataclasses with `slots=True` - replaced `config.__dict__` with `asdict(config)`
+- Coverage configuration and cleanup issues
+### Changed
+- CI/CD: Added retry logic for flaky steps across all platforms
+- Improved coverage gathering and cleanup in test runs
+- Updated dependencies in `uv.lock`
+## [3.8.2] - Previous Release
+### Added
+- Documentation site with comprehensive examples and API reference
+- Improved configuration for all OCR backends
+- Added hooks system for validation and post-processing
+- Language detection feature with `auto_detect_language` configuration option
+- New optional dependency group `langdetect` for automatic language detection
+### Changed
+- Refactored internal structure for better maintainability
+- Updated extraction functions to use config object instead of kwargs
+- Improved error messages and reporting
+## Previous Versions
+For a complete history of changes, please refer to the [GitHub releases page](https://github.com/Goldziher/kreuzberg/releases).

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/index.md RENAMED Viewed

@@ -49,6 +49,7 @@ Kreuzberg addresses the complete document intelligence pipeline through a modula
 - **OCR Engines**: Tesseract (default), EasyOCR, PaddleOCR with automatic fallback strategies
 - **Data Extraction**: Text content, document metadata, table structures, and embedded resources
 - **Processing Capabilities**: Content chunking for RAG pipelines, language detection, format preservation
+- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
 - **Extensibility**: Plugin architecture for custom extractors and hooks
 ## Architecture Philosophy

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/basic-usage.md RENAMED Viewed

@@ -131,3 +131,31 @@ async def show_metadata():
 asyncio.run(show_metadata())
 ```
+## Document Classification
+Kreuzberg can automatically classify documents into categories (contracts, forms, invoices, receipts, reports):
+```python
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+async def classify_document():
+    config = ExtractionConfig(
+        auto_detect_document_type=True,
+        document_classification_mode="text",  # or "vision" for better accuracy
+        type_confidence_threshold=0.5,
+    )
+    result = await extract_file("invoice.pdf", config=config)
+    # Access classification results
+    if result.document_type:
+        print(f"Document type: {result.document_type}")
+        print(f"Confidence: {result.type_confidence:.2%}")
+    # The extracted content is still available
+    print(f"Content: {result.content[:200]}...")
+asyncio.run(classify_document())
+```

kreuzberg-3.9.1/docs/user-guide/document-classification.md ADDED Viewed

@@ -0,0 +1,53 @@
+# Automatic Document Classification
+Kreuzberg can automatically classify documents into common types like invoices, contracts, and receipts. This allows you to build custom processing pipelines tailored to each document type.
+## Enabling Document Classification
+To enable this feature, set `auto_detect_document_type=True` in your `ExtractionConfig`:
+```python
+from kreuzberg import ExtractionConfig, extract_file
+config = ExtractionConfig(auto_detect_document_type=True)
+result = await extract_file("path/to/your/document.pdf", config=config)
+if result.document_type:
+    print(f"Detected document type: {result.document_type}")
+    print(f"Confidence: {result.document_type_confidence:.2f}")
+```
+## Classification Modes
+You can choose between two classification modes using the `document_classification_mode` parameter in `ExtractionConfig`:
+- `"text"` (default): This mode uses a rule-based classifier that analyzes the extracted text for keywords and patterns. It's fast and works well for text-based documents.
+- `"vision"`: This mode uses layout information from OCR to identify document types. It's more accurate for scanned documents and images, but it requires the Tesseract OCR backend.
+Here's how to use the vision-based classifier:
+```python
+config = ExtractionConfig(
+    auto_detect_document_type=True,
+    document_classification_mode="vision",
+    force_ocr=True,  # Recommended for vision-based classification
+)
+```
+## Confidence Threshold
+You can control the minimum confidence required for a classification to be considered valid by setting the `type_confidence_threshold` in `ExtractionConfig`. The default value is `0.7`.
+```python
+config = ExtractionConfig(
+    auto_detect_document_type=True,
+    type_confidence_threshold=0.85,  # Require 85% confidence
+)
+```
+## Output
+The classification results are available in the `ExtractionResult` object:
+- `document_type`: The detected document type (e.g., `"invoice"`, `"contract"`) or `None` if no type was detected with sufficient confidence.
+- `type_confidence`: The confidence score of the detection (a float between 0.0 and 1.0) or `None`.

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/extraction-configuration.md RENAMED Viewed

@@ -31,6 +31,9 @@ max_chars = 2000
 max_overlap = 100
 ocr_backend = "tesseract"
 auto_detect_language = true
+auto_detect_document_type = true
+document_classification_mode = "text"  # or "vision"
+type_confidence_threshold = 0.5
 # Tesseract OCR configuration
 [tesseract]
@@ -76,6 +79,9 @@ force_ocr = false
 chunk_content = true
 extract_tables = true
 auto_detect_language = true
+auto_detect_document_type = true
+document_classification_mode = "text"
+type_confidence_threshold = 0.5
 [tool.kreuzberg.tesseract]
 language = "eng"

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/docs/user-guide/index.md RENAMED Viewed

@@ -8,6 +8,7 @@ This guide provides comprehensive documentation for the Kreuzberg document intel
 - [Extraction Configuration](extraction-configuration.md) - Configure the extraction process ([API](../api-reference/types.md#extractionconfig))
 - [Metadata Extraction](metadata-extraction.md) - Document metadata extraction ([API](../api-reference/types.md#metadata))
 - [Content Chunking](chunking.md) - Split documents into manageable chunks
+- [Document Classification](document-classification.md) - Automatic document type detection
 - [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
 - [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines
 - [Supported Formats](supported-formats.md) - All supported document formats

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_chunker.py RENAMED Viewed

@@ -2,9 +2,9 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
-from kreuzberg import MissingDependencyError
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from semantic_text_splitter import MarkdownSplitter, TextSplitter
@@ -36,11 +36,11 @@ def get_chunker(
     if key not in _chunkers:
         try:
             if mime_type == MARKDOWN_MIME_TYPE:
-                from semantic_text_splitter import MarkdownSplitter
+                from semantic_text_splitter import MarkdownSplitter  # noqa: PLC0415
                 _chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
             else:
-                from semantic_text_splitter import TextSplitter
+                from semantic_text_splitter import TextSplitter  # noqa: PLC0415
                 _chunkers[key] = TextSplitter(max_characters, overlap_characters)
         except ImportError as e:

{kreuzberg-3.8.2 → kreuzberg-3.9.1}/kreuzberg/_config.py RENAMED Viewed

@@ -95,7 +95,7 @@ def parse_ocr_backend_config(
         # Convert psm integer to PSMMode enum if needed
         processed_config = backend_config.copy()
         if "psm" in processed_config and isinstance(processed_config["psm"], int):
-            from kreuzberg._ocr._tesseract import PSMMode
+            from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
             processed_config["psm"] = PSMMode(processed_config["psm"])
         return TesseractConfig(**processed_config)

kreuzberg 3.8.2__tar.gz → 3.9.1__tar.gz

kreuzberg 3.8.2__tar.gz → 3.9.1__tar.gz