PyPI - kreuzberg - Versions diffs - 3.4.2__tar.gz → 3.6.0__tar.gz - Mend

kreuzberg 3.4.2__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (196) hide show

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/publish-docker.yml RENAMED Viewed

@@ -1,20 +1,13 @@
-# .github/workflows/publish-docker.yml
 name: Publish Docker Images
 on:
-  workflow_run:
-    workflows: ["Release"]
-    types:
-      - completed
-    branches:
-      - main
   workflow_dispatch:
 jobs:
   build-and-push:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
+    if: ${{ github.event_name == 'workflow_dispatch' }}
     permissions:
       contents: read
       packages: write
@@ -24,7 +17,7 @@ jobs:
         include:
           - name: core
             extras: ""
-            tag_suffix: "" # The base image tag (includes API + tesseract)
+            tag_suffix: ""
           - name: easyocr
             extras: "easyocr"
             tag_suffix: "-easyocr"
@@ -42,27 +35,14 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_branch || github.ref }}
+          ref: ${{ github.ref }}
       - name: Get release version
         id: get_version
         run: |
-          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-            # For manual dispatch, get the latest tag by listing all tags
-            git fetch --tags
-            VERSION=$(git tag --sort=-version:refname | head -n1)
-          else
-            # For workflow_run, use the head branch
-            VERSION="${{ github.event.workflow_run.head_branch }}"
-            # If triggered by a tag, extract version
-            if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
-              VERSION="$VERSION"
-            else
-              # Get the latest tag by listing all tags
-              git fetch --tags
-              VERSION=$(git tag --sort=-version:refname | head -n1)
-            fi
-          fi
+          # Get the latest tag by listing all tags
+          git fetch --tags
+          VERSION=$(git tag --sort=-version:refname | head -n1)
           echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
       - name: Set up QEMU
@@ -89,7 +69,7 @@ jobs:
             type=raw,value=latest${{ matrix.tag_suffix }}
       - name: Build and push Docker image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./.docker/Dockerfile

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/release.yaml RENAMED Viewed

@@ -29,3 +29,15 @@ jobs:
       - name: Publish
         uses: pypa/gh-action-pypi-publish@release/v1
+      - name: Trigger Docker Build
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            await github.rest.actions.createWorkflowDispatch({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              workflow_id: 'publish-docker.yml',
+              ref: 'main'
+            });

kreuzberg-3.6.0/.gitmodules ADDED Viewed

@@ -0,0 +1,3 @@
+[submodule "python-text-extraction-libs-benchmarks"]
+	path = python-text-extraction-libs-benchmarks
+	url = https://github.com/Goldziher/python-text-extraction-libs-benchmarks.git

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.4.2
+Version: 3.6.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 License-File: LICENSE
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
+Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
+Requires-Dist: keybert>=0.9.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
 Requires-Dist: rich>=14.0.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -54,8 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Provides-Extra: entity-extraction
+Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Provides-Extra: langdetect
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/ai-rulez.yaml RENAMED Viewed

@@ -1,6 +1,6 @@
 metadata:
   name: "Kreuzberg"
-  version: "3.4.0"
+  version: "3.5.0"
   description: "A text extraction library supporting PDFs, images, office documents and more"
 outputs:
@@ -115,6 +115,7 @@ rules:
       - **OCR Backends**: Pluggable OCR engines with separate configuration classes
       - **GMFT Integration**: Table extraction using GMFT library for PDFs
       - **Chunking**: Text splitting functionality in `_chunker.py`
+      - **Language Detection**: Automatic language detection using fast-langdetect
       - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
       - **API Server**: REST API using Litestar framework in `_api/main.py`
       - **CLI**: Command-line interface for batch processing and automation
@@ -144,6 +145,8 @@ rules:
       - Mock OCR responses for predictable testing
       - Both sync and async test variants
       - Comprehensive error case coverage
+      - OCR tests marked as `xfail` in CI environments for resilience
+      - Integration tests use timeouts and retry logic where appropriate
   - name: "Important Instructions"
     priority: 10
@@ -160,16 +163,17 @@ rules:
     priority: 6
     content: |
       ### GitHub Actions Workflows
-      - **Release**: Automated PyPI publishing via GitHub releases
-      - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
+      - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
+      - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
       - **Documentation**: Auto-deploy to GitHub Pages on docs changes
+      - **CI**: Comprehensive testing across multiple Python versions and platforms
       ### Docker Variants
-      - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
-      - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
-      - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
-      - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
-      - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
+      - **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
+      - **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
+      - **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
+      - **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
+      - **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
       ### Manual Triggers
       - Docker builds: `gh workflow run "Publish Docker Images"`
@@ -191,8 +195,9 @@ rules:
       chunking = ["semantic-text-splitter>=0.27.0"]
       easyocr = ["easyocr>=1.7.2"]
       gmft = ["gmft>=0.4.2"]
+      langdetect = ["fast-langdetect>=0.2.0"]
       paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
-      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
+      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
       ```
       ### Installation Patterns
@@ -207,6 +212,17 @@ rules:
       - **Development**: Uses dependency groups in pyproject.toml
 sections:
+  - title: "Language Detection"
+    content: |
+      ### Automatic Language Detection (v3.5.0+)
+      - **Feature**: Automatically detect languages in extracted text
+      - **Implementation**: Uses fast-langdetect library for high-performance detection
+      - **Configuration**:
+        - Enable with `auto_detect_language=True` in `ExtractionConfig`
+        - Configure via `LanguageDetectionConfig` for confidence thresholds
+      - **Output**: Results available in `ExtractionResult.detected_languages`
+      - **Integration**: Works with all extraction methods and file types
   - title: "Planned Features"
     content: |
       ### Structured Extraction (Issue #55)

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/benchmark_baseline.py RENAMED Viewed

@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
 from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
-async def run_baseline_benchmark() -> dict[str, object] | None:  # type: ignore[syntax]
+async def run_baseline_benchmark() -> dict[str, object] | None:
     """Run comprehensive baseline benchmark."""
     test_files_dir = Path("tests/test_source_files")
     test_files = list(test_files_dir.glob("*.pdf"))

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/final_benchmark.py RENAMED Viewed

@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
 )
-async def run_final_benchmark() -> dict[str, object] | None:  # type: ignore[syntax]
+async def run_final_benchmark() -> dict[str, object] | None:
     """Run comprehensive benchmark of all caching improvements."""
     test_files_dir = Path("tests/test_source_files")
     pdf_files = list(test_files_dir.glob("*.pdf"))

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/serialization_benchmark.py RENAMED Viewed

@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
     json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
     json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
-    # Type casting for arithmetic operations
     json_ser_mean = json_serialize["mean"]
     json_deser_mean = json_deserialize["mean"]
     msgpack_ser_mean = msgpack_serialize["mean"]

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/performance.md RENAMED Viewed

@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
 1. **Configure OCR appropriately** for your document types
 1. **Profile your specific workload** - results vary by content
-### Configuration Examples
+### Optimized Default Configuration
+Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
 ```python
-from kreuzberg import ExtractionConfig, extract_file_sync
-from kreuzberg._ocr import TesseractConfig
+from kreuzberg import ExtractionConfig
-# Optimized for speed
-fast_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=6))  # Assume uniform text block
+# Default configuration - already optimized for modern documents
+config = ExtractionConfig()  # Uses optimized defaults:
+# - PSM: AUTO_ONLY (fast without orientation detection)
+# - Language model: Disabled for performance
+# - Dictionary correction: Enabled for accuracy
+```
-# Optimized for accuracy
-accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1))  # Auto page segmentation
+### Advanced Configuration Examples
-# For simple documents (no OCR)
-text_only_config = ExtractionConfig(force_ocr=False, ocr_backend=None)
+```python
+from kreuzberg import ExtractionConfig, extract_file_sync
+from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
+# Maximum speed configuration (for high-volume processing)
+speed_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.SINGLE_BLOCK,  # Assume simple layout
+        language_model_ngram_on=False,  # Already disabled by default
+        tessedit_enable_dict_correction=False,  # Disable for maximum speed
+    ),
+)
+# Maximum accuracy configuration (for degraded documents)
+accuracy_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.AUTO,  # Full analysis with orientation detection
+        language_model_ngram_on=True,  # Enable for historical/degraded text
+        tessedit_enable_dict_correction=True,  # Default - keep enabled
+    ),
+)
+# No OCR configuration (text documents only)
+text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
 ```
+### Performance Optimization Tips
+Based on comprehensive benchmarking with 138+ documents:
+1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
+1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
+1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide 30x+ speedup with minimal quality impact on clean documents
+1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
 ### Batch Processing Best Practices
 ```python

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/types.md RENAMED Viewed

@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
 ::: kreuzberg.GMFTConfig
+## Entity Extraction Configuration
+Configuration options for spaCy-based entity extraction:
+::: kreuzberg.SpacyEntityExtractionConfig
+## Language Detection Configuration
+Configuration options for automatic language detection:
+::: kreuzberg.LanguageDetectionConfig
 ## PSMMode (Page Segmentation Mode)
 ::: kreuzberg.PSMMode
+## Entity
+Represents an extracted named entity:
+::: kreuzberg.Entity
 ## Metadata
 A TypedDict that contains optional metadata fields extracted from documents:

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/changelog.md RENAMED Viewed

@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Documentation site with comprehensive examples and API reference
 - Improved configuration for all OCR backends
 - Added hooks system for validation and post-processing
+- Language detection feature with `auto_detect_language` configuration option
+- New optional dependency group `langdetect` for automatic language detection
 ### Changed

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/examples/extraction-examples.md RENAMED Viewed

@@ -79,6 +79,47 @@ async def extract_with_different_backends():
     print(f"No OCR result: {result.content[:100]}...")
 ```
+## Language Detection
+```python
+from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
+async def detect_document_language():
+    # Simple automatic language detection
+    result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
+    # Access detected languages
+    if result.detected_languages:
+        print(f"Detected languages: {', '.join(result.detected_languages)}")
+        # Example output: "Detected languages: en, de, fr"
+async def detect_multilingual_document():
+    # Advanced multilingual detection with custom configuration
+    lang_config = LanguageDetectionConfig(
+        multilingual=True,  # Detect multiple languages in mixed text
+        top_k=5,  # Return top 5 languages
+        low_memory=False,  # Use high accuracy mode
+    )
+    result = await extract_file(
+        "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
+    )
+    if result.detected_languages:
+        print(f"Detected languages: {result.detected_languages}")
+        # Use detected languages for OCR
+        from kreuzberg import TesseractConfig
+        # Create language string for Tesseract (e.g., "eng+deu+fra")
+        tesseract_langs = "+".join(result.detected_languages[:3])
+        result_with_ocr = await extract_file(
+            "multilingual_document.pdf",
+            config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
+        )
+```
 ## Table Extraction
 ```python
@@ -148,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
             print(f"{key}: {value}")
 ```
+## Keywords
+Kreuzberg supports keywords and regex extraction as follows:
+```python
+from kreuzberg import ExtractionConfig, extract_file
+async def extract_keywords():
+    config = ExtractionConfig(
+        extract_keywords=True,
+        keyword_count=5,  # defaults to 10 if not set
+    )
+    result = await extract_file(
+        "document.pdf",
+        config=config,
+    )
+    print(f"Keywords: {result.keywords}")
+```
+## Entity and Keyword Extraction
+Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities like people, organizations, locations, and more, plus supports custom regex patterns:
+```python
+from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
+async def extract_entities_and_keywords():
+    # Basic extraction
+    config = ExtractionConfig(
+        extract_entities=True,
+        extract_keywords=True,
+        keyword_count=5,
+        custom_entity_patterns={
+            "INVOICE_ID": r"INV-\d+",
+            "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
+        },
+    )
+    result = await extract_file("document.pdf", config=config)
+    # Print extracted entities
+    if result.entities:
+        for entity in result.entities:
+            print(f"{entity.type}: {entity.text}")
+    # Print extracted keywords
+    if result.keywords:
+        for keyword, score in result.keywords:
+            print(f"Keyword: {keyword} (score: {score:.3f})")
+async def extract_multilingual_entities():
+    # Configure spaCy for multiple languages
+    spacy_config = SpacyEntityExtractionConfig(
+        language_models={
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+        },
+        fallback_to_multilingual=True,
+    )
+    config = ExtractionConfig(
+        auto_detect_language=True,  # Automatically detect document languages
+        extract_entities=True,
+        spacy_entity_extraction_config=spacy_config,
+    )
+    result = await extract_file("multilingual_document.pdf", config=config)
+    if result.detected_languages:
+        print(f"Detected languages: {result.detected_languages}")
+    if result.entities:
+        print(f"Extracted {len(result.entities)} entities")
+        for entity in result.entities:
+            print(f"  {entity.type}: {entity.text}")
+```
 ## Synchronous API
 For cases where async isn't needed or available:

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/installation.md RENAMED Viewed

@@ -102,6 +102,38 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
 pip install "kreuzberg[gmft]"
 ```
+### Language Detection
+Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
+```shell
+pip install "kreuzberg[langdetect]"
+```
+### Entity and Keyword Extraction
+Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
+```shell
+pip install "kreuzberg[entity-extraction]"
+```
+After installation, you'll need to download the spaCy language models you plan to use:
+```shell
+# Download English model (most common)
+python -m spacy download en_core_web_sm
+# Download other language models as needed
+python -m spacy download de_core_news_sm  # German
+python -m spacy download fr_core_news_sm  # French
+python -m spacy download es_core_news_sm  # Spanish
+```
+!!! note "Language Model Requirements"
+    spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
 ### All Optional Dependencies
 To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -113,5 +145,5 @@ pip install "kreuzberg[all]"
 This is equivalent to:
 ```shell
-pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
+pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
 ```

{kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/docker.md RENAMED Viewed

@@ -124,6 +124,34 @@ Additional dependencies by variant:
 - **gmft**: GMFT for table extraction
 - **all**: All optional dependencies
+### Health Check
+All Docker images include a health check endpoint:
+```bash
+# Check API health
+curl http://localhost:8000/health
+```
+Returns a JSON response with service status and version information.
+### Observability
+The Docker images include built-in OpenTelemetry instrumentation via Litestar:
+- **Tracing**: Automatic request/response tracing
+- **Metrics**: Performance and usage metrics
+- **Logging**: Structured JSON logging
+Configure via standard OpenTelemetry environment variables:
+```bash
+docker run -p 8000:8000 \
+  -e OTEL_SERVICE_NAME=kreuzberg-api \
+  -e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
+  goldziher/kreuzberg:latest
+```
 ### Environment Variables
 - `PYTHONUNBUFFERED=1` - Ensures proper logging output
@@ -150,6 +178,12 @@ server {
         client_max_body_size 100M;
         proxy_read_timeout 300s;
     }
+    # Health check endpoint
+    location /health {
+        proxy_pass http://localhost:8000/health;
+        access_log off;
+    }
 }
 ```
@@ -175,6 +209,21 @@ spec:
         image: goldziher/kreuzberg:latest
         ports:
         - containerPort: 8000
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 5
+          periodSeconds: 5
+        env:
+        - name: OTEL_SERVICE_NAME
+          value: "kreuzberg-api"
         resources:
           requests:
             memory: "512Mi"

kreuzberg 3.4.2__tar.gz → 3.6.0__tar.gz

kreuzberg 3.4.2__tar.gz → 3.6.0__tar.gz