PyPI - kreuzberg - Versions diffs - 3.4.0__tar.gz → 3.4.2__tar.gz - Mend

kreuzberg 3.4.0__tar.gz → 3.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (188)

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/ci.yaml RENAMED Viewed

@@ -50,7 +50,7 @@ jobs:
     strategy:
       matrix:
         os: [ ubuntu-latest, macOS-latest, windows-latest ]
-        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.9", "3.10", "3.11", "3.12", "3.13"]') }}
+        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
     runs-on: ${{ matrix.os }}
     timeout-minutes: 30
     steps:

kreuzberg-3.4.2/.github/workflows/docs.yml ADDED Viewed

@@ -0,0 +1,66 @@
+name: Deploy Documentation
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'docs/**'
+      - 'mkdocs.yaml'
+      - '.github/workflows/docs.yml'
+  workflow_dispatch:
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Install dependencies
+        run: |
+          uv sync --group doc
+      - name: Setup Pages
+        uses: actions/configure-pages@v5
+      - name: Build documentation
+        run: |
+          uv run mkdocs build --clean --strict
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./site
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/.github/workflows/publish-docker.yml RENAMED Viewed

@@ -9,11 +9,12 @@ on:
       - completed
     branches:
       - main
+  workflow_dispatch:
 jobs:
   build-and-push:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
     permissions:
       contents: read
       packages: write
@@ -41,20 +42,28 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_branch }}
+          ref: ${{ github.event.workflow_run.head_branch || github.ref }}
       - name: Get release version
         id: get_version
         run: |
-          echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
-          # If triggered by a tag, extract version
-          if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
-            echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
-          else
-            # Get the latest tag
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            # For manual dispatch, get the latest tag by listing all tags
             git fetch --tags
-            echo "VERSION=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+            VERSION=$(git tag --sort=-version:refname | head -n1)
+          else
+            # For workflow_run, use the head branch
+            VERSION="${{ github.event.workflow_run.head_branch }}"
+            # If triggered by a tag, extract version
+            if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
+              VERSION="$VERSION"
+            else
+              # Get the latest tag by listing all tags
+              git fetch --tags
+              VERSION=$(git tag --sort=-version:refname | head -n1)
+            fi
           fi
+          echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
@@ -94,6 +103,7 @@ jobs:
       - name: Update Docker Hub README
         uses: peter-evans/dockerhub-description@v4
         if: matrix.name == 'core'
+        continue-on-error: true
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/.gitignore RENAMED Viewed

@@ -32,3 +32,4 @@ docker-compose.yaml
 GEMINI.md
 prompt_template.egg-info/
 requirements.txt
+site/

kreuzberg-3.4.2/PKG-INFO ADDED Viewed

@@ -0,0 +1,232 @@
+Metadata-Version: 2.4
+Name: kreuzberg
+Version: 3.4.2
+Summary: A text extraction library supporting PDFs, images, office documents and more
+Project-URL: homepage, https://github.com/Goldziher/kreuzberg
+Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: General
+Classifier: Topic :: Utilities
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Requires-Dist: anyio>=4.9.0
+Requires-Dist: charset-normalizer>=3.4.2
+Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
+Requires-Dist: html-to-markdown>=1.4.0
+Requires-Dist: msgspec>=0.18.0
+Requires-Dist: playa-pdf>=0.6.1
+Requires-Dist: psutil>=7.0.0
+Requires-Dist: pypdfium2==4.30.0
+Requires-Dist: python-calamine>=0.3.2
+Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
+Provides-Extra: all
+Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: gmft>=0.4.2; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
+Requires-Dist: paddleocr>=3.1.0; extra == 'all'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: rich>=14.0.0; extra == 'all'
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
+Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
+Provides-Extra: api
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
+Provides-Extra: chunking
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
+Provides-Extra: cli
+Requires-Dist: click>=8.2.1; extra == 'cli'
+Requires-Dist: rich>=14.0.0; extra == 'cli'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
+Provides-Extra: easyocr
+Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Provides-Extra: gmft
+Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Provides-Extra: paddleocr
+Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
+Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
+Description-Content-Type: text/markdown
+# Kreuzberg
+[![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
+[![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
+[![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
+📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
+## Why Kreuzberg?
+- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
+- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
+- **⚡ Dual APIs**: Only library with both sync and async support
+- **🔧 Zero Configuration**: Works out of the box with sane defaults
+- **🏠 Local Processing**: No cloud dependencies or external API calls
+- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
+- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
+- **🐳 Production Ready**: CLI, REST API, and Docker images included
+## Quick Start
+### Installation
+```bash
+# Basic installation
+pip install kreuzberg
+# With optional features
+pip install "kreuzberg[cli,api]"        # CLI + REST API
+pip install "kreuzberg[easyocr,gmft]"   # EasyOCR + table extraction
+pip install "kreuzberg[all]"            # Everything
+```
+### System Dependencies
+```bash
+# Ubuntu/Debian
+sudo apt-get install tesseract-ocr pandoc
+# macOS
+brew install tesseract pandoc
+# Windows
+choco install tesseract pandoc
+```
+### Basic Usage
+```python
+import asyncio
+from kreuzberg import extract_file
+async def main():
+    # Extract from any document type
+    result = await extract_file("document.pdf")
+    print(result.content)
+    print(result.metadata)
+asyncio.run(main())
+```
+## Deployment Options
+### 🐳 Docker (Recommended)
+```bash
+# Run API server
+docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
+# Extract files
+curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
+```
+Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
+### 🌐 REST API
+```bash
+# Install and run
+pip install "kreuzberg[api]"
+litestar --app kreuzberg._api.main:app run
+# Health check
+curl http://localhost:8000/health
+# Extract files
+curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
+```
+### 💻 Command Line
+```bash
+# Install CLI
+pip install "kreuzberg[cli]"
+# Extract to stdout
+kreuzberg extract document.pdf
+# JSON output with metadata
+kreuzberg extract document.pdf --output-format json --show-metadata
+# Batch processing
+kreuzberg extract *.pdf --output-dir ./extracted/
+```
+## Supported Formats
+| Category          | Formats                        |
+| ----------------- | ------------------------------ |
+| **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
+| **Presentations** | PPTX, PPT, ODP                 |
+| **Web**           | HTML, XML, MHTML               |
+| **Archives**      | Support via extraction         |
+## Performance
+**Fastest extraction speeds** with minimal resource usage:
+| Library       | Speed          | Memory        | Size        | Success Rate |
+| ------------- | -------------- | ------------- | ----------- | ------------ |
+| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%**  |
+| Unstructured  | 2-3x slower    | 2x higher     | 146MB       | 95%          |
+| MarkItDown    | 3-4x slower    | 3x higher     | 251MB       | 90%          |
+| Docling       | 4-5x slower    | 10x higher    | 1,032MB     | 85%          |
+> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
+## Documentation
+### Quick Links
+- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
+- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
+- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
+- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
+- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
+- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
+- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
+## Advanced Features
+- **📊 Table Extraction**: Extract tables from PDFs with GMFT
+- **🧩 Content Chunking**: Split documents for RAG applications
+- **🎯 Custom Extractors**: Extend with your own document handlers
+- **🔧 Configuration**: Flexible TOML-based configuration
+- **🪝 Hooks**: Pre/post-processing customization
+- **🌍 Multi-language OCR**: 100+ languages supported
+- **⚙️ Metadata Extraction**: Rich document metadata
+- **🔄 Batch Processing**: Efficient bulk document processing
+## License
+MIT License - see [LICENSE](LICENSE) for details.
+______________________________________________________________________
+<div align="center">
+**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
+Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
+</div>

kreuzberg-3.4.2/README.md ADDED Viewed

@@ -0,0 +1,168 @@
+# Kreuzberg
+[![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
+[![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
+[![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
+📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
+## Why Kreuzberg?
+- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
+- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
+- **⚡ Dual APIs**: Only library with both sync and async support
+- **🔧 Zero Configuration**: Works out of the box with sane defaults
+- **🏠 Local Processing**: No cloud dependencies or external API calls
+- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
+- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
+- **🐳 Production Ready**: CLI, REST API, and Docker images included
+## Quick Start
+### Installation
+```bash
+# Basic installation
+pip install kreuzberg
+# With optional features
+pip install "kreuzberg[cli,api]"        # CLI + REST API
+pip install "kreuzberg[easyocr,gmft]"   # EasyOCR + table extraction
+pip install "kreuzberg[all]"            # Everything
+```
+### System Dependencies
+```bash
+# Ubuntu/Debian
+sudo apt-get install tesseract-ocr pandoc
+# macOS
+brew install tesseract pandoc
+# Windows
+choco install tesseract pandoc
+```
+### Basic Usage
+```python
+import asyncio
+from kreuzberg import extract_file
+async def main():
+    # Extract from any document type
+    result = await extract_file("document.pdf")
+    print(result.content)
+    print(result.metadata)
+asyncio.run(main())
+```
+## Deployment Options
+### 🐳 Docker (Recommended)
+```bash
+# Run API server
+docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
+# Extract files
+curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
+```
+Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
+### 🌐 REST API
+```bash
+# Install and run
+pip install "kreuzberg[api]"
+litestar --app kreuzberg._api.main:app run
+# Health check
+curl http://localhost:8000/health
+# Extract files
+curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
+```
+### 💻 Command Line
+```bash
+# Install CLI
+pip install "kreuzberg[cli]"
+# Extract to stdout
+kreuzberg extract document.pdf
+# JSON output with metadata
+kreuzberg extract document.pdf --output-format json --show-metadata
+# Batch processing
+kreuzberg extract *.pdf --output-dir ./extracted/
+```
+## Supported Formats
+| Category          | Formats                        |
+| ----------------- | ------------------------------ |
+| **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
+| **Presentations** | PPTX, PPT, ODP                 |
+| **Web**           | HTML, XML, MHTML               |
+| **Archives**      | Support via extraction         |
+## Performance
+**Fastest extraction speeds** with minimal resource usage:
+| Library       | Speed          | Memory        | Size        | Success Rate |
+| ------------- | -------------- | ------------- | ----------- | ------------ |
+| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%**  |
+| Unstructured  | 2-3x slower    | 2x higher     | 146MB       | 95%          |
+| MarkItDown    | 3-4x slower    | 3x higher     | 251MB       | 90%          |
+| Docling       | 4-5x slower    | 10x higher    | 1,032MB     | 85%          |
+> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
+## Documentation
+### Quick Links
+- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
+- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
+- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
+- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
+- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
+- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
+- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
+## Advanced Features
+- **📊 Table Extraction**: Extract tables from PDFs with GMFT
+- **🧩 Content Chunking**: Split documents for RAG applications
+- **🎯 Custom Extractors**: Extend with your own document handlers
+- **🔧 Configuration**: Flexible TOML-based configuration
+- **🪝 Hooks**: Pre/post-processing customization
+- **🌍 Multi-language OCR**: 100+ languages supported
+- **⚙️ Metadata Extraction**: Rich document metadata
+- **🔄 Batch Processing**: Efficient bulk document processing
+## License
+MIT License - see [LICENSE](LICENSE) for details.
+______________________________________________________________________
+<div align="center">
+**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
+Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
+</div>

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/ai-rulez.yaml RENAMED Viewed

@@ -1,6 +1,6 @@
 metadata:
   name: "Kreuzberg"
-  version: "3.2.0"
+  version: "3.4.0"
   description: "A text extraction library supporting PDFs, images, office documents and more"
 outputs:
@@ -96,8 +96,9 @@ rules:
       - Run manually: `pre-commit run --all-files`
       ### Documentation
-      - Build docs: `mkdocs build`
-      - Serve docs locally: `mkdocs serve`
+      - Build docs: `uv run mkdocs build --clean --strict`
+      - Serve docs locally: `uv run mkdocs serve`
+      - Install doc dependencies: `uv sync --group doc`
   - name: "Architecture"
     priority: 9
@@ -115,6 +116,8 @@ rules:
       - **GMFT Integration**: Table extraction using GMFT library for PDFs
       - **Chunking**: Text splitting functionality in `_chunker.py`
       - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
+      - **API Server**: REST API using Litestar framework in `_api/main.py`
+      - **CLI**: Command-line interface for batch processing and automation
       ### Adding New Features
       - New extractors: Inherit from `BaseExtractor` and register with `ExtractorRegistry`
@@ -153,6 +156,56 @@ rules:
       - All builtin imports should be at the top level (except for cyclical or optional dependencies)
       - When committing, always use the format specified in the CLAUDE.md
+  - name: "CI/CD and Deployment"
+    priority: 6
+    content: |
+      ### GitHub Actions Workflows
+      - **Release**: Automated PyPI publishing via GitHub releases
+      - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
+      - **Documentation**: Auto-deploy to GitHub Pages on docs changes
+      ### Docker Variants
+      - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
+      - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
+      - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
+      - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
+      - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
+      ### Manual Triggers
+      - Docker builds: `gh workflow run "Publish Docker Images"`
+      - Documentation: Auto-deploys on docs/ changes
+      ### Common Issues
+      - **Docker version detection**: Use `git tag --sort=-version:refname | head -n1` not `git describe`
+      - **Docs dependencies**: Use `uv sync --group doc` for proper mkdocs-material[imaging] support
+      - **Docker Hub README**: May fail due to permissions, use `continue-on-error: true`
+  - name: "Package Management"
+    priority: 6
+    content: |
+      ### Optional Dependencies Structure
+      ```toml
+      [project.optional-dependencies]
+      api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
+      cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
+      chunking = ["semantic-text-splitter>=0.27.0"]
+      easyocr = ["easyocr>=1.7.2"]
+      gmft = ["gmft>=0.4.2"]
+      paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
+      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
+      ```
+      ### Installation Patterns
+      - Basic: `pip install kreuzberg`
+      - With features: `pip install "kreuzberg[api,cli]"`
+      - All features: `pip install "kreuzberg[all]"`
+      - Development: `uv sync --all-extras`
+      ### Dependencies
+      - **Core**: pypdfium2, playa-pdf, python-pptx, etc.
+      - **System**: tesseract-ocr, pandoc (via package manager)
+      - **Development**: Uses dependency groups in pyproject.toml
 sections:
   - title: "Planned Features"
     content: |

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/contributing.md RENAMED Viewed

@@ -68,7 +68,7 @@ Use [Conventional Commits](https://www.conventionalcommits.org/):
 ## Notes
-- Python 3.9-3.13 supported
+- Python 3.10-3.13 supported
 - System dependencies (optional): Tesseract, Pandoc
 - Pre-commit runs automatically on commit
 - Join our [Discord](https://discord.gg/pXxagNK2zN) for help

{kreuzberg-3.4.0 → kreuzberg-3.4.2}/docs/user-guide/docker.md RENAMED Viewed

@@ -7,11 +7,12 @@ Kreuzberg provides official Docker images for easy deployment and containerized
 Docker images are available on [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg):
 - `goldziher/kreuzberg:latest` - Core image with API server and Tesseract OCR
-- `goldziher/kreuzberg:VERSION` - Specific version (e.g., `3.0.0`)
-- `goldziher/kreuzberg:VERSION-easyocr` - With EasyOCR support
-- `goldziher/kreuzberg:VERSION-paddle` - With PaddleOCR support
-- `goldziher/kreuzberg:VERSION-gmft` - With GMFT table extraction
-- `goldziher/kreuzberg:VERSION-all` - With all optional dependencies
+- `goldziher/kreuzberg:latest-easyocr` - With EasyOCR support
+- `goldziher/kreuzberg:latest-paddle` - With PaddleOCR support
+- `goldziher/kreuzberg:latest-gmft` - With GMFT table extraction
+- `goldziher/kreuzberg:latest-all` - With all optional dependencies
+> **Note**: Specific version tags are also available (e.g., `v3.4.0`, `v3.4.0-easyocr`)
 ## Quick Start
@@ -45,8 +46,6 @@ curl -X POST http://localhost:8000/extract \
 Create a `docker-compose.yml`:
 ```yaml
-version: '3.8'
 services:
   kreuzberg:
     image: goldziher/kreuzberg:latest
@@ -54,9 +53,6 @@ services:
       - "8000:8000"
     environment:
       - PYTHONUNBUFFERED=1
-    volumes:
-      # Optional: Mount local directory for file access
-      - ./documents:/app/documents
     restart: unless-stopped
 ```
@@ -107,7 +103,7 @@ CMD ["python", "custom_config.py"]
 ### Base Image
-- Based on `python:3.13-bookworm`
+- Based on `python:3.13-bookworm` (requires Python 3.10+)
 - Includes system dependencies: `pandoc`, `tesseract-ocr`
 - Runs as non-root user `appuser`
 - Exposes port 8000

kreuzberg 3.4.0__tar.gz → 3.4.2__tar.gz

kreuzberg 3.4.0__tar.gz → 3.4.2__tar.gz