PyPI - kreuzberg - Versions diffs - 3.5.0__tar.gz → 3.6.1__tar.gz - Mend

kreuzberg 3.5.0__tar.gz → 3.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195) hide show

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/publish-docker.yml RENAMED Viewed

@@ -2,18 +2,14 @@
 name: Publish Docker Images
 on:
-  workflow_run:
-    workflows: ["Release"]
-    types:
-      - completed
-    branches:
-      - main
   workflow_dispatch:
+  release:
+    types: [published]
 jobs:
   build-and-push:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
     permissions:
       contents: read
       packages: write
@@ -41,28 +37,21 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_branch || github.ref }}
+          ref: ${{ github.ref }}
       - name: Get release version
         id: get_version
         run: |
-          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-            # For manual dispatch, get the latest tag by listing all tags
+          if [ "${{ github.event_name }}" = "release" ]; then
+            # For release events, use the release tag
+            VERSION="${{ github.event.release.tag_name }}"
+          else
+            # For workflow_dispatch, get the latest tag
             git fetch --tags
             VERSION=$(git tag --sort=-version:refname | head -n1)
-          else
-            # For workflow_run, use the head branch
-            VERSION="${{ github.event.workflow_run.head_branch }}"
-            # If triggered by a tag, extract version
-            if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
-              VERSION="$VERSION"
-            else
-              # Get the latest tag by listing all tags
-              git fetch --tags
-              VERSION=$(git tag --sort=-version:refname | head -n1)
-            fi
           fi
           echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+          echo "Using version: $VERSION"
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/release.yaml RENAMED Viewed

@@ -10,6 +10,7 @@ jobs:
     environment: pypi
     permissions:
       id-token: write
+      contents: read
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -29,3 +30,8 @@ jobs:
       - name: Publish
         uses: pypa/gh-action-pypi-publish@release/v1
+      - name: Docker Build Info
+        run: |
+          echo "Docker images will be built automatically by the publish-docker.yml workflow"
+          echo "triggered by this release event. No manual triggering needed."

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.5.0
+Version: 3.6.1
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 License-File: LICENSE
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
+Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
+Requires-Dist: keybert>=0.9.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
 Requires-Dist: rich>=14.0.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -54,10 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Provides-Extra: entity-extraction
+Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
 Provides-Extra: langdetect
-Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/ai-rulez.yaml RENAMED Viewed

@@ -1,6 +1,6 @@
 metadata:
   name: "Kreuzberg"
-  version: "3.4.0"
+  version: "3.5.0"
   description: "A text extraction library supporting PDFs, images, office documents and more"
 outputs:
@@ -115,6 +115,7 @@ rules:
       - **OCR Backends**: Pluggable OCR engines with separate configuration classes
       - **GMFT Integration**: Table extraction using GMFT library for PDFs
       - **Chunking**: Text splitting functionality in `_chunker.py`
+      - **Language Detection**: Automatic language detection using fast-langdetect
       - **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
       - **API Server**: REST API using Litestar framework in `_api/main.py`
       - **CLI**: Command-line interface for batch processing and automation
@@ -144,6 +145,8 @@ rules:
       - Mock OCR responses for predictable testing
       - Both sync and async test variants
       - Comprehensive error case coverage
+      - OCR tests marked as `xfail` in CI environments for resilience
+      - Integration tests use timeouts and retry logic where appropriate
   - name: "Important Instructions"
     priority: 10
@@ -160,16 +163,17 @@ rules:
     priority: 6
     content: |
       ### GitHub Actions Workflows
-      - **Release**: Automated PyPI publishing via GitHub releases
-      - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
+      - **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
+      - **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
       - **Documentation**: Auto-deploy to GitHub Pages on docs changes
+      - **CI**: Comprehensive testing across multiple Python versions and platforms
       ### Docker Variants
-      - **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
-      - **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
-      - **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
-      - **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
-      - **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
+      - **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
+      - **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
+      - **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
+      - **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
+      - **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
       ### Manual Triggers
       - Docker builds: `gh workflow run "Publish Docker Images"`
@@ -191,8 +195,9 @@ rules:
       chunking = ["semantic-text-splitter>=0.27.0"]
       easyocr = ["easyocr>=1.7.2"]
       gmft = ["gmft>=0.4.2"]
+      langdetect = ["fast-langdetect>=0.2.0"]
       paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
-      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
+      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
       ```
       ### Installation Patterns
@@ -207,6 +212,17 @@ rules:
       - **Development**: Uses dependency groups in pyproject.toml
 sections:
+  - title: "Language Detection"
+    content: |
+      ### Automatic Language Detection (v3.5.0+)
+      - **Feature**: Automatically detect languages in extracted text
+      - **Implementation**: Uses fast-langdetect library for high-performance detection
+      - **Configuration**:
+        - Enable with `auto_detect_language=True` in `ExtractionConfig`
+        - Configure via `LanguageDetectionConfig` for confidence thresholds
+      - **Output**: Results available in `ExtractionResult.detected_languages`
+      - **Integration**: Works with all extraction methods and file types
   - title: "Planned Features"
     content: |
       ### Structured Extraction (Issue #55)

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/types.md RENAMED Viewed

@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
 ::: kreuzberg.GMFTConfig
+## Entity Extraction Configuration
+Configuration options for spaCy-based entity extraction:
+::: kreuzberg.SpacyEntityExtractionConfig
+## Language Detection Configuration
+Configuration options for automatic language detection:
+::: kreuzberg.LanguageDetectionConfig
 ## PSMMode (Page Segmentation Mode)
 ::: kreuzberg.PSMMode
+## Entity
+Represents an extracted named entity:
+::: kreuzberg.Entity
 ## Metadata
 A TypedDict that contains optional metadata fields extracted from documents:

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/examples/extraction-examples.md RENAMED Viewed

@@ -189,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
             print(f"{key}: {value}")
 ```
+## Keywords
+Kreuzberg supports keywords and regex extraction as follows:
+```python
+from kreuzberg import ExtractionConfig, extract_file
+async def extract_keywords():
+    config = ExtractionConfig(
+        extract_keywords=True,
+        keyword_count=5,  # defaults to 10 if not set
+    )
+    result = await extract_file(
+        "document.pdf",
+        config=config,
+    )
+    print(f"Keywords: {result.keywords}")
+```
+## Entity and Keyword Extraction
+Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities like people, organizations, locations, and more, plus supports custom regex patterns:
+```python
+from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
+async def extract_entities_and_keywords():
+    # Basic extraction
+    config = ExtractionConfig(
+        extract_entities=True,
+        extract_keywords=True,
+        keyword_count=5,
+        custom_entity_patterns={
+            "INVOICE_ID": r"INV-\d+",
+            "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
+        },
+    )
+    result = await extract_file("document.pdf", config=config)
+    # Print extracted entities
+    if result.entities:
+        for entity in result.entities:
+            print(f"{entity.type}: {entity.text}")
+    # Print extracted keywords
+    if result.keywords:
+        for keyword, score in result.keywords:
+            print(f"Keyword: {keyword} (score: {score:.3f})")
+async def extract_multilingual_entities():
+    # Configure spaCy for multiple languages
+    spacy_config = SpacyEntityExtractionConfig(
+        language_models={
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+        },
+        fallback_to_multilingual=True,
+    )
+    config = ExtractionConfig(
+        auto_detect_language=True,  # Automatically detect document languages
+        extract_entities=True,
+        spacy_entity_extraction_config=spacy_config,
+    )
+    result = await extract_file("multilingual_document.pdf", config=config)
+    if result.detected_languages:
+        print(f"Detected languages: {result.detected_languages}")
+    if result.entities:
+        print(f"Extracted {len(result.entities)} entities")
+        for entity in result.entities:
+            print(f"  {entity.type}: {entity.text}")
+```
 ## Synchronous API
 For cases where async isn't needed or available:

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/installation.md RENAMED Viewed

@@ -110,6 +110,30 @@ Language detection is an optional feature that automatically detects the languag
 pip install "kreuzberg[langdetect]"
 ```
+### Entity and Keyword Extraction
+Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
+```shell
+pip install "kreuzberg[entity-extraction]"
+```
+After installation, you'll need to download the spaCy language models you plan to use:
+```shell
+# Download English model (most common)
+python -m spacy download en_core_web_sm
+# Download other language models as needed
+python -m spacy download de_core_news_sm  # German
+python -m spacy download fr_core_news_sm  # French
+python -m spacy download es_core_news_sm  # Spanish
+```
+!!! note "Language Model Requirements"
+    spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
 ### All Optional Dependencies
 To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -121,5 +145,5 @@ pip install "kreuzberg[all]"
 This is equivalent to:
 ```shell
-pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
+pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
 ```

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/extraction-configuration.md RENAMED Viewed

@@ -153,6 +153,134 @@ The feature requires the `langdetect` dependency:
 pip install "kreuzberg[langdetect]"
 ```
+### Entity and Keyword Extraction
+Kreuzberg can extract named entities and keywords from documents using spaCy for entity recognition and KeyBERT for keyword extraction:
+```python
+from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
+# Basic entity and keyword extraction
+result = await extract_file(
+    "document.pdf",
+    config=ExtractionConfig(
+        extract_entities=True,
+        extract_keywords=True,
+        keyword_count=10,  # Number of keywords to extract (default: 10)
+    ),
+)
+# Access extracted entities and keywords
+if result.entities:
+    for entity in result.entities:
+        print(f"{entity.type}: {entity.text} (position {entity.start}-{entity.end})")
+        # Example: "PERSON: John Doe (position 0-8)"
+if result.keywords:
+    for keyword, score in result.keywords:
+        print(f"{keyword}: {score:.3f}")
+        # Example: "artificial intelligence: 0.845"
+```
+#### Entity Extraction with Language Support
+spaCy supports entity extraction in multiple languages. You can configure language-specific models:
+```python
+from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
+# Configure spaCy for specific languages
+spacy_config = SpacyEntityExtractionConfig(
+    language_models={
+        "en": "en_core_web_sm",  # English
+        "de": "de_core_news_sm",  # German
+        "fr": "fr_core_news_sm",  # French
+        "es": "es_core_news_sm",  # Spanish
+    },
+    model_cache_dir="/tmp/spacy_models",  # Custom model cache directory
+    fallback_to_multilingual=True,  # Use multilingual model if language-specific model fails
+)
+# Extract with language detection to automatically choose the right model
+result = await extract_file(
+    "multilingual_document.pdf",
+    config=ExtractionConfig(
+        auto_detect_language=True,  # Enable language detection
+        extract_entities=True,
+        spacy_entity_extraction_config=spacy_config,
+    ),
+)
+# The system will automatically use the appropriate spaCy model based on detected languages
+if result.detected_languages and result.entities:
+    print(f"Detected languages: {result.detected_languages}")
+    print(f"Extracted {len(result.entities)} entities")
+```
+#### Custom Entity Patterns
+You can define custom entity patterns using regular expressions:
+```python
+result = await extract_file(
+    "invoice.pdf",
+    config=ExtractionConfig(
+        extract_entities=True,
+        custom_entity_patterns={
+            "INVOICE_ID": r"INV-\d{4,}",  # Invoice numbers
+            "PHONE": r"\+?\d{1,3}[-.\s]?\d{3,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4}",  # Phone numbers
+            "EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",  # Email addresses
+        },
+    ),
+)
+# Custom patterns are combined with spaCy's standard entity types
+for entity in result.entities:
+    if entity.type in ["INVOICE_ID", "PHONE", "EMAIL"]:
+        print(f"Custom entity - {entity.type}: {entity.text}")
+    else:
+        print(f"Standard entity - {entity.type}: {entity.text}")
+```
+#### Supported Entity Types
+spaCy automatically detects these standard entity types:
+- **PERSON**: People's names
+- **ORG**: Organizations, companies, agencies
+- **GPE**: Countries, cities, states (Geopolitical entities)
+- **MONEY**: Monetary values
+- **DATE**: Date expressions
+- **TIME**: Time expressions
+- **PERCENT**: Percentage values
+- **CARDINAL**: Numerals that do not fall under another type
+Language-specific models may support additional entity types relevant to that language.
+#### spaCy Configuration Options
+- `language_models`: Dict mapping language codes to spaCy model names
+- `model_cache_dir`: Custom directory for caching spaCy models
+- `fallback_to_multilingual`: Whether to use multilingual model (`xx_ent_wiki_sm`) as fallback
+- `max_doc_length`: Maximum document length for spaCy processing (default: 1,000,000 characters)
+- `batch_size`: Batch size for processing multiple texts (default: 1,000)
+#### Installation Requirements
+Entity and keyword extraction require additional dependencies:
+```shell
+# For entity extraction with spaCy
+pip install "kreuzberg[entity-extraction]"
+# Install specific spaCy language models as needed
+python -m spacy download en_core_web_sm    # English
+python -m spacy download de_core_news_sm   # German
+python -m spacy download fr_core_news_sm   # French
+```
+Available spaCy models include: `en_core_web_sm`, `de_core_news_sm`, `fr_core_news_sm`, `es_core_news_sm`, `pt_core_news_sm`, `it_core_news_sm`, `nl_core_news_sm`, `zh_core_web_sm`, `ja_core_news_sm`, `ko_core_news_sm`, `ru_core_news_sm`, and many others.
 ### Batch Processing
 ```python

{kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/__init__.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from importlib.metadata import version
+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
 from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
@@ -8,7 +9,7 @@ from kreuzberg._ocr._tesseract import TesseractConfig
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -25,6 +26,7 @@ __version__ = version("kreuzberg")
 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
@@ -37,6 +39,7 @@ __all__ = [
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",

kreuzberg 3.5.0__tar.gz → 3.6.1__tar.gz

kreuzberg 3.5.0__tar.gz → 3.6.1__tar.gz