PyPI - kreuzberg - Versions diffs - 3.4.2__tar.gz → 3.5.0__tar.gz - Mend

kreuzberg 3.4.2__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (193) hide show

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/.github/workflows/publish-docker.yml RENAMED Viewed

@@ -1,4 +1,3 @@
-# .github/workflows/publish-docker.yml
 name: Publish Docker Images
@@ -24,7 +23,7 @@ jobs:
         include:
           - name: core
             extras: ""
-            tag_suffix: "" # The base image tag (includes API + tesseract)
+            tag_suffix: ""
           - name: easyocr
             extras: "easyocr"
             tag_suffix: "-easyocr"
@@ -89,7 +88,7 @@ jobs:
             type=raw,value=latest${{ matrix.tag_suffix }}
       - name: Build and push Docker image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./.docker/Dockerfile

kreuzberg-3.5.0/.gitmodules ADDED Viewed

@@ -0,0 +1,3 @@
+[submodule "python-text-extraction-libs-benchmarks"]
+	path = python-text-extraction-libs-benchmarks
+	url = https://github.com/Goldziher/python-text-extraction-libs-benchmarks.git

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.4.2
+Version: 3.5.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -56,6 +56,8 @@ Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Provides-Extra: langdetect
+Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/benchmark_baseline.py RENAMED Viewed

@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
 from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
-async def run_baseline_benchmark() -> dict[str, object] | None:  # type: ignore[syntax]
+async def run_baseline_benchmark() -> dict[str, object] | None:
     """Run comprehensive baseline benchmark."""
     test_files_dir = Path("tests/test_source_files")
     test_files = list(test_files_dir.glob("*.pdf"))

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/final_benchmark.py RENAMED Viewed

@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
 )
-async def run_final_benchmark() -> dict[str, object] | None:  # type: ignore[syntax]
+async def run_final_benchmark() -> dict[str, object] | None:
     """Run comprehensive benchmark of all caching improvements."""
     test_files_dir = Path("tests/test_source_files")
     pdf_files = list(test_files_dir.glob("*.pdf"))

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/benchmarks/serialization_benchmark.py RENAMED Viewed

@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
     json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
     json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
-    # Type casting for arithmetic operations
     json_ser_mean = json_serialize["mean"]
     json_deser_mean = json_deserialize["mean"]
     msgpack_ser_mean = msgpack_serialize["mean"]

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/advanced/performance.md RENAMED Viewed

@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
 1. **Configure OCR appropriately** for your document types
 1. **Profile your specific workload** - results vary by content
-### Configuration Examples
+### Optimized Default Configuration
+Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
 ```python
-from kreuzberg import ExtractionConfig, extract_file_sync
-from kreuzberg._ocr import TesseractConfig
+from kreuzberg import ExtractionConfig
-# Optimized for speed
-fast_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=6))  # Assume uniform text block
+# Default configuration - already optimized for modern documents
+config = ExtractionConfig()  # Uses optimized defaults:
+# - PSM: AUTO_ONLY (fast without orientation detection)
+# - Language model: Disabled for performance
+# - Dictionary correction: Enabled for accuracy
+```
-# Optimized for accuracy
-accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1))  # Auto page segmentation
+### Advanced Configuration Examples
-# For simple documents (no OCR)
-text_only_config = ExtractionConfig(force_ocr=False, ocr_backend=None)
+```python
+from kreuzberg import ExtractionConfig, extract_file_sync
+from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
+# Maximum speed configuration (for high-volume processing)
+speed_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.SINGLE_BLOCK,  # Assume simple layout
+        language_model_ngram_on=False,  # Already disabled by default
+        tessedit_enable_dict_correction=False,  # Disable for maximum speed
+    ),
+)
+# Maximum accuracy configuration (for degraded documents)
+accuracy_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.AUTO,  # Full analysis with orientation detection
+        language_model_ngram_on=True,  # Enable for historical/degraded text
+        tessedit_enable_dict_correction=True,  # Default - keep enabled
+    ),
+)
+# No OCR configuration (text documents only)
+text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
 ```
+### Performance Optimization Tips
+Based on comprehensive benchmarking with 138+ documents:
+1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides significant speedup for documents with text layers
+1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
+1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide 30x+ speedup with minimal quality impact on clean documents
+1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
 ### Batch Processing Best Practices
 ```python

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/changelog.md RENAMED Viewed

@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Documentation site with comprehensive examples and API reference
 - Improved configuration for all OCR backends
 - Added hooks system for validation and post-processing
+- Language detection feature with `auto_detect_language` configuration option
+- New optional dependency group `langdetect` for automatic language detection
 ### Changed

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/examples/extraction-examples.md RENAMED Viewed

@@ -79,6 +79,47 @@ async def extract_with_different_backends():
     print(f"No OCR result: {result.content[:100]}...")
 ```
+## Language Detection
+```python
+from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
+async def detect_document_language():
+    # Simple automatic language detection
+    result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
+    # Access detected languages
+    if result.detected_languages:
+        print(f"Detected languages: {', '.join(result.detected_languages)}")
+        # Example output: "Detected languages: en, de, fr"
+async def detect_multilingual_document():
+    # Advanced multilingual detection with custom configuration
+    lang_config = LanguageDetectionConfig(
+        multilingual=True,  # Detect multiple languages in mixed text
+        top_k=5,  # Return top 5 languages
+        low_memory=False,  # Use high accuracy mode
+    )
+    result = await extract_file(
+        "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
+    )
+    if result.detected_languages:
+        print(f"Detected languages: {result.detected_languages}")
+        # Use detected languages for OCR
+        from kreuzberg import TesseractConfig
+        # Create language string for Tesseract (e.g., "eng+deu+fra")
+        tesseract_langs = "+".join(result.detected_languages[:3])
+        result_with_ocr = await extract_file(
+            "multilingual_document.pdf",
+            config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
+        )
+```
 ## Table Extraction
 ```python

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/getting-started/installation.md RENAMED Viewed

@@ -102,6 +102,14 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
 pip install "kreuzberg[gmft]"
 ```
+### Language Detection
+Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
+```shell
+pip install "kreuzberg[langdetect]"
+```
 ### All Optional Dependencies
 To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
@@ -113,5 +121,5 @@ pip install "kreuzberg[all]"
 This is equivalent to:
 ```shell
-pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
+pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
 ```

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/docker.md RENAMED Viewed

@@ -124,6 +124,34 @@ Additional dependencies by variant:
 - **gmft**: GMFT for table extraction
 - **all**: All optional dependencies
+### Health Check
+All Docker images include a health check endpoint:
+```bash
+# Check API health
+curl http://localhost:8000/health
+```
+Returns a JSON response with service status and version information.
+### Observability
+The Docker images include built-in OpenTelemetry instrumentation via Litestar:
+- **Tracing**: Automatic request/response tracing
+- **Metrics**: Performance and usage metrics
+- **Logging**: Structured JSON logging
+Configure via standard OpenTelemetry environment variables:
+```bash
+docker run -p 8000:8000 \
+  -e OTEL_SERVICE_NAME=kreuzberg-api \
+  -e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
+  goldziher/kreuzberg:latest
+```
 ### Environment Variables
 - `PYTHONUNBUFFERED=1` - Ensures proper logging output
@@ -150,6 +178,12 @@ server {
         client_max_body_size 100M;
         proxy_read_timeout 300s;
     }
+    # Health check endpoint
+    location /health {
+        proxy_pass http://localhost:8000/health;
+        access_log off;
+    }
 }
 ```
@@ -175,6 +209,21 @@ spec:
         image: goldziher/kreuzberg:latest
         ports:
         - containerPort: 8000
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 5
+          periodSeconds: 5
+        env:
+        - name: OTEL_SERVICE_NAME
+          value: "kreuzberg-api"
         resources:
           requests:
             memory: "512Mi"

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/extraction-configuration.md RENAMED Viewed

@@ -9,6 +9,7 @@ All extraction functions accept an optional `config` parameter of type `Extracti
 - Control OCR behavior with `force_ocr` and `ocr_backend`
 - Provide engine-specific OCR configuration via `ocr_config`
 - Enable table extraction with `extract_tables` and configure it via `gmft_config`
+- Enable automatic language detection with `auto_detect_language`
 - Add validation and post-processing hooks
 - Configure custom extractors
@@ -100,6 +101,58 @@ Note that table extraction requires the `gmft` dependency. You can install it wi
 pip install "kreuzberg[gmft]"
 ```
+### Language Detection
+Kreuzberg can automatically detect the language of extracted text using fast-langdetect:
+```python
+from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
+# Simple automatic language detection
+result = await extract_file("multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True))
+# Access detected languages (lowercase ISO 639-1 codes)
+if result.detected_languages:
+    print(f"Detected languages: {', '.join(result.detected_languages)}")
+    # Example output: "Detected languages: en, de, fr"
+# Advanced configuration with multilingual detection
+lang_config = LanguageDetectionConfig(
+    multilingual=True,  # Enable mixed-language detection
+    top_k=5,  # Return top 5 languages
+    low_memory=False,  # Use high accuracy mode
+    cache_dir="/tmp/lang_models",  # Custom model cache directory
+)
+result = await extract_file(
+    "multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
+)
+# Use detected languages for OCR
+if result.detected_languages:
+    # Re-extract with OCR using the primary detected language
+    from kreuzberg import TesseractConfig
+    result_with_ocr = await extract_file(
+        "multilingual_document.pdf",
+        config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=result.detected_languages[0])),
+    )
+```
+#### Language Detection Configuration Options
+- `low_memory` (default: `True`): Use smaller model (~200MB) vs larger, more accurate model
+- `multilingual` (default: `False`): Enable detection of multiple languages in mixed text
+- `top_k` (default: `3`): Maximum number of languages to return
+- `cache_dir`: Custom directory for language model storage
+- `allow_fallback` (default: `True`): Fall back to small model if large model fails
+The feature requires the `langdetect` dependency:
+```shell
+pip install "kreuzberg[langdetect]"
+```
 ### Batch Processing
 ```python

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/docs/user-guide/ocr-configuration.md RENAMED Viewed

@@ -62,15 +62,15 @@ result = await extract_file("document.pdf", config=ExtractionConfig(ocr_config=T
 #### Available PSM Modes
-| Mode                 | Enum Value                | Description                                              | Best For                                       |
-| -------------------- | ------------------------- | -------------------------------------------------------- | ---------------------------------------------- |
-| Automatic            | `PSMMode.AUTO`            | Automatic page segmentation with orientation detection   | General purpose (default)                      |
-| Single Block         | `PSMMode.SINGLE_BLOCK`    | Treat the image as a single text block                   | Simple layouts, preserving paragraph structure |
-| Single Line          | `PSMMode.SINGLE_LINE`     | Treat the image as a single text line                    | Receipts, labels, single-line text             |
-| Single Word          | `PSMMode.SINGLE_WORD`     | Treat the image as a single word                         | Word recognition tasks                         |
-| Single Character     | `PSMMode.SINGLE_CHAR`     | Treat the image as a single character                    | Character recognition tasks                    |
-| Sparse Text          | `PSMMode.SPARSE_TEXT`     | Find as much text as possible without assuming structure | Forms, tables, scattered text                  |
-| Sparse Text with OSD | `PSMMode.SPARSE_TEXT_OSD` | Like SPARSE_TEXT with orientation detection              | Complex layouts with varying text orientation  |
+| Mode          | Enum Value              | Description                                              | Best For                                       |
+| ------------- | ----------------------- | -------------------------------------------------------- | ---------------------------------------------- |
+| Auto Only     | `PSMMode.AUTO_ONLY`     | Automatic segmentation without orientation detection     | Modern documents (default - fastest)           |
+| Automatic     | `PSMMode.AUTO`          | Automatic page segmentation with orientation detection   | Rotated/skewed documents                       |
+| Single Block  | `PSMMode.SINGLE_BLOCK`  | Treat the image as a single text block                   | Simple layouts, preserving paragraph structure |
+| Single Column | `PSMMode.SINGLE_COLUMN` | Assume a single column of text                           | Books, articles, single-column documents       |
+| Single Line   | `PSMMode.SINGLE_LINE`   | Treat the image as a single text line                    | Receipts, labels, single-line text             |
+| Single Word   | `PSMMode.SINGLE_WORD`   | Treat the image as a single word                         | Word recognition tasks                         |
+| Sparse Text   | `PSMMode.SPARSE_TEXT`   | Find as much text as possible without assuming structure | Forms, tables, scattered text                  |
 ### Forcing OCR
@@ -139,23 +139,90 @@ result = await extract_file(
 ## Performance Optimization
-OCR performance and parallel processing can be controlled through process handlers and extraction hooks which are configured in the `ExtractionConfig` object. The default configuration handles performance optimization automatically.
+### Default Configuration
-This is useful for:
+Kreuzberg's defaults are optimized out-of-the-box for modern PDFs and standard documents:
-- Limiting resource usage on systems with limited memory
-- Optimizing performance on systems with many CPU cores
-- Balancing OCR tasks with other application workloads
+- **PSM Mode**: `AUTO_ONLY` - Faster than `AUTO` without orientation detection overhead
+- **Language Model**: Disabled by default for optimal performance on modern documents
+- **Dictionary Correction**: Enabled for accuracy
+The default configuration provides excellent extraction quality for:
+- Modern PDFs with embedded text
+- Scanned documents with clear printing
+- Office documents (DOCX, PPTX, XLSX)
+- Standard business documents
+### Speed vs Quality Trade-offs
+```python
+from kreuzberg import ExtractionConfig, TesseractConfig, PSMMode
+# Default configuration (optimized for modern documents)
+default_config = ExtractionConfig()  # Already optimized for speed and quality
+# Maximum speed configuration
+speed_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.SINGLE_BLOCK,  # Assume simple layout
+        tessedit_enable_dict_correction=False,  # Skip dictionary correction
+    ),
+)
+# Maximum accuracy configuration (for degraded/historical documents)
+accuracy_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        psm=PSMMode.AUTO,  # Full analysis with orientation detection
+        language_model_ngram_on=True,  # Enable for degraded/historical text
+        tessedit_enable_dict_correction=True,  # Correct OCR errors
+    ),
+)
+```
+### Language Model N-gram Settings
+The `language_model_ngram_on` parameter controls Tesseract's use of n-gram language models:
+- **Default (False)**: Optimized for modern documents with clear text
+- **When to enable**: Historical documents, degraded scans, handwritten text, or noisy images
+```python
+# For degraded or historical documents
+historical_config = ExtractionConfig(
+    ocr_backend="tesseract",
+    ocr_config=TesseractConfig(
+        language_model_ngram_on=True,  # Enable for better accuracy on poor quality text
+    ),
+)
+```
+### When to Disable OCR
+For documents with text layers (searchable PDFs, Office docs), disable OCR entirely:
+```python
+# No OCR overhead for text documents
+text_config = ExtractionConfig(ocr_backend=None)
+```
+This provides significant speedup (78% of PDFs have text layers and extract in \<0.01s)
 ## Best Practices
 - **Language Selection**: Always specify the correct language for your documents to improve OCR accuracy
 - **PSM Mode Selection**: Choose the appropriate PSM mode based on your document layout:
-    - Use `PSM.SINGLE_BLOCK` for documents with simple layouts
-    - Use `PSM.SPARSE_TEXT` for forms or documents with tables
-    - Use `PSM.SINGLE_LINE` for receipts or labels
+    - Use `PSMMode.AUTO_ONLY` (default) for modern, well-formatted documents
+    - Use `PSMMode.SINGLE_BLOCK` for simple layouts with faster processing
+    - Use `PSMMode.SPARSE_TEXT` for forms or documents with tables
+    - Use `PSMMode.AUTO` only when orientation detection is needed
+- **Performance Optimization**:
+    - Disable OCR (`ocr_backend=None`) for documents with text layers
+    - Disable language model for clean documents (`language_model_ngram_on=False`)
+    - Disable dictionary correction for technical documents
 - **Image Quality**: For best results, ensure images are:
     - High resolution (at least 300 DPI)
     - Well-lit with good contrast
-    - Not skewed or rotated
-- **Performance**: For batch processing, adjust `max_processes` based on your system's capabilities
+    - Not skewed or rotated (unless using `PSMMode.AUTO`)

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/__init__.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from importlib.metadata import version
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
@@ -29,6 +30,7 @@ __all__ = [
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_image.py RENAMED Viewed

@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
-        from kreuzberg._ocr._tesseract import TesseractConfig
         from kreuzberg._types import ExtractionResult
         if self.config.ocr_backend == "tesseract":
             from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
             if isinstance(self.config.ocr_config, TesseractConfig):
                 config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
                 return results[0]
             return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            return paddle_process(path, paddle_config)
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            return easy_process(path, easy_config)
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
     def _get_extension_from_mime_type(self, mime_type: str) -> str:

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
             images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
                     os.close(fd)
                     image_paths.append(temp_path)
-                if self.config.ocr_backend == "tesseract":
-                    from kreuzberg._ocr._tesseract import TesseractConfig
-                    if isinstance(self.config.ocr_config, TesseractConfig):
-                        config = self.config.ocr_config
-                    else:
-                        config = TesseractConfig()
-                    results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-                    text_parts = [r.content for r in results]
-                    return "\n\n".join(text_parts)
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+                return self._process_pdf_images_with_ocr(image_paths)
             finally:
                 for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        if self.config.ocr_backend == "tesseract":
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
+            tesseract_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
+            text_parts = [r.content for r in results]
+            return "\n\n".join(text_parts)
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            text_parts = []
+            for image_path in image_paths:
+                result = paddle_process(Path(image_path), paddle_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            text_parts = []
+            for image_path in image_paths:
+                result = easy_process(Path(image_path), easy_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

{kreuzberg-3.4.2 → kreuzberg-3.5.0}/kreuzberg/_extractors/_spread_sheet.py RENAMED Viewed

@@ -6,7 +6,7 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
-CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 class SpreadSheetExtractor(Extractor):

kreuzberg 3.4.2__tar.gz → 3.5.0__tar.gz

kreuzberg 3.4.2__tar.gz → 3.5.0__tar.gz