PyPI - kreuzberg - Versions diffs - 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl - Mend

kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

kreuzberg/__init__.py +4 -0
kreuzberg/_api/main.py +22 -1
kreuzberg/_config.py +404 -0
kreuzberg/_entity_extraction.py +3 -3
kreuzberg/_extractors/_pdf.py +22 -19
kreuzberg/_extractors/_spread_sheet.py +2 -3
kreuzberg/_extractors/_structured.py +10 -7
kreuzberg/_gmft.py +8 -11
kreuzberg/_language_detection.py +1 -1
kreuzberg/_mcp/server.py +58 -8
kreuzberg/_ocr/_easyocr.py +1 -1
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +2 -7
kreuzberg/_playa.py +2 -3
kreuzberg/_types.py +46 -24
kreuzberg/_utils/_cache.py +15 -17
kreuzberg/_utils/_device.py +10 -20
kreuzberg/_utils/_errors.py +41 -38
kreuzberg/_utils/_quality.py +7 -11
kreuzberg/_utils/_serialization.py +21 -16
kreuzberg/_utils/_string.py +22 -12
kreuzberg/_utils/_table.py +3 -4
kreuzberg/cli.py +3 -3
kreuzberg/exceptions.py +10 -0
kreuzberg/extraction.py +2 -2
kreuzberg-3.8.2.dist-info/METADATA +265 -0
kreuzberg-3.8.2.dist-info/RECORD +53 -0
kreuzberg/_cli_config.py +0 -175
kreuzberg-3.8.1.dist-info/METADATA +0 -301
kreuzberg-3.8.1.dist-info/RECORD +0 -53
{kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
{kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_quality.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import re
+from functools import reduce
 from typing import Any
 # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
-    # Remove script and style content
-    for pattern in _SCRIPT_PATTERNS.values():
-        text = pattern.sub(" ", text)
+    # Remove script and style content using functools.reduce for single pass
+    text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
     # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    script_chars = 0
-    for pattern in _SCRIPT_PATTERNS.values():
-        matches = pattern.findall(text)
-        script_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
     return min(1.0, script_chars / total_chars)
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
-    nav_chars = 0
-    for pattern in _NAVIGATION_PATTERNS.values():
-        matches = pattern.findall(text)
-        nav_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
     return min(1.0, nav_chars / total_chars)

kreuzberg/_utils/_serialization.py CHANGED Viewed

@@ -2,16 +2,28 @@
 from __future__ import annotations
-from dataclasses import asdict, is_dataclass
-from enum import Enum
+from dataclasses import is_dataclass
 from typing import Any, TypeVar, cast
+import msgspec
 from msgspec import MsgspecError
 from msgspec.msgpack import decode, encode
 T = TypeVar("T")
+# Define dict method names in priority order
+_DICT_METHOD_NAMES = (
+    "to_dict",
+    "as_dict",
+    "dict",
+    "model_dump",
+    "json",
+    "to_list",
+    "tolist",
+)
 def encode_hook(obj: Any) -> Any:
     """Custom encoder for complex objects."""
     if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
     if isinstance(obj, Exception):
         return {"message": str(obj), "type": type(obj).__name__}
-    for key in (
-        "to_dict",
-        "as_dict",
-        "dict",
-        "model_dump",
-        "json",
-        "to_list",
-        "tolist",
-    ):
-        if hasattr(obj, key):
-            method = getattr(obj, key)  # Cache the attribute lookup
-            if callable(method):
-                return method()
+    # Check for dict-like methods more efficiently using any() with generator
+    for attr_name in _DICT_METHOD_NAMES:
+        method = getattr(obj, attr_name, None)
+        if method is not None and callable(method):
+            return method()
     if is_dataclass(obj) and not isinstance(obj, type):
-        return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
+        # Use msgspec.to_builtins for more efficient conversion
+        return msgspec.to_builtins(obj)
     if hasattr(obj, "save") and hasattr(obj, "format"):
         return None

kreuzberg/_utils/_string.py CHANGED Viewed

@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
 @lru_cache(maxsize=128)
 def _get_encoding_cache_key(data_hash: str, size: int) -> str:
     """Generate cache key for encoding detection."""
+    # Use string interpolation which is faster than format strings for simple cases
     return f"{data_hash}:{size}"
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
     if not text:
         return 0.0
-    # Check for common encoding problems
-    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
-    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     total_chars = len(text)
     if total_chars == 0:
         return 0.0
+    # Check for common encoding problems - compile patterns once
+    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     # Penalize replacement and control characters
     penalty = (replacement_count + control_count * 2) / total_chars
-    # Bonus for readable character ranges
+    # Bonus for readable character ranges - more efficient counting
+    # Use generator expression with early termination
     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
     readability_score = readable_chars / total_chars
     # Check for suspicious Cyrillic that might be misencoded Hebrew
     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
-    if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
-        penalty += 0.3  # Heavy penalty for likely mojibake
+    if cyrillic_matches:
+        # Calculate total length more efficiently
+        cyrillic_length = sum(len(match) for match in cyrillic_matches)
+        if cyrillic_length > total_chars * 0.1:
+            penalty += 0.3  # Heavy penalty for likely mojibake
     return max(0.0, min(1.0, readability_score - penalty))
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
     # Split by double newlines to preserve paragraph breaks
     paragraphs = text.split("\n\n")
-    normalized_paragraphs = []
+    result_paragraphs = []
     for paragraph in paragraphs:
         # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
         # Clean up multiple newlines within paragraph (keep single newlines)
         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
-        # Strip and filter empty lines efficiently
-        lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+        # Process lines efficiently - manual loop avoids double strip() calls
+        lines = []
+        for line in cleaned.split("\n"):
+            stripped_line = line.strip()
+            if stripped_line:
+                lines.append(stripped_line)
         if lines:
-            normalized_paragraphs.append("\n".join(lines))
+            result_paragraphs.append("\n".join(lines))
-    return "\n\n".join(normalized_paragraphs)
+    return "\n\n".join(result_paragraphs)

kreuzberg/_utils/_table.py CHANGED Viewed

@@ -3,7 +3,6 @@
 from __future__ import annotations
 import csv
-from io import StringIO
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
     if "df" not in table or table["df"] is None:
         return ""
-    output = StringIO()
-    table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
-    return output.getvalue().strip()
+    # Use pandas to_csv() direct string return instead of StringIO
+    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
+    return str(csv_output).strip()
 def export_table_to_tsv(table: TableData) -> str:

kreuzberg/cli.py CHANGED Viewed

@@ -18,7 +18,7 @@ except ImportError as e:
     ) from e
 from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
-from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
+from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
 from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
 DEFAULT_MAX_CHARACTERS = 4000
@@ -92,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     if config:
         file_config = load_config_from_file(config)
     else:
-        default_config = find_default_config()
+        default_config = find_config_file()
         if default_config:
             try:
                 file_config = load_config_from_file(default_config)
@@ -314,7 +314,7 @@ def extract(  # noqa: PLR0913
 def config(config: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or find_default_config()
+        config_path = config or find_config_file()
         if config_path:
             file_config = load_config_from_file(config_path)

kreuzberg/exceptions.py CHANGED Viewed

@@ -7,6 +7,8 @@ from typing import Any
 class KreuzbergError(Exception):
     """Base exception for all Kreuzberg errors."""
+    __slots__ = ("context",)
     context: Any
     """The context of the error."""
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
 class ParsingError(KreuzbergError):
     """Raised when a parsing error occurs."""
+    __slots__ = ()
 class ValidationError(KreuzbergError):
     """Raised when a validation error occurs."""
+    __slots__ = ()
 class MissingDependencyError(KreuzbergError):
     """Raised when a dependency is missing."""
+    __slots__ = ()
     @classmethod
     def create_for_package(
         cls, *, dependency_group: str, functionality: str, package_name: str
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
 class OCRError(KreuzbergError):
     """Raised when an OCR error occurs."""
+    __slots__ = ()

kreuzberg/extraction.py CHANGED Viewed

@@ -460,8 +460,8 @@ def batch_extract_bytes_sync(
             return (index, error_result)
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        indexed_contents = list(enumerate(contents))
-        future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
+        # Avoid creating intermediate list, use enumerate directly
+        future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
         results: list[ExtractionResult] = [None] * len(contents)  # type: ignore[list-item]
         for future in as_completed(future_to_index):

kreuzberg-3.8.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,265 @@
+Metadata-Version: 2.4
+Name: kreuzberg
+Version: 3.8.2
+Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
+Project-URL: documentation, https://kreuzberg.dev
+Project-URL: homepage, https://github.com/Goldziher/kreuzberg
+Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
+Classifier: Topic :: Office/Business :: Office Suites
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: General
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Requires-Dist: anyio>=4.9.0
+Requires-Dist: chardetng-py>=0.3.4
+Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
+Requires-Dist: html-to-markdown[lxml]>=1.8.0
+Requires-Dist: mcp>=1.11.0
+Requires-Dist: msgspec>=0.18.0
+Requires-Dist: playa-pdf>=0.6.1
+Requires-Dist: psutil>=7.0.0
+Requires-Dist: pypdfium2==4.30.0
+Requires-Dist: python-calamine>=0.3.2
+Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
+Provides-Extra: additional-extensions
+Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
+Provides-Extra: all
+Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
+Requires-Dist: gmft>=0.4.2; extra == 'all'
+Requires-Dist: keybert>=0.9.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
+Requires-Dist: mailparse>=1.0.15; extra == 'all'
+Requires-Dist: paddleocr>=3.1.0; extra == 'all'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: rich>=14.0.0; extra == 'all'
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
+Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: spacy>=3.8.7; extra == 'all'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
+Provides-Extra: api
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
+Provides-Extra: chunking
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
+Provides-Extra: cli
+Requires-Dist: click>=8.2.1; extra == 'cli'
+Requires-Dist: rich>=14.0.0; extra == 'cli'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
+Provides-Extra: easyocr
+Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Provides-Extra: entity-extraction
+Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
+Provides-Extra: gmft
+Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Provides-Extra: langdetect
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
+Provides-Extra: paddleocr
+Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
+Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
+Description-Content-Type: text/markdown
+# Kreuzberg
+[![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
+[![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
+[![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
+[![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
+**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
+📖 **[Complete Documentation](https://kreuzberg.dev/)**
+## Framework Overview
+### Document Intelligence Capabilities
+- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
+- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
+- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
+- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
+- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
+### Technical Architecture
+- **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
+- **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
+- **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
+- **API Design**: Synchronous and asynchronous APIs with consistent interfaces
+- **Type Safety**: Complete type annotations throughout the codebase
+### Open Source Foundation
+Kreuzberg leverages established open source technologies:
+- **Pandoc**: Universal document converter for robust format support
+- **PDFium**: Google's PDF rendering engine for accurate PDF processing
+- **Tesseract**: Google's OCR engine for text recognition
+- **Python-docx/pptx**: Native Microsoft Office format support
+## Quick Start
+### Extract Text with CLI
+```bash
+# Extract text from any file to markdown
+uvx kreuzberg extract document.pdf > output.md
+# With all features (OCR, table extraction, etc.)
+uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
+# Extract with rich metadata
+uvx kreuzberg extract report.pdf --show-metadata --format json
+```
+### Python Usage
+**Async (recommended for web apps):**
+```python
+from kreuzberg import extract_file
+# In your async function
+result = await extract_file("presentation.pptx")
+print(result.content)
+# Rich metadata extraction
+print(f"Title: {result.metadata.title}")
+print(f"Author: {result.metadata.author}")
+print(f"Page count: {result.metadata.page_count}")
+print(f"Created: {result.metadata.created_at}")
+```
+**Sync (for scripts and CLI tools):**
+```python
+from kreuzberg import extract_file_sync
+result = extract_file_sync("report.docx")
+print(result.content)
+# Access rich metadata
+print(f"Language: {result.metadata.language}")
+print(f"Word count: {result.metadata.word_count}")
+print(f"Keywords: {result.metadata.keywords}")
+```
+### Docker
+```bash
+# Run the REST API
+docker run -p 8000:8000 goldziher/kreuzberg
+# Extract via API
+curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
+```
+📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
+## Deployment Options
+### 🤖 MCP Server (AI Integration)
+**Add to Claude Desktop with one command:**
+```bash
+claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
+```
+**Or configure manually in `claude_desktop_config.json`:**
+```json
+{
+  "mcpServers": {
+    "kreuzberg": {
+      "command": "uvx",
+      "args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
+    }
+  }
+}
+```
+**MCP capabilities:**
+- Extract text from PDFs, images, Office docs, and more
+- Full OCR support with multiple engines
+- Table extraction and metadata parsing
+📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
+## Supported Formats
+| Category          | Formats                        |
+| ----------------- | ------------------------------ |
+| **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
+| **Presentations** | PPTX, PPT, ODP                 |
+| **Web**           | HTML, XML, MHTML               |
+| **Archives**      | Support via extraction         |
+## 📊 Performance Characteristics
+[View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
+### Technical Specifications
+| Metric                       | Kreuzberg Sync | Kreuzberg Async | Benchmarked        |
+| ---------------------------- | -------------- | --------------- | ------------------ |
+| **Throughput (tiny files)**  | 31.78 files/s  | 23.94 files/s   | Highest throughput |
+| **Throughput (small files)** | 8.91 files/s   | 9.31 files/s    | Highest throughput |
+| **Memory footprint**         | 359.8 MB       | 395.2 MB        | Lowest usage       |
+| **Installation size**        | 71 MB          | 71 MB           | Smallest size      |
+| **Success rate**             | 100%           | 100%            | Perfect            |
+| **Supported formats**        | 18             | 18              | Comprehensive      |
+### Architecture Advantages
+- **Native C extensions**: Built on PDFium and Tesseract for maximum performance
+- **Async/await support**: True asynchronous processing with intelligent task scheduling
+- **Memory efficiency**: Streaming architecture minimizes memory allocation
+- **Process pooling**: Automatic multiprocessing for CPU-intensive operations
+- **Optimized data flow**: Efficient data handling with minimal transformations
+> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
+## Documentation
+### Quick Links
+- [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
+- [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
+- [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
+- [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
+- [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
+- [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
+- [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
+- [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
+## License
+MIT License - see [LICENSE](LICENSE) for details.

kreuzberg-3.8.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,53 @@
+kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
+kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
+kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
+kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
+kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
+kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
+kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
+kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
+kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
+kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
+kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
+kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
+kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
+kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
+kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
+kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
+kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
+kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
+kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
+kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
+kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
+kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
+kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
+kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
+kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
+kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
+kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
+kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
+kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
+kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
+kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
+kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
+kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
+kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
+kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
+kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
+kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
+kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
+kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
+kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
+kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
+kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
+kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
+kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
+kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.8.2.dist-info/RECORD,,

kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl

kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl