PyPI - kreuzberg - Versions diffs - 3.3.0__py3-none-any.whl → 3.4.0__py3-none-any.whl - Mend

kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

kreuzberg/_api/__init__.py ADDED Viewed

File without changes

kreuzberg/_api/main.py ADDED Viewed

@@ -0,0 +1,87 @@
+from __future__ import annotations
+from json import dumps
+from typing import TYPE_CHECKING, Annotated, Any
+from kreuzberg import (
+    ExtractionResult,
+    KreuzbergError,
+    MissingDependencyError,
+    ParsingError,
+    ValidationError,
+    batch_extract_bytes,
+)
+if TYPE_CHECKING:
+    from litestar.datastructures import UploadFile
+try:
+    from litestar import Litestar, Request, Response, get, post
+    from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
+    from litestar.enums import RequestEncodingType
+    from litestar.logging import StructLoggingConfig
+    from litestar.params import Body
+    from litestar.status_codes import (
+        HTTP_400_BAD_REQUEST,
+        HTTP_422_UNPROCESSABLE_ENTITY,
+        HTTP_500_INTERNAL_SERVER_ERROR,
+    )
+except ImportError as e:
+    raise MissingDependencyError.create_for_package(
+        dependency_group="litestar",
+        functionality="Litestar API and docker container",
+        package_name="litestar",
+    ) from e
+def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
+    if isinstance(exception, ValidationError):
+        status_code = HTTP_400_BAD_REQUEST
+    elif isinstance(exception, ParsingError):
+        status_code = HTTP_422_UNPROCESSABLE_ENTITY
+    else:
+        status_code = HTTP_500_INTERNAL_SERVER_ERROR
+    message = str(exception)
+    details = dumps(exception.context)
+    if request.app.logger:
+        request.app.logger.error(
+            "API error",
+            method=request.method,
+            url=str(request.url),
+            status_code=status_code,
+            message=message,
+            context=exception.context,
+        )
+    return Response(
+        content={"message": message, "details": details},
+        status_code=status_code,
+    )
+@post("/extract", operation_id="ExtractFiles")
+async def handle_files_upload(
+    data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
+) -> list[ExtractionResult]:
+    """Extracts text content from an uploaded file."""
+    return await batch_extract_bytes(
+        [(await file.read(), file.content_type) for file in data],
+    )
+@get("/health", operation_id="HealthCheck")
+async def health_check() -> dict[str, str]:
+    """A simple health check endpoint."""
+    return {"status": "ok"}
+app = Litestar(
+    route_handlers=[handle_files_upload, health_check],
+    plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
+    logging_config=StructLoggingConfig(),
+    exception_handlers={
+        KreuzbergError: exception_handler,
+    },
+)

kreuzberg/_types.py CHANGED Viewed

@@ -114,6 +114,10 @@ class ExtractionResult:
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    def to_dict(self) -> dict[str, Any]:
+        """Converts the ExtractionResult to a dictionary."""
+        return asdict(self)
 PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
 ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]

{kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.3.0
+Version: 3.4.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -34,12 +34,15 @@ Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
 Requires-Dist: rich>=14.0.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
+Provides-Extra: api
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -63,10 +66,14 @@ Description-Content-Type: text/markdown
 [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
+Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
 ## Why Kreuzberg?
+- **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
+- **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
+- **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
+- **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
@@ -85,6 +92,9 @@ pip install kreuzberg
 # Or install with CLI support
 pip install "kreuzberg[cli]"
+# Or install with API server
+pip install "kreuzberg[api]"
 ```
 Install pandoc:
@@ -134,6 +144,31 @@ async def main():
 asyncio.run(main())
 ```
+## Docker
+Docker images are available for easy deployment:
+```bash
+# Run the API server
+docker run -p 8000:8000 goldziher/kreuzberg:latest
+# Extract files via API
+curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
+```
+See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
+## REST API
+Run Kreuzberg as a REST API server:
+```bash
+pip install "kreuzberg[api]"
+litestar --app kreuzberg._api.main:app run
+```
+See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
 ## Command Line Interface
 Kreuzberg includes a powerful CLI for processing documents from the command line:
@@ -208,7 +243,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
 ## Performance
-Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
+Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
+### 🏆 Competitive Benchmarks
+[Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
+- **Fastest Extraction**: Consistently fastest processing times across file formats
+- **Lowest Memory Usage**: Most memory-efficient text extraction solution
+- **100% Success Rate**: Reliable extraction across all tested document types
+- **Optimal for High-Throughput**: Designed for real-time, production applications
+### 💾 Installation Size Efficiency
+Kreuzberg delivers maximum performance with minimal overhead:
+1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
+1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
+1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
+1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
+**Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
+### ⚡ Sync vs Async Performance
+Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
 | Operation              | Sync Time | Async Time | Async Advantage    |
 | ---------------------- | --------- | ---------- | ------------------ |
@@ -218,11 +277,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
 | OCR processing         | 0.4s      | 0.7s       | **✅ 1.7x faster** |
 | Batch operations       | 38.6s     | 8.5s       | **✅ 4.5x faster** |
-**Rule of thumb:**
-- Use **sync** for simple documents and CLI applications
-- Use **async** for complex PDFs, OCR, and batch processing
-- Use **batch operations** for multiple files
+**Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
 For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).

{kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/RECORD RENAMED Viewed

@@ -7,11 +7,13 @@ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
 kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
 kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
 kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
-kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
+kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
 kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
 kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
 kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
 kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
@@ -41,8 +43,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
 kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
 kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
 kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
-kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
-kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
-kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.3.0.dist-info/RECORD,,
+kreuzberg-3.4.0.dist-info/METADATA,sha256=Rg939xe9b-H0TExRcJKkf9MFg7-kWM_fvzGvV6VDG0Q,11215
+kreuzberg-3.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.4.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
+kreuzberg-3.4.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.4.0.dist-info/RECORD,,

{kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

kreuzberg 3.3.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.4.0-py3-none-any.whl