kreuzberg 3.3.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
kreuzberg/_api/main.py ADDED
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ from json import dumps
4
+ from typing import TYPE_CHECKING, Annotated, Any
5
+
6
+ from kreuzberg import (
7
+ ExtractionResult,
8
+ KreuzbergError,
9
+ MissingDependencyError,
10
+ ParsingError,
11
+ ValidationError,
12
+ batch_extract_bytes,
13
+ )
14
+
15
+ if TYPE_CHECKING:
16
+ from litestar.datastructures import UploadFile
17
+
18
+ try:
19
+ from litestar import Litestar, Request, Response, get, post
20
+ from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
21
+ from litestar.enums import RequestEncodingType
22
+ from litestar.logging import StructLoggingConfig
23
+ from litestar.params import Body
24
+ from litestar.status_codes import (
25
+ HTTP_400_BAD_REQUEST,
26
+ HTTP_422_UNPROCESSABLE_ENTITY,
27
+ HTTP_500_INTERNAL_SERVER_ERROR,
28
+ )
29
+ except ImportError as e:
30
+ raise MissingDependencyError.create_for_package(
31
+ dependency_group="litestar",
32
+ functionality="Litestar API and docker container",
33
+ package_name="litestar",
34
+ ) from e
35
+
36
+
37
def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
    """Convert a ``KreuzbergError`` into a JSON error response.

    The HTTP status is selected from the exception type: ``ValidationError``
    maps to ``HTTP_400_BAD_REQUEST``, ``ParsingError`` to
    ``HTTP_422_UNPROCESSABLE_ENTITY`` and every other ``KreuzbergError`` to
    ``HTTP_500_INTERNAL_SERVER_ERROR``. When the application has a logger,
    the failure is also logged with the request method, URL, status code,
    message and context.

    Args:
        request: The request during which the exception was raised.
        exception: The error to report to the client.

    Returns:
        A response whose body carries the error ``message`` and the
        JSON-encoded exception context under ``details``.
    """
    if isinstance(exception, ValidationError):
        status_code = HTTP_400_BAD_REQUEST
    elif isinstance(exception, ParsingError):
        status_code = HTTP_422_UNPROCESSABLE_ENTITY
    else:
        status_code = HTTP_500_INTERNAL_SERVER_ERROR

    message = str(exception)
    # Deliberate ``default=str`` fallback: nothing here constrains what the
    # context contains, and a TypeError raised while encoding it would replace
    # the error being reported with an unrelated failure inside the handler.
    details = dumps(exception.context, default=str)

    if request.app.logger:
        request.app.logger.error(
            "API error",
            method=request.method,
            url=str(request.url),
            status_code=status_code,
            message=message,
            context=exception.context,
        )

    return Response(
        content={"message": message, "details": details},
        status_code=status_code,
    )
62
+
63
+
64
@post("/extract", operation_id="ExtractFiles")
async def handle_files_upload(
    data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
) -> list[ExtractionResult]:
    """Extract content from every file in a multipart upload.

    Each uploaded file is read in full and paired with its declared content
    type; the pairs are handed to ``batch_extract_bytes`` in upload order and
    its result is returned unchanged.
    """
    payloads = []
    for upload in data:
        raw = await upload.read()
        payloads.append((raw, upload.content_type))
    return await batch_extract_bytes(payloads)
72
+
73
+
74
@get("/health", operation_id="HealthCheck")
async def health_check() -> dict[str, str]:
    """Answer liveness probes with a fixed ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return payload
78
+
79
+
80
# Application object served by the API: it registers the upload and health
# routes and routes every KreuzbergError through ``exception_handler``.
app = Litestar(
    route_handlers=[handle_files_upload, health_check],
    exception_handlers={KreuzbergError: exception_handler},
    plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
    logging_config=StructLoggingConfig(),
)
kreuzberg/_types.py CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
114
114
  chunks: list[str] = field(default_factory=list)
115
115
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
116
116
 
117
def to_dict(self) -> dict[str, Any]:
    """Return this result's fields as a dictionary produced by ``asdict``."""
    as_mapping: dict[str, Any] = asdict(self)
    return as_mapping
120
+
117
121
 
118
122
  PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
119
123
  ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.3.0
3
+ Version: 3.4.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -34,12 +34,15 @@ Provides-Extra: all
34
34
  Requires-Dist: click>=8.2.1; extra == 'all'
35
35
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
36
  Requires-Dist: gmft>=0.4.2; extra == 'all'
37
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
37
38
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
38
39
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
39
40
  Requires-Dist: rich>=14.0.0; extra == 'all'
40
41
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
41
42
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
42
43
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
44
+ Provides-Extra: api
45
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
43
46
  Provides-Extra: chunking
44
47
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
45
48
  Provides-Extra: cli
@@ -63,10 +66,14 @@ Description-Content-Type: text/markdown
63
66
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
64
67
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
65
68
 
66
- Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
69
+ Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
67
70
 
68
71
  ## Why Kreuzberg?
69
72
 
73
+ - **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
74
+ - **⚡ Unique Dual API**: The only library supporting both sync and async APIs for maximum flexibility
75
+ - **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
76
+ - **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
70
77
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
71
78
  - **Local Processing**: No external API calls or cloud dependencies required
72
79
  - **Resource Efficient**: Lightweight processing without GPU requirements
@@ -85,6 +92,9 @@ pip install kreuzberg
85
92
 
86
93
  # Or install with CLI support
87
94
  pip install "kreuzberg[cli]"
95
+
96
+ # Or install with API server
97
+ pip install "kreuzberg[api]"
88
98
  ```
89
99
 
90
100
  Install pandoc:
@@ -134,6 +144,31 @@ async def main():
134
144
  asyncio.run(main())
135
145
  ```
136
146
 
147
+ ## Docker
148
+
149
+ Docker images are available for easy deployment:
150
+
151
+ ```bash
152
+ # Run the API server
153
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
154
+
155
+ # Extract files via API
156
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
157
+ ```
158
+
159
+ See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
160
+
161
+ ## REST API
162
+
163
+ Run Kreuzberg as a REST API server:
164
+
165
+ ```bash
166
+ pip install "kreuzberg[api]"
167
+ litestar --app kreuzberg._api.main:app run
168
+ ```
169
+
170
+ See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
171
+
137
172
  ## Command Line Interface
138
173
 
139
174
  Kreuzberg includes a powerful CLI for processing documents from the command line:
@@ -208,7 +243,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
208
243
 
209
244
  ## Performance
210
245
 
211
- Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
246
+ Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
247
+
248
+ ### 🏆 Competitive Benchmarks
249
+
250
+ [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
251
+
252
+ - **Fastest Extraction**: Consistently fastest processing times across file formats
253
+ - **Lowest Memory Usage**: Most memory-efficient text extraction solution
254
+ - **100% Success Rate**: Reliable extraction across all tested document types
255
+ - **Optimal for High-Throughput**: Designed for real-time, production applications
256
+
257
+ ### 💾 Installation Size Efficiency
258
+
259
+ Kreuzberg delivers maximum performance with minimal overhead:
260
+
261
+ 1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
262
+ 1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
263
+ 1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
264
+ 1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
265
+
266
+ **Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
267
+
268
+ ### ⚡ Sync vs Async Performance
269
+
270
+ Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
212
271
 
213
272
  | Operation | Sync Time | Async Time | Async Advantage |
214
273
  | ---------------------- | --------- | ---------- | ------------------ |
@@ -218,11 +277,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
218
277
  | OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
219
278
  | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
220
279
 
221
- **Rule of thumb:**
222
-
223
- - Use **sync** for simple documents and CLI applications
224
- - Use **async** for complex PDFs, OCR, and batch processing
225
- - Use **batch operations** for multiple files
280
+ **Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
226
281
 
227
282
  For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
228
283
 
@@ -7,11 +7,13 @@ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
7
7
  kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
8
  kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
9
  kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
- kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
10
+ kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
11
11
  kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
12
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
13
  kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
14
14
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
15
17
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
18
  kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
17
19
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
@@ -41,8 +43,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
41
43
  kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
42
44
  kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
43
45
  kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
44
- kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
45
- kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
46
- kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
47
- kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
48
- kreuzberg-3.3.0.dist-info/RECORD,,
46
+ kreuzberg-3.4.0.dist-info/METADATA,sha256=Rg939xe9b-H0TExRcJKkf9MFg7-kWM_fvzGvV6VDG0Q,11215
47
+ kreuzberg-3.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
+ kreuzberg-3.4.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
49
+ kreuzberg-3.4.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
50
+ kreuzberg-3.4.0.dist-info/RECORD,,