kreuzberg 3.3.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_types.py +4 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/METADATA +63 -8
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/RECORD +8 -6
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
File without changes
|
kreuzberg/_api/main.py
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from json import dumps
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Any
|
5
|
+
|
6
|
+
from kreuzberg import (
|
7
|
+
ExtractionResult,
|
8
|
+
KreuzbergError,
|
9
|
+
MissingDependencyError,
|
10
|
+
ParsingError,
|
11
|
+
ValidationError,
|
12
|
+
batch_extract_bytes,
|
13
|
+
)
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from litestar.datastructures import UploadFile
|
17
|
+
|
18
|
+
try:
|
19
|
+
from litestar import Litestar, Request, Response, get, post
|
20
|
+
from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
|
21
|
+
from litestar.enums import RequestEncodingType
|
22
|
+
from litestar.logging import StructLoggingConfig
|
23
|
+
from litestar.params import Body
|
24
|
+
from litestar.status_codes import (
|
25
|
+
HTTP_400_BAD_REQUEST,
|
26
|
+
HTTP_422_UNPROCESSABLE_ENTITY,
|
27
|
+
HTTP_500_INTERNAL_SERVER_ERROR,
|
28
|
+
)
|
29
|
+
except ImportError as e:
|
30
|
+
raise MissingDependencyError.create_for_package(
|
31
|
+
dependency_group="litestar",
|
32
|
+
functionality="Litestar API and docker container",
|
33
|
+
package_name="litestar",
|
34
|
+
) from e
|
35
|
+
|
36
|
+
|
37
|
+
def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
|
38
|
+
if isinstance(exception, ValidationError):
|
39
|
+
status_code = HTTP_400_BAD_REQUEST
|
40
|
+
elif isinstance(exception, ParsingError):
|
41
|
+
status_code = HTTP_422_UNPROCESSABLE_ENTITY
|
42
|
+
else:
|
43
|
+
status_code = HTTP_500_INTERNAL_SERVER_ERROR
|
44
|
+
|
45
|
+
message = str(exception)
|
46
|
+
details = dumps(exception.context)
|
47
|
+
|
48
|
+
if request.app.logger:
|
49
|
+
request.app.logger.error(
|
50
|
+
"API error",
|
51
|
+
method=request.method,
|
52
|
+
url=str(request.url),
|
53
|
+
status_code=status_code,
|
54
|
+
message=message,
|
55
|
+
context=exception.context,
|
56
|
+
)
|
57
|
+
|
58
|
+
return Response(
|
59
|
+
content={"message": message, "details": details},
|
60
|
+
status_code=status_code,
|
61
|
+
)
|
62
|
+
|
63
|
+
|
64
|
+
@post("/extract", operation_id="ExtractFiles")
|
65
|
+
async def handle_files_upload(
|
66
|
+
data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
|
67
|
+
) -> list[ExtractionResult]:
|
68
|
+
"""Extracts text content from an uploaded file."""
|
69
|
+
return await batch_extract_bytes(
|
70
|
+
[(await file.read(), file.content_type) for file in data],
|
71
|
+
)
|
72
|
+
|
73
|
+
|
74
|
+
@get("/health", operation_id="HealthCheck")
|
75
|
+
async def health_check() -> dict[str, str]:
|
76
|
+
"""A simple health check endpoint."""
|
77
|
+
return {"status": "ok"}
|
78
|
+
|
79
|
+
|
80
|
+
app = Litestar(
|
81
|
+
route_handlers=[handle_files_upload, health_check],
|
82
|
+
plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
|
83
|
+
logging_config=StructLoggingConfig(),
|
84
|
+
exception_handlers={
|
85
|
+
KreuzbergError: exception_handler,
|
86
|
+
},
|
87
|
+
)
|
kreuzberg/_types.py
CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
|
|
114
114
|
chunks: list[str] = field(default_factory=list)
|
115
115
|
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
116
116
|
|
117
|
+
def to_dict(self) -> dict[str, Any]:
|
118
|
+
"""Converts the ExtractionResult to a dictionary."""
|
119
|
+
return asdict(self)
|
120
|
+
|
117
121
|
|
118
122
|
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
119
123
|
ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.3.0
|
3
|
+
Version: 3.4.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -34,12 +34,15 @@ Provides-Extra: all
|
|
34
34
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
35
35
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
36
36
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
37
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
|
37
38
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
38
39
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
39
40
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
40
41
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
41
42
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
42
43
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
44
|
+
Provides-Extra: api
|
45
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
|
43
46
|
Provides-Extra: chunking
|
44
47
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
45
48
|
Provides-Extra: cli
|
@@ -63,10 +66,14 @@ Description-Content-Type: text/markdown
|
|
63
66
|
[](https://goldziher.github.io/kreuzberg/)
|
64
67
|
[](https://opensource.org/licenses/MIT)
|
65
68
|
|
66
|
-
Kreuzberg is a Python library for text extraction from documents.
|
69
|
+
Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
|
67
70
|
|
68
71
|
## Why Kreuzberg?
|
69
72
|
|
73
|
+
- **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
|
74
|
+
- **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
|
75
|
+
- **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
|
76
|
+
- **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
|
70
77
|
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
71
78
|
- **Local Processing**: No external API calls or cloud dependencies required
|
72
79
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
@@ -85,6 +92,9 @@ pip install kreuzberg
|
|
85
92
|
|
86
93
|
# Or install with CLI support
|
87
94
|
pip install "kreuzberg[cli]"
|
95
|
+
|
96
|
+
# Or install with API server
|
97
|
+
pip install "kreuzberg[api]"
|
88
98
|
```
|
89
99
|
|
90
100
|
Install pandoc:
|
@@ -134,6 +144,31 @@ async def main():
|
|
134
144
|
asyncio.run(main())
|
135
145
|
```
|
136
146
|
|
147
|
+
## Docker
|
148
|
+
|
149
|
+
Docker images are available for easy deployment:
|
150
|
+
|
151
|
+
```bash
|
152
|
+
# Run the API server
|
153
|
+
docker run -p 8000:8000 goldziher/kreuzberg:latest
|
154
|
+
|
155
|
+
# Extract files via API
|
156
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
157
|
+
```
|
158
|
+
|
159
|
+
See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
|
160
|
+
|
161
|
+
## REST API
|
162
|
+
|
163
|
+
Run Kreuzberg as a REST API server:
|
164
|
+
|
165
|
+
```bash
|
166
|
+
pip install "kreuzberg[api]"
|
167
|
+
litestar --app kreuzberg._api.main:app run
|
168
|
+
```
|
169
|
+
|
170
|
+
See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
|
171
|
+
|
137
172
|
## Command Line Interface
|
138
173
|
|
139
174
|
Kreuzberg includes a powerful CLI for processing documents from the command line:
|
@@ -208,7 +243,31 @@ For comparison and selection guidance, see the [OCR Backends](https://goldziher.
|
|
208
243
|
|
209
244
|
## Performance
|
210
245
|
|
211
|
-
Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
|
246
|
+
Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
|
247
|
+
|
248
|
+
### 🏆 Competitive Benchmarks
|
249
|
+
|
250
|
+
[Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
|
251
|
+
|
252
|
+
- **Fastest Extraction**: Consistently fastest processing times across file formats
|
253
|
+
- **Lowest Memory Usage**: Most memory-efficient text extraction solution
|
254
|
+
- **100% Success Rate**: Reliable extraction across all tested document types
|
255
|
+
- **Optimal for High-Throughput**: Designed for real-time, production applications
|
256
|
+
|
257
|
+
### 💾 Installation Size Efficiency
|
258
|
+
|
259
|
+
Kreuzberg delivers maximum performance with minimal overhead:
|
260
|
+
|
261
|
+
1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
|
262
|
+
1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
|
263
|
+
1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
|
264
|
+
1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
|
265
|
+
|
266
|
+
**Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
|
267
|
+
|
268
|
+
### ⚡ Sync vs Async Performance
|
269
|
+
|
270
|
+
Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
|
212
271
|
|
213
272
|
| Operation | Sync Time | Async Time | Async Advantage |
|
214
273
|
| ---------------------- | --------- | ---------- | ------------------ |
|
@@ -218,11 +277,7 @@ Kreuzberg offers both sync and async APIs. Choose the right one based on your us
|
|
218
277
|
| OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
|
219
278
|
| Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
|
220
279
|
|
221
|
-
**Rule of thumb:**
|
222
|
-
|
223
|
-
- Use **sync** for simple documents and CLI applications
|
224
|
-
- Use **async** for complex PDFs, OCR, and batch processing
|
225
|
-
- Use **batch operations** for multiple files
|
280
|
+
**Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
|
226
281
|
|
227
282
|
For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
|
228
283
|
|
@@ -7,11 +7,13 @@ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
|
|
7
7
|
kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
|
8
8
|
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
9
9
|
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
10
|
-
kreuzberg/_types.py,sha256=
|
10
|
+
kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
|
11
11
|
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
12
12
|
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
13
13
|
kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
|
14
14
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
15
17
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
18
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
17
19
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
@@ -41,8 +43,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
|
|
41
43
|
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
42
44
|
kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
|
43
45
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
44
|
-
kreuzberg-3.
|
45
|
-
kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
46
|
-
kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
47
|
-
kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
48
|
-
kreuzberg-3.3.0.dist-info/RECORD,,
|
46
|
+
kreuzberg-3.4.0.dist-info/METADATA,sha256=Rg939xe9b-H0TExRcJKkf9MFg7-kWM_fvzGvV6VDG0Q,11215
|
47
|
+
kreuzberg-3.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
48
|
+
kreuzberg-3.4.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
49
|
+
kreuzberg-3.4.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
50
|
+
kreuzberg-3.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|