kreuzberg 3.3.0__py3-none-any.whl → 3.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -1
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_types.py +4 -0
- kreuzberg-3.4.1.dist-info/METADATA +233 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.1.dist-info}/RECORD +9 -7
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.4.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
from importlib.metadata import version
|
2
|
+
|
1
3
|
from kreuzberg._gmft import GMFTConfig
|
2
4
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
3
5
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
@@ -18,7 +20,7 @@ from .extraction import (
|
|
18
20
|
extract_file_sync,
|
19
21
|
)
|
20
22
|
|
21
|
-
__version__ = "
|
23
|
+
__version__ = version("kreuzberg")
|
22
24
|
|
23
25
|
__all__ = [
|
24
26
|
"EasyOCRConfig",
|
File without changes
|
kreuzberg/_api/main.py
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from json import dumps
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Any
|
5
|
+
|
6
|
+
from kreuzberg import (
|
7
|
+
ExtractionResult,
|
8
|
+
KreuzbergError,
|
9
|
+
MissingDependencyError,
|
10
|
+
ParsingError,
|
11
|
+
ValidationError,
|
12
|
+
batch_extract_bytes,
|
13
|
+
)
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from litestar.datastructures import UploadFile
|
17
|
+
|
18
|
+
try:
|
19
|
+
from litestar import Litestar, Request, Response, get, post
|
20
|
+
from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
|
21
|
+
from litestar.enums import RequestEncodingType
|
22
|
+
from litestar.logging import StructLoggingConfig
|
23
|
+
from litestar.params import Body
|
24
|
+
from litestar.status_codes import (
|
25
|
+
HTTP_400_BAD_REQUEST,
|
26
|
+
HTTP_422_UNPROCESSABLE_ENTITY,
|
27
|
+
HTTP_500_INTERNAL_SERVER_ERROR,
|
28
|
+
)
|
29
|
+
except ImportError as e:
|
30
|
+
raise MissingDependencyError.create_for_package(
|
31
|
+
dependency_group="litestar",
|
32
|
+
functionality="Litestar API and docker container",
|
33
|
+
package_name="litestar",
|
34
|
+
) from e
|
35
|
+
|
36
|
+
|
37
|
+
def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
|
38
|
+
if isinstance(exception, ValidationError):
|
39
|
+
status_code = HTTP_400_BAD_REQUEST
|
40
|
+
elif isinstance(exception, ParsingError):
|
41
|
+
status_code = HTTP_422_UNPROCESSABLE_ENTITY
|
42
|
+
else:
|
43
|
+
status_code = HTTP_500_INTERNAL_SERVER_ERROR
|
44
|
+
|
45
|
+
message = str(exception)
|
46
|
+
details = dumps(exception.context)
|
47
|
+
|
48
|
+
if request.app.logger:
|
49
|
+
request.app.logger.error(
|
50
|
+
"API error",
|
51
|
+
method=request.method,
|
52
|
+
url=str(request.url),
|
53
|
+
status_code=status_code,
|
54
|
+
message=message,
|
55
|
+
context=exception.context,
|
56
|
+
)
|
57
|
+
|
58
|
+
return Response(
|
59
|
+
content={"message": message, "details": details},
|
60
|
+
status_code=status_code,
|
61
|
+
)
|
62
|
+
|
63
|
+
|
64
|
+
@post("/extract", operation_id="ExtractFiles")
|
65
|
+
async def handle_files_upload(
|
66
|
+
data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
|
67
|
+
) -> list[ExtractionResult]:
|
68
|
+
"""Extracts text content from an uploaded file."""
|
69
|
+
return await batch_extract_bytes(
|
70
|
+
[(await file.read(), file.content_type) for file in data],
|
71
|
+
)
|
72
|
+
|
73
|
+
|
74
|
+
@get("/health", operation_id="HealthCheck")
|
75
|
+
async def health_check() -> dict[str, str]:
|
76
|
+
"""A simple health check endpoint."""
|
77
|
+
return {"status": "ok"}
|
78
|
+
|
79
|
+
|
80
|
+
app = Litestar(
|
81
|
+
route_handlers=[handle_files_upload, health_check],
|
82
|
+
plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
|
83
|
+
logging_config=StructLoggingConfig(),
|
84
|
+
exception_handlers={
|
85
|
+
KreuzbergError: exception_handler,
|
86
|
+
},
|
87
|
+
)
|
kreuzberg/_types.py
CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
|
|
114
114
|
chunks: list[str] = field(default_factory=list)
|
115
115
|
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
116
116
|
|
117
|
+
def to_dict(self) -> dict[str, Any]:
|
118
|
+
"""Converts the ExtractionResult to a dictionary."""
|
119
|
+
return asdict(self)
|
120
|
+
|
117
121
|
|
118
122
|
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
119
123
|
ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
|
@@ -0,0 +1,233 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.4.1
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
|
+
License: MIT
|
8
|
+
License-File: LICENSE
|
9
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
11
|
+
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Operating System :: OS Independent
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
23
|
+
Classifier: Topic :: Utilities
|
24
|
+
Classifier: Typing :: Typed
|
25
|
+
Requires-Python: >=3.9
|
26
|
+
Requires-Dist: anyio>=4.9.0
|
27
|
+
Requires-Dist: charset-normalizer>=3.4.2
|
28
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
29
|
+
Requires-Dist: html-to-markdown>=1.4.0
|
30
|
+
Requires-Dist: msgspec>=0.18.0
|
31
|
+
Requires-Dist: playa-pdf>=0.6.1
|
32
|
+
Requires-Dist: psutil>=7.0.0
|
33
|
+
Requires-Dist: pypdfium2==4.30.0
|
34
|
+
Requires-Dist: python-calamine>=0.3.2
|
35
|
+
Requires-Dist: python-pptx>=1.0.2
|
36
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: all
|
38
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
40
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
41
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
|
42
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
43
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
44
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
45
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
46
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
47
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
48
|
+
Provides-Extra: api
|
49
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
|
50
|
+
Provides-Extra: chunking
|
51
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
52
|
+
Provides-Extra: cli
|
53
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
54
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
55
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
56
|
+
Provides-Extra: easyocr
|
57
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
58
|
+
Provides-Extra: gmft
|
59
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
60
|
+
Provides-Extra: paddleocr
|
61
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
62
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
63
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
64
|
+
Description-Content-Type: text/markdown
|
65
|
+
|
66
|
+
# Kreuzberg
|
67
|
+
|
68
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
69
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
70
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
71
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
72
|
+
|
73
|
+
**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
|
74
|
+
|
75
|
+
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
76
|
+
|
77
|
+
## Why Kreuzberg?
|
78
|
+
|
79
|
+
- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
|
80
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
81
|
+
- **⚡ Dual APIs**: Only library with both sync and async support
|
82
|
+
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
83
|
+
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
84
|
+
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
85
|
+
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
86
|
+
- **🐳 Production Ready**: CLI, REST API, and Docker images included
|
87
|
+
|
88
|
+
## Quick Start
|
89
|
+
|
90
|
+
### Installation
|
91
|
+
|
92
|
+
```bash
|
93
|
+
# Basic installation
|
94
|
+
pip install kreuzberg
|
95
|
+
|
96
|
+
# With optional features
|
97
|
+
pip install "kreuzberg[cli,api]" # CLI + REST API
|
98
|
+
pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
|
99
|
+
pip install "kreuzberg[all]" # Everything
|
100
|
+
```
|
101
|
+
|
102
|
+
### System Dependencies
|
103
|
+
|
104
|
+
```bash
|
105
|
+
# Ubuntu/Debian
|
106
|
+
sudo apt-get install tesseract-ocr pandoc
|
107
|
+
|
108
|
+
# macOS
|
109
|
+
brew install tesseract pandoc
|
110
|
+
|
111
|
+
# Windows
|
112
|
+
choco install tesseract pandoc
|
113
|
+
```
|
114
|
+
|
115
|
+
### Basic Usage
|
116
|
+
|
117
|
+
```python
|
118
|
+
import asyncio
|
119
|
+
from kreuzberg import extract_file
|
120
|
+
|
121
|
+
async def main():
|
122
|
+
# Extract from any document type
|
123
|
+
result = await extract_file("document.pdf")
|
124
|
+
print(result.content)
|
125
|
+
print(result.metadata)
|
126
|
+
|
127
|
+
asyncio.run(main())
|
128
|
+
```
|
129
|
+
|
130
|
+
## Deployment Options
|
131
|
+
|
132
|
+
### 🐳 Docker (Recommended)
|
133
|
+
|
134
|
+
```bash
|
135
|
+
# Run API server
|
136
|
+
docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
|
137
|
+
|
138
|
+
# Extract files
|
139
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
140
|
+
```
|
141
|
+
|
142
|
+
Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
|
143
|
+
|
144
|
+
### 🌐 REST API
|
145
|
+
|
146
|
+
```bash
|
147
|
+
# Install and run
|
148
|
+
pip install "kreuzberg[api]"
|
149
|
+
litestar --app kreuzberg._api.main:app run
|
150
|
+
|
151
|
+
# Health check
|
152
|
+
curl http://localhost:8000/health
|
153
|
+
|
154
|
+
# Extract files
|
155
|
+
curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
|
156
|
+
```
|
157
|
+
|
158
|
+
### 💻 Command Line
|
159
|
+
|
160
|
+
```bash
|
161
|
+
# Install CLI
|
162
|
+
pip install "kreuzberg[cli]"
|
163
|
+
|
164
|
+
# Extract to stdout
|
165
|
+
kreuzberg extract document.pdf
|
166
|
+
|
167
|
+
# JSON output with metadata
|
168
|
+
kreuzberg extract document.pdf --output-format json --show-metadata
|
169
|
+
|
170
|
+
# Batch processing
|
171
|
+
kreuzberg extract *.pdf --output-dir ./extracted/
|
172
|
+
```
|
173
|
+
|
174
|
+
## Supported Formats
|
175
|
+
|
176
|
+
| Category | Formats |
|
177
|
+
| ----------------- | ------------------------------ |
|
178
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
179
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
180
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
181
|
+
| **Presentations** | PPTX, PPT, ODP |
|
182
|
+
| **Web** | HTML, XML, MHTML |
|
183
|
+
| **Archives** | Support via extraction |
|
184
|
+
|
185
|
+
## Performance
|
186
|
+
|
187
|
+
**Fastest extraction speeds** with minimal resource usage:
|
188
|
+
|
189
|
+
| Library | Speed | Memory | Size | Success Rate |
|
190
|
+
| ------------- | -------------- | ------------- | ----------- | ------------ |
|
191
|
+
| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
|
192
|
+
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
193
|
+
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
194
|
+
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
195
|
+
|
196
|
+
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
197
|
+
|
198
|
+
## Documentation
|
199
|
+
|
200
|
+
### Quick Links
|
201
|
+
|
202
|
+
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
203
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
204
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
205
|
+
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
206
|
+
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
207
|
+
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
208
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
209
|
+
|
210
|
+
## Advanced Features
|
211
|
+
|
212
|
+
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
213
|
+
- **🧩 Content Chunking**: Split documents for RAG applications
|
214
|
+
- **🎯 Custom Extractors**: Extend with your own document handlers
|
215
|
+
- **🔧 Configuration**: Flexible TOML-based configuration
|
216
|
+
- **🪝 Hooks**: Pre/post-processing customization
|
217
|
+
- **🌍 Multi-language OCR**: 100+ languages supported
|
218
|
+
- **⚙️ Metadata Extraction**: Rich document metadata
|
219
|
+
- **🔄 Batch Processing**: Efficient bulk document processing
|
220
|
+
|
221
|
+
## License
|
222
|
+
|
223
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
224
|
+
|
225
|
+
______________________________________________________________________
|
226
|
+
|
227
|
+
<div align="center">
|
228
|
+
|
229
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
230
|
+
|
231
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
232
|
+
|
233
|
+
</div>
|
@@ -1,4 +1,4 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=
|
1
|
+
kreuzberg/__init__.py,sha256=5GP2j8PI3P_ZNSEhLpm8iqseY3i4nye6iUmVGUnfzno,1311
|
2
2
|
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
3
|
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
4
|
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
@@ -7,11 +7,13 @@ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
|
|
7
7
|
kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
|
8
8
|
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
9
9
|
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
10
|
-
kreuzberg/_types.py,sha256=
|
10
|
+
kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
|
11
11
|
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
12
12
|
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
13
13
|
kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
|
14
14
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
15
17
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
18
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
17
19
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
@@ -41,8 +43,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
|
|
41
43
|
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
42
44
|
kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
|
43
45
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
44
|
-
kreuzberg-3.
|
45
|
-
kreuzberg-3.
|
46
|
-
kreuzberg-3.
|
47
|
-
kreuzberg-3.
|
48
|
-
kreuzberg-3.
|
46
|
+
kreuzberg-3.4.1.dist-info/METADATA,sha256=g3DwLXNiDzvPDBApPnDp3BeZ4SbVN0NTrEzN9cyKy34,8751
|
47
|
+
kreuzberg-3.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
48
|
+
kreuzberg-3.4.1.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
49
|
+
kreuzberg-3.4.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
50
|
+
kreuzberg-3.4.1.dist-info/RECORD,,
|
@@ -1,235 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: kreuzberg
|
3
|
-
Version: 3.3.0
|
4
|
-
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
-
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
|
-
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
|
-
License: MIT
|
8
|
-
License-File: LICENSE
|
9
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
10
|
-
Classifier: Development Status :: 4 - Beta
|
11
|
-
Classifier: Intended Audience :: Developers
|
12
|
-
Classifier: License :: OSI Approved :: MIT License
|
13
|
-
Classifier: Operating System :: OS Independent
|
14
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
15
|
-
Classifier: Programming Language :: Python :: 3.13
|
16
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
18
|
-
Classifier: Topic :: Text Processing :: General
|
19
|
-
Classifier: Topic :: Utilities
|
20
|
-
Classifier: Typing :: Typed
|
21
|
-
Requires-Python: >=3.13
|
22
|
-
Requires-Dist: anyio>=4.9.0
|
23
|
-
Requires-Dist: charset-normalizer>=3.4.2
|
24
|
-
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
25
|
-
Requires-Dist: html-to-markdown>=1.4.0
|
26
|
-
Requires-Dist: msgspec>=0.18.0
|
27
|
-
Requires-Dist: playa-pdf>=0.6.1
|
28
|
-
Requires-Dist: psutil>=7.0.0
|
29
|
-
Requires-Dist: pypdfium2==4.30.0
|
30
|
-
Requires-Dist: python-calamine>=0.3.2
|
31
|
-
Requires-Dist: python-pptx>=1.0.2
|
32
|
-
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
33
|
-
Provides-Extra: all
|
34
|
-
Requires-Dist: click>=8.2.1; extra == 'all'
|
35
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
36
|
-
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
37
|
-
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
38
|
-
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
39
|
-
Requires-Dist: rich>=14.0.0; extra == 'all'
|
40
|
-
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
41
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
42
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
43
|
-
Provides-Extra: chunking
|
44
|
-
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
45
|
-
Provides-Extra: cli
|
46
|
-
Requires-Dist: click>=8.2.1; extra == 'cli'
|
47
|
-
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
48
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
49
|
-
Provides-Extra: easyocr
|
50
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
51
|
-
Provides-Extra: gmft
|
52
|
-
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
53
|
-
Provides-Extra: paddleocr
|
54
|
-
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
55
|
-
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
56
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
57
|
-
Description-Content-Type: text/markdown
|
58
|
-
|
59
|
-
# Kreuzberg
|
60
|
-
|
61
|
-
[Discord](https://discord.gg/pXxagNK2zN)
|
62
|
-
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
63
|
-
[Documentation](https://goldziher.github.io/kreuzberg/)
|
64
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
65
|
-
|
66
|
-
Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
|
67
|
-
|
68
|
-
## Why Kreuzberg?
|
69
|
-
|
70
|
-
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
71
|
-
- **Local Processing**: No external API calls or cloud dependencies required
|
72
|
-
- **Resource Efficient**: Lightweight processing without GPU requirements
|
73
|
-
- **Format Support**: Comprehensive support for documents, images, and text formats
|
74
|
-
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
75
|
-
- **Command Line Interface**: Powerful CLI for batch processing and automation
|
76
|
-
- **Metadata Extraction**: Get document metadata alongside text content
|
77
|
-
- **Table Extraction**: Extract tables from documents using the excellent GMFT library
|
78
|
-
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
79
|
-
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
80
|
-
|
81
|
-
## Quick Start
|
82
|
-
|
83
|
-
```bash
|
84
|
-
pip install kreuzberg
|
85
|
-
|
86
|
-
# Or install with CLI support
|
87
|
-
pip install "kreuzberg[cli]"
|
88
|
-
```
|
89
|
-
|
90
|
-
Install the system dependencies (Tesseract and Pandoc):
|
91
|
-
|
92
|
-
```bash
|
93
|
-
# Ubuntu/Debian
|
94
|
-
sudo apt-get install tesseract-ocr pandoc
|
95
|
-
|
96
|
-
# macOS
|
97
|
-
brew install tesseract pandoc
|
98
|
-
|
99
|
-
# Windows
|
100
|
-
choco install -y tesseract pandoc
|
101
|
-
```
|
102
|
-
|
103
|
-
Tesseract is the default OCR engine. You can opt out of it and either use one of the two alternative OCR engines or run without OCR at all.
|
104
|
-
|
105
|
-
### Alternative OCR engines
|
106
|
-
|
107
|
-
```bash
|
108
|
-
# Install with EasyOCR support
|
109
|
-
pip install "kreuzberg[easyocr]"
|
110
|
-
|
111
|
-
# Install with PaddleOCR support
|
112
|
-
pip install "kreuzberg[paddleocr]"
|
113
|
-
```
|
114
|
-
|
115
|
-
## Quick Example
|
116
|
-
|
117
|
-
```python
|
118
|
-
import asyncio
|
119
|
-
from kreuzberg import extract_file
|
120
|
-
|
121
|
-
async def main():
|
122
|
-
# Extract text from a PDF
|
123
|
-
result = await extract_file("document.pdf")
|
124
|
-
print(result.content)
|
125
|
-
|
126
|
-
# Extract text from an image
|
127
|
-
result = await extract_file("scan.jpg")
|
128
|
-
print(result.content)
|
129
|
-
|
130
|
-
# Extract text from a Word document
|
131
|
-
result = await extract_file("report.docx")
|
132
|
-
print(result.content)
|
133
|
-
|
134
|
-
asyncio.run(main())
|
135
|
-
```
|
136
|
-
|
137
|
-
## Command Line Interface
|
138
|
-
|
139
|
-
Kreuzberg includes a powerful CLI for processing documents from the command line:
|
140
|
-
|
141
|
-
```bash
|
142
|
-
# Extract text from a file
|
143
|
-
kreuzberg extract document.pdf
|
144
|
-
|
145
|
-
# Extract with JSON output and metadata
|
146
|
-
kreuzberg extract document.pdf --output-format json --show-metadata
|
147
|
-
|
148
|
-
# Extract from stdin
|
149
|
-
cat document.html | kreuzberg extract
|
150
|
-
|
151
|
-
# Use specific OCR backend
|
152
|
-
kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
|
153
|
-
|
154
|
-
# Extract with configuration file
|
155
|
-
kreuzberg extract document.pdf --config config.toml
|
156
|
-
```
|
157
|
-
|
158
|
-
### CLI Configuration
|
159
|
-
|
160
|
-
Configure via `pyproject.toml`:
|
161
|
-
|
162
|
-
```toml
|
163
|
-
[tool.kreuzberg]
|
164
|
-
force_ocr = true
|
165
|
-
chunk_content = false
|
166
|
-
extract_tables = true
|
167
|
-
max_chars = 4000
|
168
|
-
ocr_backend = "tesseract"
|
169
|
-
|
170
|
-
[tool.kreuzberg.tesseract]
|
171
|
-
language = "eng+deu"
|
172
|
-
psm = 3
|
173
|
-
```
|
174
|
-
|
175
|
-
For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
|
176
|
-
|
177
|
-
## Documentation
|
178
|
-
|
179
|
-
For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
|
180
|
-
|
181
|
-
- [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
|
182
|
-
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
|
183
|
-
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
|
184
|
-
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
|
185
|
-
- [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
|
186
|
-
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
|
187
|
-
- [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
|
188
|
-
|
189
|
-
## Supported Formats
|
190
|
-
|
191
|
-
Kreuzberg supports a wide range of document formats:
|
192
|
-
|
193
|
-
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
194
|
-
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
195
|
-
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
196
|
-
- **Presentations**: PPTX, PPT, etc.
|
197
|
-
- **Web Content**: HTML, XML, etc.
|
198
|
-
|
199
|
-
## OCR Engines
|
200
|
-
|
201
|
-
Kreuzberg supports multiple OCR engines:
|
202
|
-
|
203
|
-
- **Tesseract** (Default): Lightweight, fast startup, requires system installation
|
204
|
-
- **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
|
205
|
-
- **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
|
206
|
-
|
207
|
-
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
208
|
-
|
209
|
-
## Performance
|
210
|
-
|
211
|
-
Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
|
212
|
-
|
213
|
-
| Operation | Sync Time | Async Time | Async Advantage |
|
214
|
-
| ---------------------- | --------- | ---------- | ------------------ |
|
215
|
-
| Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
|
216
|
-
| HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
|
217
|
-
| Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
|
218
|
-
| OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
|
219
|
-
| Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
|
220
|
-
|
221
|
-
**Rule of thumb:**
|
222
|
-
|
223
|
-
- Use **sync** for simple documents and CLI applications
|
224
|
-
- Use **async** for complex PDFs, OCR, and batch processing
|
225
|
-
- Use **batch operations** for multiple files
|
226
|
-
|
227
|
-
For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
|
228
|
-
|
229
|
-
## Contributing
|
230
|
-
|
231
|
-
We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
|
232
|
-
|
233
|
-
## License
|
234
|
-
|
235
|
-
This library is released under the MIT license.
|
File without changes
|
File without changes
|
File without changes
|