kreuzberg 3.3.0__py3-none-any.whl → 3.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,3 +1,5 @@
1
+ from importlib.metadata import version
2
+
1
3
  from kreuzberg._gmft import GMFTConfig
2
4
  from kreuzberg._ocr._easyocr import EasyOCRConfig
3
5
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
@@ -18,7 +20,7 @@ from .extraction import (
18
20
  extract_file_sync,
19
21
  )
20
22
 
21
- __version__ = "3.2.0"
23
+ __version__ = version("kreuzberg")
22
24
 
23
25
  __all__ = [
24
26
  "EasyOCRConfig",
File without changes
kreuzberg/_api/main.py ADDED
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ from json import dumps
4
+ from typing import TYPE_CHECKING, Annotated, Any
5
+
6
+ from kreuzberg import (
7
+ ExtractionResult,
8
+ KreuzbergError,
9
+ MissingDependencyError,
10
+ ParsingError,
11
+ ValidationError,
12
+ batch_extract_bytes,
13
+ )
14
+
15
+ if TYPE_CHECKING:
16
+ from litestar.datastructures import UploadFile
17
+
18
+ try:
19
+ from litestar import Litestar, Request, Response, get, post
20
+ from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
21
+ from litestar.enums import RequestEncodingType
22
+ from litestar.logging import StructLoggingConfig
23
+ from litestar.params import Body
24
+ from litestar.status_codes import (
25
+ HTTP_400_BAD_REQUEST,
26
+ HTTP_422_UNPROCESSABLE_ENTITY,
27
+ HTTP_500_INTERNAL_SERVER_ERROR,
28
+ )
29
+ except ImportError as e:
30
+ raise MissingDependencyError.create_for_package(
31
+ dependency_group="litestar",
32
+ functionality="Litestar API and docker container",
33
+ package_name="litestar",
34
+ ) from e
35
+
36
+
37
+ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
38
+ if isinstance(exception, ValidationError):
39
+ status_code = HTTP_400_BAD_REQUEST
40
+ elif isinstance(exception, ParsingError):
41
+ status_code = HTTP_422_UNPROCESSABLE_ENTITY
42
+ else:
43
+ status_code = HTTP_500_INTERNAL_SERVER_ERROR
44
+
45
+ message = str(exception)
46
+ details = dumps(exception.context)
47
+
48
+ if request.app.logger:
49
+ request.app.logger.error(
50
+ "API error",
51
+ method=request.method,
52
+ url=str(request.url),
53
+ status_code=status_code,
54
+ message=message,
55
+ context=exception.context,
56
+ )
57
+
58
+ return Response(
59
+ content={"message": message, "details": details},
60
+ status_code=status_code,
61
+ )
62
+
63
+
64
+ @post("/extract", operation_id="ExtractFiles")
65
+ async def handle_files_upload(
66
+ data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
67
+ ) -> list[ExtractionResult]:
68
+ """Extracts text content from an uploaded file."""
69
+ return await batch_extract_bytes(
70
+ [(await file.read(), file.content_type) for file in data],
71
+ )
72
+
73
+
74
+ @get("/health", operation_id="HealthCheck")
75
+ async def health_check() -> dict[str, str]:
76
+ """A simple health check endpoint."""
77
+ return {"status": "ok"}
78
+
79
+
80
+ app = Litestar(
81
+ route_handlers=[handle_files_upload, health_check],
82
+ plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
83
+ logging_config=StructLoggingConfig(),
84
+ exception_handlers={
85
+ KreuzbergError: exception_handler,
86
+ },
87
+ )
kreuzberg/_types.py CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
114
114
  chunks: list[str] = field(default_factory=list)
115
115
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
116
116
 
117
+ def to_dict(self) -> dict[str, Any]:
118
+ """Converts the ExtractionResult to a dictionary."""
119
+ return asdict(self)
120
+
117
121
 
118
122
  PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
119
123
  ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.4.1
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Text Processing :: General
23
+ Classifier: Topic :: Utilities
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.9
26
+ Requires-Dist: anyio>=4.9.0
27
+ Requires-Dist: charset-normalizer>=3.4.2
28
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
29
+ Requires-Dist: html-to-markdown>=1.4.0
30
+ Requires-Dist: msgspec>=0.18.0
31
+ Requires-Dist: playa-pdf>=0.6.1
32
+ Requires-Dist: psutil>=7.0.0
33
+ Requires-Dist: pypdfium2==4.30.0
34
+ Requires-Dist: python-calamine>=0.3.2
35
+ Requires-Dist: python-pptx>=1.0.2
36
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
37
+ Provides-Extra: all
38
+ Requires-Dist: click>=8.2.1; extra == 'all'
39
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
40
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
41
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
42
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
43
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
44
+ Requires-Dist: rich>=14.0.0; extra == 'all'
45
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
46
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
47
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
48
+ Provides-Extra: api
49
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
50
+ Provides-Extra: chunking
51
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
52
+ Provides-Extra: cli
53
+ Requires-Dist: click>=8.2.1; extra == 'cli'
54
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
55
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
56
+ Provides-Extra: easyocr
57
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
58
+ Provides-Extra: gmft
59
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
60
+ Provides-Extra: paddleocr
61
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
62
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
63
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
64
+ Description-Content-Type: text/markdown
65
+
66
+ # Kreuzberg
67
+
68
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
69
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
70
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
71
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
72
+
73
+ **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
74
+
75
+ 📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
76
+
77
+ ## Why Kreuzberg?
78
+
79
+ - **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
80
+ - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
81
+ - **⚡ Dual APIs**: Only library with both sync and async support
82
+ - **🔧 Zero Configuration**: Works out of the box with sane defaults
83
+ - **🏠 Local Processing**: No cloud dependencies or external API calls
84
+ - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
85
+ - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
86
+ - **🐳 Production Ready**: CLI, REST API, and Docker images included
87
+
88
+ ## Quick Start
89
+
90
+ ### Installation
91
+
92
+ ```bash
93
+ # Basic installation
94
+ pip install kreuzberg
95
+
96
+ # With optional features
97
+ pip install "kreuzberg[cli,api]" # CLI + REST API
98
+ pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
99
+ pip install "kreuzberg[all]" # Everything
100
+ ```
101
+
102
+ ### System Dependencies
103
+
104
+ ```bash
105
+ # Ubuntu/Debian
106
+ sudo apt-get install tesseract-ocr pandoc
107
+
108
+ # macOS
109
+ brew install tesseract pandoc
110
+
111
+ # Windows
112
+ choco install tesseract pandoc
113
+ ```
114
+
115
+ ### Basic Usage
116
+
117
+ ```python
118
+ import asyncio
119
+ from kreuzberg import extract_file
120
+
121
+ async def main():
122
+ # Extract from any document type
123
+ result = await extract_file("document.pdf")
124
+ print(result.content)
125
+ print(result.metadata)
126
+
127
+ asyncio.run(main())
128
+ ```
129
+
130
+ ## Deployment Options
131
+
132
+ ### 🐳 Docker (Recommended)
133
+
134
+ ```bash
135
+ # Run API server
136
+ docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
137
+
138
+ # Extract files
139
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
140
+ ```
141
+
142
+ Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
143
+
144
+ ### 🌐 REST API
145
+
146
+ ```bash
147
+ # Install and run
148
+ pip install "kreuzberg[api]"
149
+ litestar --app kreuzberg._api.main:app run
150
+
151
+ # Health check
152
+ curl http://localhost:8000/health
153
+
154
+ # Extract files
155
+ curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
156
+ ```
157
+
158
+ ### 💻 Command Line
159
+
160
+ ```bash
161
+ # Install CLI
162
+ pip install "kreuzberg[cli]"
163
+
164
+ # Extract to stdout
165
+ kreuzberg extract document.pdf
166
+
167
+ # JSON output with metadata
168
+ kreuzberg extract document.pdf --output-format json --show-metadata
169
+
170
+ # Batch processing
171
+ kreuzberg extract *.pdf --output-dir ./extracted/
172
+ ```
173
+
174
+ ## Supported Formats
175
+
176
+ | Category | Formats |
177
+ | ----------------- | ------------------------------ |
178
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
179
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
180
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
181
+ | **Presentations** | PPTX, PPT, ODP |
182
+ | **Web** | HTML, XML, MHTML |
183
+ | **Archives** | Support via extraction |
184
+
185
+ ## Performance
186
+
187
+ **Fastest extraction speeds** with minimal resource usage:
188
+
189
+ | Library | Speed | Memory | Size | Success Rate |
190
+ | ------------- | -------------- | ------------- | ----------- | ------------ |
191
+ | **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
192
+ | Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
193
+ | MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
194
+ | Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
195
+
196
+ > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
197
+
198
+ ## Documentation
199
+
200
+ ### Quick Links
201
+
202
+ - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
203
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
204
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
205
+ - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
206
+ - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
207
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
208
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
209
+
210
+ ## Advanced Features
211
+
212
+ - **📊 Table Extraction**: Extract tables from PDFs with GMFT
213
+ - **🧩 Content Chunking**: Split documents for RAG applications
214
+ - **🎯 Custom Extractors**: Extend with your own document handlers
215
+ - **🔧 Configuration**: Flexible TOML-based configuration
216
+ - **🪝 Hooks**: Pre/post-processing customization
217
+ - **🌍 Multi-language OCR**: 100+ languages supported
218
+ - **⚙️ Metadata Extraction**: Rich document metadata
219
+ - **🔄 Batch Processing**: Efficient bulk document processing
220
+
221
+ ## License
222
+
223
+ MIT License - see [LICENSE](LICENSE) for details.
224
+
225
+ ______________________________________________________________________
226
+
227
+ <div align="center">
228
+
229
+ **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
230
+
231
+ Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
232
+
233
+ </div>
@@ -1,4 +1,4 @@
1
- kreuzberg/__init__.py,sha256=jRm2U-loiKWwJpgOFgZ8Ev2mfz9sI1qJOZ2V3OoJUlg,1258
1
+ kreuzberg/__init__.py,sha256=5GP2j8PI3P_ZNSEhLpm8iqseY3i4nye6iUmVGUnfzno,1311
2
2
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
3
  kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
4
  kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
@@ -7,11 +7,13 @@ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
7
7
  kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
8
  kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
9
  kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
- kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
10
+ kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
11
11
  kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
12
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
13
  kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
14
14
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
15
17
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
18
  kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
17
19
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
@@ -41,8 +43,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
41
43
  kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
42
44
  kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
43
45
  kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
44
- kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
45
- kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
46
- kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
47
- kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
48
- kreuzberg-3.3.0.dist-info/RECORD,,
46
+ kreuzberg-3.4.1.dist-info/METADATA,sha256=g3DwLXNiDzvPDBApPnDp3BeZ4SbVN0NTrEzN9cyKy34,8751
47
+ kreuzberg-3.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
+ kreuzberg-3.4.1.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
49
+ kreuzberg-3.4.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
50
+ kreuzberg-3.4.1.dist-info/RECORD,,
@@ -1,235 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: kreuzberg
3
- Version: 3.3.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
5
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
- Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
- License: MIT
8
- License-File: LICENSE
9
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Operating System :: OS Independent
14
- Classifier: Programming Language :: Python :: 3 :: Only
15
- Classifier: Programming Language :: Python :: 3.13
16
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Classifier: Topic :: Text Processing :: General
19
- Classifier: Topic :: Utilities
20
- Classifier: Typing :: Typed
21
- Requires-Python: >=3.13
22
- Requires-Dist: anyio>=4.9.0
23
- Requires-Dist: charset-normalizer>=3.4.2
24
- Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
25
- Requires-Dist: html-to-markdown>=1.4.0
26
- Requires-Dist: msgspec>=0.18.0
27
- Requires-Dist: playa-pdf>=0.6.1
28
- Requires-Dist: psutil>=7.0.0
29
- Requires-Dist: pypdfium2==4.30.0
30
- Requires-Dist: python-calamine>=0.3.2
31
- Requires-Dist: python-pptx>=1.0.2
32
- Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
33
- Provides-Extra: all
34
- Requires-Dist: click>=8.2.1; extra == 'all'
35
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
- Requires-Dist: gmft>=0.4.2; extra == 'all'
37
- Requires-Dist: paddleocr>=3.1.0; extra == 'all'
38
- Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
39
- Requires-Dist: rich>=14.0.0; extra == 'all'
40
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
41
- Requires-Dist: setuptools>=80.9.0; extra == 'all'
42
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
43
- Provides-Extra: chunking
44
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
45
- Provides-Extra: cli
46
- Requires-Dist: click>=8.2.1; extra == 'cli'
47
- Requires-Dist: rich>=14.0.0; extra == 'cli'
48
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
49
- Provides-Extra: easyocr
50
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
51
- Provides-Extra: gmft
52
- Requires-Dist: gmft>=0.4.2; extra == 'gmft'
53
- Provides-Extra: paddleocr
54
- Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
55
- Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
56
- Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
57
- Description-Content-Type: text/markdown
58
-
59
- # Kreuzberg
60
-
61
- [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
62
- [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
63
- [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
64
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
65
-
66
- Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
67
-
68
- ## Why Kreuzberg?
69
-
70
- - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
71
- - **Local Processing**: No external API calls or cloud dependencies required
72
- - **Resource Efficient**: Lightweight processing without GPU requirements
73
- - **Format Support**: Comprehensive support for documents, images, and text formats
74
- - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
75
- - **Command Line Interface**: Powerful CLI for batch processing and automation
76
- - **Metadata Extraction**: Get document metadata alongside text content
77
- - **Table Extraction**: Extract tables from documents using the excellent GMFT library
78
- - **Modern Python**: Built with async/await, type hints, and a functional-first approach
79
- - **Permissive OSS**: MIT licensed with permissively licensed dependencies
80
-
81
- ## Quick Start
82
-
83
- ```bash
84
- pip install kreuzberg
85
-
86
- # Or install with CLI support
87
- pip install "kreuzberg[cli]"
88
- ```
89
-
90
- Install the system dependencies (Tesseract OCR and Pandoc):
91
-
92
- ```bash
93
- # Ubuntu/Debian
94
- sudo apt-get install tesseract-ocr pandoc
95
-
96
- # macOS
97
- brew install tesseract pandoc
98
-
99
- # Windows
100
- choco install -y tesseract pandoc
101
- ```
102
-
103
- Tesseract is the default OCR engine. If you prefer not to use it, you can either switch to one of the two alternative OCR engines or disable OCR entirely.
104
-
105
- ### Alternative OCR engines
106
-
107
- ```bash
108
- # Install with EasyOCR support
109
- pip install "kreuzberg[easyocr]"
110
-
111
- # Install with PaddleOCR support
112
- pip install "kreuzberg[paddleocr]"
113
- ```
114
-
115
- ## Quick Example
116
-
117
- ```python
118
- import asyncio
119
- from kreuzberg import extract_file
120
-
121
- async def main():
122
- # Extract text from a PDF
123
- result = await extract_file("document.pdf")
124
- print(result.content)
125
-
126
- # Extract text from an image
127
- result = await extract_file("scan.jpg")
128
- print(result.content)
129
-
130
- # Extract text from a Word document
131
- result = await extract_file("report.docx")
132
- print(result.content)
133
-
134
- asyncio.run(main())
135
- ```
136
-
137
- ## Command Line Interface
138
-
139
- Kreuzberg includes a powerful CLI for processing documents from the command line:
140
-
141
- ```bash
142
- # Extract text from a file
143
- kreuzberg extract document.pdf
144
-
145
- # Extract with JSON output and metadata
146
- kreuzberg extract document.pdf --output-format json --show-metadata
147
-
148
- # Extract from stdin
149
- cat document.html | kreuzberg extract
150
-
151
- # Use specific OCR backend
152
- kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
153
-
154
- # Extract with configuration file
155
- kreuzberg extract document.pdf --config config.toml
156
- ```
157
-
158
- ### CLI Configuration
159
-
160
- Configure via `pyproject.toml`:
161
-
162
- ```toml
163
- [tool.kreuzberg]
164
- force_ocr = true
165
- chunk_content = false
166
- extract_tables = true
167
- max_chars = 4000
168
- ocr_backend = "tesseract"
169
-
170
- [tool.kreuzberg.tesseract]
171
- language = "eng+deu"
172
- psm = 3
173
- ```
174
-
175
- For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
176
-
177
- ## Documentation
178
-
179
- For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
180
-
181
- - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
182
- - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
183
- - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
184
- - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
185
- - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
186
- - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
187
- - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
188
-
189
- ## Supported Formats
190
-
191
- Kreuzberg supports a wide range of document formats:
192
-
193
- - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
194
- - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
195
- - **Spreadsheets**: XLSX, XLS, CSV, etc.
196
- - **Presentations**: PPTX, PPT, etc.
197
- - **Web Content**: HTML, XML, etc.
198
-
199
- ## OCR Engines
200
-
201
- Kreuzberg supports multiple OCR engines:
202
-
203
- - **Tesseract** (Default): Lightweight, fast startup, requires system installation
204
- - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
205
- - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
206
-
207
- For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
208
-
209
- ## Performance
210
-
211
- Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
212
-
213
- | Operation | Sync Time | Async Time | Async Advantage |
214
- | ---------------------- | --------- | ---------- | ------------------ |
215
- | Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
216
- | HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
217
- | Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
218
- | OCR processing         | 0.7s      | 0.4s       | **✅ 1.7x faster** |
219
- | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
220
-
221
- **Rule of thumb:**
222
-
223
- - Use **sync** for simple documents and CLI applications
224
- - Use **async** for complex PDFs, OCR, and batch processing
225
- - Use **batch operations** for multiple files
226
-
227
- For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
228
-
229
- ## Contributing
230
-
231
- We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
232
-
233
- ## License
234
-
235
- This library is released under the MIT license.