kreuzberg 1.3.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-1.5.0/PKG-INFO +318 -0
- kreuzberg-1.5.0/README.md +285 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/_extractors.py +46 -81
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/_mime_types.py +22 -31
- kreuzberg-1.5.0/kreuzberg/_pandoc.py +416 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/_string.py +9 -12
- kreuzberg-1.5.0/kreuzberg/_tesseract.py +318 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/exceptions.py +9 -1
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/extraction.py +16 -16
- kreuzberg-1.5.0/kreuzberg.egg-info/PKG-INFO +318 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg.egg-info/SOURCES.txt +2 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg.egg-info/requires.txt +2 -2
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/pyproject.toml +13 -17
- kreuzberg-1.3.0/PKG-INFO +0 -306
- kreuzberg-1.3.0/README.md +0 -272
- kreuzberg-1.3.0/kreuzberg.egg-info/PKG-INFO +0 -306
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/LICENSE +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/_sync.py +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-1.3.0 → kreuzberg-1.5.0}/setup.cfg +0 -0
kreuzberg-1.5.0/PKG-INFO
ADDED
@@ -0,0 +1,318 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 1.5.0
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Operating System :: OS Independent
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Text Processing :: General
|
22
|
+
Classifier: Topic :: Utilities
|
23
|
+
Classifier: Typing :: Typed
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Description-Content-Type: text/markdown
|
26
|
+
License-File: LICENSE
|
27
|
+
Requires-Dist: anyio>=4.8.0
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.1
|
29
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
30
|
+
Requires-Dist: pypdfium2>=4.30.1
|
31
|
+
Requires-Dist: python-pptx>=1.0.2
|
32
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
33
|
+
|
34
|
+
# Kreuzberg
|
35
|
+
|
36
|
+
Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
|
37
|
+
|
38
|
+
## Why Kreuzberg?
|
39
|
+
|
40
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
41
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
42
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
43
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
44
|
+
- **Modern Python**: Built with async/await, type hints, and current best practices
|
45
|
+
|
46
|
+
Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
|
47
|
+
|
48
|
+
## Features
|
49
|
+
|
50
|
+
- **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
|
51
|
+
- **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
|
52
|
+
- **Modern Python Design**:
|
53
|
+
- Async-first API using `anyio`
|
54
|
+
- Comprehensive type hints for better IDE support
|
55
|
+
- Detailed error handling with context information
|
56
|
+
- **Production Ready**:
|
57
|
+
- Robust error handling
|
58
|
+
- Detailed debugging information
|
59
|
+
- Memory efficient processing
|
60
|
+
|
61
|
+
## Installation
|
62
|
+
|
63
|
+
### 1. Install the Python Package
|
64
|
+
|
65
|
+
```shell
|
66
|
+
pip install kreuzberg
|
67
|
+
```
|
68
|
+
|
69
|
+
### 2. Install System Dependencies
|
70
|
+
|
71
|
+
Kreuzberg requires two open-source tools:
|
72
|
+
|
73
|
+
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
74
|
+
|
75
|
+
- GPL v2.0 licensed (used via CLI only)
|
76
|
+
- Handles office documents and markup formats
|
77
|
+
|
78
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
79
|
+
- Apache License
|
80
|
+
- Required for scanned documents and images
|
81
|
+
|
82
|
+
## Architecture
|
83
|
+
|
84
|
+
Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
|
85
|
+
|
86
|
+
- **PDF Processing**:
|
87
|
+
- `pypdfium2` for searchable PDFs
|
88
|
+
- Tesseract OCR for scanned content
|
89
|
+
- **Document Conversion**:
|
90
|
+
- Pandoc for office documents and markup
|
91
|
+
- `python-pptx` for PowerPoint files
|
92
|
+
- `html-to-markdown` for HTML content
|
93
|
+
- **Text Processing**:
|
94
|
+
- Smart encoding detection
|
95
|
+
- Markdown and plain text handling
|
96
|
+
|
97
|
+
### Supported Formats
|
98
|
+
|
99
|
+
#### Document Formats
|
100
|
+
|
101
|
+
- PDF (`.pdf`, both searchable and scanned documents)
|
102
|
+
- Microsoft Word (`.docx`, `.doc`)
|
103
|
+
- PowerPoint presentations (`.pptx`)
|
104
|
+
- OpenDocument Text (`.odt`)
|
105
|
+
- Rich Text Format (`.rtf`)
|
106
|
+
- EPUB (`.epub`)
|
107
|
+
- DocBook XML (`.dbk`, `.xml`)
|
108
|
+
- FictionBook (`.fb2`)
|
109
|
+
- LaTeX (`.tex`, `.latex`)
|
110
|
+
- Typst (`.typ`)
|
111
|
+
|
112
|
+
#### Markup and Text Formats
|
113
|
+
|
114
|
+
- HTML (`.html`, `.htm`)
|
115
|
+
- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
|
116
|
+
- reStructuredText (`.rst`)
|
117
|
+
- Org-mode (`.org`)
|
118
|
+
- DokuWiki (`.txt`)
|
119
|
+
- Pod (`.pod`)
|
120
|
+
- Man pages (`.1`, `.2`, etc.)
|
121
|
+
|
122
|
+
#### Data and Research Formats
|
123
|
+
|
124
|
+
- CSV (`.csv`) and TSV (`.tsv`) files
|
125
|
+
- Jupyter Notebooks (`.ipynb`)
|
126
|
+
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
127
|
+
- CSL-JSON (`.json`)
|
128
|
+
- EndNote XML (`.xml`)
|
129
|
+
- RIS (`.ris`)
|
130
|
+
- JATS XML (`.xml`)
|
131
|
+
|
132
|
+
#### Image Formats
|
133
|
+
|
134
|
+
- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
|
135
|
+
- PNG (`.png`)
|
136
|
+
- TIFF (`.tiff`, `.tif`)
|
137
|
+
- BMP (`.bmp`)
|
138
|
+
- GIF (`.gif`)
|
139
|
+
- WebP (`.webp`)
|
140
|
+
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
141
|
+
- Portable Anymap (`.pnm`)
|
142
|
+
- Portable Bitmap (`.pbm`)
|
143
|
+
- Portable Graymap (`.pgm`)
|
144
|
+
- Portable Pixmap (`.ppm`)
|
145
|
+
|
146
|
+
## Usage
|
147
|
+
|
148
|
+
Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
|
149
|
+
|
150
|
+
- `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
|
151
|
+
- `extract_bytes()`: Extract text from bytes (accepts a byte string)
|
152
|
+
|
153
|
+
### Quick Start
|
154
|
+
|
155
|
+
```python
|
156
|
+
from pathlib import Path
|
157
|
+
from kreuzberg import extract_file, extract_bytes
|
158
|
+
|
159
|
+
# Basic file extraction
|
160
|
+
async def extract_document():
|
161
|
+
# Extract from a PDF file
|
162
|
+
pdf_result = await extract_file("document.pdf")
|
163
|
+
print(f"PDF text: {pdf_result.content}")
|
164
|
+
|
165
|
+
# Extract from an image
|
166
|
+
img_result = await extract_file("scan.png")
|
167
|
+
print(f"Image text: {img_result.content}")
|
168
|
+
|
169
|
+
# Extract from Word document
|
170
|
+
docx_result = await extract_file(Path("document.docx"))
|
171
|
+
print(f"Word text: {docx_result.content}")
|
172
|
+
```
|
173
|
+
|
174
|
+
### Processing Uploaded Files
|
175
|
+
|
176
|
+
```python
|
177
|
+
from kreuzberg import extract_bytes
|
178
|
+
|
179
|
+
async def process_upload(file_content: bytes, mime_type: str):
|
180
|
+
"""Process uploaded file content with known MIME type."""
|
181
|
+
result = await extract_bytes(file_content, mime_type=mime_type)
|
182
|
+
return result.content
|
183
|
+
|
184
|
+
# Example usage with different file types
|
185
|
+
async def handle_uploads():
|
186
|
+
# Process PDF upload
|
187
|
+
pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
|
188
|
+
|
189
|
+
# Process image upload
|
190
|
+
img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
|
191
|
+
|
192
|
+
# Process Word document upload
|
193
|
+
docx_result = await extract_bytes(docx_bytes,
|
194
|
+
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
195
|
+
```
|
196
|
+
|
197
|
+
### Advanced Features
|
198
|
+
|
199
|
+
#### PDF Processing Options
|
200
|
+
|
201
|
+
```python
|
202
|
+
from kreuzberg import extract_file
|
203
|
+
|
204
|
+
async def process_pdf():
|
205
|
+
# Force OCR for PDFs with embedded images or scanned content
|
206
|
+
result = await extract_file("document.pdf", force_ocr=True)
|
207
|
+
|
208
|
+
# Process a scanned PDF (automatically uses OCR)
|
209
|
+
scanned = await extract_file("scanned.pdf")
|
210
|
+
```
|
211
|
+
|
212
|
+
#### ExtractionResult Object
|
213
|
+
|
214
|
+
All extraction functions return an `ExtractionResult` containing:
|
215
|
+
|
216
|
+
- `content`: The extracted text (str)
|
217
|
+
- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
|
218
|
+
|
219
|
+
```python
|
220
|
+
from kreuzberg import ExtractionResult, extract_file
|
221
|
+
|
222
|
+
async def process_document(path: str) -> tuple[str, str]:
|
223
|
+
# Access as a named tuple
|
224
|
+
result: ExtractionResult = await extract_file(path)
|
225
|
+
print(f"Content: {result.content}")
|
226
|
+
print(f"Format: {result.mime_type}")
|
227
|
+
|
228
|
+
# Or unpack as a tuple
|
229
|
+
content, mime_type = await extract_file(path)
|
230
|
+
return content, mime_type
|
231
|
+
```
|
232
|
+
|
233
|
+
### Error Handling
|
234
|
+
|
235
|
+
Kreuzberg provides detailed error handling with two main exception types:
|
236
|
+
|
237
|
+
```python
|
238
|
+
from kreuzberg import extract_file
|
239
|
+
from kreuzberg.exceptions import ValidationError, ParsingError
|
240
|
+
|
241
|
+
async def safe_extract(path: str) -> str:
|
242
|
+
try:
|
243
|
+
result = await extract_file(path)
|
244
|
+
return result.content
|
245
|
+
|
246
|
+
except ValidationError as e:
|
247
|
+
# Handles input validation issues:
|
248
|
+
# - Unsupported file types
|
249
|
+
# - Missing files
|
250
|
+
# - Invalid MIME types
|
251
|
+
print(f"Invalid input: {e.message}")
|
252
|
+
print(f"Details: {e.context}")
|
253
|
+
|
254
|
+
except ParsingError as e:
|
255
|
+
# Handles processing errors:
|
256
|
+
# - PDF parsing failures
|
257
|
+
# - OCR errors
|
258
|
+
# - Format conversion issues
|
259
|
+
print(f"Processing failed: {e.message}")
|
260
|
+
print(f"Details: {e.context}")
|
261
|
+
|
262
|
+
return ""
|
263
|
+
|
264
|
+
# Example error contexts
|
265
|
+
try:
|
266
|
+
result = await extract_file("document.xyz")
|
267
|
+
except ValidationError as e:
|
268
|
+
# e.context might contain:
|
269
|
+
# {
|
270
|
+
# "file_path": "document.xyz",
|
271
|
+
# "error": "Unsupported file type",
|
272
|
+
# "supported_types": ["pdf", "docx", ...]
|
273
|
+
# }
|
274
|
+
|
275
|
+
try:
|
276
|
+
result = await extract_file("scan.pdf")
|
277
|
+
except ParsingError as e:
|
278
|
+
# e.context might contain:
|
279
|
+
# {
|
280
|
+
# "file_path": "scan.pdf",
|
281
|
+
# "error": "OCR processing failed",
|
282
|
+
# "details": "Tesseract error: Unable to process image"
|
283
|
+
# }
|
284
|
+
```
|
285
|
+
|
286
|
+
## Roadmap
|
287
|
+
|
288
|
+
V1:
|
289
|
+
|
290
|
+
- [x] - HTML file text extraction
|
291
|
+
- [ ] - better PDF table extraction
|
292
|
+
- [ ] - batch APIs
|
293
|
+
- [ ] - sync APIs
|
294
|
+
|
295
|
+
V2:
|
296
|
+
|
297
|
+
- [ ] - metadata extraction (breaking change)
|
298
|
+
- [ ] - TBD
|
299
|
+
|
300
|
+
## Contribution
|
301
|
+
|
302
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
303
|
+
submitting PRs to avoid disappointment.
|
304
|
+
|
305
|
+
### Local Development
|
306
|
+
|
307
|
+
1. Clone the repo
|
308
|
+
2. Install the system dependencies
|
309
|
+
3. Install the full dependencies with `uv sync`
|
310
|
+
4. Install the pre-commit hooks with:
|
311
|
+
```shell
|
312
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
313
|
+
```
|
314
|
+
5. Make your changes and submit a PR
|
315
|
+
|
316
|
+
## License
|
317
|
+
|
318
|
+
This library uses the MIT license.
|
@@ -0,0 +1,285 @@
|
|
1
|
+
# Kreuzberg
|
2
|
+
|
3
|
+
Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
|
4
|
+
|
5
|
+
## Why Kreuzberg?
|
6
|
+
|
7
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
8
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
9
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
10
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
11
|
+
- **Modern Python**: Built with async/await, type hints, and current best practices
|
12
|
+
|
13
|
+
Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
|
14
|
+
|
15
|
+
## Features
|
16
|
+
|
17
|
+
- **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
|
18
|
+
- **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
|
19
|
+
- **Modern Python Design**:
|
20
|
+
- Async-first API using `anyio`
|
21
|
+
- Comprehensive type hints for better IDE support
|
22
|
+
- Detailed error handling with context information
|
23
|
+
- **Production Ready**:
|
24
|
+
- Robust error handling
|
25
|
+
- Detailed debugging information
|
26
|
+
- Memory efficient processing
|
27
|
+
|
28
|
+
## Installation
|
29
|
+
|
30
|
+
### 1. Install the Python Package
|
31
|
+
|
32
|
+
```shell
|
33
|
+
pip install kreuzberg
|
34
|
+
```
|
35
|
+
|
36
|
+
### 2. Install System Dependencies
|
37
|
+
|
38
|
+
Kreuzberg requires two open-source tools:
|
39
|
+
|
40
|
+
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
41
|
+
|
42
|
+
- GPL v2.0 licensed (used via CLI only)
|
43
|
+
- Handles office documents and markup formats
|
44
|
+
|
45
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
46
|
+
- Apache License
|
47
|
+
- Required for scanned documents and images
|
48
|
+
|
49
|
+
## Architecture
|
50
|
+
|
51
|
+
Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
|
52
|
+
|
53
|
+
- **PDF Processing**:
|
54
|
+
- `pypdfium2` for searchable PDFs
|
55
|
+
- Tesseract OCR for scanned content
|
56
|
+
- **Document Conversion**:
|
57
|
+
- Pandoc for office documents and markup
|
58
|
+
- `python-pptx` for PowerPoint files
|
59
|
+
- `html-to-markdown` for HTML content
|
60
|
+
- **Text Processing**:
|
61
|
+
- Smart encoding detection
|
62
|
+
- Markdown and plain text handling
|
63
|
+
|
64
|
+
### Supported Formats
|
65
|
+
|
66
|
+
#### Document Formats
|
67
|
+
|
68
|
+
- PDF (`.pdf`, both searchable and scanned documents)
|
69
|
+
- Microsoft Word (`.docx`, `.doc`)
|
70
|
+
- PowerPoint presentations (`.pptx`)
|
71
|
+
- OpenDocument Text (`.odt`)
|
72
|
+
- Rich Text Format (`.rtf`)
|
73
|
+
- EPUB (`.epub`)
|
74
|
+
- DocBook XML (`.dbk`, `.xml`)
|
75
|
+
- FictionBook (`.fb2`)
|
76
|
+
- LaTeX (`.tex`, `.latex`)
|
77
|
+
- Typst (`.typ`)
|
78
|
+
|
79
|
+
#### Markup and Text Formats
|
80
|
+
|
81
|
+
- HTML (`.html`, `.htm`)
|
82
|
+
- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
|
83
|
+
- reStructuredText (`.rst`)
|
84
|
+
- Org-mode (`.org`)
|
85
|
+
- DokuWiki (`.txt`)
|
86
|
+
- Pod (`.pod`)
|
87
|
+
- Man pages (`.1`, `.2`, etc.)
|
88
|
+
|
89
|
+
#### Data and Research Formats
|
90
|
+
|
91
|
+
- CSV (`.csv`) and TSV (`.tsv`) files
|
92
|
+
- Jupyter Notebooks (`.ipynb`)
|
93
|
+
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
94
|
+
- CSL-JSON (`.json`)
|
95
|
+
- EndNote XML (`.xml`)
|
96
|
+
- RIS (`.ris`)
|
97
|
+
- JATS XML (`.xml`)
|
98
|
+
|
99
|
+
#### Image Formats
|
100
|
+
|
101
|
+
- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
|
102
|
+
- PNG (`.png`)
|
103
|
+
- TIFF (`.tiff`, `.tif`)
|
104
|
+
- BMP (`.bmp`)
|
105
|
+
- GIF (`.gif`)
|
106
|
+
- WebP (`.webp`)
|
107
|
+
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
108
|
+
- Portable Anymap (`.pnm`)
|
109
|
+
- Portable Bitmap (`.pbm`)
|
110
|
+
- Portable Graymap (`.pgm`)
|
111
|
+
- Portable Pixmap (`.ppm`)
|
112
|
+
|
113
|
+
## Usage
|
114
|
+
|
115
|
+
Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
|
116
|
+
|
117
|
+
- `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
|
118
|
+
- `extract_bytes()`: Extract text from bytes (accepts a byte string)
|
119
|
+
|
120
|
+
### Quick Start
|
121
|
+
|
122
|
+
```python
|
123
|
+
from pathlib import Path
|
124
|
+
from kreuzberg import extract_file, extract_bytes
|
125
|
+
|
126
|
+
# Basic file extraction
|
127
|
+
async def extract_document():
|
128
|
+
# Extract from a PDF file
|
129
|
+
pdf_result = await extract_file("document.pdf")
|
130
|
+
print(f"PDF text: {pdf_result.content}")
|
131
|
+
|
132
|
+
# Extract from an image
|
133
|
+
img_result = await extract_file("scan.png")
|
134
|
+
print(f"Image text: {img_result.content}")
|
135
|
+
|
136
|
+
# Extract from Word document
|
137
|
+
docx_result = await extract_file(Path("document.docx"))
|
138
|
+
print(f"Word text: {docx_result.content}")
|
139
|
+
```
|
140
|
+
|
141
|
+
### Processing Uploaded Files
|
142
|
+
|
143
|
+
```python
|
144
|
+
from kreuzberg import extract_bytes
|
145
|
+
|
146
|
+
async def process_upload(file_content: bytes, mime_type: str):
|
147
|
+
"""Process uploaded file content with known MIME type."""
|
148
|
+
result = await extract_bytes(file_content, mime_type=mime_type)
|
149
|
+
return result.content
|
150
|
+
|
151
|
+
# Example usage with different file types
|
152
|
+
async def handle_uploads():
|
153
|
+
# Process PDF upload
|
154
|
+
pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
|
155
|
+
|
156
|
+
# Process image upload
|
157
|
+
img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
|
158
|
+
|
159
|
+
# Process Word document upload
|
160
|
+
docx_result = await extract_bytes(docx_bytes,
|
161
|
+
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
162
|
+
```
|
163
|
+
|
164
|
+
### Advanced Features
|
165
|
+
|
166
|
+
#### PDF Processing Options
|
167
|
+
|
168
|
+
```python
|
169
|
+
from kreuzberg import extract_file
|
170
|
+
|
171
|
+
async def process_pdf():
|
172
|
+
# Force OCR for PDFs with embedded images or scanned content
|
173
|
+
result = await extract_file("document.pdf", force_ocr=True)
|
174
|
+
|
175
|
+
# Process a scanned PDF (automatically uses OCR)
|
176
|
+
scanned = await extract_file("scanned.pdf")
|
177
|
+
```
|
178
|
+
|
179
|
+
#### ExtractionResult Object
|
180
|
+
|
181
|
+
All extraction functions return an `ExtractionResult` containing:
|
182
|
+
|
183
|
+
- `content`: The extracted text (str)
|
184
|
+
- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
|
185
|
+
|
186
|
+
```python
|
187
|
+
from kreuzberg import ExtractionResult, extract_file
|
188
|
+
|
189
|
+
async def process_document(path: str) -> tuple[str, str]:
|
190
|
+
# Access as a named tuple
|
191
|
+
result: ExtractionResult = await extract_file(path)
|
192
|
+
print(f"Content: {result.content}")
|
193
|
+
print(f"Format: {result.mime_type}")
|
194
|
+
|
195
|
+
# Or unpack as a tuple
|
196
|
+
content, mime_type = await extract_file(path)
|
197
|
+
return content, mime_type
|
198
|
+
```
|
199
|
+
|
200
|
+
### Error Handling
|
201
|
+
|
202
|
+
Kreuzberg provides detailed error handling with two main exception types:
|
203
|
+
|
204
|
+
```python
|
205
|
+
from kreuzberg import extract_file
|
206
|
+
from kreuzberg.exceptions import ValidationError, ParsingError
|
207
|
+
|
208
|
+
async def safe_extract(path: str) -> str:
|
209
|
+
try:
|
210
|
+
result = await extract_file(path)
|
211
|
+
return result.content
|
212
|
+
|
213
|
+
except ValidationError as e:
|
214
|
+
# Handles input validation issues:
|
215
|
+
# - Unsupported file types
|
216
|
+
# - Missing files
|
217
|
+
# - Invalid MIME types
|
218
|
+
print(f"Invalid input: {e.message}")
|
219
|
+
print(f"Details: {e.context}")
|
220
|
+
|
221
|
+
except ParsingError as e:
|
222
|
+
# Handles processing errors:
|
223
|
+
# - PDF parsing failures
|
224
|
+
# - OCR errors
|
225
|
+
# - Format conversion issues
|
226
|
+
print(f"Processing failed: {e.message}")
|
227
|
+
print(f"Details: {e.context}")
|
228
|
+
|
229
|
+
return ""
|
230
|
+
|
231
|
+
# Example error contexts
|
232
|
+
try:
|
233
|
+
result = await extract_file("document.xyz")
|
234
|
+
except ValidationError as e:
|
235
|
+
# e.context might contain:
|
236
|
+
# {
|
237
|
+
# "file_path": "document.xyz",
|
238
|
+
# "error": "Unsupported file type",
|
239
|
+
# "supported_types": ["pdf", "docx", ...]
|
240
|
+
# }
|
241
|
+
|
242
|
+
try:
|
243
|
+
result = await extract_file("scan.pdf")
|
244
|
+
except ParsingError as e:
|
245
|
+
# e.context might contain:
|
246
|
+
# {
|
247
|
+
# "file_path": "scan.pdf",
|
248
|
+
# "error": "OCR processing failed",
|
249
|
+
# "details": "Tesseract error: Unable to process image"
|
250
|
+
# }
|
251
|
+
```
|
252
|
+
|
253
|
+
## Roadmap
|
254
|
+
|
255
|
+
V1:
|
256
|
+
|
257
|
+
- [x] - HTML file text extraction
|
258
|
+
- [ ] - better PDF table extraction
|
259
|
+
- [ ] - batch APIs
|
260
|
+
- [ ] - sync APIs
|
261
|
+
|
262
|
+
V2:
|
263
|
+
|
264
|
+
- [ ] - metadata extraction (breaking change)
|
265
|
+
- [ ] - TBD
|
266
|
+
|
267
|
+
## Contribution
|
268
|
+
|
269
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
270
|
+
submitting PRs to avoid disappointment.
|
271
|
+
|
272
|
+
### Local Development
|
273
|
+
|
274
|
+
1. Clone the repo
|
275
|
+
2. Install the system dependencies
|
276
|
+
3. Install the full dependencies with `uv sync`
|
277
|
+
4. Install the pre-commit hooks with:
|
278
|
+
```shell
|
279
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
280
|
+
```
|
281
|
+
5. Make your changes and submit a PR
|
282
|
+
|
283
|
+
## License
|
284
|
+
|
285
|
+
This library uses the MIT license.
|