kreuzberg-1.6.0-py3-none-any.whl → kreuzberg-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +154 -167
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +101 -64
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.0.dist-info/METADATA +419 -0
- kreuzberg-2.0.0.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -247
- kreuzberg-1.6.0.dist-info/METADATA +0 -317
- kreuzberg-1.6.0.dist-info/RECORD +0 -15
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0
kreuzberg-2.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,419 @@
Metadata-Version: 2.2
Name: kreuzberg
Version: 2.0.0
Summary: A text extraction library supporting PDFs, images, office documents and more
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
License: MIT
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: General
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: anyio>=4.8.0
Requires-Dist: charset-normalizer>=3.4.1
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
Requires-Dist: html-to-markdown>=1.2.0
Requires-Dist: pypdfium2>=4.30.1
Requires-Dist: python-calamine>=0.3.1
Requires-Dist: python-pptx>=1.0.2
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"

# Kreuzberg

Kreuzberg is a Python library for text extraction from documents. It provides a unified async interface for extracting text from PDFs, images, office documents, and more.

## Why Kreuzberg?

- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
- **Local Processing**: No external API calls or cloud dependencies required
- **Resource Efficient**: Lightweight processing without GPU requirements
- **Lightweight**: Has few curated dependencies and a minimal footprint
- **Format Support**: Comprehensive support for documents, images, and text formats
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
- **Permissive OSS**: Kreuzberg and its dependencies have permissive OSS licenses

Kreuzberg was built for RAG (Retrieval Augmented Generation) applications, focusing on local processing with minimal dependencies. It's designed for modern async applications, serverless functions, and dockerized applications.

## Installation

### 1. Install the Python Package

```shell
pip install kreuzberg
```

### 2. Install System Dependencies

Kreuzberg requires two system-level dependencies:

- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR

Please install these using their respective installation guides.
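
For example, on Debian/Ubuntu and macOS (Homebrew) the packages are typically available as shown below; exact package names vary by platform, so the official installation guides above remain the reference:

```shell
# Debian/Ubuntu
sudo apt-get install pandoc tesseract-ocr
# Tesseract language packs are separate packages, e.g. German:
sudo apt-get install tesseract-ocr-deu

# macOS (Homebrew)
brew install pandoc tesseract
```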

## Architecture

Kreuzberg integrates:

- **PDF Processing**:
  - `pypdfium2` for searchable PDFs
  - Tesseract OCR for scanned content
- **Document Conversion**:
  - Pandoc for many document and markup formats
  - `python-pptx` for PowerPoint files
  - `html-to-markdown` for HTML content
  - `calamine` for Excel spreadsheets (with multi-sheet support)
- **Text Processing**:
  - Smart encoding detection
  - Markdown and plain text handling

### Supported Formats

#### Document Formats

- PDF (`.pdf`, both searchable and scanned)
- Microsoft Word (`.docx`)
- PowerPoint presentations (`.pptx`)
- OpenDocument Text (`.odt`)
- Rich Text Format (`.rtf`)
- EPUB (`.epub`)
- DocBook XML (`.dbk`, `.xml`)
- FictionBook (`.fb2`)
- LaTeX (`.tex`, `.latex`)
- Typst (`.typ`)

#### Markup and Text Formats

- HTML (`.html`, `.htm`)
- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
- reStructuredText (`.rst`)
- Org-mode (`.org`)
- DokuWiki (`.txt`)
- Pod (`.pod`)
- Troff/Man (`.1`, `.2`, etc.)

#### Data and Research Formats

- Spreadsheets (`.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlam`, `.xla`, `.ods`)
- CSV (`.csv`) and TSV (`.tsv`) files
- OPML files (`.opml`)
- Jupyter Notebooks (`.ipynb`)
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
- CSL-JSON (`.json`)
- EndNote and JATS XML (`.xml`)
- RIS (`.ris`)

#### Image Formats

- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
- PNG (`.png`)
- TIFF (`.tiff`, `.tif`)
- BMP (`.bmp`)
- GIF (`.gif`)
- JPEG 2000 family (`.jp2`, `.jpm`, `.jpx`, `.mj2`)
- WebP (`.webp`)
- Portable anymap formats (`.pbm`, `.pgm`, `.ppm`, `.pnm`)

## Usage

Kreuzberg provides both async and sync APIs for text extraction, including batch processing. The library exports the following main functions:

- Single Item Processing:

  - `extract_file()`: Async function to extract text from a file (accepts a string path or `pathlib.Path`)
  - `extract_bytes()`: Async function to extract text from bytes (accepts a byte string)
  - `extract_file_sync()`: Synchronous version of `extract_file()` (see the sketch after this list)
  - `extract_bytes_sync()`: Synchronous version of `extract_bytes()`

- Batch Processing:
  - `batch_extract_file()`: Async function to extract text from multiple files concurrently
  - `batch_extract_bytes()`: Async function to extract text from multiple byte contents concurrently
  - `batch_extract_file_sync()`: Synchronous version of `batch_extract_file()`
  - `batch_extract_bytes_sync()`: Synchronous version of `batch_extract_bytes()`
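
A minimal sketch of the synchronous variants, assuming (as the list above states) that they accept the same arguments as their async counterparts:

```python
from kreuzberg import extract_file_sync, extract_bytes_sync

# Blocking single-file extraction; same options as extract_file()
result = extract_file_sync("document.pdf")
print(result.content)

# Blocking extraction from raw bytes with an explicit MIME type
with open("scan.png", "rb") as f:
    image_result = extract_bytes_sync(f.read(), mime_type="image/png")
print(image_result.content)
```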

### Configuration Parameters

All extraction functions accept the following optional parameters for configuring OCR and performance:

#### OCR Configuration

- `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:

  - "eng" for English
  - "deu" for German
  - "fra" for French

  Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.

- `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this.

#### Performance Configuration

- `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can improve performance but may cause resource exhaustion and deadlocks (especially for Tesseract).

### Quick Start

```python
from pathlib import Path
from kreuzberg import extract_file
from kreuzberg.extraction import ExtractionResult
from kreuzberg._tesseract import PSMMode


# Basic file extraction
async def extract_document():
    # Extract from a PDF file with default settings
    pdf_result: ExtractionResult = await extract_file("document.pdf")
    print(f"Content: {pdf_result.content}")

    # Extract from an image with German language model
    img_result = await extract_file(
        "scan.png",
        language="deu",  # German language model
        psm=PSMMode.SINGLE_BLOCK,  # Treat as single block of text
        max_processes=4,  # Limit concurrent processes
    )
    print(f"Image text: {img_result.content}")

    # Extract from Word document with metadata
    docx_result = await extract_file(Path("document.docx"))
    if docx_result.metadata:
        print(f"Title: {docx_result.metadata.get('title')}")
        print(f"Author: {docx_result.metadata.get('author')}")
```
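
The functions above are coroutines, so they must be driven by an event loop. One way to run the quick-start example from a plain script is with the standard library's `asyncio` (any asyncio-compatible runner should work):

```python
import asyncio

# Run the async quick-start function defined above from synchronous code
asyncio.run(extract_document())
```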

### Extracting Bytes

```python
from kreuzberg import extract_bytes
from kreuzberg.extraction import ExtractionResult


async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
    """Process uploaded file content with known MIME type."""
    return await extract_bytes(
        file_content,
        mime_type=mime_type,
    )


# Example usage with different file types
async def handle_uploads(docx_bytes: bytes, pdf_bytes: bytes, image_bytes: bytes):
    # Process PDF upload
    pdf_result = await process_upload(pdf_bytes, mime_type="application/pdf")
    print(f"PDF content: {pdf_result.content}")
    print(f"PDF metadata: {pdf_result.metadata}")

    # Process image upload (will use OCR)
    img_result = await process_upload(image_bytes, mime_type="image/jpeg")
    print(f"Image text: {img_result.content}")

    # Process Word document upload
    docx_result = await process_upload(
        docx_bytes,
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    print(f"Word content: {docx_result.content}")
```

### Batch Processing

Kreuzberg supports efficient batch processing of multiple files or byte contents:

```python
from pathlib import Path
from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync


# Process multiple files concurrently
async def process_documents(file_paths: list[Path]) -> None:
    # Extract from multiple files
    results = await batch_extract_file(file_paths)
    for path, result in zip(file_paths, results):
        print(f"File {path}: {result.content[:100]}...")


# Process multiple uploaded files concurrently
async def process_uploads(contents: list[tuple[bytes, str]]) -> None:
    # Each item is a tuple of (content, mime_type)
    results = await batch_extract_bytes(contents)
    for (_, mime_type), result in zip(contents, results):
        print(f"Upload {mime_type}: {result.content[:100]}...")


# Synchronous batch processing is also available
def process_documents_sync(file_paths: list[Path]) -> None:
    results = batch_extract_file_sync(file_paths)
    for path, result in zip(file_paths, results):
        print(f"File {path}: {result.content[:100]}...")
```

Features:

- Ordered results
- Concurrent processing
- Error handling per item
- Async and sync interfaces
- Same options as single extraction

### PDF Processing

Kreuzberg employs a smart approach to PDF text extraction:

1. **Searchable Text Detection**: First attempts to extract text directly from searchable PDFs using `pypdfium2`.

2. **Text Validation**: Extracted text is validated for corruption by checking for:

   - Control and non-printable characters
   - Unicode replacement characters (�)
   - Zero-width spaces and other invisible characters
   - Empty or whitespace-only content

3. **Automatic OCR Fallback**: If the extracted text appears corrupted or the PDF is image-based, Kreuzberg automatically falls back to OCR using Tesseract.

This approach works well for searchable PDFs and standard text documents. For complex OCR (e.g., handwriting, photographs), use a specialized tool.

### PDF Processing Options

You can control PDF processing behavior using optional parameters:

```python
from kreuzberg import extract_file


async def process_pdf():
    # Default behavior: auto-detect and use OCR if needed
    # By default, max_processes=1 for safe operation
    result = await extract_file("document.pdf")
    print(result.content)

    # Force OCR even for searchable PDFs
    result = await extract_file("document.pdf", force_ocr=True)
    print(result.content)

    # Control OCR concurrency for large documents
    # Warning: high concurrency values can cause system resource exhaustion
    # Start with a low value and increase based on your system's capabilities
    result = await extract_file(
        "large_document.pdf",
        max_processes=4,  # Process up to 4 pages concurrently
    )
    print(result.content)

    # Process a scanned PDF (automatically uses OCR)
    result = await extract_file("scanned.pdf")
    print(result.content)
```

### ExtractionResult Object

All extraction functions return an `ExtractionResult` or a list thereof (for batch functions). The `ExtractionResult` object is a `NamedTuple`:

- `content`: The extracted text (str)
- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
- `metadata`: A metadata dictionary. Currently this dictionary is only populated when extracting documents using Pandoc.

```python
from kreuzberg import extract_file, ExtractionResult, Metadata


async def process_document(path: str) -> tuple[str, str, Metadata]:
    # Access as a named tuple
    result: ExtractionResult = await extract_file(path)
    print(f"Content: {result.content}")
    print(f"Format: {result.mime_type}")

    # Or unpack as a tuple
    content, mime_type, metadata = await extract_file(path)
    return content, mime_type, metadata
```

### Error Handling

Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.

```python
from kreuzberg import extract_file
from kreuzberg.exceptions import (
    ValidationError,
    ParsingError,
    OCRError,
    MissingDependencyError,
)


async def safe_extract(path: str) -> str:
    try:
        result = await extract_file(path)
        return result.content

    except ValidationError as e:
        # Input validation issues
        # - Unsupported or undetectable MIME types
        # - Missing files
        # - Invalid input parameters
        print(f"Validation failed: {e}")

    except OCRError as e:
        # OCR-specific issues
        # - Tesseract processing failures
        # - Image conversion problems
        print(f"OCR failed: {e}")

    except MissingDependencyError as e:
        # System dependency issues
        # - Missing Tesseract OCR
        # - Missing Pandoc
        # - Incompatible versions
        print(f"Dependency missing: {e}")

    except ParsingError as e:
        # General processing errors
        # - PDF parsing failures
        # - Format conversion issues
        # - Encoding problems
        print(f"Processing failed: {e}")

    return ""
```

All exceptions include:

- Error message
- Context in the `context` attribute (see the sketch below)
- String representation
- Exception chaining
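
Since every exception type derives from `KreuzbergError` and exposes the `context` attribute described above, a catch-all handler can log that context directly. A small sketch, assuming `context` is a plain dictionary:

```python
from kreuzberg import extract_file_sync
from kreuzberg.exceptions import KreuzbergError

try:
    result = extract_file_sync("report.pdf")
except KreuzbergError as e:
    # The context attribute carries extra debugging details, e.g. file paths
    print(f"Extraction failed: {e}")
    for key, value in (e.context or {}).items():
        print(f"  {key}: {value}")
```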

## Contribution

This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.

### Local Development

1. Clone the repo
2. Install the system dependencies
3. Install the full dependencies with `uv sync`
4. Install the pre-commit hooks with:

   ```shell
   pre-commit install && pre-commit install --hook-type commit-msg
   ```

5. Make your changes and submit a PR

## License

This library uses the MIT license.

kreuzberg-2.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
kreuzberg/__init__.py,sha256=3opnj4Q8Ci151QuVqPaM3sCb8mpFIRhZbZUgBmp1LI0,410
kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
kreuzberg/_pdf.py,sha256=V1TVwPpGyrE0YJqnmW_5kh4Y1qWwZI5SSF-lwT_Bbac,6288
kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
kreuzberg/_tesseract.py,sha256=xt_4MU7PfN1nZWlWBVQF6zmJnMs9pJq8yWTzPUxTqm0,9240
kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
kreuzberg/extraction.py,sha256=1RIs7YaUK0wcOpY1eDcIqh3n-UlJY7ZeulZPdaAxdvo,13345
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
kreuzberg-2.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
kreuzberg-2.0.0.dist-info/METADATA,sha256=cvD9ypz004yHqePKuw8eZZcuZ2lanyN1y2jlB5FMG0Q,14201
kreuzberg-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
kreuzberg-2.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
kreuzberg-2.0.0.dist-info/RECORD,,

kreuzberg/_extractors.py
DELETED
@@ -1,247 +0,0 @@
from __future__ import annotations

import re
from contextlib import suppress
from html import escape
from io import BytesIO
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING

import html_to_markdown
import pptx
import pypdfium2
from anyio import Path as AsyncPath
from pptx.enum.shapes import MSO_SHAPE_TYPE
from xlsx2csv import Xlsx2csv

from kreuzberg._pandoc import process_content, process_file
from kreuzberg._string import normalize_spaces, safe_decode
from kreuzberg._sync import run_sync
from kreuzberg._tesseract import batch_process_images
from kreuzberg.exceptions import ParsingError

if TYPE_CHECKING:  # pragma: no cover
    from PIL.Image import Image


async def convert_pdf_to_images(file_path: Path) -> list[Image]:
    """Convert a PDF file to images.

    Args:
        file_path: The path to the PDF file.

    Raises:
        ParsingError: If the PDF file could not be converted to images.

    Returns:
        A list of Pillow Images.
    """
    try:
        pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
        return [page.render(scale=2.0).to_pil() for page in pdf]
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
        ) from e


async def extract_pdf_with_tesseract(file_path: Path) -> str:
    """Extract text from a scanned PDF file using pytesseract.

    Args:
        file_path: The path to the PDF file.

    Returns:
        The extracted text.
    """
    images = await convert_pdf_to_images(file_path)
    ocr_results = await batch_process_images(images)
    return normalize_spaces("\n".join(ocr_results))


async def extract_pdf_with_pdfium2(file_path: Path) -> str:
    """Extract text from a searchable PDF file using pypdfium2.

    Args:
        file_path: The path to the PDF file.

    Raises:
        ParsingError: If the text could not be extracted from the PDF file.

    Returns:
        The extracted text.
    """
    try:
        document = await run_sync(pypdfium2.PdfDocument, file_path)
        text = "\n".join(page.get_textpage().get_text_range() for page in document)
        return normalize_spaces(text)
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
        ) from e


async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
    """Extract text from a PDF file.

    Args:
        file_path: The path to the PDF file.
        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.

    Returns:
        The extracted text.
    """
    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
        return normalize_spaces(content)

    return await extract_pdf_with_tesseract(file_path)


async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
    """Extract text using pandoc.

    Args:
        file_data: The content of the file.
        mime_type: The mime type of the file.

    Returns:
        The extracted text.
    """
    result = await process_content(file_data, mime_type=mime_type)
    return normalize_spaces(result.content)


async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
    """Extract text using pandoc.

    Args:
        file_path: The path to the file.
        mime_type: The mime type of the file.

    Returns:
        The extracted text.
    """
    result = await process_file(file_path, mime_type=mime_type)
    return normalize_spaces(result.content)


async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
    """Extract text from a PPTX file.

    Notes:
        This function is based on code vendored from `markitdown`, which has an MIT license as well.

    Args:
        file_path_or_contents: The path to the PPTX file or its contents as bytes.

    Returns:
        The extracted text content
    """
    md_content = ""
    file_contents = (
        file_path_or_contents
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_bytes()
    )
    presentation = pptx.Presentation(BytesIO(file_contents))

    for index, slide in enumerate(presentation.slides):
        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"

        title = slide.shapes.title

        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
            ):
                alt_text = ""
                with suppress(AttributeError):
                    # access non-visual properties
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                html_table = "<table>"
                first_row = True

                for row in shape.table.rows:
                    html_table += "<tr>"

                    for cell in row.cells:
                        tag = "th" if first_row else "td"
                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"

                    html_table += "</tr>"
                    first_row = False

                html_table += "</table>"
                md_content += "\n" + html_table + "\n"

            elif shape.has_text_frame:
                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"

        md_content = md_content.strip()
        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame

            if notes_frame is not None:
                md_content += notes_frame.text

            md_content = md_content.strip()

    return normalize_spaces(md_content)


async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
    """Extract text from an XLSX file by converting it to CSV and then to markdown.

    Args:
        file_path_or_contents: The path to the XLSX file or its contents as bytes.

    Returns:
        The extracted text content.

    Raises:
        ParsingError: If the XLSX file could not be parsed.
    """
    try:
        with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
            if isinstance(file_path_or_contents, bytes):
                xlsx_file.write(file_path_or_contents)
                xlsx_file.flush()
                xlsx_path = xlsx_file.name
            else:
                xlsx_path = str(file_path_or_contents)

            await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
            result = await process_file(csv_file.name, mime_type="text/csv")
            return normalize_spaces(result.content)
    except Exception as e:
        raise ParsingError(
            "Could not extract text from XLSX file",
            context={
                "error": str(e),
                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
            },
        ) from e


async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
    """Extract text from an HTML string.

    Args:
        file_path_or_contents: The HTML content.

    Returns:
        The extracted text content.
    """
    content = (
        safe_decode(file_path_or_contents)
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_text()
    )
    return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))