kreuzberg 1.5.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/PKG-INFO +56 -32
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/README.md +53 -30
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_extractors.py +78 -10
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_mime_types.py +2 -2
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_pandoc.py +103 -69
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_tesseract.py +18 -8
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/extraction.py +20 -12
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/PKG-INFO +56 -32
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/requires.txt +1 -2
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/pyproject.toml +18 -17
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/LICENSE +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_string.py +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_sync.py +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-1.5.0 → kreuzberg-1.7.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.5.0
|
3
|
+
Version: 1.7.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -29,7 +29,8 @@ Requires-Dist: charset-normalizer>=3.4.1
|
|
29
29
|
Requires-Dist: html-to-markdown>=1.2.0
|
30
30
|
Requires-Dist: pypdfium2>=4.30.1
|
31
31
|
Requires-Dist: python-pptx>=1.0.2
|
32
|
-
Requires-Dist: typing-extensions>=4.12.2
|
32
|
+
Requires-Dist: typing-extensions>=4.12.2
|
33
|
+
Requires-Dist: xlsx2csv>=0.8.4
|
33
34
|
|
34
35
|
# Kreuzberg
|
35
36
|
|
@@ -68,16 +69,12 @@ pip install kreuzberg
|
|
68
69
|
|
69
70
|
### 2. Install System Dependencies
|
70
71
|
|
71
|
-
Kreuzberg requires two
|
72
|
+
Kreuzberg requires two system level dependencies:
|
72
73
|
|
73
74
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
74
|
-
|
75
|
-
- GPL v2.0 licensed (used via CLI only)
|
76
|
-
- Handles office documents and markup formats
|
77
|
-
|
78
75
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
79
|
-
|
80
|
-
|
76
|
+
|
77
|
+
Please install these using their respective installation guides.
|
81
78
|
|
82
79
|
## Architecture
|
83
80
|
|
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
87
84
|
- `pdfium2` for searchable PDFs
|
88
85
|
- Tesseract OCR for scanned content
|
89
86
|
- **Document Conversion**:
|
90
|
-
- Pandoc for
|
87
|
+
- Pandoc for many document and markup formats
|
91
88
|
- `python-pptx` for PowerPoint files
|
92
89
|
- `html-to-markdown` for HTML content
|
90
|
+
- `xlsx2csv` for Excel spreadsheets
|
93
91
|
- **Text Processing**:
|
94
92
|
- Smart encoding detection
|
95
93
|
- Markdown and plain text handling
|
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
121
119
|
|
122
120
|
#### Data and Research Formats
|
123
121
|
|
122
|
+
- Excel spreadsheets (`.xlsx`)
|
124
123
|
- CSV (`.csv`) and TSV (`.tsv`) files
|
125
124
|
- Jupyter Notebooks (`.ipynb`)
|
126
125
|
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
@@ -232,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
|
|
232
231
|
|
233
232
|
### Error Handling
|
234
233
|
|
235
|
-
Kreuzberg provides
|
234
|
+
Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
|
236
235
|
|
237
236
|
```python
|
238
237
|
from kreuzberg import extract_file
|
239
|
-
from kreuzberg.exceptions import
|
238
|
+
from kreuzberg.exceptions import (
|
239
|
+
ValidationError,
|
240
|
+
ParsingError,
|
241
|
+
OCRError,
|
242
|
+
MissingDependencyError
|
243
|
+
)
|
240
244
|
|
241
245
|
async def safe_extract(path: str) -> str:
|
242
246
|
try:
|
@@ -244,20 +248,31 @@ async def safe_extract(path: str) -> str:
|
|
244
248
|
return result.content
|
245
249
|
|
246
250
|
except ValidationError as e:
|
247
|
-
#
|
248
|
-
# - Unsupported
|
251
|
+
# Input validation issues
|
252
|
+
# - Unsupported or undetectable MIME types
|
249
253
|
# - Missing files
|
250
|
-
# - Invalid
|
251
|
-
print(f"
|
252
|
-
|
254
|
+
# - Invalid input parameters
|
255
|
+
print(f"Validation failed: {e}")
|
256
|
+
|
257
|
+
except OCRError as e:
|
258
|
+
# OCR-specific issues
|
259
|
+
# - Tesseract processing failures
|
260
|
+
# - Image conversion problems
|
261
|
+
print(f"OCR failed: {e}")
|
262
|
+
|
263
|
+
except MissingDependencyError as e:
|
264
|
+
# System dependency issues
|
265
|
+
# - Missing Tesseract OCR
|
266
|
+
# - Missing Pandoc
|
267
|
+
# - Incompatible versions
|
268
|
+
print(f"Dependency missing: {e}")
|
253
269
|
|
254
270
|
except ParsingError as e:
|
255
|
-
#
|
271
|
+
# General processing errors
|
256
272
|
# - PDF parsing failures
|
257
|
-
# - OCR errors
|
258
273
|
# - Format conversion issues
|
259
|
-
|
260
|
-
print(f"
|
274
|
+
# - Encoding problems
|
275
|
+
print(f"Processing failed: {e}")
|
261
276
|
|
262
277
|
return ""
|
263
278
|
|
@@ -265,24 +280,33 @@ async def safe_extract(path: str) -> str:
|
|
265
280
|
try:
|
266
281
|
result = await extract_file("document.xyz")
|
267
282
|
except ValidationError as e:
|
268
|
-
#
|
269
|
-
#
|
283
|
+
# Error will include context:
|
284
|
+
# ValidationError: Unsupported mime type
|
285
|
+
# Context: {
|
270
286
|
# "file_path": "document.xyz",
|
271
|
-
# "
|
272
|
-
# "supported_types": ["pdf", "docx", ...]
|
287
|
+
# "supported_mimetypes": ["application/pdf", ...]
|
273
288
|
# }
|
289
|
+
print(e)
|
274
290
|
|
275
291
|
try:
|
276
|
-
result = await extract_file("scan.
|
277
|
-
except
|
278
|
-
#
|
279
|
-
#
|
280
|
-
#
|
281
|
-
# "
|
282
|
-
# "
|
292
|
+
result = await extract_file("scan.jpg")
|
293
|
+
except OCRError as e:
|
294
|
+
# Error will include context:
|
295
|
+
# OCRError: OCR failed with a non-0 return code
|
296
|
+
# Context: {
|
297
|
+
# "file_path": "scan.jpg",
|
298
|
+
# "tesseract_version": "5.3.0"
|
283
299
|
# }
|
300
|
+
print(e)
|
284
301
|
```
|
285
302
|
|
303
|
+
All exceptions provide:
|
304
|
+
|
305
|
+
- A descriptive error message
|
306
|
+
- Relevant context in the `context` attribute
|
307
|
+
- String representation with both message and context
|
308
|
+
- Proper exception chaining for debugging
|
309
|
+
|
286
310
|
## Roadmap
|
287
311
|
|
288
312
|
V1:
|
@@ -35,16 +35,12 @@ pip install kreuzberg
|
|
35
35
|
|
36
36
|
### 2. Install System Dependencies
|
37
37
|
|
38
|
-
Kreuzberg requires two
|
38
|
+
Kreuzberg requires two system level dependencies:
|
39
39
|
|
40
40
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
41
|
-
|
42
|
-
- GPL v2.0 licensed (used via CLI only)
|
43
|
-
- Handles office documents and markup formats
|
44
|
-
|
45
41
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
46
|
-
|
47
|
-
|
42
|
+
|
43
|
+
Please install these using their respective installation guides.
|
48
44
|
|
49
45
|
## Architecture
|
50
46
|
|
@@ -54,9 +50,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
54
50
|
- `pdfium2` for searchable PDFs
|
55
51
|
- Tesseract OCR for scanned content
|
56
52
|
- **Document Conversion**:
|
57
|
-
- Pandoc for
|
53
|
+
- Pandoc for many document and markup formats
|
58
54
|
- `python-pptx` for PowerPoint files
|
59
55
|
- `html-to-markdown` for HTML content
|
56
|
+
- `xlsx2csv` for Excel spreadsheets
|
60
57
|
- **Text Processing**:
|
61
58
|
- Smart encoding detection
|
62
59
|
- Markdown and plain text handling
|
@@ -88,6 +85,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
88
85
|
|
89
86
|
#### Data and Research Formats
|
90
87
|
|
88
|
+
- Excel spreadsheets (`.xlsx`)
|
91
89
|
- CSV (`.csv`) and TSV (`.tsv`) files
|
92
90
|
- Jupyter Notebooks (`.ipynb`)
|
93
91
|
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
@@ -199,11 +197,16 @@ async def process_document(path: str) -> tuple[str, str]:
|
|
199
197
|
|
200
198
|
### Error Handling
|
201
199
|
|
202
|
-
Kreuzberg provides
|
200
|
+
Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
|
203
201
|
|
204
202
|
```python
|
205
203
|
from kreuzberg import extract_file
|
206
|
-
from kreuzberg.exceptions import
|
204
|
+
from kreuzberg.exceptions import (
|
205
|
+
ValidationError,
|
206
|
+
ParsingError,
|
207
|
+
OCRError,
|
208
|
+
MissingDependencyError
|
209
|
+
)
|
207
210
|
|
208
211
|
async def safe_extract(path: str) -> str:
|
209
212
|
try:
|
@@ -211,20 +214,31 @@ async def safe_extract(path: str) -> str:
|
|
211
214
|
return result.content
|
212
215
|
|
213
216
|
except ValidationError as e:
|
214
|
-
#
|
215
|
-
# - Unsupported
|
217
|
+
# Input validation issues
|
218
|
+
# - Unsupported or undetectable MIME types
|
216
219
|
# - Missing files
|
217
|
-
# - Invalid
|
218
|
-
print(f"
|
219
|
-
|
220
|
+
# - Invalid input parameters
|
221
|
+
print(f"Validation failed: {e}")
|
222
|
+
|
223
|
+
except OCRError as e:
|
224
|
+
# OCR-specific issues
|
225
|
+
# - Tesseract processing failures
|
226
|
+
# - Image conversion problems
|
227
|
+
print(f"OCR failed: {e}")
|
228
|
+
|
229
|
+
except MissingDependencyError as e:
|
230
|
+
# System dependency issues
|
231
|
+
# - Missing Tesseract OCR
|
232
|
+
# - Missing Pandoc
|
233
|
+
# - Incompatible versions
|
234
|
+
print(f"Dependency missing: {e}")
|
220
235
|
|
221
236
|
except ParsingError as e:
|
222
|
-
#
|
237
|
+
# General processing errors
|
223
238
|
# - PDF parsing failures
|
224
|
-
# - OCR errors
|
225
239
|
# - Format conversion issues
|
226
|
-
|
227
|
-
print(f"
|
240
|
+
# - Encoding problems
|
241
|
+
print(f"Processing failed: {e}")
|
228
242
|
|
229
243
|
return ""
|
230
244
|
|
@@ -232,24 +246,33 @@ async def safe_extract(path: str) -> str:
|
|
232
246
|
try:
|
233
247
|
result = await extract_file("document.xyz")
|
234
248
|
except ValidationError as e:
|
235
|
-
#
|
236
|
-
#
|
249
|
+
# Error will include context:
|
250
|
+
# ValidationError: Unsupported mime type
|
251
|
+
# Context: {
|
237
252
|
# "file_path": "document.xyz",
|
238
|
-
# "
|
239
|
-
# "supported_types": ["pdf", "docx", ...]
|
253
|
+
# "supported_mimetypes": ["application/pdf", ...]
|
240
254
|
# }
|
255
|
+
print(e)
|
241
256
|
|
242
257
|
try:
|
243
|
-
result = await extract_file("scan.
|
244
|
-
except
|
245
|
-
#
|
246
|
-
#
|
247
|
-
#
|
248
|
-
# "
|
249
|
-
# "
|
258
|
+
result = await extract_file("scan.jpg")
|
259
|
+
except OCRError as e:
|
260
|
+
# Error will include context:
|
261
|
+
# OCRError: OCR failed with a non-0 return code
|
262
|
+
# Context: {
|
263
|
+
# "file_path": "scan.jpg",
|
264
|
+
# "tesseract_version": "5.3.0"
|
250
265
|
# }
|
266
|
+
print(e)
|
251
267
|
```
|
252
268
|
|
269
|
+
All exceptions provide:
|
270
|
+
|
271
|
+
- A descriptive error message
|
272
|
+
- Relevant context in the `context` attribute
|
273
|
+
- String representation with both message and context
|
274
|
+
- Proper exception chaining for debugging
|
275
|
+
|
253
276
|
## Roadmap
|
254
277
|
|
255
278
|
V1:
|
@@ -1,9 +1,12 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import re
|
4
|
+
from asyncio import gather
|
4
5
|
from contextlib import suppress
|
5
6
|
from html import escape
|
6
7
|
from io import BytesIO
|
8
|
+
from pathlib import Path
|
9
|
+
from tempfile import NamedTemporaryFile
|
7
10
|
from typing import TYPE_CHECKING
|
8
11
|
|
9
12
|
import html_to_markdown
|
@@ -11,6 +14,7 @@ import pptx
|
|
11
14
|
import pypdfium2
|
12
15
|
from anyio import Path as AsyncPath
|
13
16
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
17
|
+
from xlsx2csv import Xlsx2csv
|
14
18
|
|
15
19
|
from kreuzberg._pandoc import process_content, process_file
|
16
20
|
from kreuzberg._string import normalize_spaces, safe_decode
|
@@ -19,8 +23,6 @@ from kreuzberg._tesseract import batch_process_images
|
|
19
23
|
from kreuzberg.exceptions import ParsingError
|
20
24
|
|
21
25
|
if TYPE_CHECKING: # pragma: no cover
|
22
|
-
from pathlib import Path
|
23
|
-
|
24
26
|
from PIL.Image import Image
|
25
27
|
|
26
28
|
|
@@ -36,13 +38,18 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
|
|
36
38
|
Returns:
|
37
39
|
A list of Pillow Images.
|
38
40
|
"""
|
41
|
+
pdf = None
|
42
|
+
resolved_path = str(await AsyncPath(file_path).resolve())
|
39
43
|
try:
|
40
|
-
pdf = await run_sync(pypdfium2.PdfDocument,
|
44
|
+
pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
|
41
45
|
return [page.render(scale=2.0).to_pil() for page in pdf]
|
42
46
|
except pypdfium2.PdfiumError as e:
|
43
47
|
raise ParsingError(
|
44
48
|
"Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
|
45
49
|
) from e
|
50
|
+
finally:
|
51
|
+
if pdf is not None:
|
52
|
+
pdf.close()
|
46
53
|
|
47
54
|
|
48
55
|
async def extract_pdf_with_tesseract(file_path: Path) -> str:
|
@@ -71,30 +78,49 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
|
|
71
78
|
Returns:
|
72
79
|
The extracted text.
|
73
80
|
"""
|
81
|
+
document = None
|
82
|
+
resolved_path = str(await AsyncPath(file_path).resolve())
|
74
83
|
try:
|
75
|
-
document = await run_sync(pypdfium2.PdfDocument,
|
76
|
-
text = "\n".join(page.get_textpage().
|
84
|
+
document = await run_sync(pypdfium2.PdfDocument, resolved_path)
|
85
|
+
text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
|
77
86
|
return normalize_spaces(text)
|
78
87
|
except pypdfium2.PdfiumError as e:
|
79
88
|
raise ParsingError(
|
80
89
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
81
90
|
) from e
|
91
|
+
finally:
|
92
|
+
if document is not None:
|
93
|
+
document.close()
|
82
94
|
|
83
95
|
|
84
|
-
async def
|
96
|
+
async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
|
85
97
|
"""Extract text from a PDF file.
|
86
98
|
|
87
99
|
Args:
|
88
|
-
|
100
|
+
file_path_or_contents: The path to the PDF file or its contents as bytes.
|
89
101
|
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
90
102
|
|
91
103
|
Returns:
|
92
104
|
The extracted text.
|
93
105
|
"""
|
94
|
-
if
|
106
|
+
if isinstance(file_path_or_contents, bytes):
|
107
|
+
with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
|
108
|
+
try:
|
109
|
+
file_path = Path(pdf_file.name)
|
110
|
+
await AsyncPath(file_path).write_bytes(file_path_or_contents)
|
111
|
+
|
112
|
+
if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
|
113
|
+
return normalize_spaces(content)
|
114
|
+
|
115
|
+
return await extract_pdf_with_tesseract(file_path)
|
116
|
+
finally:
|
117
|
+
pdf_file.close()
|
118
|
+
await AsyncPath(pdf_file.name).unlink()
|
119
|
+
|
120
|
+
if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
|
95
121
|
return normalize_spaces(content)
|
96
122
|
|
97
|
-
return await extract_pdf_with_tesseract(
|
123
|
+
return await extract_pdf_with_tesseract(file_path_or_contents)
|
98
124
|
|
99
125
|
|
100
126
|
async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
|
@@ -121,7 +147,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
|
|
121
147
|
Returns:
|
122
148
|
The extracted text.
|
123
149
|
"""
|
124
|
-
|
150
|
+
resolved_path = str(await AsyncPath(file_path).resolve())
|
151
|
+
result = await process_file(resolved_path, mime_type=mime_type)
|
125
152
|
return normalize_spaces(result.content)
|
126
153
|
|
127
154
|
|
@@ -195,6 +222,47 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
195
222
|
return normalize_spaces(md_content)
|
196
223
|
|
197
224
|
|
225
|
+
async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
|
226
|
+
"""Extract text from an XLSX file by converting it to CSV and then to markdown.
|
227
|
+
|
228
|
+
Args:
|
229
|
+
file_path_or_contents: The path to the XLSX file or its contents as bytes.
|
230
|
+
|
231
|
+
Returns:
|
232
|
+
The extracted text content.
|
233
|
+
|
234
|
+
Raises:
|
235
|
+
ParsingError: If the XLSX file could not be parsed.
|
236
|
+
"""
|
237
|
+
with (
|
238
|
+
NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
|
239
|
+
NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
|
240
|
+
):
|
241
|
+
try:
|
242
|
+
if isinstance(file_path_or_contents, bytes):
|
243
|
+
xlsx_file.write(file_path_or_contents)
|
244
|
+
xlsx_file.flush()
|
245
|
+
xlsx_path = xlsx_file.name
|
246
|
+
else:
|
247
|
+
xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
|
248
|
+
|
249
|
+
await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
|
250
|
+
result = await process_file(csv_file.name, mime_type="text/csv")
|
251
|
+
return normalize_spaces(result.content)
|
252
|
+
except Exception as e:
|
253
|
+
raise ParsingError(
|
254
|
+
"Could not extract text from XLSX file",
|
255
|
+
context={
|
256
|
+
"error": str(e),
|
257
|
+
"file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
|
258
|
+
},
|
259
|
+
) from e
|
260
|
+
finally:
|
261
|
+
xlsx_file.close()
|
262
|
+
csv_file.close()
|
263
|
+
await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
|
264
|
+
|
265
|
+
|
198
266
|
async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
199
267
|
"""Extract text from an HTML string.
|
200
268
|
|
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
|
10
10
|
PDF_MIME_TYPE: Final = "application/pdf"
|
11
11
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
12
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
13
|
-
|
13
|
+
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
14
14
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
15
15
|
|
16
16
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
89
89
|
PLAIN_TEXT_MIME_TYPES
|
90
90
|
| IMAGE_MIME_TYPES
|
91
91
|
| PANDOC_SUPPORTED_MIME_TYPES
|
92
|
-
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
92
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
|
93
93
|
)
|
@@ -1,9 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import json
|
4
3
|
import subprocess
|
5
4
|
from asyncio import gather
|
6
5
|
from dataclasses import dataclass
|
6
|
+
from json import JSONDecodeError, loads
|
7
7
|
from tempfile import NamedTemporaryFile
|
8
8
|
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
|
9
9
|
|
@@ -13,7 +13,7 @@ from kreuzberg._string import normalize_spaces
|
|
13
13
|
from kreuzberg._sync import run_sync
|
14
14
|
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
15
|
|
16
|
-
if TYPE_CHECKING:
|
16
|
+
if TYPE_CHECKING: # pragma: no cover
|
17
17
|
from collections.abc import Mapping
|
18
18
|
from os import PathLike
|
19
19
|
|
@@ -80,7 +80,7 @@ NodeType = Literal[
|
|
80
80
|
"MetaBlocks",
|
81
81
|
]
|
82
82
|
|
83
|
-
|
83
|
+
MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
|
84
84
|
"application/csl+json": "csljson",
|
85
85
|
"application/docbook+xml": "docbook",
|
86
86
|
"application/epub+zip": "epub",
|
@@ -112,6 +112,38 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
|
|
112
112
|
"text/x-rst": "rst",
|
113
113
|
}
|
114
114
|
|
115
|
+
MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
|
116
|
+
"application/csl+json": "json",
|
117
|
+
"application/docbook+xml": "xml",
|
118
|
+
"application/epub+zip": "epub",
|
119
|
+
"application/rtf": "rtf",
|
120
|
+
"application/vnd.oasis.opendocument.text": "odt",
|
121
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
122
|
+
"application/x-biblatex": "bib",
|
123
|
+
"application/x-bibtex": "bib",
|
124
|
+
"application/x-endnote+xml": "xml",
|
125
|
+
"application/x-fictionbook+xml": "fb2",
|
126
|
+
"application/x-ipynb+json": "ipynb",
|
127
|
+
"application/x-jats+xml": "xml",
|
128
|
+
"application/x-latex": "tex",
|
129
|
+
"application/x-opml+xml": "opml",
|
130
|
+
"application/x-research-info-systems": "ris",
|
131
|
+
"application/x-typst": "typst",
|
132
|
+
"text/csv": "csv",
|
133
|
+
"text/tab-separated-values": "tsv",
|
134
|
+
"text/troff": "1",
|
135
|
+
"text/x-commonmark": "md",
|
136
|
+
"text/x-dokuwiki": "wiki",
|
137
|
+
"text/x-gfm": "md",
|
138
|
+
"text/x-markdown": "md",
|
139
|
+
"text/x-markdown-extra": "md",
|
140
|
+
"text/x-mdoc": "md",
|
141
|
+
"text/x-multimarkdown": "md",
|
142
|
+
"text/x-org": "org",
|
143
|
+
"text/x-pod": "pod",
|
144
|
+
"text/x-rst": "rst",
|
145
|
+
}
|
146
|
+
|
115
147
|
|
116
148
|
class Metadata(TypedDict, total=False):
|
117
149
|
"""Document metadata extracted from Pandoc document.
|
@@ -232,7 +264,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
|
|
232
264
|
|
233
265
|
|
234
266
|
def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
235
|
-
"""Extract all non-empty metadata values from Pandoc AST metadata."""
|
236
267
|
meta: Metadata = {}
|
237
268
|
|
238
269
|
for key, value in raw_meta.items():
|
@@ -252,34 +283,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
|
252
283
|
return meta
|
253
284
|
|
254
285
|
|
255
|
-
def
|
256
|
-
if mime_type not in
|
257
|
-
mime_type.startswith(value) for value in
|
286
|
+
def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
|
287
|
+
if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
|
288
|
+
mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
|
258
289
|
):
|
259
290
|
raise ValidationError(
|
260
291
|
f"Unsupported mime type: {mime_type}",
|
261
292
|
context={
|
262
293
|
"mime_type": mime_type,
|
263
|
-
"supported_mimetypes": ",".join(sorted(
|
294
|
+
"supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
|
264
295
|
},
|
265
296
|
)
|
266
297
|
|
267
|
-
return
|
268
|
-
|
298
|
+
return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
|
299
|
+
MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
|
269
300
|
)
|
270
301
|
|
271
302
|
|
272
|
-
async def validate_pandoc_version() -> None:
|
273
|
-
"""Validate that Pandoc is installed and is version 3 or above.
|
274
|
-
|
275
|
-
Raises:
|
276
|
-
MissingDependencyError: If Pandoc is not installed or is below version 3.
|
277
|
-
"""
|
303
|
+
async def _validate_pandoc_version() -> None:
|
278
304
|
try:
|
279
305
|
if version_ref["checked"]:
|
280
306
|
return
|
281
307
|
|
282
|
-
|
308
|
+
command = ["pandoc", "--version"]
|
309
|
+
result = await run_sync(subprocess.run, command, capture_output=True)
|
283
310
|
version = result.stdout.decode().split("\n")[0].split()[1]
|
284
311
|
if not version.startswith("3."):
|
285
312
|
raise MissingDependencyError("Pandoc version 3 or above is required.")
|
@@ -290,27 +317,15 @@ async def validate_pandoc_version() -> None:
|
|
290
317
|
raise MissingDependencyError("Pandoc is not installed.") from e
|
291
318
|
|
292
319
|
|
293
|
-
async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
|
294
|
-
|
295
|
-
|
296
|
-
Args:
|
297
|
-
input_file: The path to the file to process.
|
298
|
-
mime_type: The mime type of the file.
|
299
|
-
|
300
|
-
Raises:
|
301
|
-
ParsingError: If Pandoc fails to extract metadata.
|
302
|
-
|
303
|
-
Returns:
|
304
|
-
Dictionary containing document metadata.
|
305
|
-
"""
|
306
|
-
extension = _get_extension_from_mime_type(mime_type)
|
320
|
+
async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
|
321
|
+
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
|
307
322
|
|
308
|
-
with NamedTemporaryFile(suffix=".json") as metadata_file:
|
323
|
+
with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
|
309
324
|
try:
|
310
325
|
command = [
|
311
326
|
"pandoc",
|
312
327
|
str(input_file),
|
313
|
-
f"--from={
|
328
|
+
f"--from={pandoc_type}",
|
314
329
|
"--to=json",
|
315
330
|
"--standalone",
|
316
331
|
"--quiet",
|
@@ -329,46 +344,60 @@ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -
|
|
329
344
|
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
330
345
|
)
|
331
346
|
|
332
|
-
json_data =
|
347
|
+
json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
|
333
348
|
return _extract_metadata(json_data)
|
334
349
|
|
335
|
-
except (RuntimeError, OSError,
|
350
|
+
except (RuntimeError, OSError, JSONDecodeError) as e:
|
336
351
|
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
337
352
|
|
353
|
+
finally:
|
354
|
+
metadata_file.close()
|
355
|
+
await AsyncPath(metadata_file.name).unlink()
|
338
356
|
|
339
|
-
async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
|
340
|
-
extension = _get_extension_from_mime_type(mime_type)
|
341
|
-
|
342
|
-
with NamedTemporaryFile(suffix=".md") as output_file:
|
343
|
-
command = [
|
344
|
-
"pandoc",
|
345
|
-
str(input_file),
|
346
|
-
f"--from={extension}",
|
347
|
-
"--to=markdown",
|
348
|
-
"--standalone",
|
349
|
-
"--wrap=preserve",
|
350
|
-
"--quiet",
|
351
|
-
"--output",
|
352
|
-
output_file.name,
|
353
|
-
]
|
354
357
|
|
355
|
-
|
356
|
-
|
358
|
+
async def _handle_extract_file(
|
359
|
+
input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
|
360
|
+
) -> str:
|
361
|
+
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
|
357
362
|
|
358
|
-
|
359
|
-
|
360
|
-
command
|
361
|
-
|
362
|
-
|
363
|
+
with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
|
364
|
+
try:
|
365
|
+
command = [
|
366
|
+
"pandoc",
|
367
|
+
str(input_file),
|
368
|
+
f"--from={pandoc_type}",
|
369
|
+
"--to=markdown",
|
370
|
+
"--standalone",
|
371
|
+
"--wrap=preserve",
|
372
|
+
"--quiet",
|
373
|
+
"--output",
|
374
|
+
output_file.name,
|
375
|
+
]
|
376
|
+
|
377
|
+
if extra_args:
|
378
|
+
command.extend(extra_args)
|
363
379
|
|
364
|
-
|
365
|
-
|
366
|
-
|
380
|
+
result = await run_sync(
|
381
|
+
subprocess.run,
|
382
|
+
command,
|
383
|
+
capture_output=True,
|
367
384
|
)
|
368
385
|
|
369
|
-
|
386
|
+
if result.returncode != 0:
|
387
|
+
raise ParsingError(
|
388
|
+
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
389
|
+
)
|
370
390
|
|
371
|
-
|
391
|
+
text = await AsyncPath(output_file.name).read_text("utf-8")
|
392
|
+
|
393
|
+
return normalize_spaces(text)
|
394
|
+
|
395
|
+
except (RuntimeError, OSError) as e:
|
396
|
+
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
397
|
+
|
398
|
+
finally:
|
399
|
+
output_file.close()
|
400
|
+
await AsyncPath(output_file.name).unlink()
|
372
401
|
|
373
402
|
|
374
403
|
async def process_file(
|
@@ -384,12 +413,12 @@ async def process_file(
|
|
384
413
|
Returns:
|
385
414
|
PandocResult containing processed content and metadata.
|
386
415
|
"""
|
387
|
-
await validate_pandoc_version()
|
416
|
+
await _validate_pandoc_version()
|
388
417
|
|
389
418
|
metadata, content = await gather(
|
390
419
|
*[
|
391
|
-
|
392
|
-
|
420
|
+
_handle_extract_metadata(input_file, mime_type=mime_type),
|
421
|
+
_handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
|
393
422
|
]
|
394
423
|
)
|
395
424
|
return PandocResult(
|
@@ -409,8 +438,13 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
|
|
409
438
|
Returns:
|
410
439
|
PandocResult containing processed content and metadata.
|
411
440
|
"""
|
412
|
-
extension =
|
441
|
+
extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
|
442
|
+
|
443
|
+
with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
|
444
|
+
try:
|
445
|
+
await AsyncPath(input_file.name).write_bytes(content)
|
446
|
+
return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
|
413
447
|
|
414
|
-
|
415
|
-
|
416
|
-
|
448
|
+
finally:
|
449
|
+
input_file.close()
|
450
|
+
await AsyncPath(input_file.name).unlink()
|
@@ -186,8 +186,9 @@ async def validate_tesseract_version() -> None:
|
|
186
186
|
if version_ref["checked"]:
|
187
187
|
return
|
188
188
|
|
189
|
-
|
190
|
-
|
189
|
+
command = ["tesseract", "--version"]
|
190
|
+
result = await run_sync(subprocess.run, command, capture_output=True)
|
191
|
+
version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
|
191
192
|
if not version_match or int(version_match.group(1)) < 5:
|
192
193
|
raise MissingDependencyError("Tesseract version 5 or above is required.")
|
193
194
|
|
@@ -213,10 +214,10 @@ async def process_file(
|
|
213
214
|
Returns:
|
214
215
|
str: Extracted text from the image.
|
215
216
|
"""
|
216
|
-
with NamedTemporaryFile(suffix=".txt") as output_file:
|
217
|
+
with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
|
217
218
|
# this is needed because tesseract adds .txt to the output file
|
218
|
-
output_file_name = output_file.name.replace(".txt", "")
|
219
219
|
try:
|
220
|
+
output_file_name = output_file.name.replace(".txt", "")
|
220
221
|
command = [
|
221
222
|
"tesseract",
|
222
223
|
str(input_file),
|
@@ -239,11 +240,15 @@ async def process_file(
|
|
239
240
|
if not result.returncode == 0:
|
240
241
|
raise OCRError("OCR failed with a non-0 return code.")
|
241
242
|
|
242
|
-
output = await AsyncPath(output_file.name).read_text()
|
243
|
+
output = await AsyncPath(output_file.name).read_text("utf-8")
|
243
244
|
return output.strip()
|
244
245
|
except (RuntimeError, OSError) as e:
|
245
246
|
raise OCRError("Failed to OCR using tesseract") from e
|
246
247
|
|
248
|
+
finally:
|
249
|
+
output_file.close()
|
250
|
+
await AsyncPath(output_file.name).unlink()
|
251
|
+
|
247
252
|
|
248
253
|
async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
|
249
254
|
"""Process a single Pillow Image using Tesseract OCR.
|
@@ -257,9 +262,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
|
|
257
262
|
Returns:
|
258
263
|
str: Extracted text from the image.
|
259
264
|
"""
|
260
|
-
with NamedTemporaryFile(suffix=".png") as image_file:
|
261
|
-
|
262
|
-
|
265
|
+
with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
|
266
|
+
try:
|
267
|
+
await run_sync(image.save, image_file.name, format="PNG")
|
268
|
+
return await process_file(image_file.name, language=language, psm=psm, **kwargs)
|
269
|
+
|
270
|
+
finally:
|
271
|
+
image_file.close()
|
272
|
+
await AsyncPath(image_file.name).unlink()
|
263
273
|
|
264
274
|
|
265
275
|
async def process_image_with_tesseract(
|
@@ -20,10 +20,12 @@ from kreuzberg._extractors import (
|
|
20
20
|
extract_content_with_pandoc,
|
21
21
|
extract_file_with_pandoc,
|
22
22
|
extract_html_string,
|
23
|
-
|
23
|
+
extract_pdf,
|
24
24
|
extract_pptx_file,
|
25
|
+
extract_xlsx_file,
|
25
26
|
)
|
26
27
|
from kreuzberg._mime_types import (
|
28
|
+
EXCEL_MIME_TYPE,
|
27
29
|
HTML_MIME_TYPE,
|
28
30
|
IMAGE_MIME_TYPE_EXT_MAP,
|
29
31
|
IMAGE_MIME_TYPES,
|
@@ -69,18 +71,21 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
69
71
|
)
|
70
72
|
|
71
73
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
)
|
74
|
+
return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
75
|
+
|
76
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
77
|
+
return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
77
78
|
|
78
79
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
79
|
-
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
80
|
+
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
|
81
|
+
try:
|
82
|
+
await AsyncPath(temp_file.name).write_bytes(content)
|
83
|
+
return ExtractionResult(
|
84
|
+
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
|
85
|
+
)
|
86
|
+
finally:
|
87
|
+
temp_file.close()
|
88
|
+
await AsyncPath(temp_file.name).unlink()
|
84
89
|
|
85
90
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
86
91
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
@@ -132,7 +137,10 @@ async def extract_file(
|
|
132
137
|
raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
|
133
138
|
|
134
139
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
135
|
-
return ExtractionResult(content=await
|
140
|
+
return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
141
|
+
|
142
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
143
|
+
return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
136
144
|
|
137
145
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
138
146
|
return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.5.0
|
3
|
+
Version: 1.7.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -29,7 +29,8 @@ Requires-Dist: charset-normalizer>=3.4.1
|
|
29
29
|
Requires-Dist: html-to-markdown>=1.2.0
|
30
30
|
Requires-Dist: pypdfium2>=4.30.1
|
31
31
|
Requires-Dist: python-pptx>=1.0.2
|
32
|
-
Requires-Dist: typing-extensions>=4.12.2
|
32
|
+
Requires-Dist: typing-extensions>=4.12.2
|
33
|
+
Requires-Dist: xlsx2csv>=0.8.4
|
33
34
|
|
34
35
|
# Kreuzberg
|
35
36
|
|
@@ -68,16 +69,12 @@ pip install kreuzberg
|
|
68
69
|
|
69
70
|
### 2. Install System Dependencies
|
70
71
|
|
71
|
-
Kreuzberg requires two
|
72
|
+
Kreuzberg requires two system-level dependencies:
|
72
73
|
|
73
74
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
74
|
-
|
75
|
-
- GPL v2.0 licensed (used via CLI only)
|
76
|
-
- Handles office documents and markup formats
|
77
|
-
|
78
75
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
79
|
-
|
80
|
-
|
76
|
+
|
77
|
+
Please install these using their respective installation guides.
|
81
78
|
|
82
79
|
## Architecture
|
83
80
|
|
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
87
84
|
- `pypdfium2` for searchable PDFs
|
88
85
|
- Tesseract OCR for scanned content
|
89
86
|
- **Document Conversion**:
|
90
|
-
- Pandoc for
|
87
|
+
- Pandoc for many document and markup formats
|
91
88
|
- `python-pptx` for PowerPoint files
|
92
89
|
- `html-to-markdown` for HTML content
|
90
|
+
- `xlsx2csv` for Excel spreadsheets
|
93
91
|
- **Text Processing**:
|
94
92
|
- Smart encoding detection
|
95
93
|
- Markdown and plain text handling
|
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
121
119
|
|
122
120
|
#### Data and Research Formats
|
123
121
|
|
122
|
+
- Excel spreadsheets (`.xlsx`)
|
124
123
|
- CSV (`.csv`) and TSV (`.tsv`) files
|
125
124
|
- Jupyter Notebooks (`.ipynb`)
|
126
125
|
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
@@ -232,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
|
|
232
231
|
|
233
232
|
### Error Handling
|
234
233
|
|
235
|
-
Kreuzberg provides
|
234
|
+
Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
|
236
235
|
|
237
236
|
```python
|
238
237
|
from kreuzberg import extract_file
|
239
|
-
from kreuzberg.exceptions import
|
238
|
+
from kreuzberg.exceptions import (
|
239
|
+
ValidationError,
|
240
|
+
ParsingError,
|
241
|
+
OCRError,
|
242
|
+
MissingDependencyError
|
243
|
+
)
|
240
244
|
|
241
245
|
async def safe_extract(path: str) -> str:
|
242
246
|
try:
|
@@ -244,20 +248,31 @@ async def safe_extract(path: str) -> str:
|
|
244
248
|
return result.content
|
245
249
|
|
246
250
|
except ValidationError as e:
|
247
|
-
#
|
248
|
-
# - Unsupported
|
251
|
+
# Input validation issues
|
252
|
+
# - Unsupported or undetectable MIME types
|
249
253
|
# - Missing files
|
250
|
-
# - Invalid
|
251
|
-
print(f"
|
252
|
-
|
254
|
+
# - Invalid input parameters
|
255
|
+
print(f"Validation failed: {e}")
|
256
|
+
|
257
|
+
except OCRError as e:
|
258
|
+
# OCR-specific issues
|
259
|
+
# - Tesseract processing failures
|
260
|
+
# - Image conversion problems
|
261
|
+
print(f"OCR failed: {e}")
|
262
|
+
|
263
|
+
except MissingDependencyError as e:
|
264
|
+
# System dependency issues
|
265
|
+
# - Missing Tesseract OCR
|
266
|
+
# - Missing Pandoc
|
267
|
+
# - Incompatible versions
|
268
|
+
print(f"Dependency missing: {e}")
|
253
269
|
|
254
270
|
except ParsingError as e:
|
255
|
-
#
|
271
|
+
# General processing errors
|
256
272
|
# - PDF parsing failures
|
257
|
-
# - OCR errors
|
258
273
|
# - Format conversion issues
|
259
|
-
|
260
|
-
print(f"
|
274
|
+
# - Encoding problems
|
275
|
+
print(f"Processing failed: {e}")
|
261
276
|
|
262
277
|
return ""
|
263
278
|
|
@@ -265,24 +280,33 @@ async def safe_extract(path: str) -> str:
|
|
265
280
|
try:
|
266
281
|
result = await extract_file("document.xyz")
|
267
282
|
except ValidationError as e:
|
268
|
-
#
|
269
|
-
#
|
283
|
+
# Error will include context:
|
284
|
+
# ValidationError: Unsupported mime type
|
285
|
+
# Context: {
|
270
286
|
# "file_path": "document.xyz",
|
271
|
-
# "
|
272
|
-
# "supported_types": ["pdf", "docx", ...]
|
287
|
+
# "supported_mimetypes": ["application/pdf", ...]
|
273
288
|
# }
|
289
|
+
print(e)
|
274
290
|
|
275
291
|
try:
|
276
|
-
result = await extract_file("scan.
|
277
|
-
except
|
278
|
-
#
|
279
|
-
#
|
280
|
-
#
|
281
|
-
# "
|
282
|
-
# "
|
292
|
+
result = await extract_file("scan.jpg")
|
293
|
+
except OCRError as e:
|
294
|
+
# Error will include context:
|
295
|
+
# OCRError: OCR failed with a non-0 return code
|
296
|
+
# Context: {
|
297
|
+
# "file_path": "scan.jpg",
|
298
|
+
# "tesseract_version": "5.3.0"
|
283
299
|
# }
|
300
|
+
print(e)
|
284
301
|
```
|
285
302
|
|
303
|
+
All exceptions provide:
|
304
|
+
|
305
|
+
- A descriptive error message
|
306
|
+
- Relevant context in the `context` attribute
|
307
|
+
- String representation with both message and context
|
308
|
+
- Proper exception chaining for debugging
|
309
|
+
|
286
310
|
## Roadmap
|
287
311
|
|
288
312
|
V1:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "kreuzberg"
|
3
|
-
version = "1.5.0"
|
3
|
+
version = "1.7.0"
|
4
4
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
readme = "README.md"
|
6
6
|
keywords = [
|
@@ -36,27 +36,28 @@ classifiers = [
|
|
36
36
|
]
|
37
37
|
|
38
38
|
dependencies = [
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
"anyio>=4.8.0",
|
40
|
+
"charset-normalizer>=3.4.1",
|
41
|
+
"html-to-markdown>=1.2.0",
|
42
|
+
"pypdfium2>=4.30.1",
|
43
|
+
"python-pptx>=1.0.2",
|
44
|
+
"typing-extensions>=4.12.2",
|
45
|
+
"xlsx2csv>=0.8.4",
|
45
46
|
]
|
46
47
|
urls.homepage = "https://github.com/Goldziher/kreuzberg"
|
47
48
|
|
48
49
|
[dependency-groups]
|
49
50
|
dev = [
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
51
|
+
"covdefaults>=2.3.0",
|
52
|
+
"mypy>=1.15.0",
|
53
|
+
"pre-commit>=4.1.0",
|
54
|
+
"pytest>=8.3.4",
|
55
|
+
"pytest-asyncio>=0.25.3",
|
56
|
+
"pytest-cov>=6.0.0",
|
57
|
+
"pytest-mock>=3.14.0",
|
58
|
+
"pytest-timeout>=2.3.1",
|
59
|
+
"python-dotenv>=1.0.1",
|
60
|
+
"ruff>=0.9.6",
|
60
61
|
]
|
61
62
|
|
62
63
|
[tool.setuptools.packages.find]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|