kreuzberg 1.6.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +52 -19
- kreuzberg/_pandoc.py +103 -69
- kreuzberg/_tesseract.py +18 -8
- kreuzberg/extraction.py +12 -12
- {kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/METADATA +49 -24
- kreuzberg-1.7.0.dist-info/RECORD +15 -0
- kreuzberg-1.6.0.dist-info/RECORD +0 -15
- {kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import re
+from asyncio import gather
 from contextlib import suppress
 from html import escape
 from io import BytesIO
@@ -37,13 +38,18 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
     Returns:
         A list of Pillow Images.
     """
+    pdf = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        pdf = await run_sync(pypdfium2.PdfDocument,
+        pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
         return [page.render(scale=2.0).to_pil() for page in pdf]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if pdf is not None:
+            pdf.close()
 
 
 async def extract_pdf_with_tesseract(file_path: Path) -> str:
@@ -72,30 +78,49 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
     Returns:
         The extracted text.
     """
+    document = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        document = await run_sync(pypdfium2.PdfDocument,
-        text = "\n".join(page.get_textpage().
+        document = await run_sync(pypdfium2.PdfDocument, resolved_path)
+        text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
         return normalize_spaces(text)
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if document is not None:
+            document.close()
 
 
-async def
+async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
 
     Args:
-
+        file_path_or_contents: The path to the PDF file or its contents as bytes.
         force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
 
     Returns:
         The extracted text.
     """
-    if
+    if isinstance(file_path_or_contents, bytes):
+        with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
+            try:
+                file_path = Path(pdf_file.name)
+                await AsyncPath(file_path).write_bytes(file_path_or_contents)
+
+                if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+                    return normalize_spaces(content)
+
+                return await extract_pdf_with_tesseract(file_path)
+            finally:
+                pdf_file.close()
+                await AsyncPath(pdf_file.name).unlink()
+
+    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
         return normalize_spaces(content)
 
-    return await extract_pdf_with_tesseract(
+    return await extract_pdf_with_tesseract(file_path_or_contents)
 
 
 async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
@@ -122,7 +147,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
     Returns:
         The extracted text.
     """
-
+    resolved_path = str(await AsyncPath(file_path).resolve())
+    result = await process_file(resolved_path, mime_type=mime_type)
     return normalize_spaces(result.content)
 
 
@@ -208,26 +234,33 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
     Raises:
         ParsingError: If the XLSX file could not be parsed.
     """
-
-
+    with (
+        NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
+        NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
+    ):
+        try:
             if isinstance(file_path_or_contents, bytes):
                 xlsx_file.write(file_path_or_contents)
                 xlsx_file.flush()
                 xlsx_path = xlsx_file.name
             else:
-                xlsx_path = str(file_path_or_contents)
+                xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
 
             await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
             result = await process_file(csv_file.name, mime_type="text/csv")
             return normalize_spaces(result.content)
-
-
-
-
-
-
-
-
+        except Exception as e:
+            raise ParsingError(
+                "Could not extract text from XLSX file",
+                context={
+                    "error": str(e),
+                    "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+                },
+            ) from e
+        finally:
+            xlsx_file.close()
+            csv_file.close()
+            await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
 
 
 async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
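The thread running through these changes is a single temp-file lifecycle: bytes are written to a `NamedTemporaryFile` opened with `delete=False`, the resulting path is handed to a path-based extractor, and the file is closed and unlinked in a `finally` block so cleanup also happens on the error path. A minimal standalone sketch of that pattern follows; `extract_from_path` is a hypothetical stand-in for the pdfium/Tesseract helpers, and `asyncio.to_thread` approximates the library's `run_sync`/`AsyncPath` wrappers.

```python
import asyncio
from pathlib import Path
from tempfile import NamedTemporaryFile


async def extract_from_path(path: Path) -> str:
    # Hypothetical stand-in for a path-based extractor (pdfium, Tesseract, pandoc, ...).
    return f"extracted {path.stat().st_size} bytes"


async def extract_from_bytes(data: bytes, suffix: str = ".pdf") -> str:
    # delete=False keeps the file on disk after the handle closes, so external tools can
    # open it by name; that in turn makes the explicit unlink below necessary.
    with NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        path = Path(handle.name)
        try:
            # Blocking I/O runs in a worker thread, in the spirit of run_sync / AsyncPath.
            await asyncio.to_thread(path.write_bytes, data)
            return await extract_from_path(path)
        finally:
            handle.close()
            await asyncio.to_thread(path.unlink)


print(asyncio.run(extract_from_bytes(b"%PDF-1.7 stub")))
```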
kreuzberg/_pandoc.py
CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import json
 import subprocess
 from asyncio import gather
 from dataclasses import dataclass
+from json import JSONDecodeError, loads
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
 
@@ -13,7 +13,7 @@ from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
     from os import PathLike
 
@@ -80,7 +80,7 @@ NodeType = Literal[
     "MetaBlocks",
 ]
 
-
+MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
     "application/csl+json": "csljson",
     "application/docbook+xml": "docbook",
     "application/epub+zip": "epub",
@@ -112,6 +112,38 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
     "text/x-rst": "rst",
 }
 
+MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
+    "application/csl+json": "json",
+    "application/docbook+xml": "xml",
+    "application/epub+zip": "epub",
+    "application/rtf": "rtf",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/x-biblatex": "bib",
+    "application/x-bibtex": "bib",
+    "application/x-endnote+xml": "xml",
+    "application/x-fictionbook+xml": "fb2",
+    "application/x-ipynb+json": "ipynb",
+    "application/x-jats+xml": "xml",
+    "application/x-latex": "tex",
+    "application/x-opml+xml": "opml",
+    "application/x-research-info-systems": "ris",
+    "application/x-typst": "typst",
+    "text/csv": "csv",
+    "text/tab-separated-values": "tsv",
+    "text/troff": "1",
+    "text/x-commonmark": "md",
+    "text/x-dokuwiki": "wiki",
+    "text/x-gfm": "md",
+    "text/x-markdown": "md",
+    "text/x-markdown-extra": "md",
+    "text/x-mdoc": "md",
+    "text/x-multimarkdown": "md",
+    "text/x-org": "org",
+    "text/x-pod": "pod",
+    "text/x-rst": "rst",
+}
+
 
 class Metadata(TypedDict, total=False):
     """Document metadata extracted from Pandoc document.
@@ -232,7 +264,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
 
 
 def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
-    """Extract all non-empty metadata values from Pandoc AST metadata."""
     meta: Metadata = {}
 
     for key, value in raw_meta.items():
@@ -252,34 +283,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
     return meta
 
 
-def
-    if mime_type not in
-        mime_type.startswith(value) for value in
+def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
+    if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
+        mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
     ):
        raise ValidationError(
            f"Unsupported mime type: {mime_type}",
            context={
                "mime_type": mime_type,
-                "supported_mimetypes": ",".join(sorted(
+                "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
            },
        )
 
-    return
-
+    return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
+        MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
     )
 
 
-async def
-    """Validate that Pandoc is installed and is version 3 or above.
-
-    Raises:
-        MissingDependencyError: If Pandoc is not installed or is below version 3.
-    """
+async def _validate_pandoc_version() -> None:
     try:
         if version_ref["checked"]:
             return
 
-
+        command = ["pandoc", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
         version = result.stdout.decode().split("\n")[0].split()[1]
         if not version.startswith("3."):
             raise MissingDependencyError("Pandoc version 3 or above is required.")
@@ -290,27 +317,15 @@ async def validate_pandoc_version() -> None:
         raise MissingDependencyError("Pandoc is not installed.") from e
 
 
-async def
-
-
-    Args:
-        input_file: The path to the file to process.
-        mime_type: The mime type of the file.
-
-    Raises:
-        ParsingError: If Pandoc fails to extract metadata.
-
-    Returns:
-        Dictionary containing document metadata.
-    """
-    extension = _get_extension_from_mime_type(mime_type)
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-    with NamedTemporaryFile(suffix=".json") as metadata_file:
+    with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
         try:
             command = [
                 "pandoc",
                 str(input_file),
-                f"--from={
+                f"--from={pandoc_type}",
                 "--to=json",
                 "--standalone",
                 "--quiet",
@@ -329,46 +344,60 @@ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -
                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
                )
 
-            json_data =
+            json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
             return _extract_metadata(json_data)
 
-        except (RuntimeError, OSError,
+        except (RuntimeError, OSError, JSONDecodeError) as e:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
 
+        finally:
+            metadata_file.close()
+            await AsyncPath(metadata_file.name).unlink()
 
-async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
-    extension = _get_extension_from_mime_type(mime_type)
-
-    with NamedTemporaryFile(suffix=".md") as output_file:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={extension}",
-            "--to=markdown",
-            "--standalone",
-            "--wrap=preserve",
-            "--quiet",
-            "--output",
-            output_file.name,
-        ]
 
-
-
+async def _handle_extract_file(
+    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
+) -> str:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-
-
-        command
-
-
+    with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
+        try:
+            command = [
+                "pandoc",
+                str(input_file),
+                f"--from={pandoc_type}",
+                "--to=markdown",
+                "--standalone",
+                "--wrap=preserve",
+                "--quiet",
+                "--output",
+                output_file.name,
+            ]
+
+            if extra_args:
+                command.extend(extra_args)
 
-
-
-
+            result = await run_sync(
+                subprocess.run,
+                command,
+                capture_output=True,
            )
 
-
+            if result.returncode != 0:
+                raise ParsingError(
+                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+                )
 
-
+            text = await AsyncPath(output_file.name).read_text("utf-8")
+
+            return normalize_spaces(text)
+
+        except (RuntimeError, OSError) as e:
+            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
 
 
 async def process_file(
@@ -384,12 +413,12 @@ async def process_file(
     Returns:
         PandocResult containing processed content and metadata.
     """
-    await
+    await _validate_pandoc_version()
 
     metadata, content = await gather(
         *[
-
-
+            _handle_extract_metadata(input_file, mime_type=mime_type),
+            _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
        ]
    )
    return PandocResult(
@@ -409,8 +438,13 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
     Returns:
         PandocResult containing processed content and metadata.
     """
-    extension =
+    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
+
+    with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
+        try:
+            await AsyncPath(input_file.name).write_bytes(content)
+            return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
 
-
-
-
+        finally:
+            input_file.close()
+            await AsyncPath(input_file.name).unlink()
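Two mappings now drive format selection here: `MIMETYPE_TO_PANDOC_TYPE_MAPPING` picks the pandoc reader passed to `--from=...`, and the new `MIMETYPE_TO_FILE_EXTENSION_MAPPING` picks a file suffix when raw bytes have to be spooled to disk for `process_content`. Below is a rough, self-contained sketch of the reader lookup in the spirit of `_get_pandoc_type_from_mime_type`, using only the three mapping entries visible in this diff: exact matches win, and a prefix match is accepted as a fallback.

```python
from typing import Final, Mapping

# Illustrative subset of MIMETYPE_TO_PANDOC_TYPE_MAPPING; the full table covers ~30 formats.
MIME_TO_PANDOC: Final[Mapping[str, str]] = {
    "application/csl+json": "csljson",
    "application/docbook+xml": "docbook",
    "application/epub+zip": "epub",
}


def pandoc_type_for(mime_type: str) -> str:
    """Return the pandoc reader for a MIME type: exact match first, prefix fallback second."""
    if mime_type in MIME_TO_PANDOC:
        return MIME_TO_PANDOC[mime_type]
    for key, pandoc_type in MIME_TO_PANDOC.items():
        if mime_type.startswith(key):
            return pandoc_type
    raise ValueError(f"Unsupported mime type: {mime_type}")


print(pandoc_type_for("application/epub+zip"))                     # epub (exact match)
print(pandoc_type_for("application/docbook+xml; charset=utf-8"))   # docbook (prefix fallback)
```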
kreuzberg/_tesseract.py
CHANGED
@@ -186,8 +186,9 @@ async def validate_tesseract_version() -> None:
         if version_ref["checked"]:
             return
 
-
-
+        command = ["tesseract", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
+        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < 5:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
 
@@ -213,10 +214,10 @@ async def process_file(
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt") as output_file:
+    with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
         # this is needed because tesseract adds .txt to the output file
-        output_file_name = output_file.name.replace(".txt", "")
         try:
+            output_file_name = output_file.name.replace(".txt", "")
             command = [
                 "tesseract",
                 str(input_file),
@@ -239,11 +240,15 @@ async def process_file(
             if not result.returncode == 0:
                 raise OCRError("OCR failed with a non-0 return code.")
 
-            output = await AsyncPath(output_file.name).read_text()
+            output = await AsyncPath(output_file.name).read_text("utf-8")
             return output.strip()
         except (RuntimeError, OSError) as e:
             raise OCRError("Failed to OCR using tesseract") from e
 
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
+
 
 async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
     """Process a single Pillow Image using Tesseract OCR.
@@ -257,9 +262,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png") as image_file:
-
-
+    with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
+        try:
+            await run_sync(image.save, image_file.name, format="PNG")
+            return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+
+        finally:
+            image_file.close()
+            await AsyncPath(image_file.name).unlink()
 
 
 async def process_image_with_tesseract(
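The reworked version check is the same shell-out pattern used for pandoc: build an argv list, run `subprocess.run` through `run_sync` so the blocking call happens off the event loop, then parse stdout. A rough standalone equivalent is below; `asyncio.to_thread` stands in for the library's `run_sync` helper, and a plain `RuntimeError` replaces `MissingDependencyError`.

```python
import asyncio
import re
import subprocess


async def check_tesseract_version() -> None:
    # Run the blocking subprocess call in a worker thread instead of on the event loop.
    result = await asyncio.to_thread(
        subprocess.run, ["tesseract", "--version"], capture_output=True
    )
    # Same regex as the diff: accepts "tesseract 5.x.y" as well as "tesseract v5.x.y".
    match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
    if not match or int(match.group(1)) < 5:
        raise RuntimeError("Tesseract version 5 or above is required.")


if __name__ == "__main__":
    asyncio.run(check_tesseract_version())
```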
kreuzberg/extraction.py
CHANGED
@@ -20,7 +20,7 @@ from kreuzberg._extractors import (
     extract_content_with_pandoc,
     extract_file_with_pandoc,
     extract_html_string,
-
+    extract_pdf,
     extract_pptx_file,
     extract_xlsx_file,
 )
@@ -71,21 +71,21 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
     )
 
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
         return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
 
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
-
-
-
-
+        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
+            try:
+                await AsyncPath(temp_file.name).write_bytes(content)
+                return ExtractionResult(
+                    content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
+                )
+            finally:
+                temp_file.close()
+                await AsyncPath(temp_file.name).unlink()
 
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
@@ -137,7 +137,7 @@ async def extract_file(
         raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
 
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await
+        return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
         return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
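With this change both entry points route PDFs through the new `extract_pdf` helper, whether the caller holds a path or raw bytes. A short usage sketch, assuming `extract_bytes` is exported at the package level the same way the README already imports `extract_file`, and using a hypothetical `report.pdf`:

```python
import asyncio
from pathlib import Path

from kreuzberg import extract_bytes, extract_file


async def main() -> None:
    # Path input: extract_file detects the PDF mime type and calls extract_pdf(file_path, force_ocr).
    from_path = await extract_file("report.pdf")

    # Bytes input: extract_bytes calls extract_pdf(content, force_ocr), which spools the bytes
    # to a temporary file before trying pdfium and falling back to Tesseract OCR.
    from_bytes = await extract_bytes(Path("report.pdf").read_bytes(), "application/pdf")

    # Both return an ExtractionResult carrying plain text.
    print(from_path.mime_type, len(from_path.content))
    print(from_bytes.mime_type, len(from_bytes.content))


asyncio.run(main())
```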
{kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.6.0
+Version: 1.7.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: typing-extensions>=4.12.2
 Requires-Dist: xlsx2csv>=0.8.4
 
 # Kreuzberg
@@ -231,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
 
 ### Error Handling
 
-Kreuzberg provides
+Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 
 ```python
 from kreuzberg import extract_file
-from kreuzberg.exceptions import
+from kreuzberg.exceptions import (
+    ValidationError,
+    ParsingError,
+    OCRError,
+    MissingDependencyError
+)
 
 async def safe_extract(path: str) -> str:
     try:
@@ -243,20 +248,31 @@ async def safe_extract(path: str) -> str:
         return result.content
 
     except ValidationError as e:
-        #
-        # - Unsupported
+        # Input validation issues
+        # - Unsupported or undetectable MIME types
         # - Missing files
-        # - Invalid
-        print(f"
-
+        # - Invalid input parameters
+        print(f"Validation failed: {e}")
+
+    except OCRError as e:
+        # OCR-specific issues
+        # - Tesseract processing failures
+        # - Image conversion problems
+        print(f"OCR failed: {e}")
+
+    except MissingDependencyError as e:
+        # System dependency issues
+        # - Missing Tesseract OCR
+        # - Missing Pandoc
+        # - Incompatible versions
+        print(f"Dependency missing: {e}")
 
     except ParsingError as e:
-        #
+        # General processing errors
         # - PDF parsing failures
-        # - OCR errors
         # - Format conversion issues
-
-        print(f"
+        # - Encoding problems
+        print(f"Processing failed: {e}")
 
     return ""
 
@@ -264,24 +280,33 @@ async def safe_extract(path: str) -> str:
 try:
     result = await extract_file("document.xyz")
 except ValidationError as e:
-    #
-    #
+    # Error will include context:
+    # ValidationError: Unsupported mime type
+    # Context: {
     #   "file_path": "document.xyz",
-    #   "
-    #   "supported_types": ["pdf", "docx", ...]
+    #   "supported_mimetypes": ["application/pdf", ...]
     # }
+    print(e)
 
 try:
-    result = await extract_file("scan.
-except
-    #
-    #
-    #
-    #   "
-    #   "
+    result = await extract_file("scan.jpg")
+except OCRError as e:
+    # Error will include context:
+    # OCRError: OCR failed with a non-0 return code
+    # Context: {
+    #   "file_path": "scan.jpg",
+    #   "tesseract_version": "5.3.0"
     # }
+    print(e)
 ```
 
+All exceptions provide:
+
+- A descriptive error message
+- Relevant context in the `context` attribute
+- String representation with both message and context
+- Proper exception chaining for debugging
+
 ## Roadmap
 
 V1:
kreuzberg-1.7.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
+kreuzberg/_extractors.py,sha256=3VP7oBz0VpmkkhlbKDPjRmnZdHBv4K_xqcyMeeDaetM,9283
+kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
+kreuzberg/_pandoc.py,sha256=zhNJ8_92JMs4gG_Fj-IVwdpZwWsyaK-VTrbLke6NGyU,15097
+kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
+kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
+kreuzberg/_tesseract.py,sha256=Yya15OxB4PBi2QqmrGXF70_SHBD7Luii9sBXzMJlCpU,8168
+kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
+kreuzberg/extraction.py,sha256=_vJ9O8t50a3p4co3hY8b3BdBIXV5S7XOUNl_kD9_FvM,6599
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-1.7.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-1.7.0.dist-info/METADATA,sha256=3wKe7X5G1IQfSPNzD0wnS0t81MqoWtQ-cgR-6MBoyec,10355
+kreuzberg-1.7.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-1.7.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-1.7.0.dist-info/RECORD,,
kreuzberg-1.6.0.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
-kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
-kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
-kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
-kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
-kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
-kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
-kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.6.0.dist-info/RECORD,,
{kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/LICENSE
File without changes
{kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/WHEEL
File without changes
{kreuzberg-1.6.0.dist-info → kreuzberg-1.7.0.dist-info}/top_level.txt
File without changes