kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +46 -81
- kreuzberg/_mime_types.py +22 -31
- kreuzberg/_pandoc.py +416 -0
- kreuzberg/_string.py +9 -12
- kreuzberg/_tesseract.py +318 -0
- kreuzberg/exceptions.py +9 -1
- kreuzberg/extraction.py +16 -16
- kreuzberg-1.5.0.dist-info/METADATA +318 -0
- kreuzberg-1.5.0.dist-info/RECORD +15 -0
- kreuzberg-1.3.0.dist-info/METADATA +0 -306
- kreuzberg-1.3.0.dist-info/RECORD +0 -13
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -4,53 +4,62 @@ import re
|
|
4
4
|
from contextlib import suppress
|
5
5
|
from html import escape
|
6
6
|
from io import BytesIO
|
7
|
-
from typing import TYPE_CHECKING
|
7
|
+
from typing import TYPE_CHECKING
|
8
8
|
|
9
|
+
import html_to_markdown
|
10
|
+
import pptx
|
11
|
+
import pypdfium2
|
9
12
|
from anyio import Path as AsyncPath
|
10
|
-
from charset_normalizer import detect
|
11
|
-
from html_to_markdown import convert_to_markdown
|
12
|
-
from pptx import Presentation
|
13
13
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
14
|
-
from pypandoc import convert_file, convert_text
|
15
|
-
from pypdfium2 import PdfDocument, PdfiumError
|
16
|
-
from pytesseract import TesseractError, image_to_string
|
17
14
|
|
18
|
-
from kreuzberg.
|
15
|
+
from kreuzberg._pandoc import process_content, process_file
|
19
16
|
from kreuzberg._string import normalize_spaces, safe_decode
|
20
17
|
from kreuzberg._sync import run_sync
|
18
|
+
from kreuzberg._tesseract import batch_process_images
|
21
19
|
from kreuzberg.exceptions import ParsingError
|
22
20
|
|
23
21
|
if TYPE_CHECKING: # pragma: no cover
|
24
22
|
from pathlib import Path
|
25
23
|
|
24
|
+
from PIL.Image import Image
|
26
25
|
|
27
|
-
|
28
|
-
|
26
|
+
|
27
|
+
async def convert_pdf_to_images(file_path: Path) -> list[Image]:
|
28
|
+
"""Convert a PDF file to images.
|
29
29
|
|
30
30
|
Args:
|
31
31
|
file_path: The path to the PDF file.
|
32
32
|
|
33
33
|
Raises:
|
34
|
-
ParsingError: If the
|
34
|
+
ParsingError: If the PDF file could not be converted to images.
|
35
35
|
|
36
36
|
Returns:
|
37
|
-
|
37
|
+
A list of Pillow Images.
|
38
38
|
"""
|
39
39
|
try:
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
text = "\n".join(image_to_string(img) for img in images)
|
45
|
-
return normalize_spaces(text)
|
46
|
-
except (PdfiumError, TesseractError) as e:
|
47
|
-
# TODO: add test case
|
40
|
+
pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
|
41
|
+
return [page.render(scale=2.0).to_pil() for page in pdf]
|
42
|
+
except pypdfium2.PdfiumError as e:
|
48
43
|
raise ParsingError(
|
49
|
-
"Could not
|
44
|
+
"Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
|
50
45
|
) from e
|
51
46
|
|
52
47
|
|
53
|
-
def
|
48
|
+
async def extract_pdf_with_tesseract(file_path: Path) -> str:
|
49
|
+
"""Extract text from a scanned PDF file using pytesseract.
|
50
|
+
|
51
|
+
Args:
|
52
|
+
file_path: The path to the PDF file.
|
53
|
+
|
54
|
+
Returns:
|
55
|
+
The extracted text.
|
56
|
+
"""
|
57
|
+
images = await convert_pdf_to_images(file_path)
|
58
|
+
ocr_results = await batch_process_images(images)
|
59
|
+
return normalize_spaces("\n".join(ocr_results))
|
60
|
+
|
61
|
+
|
62
|
+
async def extract_pdf_with_pdfium2(file_path: Path) -> str:
|
54
63
|
"""Extract text from a searchable PDF file using pypdfium2.
|
55
64
|
|
56
65
|
Args:
|
@@ -63,17 +72,16 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
|
|
63
72
|
The extracted text.
|
64
73
|
"""
|
65
74
|
try:
|
66
|
-
document = PdfDocument
|
75
|
+
document = await run_sync(pypdfium2.PdfDocument, file_path)
|
67
76
|
text = "\n".join(page.get_textpage().get_text_range() for page in document)
|
68
77
|
return normalize_spaces(text)
|
69
|
-
except PdfiumError as e:
|
70
|
-
# TODO: add test case
|
78
|
+
except pypdfium2.PdfiumError as e:
|
71
79
|
raise ParsingError(
|
72
80
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
73
81
|
) from e
|
74
82
|
|
75
83
|
|
76
|
-
async def
|
84
|
+
async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
77
85
|
"""Extract text from a PDF file.
|
78
86
|
|
79
87
|
Args:
|
@@ -83,84 +91,41 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
|
83
91
|
Returns:
|
84
92
|
The extracted text.
|
85
93
|
"""
|
86
|
-
if not force_ocr and (content := await
|
94
|
+
if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
|
87
95
|
return normalize_spaces(content)
|
88
96
|
|
89
|
-
return
|
97
|
+
return await extract_pdf_with_tesseract(file_path)
|
90
98
|
|
91
99
|
|
92
|
-
async def
|
100
|
+
async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
|
93
101
|
"""Extract text using pandoc.
|
94
102
|
|
95
103
|
Args:
|
96
104
|
file_data: The content of the file.
|
97
105
|
mime_type: The mime type of the file.
|
98
|
-
encoding: An optional encoding to use when decoding the string.
|
99
|
-
|
100
|
-
Raises:
|
101
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
102
106
|
|
103
107
|
Returns:
|
104
108
|
The extracted text.
|
105
109
|
"""
|
106
|
-
|
107
|
-
|
108
|
-
try:
|
109
|
-
return normalize_spaces(
|
110
|
-
cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
|
111
|
-
)
|
112
|
-
except RuntimeError as e:
|
113
|
-
# TODO: add test case
|
114
|
-
raise ParsingError(
|
115
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
116
|
-
context={"error": str(e)},
|
117
|
-
) from e
|
110
|
+
result = await process_content(file_data, mime_type=mime_type)
|
111
|
+
return normalize_spaces(result.content)
|
118
112
|
|
119
113
|
|
120
|
-
async def
|
114
|
+
async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
|
121
115
|
"""Extract text using pandoc.
|
122
116
|
|
123
117
|
Args:
|
124
118
|
file_path: The path to the file.
|
125
119
|
mime_type: The mime type of the file.
|
126
120
|
|
127
|
-
Raises:
|
128
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
129
|
-
|
130
121
|
Returns:
|
131
122
|
The extracted text.
|
132
123
|
"""
|
133
|
-
|
134
|
-
|
135
|
-
return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
|
136
|
-
except RuntimeError as e:
|
137
|
-
raise ParsingError(
|
138
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
|
139
|
-
context={"file_path": str(file_path), "error": str(e)},
|
140
|
-
) from e
|
141
|
-
|
142
|
-
|
143
|
-
async def _extract_image_with_tesseract(file_path: Path | str) -> str:
|
144
|
-
"""Extract text from an image file.
|
145
|
-
|
146
|
-
Args:
|
147
|
-
file_path: The path to the image file.
|
148
|
-
|
149
|
-
Raises:
|
150
|
-
ParsingError: If the text could not be extracted from the image file.
|
151
|
-
|
152
|
-
Returns:
|
153
|
-
The extracted content.
|
154
|
-
"""
|
155
|
-
try:
|
156
|
-
return normalize_spaces(cast(str, image_to_string(str(file_path))))
|
157
|
-
except TesseractError as e:
|
158
|
-
raise ParsingError(
|
159
|
-
"Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
|
160
|
-
) from e
|
124
|
+
result = await process_file(file_path, mime_type=mime_type)
|
125
|
+
return normalize_spaces(result.content)
|
161
126
|
|
162
127
|
|
163
|
-
async def
|
128
|
+
async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
164
129
|
"""Extract text from a PPTX file.
|
165
130
|
|
166
131
|
Notes:
|
@@ -178,7 +143,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
178
143
|
if isinstance(file_path_or_contents, bytes)
|
179
144
|
else await AsyncPath(file_path_or_contents).read_bytes()
|
180
145
|
)
|
181
|
-
presentation = Presentation(BytesIO(file_contents))
|
146
|
+
presentation = pptx.Presentation(BytesIO(file_contents))
|
182
147
|
|
183
148
|
for index, slide in enumerate(presentation.slides):
|
184
149
|
md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
|
@@ -230,7 +195,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
230
195
|
return normalize_spaces(md_content)
|
231
196
|
|
232
197
|
|
233
|
-
async def
|
198
|
+
async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
234
199
|
"""Extract text from an HTML string.
|
235
200
|
|
236
201
|
Args:
|
@@ -244,4 +209,4 @@ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
|
244
209
|
if isinstance(file_path_or_contents, bytes)
|
245
210
|
else await AsyncPath(file_path_or_contents).read_text()
|
246
211
|
)
|
247
|
-
return normalize_spaces(await run_sync(convert_to_markdown, content))
|
212
|
+
return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))
|
kreuzberg/_mime_types.py
CHANGED
@@ -54,44 +54,35 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
|
54
54
|
"image/x-portable-pixmap": "ppm",
|
55
55
|
}
|
56
56
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
57
|
-
"application/
|
58
|
-
"application/
|
57
|
+
"application/csl+json",
|
58
|
+
"application/docbook+xml",
|
59
|
+
"application/epub+zip",
|
59
60
|
"application/rtf",
|
60
61
|
"application/vnd.oasis.opendocument.text",
|
61
62
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
62
|
-
"application/x-
|
63
|
+
"application/x-biblatex",
|
64
|
+
"application/x-bibtex",
|
65
|
+
"application/x-endnote+xml",
|
66
|
+
"application/x-fictionbook+xml",
|
67
|
+
"application/x-ipynb+json",
|
68
|
+
"application/x-jats+xml",
|
63
69
|
"application/x-latex",
|
64
|
-
"application/x-
|
65
|
-
"application/x-
|
70
|
+
"application/x-opml+xml",
|
71
|
+
"application/x-research-info-systems",
|
72
|
+
"application/x-typst",
|
66
73
|
"text/csv",
|
67
|
-
"text/latex",
|
68
|
-
"text/rst",
|
69
|
-
"text/rtf",
|
70
74
|
"text/tab-separated-values",
|
71
|
-
"text/
|
72
|
-
"text/x-
|
75
|
+
"text/troff",
|
76
|
+
"text/x-commonmark",
|
77
|
+
"text/x-dokuwiki",
|
78
|
+
"text/x-gfm",
|
79
|
+
"text/x-markdown",
|
80
|
+
"text/x-markdown-extra",
|
81
|
+
"text/x-mdoc",
|
82
|
+
"text/x-multimarkdown",
|
83
|
+
"text/x-org",
|
84
|
+
"text/x-pod",
|
73
85
|
"text/x-rst",
|
74
|
-
"text/x-tsv",
|
75
|
-
}
|
76
|
-
PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
77
|
-
"application/csv": "csv",
|
78
|
-
"application/latex": "latex",
|
79
|
-
"application/rtf": "rtf",
|
80
|
-
"application/vnd.oasis.opendocument.text": "odt",
|
81
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
82
|
-
"application/x-csv": "csv",
|
83
|
-
"application/x-latex": "latex",
|
84
|
-
"application/x-rtf": "rtf",
|
85
|
-
"application/x-vnd.oasis.opendocument.text": "odt",
|
86
|
-
"text/csv": "csv",
|
87
|
-
"text/latex": "latex",
|
88
|
-
"text/rst": "rst",
|
89
|
-
"text/rtf": "rtf",
|
90
|
-
"text/tab-separated-values": "tsv",
|
91
|
-
"text/x-csv": "csv",
|
92
|
-
"text/x-latex": "latex",
|
93
|
-
"text/x-rst": "rst",
|
94
|
-
"text/x-tsv": "tsv",
|
95
86
|
}
|
96
87
|
|
97
88
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|