kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +46 -81
- kreuzberg/_mime_types.py +22 -31
- kreuzberg/_pandoc.py +416 -0
- kreuzberg/_string.py +9 -12
- kreuzberg/_tesseract.py +318 -0
- kreuzberg/exceptions.py +9 -1
- kreuzberg/extraction.py +16 -16
- kreuzberg-1.5.0.dist-info/METADATA +318 -0
- kreuzberg-1.5.0.dist-info/RECORD +15 -0
- kreuzberg-1.3.0.dist-info/METADATA +0 -306
- kreuzberg-1.3.0.dist-info/RECORD +0 -13
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/top_level.txt +0 -0
kreuzberg/_tesseract.py
ADDED
@@ -0,0 +1,318 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
import subprocess
|
5
|
+
from asyncio import gather
|
6
|
+
from enum import Enum
|
7
|
+
from os import PathLike
|
8
|
+
from tempfile import NamedTemporaryFile
|
9
|
+
from typing import Any, Literal, TypeVar, Union
|
10
|
+
|
11
|
+
from anyio import Path as AsyncPath
|
12
|
+
from PIL.Image import Image
|
13
|
+
|
14
|
+
from kreuzberg._sync import run_sync
|
15
|
+
from kreuzberg.exceptions import MissingDependencyError, OCRError
|
16
|
+
|
17
|
+
# Module-level memo for the Tesseract version check. validate_tesseract_version()
# sets "checked" to True after a successful check and returns early afterwards.
version_ref = {"checked": False}

# Input accepted by the OCR helpers: a Pillow image or a filesystem path
# (either a string or an os.PathLike).
T = TypeVar("T", bound=Union[Image, PathLike[str], str])
|
20
|
+
|
21
|
+
# Language codes accepted for the ``language`` option and forwarded to
# ``tesseract -l``. Entries are grouped by first letter, in the same order as before.
SupportedLanguages = Literal[
    "afr", "amh", "ara", "asm", "aze", "aze_cyrl",
    "bel", "ben", "bod", "bos", "bre", "bul",
    "cat", "ceb", "ces", "chi_sim", "chi_tra", "chr", "cos", "cym",
    "dan", "dan_frak", "deu", "deu_frak", "deu_latf", "dzo",
    "ell", "eng", "enm", "epo", "equ", "est", "eus",
    "fao", "fas", "fil", "fin", "fra", "frk", "frm", "fry",
    "gla", "gle", "glg", "grc", "guj",
    "hat", "heb", "hin", "hrv", "hun", "hye",
    "iku", "ind", "isl", "ita", "ita_old",
    "jav", "jpn",
    "kan", "kat", "kat_old", "kaz", "khm", "kir", "kmr", "kor", "kor_vert", "kur",
    "lao", "lat", "lav", "lit", "ltz",
    "mal", "mar", "mkd", "mlt", "mon", "mri", "msa", "mya",
    "nep", "nld", "nor",
    "oci", "ori", "osd",
    "pan", "pol", "por", "pus",
    "que",
    "ron", "rus",
    "san", "sin", "slk", "slk_frak", "slv", "snd", "spa", "spa_old", "sqi", "srp", "srp_latn", "sun", "swa", "swe", "syr",
    "tam", "tat", "tel", "tgk", "tgl", "tha", "tir", "ton", "tur",
    "uig", "ukr", "urd", "uzb", "uzb_cyrl",
    "vie",
    "yid", "yor",
]
|
150
|
+
|
151
|
+
|
152
|
+
class PSMMode(Enum):
    """Tesseract page segmentation modes (PSM), named for readability.

    Each member's ``value`` is the integer passed to ``tesseract --psm``.
    Modes 11-13 are included so that every mode offered by the Tesseract
    command line can be selected.
    """

    # Orientation and script detection only.
    OSD_ONLY = 0
    # Automatic page segmentation with orientation and script detection.
    AUTO_OSD = 1
    # Automatic page segmentation without OSD.
    AUTO_ONLY = 2
    # Fully automatic page segmentation (default).
    AUTO = 3
    # Assume a single column of text.
    SINGLE_COLUMN = 4
    # Assume a single uniform block of vertically aligned text.
    SINGLE_BLOCK_VERTICAL = 5
    # Assume a single uniform block of text.
    SINGLE_BLOCK = 6
    # Treat the image as a single text line.
    SINGLE_LINE = 7
    # Treat the image as a single word.
    SINGLE_WORD = 8
    # Treat the image as a single word in a circle.
    CIRCLE_WORD = 9
    # Treat the image as a single character.
    SINGLE_CHAR = 10
    # Sparse text: find as much text as possible in no particular order.
    SPARSE_TEXT = 11
    # Sparse text with orientation and script detection.
    SPARSE_TEXT_OSD = 12
    # Raw line: treat the image as a single text line without further layout analysis.
    RAW_LINE = 13
|
177
|
+
|
178
|
+
|
179
|
+
async def validate_tesseract_version() -> None:
    """Ensure a Tesseract binary of major version 5 or newer is available.

    The result of a successful check is remembered in ``version_ref`` so the
    ``tesseract --version`` subprocess is not spawned again on later calls.

    Raises:
        MissingDependencyError: If the ``tesseract`` executable cannot be found,
            or if its reported major version is missing or lower than 5.
    """
    if version_ref["checked"]:
        return

    try:
        completed = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
    except FileNotFoundError as e:
        # subprocess.run raises FileNotFoundError when the executable is not on PATH.
        raise MissingDependencyError("Tesseract is not installed.") from e

    banner = completed.stdout.decode()
    found = re.search(r"tesseract\s+(\d+)", banner)
    major = int(found.group(1)) if found else None
    if major is None or major < 5:
        raise MissingDependencyError("Tesseract version 5 or above is required.")

    version_ref["checked"] = True
|
197
|
+
|
198
|
+
|
199
|
+
async def process_file(
    input_file: str | PathLike[str], *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any
) -> str:
    """Process a single image file using Tesseract OCR.

    Tesseract is run as a subprocess that writes its result to a temporary
    ``.txt`` file; the file is then read back and returned with surrounding
    whitespace stripped.

    Args:
        input_file: The path to the image file to process.
        language: The language code for OCR, passed to ``-l``.
        psm: Page segmentation mode; its integer value is passed to ``--psm``.
        **kwargs: Additional Tesseract configuration options; each pair is
            passed on the command line as ``-c key=value``.

    Raises:
        OCRError: If Tesseract exits with a non-zero return code, or if running
            it or reading its output raises ``RuntimeError`` or ``OSError``.

    Returns:
        str: Extracted text from the image.
    """
    output_suffix = ".txt"
    with NamedTemporaryFile(suffix=output_suffix) as output_file:
        # Tesseract appends ".txt" to the output base name itself, so hand it the
        # temporary path without the suffix. Slice off only the trailing suffix:
        # a blanket str.replace would also mangle a ".txt" occurring earlier in
        # the path (for example in the temp directory name).
        output_base = output_file.name[: -len(output_suffix)]
        try:
            command = [
                "tesseract",
                str(input_file),
                output_base,
                "-l",
                language,
                "--psm",
                str(psm.value),
            ]

            for key, value in kwargs.items():
                command.extend(["-c", f"{key}={value}"])

            result = await run_sync(
                subprocess.run,
                command,
                capture_output=True,
            )

            if result.returncode != 0:
                # Keep Tesseract's own diagnostics so the failure can be investigated.
                raise OCRError(
                    "OCR failed with a non-0 return code.",
                    context={
                        "return_code": result.returncode,
                        "stderr": result.stderr.decode("utf-8", errors="replace"),
                    },
                )

            # Tesseract writes its text output as UTF-8; do not depend on the
            # locale's default encoding when reading it back.
            output = await AsyncPath(output_file.name).read_text("utf-8")
            return output.strip()
        except (RuntimeError, OSError) as e:
            raise OCRError("Failed to OCR using tesseract") from e
|
246
|
+
|
247
|
+
|
248
|
+
async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
    """Run Tesseract OCR on an in-memory Pillow image.

    The image is saved as a PNG into a temporary file, which is then handed to
    ``process_file``.

    Args:
        image: The Pillow Image to process.
        language: The language code for OCR.
        psm: Page segmentation mode.
        **kwargs: Additional Tesseract configuration options as key-value pairs.

    Returns:
        str: Extracted text from the image.
    """
    with NamedTemporaryFile(suffix=".png") as png_file:
        png_path = png_file.name
        # Saving is blocking work, so it is dispatched through run_sync.
        await run_sync(image.save, png_path, format="PNG")
        text = await process_file(png_path, language=language, psm=psm, **kwargs)
    return text
|
263
|
+
|
264
|
+
|
265
|
+
async def process_image_with_tesseract(
    image: Image | PathLike[str] | str,
    *,
    language: SupportedLanguages = "eng",
    psm: PSMMode = PSMMode.AUTO,
    **kwargs: Any,
) -> str:
    """Run Tesseract OCR asynchronously on one image.

    The Tesseract installation is validated first, then the input is routed to
    ``process_image`` (Pillow images) or ``process_file`` (paths).

    Args:
        image: A Pillow Image, or the path to an image file as a string or path-like object.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        **kwargs: Additional Tesseract configuration options as key-value pairs.

    Raises:
        ValueError: If the input is neither a Pillow Image, a string nor a path-like object.

    Returns:
        Extracted text as a string
    """
    await validate_tesseract_version()

    if isinstance(image, Image):
        handler = process_image
    elif isinstance(image, (PathLike, str)):
        handler = process_file
    else:
        raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")

    return await handler(image, language=language, psm=psm, **kwargs)
|
295
|
+
|
296
|
+
|
297
|
+
async def batch_process_images(
    images: list[T],
    *,
    language: SupportedLanguages = "eng",
    psm: PSMMode = PSMMode.AUTO,
    **kwargs: Any,
) -> list[str]:
    """Run Tesseract OCR concurrently on a list of images.

    Args:
        images: A list of Pillow Images, paths or strings to process.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        **kwargs: Additional Tesseract configuration options as key-value pairs.

    Returns:
        The extracted texts, one string per input image, as returned by ``gather``.
    """
    # Validate once before the per-image calls are started.
    await validate_tesseract_version()

    pending = [process_image_with_tesseract(image, language=language, psm=psm, **kwargs) for image in images]
    return await gather(*pending)
|
kreuzberg/exceptions.py
CHANGED
@@ -10,7 +10,7 @@ class KreuzbergError(Exception):
|
|
10
10
|
context: Any
|
11
11
|
"""The context of the error."""
|
12
12
|
|
13
|
-
def __init__(self, message: str, context: Any = None) -> None:
|
13
|
+
def __init__(self, message: str, *, context: Any = None) -> None:
|
14
14
|
self.context = context
|
15
15
|
super().__init__(message)
|
16
16
|
|
@@ -27,3 +27,11 @@ class ParsingError(KreuzbergError):
|
|
27
27
|
|
28
28
|
class ValidationError(KreuzbergError):
|
29
29
|
"""Raised when a validation error occurs."""
|
30
|
+
|
31
|
+
|
32
|
+
class MissingDependencyError(KreuzbergError):
    """Signal that a required dependency is unavailable or unsuitable."""
|
34
|
+
|
35
|
+
|
36
|
+
class OCRError(KreuzbergError):
    """Signal that optical character recognition failed."""
|
kreuzberg/extraction.py
CHANGED
@@ -17,12 +17,11 @@ from typing import NamedTuple
|
|
17
17
|
from anyio import Path as AsyncPath
|
18
18
|
|
19
19
|
from kreuzberg._extractors import (
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
_extract_pptx_file,
|
20
|
+
extract_content_with_pandoc,
|
21
|
+
extract_file_with_pandoc,
|
22
|
+
extract_html_string,
|
23
|
+
extract_pdf_file,
|
24
|
+
extract_pptx_file,
|
26
25
|
)
|
27
26
|
from kreuzberg._mime_types import (
|
28
27
|
HTML_MIME_TYPE,
|
@@ -36,6 +35,7 @@ from kreuzberg._mime_types import (
|
|
36
35
|
SUPPORTED_MIME_TYPES,
|
37
36
|
)
|
38
37
|
from kreuzberg._string import safe_decode
|
38
|
+
from kreuzberg._tesseract import process_image_with_tesseract
|
39
39
|
from kreuzberg.exceptions import ValidationError
|
40
40
|
|
41
41
|
|
@@ -72,28 +72,28 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
72
72
|
with NamedTemporaryFile(suffix=".pdf") as temp_file:
|
73
73
|
temp_file.write(content)
|
74
74
|
return ExtractionResult(
|
75
|
-
content=await
|
75
|
+
content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
76
76
|
)
|
77
77
|
|
78
78
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
79
79
|
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
|
80
80
|
temp_file.write(content)
|
81
81
|
return ExtractionResult(
|
82
|
-
content=await
|
82
|
+
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
|
83
83
|
)
|
84
84
|
|
85
85
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
86
86
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
87
87
|
):
|
88
88
|
return ExtractionResult(
|
89
|
-
content=await
|
89
|
+
content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
90
90
|
)
|
91
91
|
|
92
92
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
93
|
-
return ExtractionResult(content=await
|
93
|
+
return ExtractionResult(content=await extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
94
94
|
|
95
95
|
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
96
|
-
return ExtractionResult(content=await
|
96
|
+
return ExtractionResult(content=await extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
|
97
97
|
|
98
98
|
return ExtractionResult(
|
99
99
|
content=safe_decode(content),
|
@@ -132,22 +132,22 @@ async def extract_file(
|
|
132
132
|
raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
|
133
133
|
|
134
134
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
135
|
-
return ExtractionResult(content=await
|
135
|
+
return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
136
136
|
|
137
137
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
138
|
-
return ExtractionResult(content=await
|
138
|
+
return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
139
139
|
|
140
140
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
141
141
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
142
142
|
):
|
143
143
|
return ExtractionResult(
|
144
|
-
content=await
|
144
|
+
content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
145
145
|
)
|
146
146
|
|
147
147
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
148
|
-
return ExtractionResult(content=await
|
148
|
+
return ExtractionResult(content=await extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
149
149
|
|
150
150
|
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
151
|
-
return ExtractionResult(content=await
|
151
|
+
return ExtractionResult(content=await extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
152
152
|
|
153
153
|
return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
|