kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +154 -167
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +101 -64
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.0.dist-info/METADATA +419 -0
- kreuzberg-2.0.0.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -247
- kreuzberg-1.6.0.dist-info/METADATA +0 -317
- kreuzberg-1.6.0.dist-info/RECORD +0 -15
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/_pdf.py
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from re import Pattern
|
4
|
+
from re import compile as compile_regex
|
5
|
+
from typing import TYPE_CHECKING, Final, cast
|
6
|
+
|
7
|
+
import pypdfium2
|
8
|
+
from anyio import Path as AsyncPath
|
9
|
+
|
10
|
+
from kreuzberg import ExtractionResult
|
11
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
|
+
from kreuzberg._string import normalize_spaces
|
13
|
+
from kreuzberg._sync import run_sync
|
14
|
+
from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
|
15
|
+
from kreuzberg.exceptions import ParsingError
|
16
|
+
|
17
|
+
if TYPE_CHECKING: # pragma: no cover
|
18
|
+
from pathlib import Path
|
19
|
+
|
20
|
+
from PIL.Image import Image
|
21
|
+
|
22
|
+
|
23
|
+
# Pattern to detect common PDF text extraction corruption:
|
24
|
+
# - Control and non-printable characters
|
25
|
+
# - Unicode replacement and invalid characters
|
26
|
+
# - Zero-width spaces and other invisible characters
|
27
|
+
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
|
28
|
+
r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
|
29
|
+
)
|
30
|
+
|
31
|
+
|
32
|
+
def _validate_extracted_text(text: str) -> bool:
|
33
|
+
"""Check if text extracted from PDF is valid or corrupted.
|
34
|
+
|
35
|
+
This checks for common indicators of corrupted PDF text extraction:
|
36
|
+
1. Empty or whitespace-only text
|
37
|
+
2. Control characters and other non-printable characters
|
38
|
+
3. Unicode replacement characters
|
39
|
+
4. Zero-width spaces and other invisible characters
|
40
|
+
|
41
|
+
Args:
|
42
|
+
text: The extracted text to validate
|
43
|
+
|
44
|
+
Returns:
|
45
|
+
True if the text appears valid, False if it seems corrupted
|
46
|
+
"""
|
47
|
+
# Check for empty or whitespace-only text
|
48
|
+
if not text or not text.strip():
|
49
|
+
return False
|
50
|
+
|
51
|
+
# Check for corruption indicators
|
52
|
+
return not bool(CORRUPTED_PATTERN.search(text))
|
53
|
+
|
54
|
+
|
55
|
+
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
    """Render every page of a PDF file to a Pillow image.

    The document is opened and closed through ``run_sync``; each page is
    rendered with ``scale=2.0`` and converted to a Pillow image.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the PDF file could not be converted to images.

    Returns:
        A list of Pillow Images, one per page in document order.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Compare against None explicitly: PdfDocument defines __len__, so a
        # document with zero pages is falsy and a plain truthiness check would
        # skip closing it.
        if document is not None:
            await run_sync(document.close)
|
78
|
+
|
79
|
+
|
80
|
+
async def _extract_pdf_text_with_ocr(
    input_file: Path,
    *,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file by running OCR on its rendered pages.

    The PDF is first converted to images, the images are handed to
    ``batch_process_images``, and the per-image results are joined with
    newlines.

    Args:
        input_file: The path to the PDF file.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes (required, forwarded to the OCR batch call).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        An ExtractionResult with the joined OCR text, the plain-text mime type and empty metadata.
    """
    page_images = await _convert_pdf_to_images(input_file)
    page_results = await batch_process_images(
        page_images, max_processes=max_processes, psm=psm, language=language
    )
    combined_text = "\n".join(page_result.content for page_result in page_results)
    return ExtractionResult(content=combined_text, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
103
|
+
|
104
|
+
|
105
|
+
async def _extract_pdf_searchable_text(input_file: Path) -> str:
    """Extract text from a searchable PDF file using pypdfium2.

    The text of each page is read from its text page, the pages are joined
    with newlines, and the result is passed through ``normalize_spaces``.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the text could not be extracted from the PDF file.

    Returns:
        The extracted text.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        text = "\n".join(page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document))
        return normalize_spaces(text)
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Compare against None explicitly: PdfDocument defines __len__, so a
        # document with zero pages is falsy and a plain truthiness check would
        # skip closing it.
        if document is not None:
            await run_sync(document.close)
|
129
|
+
|
130
|
+
|
131
|
+
async def extract_pdf_file(
    input_file: Path,
    *,
    force_ocr: bool,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file, preferring its text layer over OCR.

    Unless ``force_ocr`` is set, the searchable text is extracted first and
    returned when it is non-empty and passes ``_validate_extracted_text``.
    In every other case the file is processed with OCR.

    Args:
        input_file: The path to the PDF file.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes (required, forwarded to the OCR step).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        An ExtractionResult with plain-text mime type and empty metadata.
    """
    if not force_ocr:
        searchable_text = await _extract_pdf_searchable_text(input_file)
        if searchable_text and _validate_extracted_text(searchable_text):
            return ExtractionResult(content=searchable_text, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

    # Either OCR was requested explicitly or the text layer was empty/corrupted.
    return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
|
159
|
+
|
160
|
+
|
161
|
+
async def extract_pdf_content(
    content: bytes,
    *,
    force_ocr: bool,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from PDF content held in memory.

    The bytes are written to a temporary ``.pdf`` file, which is then processed
    by ``extract_pdf_file``. The temporary file is removed afterwards, also
    when writing or extraction raises.

    Args:
        content: The PDF file content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes (required, forwarded to extract_pdf_file).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The ExtractionResult produced by ``extract_pdf_file``.
    """
    from kreuzberg._tmp import create_temp_file

    file_path, unlink = await create_temp_file(".pdf")
    try:
        await AsyncPath(file_path).write_bytes(content)
        return await extract_pdf_file(
            file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
        )
    finally:
        # Always drop the temporary file so a failed extraction does not leak it.
        await unlink()
|
kreuzberg/_pptx.py
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from contextlib import suppress
|
5
|
+
from html import escape
|
6
|
+
from io import BytesIO
|
7
|
+
from typing import TYPE_CHECKING
|
8
|
+
|
9
|
+
import pptx
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
12
|
+
|
13
|
+
from kreuzberg import ExtractionResult
|
14
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
15
|
+
from kreuzberg._string import normalize_spaces
|
16
|
+
|
17
|
+
if TYPE_CHECKING: # pragma: no cover
|
18
|
+
from pathlib import Path
|
19
|
+
|
20
|
+
|
21
|
+
async def extract_pptx_file_content(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Extract the content of a PPTX file as Markdown.

    Each slide is introduced by an HTML comment carrying its 1-based number.
    Pictures become Markdown image references, tables become HTML tables (first
    row as header cells), and text frames become plain text, with the slide
    title rendered as a level-1 heading. Slide notes, when present, are
    appended under a "### Notes:" heading.

    Notes:
        This function is based on code vendored from `markitdown`, which has an MIT license as well.

    Args:
        file_path_or_contents: The path to the PPTX file or its contents as bytes.

    Returns:
        An ExtractionResult with the normalized Markdown content, the Markdown mime type and empty metadata.
    """
    md_content = ""
    file_contents = (
        file_path_or_contents
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_bytes()
    )
    presentation = pptx.Presentation(BytesIO(file_contents))

    for slide_number, slide in enumerate(presentation.slides, start=1):
        md_content += f"\n\n<!-- Slide number: {slide_number} -->\n"

        title = slide.shapes.title

        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
            ):
                alt_text = ""
                with suppress(AttributeError):
                    # access non-visual properties
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                # Emit the picture as a Markdown image reference; the alt text
                # falls back to the shape name when no description is set.
                # Without this the computed alt text and file name were unused.
                md_content += f"\n![{alt_text or shape.name}]({filename})\n"

            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                html_rows = []
                for row_index, row in enumerate(shape.table.rows):
                    # Only the first row is rendered with header cells.
                    tag = "th" if row_index == 0 else "td"
                    cells = "".join(f"<{tag}>{escape(cell.text)}</{tag}>" for cell in row.cells)
                    html_rows.append(f"<tr>{cells}</tr>")

                md_content += "\n<table>" + "".join(html_rows) + "</table>\n"

            elif shape.has_text_frame:
                md_content += ("# " + shape.text.lstrip() if shape == title else shape.text) + "\n"

        md_content = md_content.strip()
        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame

            if notes_frame is not None:  # pragma: no branch
                md_content += notes_frame.text

            md_content = md_content.strip()

    return ExtractionResult(content=normalize_spaces(md_content), mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
kreuzberg/_string.py
CHANGED
@@ -4,8 +4,6 @@ from contextlib import suppress
|
|
4
4
|
|
5
5
|
from charset_normalizer import detect
|
6
6
|
|
7
|
-
from kreuzberg.exceptions import ParsingError
|
8
|
-
|
9
7
|
|
10
8
|
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
11
9
|
"""Decode a byte string safely, removing invalid sequences.
|
@@ -14,22 +12,21 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
14
12
|
byte_data: The byte string to decode.
|
15
13
|
encoding: The encoding to use when decoding the byte string.
|
16
14
|
|
17
|
-
Raises:
|
18
|
-
ParsingError: If the byte string could not be decoded.
|
19
|
-
|
20
15
|
Returns:
|
21
16
|
The decoded string.
|
22
17
|
"""
|
23
18
|
if not byte_data:
|
24
19
|
return ""
|
25
20
|
|
26
|
-
|
21
|
+
# We try each encoding in order until one works
|
22
|
+
encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
|
27
23
|
|
28
|
-
for enc in [e for e in encodings if e]:
|
24
|
+
for enc in [e for e in encodings if e]: # pragma: no cover
|
29
25
|
with suppress(UnicodeDecodeError):
|
30
26
|
return byte_data.decode(enc)
|
31
27
|
|
32
|
-
|
28
|
+
# If all encodings fail, fall back to latin-1 which can handle any byte
|
29
|
+
return byte_data.decode("latin-1", errors="replace")
|
33
30
|
|
34
31
|
|
35
32
|
def normalize_spaces(text: str) -> str:
|
kreuzberg/_sync.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import sys
|
3
4
|
from functools import partial
|
4
5
|
from typing import TYPE_CHECKING, TypeVar, cast
|
5
6
|
|
6
7
|
from anyio.to_thread import run_sync as any_io_run_sync
|
7
|
-
from typing_extensions import ParamSpec
|
8
8
|
|
9
9
|
if TYPE_CHECKING: # pragma: no cover
|
10
10
|
from collections.abc import Callable
|
11
11
|
|
12
|
+
if sys.version_info >= (3, 10):
|
13
|
+
from typing import ParamSpec
|
14
|
+
else: # pragma: no cover
|
15
|
+
from typing_extensions import ParamSpec
|
16
|
+
|
12
17
|
T = TypeVar("T")
|
13
18
|
P = ParamSpec("P")
|
14
19
|
|
kreuzberg/_tesseract.py
CHANGED
@@ -2,23 +2,34 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import re
|
4
4
|
import subprocess
|
5
|
-
|
5
|
+
import sys
|
6
6
|
from enum import Enum
|
7
|
+
from functools import partial
|
7
8
|
from os import PathLike
|
8
|
-
from
|
9
|
-
from typing import Any, Literal, TypeVar, Union
|
9
|
+
from typing import Final, Literal, TypeVar, Union, cast
|
10
10
|
|
11
|
+
from anyio import CapacityLimiter, create_task_group, to_process
|
11
12
|
from anyio import Path as AsyncPath
|
12
13
|
from PIL.Image import Image
|
13
14
|
|
15
|
+
from kreuzberg import ExtractionResult, ParsingError
|
16
|
+
from kreuzberg._constants import DEFAULT_MAX_PROCESSES
|
17
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
18
|
+
from kreuzberg._string import normalize_spaces
|
14
19
|
from kreuzberg._sync import run_sync
|
20
|
+
from kreuzberg._tmp import create_temp_file
|
15
21
|
from kreuzberg.exceptions import MissingDependencyError, OCRError
|
16
22
|
|
23
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
24
|
+
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
25
|
+
|
26
|
+
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
27
|
+
|
17
28
|
version_ref = {"checked": False}
|
18
29
|
|
19
30
|
T = TypeVar("T", bound=Union[Image, PathLike[str], str])
|
20
31
|
|
21
|
-
|
32
|
+
SupportedLanguage = Literal[
|
22
33
|
"afr",
|
23
34
|
"amh",
|
24
35
|
"ara",
|
@@ -186,9 +197,10 @@ async def validate_tesseract_version() -> None:
|
|
186
197
|
if version_ref["checked"]:
|
187
198
|
return
|
188
199
|
|
189
|
-
|
190
|
-
|
191
|
-
|
200
|
+
command = ["tesseract", "--version"]
|
201
|
+
result = await run_sync(subprocess.run, command, capture_output=True)
|
202
|
+
version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
|
203
|
+
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
|
192
204
|
raise MissingDependencyError("Tesseract version 5 or above is required.")
|
193
205
|
|
194
206
|
version_ref["checked"] = True
|
@@ -197,85 +209,96 @@ async def validate_tesseract_version() -> None:
|
|
197
209
|
|
198
210
|
|
199
211
|
async def process_file(
    input_file: str | PathLike[str],
    *,
    language: SupportedLanguage,
    psm: PSMMode,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> ExtractionResult:
    """Process a single image file using Tesseract OCR.

    The ``tesseract`` executable is run in a subprocess with a temporary output
    location; its text output is read back, normalized and returned. The
    temporary output file is removed in all cases.

    Args:
        input_file: The path to the image file to process.
        language: The language code for OCR.
        psm: Page segmentation mode.
        max_processes: Capacity of the limiter used for the subprocess call. Defaults to DEFAULT_MAX_PROCESSES.

    Raises:
        OCRError: If OCR fails to extract text from the image.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    output_path, unlink = await create_temp_file(".txt")
    try:
        # Tesseract is given the output location without the ".txt" extension.
        # Drop only the trailing suffix: a ".txt" elsewhere in the path (for
        # example in the temp directory name) must be left untouched, otherwise
        # the output would not end up at output_path.
        output_base = str(output_path.with_suffix(""))
        command = [
            "tesseract",
            str(input_file),
            output_base,
            "-l",
            language,
            "--psm",
            str(psm.value),
        ]

        # NOTE(review): a new CapacityLimiter is created on every call, so it
        # does not bound concurrency across separate calls - confirm whether a
        # shared limiter was intended.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            limiter=CapacityLimiter(max_processes),
            cancellable=True,
        )

        if result.returncode != 0:
            raise OCRError("OCR failed with a non-0 return code.")

        output = await AsyncPath(output_path).read_text("utf-8")
        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
    except (RuntimeError, OSError) as e:
        raise OCRError("Failed to OCR using tesseract") from e
    finally:
        await unlink()
|
261
|
+
|
262
|
+
|
263
|
+
async def process_image(
    image: Image,
    *,
    language: SupportedLanguage,
    psm: PSMMode,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> ExtractionResult:
    """Process a single Pillow Image using Tesseract OCR.

    The image is saved as PNG to a temporary file, which is then passed to
    ``process_file``. The temporary file is removed afterwards, also when
    saving or OCR raises.

    Args:
        image: The Pillow Image to process.
        language: The language code for OCR.
        psm: Page segmentation mode.
        max_processes: Forwarded to ``process_file``. Defaults to DEFAULT_MAX_PROCESSES.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    image_path, unlink = await create_temp_file(".png")
    try:
        await run_sync(image.save, str(image_path), format="PNG")
        return await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
    finally:
        # Always drop the temporary PNG so a failed OCR run does not leak it.
        await unlink()
|
263
286
|
|
264
287
|
|
265
288
|
async def process_image_with_tesseract(
|
266
289
|
image: Image | PathLike[str] | str,
|
267
290
|
*,
|
268
|
-
language:
|
291
|
+
language: SupportedLanguage = "eng",
|
269
292
|
psm: PSMMode = PSMMode.AUTO,
|
270
|
-
|
271
|
-
) ->
|
293
|
+
max_processes: int = DEFAULT_MAX_PROCESSES,
|
294
|
+
) -> ExtractionResult:
|
272
295
|
"""Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
|
273
296
|
|
274
297
|
Args:
|
275
298
|
image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
|
276
299
|
language: The language code for OCR (default: "eng").
|
277
300
|
psm: Page segmentation mode (default: PSMMode.AUTO).
|
278
|
-
|
301
|
+
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
279
302
|
|
280
303
|
Raises:
|
281
304
|
ValueError: If the input is not a Pillow Image or a list of Pillow Images.
|
@@ -286,10 +309,10 @@ async def process_image_with_tesseract(
|
|
286
309
|
await validate_tesseract_version()
|
287
310
|
|
288
311
|
if isinstance(image, Image):
|
289
|
-
return await process_image(image, language=language, psm=psm,
|
312
|
+
return await process_image(image, language=language, psm=psm, max_processes=max_processes)
|
290
313
|
|
291
314
|
if isinstance(image, (PathLike, str)):
|
292
|
-
return await process_file(image, language=language, psm=psm,
|
315
|
+
return await process_file(image, language=language, psm=psm, max_processes=max_processes)
|
293
316
|
|
294
317
|
raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
|
295
318
|
|
@@ -297,22 +320,36 @@ async def process_image_with_tesseract(
|
|
297
320
|
async def batch_process_images(
    images: list[T],
    *,
    language: SupportedLanguage = "eng",
    psm: PSMMode = PSMMode.AUTO,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> list[ExtractionResult]:
    """Run Tesseract OCR on several images inside one task group.

    One task is started per image; every task stores its result at the index
    of its input, so the output order matches the input order.

    Args:
        images: A list of Pillow Images, paths or strings to process.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        max_processes: Forwarded to each OCR call. Defaults to DEFAULT_MAX_PROCESSES.

    Raises:
        ParsingError: If OCR fails to extract text from any of the images.

    Returns:
        List of ExtractionResult objects, one per input image.
    """
    await validate_tesseract_version()

    # Pre-sized list of placeholders; each slot is overwritten by its task.
    ocr_results = cast(list[ExtractionResult], list(range(len(images))))

    async def _store_result(position: int, item: T) -> None:
        ocr_results[position] = await process_image_with_tesseract(
            item, language=language, psm=psm, max_processes=max_processes
        )

    try:
        async with create_task_group() as task_group:
            for position, item in enumerate(images):
                task_group.start_soon(_store_result, position, item)
    except ExceptionGroup as group:
        raise ParsingError("Failed to process images with Tesseract") from group

    return ocr_results
|
kreuzberg/_tmp.py
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from contextlib import suppress
|
4
|
+
from pathlib import Path
|
5
|
+
from tempfile import NamedTemporaryFile
|
6
|
+
from typing import TYPE_CHECKING, Callable
|
7
|
+
|
8
|
+
from anyio import Path as AsyncPath
|
9
|
+
|
10
|
+
from kreuzberg._sync import run_sync
|
11
|
+
|
12
|
+
if TYPE_CHECKING: # pragma: no cover
|
13
|
+
from collections.abc import Coroutine
|
14
|
+
|
15
|
+
|
16
|
+
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed temporary file and a coroutine function that removes it.

    The file is created with ``delete=False``, so it persists until the
    returned ``unlink`` callable is awaited. Errors raised while removing the
    file are suppressed.

    Args:
        extension: The file extension.
        content: The content to write to the file.

    Returns:
        A tuple of the temporary file path and an async callable that deletes the file.
    """
    file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    # Close our own handle first so only one handle is open at a time when the
    # optional content is written below.
    await run_sync(file.close)
    file_path = Path(file.name)

    async def unlink() -> None:
        # PermissionError is a subclass of OSError, so OSError covers both.
        with suppress(OSError):
            await AsyncPath(file_path).unlink(missing_ok=True)

    if content:
        try:
            await AsyncPath(file_path).write_bytes(content)
        except OSError:
            # The caller never receives `unlink` when this fails, so clean up
            # here instead of leaving the file behind.
            await unlink()
            raise

    return file_path, unlink
|