kreuzberg 3.0.0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +163 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +291 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/_utils/_string.py +39 -0
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/_utils/_tmp.py +37 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.0.1.dist-info}/METADATA +2 -2
- kreuzberg-3.0.1.dist-info/RECORD +32 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
- kreuzberg-3.0.0.dist-info/RECORD +0 -15
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,342 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
import sys
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from enum import Enum
|
7
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
8
|
+
|
9
|
+
from anyio import Path as AsyncPath
|
10
|
+
from anyio import run_process
|
11
|
+
|
12
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
13
|
+
from kreuzberg._ocr._base import OCRBackend
|
14
|
+
from kreuzberg._types import ExtractionResult
|
15
|
+
from kreuzberg._utils._string import normalize_spaces
|
16
|
+
from kreuzberg._utils._sync import run_sync
|
17
|
+
from kreuzberg._utils._tmp import create_temp_file
|
18
|
+
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
19
|
+
|
20
|
+
if TYPE_CHECKING:
|
21
|
+
from pathlib import Path
|
22
|
+
|
23
|
+
from PIL.Image import Image
|
24
|
+
|
25
|
+
try: # pragma: no cover
|
26
|
+
from typing import Unpack # type: ignore[attr-defined]
|
27
|
+
except ImportError: # pragma: no cover
|
28
|
+
from typing_extensions import Unpack
|
29
|
+
|
30
|
+
|
31
|
+
# Language codes accepted by the backend's language validation; entries are
# grouped by initial letter purely for readability.
TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
    "afr", "amh", "ara", "asm", "aze", "aze_cyrl",
    "bel", "ben", "bod", "bos", "bre", "bul",
    "cat", "ceb", "ces", "chi_sim", "chi_tra", "chr", "cos", "cym",
    "dan", "dan_frak", "deu", "deu_frak", "deu_latf", "dzo",
    "ell", "eng", "enm", "epo", "equ", "est", "eus",
    "fao", "fas", "fil", "fin", "fra", "frk", "frm", "fry",
    "gla", "gle", "glg", "grc", "guj",
    "hat", "heb", "hin", "hrv", "hun", "hye",
    "iku", "ind", "isl", "ita", "ita_old",
    "jav", "jpn",
    "kan", "kat", "kat_old", "kaz", "khm", "kir", "kmr", "kor", "kor_vert", "kur",
    "lao", "lat", "lav", "lit", "ltz",
    "mal", "mar", "mkd", "mlt", "mon", "mri", "msa", "mya",
    "nep", "nld", "nor",
    "oci", "ori", "osd",
    "pan", "pol", "por", "pus",
    "que",
    "ron", "rus",
    "san", "sin", "slk", "slk_frak", "slv", "snd", "spa", "spa_old",
    "sqi", "srp", "srp_latn", "sun", "swa", "swe", "syr",
    "tam", "tat", "tel", "tgk", "tgl",
    "tha",  # codespell:ignore
    "tir", "ton", "tur",
    "uig", "ukr", "urd", "uzb", "uzb_cyrl",
    "vie",  # codespell:ignore
    "yid", "yor",
}

# Lowest Tesseract major version the backend's version check accepts.
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
162
|
+
|
163
|
+
|
164
|
+
class PSMMode(Enum):
    """Tesseract page segmentation modes (PSM), keyed by readable names.

    Each member's value is the integer passed to the ``--psm`` command-line flag.
    """

    OSD_ONLY = 0
    """Only detect orientation and script; no text recognition layout."""
    AUTO_OSD = 1
    """Automatic page segmentation combined with orientation/script detection."""
    AUTO_ONLY = 2
    """Automatic page segmentation, skipping orientation/script detection."""
    AUTO = 3
    """Fully automatic page segmentation (the default mode)."""
    SINGLE_COLUMN = 4
    """Treat the page as one column of text."""
    SINGLE_BLOCK_VERTICAL = 5
    """Treat the page as one uniform block of vertically aligned text."""
    SINGLE_BLOCK = 6
    """Treat the page as one uniform block of text."""
    SINGLE_LINE = 7
    """Treat the image as exactly one line of text."""
    SINGLE_WORD = 8
    """Treat the image as exactly one word."""
    CIRCLE_WORD = 9
    """Treat the image as one word laid out in a circle."""
    SINGLE_CHAR = 10
    """Treat the image as exactly one character."""
|
189
|
+
|
190
|
+
|
191
|
+
@dataclass(unsafe_hash=True, frozen=True)
class TesseractConfig:
    """Immutable, hashable set of options forwarded to the Tesseract OCR engine."""

    classify_use_pre_adapted_templates: bool = True
    """Use pre-adapted templates during classification."""
    language: str = "eng"
    """Language code(s) Tesseract should use.

    Examples:
            -   'eng' for English
            -   'deu' for German
            -   several languages joined with '+', e.g. 'eng+deu'
    """
    language_model_ngram_on: bool = True
    """Turn the n-gram based language model on or off."""
    psm: PSMMode = PSMMode.AUTO
    """Page segmentation mode telling Tesseract how to split the image (single block, single line, ...)."""
    tessedit_dont_blkrej_good_wds: bool = True
    """When True, words judged good are not discarded by block-level rejection."""
    tessedit_dont_rowrej_good_wds: bool = True
    """When True, words judged good are not discarded by row-level rejection."""
    tessedit_enable_dict_correction: bool = True
    """Turn dictionary-based correction of recognized words on or off."""
    tessedit_use_primary_params_model: bool = True
    """When True, the primary parameters model is used for recognition."""
    textord_space_size_is_variable: bool = True
    """Permit variable spacing between words (helps with irregularly spaced text)."""
    thresholding_method: bool = False
    """Toggle for the thresholding step applied during image preprocessing."""
|
220
|
+
|
221
|
+
|
222
|
+
class TesseractBackend(OCRBackend[TesseractConfig]):
    """OCR backend that drives the ``tesseract`` command-line executable."""

    # Flipped to True after the installed binary passes the version check so
    # the ``tesseract --version`` probe is not repeated on every call.
    _version_checked: ClassVar[bool] = False

    async def process_image(
        self,
        image: Image,
        **kwargs: Unpack[TesseractConfig],
    ) -> ExtractionResult:
        """Run OCR on an in-memory image.

        The image is written to a temporary PNG file which is then handed to
        :meth:`process_file`; the temporary file is always removed afterwards.

        Args:
            image: The image to recognize.
            **kwargs: Tesseract configuration options, forwarded unchanged.

        Returns:
            The extraction result produced by :meth:`process_file`.
        """
        await self._validate_tesseract_version()
        image_path, unlink = await create_temp_file(".png")
        try:
            # Saving happens inside the try block so a failed save still cleans up the temp file.
            await run_sync(image.save, str(image_path), format="PNG")
            return await self.process_file(image_path, **kwargs)
        finally:
            await unlink()

    async def process_file(
        self,
        path: Path,
        **kwargs: Unpack[TesseractConfig],
    ) -> ExtractionResult:
        """Run OCR on an image file by invoking the ``tesseract`` executable.

        Args:
            path: Path of the image file to recognize.
            **kwargs: Tesseract configuration options. ``language`` and ``psm`` map to
                the ``-l`` and ``--psm`` flags; every remaining option is passed as a
                ``-c name=0|1`` config variable.

        Raises:
            ValidationError: If the language code is not supported.
            OCRError: If the process exits with a non-zero code, or a runtime/OS error occurs.

        Returns:
            The recognized text (whitespace-normalized) as a plain-text extraction result.
        """
        import os  # local import: only needed to inherit the parent environment below

        await self._validate_tesseract_version()
        # Validate the options before creating the temp file so a rejected
        # language code cannot leave an orphaned file behind.
        language = self._validate_language_code(kwargs.pop("language", "eng"))
        psm = kwargs.pop("psm", PSMMode.AUTO)
        output_path, unlink = await create_temp_file(".txt")
        try:
            # Only the trailing suffix is removed; a ".txt" elsewhere in the path stays intact.
            output_base = str(output_path.with_suffix(""))
            command = [
                "tesseract",
                str(path),
                output_base,
                "-l",
                language,
                "--psm",
                str(psm.value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]
            for option, enabled in kwargs.items():
                command.extend(["-c", f"{option}={1 if enabled else 0}"])

            env: dict[str, Any] | None = None
            if sys.platform.startswith("linux"):
                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
                # The parent environment is merged in because ``env`` replaces the child's
                # environment wholesale (PATH, TESSDATA_PREFIX, ... would otherwise be lost).
                env = {**os.environ, "OMP_THREAD_LIMIT": "1"}

            # check=False: inspect the return code ourselves so failures become OCRError.
            result = await run_process(command, env=env, check=False)

            if result.returncode != 0:
                raise OCRError(
                    "OCR failed with a non-0 return code.",
                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
                )

            output = await AsyncPath(output_path).read_text("utf-8")
            return ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            await unlink()

    @classmethod
    async def _validate_tesseract_version(cls) -> None:
        """Validate that Tesseract is installed and is version 5 or above.

        The check runs ``tesseract --version`` once; a successful result is cached on the class.

        Raises:
            MissingDependencyError: If Tesseract is not installed or is below version 5.
        """
        if cls._version_checked:
            return

        try:
            result = await run_process(["tesseract", "--version"])
        except FileNotFoundError as e:
            raise MissingDependencyError(
                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
            ) from e

        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
            raise MissingDependencyError(
                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
            )

        cls._version_checked = True

    @staticmethod
    def _validate_language_code(language_code: str) -> str:
        """Validate a Tesseract language code and return it lower-cased.

        Args:
            language_code: Tesseract supported language code or multiple language codes connected with '+'

        Raises:
            ValidationError: If the language is not supported by Tesseract

        Returns:
            Language code compatible with Tesseract
        """
        normalized = language_code.lower()
        # A single code is simply a one-element "+"-separated list, so both
        # forms share the same check.
        unsupported = [lang for lang in normalized.split("+") if lang not in TESSERACT_SUPPORTED_LANGUAGE_CODES]
        if not unsupported:
            return normalized

        raise ValidationError(
            "The provided language code is not supported by Tesseract",
            context={
                "language_code": ",".join(unsupported),
                "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
            },
        )
|
File without changes
|
@@ -0,0 +1,39 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from contextlib import suppress
|
4
|
+
|
5
|
+
from charset_normalizer import detect
|
6
|
+
|
7
|
+
|
8
|
+
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string without ever raising a decoding error.

    Candidates are tried in order: the caller-supplied encoding, the encoding
    detected from the data, then UTF-8. If none of them succeeds the bytes are
    decoded as latin-1 with ``errors="replace"``.

    Args:
        byte_data: The byte string to decode.
        encoding: The encoding to try first, if any.

    Returns:
        The decoded string; an empty string for empty input.
    """
    if not byte_data:
        return ""

    if encoding:
        # Try the explicit encoding first so charset detection is skipped
        # entirely whenever the caller's hint is correct.
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(encoding)

    for candidate in (detect(byte_data).get("encoding", ""), "utf-8"):  # pragma: no cover
        if not candidate:
            continue
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(candidate)

    return byte_data.decode("latin-1", errors="replace")
|
28
|
+
|
29
|
+
|
30
|
+
def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is dropped as well, because splitting on
    whitespace never yields empty edge tokens.

    Args:
        text: The text to sanitize.

    Returns:
        The text with its words separated by exactly one space.
    """
    words = text.split()
    return " ".join(words)
|
@@ -0,0 +1,121 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import sys
|
4
|
+
from functools import partial
|
5
|
+
from inspect import isawaitable, iscoroutinefunction
|
6
|
+
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
7
|
+
|
8
|
+
import anyio
|
9
|
+
from anyio import create_task_group
|
10
|
+
from anyio.to_thread import run_sync as any_io_run_sync
|
11
|
+
|
12
|
+
if TYPE_CHECKING: # pragma: no cover
|
13
|
+
from collections.abc import Awaitable, Callable
|
14
|
+
|
15
|
+
if sys.version_info >= (3, 10):
|
16
|
+
from typing import ParamSpec
|
17
|
+
else: # pragma: no cover
|
18
|
+
from typing_extensions import ParamSpec
|
19
|
+
|
20
|
+
T = TypeVar("T")
|
21
|
+
P = ParamSpec("P")
|
22
|
+
|
23
|
+
|
24
|
+
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Await a blocking callable through ``anyio.to_thread.run_sync``.

    Keyword arguments are bound onto the callable with ``partial``; positional
    arguments are forwarded as-is. The call is made with ``abandon_on_cancel=True``.

    Args:
        sync_fn: The synchronous function to run.
        *args: Positional arguments for the function.
        **kwargs: Keyword arguments for the function.

    Returns:
        Whatever the synchronous function returns.
    """
    bound_call = partial(sync_fn, **kwargs)
    outcome = await any_io_run_sync(bound_call, *args, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast("T", outcome)
|
37
|
+
|
38
|
+
|
39
|
+
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside one anyio task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The awaited values, positioned in the same order as the arguments.
    """
    # Pre-sized so each task can write into its own slot regardless of finish order.
    collected: list[Any] = [None] * len(async_tasks)

    async def run_task(slot: int, awaitable: Awaitable[T]) -> None:
        collected[slot] = await awaitable

    async with create_task_group() as group:
        for slot, awaitable in enumerate(async_tasks):
            group.start_soon(run_task, slot, awaitable)

    return collected
|
58
|
+
|
59
|
+
|
60
|
+
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run a list of coroutines concurrently in batches.

    Batches are processed one after another; the awaitables within a batch are
    run together via ``run_taskgroup``.

    Args:
        *async_tasks: The list of coroutines to run.
        batch_size: The size of each batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Returns:
        The results of the coroutines, in argument order.
    """
    # A negative size would make the range below empty and silently drop every
    # task without awaiting it; reject it up front (zero already raised ValueError).
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
|
77
|
+
|
78
|
+
|
79
|
+
async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Call ``fn`` and resolve its result whether or not it is awaitable.

    ``fn`` is invoked with the given positional and keyword arguments. An
    awaitable result is awaited and the awaited value returned; any other
    result is returned unchanged.

    Args:
        fn: The callable to execute; it may produce a plain value or an awaitable.
        *args: Positional arguments to pass to ``fn``.
        **kwargs: Keyword arguments to pass to ``fn``.

    Returns:
        The plain result of ``fn``, or the awaited value when the result is awaitable.
    """
    outcome = fn(*args, **kwargs)
    if not isawaitable(outcome):
        return outcome
    return cast("T", await outcome)
|
103
|
+
|
104
|
+
|
105
|
+
def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Execute ``fn`` from synchronous code, whether it is sync or async.

    A coroutine function is driven to completion with ``anyio.run`` (keyword
    arguments are bound with ``partial``, positional ones are forwarded).
    Any other callable is simply called directly.

    Args:
        fn: The function to execute, synchronous or asynchronous.
        *args: Positional arguments for the function.
        **kwargs: Keyword arguments for the function.

    Returns:
        T: The function's return value, resolved if the function is asynchronous.
    """
    if iscoroutinefunction(fn):
        return cast("T", anyio.run(partial(fn, **kwargs), *args))
    return cast("T", fn(*args, **kwargs))
|
kreuzberg/_utils/_tmp.py
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from contextlib import suppress
|
4
|
+
from pathlib import Path
|
5
|
+
from tempfile import NamedTemporaryFile
|
6
|
+
from typing import TYPE_CHECKING, Callable
|
7
|
+
|
8
|
+
from anyio import Path as AsyncPath
|
9
|
+
|
10
|
+
from kreuzberg._utils._sync import run_sync
|
11
|
+
|
12
|
+
if TYPE_CHECKING: # pragma: no cover
|
13
|
+
from collections.abc import Coroutine
|
14
|
+
|
15
|
+
|
16
|
+
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed, persistent temporary file and a coroutine that deletes it.

    The file is created with ``delete=False``, so it stays on disk until the
    returned ``unlink`` coroutine function is awaited.

    Args:
        extension: Suffix (file extension) for the temporary file name.
        content: Optional bytes to write into the file; skipped when empty or None.

    Returns:
        A pair of the temporary file's path and an async callable removing it.
    """
    handle = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    temp_path = Path(handle.name)
    if content:
        await AsyncPath(temp_path).write_bytes(content)
    await run_sync(handle.close)

    async def unlink() -> None:
        # Removal is best effort: OS-level failures are deliberately ignored.
        with suppress(OSError, PermissionError):
            await AsyncPath(temp_path).unlink(missing_ok=True)

    return temp_path, unlink
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.0.
|
3
|
+
Version: 3.0.1
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -151,7 +151,7 @@ Kreuzberg supports multiple OCR engines:
|
|
151
151
|
- **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
|
152
152
|
- **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
|
153
153
|
|
154
|
-
For comparison and selection guidance, see the [OCR Backends](https://
|
154
|
+
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
155
155
|
|
156
156
|
## Contribution
|
157
157
|
|
@@ -0,0 +1,32 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
|
2
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
3
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
4
|
+
kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
|
5
|
+
kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
|
6
|
+
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
7
|
+
kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
|
8
|
+
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
9
|
+
kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
|
10
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
13
|
+
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
14
|
+
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
15
|
+
kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
|
16
|
+
kreuzberg/_extractors/_pdf.py,sha256=dcSAXyqH8SZ-z45OUAjjwdboSEbrli0YekS8PxCaVGA,6384
|
17
|
+
kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
|
18
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
19
|
+
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
20
|
+
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
21
|
+
kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
|
22
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=X5es69QMl0P6DZuuRNKWHaRtLi1OJqFs-mWHR_gVKvY,10837
|
23
|
+
kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
|
24
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
+
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
26
|
+
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
27
|
+
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
28
|
+
kreuzberg-3.0.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
29
|
+
kreuzberg-3.0.1.dist-info/METADATA,sha256=5Kt0w9rFBAina8SzbO-m2umEMRJQL-4mcPGAQASko_k,6545
|
30
|
+
kreuzberg-3.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
31
|
+
kreuzberg-3.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
32
|
+
kreuzberg-3.0.1.dist-info/RECORD,,
|
kreuzberg-3.0.0.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
|
2
|
-
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
3
|
-
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
4
|
-
kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
|
5
|
-
kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
|
6
|
-
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
7
|
-
kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
|
8
|
-
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
9
|
-
kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
|
10
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
kreuzberg-3.0.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
12
|
-
kreuzberg-3.0.0.dist-info/METADATA,sha256=wlO9VCvZQy_gJJTmhGzH9j8BlPQPFQdmMZQxJOcQAUg,6515
|
13
|
-
kreuzberg-3.0.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
14
|
-
kreuzberg-3.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
15
|
-
kreuzberg-3.0.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|