kreuzberg 2.1.2__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.0.dist-info/METADATA +178 -0
- kreuzberg-3.0.0.dist-info/RECORD +15 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_string.py +0 -41
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_tmp.py +0 -37
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/_string.py
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from contextlib import suppress
|
4
|
-
|
5
|
-
from charset_normalizer import detect
|
6
|
-
|
7
|
-
|
8
|
-
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string to text, falling back through several encodings.

    Candidates are tried in this order: the caller-supplied ``encoding``, the
    encoding guessed by ``detect``, then UTF-8. Detection is only run when the
    caller-supplied encoding is missing or fails, so the common "known
    encoding" path does not pay for a charset scan.

    Args:
        byte_data: The byte string to decode.
        encoding: The preferred encoding to try first, if any.

    Returns:
        The decoded string. An empty input yields an empty string.
    """
    if not byte_data:
        return ""

    if encoding:
        # Unknown codec names raise LookupError; both failures mean "try the next candidate".
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(encoding)

    for candidate in (detect(byte_data).get("encoding"), "utf-8"):
        if not candidate:
            # Detection may yield no encoding at all.
            continue
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(candidate)

    # latin-1 maps every byte to a code point, so this last resort cannot fail.
    return byte_data.decode("latin-1", errors="replace")
|
30
|
-
|
31
|
-
|
32
|
-
def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace in a string into a single space.

    Args:
        text: The text to normalize.

    Returns:
        The text with leading and trailing whitespace removed and each inner
        whitespace run replaced by one space.
    """
    # split() without arguments already ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)
|
kreuzberg/_sync.py
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import sys
|
4
|
-
from functools import partial
|
5
|
-
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
6
|
-
|
7
|
-
from anyio import create_task_group
|
8
|
-
from anyio.to_thread import run_sync as any_io_run_sync
|
9
|
-
|
10
|
-
if TYPE_CHECKING: # pragma: no cover
|
11
|
-
from collections.abc import Awaitable, Callable
|
12
|
-
|
13
|
-
if sys.version_info >= (3, 10):
|
14
|
-
from typing import ParamSpec
|
15
|
-
else: # pragma: no cover
|
16
|
-
from typing_extensions import ParamSpec
|
17
|
-
|
18
|
-
T = TypeVar("T")
|
19
|
-
P = ParamSpec("P")
|
20
|
-
|
21
|
-
|
22
|
-
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Execute a blocking callable in a worker thread and await its result.

    Args:
        sync_fn: The synchronous function to run.
        *args: Positional arguments forwarded to the function.
        **kwargs: Keyword arguments forwarded to the function.

    Returns:
        Whatever the synchronous function returns.
    """
    # Keyword arguments are bound up front; only positional arguments are handed to the thread runner.
    bound_call = partial(sync_fn, **kwargs)
    outcome = await any_io_run_sync(bound_call, *args, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast(T, outcome)
|
35
|
-
|
36
|
-
|
37
|
-
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside one task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The results, in the same order as the awaitables were passed in.
    """
    # One pre-allocated slot per awaitable keeps results in call order
    # regardless of which task finishes first.
    collected: list[Any] = [None for _ in async_tasks]

    async def run_task(slot: int, awaitable: Awaitable[T]) -> None:
        collected[slot] = await awaitable

    async with create_task_group() as group:
        for slot, awaitable in enumerate(async_tasks):
            group.start_soon(run_task, slot, awaitable)

    return collected
|
56
|
-
|
57
|
-
|
58
|
-
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run awaitables concurrently in consecutive batches.

    Each batch is handed to ``run_taskgroup`` and must finish before the next
    batch starts, which caps the number of awaitables in flight at
    ``batch_size``.

    Args:
        *async_tasks: The awaitables to run.
        batch_size: The maximum number of awaitables per batch; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Returns:
        The results, in the same order as the awaitables were passed in.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently drop
        # every task; a zero step fails with an unrelated range() message.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
|
kreuzberg/_tesseract.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import re
|
4
|
-
import sys
|
5
|
-
from enum import Enum
|
6
|
-
from os import PathLike
|
7
|
-
from typing import Any, TypeVar, Union
|
8
|
-
|
9
|
-
from anyio import Path as AsyncPath
|
10
|
-
from anyio import run_process
|
11
|
-
from PIL.Image import Image
|
12
|
-
|
13
|
-
from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
|
14
|
-
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
15
|
-
from kreuzberg._string import normalize_spaces
|
16
|
-
from kreuzberg._sync import run_sync, run_taskgroup_batched
|
17
|
-
from kreuzberg._tmp import create_temp_file
|
18
|
-
from kreuzberg._types import ExtractionResult
|
19
|
-
from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
|
20
|
-
|
21
|
-
if sys.version_info < (3, 11): # pragma: no cover
|
22
|
-
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
23
|
-
|
24
|
-
version_ref = {"checked": False}
|
25
|
-
|
26
|
-
T = TypeVar("T", bound=Union[Image, PathLike[str], str])
|
27
|
-
|
28
|
-
|
29
|
-
class PSMMode(Enum):
    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable names.

    Each member's value is the integer passed to Tesseract's ``--psm`` option.
    """

    OSD_ONLY = 0
    """Orientation and script detection only."""
    AUTO_OSD = 1
    """Automatic page segmentation with orientation and script detection."""
    AUTO_ONLY = 2
    """Automatic page segmentation without OSD."""
    AUTO = 3
    """Fully automatic page segmentation (default)."""
    SINGLE_COLUMN = 4
    """Assume a single column of text."""
    SINGLE_BLOCK_VERTICAL = 5
    """Assume a single uniform block of vertically aligned text."""
    SINGLE_BLOCK = 6
    """Assume a single uniform block of text."""
    SINGLE_LINE = 7
    """Treat the image as a single text line."""
    SINGLE_WORD = 8
    """Treat the image as a single word."""
    CIRCLE_WORD = 9
    """Treat the image as a single word in a circle."""
    SINGLE_CHAR = 10
    """Treat the image as a single character."""
    SPARSE_TEXT = 11
    """Sparse text: find as much text as possible in no particular order."""
    SPARSE_TEXT_OSD = 12
    """Sparse text with orientation and script detection."""
    RAW_LINE = 13
    """Treat the image as a single text line, bypassing Tesseract-specific hacks."""
|
54
|
-
|
55
|
-
|
56
|
-
async def validate_tesseract_version() -> None:
    """Ensure a supported Tesseract binary is available.

    The check runs ``tesseract --version`` once; a successful result is
    remembered in ``version_ref`` so later calls return immediately.

    Raises:
        MissingDependencyError: If Tesseract cannot be found, or its major
            version is below the minimal supported version.
    """
    if version_ref["checked"]:
        return

    try:
        result = await run_process(["tesseract", "--version"])
    except FileNotFoundError as e:
        raise MissingDependencyError(
            "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
        ) from e

    # Only the major version number (first capture group) matters for the check.
    version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
    is_supported = (
        version_match is not None and int(version_match.group(1)) >= MINIMAL_SUPPORTED_TESSERACT_VERSION
    )
    if not is_supported:
        raise MissingDependencyError("Tesseract version 5 or above is required.")

    version_ref["checked"] = True
|
77
|
-
|
78
|
-
|
79
|
-
async def process_file(
    input_file: str | PathLike[str],
    *,
    language: str,
    psm: PSMMode,
) -> ExtractionResult:
    """Process a single image file using Tesseract OCR.

    Tesseract writes its text output to a temporary file, which is read back
    and removed again before returning.

    Args:
        input_file: The path to the image file to process.
        language: The language code for OCR.
        psm: Page segmentation mode.

    Raises:
        OCRError: If Tesseract exits with a non-zero code or the OCR run
            otherwise fails.

    Returns:
        ExtractionResult: The extracted, whitespace-normalized text as plain text.
    """
    # Local import keeps this block self-contained; the module only imports PathLike from os.
    from os import environ

    output_path, unlink = await create_temp_file(".txt")
    try:
        # Tesseract is given the output base without the ".txt" suffix. Drop
        # only the suffix: a plain str.replace would also mangle any ".txt"
        # appearing elsewhere in the path.
        output_base = str(output_path.with_suffix(""))

        command = [
            "tesseract",
            str(input_file),
            output_base,
            "-l",
            language,
            "--psm",
            str(psm.value),
            "--oem",
            "1",
            "--loglevel",
            "OFF",
            "-c",
            "thresholding_method=1",
            "-c",
            "tessedit_enable_dict_correction=1",
            "-c",
            "language_model_ngram_on=1",
            "-c",
            "textord_space_size_is_variable=1",
            "-c",
            "classify_use_pre_adapted_templates=1",
            "-c",
            "tessedit_dont_blkrej_good_wds=1",
            "-c",
            "tessedit_dont_rowrej_good_wds=1",
            "-c",
            "tessedit_use_primary_params_model=1",
        ]

        env: dict[str, Any] | None = None
        if sys.platform.startswith("linux"):
            # Extend the inherited environment instead of replacing it, so
            # PATH, TESSDATA_PREFIX and similar variables stay visible.
            env = {**environ, "OMP_THREAD_LIMIT": "1"}

        # check=False: inspect the return code ourselves so a failed run is
        # reported as OCRError (with stderr) rather than CalledProcessError.
        result = await run_process(command, env=env, check=False)

        if result.returncode != 0:
            raise OCRError(
                "OCR failed with a non-0 return code.",
                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
            )

        output = await AsyncPath(output_path).read_text("utf-8")
        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
    except (RuntimeError, OSError) as e:
        raise OCRError(f"Failed to OCR using tesseract: {e}") from e
    finally:
        # Always remove the temporary output file, whether OCR succeeded or not.
        await unlink()
|
150
|
-
|
151
|
-
|
152
|
-
async def process_image(
    image: Image,
    *,
    language: str,
    psm: PSMMode,
) -> ExtractionResult:
    """Process a single Pillow Image using Tesseract OCR.

    The image is saved as a temporary PNG, passed to ``process_file`` and the
    temporary file is removed afterwards.

    Args:
        image: The Pillow Image to process.
        language: The language code for OCR.
        psm: Page segmentation mode.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    image_path, unlink = await create_temp_file(".png")
    try:
        # Saving is blocking work, so it runs in a worker thread.
        await run_sync(image.save, str(image_path), format="PNG")
        return await process_file(image_path, language=language, psm=psm)
    finally:
        # Remove the temporary PNG even when saving or OCR fails.
        await unlink()
|
173
|
-
|
174
|
-
|
175
|
-
async def process_image_with_tesseract(
    image: Image | PathLike[str] | str,
    *,
    language: str = "eng",
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Run Tesseract OCR asynchronously on one image.

    Args:
        image: A Pillow Image, or a path (path-like object or string) to an image file.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).

    Raises:
        ValueError: If the input is neither a Pillow Image, a path-like object nor a string.

    Returns:
        The extraction result holding the recognized text.
    """
    await validate_tesseract_version()

    # Pick the handler matching the input kind, then run it once.
    if isinstance(image, Image):
        handler = process_image
    elif isinstance(image, (PathLike, str)):
        handler = process_file
    else:
        raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")

    return await handler(image, language=language, psm=psm)
|
203
|
-
|
204
|
-
|
205
|
-
async def batch_process_images(
    images: list[T],
    *,
    language: str = "eng",
    psm: PSMMode = PSMMode.AUTO,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> list[ExtractionResult]:
    """Run Tesseract OCR asynchronously on multiple images with bounded concurrency.

    Args:
        images: A list of Pillow Images, path-like objects or strings to process.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        max_processes: Number of images processed concurrently per batch
            (default: DEFAULT_MAX_PROCESSES).

    Raises:
        ParsingError: If OCR fails for any of the images.

    Returns:
        One ExtractionResult per input image, in input order.
    """
    await validate_tesseract_version()

    pending = [process_image_with_tesseract(item, language=language, psm=psm) for item in images]
    try:
        return await run_taskgroup_batched(*pending, batch_size=max_processes)
    except ExceptionGroup as eg:
        # Surface the individual task failures through the error context.
        raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
|
kreuzberg/_tmp.py
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from contextlib import suppress
|
4
|
-
from pathlib import Path
|
5
|
-
from tempfile import NamedTemporaryFile
|
6
|
-
from typing import TYPE_CHECKING, Callable
|
7
|
-
|
8
|
-
from anyio import Path as AsyncPath
|
9
|
-
|
10
|
-
from kreuzberg._sync import run_sync
|
11
|
-
|
12
|
-
if TYPE_CHECKING: # pragma: no cover
|
13
|
-
from collections.abc import Coroutine
|
14
|
-
|
15
|
-
|
16
|
-
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed temporary file and a callback that deletes it.

    The file is created with ``delete=False``, so it stays on disk until the
    returned callback is awaited.

    Args:
        extension: The file extension, used as the temporary file's suffix.
        content: Optional bytes to write to the file.

    Returns:
        A tuple of the temporary file path and an async ``unlink`` callback
        that removes the file, ignoring OS-level errors.
    """
    file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    try:
        if content:
            await AsyncPath(file.name).write_bytes(content)
    finally:
        # Close the handle even when writing fails, so it does not leak.
        await run_sync(file.close)

    async def unlink() -> None:
        # Best-effort cleanup: a file that cannot be removed must not fail the caller.
        with suppress(OSError, PermissionError):
            await AsyncPath(file.name).unlink(missing_ok=True)

    return Path(file.name), unlink
|
kreuzberg/_xlsx.py
DELETED
@@ -1,88 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import csv
|
4
|
-
import sys
|
5
|
-
from io import StringIO
|
6
|
-
from typing import TYPE_CHECKING
|
7
|
-
|
8
|
-
from anyio import Path as AsyncPath
|
9
|
-
from python_calamine import CalamineWorkbook
|
10
|
-
|
11
|
-
from kreuzberg import ExtractionResult, ParsingError
|
12
|
-
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
13
|
-
from kreuzberg._pandoc import process_file_with_pandoc
|
14
|
-
from kreuzberg._string import normalize_spaces
|
15
|
-
from kreuzberg._sync import run_sync, run_taskgroup
|
16
|
-
from kreuzberg._tmp import create_temp_file
|
17
|
-
|
18
|
-
if TYPE_CHECKING: # pragma: no cover
|
19
|
-
from pathlib import Path
|
20
|
-
|
21
|
-
if sys.version_info < (3, 11): # pragma: no cover
|
22
|
-
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
23
|
-
|
24
|
-
|
25
|
-
async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
    """Render one workbook sheet as a markdown section.

    The sheet's rows are serialized to CSV, written to a temporary file and
    converted with pandoc. The result is prefixed with a level-2 heading
    carrying the sheet name.

    Args:
        workbook: The opened workbook.
        sheet_name: The name of the sheet to convert.

    Returns:
        The heading followed by the whitespace-normalized pandoc output.
    """
    values = workbook.get_sheet_by_name(sheet_name).to_python()

    with StringIO() as csv_buffer:
        csv.writer(csv_buffer).writerows(values)
        csv_data = csv_buffer.getvalue()

    csv_path, unlink = await create_temp_file(".csv")
    try:
        # Pandoc reads its input as UTF-8, so do not rely on the locale's default encoding.
        await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
        result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
    finally:
        # Remove the temporary CSV even when the conversion fails.
        await unlink()

    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
|
43
|
-
|
44
|
-
|
45
|
-
async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
    """Extract an XLSX file as markdown, one section per sheet.

    Args:
        input_file: The path to the XLSX file.

    Returns:
        An ExtractionResult whose content joins the per-sheet sections with blank lines.

    Raises:
        ParsingError: If converting the sheets fails.
    """
    try:
        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
        # All sheets are converted concurrently; results keep sheet order.
        sheet_jobs = [convert_sheet_to_text(workbook, name) for name in workbook.sheet_names]
        sections: list[str] = await run_taskgroup(*sheet_jobs)
    except ExceptionGroup as eg:
        raise ParsingError(
            "Failed to extract file data",
            context={"file": str(input_file), "errors": eg.exceptions},
        ) from eg
    else:
        return ExtractionResult(
            content="\n\n".join(sections),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata={},
        )
|
72
|
-
|
73
|
-
|
74
|
-
async def extract_xlsx_content(content: bytes) -> ExtractionResult:
    """Extract text from in-memory XLSX content.

    The bytes are written to a temporary ``.xlsx`` file, which is passed to
    ``extract_xlsx_file`` and removed afterwards.

    Args:
        content: The XLSX file content.

    Returns:
        The extracted text content.
    """
    xlsx_path, unlink = await create_temp_file(".xlsx")
    try:
        await AsyncPath(xlsx_path).write_bytes(content)
        return await extract_xlsx_file(xlsx_path)
    finally:
        # Remove the temporary workbook even when extraction fails.
        await unlink()
|