kreuzberg 2.1.1__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_string.py DELETED
@@ -1,41 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from contextlib import suppress
4
-
5
- from charset_normalizer import detect
6
-
7
-
8
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string safely, falling back through several encodings.

    Candidates are tried in this order: the caller-supplied ``encoding``, the
    encoding reported by ``detect``, then UTF-8. The first candidate that
    decodes without error wins.

    Args:
        byte_data: The byte string to decode.
        encoding: Preferred encoding to try first. An unknown or unsuitable
            encoding is skipped rather than raised.

    Returns:
        The decoded string, or an empty string for empty input.
    """
    if not byte_data:
        return ""

    if encoding:
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(encoding)

    # Detection is only worth running once the explicit encoding has failed
    # (or none was given), so it is deliberately evaluated after the fast path.
    for candidate in (detect(byte_data).get("encoding", ""), "utf-8"):
        if not candidate:
            continue
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(candidate)

    # latin-1 maps every possible byte value, so this final decode cannot fail.
    return byte_data.decode("latin-1", errors="replace")
30
-
31
-
32
def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace in a string into a single space.

    Args:
        text: The text to sanitize.

    Returns:
        The text with leading and trailing whitespace removed and each
        internal whitespace run replaced by one space.
    """
    # split() without a separator already discards leading and trailing
    # whitespace, so no separate strip() is needed.
    return " ".join(text.split())
kreuzberg/_sync.py DELETED
@@ -1,74 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import sys
4
- from functools import partial
5
- from typing import TYPE_CHECKING, Any, TypeVar, cast
6
-
7
- from anyio import create_task_group
8
- from anyio.to_thread import run_sync as any_io_run_sync
9
-
10
- if TYPE_CHECKING: # pragma: no cover
11
- from collections.abc import Awaitable, Callable
12
-
13
- if sys.version_info >= (3, 10):
14
- from typing import ParamSpec
15
- else: # pragma: no cover
16
- from typing_extensions import ParamSpec
17
-
18
- T = TypeVar("T")
19
- P = ParamSpec("P")
20
-
21
-
22
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Run a blocking callable from async code via anyio's thread helper.

    Args:
        sync_fn: The synchronous function to run.
        *args: Positional arguments for ``sync_fn``.
        **kwargs: Keyword arguments for ``sync_fn``.

    Returns:
        Whatever ``sync_fn`` returns.
    """
    # Bind every argument up front so the helper receives a zero-argument callable.
    bound_call = partial(sync_fn, *args, **kwargs)
    outcome = await any_io_run_sync(bound_call, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast(T, outcome)
35
-
36
-
37
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await all given awaitables concurrently inside one task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The results, positioned to match the order of ``async_tasks``.
    """
    # Pre-sized so each task can store its result at its own position,
    # regardless of the order in which tasks finish.
    collected: list[Any] = [None for _ in async_tasks]

    async def store_result(slot: int, awaitable: Awaitable[T]) -> None:
        collected[slot] = await awaitable

    async with create_task_group() as group:
        for slot, awaitable in enumerate(async_tasks):
            group.start_soon(store_result, slot, awaitable)

    return collected
56
-
57
-
58
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run awaitables concurrently in consecutive batches.

    Each batch is handed to ``run_taskgroup`` and must finish before the next
    batch starts, which caps the number of awaitables in flight.

    Args:
        *async_tasks: The awaitables to run.
        batch_size: The maximum number of awaitables per batch; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Returns:
        The results, in the same order as ``async_tasks``.
    """
    # A negative step would make range() empty and silently drop every task,
    # and a zero step raises an unrelated-looking error, so reject both here.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
kreuzberg/_tesseract.py DELETED
@@ -1,231 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- import sys
5
- from enum import Enum
6
- from os import PathLike
7
- from typing import Any, TypeVar, Union
8
-
9
- from anyio import Path as AsyncPath
10
- from anyio import run_process
11
- from PIL.Image import Image
12
-
13
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
14
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
15
- from kreuzberg._string import normalize_spaces
16
- from kreuzberg._sync import run_sync, run_taskgroup_batched
17
- from kreuzberg._tmp import create_temp_file
18
- from kreuzberg._types import ExtractionResult
19
- from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
20
-
21
- if sys.version_info < (3, 11): # pragma: no cover
22
- from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
23
-
24
- version_ref = {"checked": False}
25
-
26
- T = TypeVar("T", bound=Union[Image, PathLike[str], str])
27
-
28
-
29
class PSMMode(Enum):
    """Tesseract Page Segmentation Modes (PSM) under human-readable names.

    Each member's value is the integer passed to Tesseract's ``--psm`` option.
    """

    OSD_ONLY = 0
    """Orientation and script detection (OSD) only."""

    AUTO_OSD = 1
    """Automatic page segmentation with OSD."""

    AUTO_ONLY = 2
    """Automatic page segmentation without OSD."""

    AUTO = 3
    """Fully automatic page segmentation (default)."""

    SINGLE_COLUMN = 4
    """Assume a single column of text."""

    SINGLE_BLOCK_VERTICAL = 5
    """Assume a single uniform block of vertically aligned text."""

    SINGLE_BLOCK = 6
    """Assume a single uniform block of text."""

    SINGLE_LINE = 7
    """Treat the image as a single text line."""

    SINGLE_WORD = 8
    """Treat the image as a single word."""

    CIRCLE_WORD = 9
    """Treat the image as a single word in a circle."""

    SINGLE_CHAR = 10
    """Treat the image as a single character."""
54
-
55
-
56
async def validate_tesseract_version() -> None:
    """Check once that a supported Tesseract binary is available.

    A successful check is recorded in ``version_ref`` so later calls return
    immediately without spawning another process.

    Raises:
        MissingDependencyError: If the ``tesseract`` executable cannot be
            found, or its reported major version is below
            ``MINIMAL_SUPPORTED_TESSERACT_VERSION``.
    """
    if version_ref["checked"]:
        return

    try:
        completed = await run_process(["tesseract", "--version"])
    except FileNotFoundError as e:
        raise MissingDependencyError(
            "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
        ) from e

    # Only the major version component is captured and compared.
    match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", completed.stdout.decode())
    major_is_supported = match is not None and int(match.group(1)) >= MINIMAL_SUPPORTED_TESSERACT_VERSION
    if not major_is_supported:
        raise MissingDependencyError("Tesseract version 5 or above is required.")

    version_ref["checked"] = True
77
-
78
-
79
async def process_file(
    input_file: str | PathLike[str],
    *,
    language: str,
    psm: PSMMode,
) -> ExtractionResult:
    """Process a single image file using Tesseract OCR.

    Tesseract is run as a subprocess that writes its text output to a
    temporary file, which is read back, whitespace-normalized and removed.

    Args:
        input_file: The path to the image file to process.
        language: The language code for OCR.
        psm: Page segmentation mode.

    Raises:
        OCRError: If Tesseract exits with a non-zero return code, or a
            RuntimeError/OSError occurs while running it or reading its output.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    # Local import: only PathLike is imported from os at file level.
    from os import environ

    output_path, unlink = await create_temp_file(".txt")
    try:
        # The output base is the temp path minus its ".txt" extension. Strip
        # only that trailing suffix: str.replace would also remove any ".txt"
        # appearing earlier in the path (e.g. in a directory name).
        output_base = str(output_path.with_suffix(""))

        command = [
            "tesseract",
            str(input_file),
            output_base,
            "-l",
            language,
            "--psm",
            str(psm.value),
            "--oem",
            "1",
            "--loglevel",
            "OFF",
            "-c",
            "thresholding_method=1",
            "-c",
            "tessedit_enable_dict_correction=1",
            "-c",
            "language_model_ngram_on=1",
            "-c",
            "textord_space_size_is_variable=1",
            "-c",
            "classify_use_pre_adapted_templates=1",
            "-c",
            "tessedit_dont_blkrej_good_wds=1",
            "-c",
            "tessedit_dont_rowrej_good_wds=1",
            "-c",
            "tessedit_use_primary_params_model=1",
        ]

        env: dict[str, Any] | None = None
        if sys.platform.startswith("linux"):
            # Extend the inherited environment instead of replacing it, so
            # PATH and variables such as TESSDATA_PREFIX reach the subprocess.
            env = {**environ, "OMP_THREAD_LIMIT": "1"}

        # check=False keeps a failing exit code from raising inside
        # run_process, so the failure is reported as an OCRError below.
        result = await run_process(command, env=env, check=False)

        if result.returncode != 0:
            raise OCRError(
                "OCR failed with a non-0 return code.",
                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
            )

        output = await AsyncPath(output_path).read_text("utf-8")
        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
    except (RuntimeError, OSError) as e:
        raise OCRError(f"Failed to OCR using tesseract: {e}") from e
    finally:
        await unlink()
150
-
151
-
152
async def process_image(
    image: Image,
    *,
    language: str,
    psm: PSMMode,
) -> ExtractionResult:
    """Process a single Pillow Image using Tesseract OCR.

    The image is saved as a temporary PNG, passed to ``process_file`` and the
    temporary file is removed afterwards.

    Args:
        image: The Pillow Image to process.
        language: The language code for OCR.
        psm: Page segmentation mode.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    image_path, unlink = await create_temp_file(".png")
    try:
        await run_sync(image.save, str(image_path), format="PNG")
        return await process_file(image_path, language=language, psm=psm)
    finally:
        # Remove the temporary PNG even when saving or OCR raises.
        await unlink()
173
-
174
-
175
async def process_image_with_tesseract(
    image: Image | PathLike[str] | str,
    *,
    language: str = "eng",
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Run Tesseract OCR asynchronously on one image.

    Args:
        image: A Pillow Image, or the location of an image file given as a
            path-like object or a string.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).

    Raises:
        ValueError: If the input is neither a Pillow Image, a path-like
            object nor a string.

    Returns:
        The extraction result holding the recognized text.
    """
    await validate_tesseract_version()

    ocr_options = {"language": language, "psm": psm}

    if isinstance(image, Image):
        return await process_image(image, **ocr_options)

    if not isinstance(image, (PathLike, str)):
        raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")

    return await process_file(image, **ocr_options)
203
-
204
-
205
async def batch_process_images(
    images: list[T],
    *,
    language: str = "eng",
    psm: PSMMode = PSMMode.AUTO,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> list[ExtractionResult]:
    """Run Tesseract OCR asynchronously on multiple images in bounded batches.

    Args:
        images: A list of Pillow Images, paths or strings to process.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        max_processes: Number of images processed concurrently per batch
            (default: DEFAULT_MAX_PROCESSES).

    Raises:
        ParsingError: If an ExceptionGroup is raised while processing the images.

    Returns:
        List of ExtractionResult objects, one per input image.
    """
    await validate_tesseract_version()

    try:
        pending = [process_image_with_tesseract(item, language=language, psm=psm) for item in images]
        extracted = await run_taskgroup_batched(*pending, batch_size=max_processes)
    except ExceptionGroup as eg:
        raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg

    return extracted
kreuzberg/_tmp.py DELETED
@@ -1,37 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from contextlib import suppress
4
- from pathlib import Path
5
- from tempfile import NamedTemporaryFile
6
- from typing import TYPE_CHECKING, Callable
7
-
8
- from anyio import Path as AsyncPath
9
-
10
- from kreuzberg._sync import run_sync
11
-
12
- if TYPE_CHECKING: # pragma: no cover
13
- from collections.abc import Coroutine
14
-
15
-
16
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed temporary file and a callback that removes it.

    The file is created with ``delete=False``, so it stays on disk until the
    returned callback is awaited.

    Args:
        extension: The file extension, used as the temporary file's suffix.
        content: Optional bytes to write to the file; empty or ``None``
            leaves the file empty.

    Returns:
        A tuple of the temporary file path and an async ``unlink`` callback
        that deletes the file, ignoring OS-level errors.
    """
    handle = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    temp_path = Path(handle.name)
    # Close the handle before writing, so a failing write below cannot leave
    # the file object open.
    await run_sync(handle.close)

    if content:
        await AsyncPath(temp_path).write_bytes(content)

    async def unlink() -> None:
        # PermissionError is a subclass of OSError, so OSError alone covers it.
        with suppress(OSError):
            await AsyncPath(temp_path).unlink(missing_ok=True)

    return temp_path, unlink
kreuzberg/_xlsx.py DELETED
@@ -1,88 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import csv
4
- import sys
5
- from io import StringIO
6
- from typing import TYPE_CHECKING
7
-
8
- from anyio import Path as AsyncPath
9
- from python_calamine import CalamineWorkbook
10
-
11
- from kreuzberg import ExtractionResult, ParsingError
12
- from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
13
- from kreuzberg._pandoc import process_file_with_pandoc
14
- from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync, run_taskgroup
16
- from kreuzberg._tmp import create_temp_file
17
-
18
- if TYPE_CHECKING: # pragma: no cover
19
- from pathlib import Path
20
-
21
- if sys.version_info < (3, 11): # pragma: no cover
22
- from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
23
-
24
-
25
async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
    """Render one workbook sheet as a titled text section.

    The sheet's rows are serialized to CSV, written to a temporary file and
    converted with ``process_file_with_pandoc``.

    Args:
        workbook: The opened workbook containing the sheet.
        sheet_name: The name of the sheet to convert.

    Returns:
        A "## <sheet_name>" heading followed by the whitespace-normalized
        converted content.
    """
    rows = workbook.get_sheet_by_name(sheet_name).to_python()

    with StringIO() as csv_buffer:
        csv.writer(csv_buffer).writerows(rows)
        csv_data = csv_buffer.getvalue()

    csv_path, unlink = await create_temp_file(".csv")
    try:
        # Write explicit UTF-8 bytes: a text-mode write would use the platform
        # default encoding and translate the csv module's line endings.
        await AsyncPath(csv_path).write_bytes(csv_data.encode("utf-8"))
        result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
    finally:
        # Remove the temporary CSV even when the conversion raises.
        await unlink()

    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
43
-
44
-
45
async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
    """Extract text from an XLSX file, one markdown section per sheet.

    Every sheet is converted concurrently with ``convert_sheet_to_text`` and
    the sections are joined with blank lines.

    Args:
        input_file: The path to the XLSX file.

    Returns:
        The extracted content, tagged with the markdown MIME type.

    Raises:
        ParsingError: If an ExceptionGroup is raised while loading or
            converting the workbook.
    """
    try:
        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
        sheet_sections: list[str] = await run_taskgroup(
            *[convert_sheet_to_text(workbook, name) for name in workbook.sheet_names]
        )
    except ExceptionGroup as eg:
        failure_context = {"file": str(input_file), "errors": eg.exceptions}
        raise ParsingError("Failed to extract file data", context=failure_context) from eg

    return ExtractionResult(
        content="\n\n".join(sheet_sections),
        mime_type=MARKDOWN_MIME_TYPE,
        metadata={},
    )
72
-
73
-
74
async def extract_xlsx_content(content: bytes) -> ExtractionResult:
    """Extract text from in-memory XLSX file content.

    The bytes are written to a temporary ``.xlsx`` file, which is passed to
    ``extract_xlsx_file`` and removed afterwards.

    Args:
        content: The XLSX file content.

    Returns:
        The extracted text content.
    """
    xlsx_path, unlink = await create_temp_file(".xlsx")
    try:
        await AsyncPath(xlsx_path).write_bytes(content)
        return await extract_xlsx_file(xlsx_path)
    finally:
        # Remove the temporary workbook even when writing or extraction raises.
        await unlink()