kreuzberg 3.0.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,342 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import sys
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Any, ClassVar, Final
8
+
9
+ from anyio import Path as AsyncPath
10
+ from anyio import run_process
11
+
12
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
13
+ from kreuzberg._ocr._base import OCRBackend
14
+ from kreuzberg._types import ExtractionResult
15
+ from kreuzberg._utils._string import normalize_spaces
16
+ from kreuzberg._utils._sync import run_sync
17
+ from kreuzberg._utils._tmp import create_temp_file
18
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
19
+
20
+ if TYPE_CHECKING:
21
+ from pathlib import Path
22
+
23
+ from PIL.Image import Image
24
+
25
+ try: # pragma: no cover
26
+ from typing import Unpack # type: ignore[attr-defined]
27
+ except ImportError: # pragma: no cover
28
+ from typing_extensions import Unpack
29
+
30
+
31
# Language codes accepted for Tesseract's ``-l`` option, grouped by initial letter.
TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
    "afr", "amh", "ara", "asm", "aze", "aze_cyrl",
    "bel", "ben", "bod", "bos", "bre", "bul",
    "cat", "ceb", "ces", "chi_sim", "chi_tra", "chr", "cos", "cym",
    "dan", "dan_frak", "deu", "deu_frak", "deu_latf", "dzo",
    "ell", "eng", "enm", "epo", "equ", "est", "eus",
    "fao", "fas", "fil", "fin", "fra", "frk", "frm", "fry",
    "gla", "gle", "glg", "grc", "guj",
    "hat", "heb", "hin", "hrv", "hun", "hye",
    "iku", "ind", "isl", "ita", "ita_old",
    "jav", "jpn",
    "kan", "kat", "kat_old", "kaz", "khm", "kir", "kmr", "kor", "kor_vert", "kur",
    "lao", "lat", "lav", "lit", "ltz",
    "mal", "mar", "mkd", "mlt", "mon", "mri", "msa", "mya",
    "nep", "nld", "nor",
    "oci", "ori", "osd",
    "pan", "pol", "por", "pus",
    "que",
    "ron", "rus",
    "san", "sin", "slk", "slk_frak", "slv", "snd", "spa", "spa_old",
    "sqi", "srp", "srp_latn", "sun", "swa", "swe", "syr",
    "tam", "tat", "tel", "tgk", "tgl", "tha", "tir", "ton", "tur",  # codespell:ignore
    "uig", "ukr", "urd", "uzb", "uzb_cyrl",
    "vie",  # codespell:ignore
    "yid", "yor",
}

# Lowest Tesseract major version this module accepts.
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
162
+
163
+
164
class PSMMode(Enum):
    """Tesseract Page Segmentation Modes (PSM).

    Each member's value is the integer passed to Tesseract's ``--psm`` option.
    """

    OSD_ONLY = 0
    """Orientation and script detection only."""
    AUTO_OSD = 1
    """Automatic page segmentation with orientation and script detection."""
    AUTO_ONLY = 2
    """Automatic page segmentation without OSD."""
    AUTO = 3
    """Fully automatic page segmentation (default)."""
    SINGLE_COLUMN = 4
    """Assume a single column of text."""
    SINGLE_BLOCK_VERTICAL = 5
    """Assume a single uniform block of vertically aligned text."""
    SINGLE_BLOCK = 6
    """Assume a single uniform block of text."""
    SINGLE_LINE = 7
    """Treat the image as a single text line."""
    SINGLE_WORD = 8
    """Treat the image as a single word."""
    CIRCLE_WORD = 9
    """Treat the image as a single word in a circle."""
    SINGLE_CHAR = 10
    """Treat the image as a single character."""
    SPARSE_TEXT = 11
    """Sparse text: find as much text as possible in no particular order."""
    SPARSE_TEXT_OSD = 12
    """Sparse text with orientation and script detection."""
    RAW_LINE = 13
    """Treat the image as a single text line, bypassing Tesseract-specific hacks."""
189
+
190
+
191
@dataclass(frozen=True, unsafe_hash=True)
class TesseractConfig:
    """Immutable, hashable set of options for the Tesseract OCR engine."""

    classify_use_pre_adapted_templates: bool = True
    """Use pre-adapted templates during classification to improve recognition accuracy."""
    language: str = "eng"
    """Language code used for OCR.

    Examples:
        - 'eng' for English
        - 'deu' for German
        - several languages joined with '+', e.g. 'eng+deu'
    """
    language_model_ngram_on: bool = True
    """Toggle the n-gram based language model used to improve text recognition."""
    psm: PSMMode = PSMMode.AUTO
    """Page segmentation mode telling Tesseract how to segment the image (e.g. single block, single line)."""
    tessedit_dont_blkrej_good_wds: bool = True
    """When True, words identified as good are not block-rejected."""
    tessedit_dont_rowrej_good_wds: bool = True
    """When True, words identified as good are not row-rejected."""
    tessedit_enable_dict_correction: bool = True
    """Toggle dictionary-based correction of recognized text."""
    tessedit_use_primary_params_model: bool = True
    """When True, force the primary parameters model for text recognition."""
    textord_space_size_is_variable: bool = True
    """Allow variable spacing between words, useful for irregularly spaced text."""
    thresholding_method: bool = False
    """Toggle the thresholding method applied during image preprocessing."""
220
+
221
+
222
class TesseractBackend(OCRBackend[TesseractConfig]):
    """OCR backend that runs the ``tesseract`` command-line binary as a subprocess."""

    # Set to True after the installed binary passes the version check, so the
    # ``tesseract --version`` subprocess is only spawned until the first success.
    _version_checked: ClassVar[bool] = False

    async def process_image(
        self,
        image: Image,
        **kwargs: Unpack[TesseractConfig],
    ) -> ExtractionResult:
        """Run OCR on an in-memory image.

        The image is written to a temporary PNG file which is then handed to
        :meth:`process_file`; the temporary file is removed afterwards.

        Args:
            image: The image to process.
            **kwargs: Tesseract configuration options (see ``TesseractConfig``).

        Raises:
            MissingDependencyError: If Tesseract is missing or older than version 5.
            ValidationError: If the requested language is not supported.
            OCRError: If the Tesseract invocation fails.

        Returns:
            The extraction result produced by :meth:`process_file`.
        """
        await self._validate_tesseract_version()
        image_path, unlink = await create_temp_file(".png")
        try:
            # Saving happens inside the ``try`` so a failed save still removes the temp file.
            await run_sync(image.save, str(image_path), format="PNG")
            return await self.process_file(image_path, **kwargs)
        finally:
            await unlink()

    async def process_file(
        self,
        path: Path,
        **kwargs: Unpack[TesseractConfig],
    ) -> ExtractionResult:
        """Run OCR on an image file.

        Args:
            path: Path of the image file to process.
            **kwargs: Tesseract configuration options. ``language`` and ``psm`` map to
                the ``-l`` and ``--psm`` options; every remaining option is passed as a
                ``-c name=0|1`` flag.

        Raises:
            MissingDependencyError: If Tesseract is missing or older than version 5.
            ValidationError: If the requested language is not supported.
            OCRError: If Tesseract exits with a non-zero code or cannot be run.

        Returns:
            The recognized text, whitespace-normalized, as plain text.
        """
        import os

        await self._validate_tesseract_version()
        # Validate the options before allocating the temp file so a rejected
        # language code cannot leave an orphaned file behind.
        language = self._validate_language_code(kwargs.pop("language", "eng"))
        psm = kwargs.pop("psm", PSMMode.AUTO)
        output_path, unlink = await create_temp_file(".txt")
        try:
            # The ".txt" suffix is stripped from the output base passed to Tesseract;
            # only the final suffix is removed, never a ".txt" elsewhere in the path.
            output_base = str(output_path.with_suffix(""))
            command = [
                "tesseract",
                str(path),
                output_base,
                "-l",
                language,
                "--psm",
                str(psm.value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]
            for kwarg, value in kwargs.items():
                command.extend(["-c", f"{kwarg}={1 if value else 0}"])

            env: dict[str, Any] | None = None
            if sys.platform.startswith("linux"):
                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
                # The parent environment is preserved so PATH and TESSDATA_PREFIX still reach the child.
                env = {**os.environ, "OMP_THREAD_LIMIT": "1"}

            # check=False: the return code is inspected below so that a failure surfaces
            # as an OCRError carrying stderr instead of a bare CalledProcessError.
            result = await run_process(command, env=env, check=False)

            if result.returncode != 0:
                raise OCRError(
                    "OCR failed with a non-0 return code.",
                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
                )

            output = await AsyncPath(output_path).read_text("utf-8")
            return ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            await unlink()

    @classmethod
    async def _validate_tesseract_version(cls) -> None:
        """Validate that Tesseract is installed and is version 5 or above.

        The check is skipped once it has succeeded for this class.

        Raises:
            MissingDependencyError: If Tesseract is not installed or is below version 5.
        """
        try:
            if cls._version_checked:
                return

            command = ["tesseract", "--version"]
            result = await run_process(command)
            # Only the major version number is captured and compared.
            version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
            if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
                raise MissingDependencyError(
                    "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
                )

            cls._version_checked = True
        except FileNotFoundError as e:
            raise MissingDependencyError(
                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
            ) from e

    @staticmethod
    def _validate_language_code(language_code: str) -> str:
        """Validate a Tesseract language code and return it lower-cased.

        Args:
            language_code: Tesseract supported language code or multiple language codes connected with '+'

        Raises:
            ValidationError: If the language is not supported by Tesseract

        Returns:
            Language code compatible with Tesseract
        """
        normalized = language_code.lower()
        # A single code splits into a one-element list, so one membership pass
        # covers both the single-language and the '+'-joined form.
        unsupported = [lang for lang in normalized.split("+") if lang not in TESSERACT_SUPPORTED_LANGUAGE_CODES]
        if not unsupported:
            return normalized

        raise ValidationError(
            "The provided language code is not supported by Tesseract",
            context={
                "language_code": ",".join(unsupported),
                "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
            },
        )
File without changes
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+
5
+ from charset_normalizer import detect
6
+
7
+
8
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string without ever raising a decoding error.

    Candidates are tried in order: the caller-supplied encoding, the encoding
    detected by ``charset_normalizer``, then UTF-8. If none of them decodes the
    data, it is decoded as latin-1.

    Args:
        byte_data: The byte string to decode.
        encoding: The preferred encoding; unknown or unsuitable names are skipped.

    Returns:
        The decoded string, or an empty string for empty input.
    """
    if not byte_data:
        return ""

    if encoding:
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(encoding)

    # Detection runs only after the explicit encoding has failed (or was not given),
    # so the common case of a correct hint skips the charset analysis entirely.
    detected = detect(byte_data).get("encoding", "")
    for candidate in (detected, "utf-8"):
        if not candidate:
            continue
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(candidate)

    return byte_data.decode("latin-1", errors="replace")
28
+
29
+
30
def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is dropped.

    Args:
        text: The text to sanitize.

    Returns:
        The sanitized text.
    """
    # str.split() with no separator already ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from functools import partial
5
+ from inspect import isawaitable, iscoroutinefunction
6
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
7
+
8
+ import anyio
9
+ from anyio import create_task_group
10
+ from anyio.to_thread import run_sync as any_io_run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Awaitable, Callable
14
+
15
+ if sys.version_info >= (3, 10):
16
+ from typing import ParamSpec
17
+ else: # pragma: no cover
18
+ from typing_extensions import ParamSpec
19
+
20
+ T = TypeVar("T")
21
+ P = ParamSpec("P")
22
+
23
+
24
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Execute a blocking callable in a worker thread and await its result.

    Args:
        sync_fn: The synchronous function to run.
        *args: The positional arguments to pass to the function.
        **kwargs: The keyword arguments to pass to the function.

    Returns:
        The result of the synchronous function.
    """
    # All arguments are bound up front so the thread runner receives a zero-argument callable.
    bound_call = partial(sync_fn, *args, **kwargs)
    outcome = await any_io_run_sync(bound_call, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast("T", outcome)
37
+
38
+
39
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside one task group.

    Args:
        *async_tasks: The list of coroutines to run.

    Returns:
        The results of the coroutines, in the same order as the arguments.
    """
    collected: list[Any] = [None for _ in async_tasks]

    async def _store(slot: int, awaitable: Awaitable[T]) -> None:
        # Each task writes into its own slot, so the output order matches the input order.
        collected[slot] = await awaitable

    async with create_task_group() as group:
        for slot, awaitable in enumerate(async_tasks):
            group.start_soon(_store, slot, awaitable)

    return collected
58
+
59
+
60
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run a list of coroutines concurrently in batches.

    Batches run one after another; the coroutines inside a batch run concurrently.

    Args:
        *async_tasks: The list of coroutines to run.
        batch_size: The size of each batch. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Returns:
        The results of the coroutines, in the same order as the arguments.
    """
    # A negative step would make the range empty and silently drop every task,
    # and a zero step fails with an unrelated ``range()`` error; reject both.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
77
+
78
+
79
async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Call ``fn`` and resolve its result, whether it is plain or awaitable.

    ``fn`` is invoked with the given positional and keyword arguments. If what
    it returns is awaitable, that value is awaited and the awaited result is
    returned; otherwise the returned value is passed through unchanged.

    Args:
        fn: The callable to be executed. It can produce either a
            synchronous or asynchronous result.
        *args: Positional arguments to pass to ``fn``.
        **kwargs: Keyword arguments to pass to ``fn``.

    Returns:
        The result of calling ``fn``, awaited first when it is awaitable.
    """
    outcome = fn(*args, **kwargs)
    if not isawaitable(outcome):
        return outcome
    return cast("T", await outcome)
103
+
104
+
105
def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Call ``fn`` from synchronous code, whether it is sync or async.

    A coroutine function is driven to completion with ``anyio.run``; any other
    callable is simply invoked and its return value handed back.

    Args:
        fn: The function to be executed, which can
            either be synchronous or asynchronous.
        *args: Positional arguments to be passed to the function.
        **kwargs: Keyword arguments to be passed to the function.

    Returns:
        T: The return value of the executed function, resolved if asynchronous.
    """
    if iscoroutinefunction(fn):
        # Keyword arguments are bound with ``partial``; positional ones go through ``anyio.run``.
        return cast("T", anyio.run(partial(fn, **kwargs), *args))
    return cast("T", fn(*args, **kwargs))
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+ from pathlib import Path
5
+ from tempfile import NamedTemporaryFile
6
+ from typing import TYPE_CHECKING, Callable
7
+
8
+ from anyio import Path as AsyncPath
9
+
10
+ from kreuzberg._utils._sync import run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Coroutine
14
+
15
+
16
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed temporary file and a coroutine function that deletes it.

    The file is created with ``delete=False``, so it stays on disk until the
    returned cleanup function is awaited.

    Args:
        extension: The file extension.
        content: The content to write to the file.

    Returns:
        A tuple of the temporary file path and an async callable that removes
        the file, ignoring OS-level errors.
    """
    handle = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    temp_name = handle.name
    if content:
        await AsyncPath(temp_name).write_bytes(content)
    await run_sync(handle.close)

    async def unlink() -> None:
        with suppress(OSError, PermissionError):
            await AsyncPath(temp_name).unlink(missing_ok=True)

    return Path(temp_name), unlink
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.0.0
3
+ Version: 3.0.1
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -151,7 +151,7 @@ Kreuzberg supports multiple OCR engines:
151
151
  - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
152
152
  - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
153
153
 
154
- For comparison and selection guidance, see the [OCR Backends](https://example.com/ocr-backends) documentation.
154
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
155
155
 
156
156
  ## Contribution
157
157
 
@@ -0,0 +1,32 @@
1
+ kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
2
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
+ kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
5
+ kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
6
+ kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
7
+ kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
8
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
9
+ kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
10
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
13
+ kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
14
+ kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
15
+ kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
16
+ kreuzberg/_extractors/_pdf.py,sha256=dcSAXyqH8SZ-z45OUAjjwdboSEbrli0YekS8PxCaVGA,6384
17
+ kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
18
+ kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
19
+ kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
20
+ kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
21
+ kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
22
+ kreuzberg/_ocr/_paddleocr.py,sha256=X5es69QMl0P6DZuuRNKWHaRtLi1OJqFs-mWHR_gVKvY,10837
23
+ kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
24
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
26
+ kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
27
+ kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
28
+ kreuzberg-3.0.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
29
+ kreuzberg-3.0.1.dist-info/METADATA,sha256=5Kt0w9rFBAina8SzbO-m2umEMRJQL-4mcPGAQASko_k,6545
30
+ kreuzberg-3.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
31
+ kreuzberg-3.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
32
+ kreuzberg-3.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,15 +0,0 @@
1
- kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
2
- kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
- kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
- kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
5
- kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
6
- kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
7
- kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
8
- kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
9
- kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
10
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- kreuzberg-3.0.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
- kreuzberg-3.0.0.dist-info/METADATA,sha256=wlO9VCvZQy_gJJTmhGzH9j8BlPQPFQdmMZQxJOcQAUg,6515
13
- kreuzberg-3.0.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
14
- kreuzberg-3.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
- kreuzberg-3.0.0.dist-info/RECORD,,