kreuzberg 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+
5
+ from charset_normalizer import detect
6
+
7
+
8
+ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
9
+ """Decode a byte string safely, removing invalid sequences.
10
+
11
+ Args:
12
+ byte_data: The byte string to decode.
13
+ encoding: The encoding to use when decoding the byte string.
14
+
15
+ Returns:
16
+ The decoded string.
17
+ """
18
+ if not byte_data:
19
+ return ""
20
+
21
+ encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
22
+
23
+ for enc in [e for e in encodings if e]: # pragma: no cover
24
+ with suppress(UnicodeDecodeError, LookupError):
25
+ return byte_data.decode(enc)
26
+
27
+ return byte_data.decode("latin-1", errors="replace")
28
+
29
+
30
def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is dropped as well, because ``str.split``
    without arguments already ignores it.

    Args:
        text: The text to sanitize.

    Returns:
        The text with its whitespace-separated tokens joined by single spaces.
    """
    return " ".join(text.split())
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from functools import partial
5
+ from inspect import isawaitable, iscoroutinefunction
6
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
7
+
8
+ import anyio
9
+ from anyio import create_task_group
10
+ from anyio.to_thread import run_sync as any_io_run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Awaitable, Callable
14
+
15
+ if sys.version_info >= (3, 10):
16
+ from typing import ParamSpec
17
+ else: # pragma: no cover
18
+ from typing_extensions import ParamSpec
19
+
20
+ T = TypeVar("T")
21
+ P = ParamSpec("P")
22
+
23
+
24
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Execute a blocking callable in a worker thread and await its result.

    Args:
        sync_fn: The synchronous function to run.
        *args: Positional arguments forwarded to ``sync_fn``.
        **kwargs: Keyword arguments forwarded to ``sync_fn``.

    Returns:
        Whatever ``sync_fn`` returns.
    """
    # The thread runner forwards positional arguments only, so the keyword
    # arguments are bound to the callable up front.
    bound_fn = partial(sync_fn, **kwargs)
    outcome = await any_io_run_sync(bound_fn, *args, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast("T", outcome)
37
+
38
+
39
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside a single task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The awaited values, in the same order the awaitables were passed.
    """
    # Pre-sized so each task can store its value at its own position,
    # whichever order the tasks happen to finish in.
    collected: list[Any] = [None] * len(async_tasks)

    async def _store(position: int, awaitable: Awaitable[T]) -> None:
        collected[position] = await awaitable

    async with create_task_group() as group:
        for position, awaitable in enumerate(async_tasks):
            group.start_soon(_store, position, awaitable)

    return collected
58
+
59
+
60
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run awaitables concurrently, at most ``batch_size`` at a time.

    The awaitables are split into consecutive slices of ``batch_size`` items.
    Each slice is handed to ``run_taskgroup`` and must finish before the next
    slice starts.

    Args:
        *async_tasks: The awaitables to run.
        batch_size: The maximum number of awaitables per batch; must be >= 1.

    Returns:
        The awaited values, in the same order the awaitables were passed.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step makes the range below empty, which would silently
        # drop every task; zero makes range() fail with an unrelated message.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
77
+
78
+
79
async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Call ``fn`` and return its result, awaiting it first when it is awaitable.

    ``fn`` is invoked directly with the given arguments. If what it returns is
    awaitable, that value is awaited and the awaited value is returned;
    otherwise the return value is passed through unchanged.

    Args:
        fn: The callable to execute. It may produce either a synchronous or an
            awaitable result.
        *args: Positional arguments to pass to ``fn``.
        **kwargs: Keyword arguments to pass to ``fn``.

    Returns:
        The result of calling ``fn``, resolved if it was awaitable.
    """
    result = fn(*args, **kwargs)
    if isawaitable(result):
        return cast("T", await result)
    return cast("T", result)
103
+
104
+
105
def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Run a synchronous or asynchronous callable and return its resolved result.

    Coroutine functions are executed to completion with ``anyio.run``. Any
    other callable is invoked directly; if it hands back an awaitable (for
    example an object whose ``__call__`` is async), that awaitable is also
    driven to completion with ``anyio.run`` so the caller never receives an
    un-awaited object.

    Args:
        fn: The function to execute, either synchronous or asynchronous.
        *args: Positional arguments to pass to ``fn``.
        **kwargs: Keyword arguments to pass to ``fn``.

    Returns:
        The return value of ``fn``, resolved if it was asynchronous.
    """
    if iscoroutinefunction(fn):
        # anyio.run forwards positional arguments only, so keyword arguments
        # are bound to the callable beforehand.
        return cast("T", anyio.run(partial(fn, **kwargs), *args))

    result = fn(*args, **kwargs)
    if isawaitable(result):

        async def _resolve() -> T:
            return cast("T", await result)

        return anyio.run(_resolve)

    return cast("T", result)
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+ from pathlib import Path
5
+ from tempfile import NamedTemporaryFile
6
+ from typing import TYPE_CHECKING, Callable
7
+
8
+ from anyio import Path as AsyncPath
9
+
10
+ from kreuzberg._utils._sync import run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Coroutine
14
+
15
+
16
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed temporary file, optionally pre-filled with content.

    The file is created with ``delete=False``, so it stays on disk until the
    returned cleanup callback is awaited.

    Args:
        extension: The suffix to give the file name.
        content: The bytes to write to the file. Nothing is written when this
            is ``None`` or empty.

    Returns:
        A tuple of the temporary file path and an async callback that deletes
        the file. The callback ignores OS errors and a missing file.
    """
    file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    path = Path(file.name)
    # Release the creating handle straight away so it cannot leak if the
    # write below fails; the content is written through a separate handle.
    await run_sync(file.close)

    async def unlink() -> None:
        # Best-effort cleanup: PermissionError is a subclass of OSError.
        with suppress(OSError):
            await AsyncPath(path).unlink(missing_ok=True)

    if content:
        try:
            await AsyncPath(path).write_bytes(content)
        except Exception:
            # The caller never receives the cleanup callback on failure, so
            # remove the orphaned file here before propagating the error.
            await unlink()
            raise

    return path, unlink
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.0.0
3
+ Version: 3.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
9
  Classifier: Development Status :: 4 - Beta
10
10
  Classifier: Intended Audience :: Developers
11
11
  Classifier: License :: OSI Approved :: MIT License
@@ -27,7 +27,7 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.2.0
30
+ Requires-Dist: html-to-markdown>=1.2.1
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
33
  Requires-Dist: python-calamine>=0.3.1
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
35
35
  Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
- Requires-Dist: numpy>=2.0.2; extra == "all"
38
+ Requires-Dist: gmft>=0.4.1; extra == "all"
39
39
  Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
41
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
40
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
42
42
  Requires-Dist: setuptools>=76.0.0; extra == "all"
43
43
  Provides-Extra: chunking
44
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
44
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
45
45
  Provides-Extra: easyocr
46
46
  Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
47
+ Provides-Extra: gmft
48
+ Requires-Dist: gmft>=0.4.1; extra == "gmft"
47
49
  Provides-Extra: paddleocr
48
- Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
49
50
  Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
50
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
51
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
51
52
  Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
53
  Dynamic: license-file
53
54
 
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
66
67
  - **Resource Efficient**: Lightweight processing without GPU requirements
67
68
  - **Format Support**: Comprehensive support for documents, images, and text formats
68
69
  - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
70
+ - **Metadata Extraction**: Get document metadata alongside text content
71
+ - **Table Extraction**: Extract tables from documents using the excellent GMFT library
69
72
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
70
73
  - **Permissive OSS**: MIT licensed with permissively licensed dependencies
71
74
 
@@ -151,7 +154,7 @@ Kreuzberg supports multiple OCR engines:
151
154
  - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
152
155
  - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
153
156
 
154
- For comparison and selection guidance, see the [OCR Backends](https://example.com/ocr-backends) documentation.
157
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
155
158
 
156
159
  ## Contribution
157
160
 
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
160
163
  ### Local Development
161
164
 
162
165
  1. Clone the repo
163
-
164
166
  1. Install the system dependencies
165
-
166
167
  1. Install the full dependencies with `uv sync`
167
-
168
- 1. Install the pre-commit hooks with:
169
-
170
- ```shell
171
- pre-commit install && pre-commit install --hook-type commit-msg
172
- ```
173
-
168
+ 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
174
169
  1. Make your changes and submit a PR
175
170
 
176
171
  ## License
@@ -0,0 +1,33 @@
1
+ kreuzberg/__init__.py,sha256=lT9OwIdf5CEhSX7IVmtSFPgRhz6B2z2A-RE8Zdm0PH4,1216
2
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
+ kreuzberg/_gmft.py,sha256=qLhfepQuaROjPOdI-tDRqqqnOcqDY1D411ZXzoywnpg,7229
5
+ kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
6
+ kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
7
+ kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
8
+ kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
9
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
10
+ kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
11
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
14
+ kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
15
+ kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
16
+ kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
17
+ kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
18
+ kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
19
+ kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
20
+ kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
21
+ kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
22
+ kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
23
+ kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
24
+ kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
25
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
27
+ kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
28
+ kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
29
+ kreuzberg-3.1.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
30
+ kreuzberg-3.1.0.dist-info/METADATA,sha256=YemIJR6aygDxNgz9aoeg2oIGRHJjm897jD8sHuJYdMY,6651
31
+ kreuzberg-3.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
32
+ kreuzberg-3.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
33
+ kreuzberg-3.1.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,15 +0,0 @@
1
- kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
2
- kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
- kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
- kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
5
- kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
6
- kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
7
- kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
8
- kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
9
- kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
10
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- kreuzberg-3.0.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
- kreuzberg-3.0.0.dist-info/METADATA,sha256=wlO9VCvZQy_gJJTmhGzH9j8BlPQPFQdmMZQxJOcQAUg,6515
13
- kreuzberg-3.0.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
14
- kreuzberg-3.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
- kreuzberg-3.0.0.dist-info/RECORD,,