kreuzberg 3.1.1__py3-none-any.whl → 3.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_pandoc.py +1 -1
- kreuzberg/_ocr/_easyocr.py +3 -1
- {kreuzberg-3.1.1.dist-info → kreuzberg-3.1.3.dist-info}/METADATA +10 -10
- {kreuzberg-3.1.1.dist-info → kreuzberg-3.1.3.dist-info}/RECORD +7 -7
- {kreuzberg-3.1.1.dist-info → kreuzberg-3.1.3.dist-info}/WHEEL +0 -0
- {kreuzberg-3.1.1.dist-info → kreuzberg-3.1.3.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.1.1.dist-info → kreuzberg-3.1.3.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -228,7 +228,7 @@ class PandocExtractor(Extractor):
|
|
228
228
|
command = ["pandoc", "--version"]
|
229
229
|
result = await run_process(command)
|
230
230
|
|
231
|
-
version_match = re.search(r"pandoc\s+v?(\d+)\.\d
|
231
|
+
version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
|
232
232
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
|
233
233
|
raise MissingDependencyError(
|
234
234
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -170,6 +170,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
170
170
|
Raises:
|
171
171
|
OCRError: If OCR processing fails.
|
172
172
|
"""
|
173
|
+
import numpy as np
|
174
|
+
|
173
175
|
await self._init_easyocr(**kwargs)
|
174
176
|
|
175
177
|
beam_width = kwargs.pop("beam_width")
|
@@ -180,7 +182,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
180
182
|
try:
|
181
183
|
result = await run_sync(
|
182
184
|
self._reader.readtext,
|
183
|
-
|
185
|
+
np.array(image),
|
184
186
|
beamWidth=beam_width,
|
185
187
|
**kwargs,
|
186
188
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.1
|
3
|
+
Version: 3.1.3
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -27,12 +27,12 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.0
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
|
-
Requires-Dist: python-calamine>=0.3.
|
33
|
+
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
|
|
140
140
|
|
141
141
|
Kreuzberg supports a wide range of document formats:
|
142
142
|
|
143
|
-
- **Documents**: PDF, DOCX,
|
143
|
+
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
144
144
|
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
145
145
|
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
146
146
|
- **Presentations**: PPTX, PPT, etc.
|
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
162
162
|
|
163
163
|
### Local Development
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
165
|
+
- Clone the repo
|
166
|
+
- Install the system dependencies
|
167
|
+
- Install the full dependencies with `uv sync`
|
168
|
+
- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
169
|
+
- Make your changes and submit a PR
|
170
170
|
|
171
171
|
## License
|
172
172
|
|
@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
13
13
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
14
14
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
15
15
|
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
16
|
-
kreuzberg/_extractors/_pandoc.py,sha256=
|
16
|
+
kreuzberg/_extractors/_pandoc.py,sha256=U5CUrVilKdJqXJFUUT5xzcpy2jfJ26h7kde3p1N_N4w,20124
|
17
17
|
kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
|
18
18
|
kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
|
19
19
|
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
20
20
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
21
21
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
22
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
22
|
+
kreuzberg/_ocr/_easyocr.py,sha256=4K344njfpzIig7Oz82yI9D5qqFJDRrsSa6x9vDW-eT4,11140
|
23
23
|
kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
|
24
24
|
kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
|
25
25
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
26
|
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
27
27
|
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
28
28
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
29
|
-
kreuzberg-3.1.
|
30
|
-
kreuzberg-3.1.
|
31
|
-
kreuzberg-3.1.
|
32
|
-
kreuzberg-3.1.
|
33
|
-
kreuzberg-3.1.1.dist-info/RECORD,,
|
29
|
+
kreuzberg-3.1.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
30
|
+
kreuzberg-3.1.3.dist-info/METADATA,sha256=rgfHFNmD7V_s1HPx7aFrfcYep8dMmqpHidqMcY5_MJE,6641
|
31
|
+
kreuzberg-3.1.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
32
|
+
kreuzberg-3.1.3.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
33
|
+
kreuzberg-3.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|