kreuzberg 3.1.0__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_pandoc.py +1 -1
- kreuzberg/_ocr/_easyocr.py +4 -0
- {kreuzberg-3.1.0.dist-info → kreuzberg-3.1.2.dist-info}/METADATA +10 -10
- {kreuzberg-3.1.0.dist-info → kreuzberg-3.1.2.dist-info}/RECORD +7 -7
- {kreuzberg-3.1.0.dist-info → kreuzberg-3.1.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.1.0.dist-info → kreuzberg-3.1.2.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.1.0.dist-info → kreuzberg-3.1.2.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -228,7 +228,7 @@ class PandocExtractor(Extractor):
|
|
228
228
|
command = ["pandoc", "--version"]
|
229
229
|
result = await run_process(command)
|
230
230
|
|
231
|
-
version_match = re.search(r"pandoc\s+v?(\d+)\.\d
|
231
|
+
version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
|
232
232
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
|
233
233
|
raise MissingDependencyError(
|
234
234
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -173,6 +173,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
173
173
|
await self._init_easyocr(**kwargs)
|
174
174
|
|
175
175
|
beam_width = kwargs.pop("beam_width")
|
176
|
+
|
177
|
+
kwargs.pop("language", None)
|
178
|
+
kwargs.pop("use_gpu", None)
|
179
|
+
|
176
180
|
try:
|
177
181
|
result = await run_sync(
|
178
182
|
self._reader.readtext,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.0
|
3
|
+
Version: 3.1.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -27,12 +27,12 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.0
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
|
-
Requires-Dist: python-calamine>=0.3.
|
33
|
+
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
|
|
140
140
|
|
141
141
|
Kreuzberg supports a wide range of document formats:
|
142
142
|
|
143
|
-
- **Documents**: PDF, DOCX,
|
143
|
+
- **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
|
144
144
|
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
145
145
|
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
146
146
|
- **Presentations**: PPTX, PPT, etc.
|
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
162
162
|
|
163
163
|
### Local Development
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
165
|
+
- Clone the repo
|
166
|
+
- Install the system dependencies
|
167
|
+
- Install the full dependencies with `uv sync`
|
168
|
+
- Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
169
|
+
- Make your changes and submit a PR
|
170
170
|
|
171
171
|
## License
|
172
172
|
|
@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
13
13
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
14
14
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
15
15
|
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
16
|
-
kreuzberg/_extractors/_pandoc.py,sha256=
|
16
|
+
kreuzberg/_extractors/_pandoc.py,sha256=U5CUrVilKdJqXJFUUT5xzcpy2jfJ26h7kde3p1N_N4w,20124
|
17
17
|
kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
|
18
18
|
kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
|
19
19
|
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
20
20
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
21
21
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
22
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
22
|
+
kreuzberg/_ocr/_easyocr.py,sha256=IUX5AGMp3C2u3Byiz8BADLMlgoNEpFhwswmdeifMcIo,11112
|
23
23
|
kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
|
24
24
|
kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
|
25
25
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
26
|
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
27
27
|
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
28
28
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
29
|
-
kreuzberg-3.1.
|
30
|
-
kreuzberg-3.1.
|
31
|
-
kreuzberg-3.1.
|
32
|
-
kreuzberg-3.1.
|
33
|
-
kreuzberg-3.1.
|
29
|
+
kreuzberg-3.1.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
30
|
+
kreuzberg-3.1.2.dist-info/METADATA,sha256=7cx9eSl0NAfeu18rvYT4BtwVdVOA1ZgInDx8KcpXlw8,6641
|
31
|
+
kreuzberg-3.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
32
|
+
kreuzberg-3.1.2.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
33
|
+
kreuzberg-3.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|