kreuzberg 3.1.1__py3-none-any.whl → 3.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -228,7 +228,7 @@ class PandocExtractor(Extractor):
228
228
  command = ["pandoc", "--version"]
229
229
  result = await run_process(command)
230
230
 
231
- version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
231
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
232
232
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
233
233
  raise MissingDependencyError(
234
234
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -170,6 +170,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
170
170
  Raises:
171
171
  OCRError: If OCR processing fails.
172
172
  """
173
+ import numpy as np
174
+
173
175
  await self._init_easyocr(**kwargs)
174
176
 
175
177
  beam_width = kwargs.pop("beam_width")
@@ -180,7 +182,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
180
182
  try:
181
183
  result = await run_sync(
182
184
  self._reader.readtext,
183
- image.tobytes(),
185
+ np.array(image),
184
186
  beamWidth=beam_width,
185
187
  **kwargs,
186
188
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.1
3
+ Version: 3.1.3
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,12 +27,12 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.2.1
30
+ Requires-Dist: html-to-markdown>=1.3.0
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
- Requires-Dist: python-calamine>=0.3.1
33
+ Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
35
+ Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
@@ -140,7 +140,7 @@ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.gith
140
140
 
141
141
  Kreuzberg supports a wide range of document formats:
142
142
 
143
- - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
143
+ - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
144
144
  - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
145
145
  - **Spreadsheets**: XLSX, XLS, CSV, etc.
146
146
  - **Presentations**: PPTX, PPT, etc.
@@ -162,11 +162,11 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
162
162
 
163
163
  ### Local Development
164
164
 
165
- 1. Clone the repo
166
- 1. Install the system dependencies
167
- 1. Install the full dependencies with `uv sync`
168
- 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
- 1. Make your changes and submit a PR
165
+ - Clone the repo
166
+ - Install the system dependencies
167
+ - Install the full dependencies with `uv sync`
168
+ - Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
169
+ - Make your changes and submit a PR
170
170
 
171
171
  ## License
172
172
 
@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
13
13
  kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
14
14
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
15
15
  kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
16
- kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
16
+ kreuzberg/_extractors/_pandoc.py,sha256=U5CUrVilKdJqXJFUUT5xzcpy2jfJ26h7kde3p1N_N4w,20124
17
17
  kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
18
18
  kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
19
19
  kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
20
20
  kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
21
21
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
22
- kreuzberg/_ocr/_easyocr.py,sha256=IUX5AGMp3C2u3Byiz8BADLMlgoNEpFhwswmdeifMcIo,11112
22
+ kreuzberg/_ocr/_easyocr.py,sha256=4K344njfpzIig7Oz82yI9D5qqFJDRrsSa6x9vDW-eT4,11140
23
23
  kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
24
24
  kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
25
25
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
27
27
  kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
28
28
  kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
29
- kreuzberg-3.1.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
30
- kreuzberg-3.1.1.dist-info/METADATA,sha256=ZyRyrgBoKj42zgjIU7WpSVdE8j7euvqFLvXsEUtiUcA,6651
31
- kreuzberg-3.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
32
- kreuzberg-3.1.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
33
- kreuzberg-3.1.1.dist-info/RECORD,,
29
+ kreuzberg-3.1.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
30
+ kreuzberg-3.1.3.dist-info/METADATA,sha256=rgfHFNmD7V_s1HPx7aFrfcYep8dMmqpHidqMcY5_MJE,6641
31
+ kreuzberg-3.1.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
32
+ kreuzberg-3.1.3.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
33
+ kreuzberg-3.1.3.dist-info/RECORD,,