kreuzberg 3.1.2__py3-none-any.whl → 3.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_pandoc.py +46 -6
- kreuzberg/_ocr/_easyocr.py +23 -17
- kreuzberg/_ocr/_paddleocr.py +4 -0
- {kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/METADATA +5 -5
- {kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/RECORD +8 -8
- {kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/WHEEL +1 -1
- {kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -227,14 +227,54 @@ class PandocExtractor(Extractor):
|
|
227
227
|
|
228
228
|
command = ["pandoc", "--version"]
|
229
229
|
result = await run_process(command)
|
230
|
+
stdout = result.stdout.decode()
|
230
231
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
232
|
+
# Try more inclusive patterns to detect the pandoc version
|
233
|
+
# Try common formats first
|
234
|
+
version_match = re.search(
|
235
|
+
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
236
|
+
)
|
237
|
+
|
238
|
+
# Try version in parentheses format
|
239
|
+
if not version_match:
|
240
|
+
version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
|
241
|
+
|
242
|
+
# Try hyphenated format
|
243
|
+
if not version_match:
|
244
|
+
version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
|
245
|
+
|
246
|
+
# If still no match, check for version at the beginning of the output or any line
|
247
|
+
if not version_match:
|
248
|
+
# Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
|
249
|
+
version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
|
250
|
+
|
251
|
+
# Try finding version-like patterns elsewhere in the text
|
252
|
+
if not version_match:
|
253
|
+
# Search for version-like patterns at the beginning of lines or after spaces
|
254
|
+
version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
|
255
|
+
|
256
|
+
# As a last resort, check any sequence of digits that might be a version
|
257
|
+
if not version_match:
|
258
|
+
out_lines = stdout.splitlines()
|
259
|
+
for line in out_lines:
|
260
|
+
for token in line.split():
|
261
|
+
# Match standalone version patterns like 2.11 or 2.11.4
|
262
|
+
version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
|
263
|
+
if version_pattern:
|
264
|
+
version_match = version_pattern
|
265
|
+
break
|
266
|
+
if version_match:
|
267
|
+
break
|
268
|
+
|
269
|
+
# If we found a version, check that the major version is at least the minimum required
|
270
|
+
if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
|
271
|
+
self._checked_version = True
|
272
|
+
return
|
236
273
|
|
237
|
-
|
274
|
+
# If we get here, we either didn't find a version or it's too low
|
275
|
+
raise MissingDependencyError(
|
276
|
+
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
277
|
+
)
|
238
278
|
|
239
279
|
except FileNotFoundError as e:
|
240
280
|
raise MissingDependencyError(
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -127,7 +127,8 @@ class EasyOCRConfig:
|
|
127
127
|
height_ths: float = 0.5
|
128
128
|
"""Maximum difference in box height for merging."""
|
129
129
|
language: str | list[str] = "en"
|
130
|
-
"""Language or languages to use for OCR.
|
130
|
+
"""Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
|
131
|
+
a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
|
131
132
|
link_threshold: float = 0.4
|
132
133
|
"""Link confidence threshold."""
|
133
134
|
low_text: float = 0.4
|
@@ -170,6 +171,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
170
171
|
Raises:
|
171
172
|
OCRError: If OCR processing fails.
|
172
173
|
"""
|
174
|
+
import numpy as np
|
175
|
+
|
173
176
|
await self._init_easyocr(**kwargs)
|
174
177
|
|
175
178
|
beam_width = kwargs.pop("beam_width")
|
@@ -180,7 +183,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
180
183
|
try:
|
181
184
|
result = await run_sync(
|
182
185
|
self._reader.readtext,
|
183
|
-
|
186
|
+
np.array(image),
|
184
187
|
beamWidth=beam_width,
|
185
188
|
**kwargs,
|
186
189
|
)
|
@@ -352,29 +355,32 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
352
355
|
|
353
356
|
@staticmethod
|
354
357
|
def _validate_language_code(language_codes: str | list[str]) -> list[str]:
|
355
|
-
"""Validate and normalize
|
358
|
+
"""Validate and normalize provided language codes.
|
356
359
|
|
357
360
|
Args:
|
358
|
-
language_codes: The language code string.
|
361
|
+
language_codes: The language code(s), either as a string (single or comma-separated) or a list.
|
359
362
|
|
360
363
|
Raises:
|
361
|
-
ValidationError: If the
|
364
|
+
ValidationError: If any of the languages are not supported by EasyOCR
|
362
365
|
|
363
366
|
Returns:
|
364
|
-
A list with the normalized language
|
367
|
+
A list with the normalized language codes.
|
365
368
|
"""
|
366
|
-
if
|
367
|
-
|
369
|
+
if isinstance(language_codes, str):
|
370
|
+
# Handle comma-separated language codes
|
371
|
+
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
368
372
|
else:
|
373
|
+
# Handle list of language codes
|
369
374
|
languages = [lang.lower() for lang in language_codes]
|
370
375
|
|
371
|
-
|
372
|
-
|
376
|
+
unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
|
377
|
+
if unsupported_langs:
|
378
|
+
raise ValidationError(
|
379
|
+
"The provided language codes are not supported by EasyOCR",
|
380
|
+
context={
|
381
|
+
"language_code": ",".join(unsupported_langs),
|
382
|
+
"supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
|
383
|
+
},
|
384
|
+
)
|
373
385
|
|
374
|
-
|
375
|
-
"The provided language codes are not supported by EasyOCR",
|
376
|
-
context={
|
377
|
-
"language_code": ",".join([lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]),
|
378
|
-
"supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
|
379
|
-
},
|
380
|
-
)
|
386
|
+
return languages
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -52,6 +52,8 @@ class PaddleOCRConfig:
|
|
52
52
|
"""Binarization threshold for EAST output map."""
|
53
53
|
det_max_side_len: int = 960
|
54
54
|
"""Maximum size of image long side. Images exceeding this will be proportionally resized."""
|
55
|
+
det_model_dir: str | None = None
|
56
|
+
"""Directory for detection model. If None, uses default model location."""
|
55
57
|
drop_score: float = 0.5
|
56
58
|
"""Filter recognition results by confidence score. Results below this are discarded."""
|
57
59
|
enable_mkldnn: bool = False
|
@@ -82,6 +84,8 @@ class PaddleOCRConfig:
|
|
82
84
|
"""Recognition algorithm."""
|
83
85
|
rec_image_shape: str = "3,32,320"
|
84
86
|
"""Image shape for recognition algorithm in format 'channels,height,width'."""
|
87
|
+
rec_model_dir: str | None = None
|
88
|
+
"""Directory for recognition model. If None, uses default model location."""
|
85
89
|
table: bool = True
|
86
90
|
"""Whether to enable table recognition."""
|
87
91
|
use_angle_cls: bool = True
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.2
|
3
|
+
Version: 3.1.4
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -27,19 +27,19 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.3.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.2
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
33
|
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.13.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.2; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
39
39
|
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
40
|
Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
|
41
41
|
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
|
42
|
-
Requires-Dist: setuptools>=
|
42
|
+
Requires-Dist: setuptools>=79.0.1; extra == "all"
|
43
43
|
Provides-Extra: chunking
|
44
44
|
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
|
45
45
|
Provides-Extra: easyocr
|
@@ -49,7 +49,7 @@ Requires-Dist: gmft>=0.4.1; extra == "gmft"
|
|
49
49
|
Provides-Extra: paddleocr
|
50
50
|
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
51
51
|
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
|
52
|
-
Requires-Dist: setuptools>=
|
52
|
+
Requires-Dist: setuptools>=79.0.1; extra == "paddleocr"
|
53
53
|
Dynamic: license-file
|
54
54
|
|
55
55
|
# Kreuzberg
|
@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
13
13
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
14
14
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
15
15
|
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
16
|
-
kreuzberg/_extractors/_pandoc.py,sha256=
|
16
|
+
kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
|
17
17
|
kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
|
18
18
|
kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
|
19
19
|
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
20
20
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
21
21
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
22
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
23
|
-
kreuzberg/_ocr/_paddleocr.py,sha256=
|
22
|
+
kreuzberg/_ocr/_easyocr.py,sha256=J8IP2Fg55dG2MH9-lpyZFounvgIgWgCrw694UkaUa9E,11491
|
23
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=FyALVb3AQFcej9NFOLy-0dkA-3uxE_ie9Mzd6ho3t68,10656
|
24
24
|
kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
|
25
25
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
26
|
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
27
27
|
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
28
28
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
29
|
-
kreuzberg-3.1.
|
30
|
-
kreuzberg-3.1.
|
31
|
-
kreuzberg-3.1.
|
32
|
-
kreuzberg-3.1.
|
33
|
-
kreuzberg-3.1.
|
29
|
+
kreuzberg-3.1.4.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
30
|
+
kreuzberg-3.1.4.dist-info/METADATA,sha256=HE3cHXQLweGIUxHsAlcd0h7jY-V1_j9eeBABnkrcW0g,6641
|
31
|
+
kreuzberg-3.1.4.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
32
|
+
kreuzberg-3.1.4.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
33
|
+
kreuzberg-3.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|