kreuzberg 3.1.3__py3-none-any.whl → 3.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_pandoc.py +46 -6
- kreuzberg/_ocr/_easyocr.py +20 -16
- kreuzberg/_ocr/_paddleocr.py +4 -0
- {kreuzberg-3.1.3.dist-info → kreuzberg-3.1.5.dist-info}/METADATA +9 -9
- {kreuzberg-3.1.3.dist-info → kreuzberg-3.1.5.dist-info}/RECORD +8 -8
- {kreuzberg-3.1.3.dist-info → kreuzberg-3.1.5.dist-info}/WHEEL +1 -1
- {kreuzberg-3.1.3.dist-info → kreuzberg-3.1.5.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.1.3.dist-info → kreuzberg-3.1.5.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -227,14 +227,54 @@ class PandocExtractor(Extractor):
|
|
227
227
|
|
228
228
|
command = ["pandoc", "--version"]
|
229
229
|
result = await run_process(command)
|
230
|
+
stdout = result.stdout.decode()
|
230
231
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
232
|
+
# Try more inclusive patterns to detect the pandoc version
|
233
|
+
# Try common formats first
|
234
|
+
version_match = re.search(
|
235
|
+
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
236
|
+
)
|
237
|
+
|
238
|
+
# Try version in parentheses format
|
239
|
+
if not version_match:
|
240
|
+
version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
|
241
|
+
|
242
|
+
# Try hyphenated format
|
243
|
+
if not version_match:
|
244
|
+
version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
|
245
|
+
|
246
|
+
# If still no match, check for version at the beginning of the output or any line
|
247
|
+
if not version_match:
|
248
|
+
# Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
|
249
|
+
version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
|
250
|
+
|
251
|
+
# Try finding version-like patterns elsewhere in the text
|
252
|
+
if not version_match:
|
253
|
+
# Search for version-like patterns at the beginning of lines or after spaces
|
254
|
+
version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
|
255
|
+
|
256
|
+
# As a last resort, check any sequence of digits that might be a version
|
257
|
+
if not version_match:
|
258
|
+
out_lines = stdout.splitlines()
|
259
|
+
for line in out_lines:
|
260
|
+
for token in line.split():
|
261
|
+
# Match standalone version patterns like 2.11 or 2.11.4
|
262
|
+
version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
|
263
|
+
if version_pattern:
|
264
|
+
version_match = version_pattern
|
265
|
+
break
|
266
|
+
if version_match:
|
267
|
+
break
|
268
|
+
|
269
|
+
# If we found a version, check that the major version is at least the minimum required
|
270
|
+
if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
|
271
|
+
self._checked_version = True
|
272
|
+
return
|
236
273
|
|
237
|
-
|
274
|
+
# If we get here, we either didn't find a version or it's too low
|
275
|
+
raise MissingDependencyError(
|
276
|
+
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
277
|
+
)
|
238
278
|
|
239
279
|
except FileNotFoundError as e:
|
240
280
|
raise MissingDependencyError(
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -127,7 +127,8 @@ class EasyOCRConfig:
|
|
127
127
|
height_ths: float = 0.5
|
128
128
|
"""Maximum difference in box height for merging."""
|
129
129
|
language: str | list[str] = "en"
|
130
|
-
"""Language or languages to use for OCR.
|
130
|
+
"""Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
|
131
|
+
a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
|
131
132
|
link_threshold: float = 0.4
|
132
133
|
"""Link confidence threshold."""
|
133
134
|
low_text: float = 0.4
|
@@ -354,29 +355,32 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
354
355
|
|
355
356
|
@staticmethod
|
356
357
|
def _validate_language_code(language_codes: str | list[str]) -> list[str]:
|
357
|
-
"""Validate and normalize
|
358
|
+
"""Validate and normalize provided language codes.
|
358
359
|
|
359
360
|
Args:
|
360
|
-
language_codes: The language code string.
|
361
|
+
language_codes: The language code(s), either as a string (single or comma-separated) or a list.
|
361
362
|
|
362
363
|
Raises:
|
363
|
-
ValidationError: If the
|
364
|
+
ValidationError: If any of the languages are not supported by EasyOCR
|
364
365
|
|
365
366
|
Returns:
|
366
|
-
A list with the normalized language
|
367
|
+
A list with the normalized language codes.
|
367
368
|
"""
|
368
|
-
if
|
369
|
-
|
369
|
+
if isinstance(language_codes, str):
|
370
|
+
# Handle comma-separated language codes
|
371
|
+
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
370
372
|
else:
|
373
|
+
# Handle list of language codes
|
371
374
|
languages = [lang.lower() for lang in language_codes]
|
372
375
|
|
373
|
-
|
374
|
-
|
376
|
+
unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
|
377
|
+
if unsupported_langs:
|
378
|
+
raise ValidationError(
|
379
|
+
"The provided language codes are not supported by EasyOCR",
|
380
|
+
context={
|
381
|
+
"language_code": ",".join(unsupported_langs),
|
382
|
+
"supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
|
383
|
+
},
|
384
|
+
)
|
375
385
|
|
376
|
-
|
377
|
-
"The provided language codes are not supported by EasyOCR",
|
378
|
-
context={
|
379
|
-
"language_code": ",".join([lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]),
|
380
|
-
"supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
|
381
|
-
},
|
382
|
-
)
|
386
|
+
return languages
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -52,6 +52,8 @@ class PaddleOCRConfig:
|
|
52
52
|
"""Binarization threshold for EAST output map."""
|
53
53
|
det_max_side_len: int = 960
|
54
54
|
"""Maximum size of image long side. Images exceeding this will be proportionally resized."""
|
55
|
+
det_model_dir: str | None = None
|
56
|
+
"""Directory for detection model. If None, uses default model location."""
|
55
57
|
drop_score: float = 0.5
|
56
58
|
"""Filter recognition results by confidence score. Results below this are discarded."""
|
57
59
|
enable_mkldnn: bool = False
|
@@ -82,6 +84,8 @@ class PaddleOCRConfig:
|
|
82
84
|
"""Recognition algorithm."""
|
83
85
|
rec_image_shape: str = "3,32,320"
|
84
86
|
"""Image shape for recognition algorithm in format 'channels,height,width'."""
|
87
|
+
rec_model_dir: str | None = None
|
88
|
+
"""Directory for recognition model. If None, uses default model location."""
|
85
89
|
table: bool = True
|
86
90
|
"""Whether to enable table recognition."""
|
87
91
|
use_angle_cls: bool = True
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.1.3
|
3
|
+
Version: 3.1.5
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -25,23 +25,23 @@ Requires-Python: >=3.9
|
|
25
25
|
Description-Content-Type: text/markdown
|
26
26
|
License-File: LICENSE
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
|
-
Requires-Dist: charset-normalizer>=3.4.
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.2
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.3.
|
31
|
-
Requires-Dist: playa-pdf>=0.4.
|
30
|
+
Requires-Dist: html-to-markdown>=1.3.2
|
31
|
+
Requires-Dist: playa-pdf>=0.4.3
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
33
|
Requires-Dist: python-calamine>=0.3.2
|
34
34
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.13.
|
35
|
+
Requires-Dist: typing-extensions>=4.13.2; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
38
|
Requires-Dist: gmft>=0.4.1; extra == "all"
|
39
39
|
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
40
|
Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
|
41
|
-
Requires-Dist: semantic-text-splitter>=0.
|
42
|
-
Requires-Dist: setuptools>=
|
41
|
+
Requires-Dist: semantic-text-splitter>=0.26.0; extra == "all"
|
42
|
+
Requires-Dist: setuptools>=80.4.0; extra == "all"
|
43
43
|
Provides-Extra: chunking
|
44
|
-
Requires-Dist: semantic-text-splitter>=0.
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.26.0; extra == "chunking"
|
45
45
|
Provides-Extra: easyocr
|
46
46
|
Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
|
47
47
|
Provides-Extra: gmft
|
@@ -49,7 +49,7 @@ Requires-Dist: gmft>=0.4.1; extra == "gmft"
|
|
49
49
|
Provides-Extra: paddleocr
|
50
50
|
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
51
51
|
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
|
52
|
-
Requires-Dist: setuptools>=
|
52
|
+
Requires-Dist: setuptools>=80.4.0; extra == "paddleocr"
|
53
53
|
Dynamic: license-file
|
54
54
|
|
55
55
|
# Kreuzberg
|
@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
13
13
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
14
14
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
15
15
|
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
16
|
-
kreuzberg/_extractors/_pandoc.py,sha256=
|
16
|
+
kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
|
17
17
|
kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
|
18
18
|
kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
|
19
19
|
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
20
20
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
21
21
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
22
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
23
|
-
kreuzberg/_ocr/_paddleocr.py,sha256=
|
22
|
+
kreuzberg/_ocr/_easyocr.py,sha256=J8IP2Fg55dG2MH9-lpyZFounvgIgWgCrw694UkaUa9E,11491
|
23
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=FyALVb3AQFcej9NFOLy-0dkA-3uxE_ie9Mzd6ho3t68,10656
|
24
24
|
kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
|
25
25
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
26
|
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
27
27
|
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
28
28
|
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
29
|
-
kreuzberg-3.1.
|
30
|
-
kreuzberg-3.1.
|
31
|
-
kreuzberg-3.1.
|
32
|
-
kreuzberg-3.1.
|
33
|
-
kreuzberg-3.1.
|
29
|
+
kreuzberg-3.1.5.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
30
|
+
kreuzberg-3.1.5.dist-info/METADATA,sha256=nvd68yES8u37CwDxpNLwwZKmRHKqsqCPLx0MqRZHFjo,6641
|
31
|
+
kreuzberg-3.1.5.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
|
32
|
+
kreuzberg-3.1.5.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
33
|
+
kreuzberg-3.1.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|