PyPI - kreuzberg - Versions diffs - 3.1.2__py3-none-any.whl → 3.1.4__py3-none-any.whl - Mend

kreuzberg 3.1.2__py3-none-any.whl → 3.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -227,14 +227,54 @@ class PandocExtractor(Extractor):
             command = ["pandoc", "--version"]
             result = await run_process(command)
+            stdout = result.stdout.decode()
-            version_match = re.search(r"pandoc\s+v?(\d+)\.\d+(\.\d+)?", result.stdout.decode())
-            if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
-                raise MissingDependencyError(
-                    "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
-                )
+            # Try more inclusive patterns to detect the pandoc version
+            # Try common formats first
+            version_match = re.search(
+                r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
+            )
+            # Try version in parentheses format
+            if not version_match:
+                version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
+            # Try hyphenated format
+            if not version_match:
+                version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
+            # If still no match, check for version at the beginning of the output or any line
+            if not version_match:
+                # Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
+                version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
+            # Try finding version-like patterns elsewhere in the text
+            if not version_match:
+                # Search for version-like patterns at the beginning of lines or after spaces
+                version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
+            # As a last resort, check any sequence of digits that might be a version
+            if not version_match:
+                out_lines = stdout.splitlines()
+                for line in out_lines:
+                    for token in line.split():
+                        # Match standalone version patterns like 2.11 or 2.11.4
+                        version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
+                        if version_pattern:
+                            version_match = version_pattern
+                            break
+                    if version_match:
+                        break
+            # If we found a version, check that the major version is at least the minimum required
+            if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
+                self._checked_version = True
+                return
-            self._checked_version = True
+            # If we get here, we either didn't find a version or it's too low
+            raise MissingDependencyError(
+                "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            )
         except FileNotFoundError as e:
             raise MissingDependencyError(

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -127,7 +127,8 @@ class EasyOCRConfig:
     height_ths: float = 0.5
     """Maximum difference in box height for merging."""
     language: str | list[str] = "en"
-    """Language or languages to use for OCR."""
+    """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
+    a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
     link_threshold: float = 0.4
     """Link confidence threshold."""
     low_text: float = 0.4
@@ -170,6 +171,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
+        import numpy as np
         await self._init_easyocr(**kwargs)
         beam_width = kwargs.pop("beam_width")
@@ -180,7 +183,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         try:
             result = await run_sync(
                 self._reader.readtext,
-                image.tobytes(),
+                np.array(image),
                 beamWidth=beam_width,
                 **kwargs,
             )
@@ -352,29 +355,32 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @staticmethod
     def _validate_language_code(language_codes: str | list[str]) -> list[str]:
-        """Validate and normalize a provided language code.
+        """Validate and normalize provided language codes.
         Args:
-            language_codes: The language code string.
+            language_codes: The language code(s), either as a string (single or comma-separated) or a list.
         Raises:
-            ValidationError: If the language is not supported by EasyOCR
+            ValidationError: If any of the languages are not supported by EasyOCR
         Returns:
-            A list with the normalized language code.
+            A list with the normalized language codes.
         """
-        if not isinstance(language_codes, list):
-            languages = [language_codes.lower()]
+        if isinstance(language_codes, str):
+            # Handle comma-separated language codes
+            languages = [lang.strip().lower() for lang in language_codes.split(",")]
         else:
+            # Handle list of language codes
             languages = [lang.lower() for lang in language_codes]
-        if all(lang in EASYOCR_SUPPORTED_LANGUAGE_CODES for lang in languages):
-            return languages
+        unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
+        if unsupported_langs:
+            raise ValidationError(
+                "The provided language codes are not supported by EasyOCR",
+                context={
+                    "language_code": ",".join(unsupported_langs),
+                    "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
+                },
+            )
-        raise ValidationError(
-            "The provided language codes are not supported by EasyOCR",
-            context={
-                "language_code": ",".join([lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]),
-                "supported_languages": ",".join(sorted(EASYOCR_SUPPORTED_LANGUAGE_CODES)),
-            },
-        )
+        return languages

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -52,6 +52,8 @@ class PaddleOCRConfig:
     """Binarization threshold for EAST output map."""
     det_max_side_len: int = 960
     """Maximum size of image long side. Images exceeding this will be proportionally resized."""
+    det_model_dir: str | None = None
+    """Directory for detection model. If None, uses default model location."""
     drop_score: float = 0.5
     """Filter recognition results by confidence score. Results below this are discarded."""
     enable_mkldnn: bool = False
@@ -82,6 +84,8 @@ class PaddleOCRConfig:
     """Recognition algorithm."""
     rec_image_shape: str = "3,32,320"
     """Image shape for recognition algorithm in format 'channels,height,width'."""
+    rec_model_dir: str | None = None
+    """Directory for recognition model. If None, uses default model location."""
     table: bool = True
     """Whether to enable table recognition."""
     use_angle_cls: bool = True

{kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.1.2
+Version: 3.1.4
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -27,19 +27,19 @@ License-File: LICENSE
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
-Requires-Dist: html-to-markdown>=1.3.0
+Requires-Dist: html-to-markdown>=1.3.2
 Requires-Dist: playa-pdf>=0.4.1
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.13.1; python_version < "3.12"
+Requires-Dist: typing-extensions>=4.13.2; python_version < "3.12"
 Provides-Extra: all
 Requires-Dist: easyocr>=1.7.2; extra == "all"
 Requires-Dist: gmft>=0.4.1; extra == "all"
 Requires-Dist: paddleocr>=2.10.0; extra == "all"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
 Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
-Requires-Dist: setuptools>=76.0.0; extra == "all"
+Requires-Dist: setuptools>=79.0.1; extra == "all"
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
 Provides-Extra: easyocr
@@ -49,7 +49,7 @@ Requires-Dist: gmft>=0.4.1; extra == "gmft"
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
 Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
-Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
+Requires-Dist: setuptools>=79.0.1; extra == "paddleocr"
 Dynamic: license-file
 # Kreuzberg

{kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/RECORD RENAMED Viewed

@@ -13,21 +13,21 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
 kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
 kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
 kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
-kreuzberg/_extractors/_pandoc.py,sha256=U5CUrVilKdJqXJFUUT5xzcpy2jfJ26h7kde3p1N_N4w,20124
+kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
 kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
 kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
 kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
 kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
 kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
-kreuzberg/_ocr/_easyocr.py,sha256=IUX5AGMp3C2u3Byiz8BADLMlgoNEpFhwswmdeifMcIo,11112
-kreuzberg/_ocr/_paddleocr.py,sha256=NDKXiMtHjIy-Uq4hXe4qm5oUWwOrhjJaibyC708Cw5E,10422
+kreuzberg/_ocr/_easyocr.py,sha256=J8IP2Fg55dG2MH9-lpyZFounvgIgWgCrw694UkaUa9E,11491
+kreuzberg/_ocr/_paddleocr.py,sha256=FyALVb3AQFcej9NFOLy-0dkA-3uxE_ie9Mzd6ho3t68,10656
 kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
 kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
 kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
 kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
-kreuzberg-3.1.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.1.2.dist-info/METADATA,sha256=7cx9eSl0NAfeu18rvYT4BtwVdVOA1ZgInDx8KcpXlw8,6641
-kreuzberg-3.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-kreuzberg-3.1.2.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-3.1.2.dist-info/RECORD,,
+kreuzberg-3.1.4.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.1.4.dist-info/METADATA,sha256=HE3cHXQLweGIUxHsAlcd0h7jY-V1_j9eeBABnkrcW0g,6641
+kreuzberg-3.1.4.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+kreuzberg-3.1.4.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-3.1.4.dist-info/RECORD,,

{kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (79.0.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

{kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{kreuzberg-3.1.2.dist-info → kreuzberg-3.1.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 3.1.2__py3-none-any.whl → 3.1.4__py3-none-any.whl

kreuzberg 3.1.2__py3-none-any.whl → 3.1.4__py3-none-any.whl