PyPI - deepdoctection - Versions diffs - 0.30__py3-none-any.whl → 0.31__py3-none-any.whl - Mend

deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (74) hide show

deepdoctection/__init__.py +4 -2
deepdoctection/analyzer/dd.py +6 -5
deepdoctection/dataflow/base.py +0 -19
deepdoctection/dataflow/custom.py +4 -3
deepdoctection/dataflow/custom_serialize.py +14 -5
deepdoctection/dataflow/parallel_map.py +12 -11
deepdoctection/dataflow/serialize.py +5 -4
deepdoctection/datapoint/annotation.py +33 -12
deepdoctection/datapoint/box.py +1 -4
deepdoctection/datapoint/convert.py +3 -1
deepdoctection/datapoint/image.py +66 -29
deepdoctection/datapoint/view.py +57 -25
deepdoctection/datasets/adapter.py +1 -1
deepdoctection/datasets/base.py +83 -10
deepdoctection/datasets/dataflow_builder.py +1 -1
deepdoctection/datasets/info.py +2 -2
deepdoctection/datasets/instances/layouttest.py +2 -7
deepdoctection/eval/accmetric.py +1 -1
deepdoctection/eval/base.py +5 -4
deepdoctection/eval/eval.py +2 -2
deepdoctection/eval/tp_eval_callback.py +5 -4
deepdoctection/extern/base.py +39 -13
deepdoctection/extern/d2detect.py +164 -64
deepdoctection/extern/deskew.py +32 -7
deepdoctection/extern/doctrocr.py +227 -39
deepdoctection/extern/fastlang.py +45 -7
deepdoctection/extern/hfdetr.py +90 -33
deepdoctection/extern/hflayoutlm.py +109 -22
deepdoctection/extern/pdftext.py +2 -1
deepdoctection/extern/pt/ptutils.py +3 -2
deepdoctection/extern/tessocr.py +134 -22
deepdoctection/extern/texocr.py +2 -0
deepdoctection/extern/tp/tpcompat.py +4 -4
deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
deepdoctection/extern/tpdetect.py +50 -23
deepdoctection/mapper/d2struct.py +1 -1
deepdoctection/mapper/hfstruct.py +1 -1
deepdoctection/mapper/laylmstruct.py +1 -1
deepdoctection/mapper/maputils.py +13 -2
deepdoctection/mapper/prodigystruct.py +1 -1
deepdoctection/mapper/pubstruct.py +10 -10
deepdoctection/mapper/tpstruct.py +1 -1
deepdoctection/pipe/anngen.py +35 -8
deepdoctection/pipe/base.py +53 -19
deepdoctection/pipe/cell.py +29 -8
deepdoctection/pipe/common.py +12 -4
deepdoctection/pipe/doctectionpipe.py +2 -2
deepdoctection/pipe/language.py +3 -2
deepdoctection/pipe/layout.py +3 -2
deepdoctection/pipe/lm.py +2 -2
deepdoctection/pipe/refine.py +18 -10
deepdoctection/pipe/segment.py +21 -16
deepdoctection/pipe/text.py +14 -8
deepdoctection/pipe/transform.py +16 -9
deepdoctection/train/d2_frcnn_train.py +15 -12
deepdoctection/train/hf_detr_train.py +8 -6
deepdoctection/train/hf_layoutlm_train.py +16 -11
deepdoctection/utils/__init__.py +3 -0
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +2 -2
deepdoctection/utils/env_info.py +55 -22
deepdoctection/utils/error.py +84 -0
deepdoctection/utils/file_utils.py +4 -15
deepdoctection/utils/fs.py +7 -7
deepdoctection/utils/pdf_utils.py +5 -4
deepdoctection/utils/settings.py +5 -1
deepdoctection/utils/transform.py +1 -1
deepdoctection/utils/utils.py +0 -6
deepdoctection/utils/viz.py +44 -2
{deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
{deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
{deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
{deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
{deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0

deepdoctection/utils/env_info.py CHANGED Viewed

@@ -53,7 +53,7 @@ import re
 import subprocess
 import sys
 from collections import defaultdict
-from typing import List, Optional, Tuple
+from typing import List, Literal, Optional, Tuple
 import numpy as np
 from tabulate import tabulate
@@ -420,7 +420,7 @@ def collect_env_info() -> str:
     try:
         import prctl  # type: ignore
-        _ = prctl.set_pdeathsig  # noqa
+        _ = prctl.set_pdeathsig  # pylint: disable=E1101
     except ModuleNotFoundError:
         has_prctl = False
     data.append(("python-prctl", str(has_prctl)))
@@ -452,6 +452,20 @@ def collect_env_info() -> str:
     return env_str
+def set_env(name: str, value: str) -> None:
+    """
+    Set an environment variable if it is not already set.
+    :param name: The name of the environment variable
+    :param value: The value of the environment variable
+    """
+    if os.environ.get(name):
+        return
+    os.environ[name] = value
+    return
 def auto_select_lib_and_device() -> None:
     """
     Select the DL library and subsequently the device.
@@ -461,41 +475,60 @@ def auto_select_lib_and_device() -> None:
     is not installed raise ImportError.
     """
+    # USE_TF and USE_TORCH are env variables that steer DL library selection for Doctr.
     if tf_available() and tensorpack_available():
         from tensorpack.utils.gpu import get_num_gpu  # pylint: disable=E0401
         if get_num_gpu() >= 1:
-            os.environ["USE_TENSORFLOW"] = "True"
-            os.environ["USE_PYTORCH"] = "False"
-            os.environ["USE_CUDA"] = "True"
-            os.environ["USE_MPS"] = "False"
+            set_env("USE_TENSORFLOW", "True")
+            set_env("USE_PYTORCH", "False")
+            set_env("USE_CUDA", "True")
+            set_env("USE_MPS", "False")
+            set_env("USE_TF", "TRUE")
+            set_env("USE_TORCH", "False")
             return
         if pytorch_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "False"
+            set_env("USE_TENSORFLOW", "False")
+            set_env("USE_PYTORCH", "True")
+            set_env("USE_CUDA", "False")
+            set_env("USE_TF", "False")
+            set_env("USE_TORCH", "TRUE")
             return
         logger.warning(
             LoggingRecord("You have Tensorflow installed but no GPU is available. All Tensorflow models require a GPU.")
         )
+    if tf_available():
+        set_env("USE_TENSORFLOW", "False")
+        set_env("USE_PYTORCH", "False")
+        set_env("USE_CUDA", "False")
+        set_env("USE_TF", "AUTO")
+        set_env("USE_TORCH", "AUTO")
+        return
     if pytorch_available():
         import torch
         if torch.cuda.is_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "True"
+            set_env("USE_TENSORFLOW", "False")
+            set_env("USE_PYTORCH", "True")
+            set_env("USE_CUDA", "True")
+            set_env("USE_TF", "False")
+            set_env("USE_TORCH", "TRUE")
             return
         if torch.backends.mps.is_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "False"
-            os.environ["USE_MPS"] = "True"
+            set_env("USE_TENSORFLOW", "False")
+            set_env("USE_PYTORCH", "True")
+            set_env("USE_CUDA", "False")
+            set_env("USE_MPS", "True")
+            set_env("USE_TF", "False")
+            set_env("USE_TORCH", "TRUE")
             return
-        os.environ["USE_TENSORFLOW"] = "False"
-        os.environ["USE_PYTORCH"] = "True"
-        os.environ["USE_CUDA"] = "False"
-        os.environ["USE_MPS"] = "False"
+        set_env("USE_TENSORFLOW", "False")
+        set_env("USE_PYTORCH", "True")
+        set_env("USE_CUDA", "False")
+        set_env("USE_MPS", "False")
+        set_env("USE_TF", "AUTO")
+        set_env("USE_TORCH", "AUTO")
         return
     logger.warning(
         LoggingRecord(
@@ -505,7 +538,7 @@ def auto_select_lib_and_device() -> None:
     )
-def get_device(ignore_cpu: bool = True) -> str:
+def get_device(ignore_cpu: bool = True) -> Literal["cuda", "mps", "cpu"]:
     """
     Device checks for running PyTorch with CUDA, MPS or optionall CPU.
     If nothing can be found and if `disable_cpu` is deactivated it will raise a `ValueError`
@@ -520,7 +553,7 @@ def get_device(ignore_cpu: bool = True) -> str:
         return "mps"
     if not ignore_cpu:
         return "cpu"
-    raise ValueError("Could not find either GPU nor MPS")
+    raise RuntimeWarning("Could not find either GPU nor MPS")
 def auto_select_viz_library() -> None:

deepdoctection/utils/error.py ADDED Viewed

@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# File: error.py
+# Copyright 2024 Dr. Janis Meyer. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Module for custom exceptions
+"""
+class BoundingBoxError(BaseException):
+    """Special exception only for `datapoint.box.BoundingBox`"""
+class AnnotationError(BaseException):
+    """Special exception only for `datapoint.annotation.Annotation`"""
+class ImageError(BaseException):
+    """Special exception only for `datapoint.image.Image`"""
+class UUIDError(BaseException):
+    """Special exception only for `utils.identifier`"""
+class DependencyError(BaseException):
+    """Special exception only for missing dependencies. We do not use the internals ImportError or
+    ModuleNotFoundError."""
+class DataFlowTerminatedError(BaseException):
+    """
+    An exception indicating that the DataFlow is unable to produce any more
+    data, i.e. something wrong happened so that calling `__iter__`
+    cannot give a valid iterator anymore.
+    In most DataFlow this will never be raised.
+    """
+class DataFlowResetStateNotCalledError(BaseException):
+    """
+    An exception indicating that `reset_state()` has not been called before starting
+    iteration.
+    """
+    def __init__(self) -> None:
+        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
+class MalformedData(BaseException):
+    """
+    Exception class for malformed data. Use this class if something does not look right with the data
+    """
+class FileExtensionError(BaseException):
+    """
+    Exception class for wrong file extensions.
+    """
+class TesseractError(RuntimeError):
+    """
+    Tesseract Error
+    """
+    def __init__(self, status: int, message: str) -> None:
+        super().__init__()
+        self.status = status
+        self.message = message
+        self.args = (status, message)

deepdoctection/utils/file_utils.py CHANGED Viewed

@@ -22,6 +22,7 @@ import importlib_metadata
 from packaging import version
 from .detection_types import Requirement
+from .error import DependencyError
 from .logger import LoggingRecord, logger
 from .metacfg import AttrDict
@@ -263,7 +264,7 @@ def set_tesseract_path(tesseract_path: str) -> None:
     :param tesseract_path: Tesseract installation path.
     """
     if tesseract_path is None:
-        raise ValueError("tesseract_path is empty.")
+        raise TypeError("tesseract_path cannot be None")
     global _TESS_AVAILABLE  # pylint: disable=W0603
     global _TESS_PATH  # pylint: disable=W0603
@@ -288,12 +289,6 @@ def tesseract_available() -> bool:
 # copy paste from https://github.com/madmaze/pytesseract/blob/master/pytesseract/pytesseract.py
-class TesseractNotFound(BaseException):
-    """
-    Exception class for Tesseract being not found
-    """
 def get_tesseract_version() -> Union[int, version.Version]:
     """
     Returns Version object of the Tesseract version. We need at least Tesseract 3.05
@@ -306,7 +301,7 @@ def get_tesseract_version() -> Union[int, version.Version]:
             stdin=subprocess.DEVNULL,
         )
     except OSError:
-        raise TesseractNotFound(_TESS_ERR_MSG) from OSError
+        raise DependencyError(_TESS_ERR_MSG) from OSError
     raw_version = output.decode("utf-8")
     str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(" ")
@@ -348,12 +343,6 @@ def pdf_to_cairo_available() -> bool:
     return bool(_PDF_TO_CAIRO_AVAILABLE)
-class PopplerNotFound(BaseException):
-    """
-    Exception class for Poppler being not found
-    """
 def get_poppler_version() -> Union[int, version.Version]:
     """
     Returns Version object of the Poppler version. We need at least Tesseract 3.05
@@ -371,7 +360,7 @@ def get_poppler_version() -> Union[int, version.Version]:
             [command, "-v"], stderr=subprocess.STDOUT, env=environ, stdin=subprocess.DEVNULL
         )
     except OSError:
-        raise PopplerNotFound() from OSError
+        raise DependencyError(_POPPLER_ERR_MSG) from OSError
     raw_version = output.decode("utf-8")
     list_version = raw_version.split("\n", maxsplit=1)[0].split(" ")[-1].split(".")

deepdoctection/utils/fs.py CHANGED Viewed

@@ -34,7 +34,7 @@ from .logger import LoggingRecord, logger
 from .pdf_utils import get_pdf_file_reader, get_pdf_file_writer
 from .settings import CONFIGS, DATASET_DIR, MODEL_DIR, PATH
 from .tqdm import get_tqdm
-from .utils import FileExtensionError, is_file_extension
+from .utils import is_file_extension
 from .viz import viz_handler
 __all__ = [
@@ -44,9 +44,7 @@ __all__ = [
     "maybe_path_or_pdf",
     "download",
     "mkdir_p",
-    "is_file_extension",
     "load_json",
-    "FileExtensionError",
     "sub_path",
     "get_package_path",
     "get_configs_dir_path",
@@ -125,8 +123,8 @@ def download(url: str, directory: Pathlike, file_name: Optional[str] = None, exp
     assert size > 0, f"Downloaded an empty file from {url}!"
     if expect_size is not None and size != expect_size:
-        logger.error(LoggingRecord(f"File downloaded from {url} does not match the expected size!"))
-        logger.error(
+        logger.warning(LoggingRecord(f"File downloaded from {url} does not match the expected size!"))
+        logger.warning(
             LoggingRecord("You may have downloaded a broken file, or the upstream may have modified the file.")
         )
@@ -210,13 +208,15 @@ def get_load_image_func(
     :return: The function loading the file (and converting to its desired format)
     """
-    assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: {path}"
+    assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: " f"{path}"
     if is_file_extension(path, [".png", ".jpeg", ".jpg", ".tif"]):
         return load_image_from_file
     if is_file_extension(path, [".pdf"]):
         return load_bytes_from_pdf_file
-    return NotImplemented
+    raise NotImplementedError(
+        "File extension not supported by any loader. Please specify a file type and raise an issue"
+    )
 def maybe_path_or_pdf(path: Pathlike) -> int:

deepdoctection/utils/pdf_utils.py CHANGED Viewed

@@ -32,9 +32,10 @@ from pypdf import PdfReader, PdfWriter, errors
 from .context import save_tmp_file, timeout_manager
 from .detection_types import ImageType, Pathlike
-from .file_utils import PopplerNotFound, pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
+from .error import DependencyError, FileExtensionError
+from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
 from .logger import LoggingRecord, logger
-from .utils import FileExtensionError, is_file_extension
+from .utils import is_file_extension
 from .viz import viz_handler
 __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
@@ -165,7 +166,7 @@ def _input_to_cli_str(
     elif pdf_to_cairo_available():
         command = "pdftocairo"
     else:
-        raise PopplerNotFound("Poppler not found. Please install or add to your PATH.")
+        raise DependencyError("Poppler not found. Please install or add to your PATH.")
     if platform.system() == "Windows":
         command = command + ".exe"
@@ -201,7 +202,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
     except OSError as error:
         if error.errno != ENOENT:
             raise error from error
-        raise PopplerNotFound("Poppler not found. Please install or add to your PATH.") from error
+        raise DependencyError("Poppler not found. Please install or add to your PATH.") from error
     with timeout_manager(proc, 0):
         if proc.returncode:

deepdoctection/utils/settings.py CHANGED Viewed

@@ -65,6 +65,7 @@ class PageType(ObjectTypes):
     document_type = "document_type"
     language = "language"
+    angle = "angle"
 @object_types_registry.register("SummaryType")
@@ -125,6 +126,7 @@ class LayoutType(ObjectTypes):
     column = "column"
     word = "word"
     line = "line"
+    background = "background"
 @object_types_registry.register("TableType")
@@ -324,7 +326,9 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes
     """
     if isinstance(token, TokenClasses) and isinstance(tag, BioTag):
         return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
-    raise TypeError("Token must be of type TokenClasses and tag must be of type BioTag")
+    raise TypeError(
+        f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
+    )
 def token_class_with_tag_to_token_class_and_tag(

deepdoctection/utils/transform.py CHANGED Viewed

@@ -47,7 +47,7 @@ class BaseTransform(ABC):
     @abstractmethod
     def apply_image(self, img: ImageType) -> ImageType:
         """The transformation that should be applied to the image"""
-        raise NotImplementedError
+        raise NotImplementedError()
 class ResizeTransform(BaseTransform):

deepdoctection/utils/utils.py CHANGED Viewed

@@ -144,12 +144,6 @@ def get_rng(obj: Any = None) -> np.random.RandomState:
     return np.random.RandomState(seed)
-class FileExtensionError(BaseException):
-    """
-    An exception indicating that a file does not seem to have an expected type
-    """
 def is_file_extension(file_name: Pathlike, extension: Union[str, Sequence[str]]) -> bool:
     """
     Check if a given file name has a given extension

deepdoctection/utils/viz.py CHANGED Viewed

@@ -38,6 +38,7 @@ from numpy import float32, uint8
 from .detection_types import ImageType
 from .env_info import auto_select_viz_library
+from .error import DependencyError
 from .file_utils import get_opencv_requirement, get_pillow_requirement, opencv_available, pillow_available
 if opencv_available():
@@ -307,6 +308,7 @@ class VizPackageHandler:
             "draw_text": "_cv2_draw_text",
             "interactive_imshow": "_cv2_interactive_imshow",
             "encode": "_cv2_encode",
+            "rotate_image": "_cv2_rotate_image",
         },
         "pillow": {
             "read_image": "_pillow_read_image",
@@ -319,6 +321,7 @@ class VizPackageHandler:
             "draw_text": "_pillow_draw_text",
             "interactive_imshow": "_pillow_interactive_imshow",
             "encode": "_pillow_encode",
+            "rotate_image": "_pillow_rotate_image",
         },
     }
@@ -352,12 +355,12 @@ class VizPackageHandler:
         if maybe_cv2:
             requirements = get_opencv_requirement()
             if not requirements[1]:
-                raise ImportError(requirements[2])
+                raise DependencyError(requirements[2])
             return maybe_cv2
         requirements = get_pillow_requirement()
         if not requirements[1]:
-            raise ImportError(requirements[2])
+            raise DependencyError(requirements[2])
         return "pillow"
     def _set_vars(self, package: str) -> None:
@@ -690,6 +693,45 @@ class VizPackageHandler:
         pil_image = Image.fromarray(np.uint8(np_image[:, :, ::-1]))
         pil_image.show(name)
+    def rotate_image(self, np_image: ImageType, angle: int) -> ImageType:
+        """Rotating an image by some angle"""
+        return getattr(self, self.pkg_func_dict["rotate_image"])(np_image, angle)
+    @staticmethod
+    def _cv2_rotate_image(np_image: ImageType, angle: float) -> ImageType:
+        # copy & paste from https://stackoverflow.com/questions/43892506
+        # /opencv-python-rotate-image-without-cropping-sides
+        height, width = np_image.shape[:2]
+        image_center = (width / 2, height / 2)
+        rotation_mat = cv2.getRotationMatrix2D(center=image_center, angle=angle, scale=1.0)
+        # rotation calculates the cos and sin, taking absolutes of those.
+        abs_cos = abs(rotation_mat[0, 0])
+        abs_sin = abs(rotation_mat[0, 1])
+        # find the new width and height bounds
+        bound_w = int(height * abs_sin + width * abs_cos)
+        bound_h = int(height * abs_cos + width * abs_sin)
+        # subtract old image center (bringing image back to origo) and adding the new image center coordinates
+        rotation_mat[0, 2] += bound_w / 2 - image_center[0]
+        rotation_mat[1, 2] += bound_h / 2 - image_center[1]
+        np_image = cv2.warpAffine(  # type: ignore
+            src=np_image,
+            M=rotation_mat,
+            dsize=(bound_w, bound_h),
+        )
+        return np_image
+    @staticmethod
+    def _pillow_rotate_image(np_image: ImageType, angle: int) -> ImageType:
+        pil_image = Image.fromarray(np.uint8(np_image[:, :, ::-1]))
+        pil_image_rotated = pil_image.rotate(angle, expand=True)
+        return np.array(pil_image_rotated)[:, :, ::-1]
 auto_select_viz_library()
 viz_handler = VizPackageHandler()

deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl