deepdoctection 0.29-py3-none-any.whl → 0.31-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +6 -2
- deepdoctection/analyzer/dd.py +13 -8
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +6 -5
- deepdoctection/dataflow/custom_serialize.py +20 -5
- deepdoctection/dataflow/parallel_map.py +22 -17
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/dataflow/stats.py +5 -5
- deepdoctection/datapoint/annotation.py +35 -14
- deepdoctection/datapoint/box.py +9 -6
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +66 -29
- deepdoctection/datapoint/view.py +62 -24
- deepdoctection/datasets/adapter.py +4 -5
- deepdoctection/datasets/base.py +87 -14
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/fintabnet.py +3 -3
- deepdoctection/datasets/instances/layouttest.py +2 -7
- deepdoctection/datasets/instances/pubtabnet.py +3 -3
- deepdoctection/eval/accmetric.py +7 -5
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/eval.py +9 -7
- deepdoctection/eval/tedsmetric.py +9 -3
- deepdoctection/eval/tp_eval_callback.py +8 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +164 -64
- deepdoctection/extern/deskew.py +32 -7
- deepdoctection/extern/doctrocr.py +268 -29
- deepdoctection/extern/fastlang.py +45 -7
- deepdoctection/extern/hfdetr.py +90 -33
- deepdoctection/extern/hflayoutlm.py +109 -22
- deepdoctection/extern/model.py +30 -11
- deepdoctection/extern/pdftext.py +2 -1
- deepdoctection/extern/pt/ptutils.py +3 -2
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +4 -2
- deepdoctection/extern/tp/tpcompat.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
- deepdoctection/extern/tpdetect.py +50 -23
- deepdoctection/mapper/d2struct.py +1 -1
- deepdoctection/mapper/hfstruct.py +1 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/maputils.py +19 -5
- deepdoctection/mapper/prodigystruct.py +15 -13
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/cell.py +29 -8
- deepdoctection/pipe/common.py +12 -4
- deepdoctection/pipe/doctectionpipe.py +4 -3
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +3 -2
- deepdoctection/pipe/lm.py +2 -2
- deepdoctection/pipe/order.py +67 -39
- deepdoctection/pipe/refine.py +18 -10
- deepdoctection/pipe/segment.py +34 -20
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -8
- deepdoctection/train/d2_frcnn_train.py +17 -14
- deepdoctection/train/hf_detr_train.py +13 -9
- deepdoctection/train/hf_layoutlm_train.py +31 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +5 -5
- deepdoctection/utils/develop.py +2 -2
- deepdoctection/utils/env_info.py +64 -27
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +28 -17
- deepdoctection/utils/fs.py +16 -14
- deepdoctection/utils/logger.py +43 -19
- deepdoctection/utils/pdf_utils.py +14 -7
- deepdoctection/utils/settings.py +5 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +83 -14
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/METADATA +39 -61
- deepdoctection-0.31.dist-info/RECORD +144 -0
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
- deepdoctection-0.29.dist-info/RECORD +0 -143
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
@@ -27,7 +27,7 @@ from .utils.logger import logger

 # pylint: enable=wrong-import-position

-__version__ = 0.29
+__version__ = 0.31

 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,6 +179,7 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
         "HFLayoutLmTokenClassifierBase",
@@ -194,6 +195,7 @@ _IMPORT_STRUCTURE = {
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -279,7 +281,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -343,6 +345,8 @@ _IMPORT_STRUCTURE = {
         "get_opencv_requirement",
         "pillow_available",
         "get_pillow_requirement",
+        "spacy_available",
+        "get_spacy_requirement",
         "load_image_from_file",
         "load_bytes_from_pdf_file",
         "get_load_image_func",
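Note: `_IMPORT_STRUCTURE` drives the package's lazy top-level exports, so the names added above should become reachable from the package root in 0.31. A minimal sketch, assuming the lazy-import machinery re-exports these names unchanged:

    import deepdoctection as dd

    # New in 0.31: attribute access triggers the lazy import of the owning module.
    transform_service_cls = dd.SimpleTransformService
    # New availability helpers for the spaCy integration:
    spacy_installed = dd.spacy_available()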
deepdoctection/analyzer/dd.py
CHANGED
@@ -54,7 +54,7 @@ from ..utils.file_utils import (
     tf_available,
 )
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
@@ -113,11 +113,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.
-
-
-
-
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )


 def build_detector(
@@ -231,9 +232,13 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
         weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
         profile = ModelCatalog.get_profile(weights)
+        # get_full_path_configs will complete the path even if the model is not registered
+        config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
         if profile.architecture is None:
             raise ValueError("model profile.architecture must be specified")
-        return DoctrTextRecognizer(
+        return DoctrTextRecognizer(
+            profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
+        )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
             "aws_access_key_id": environ.get("ACCESS_KEY"),
@@ -445,7 +450,7 @@ def get_dd_analyzer(
     cfg.update_args(config_overwrite)

     config_sanity_checks(cfg)
-    logger.info("Config: \n
+    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

     # will silent all TP logging while building the tower
     if tensorpack_available():
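Note: the new sanity check counts boolean config flags by integer addition (`bool` is an `int` subclass), enforcing that exactly one OCR backend is active. A standalone sketch of the pattern, with hypothetical flags:

    def exactly_one_enabled(*flags: bool) -> bool:
        # True sums as 1, False as 0, so the sum counts enabled flags
        return sum(flags) == 1

    assert exactly_one_enabled(True, False, False)
    assert not exactly_one_enabled(True, True, False)    # two backends -> invalid
    assert not exactly_one_enabled(False, False, False)  # none -> invalid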
deepdoctection/dataflow/base.py
CHANGED
@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng


-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.
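Note: the two exceptions removed here reappear in the new `deepdoctection/utils/error.py` (+84 lines in the file list) under the spellings used throughout this diff, i.e. with an `Error` suffix. Downstream `except` clauses need the new import path; a sketch:

    # 0.29 (old location and spelling):
    # from deepdoctection.dataflow.base import DataFlowResetStateNotCalled
    # 0.31 (new location and spelling):
    from deepdoctection.utils.error import DataFlowResetStateNotCalledError, DataFlowTerminatedError

    def first_datapoint(df):
        try:
            return next(iter(df))
        except DataFlowResetStateNotCalledError:
            df.reset_state()  # the error message says: call reset_state() first
            return next(iter(df))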
deepdoctection/dataflow/custom.py
CHANGED

@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional

 import numpy as np

-from ..utils.
+from ..utils.error import DataFlowResetStateNotCalledError
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard,
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList

 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):

     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise
+            raise DataFlowResetStateNotCalledError()

         with self._guard:
             if self.buffer:
@@ -139,10 +140,10 @@ class CustomDataFromList(DataFromList):

     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
-            logger.info("subset size after re-balancing:
+            logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))
         else:
             lst_tmp = self.lst
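Note: iterating any of these dataflows before `reset_state()` now raises the dedicated error instead of a bare `raise`. Usage sketch (the `deepdoctection.dataflow` re-export and constructor arguments beyond `lst` are assumptions):

    from deepdoctection.dataflow import CustomDataFromList

    df = CustomDataFromList(lst=[{"page": 1}, {"page": 2}])
    # Iterating before reset_state() raises DataFlowResetStateNotCalledError
    df.reset_state()  # seeds df.rng
    for datapoint in df:
        print(datapoint)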
deepdoctection/dataflow/custom_serialize.py
CHANGED

@@ -23,16 +23,20 @@ import itertools
 import json
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union

 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored

 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -186,6 +190,11 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow

+        if isinstance(path, str):
+            path = Path(path)
+        if not path.exists():
+            raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
+
         if shuffle:
             sort = False
         it1 = os.walk(path, topdown=False)
@@ -217,7 +226,7 @@ class SerializerFiles:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class CocoParser:
@@ -277,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
        """
+        rows = []
         for key, value in self.dataset["info"].items():
-
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))

     def get_ann_ids(
         self,
@@ -493,7 +508,7 @@ class SerializerCoco:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class SerializerPdfDoc:
@@ -541,7 +556,7 @@ class SerializerPdfDoc:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
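Note: `CocoParser.info` now renders the COCO `info` block as a colored table. The rendering, extracted as a standalone sketch (`info` stands in for `self.dataset["info"]`):

    from tabulate import tabulate
    from termcolor import colored

    info = {"description": "sample COCO file", "version": "1.0"}  # stand-in data
    rows = [[key, value] for key, value in info.items()]
    table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
    print(colored(table, "cyan"))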
deepdoctection/dataflow/parallel_map.py
CHANGED

@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq

 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
-from ..utils.
-from .
+from ..utils.error import DataFlowTerminatedError
+from ..utils.logger import LoggingRecord, logger
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer

@@ -48,15 +49,15 @@ def _zmq_catch_error(name):
     try:
         yield
     except zmq.ContextTerminated as exc:
-        logger.info("[
-        raise
+        logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
-            logger.info("[
-            raise
-        raise ValueError from exc
+            logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc


 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()

     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()

     @no_type_check
     def _recv_filter_none(self):
@@ -312,7 +313,8 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
             for x in self._procs:
                 x.terminate()
                 x.join(5)
-            logger.info("
+            logger.info(LoggingRecord(f"_MultiProcessZMQDataFlow [{type(self).__name__}] successfully cleaned-up."))
+
         except Exception:  # pylint: disable=W0703
             pass

@@ -323,9 +325,12 @@ def _bind_guard(sock, name):
         sock.bind(name)
     except zmq.ZMQError:
         logger.error(
-
-
+            LoggingRecord(
+                f"ZMQError in socket.bind('{name}'). Perhaps you're using pipes on a non-local file system. "
+                "See documentation of MultiProcessRunnerZMQ for more information."
+            )
         )
+
         raise


@@ -394,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):

         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict
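Note: `_zmq_catch_error` translates low-level ZMQ shutdown conditions into the new `DataFlowTerminatedError`. Reconstructed as a standalone sketch from the hunk above (the `@contextmanager` decorator is assumed, as in the tensorpack original this module derives from; logging via `LoggingRecord` omitted for brevity):

    import errno
    from contextlib import contextmanager

    import zmq

    from deepdoctection.utils.error import DataFlowTerminatedError

    @contextmanager
    def zmq_catch_error(name):
        try:
            yield
        except zmq.ContextTerminated as exc:
            # context torn down: the dataflow cannot produce more data
            raise DataFlowTerminatedError() from exc
        except zmq.ZMQError as exc:
            if exc.errno == errno.ENOTSOCK:  # socket closed
                raise DataFlowTerminatedError() from exc
            raise ValueError() from exc
        except Exception as exc:
            raise ValueError() from exc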
deepdoctection/dataflow/serialize.py
CHANGED

@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union

 import numpy as np

-from .
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow


 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise
+            raise DataFlowResetStateNotCalledError()


 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):

     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len

     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):

     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.random:
             for _ in range(self._size):
                 val = []
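Note: `DataFromIterable` only knows its length when one could be inferred from the input; otherwise `__len__` now raises `NotImplementedError()` explicitly. Sketch (assuming `_len` is taken from `len(iterable)` when the input supports it):

    from deepdoctection.dataflow import DataFromIterable

    df = DataFromIterable([1, 2, 3])  # len() is available, so _len is set
    df.reset_state()
    assert len(df) == 3

    gen = DataFromIterable(x * x for x in range(3))  # generators have no len()
    gen.reset_state()
    # len(gen) raises NotImplementedError()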
deepdoctection/dataflow/stats.py
CHANGED
@@ -23,7 +23,7 @@ from typing import Any, Optional, Tuple, Union
 import numpy as np
 import numpy.typing as npt

-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from .base import DataFlow, ProxyDataFlow

@@ -95,7 +95,7 @@ class MeanFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)

-        logger.info("Calculating mean")
+        logger.info(LoggingRecord("Calculating mean"))

         len_df: Optional[int]
         try:
@@ -139,7 +139,7 @@ class MeanFromDataFlow(ProxyDataFlow):
             if n == self.max_datapoints:
                 break

-        logger.info("Mean from
+        logger.info(LoggingRecord(f"Mean from {n} datapoints along axis {self.axis}: {self.mean}"))

         return self.mean

@@ -216,7 +216,7 @@ class StdFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)

-        logger.info("Calculating standard deviation")
+        logger.info(LoggingRecord("Calculating standard deviation"))
         try:
             len_df = len(self.df)
         except NotImplementedError:
@@ -266,6 +266,6 @@ class StdFromDataFlow(ProxyDataFlow):
         var = (ex2 - (ex * ex) / n) / (n - 1)
         self.std = np.sqrt(var)

-        logger.info("Standard deviation from
+        logger.info(LoggingRecord(f"Standard deviation from {n} datapoints along axis {self.axis}: {self.std}"))

         return self.std
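Note: the closing hunk's `var = (ex2 - (ex * ex) / n) / (n - 1)` is the shifted-data single-pass variance estimator: with a shift constant K, ex = Σ(x−K) and ex2 = Σ(x−K)², the sample variance is (ex2 − ex²/n)/(n−1). A numeric check:

    import numpy as np

    x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
    K = x[0]  # any shift constant works; shifting improves numerical stability
    ex, ex2, n = np.sum(x - K), np.sum((x - K) ** 2), x.size
    var = (ex2 - (ex * ex) / n) / (n - 1)
    assert np.isclose(np.sqrt(var), np.std(x, ddof=1))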
deepdoctection/datapoint/annotation.py
CHANGED

@@ -24,8 +24,9 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union, no_type_check

 from ..utils.detection_types import JsonDict
+from ..utils.error import AnnotationError, UUIDError
 from ..utils.identifier import get_uuid, is_uuid_like
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DefaultType, ObjectTypes, SummaryType, TypeOrStr, get_type
 from .box import BoundingBox
 from .convert import as_dict
@@ -36,7 +37,16 @@ def ann_from_dict(cls, **kwargs):
     """
     A factory function to create subclasses of annotations from a given dict
     """
-
+    _init_kwargs = {
+        "external_id": kwargs.get("external_id"),
+        "category_name": kwargs.get("category_name"),
+        "category_id": kwargs.get("category_id"),
+        "score": kwargs.get("score"),
+        "service_id": kwargs.get("service_id"),
+        "model_id": kwargs.get("model_id"),
+        "session_id": kwargs.get("session_id"),
+    }
+    ann = cls(**_init_kwargs)
     ann.active = kwargs.get("active")
     ann._annotation_id = kwargs.get("_annotation_id")  # pylint: disable=W0212
     if isinstance(kwargs.get("sub_categories"), dict):
@@ -74,11 +84,17 @@ class Annotation(ABC):
     id will not depend on the defining attributes.

     `_annotation_id`: Unique id for annotations. Will always be given as string representation of a md5-hash.
+    `service_id`: Service that generated the annotation. This will be the name of a pipeline component
+    `model_id`: Model that generated the annotation. This will be the name of particular model
+    `session_id`: Session id for the annotation. This will be the id of the session in which the annotation was created.
     """

     active: bool = field(default=True, init=False, repr=True)
     external_id: Optional[Union[str, int]] = field(default=None, init=True, repr=False)
     _annotation_id: Optional[str] = field(default=None, init=False, repr=True)
+    service_id: Optional[str] = field(default=None)
+    model_id: Optional[str] = field(default=None)
+    session_id: Optional[str] = field(default=None)

     def __post_init__(self) -> None:
         """
@@ -101,7 +117,7 @@ class Annotation(ABC):
         """
         if self._annotation_id:
             return self._annotation_id
-        raise
+        raise AnnotationError("Dump annotation first or pass external_id to create an annotation id")

     @annotation_id.setter
     def annotation_id(self, input_id: str) -> None:
@@ -109,13 +125,13 @@ class Annotation(ABC):
         annotation_id setter
         """
         if self._annotation_id is not None:
-            raise
+            raise AnnotationError("Annotation_id already defined and cannot be reset")
         if is_uuid_like(input_id):
             self._annotation_id = input_id
         elif isinstance(input_id, property):
             pass
         else:
-            raise
+            raise AnnotationError("Annotation_id must be uuid3 string")

     @abstractmethod
     def get_defining_attributes(self) -> List[str]:
@@ -126,13 +142,13 @@ class Annotation(ABC):

         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     def _assert_attributes_have_str(self, state_id: bool = False) -> None:
         defining_attributes = self.get_state_attributes() if state_id else self.get_defining_attributes()
         for attr in defining_attributes:
             if not hasattr(eval("self." + attr), "__str__"):  # pylint: disable=W0123
-                raise
+                raise AnnotationError(f"Attribute {attr} must have __str__ method")

     @staticmethod
     def set_annotation_id(annotation: "CategoryAnnotation", *container_id_context: Optional[str]) -> str:
@@ -179,7 +195,7 @@ class Annotation(ABC):

         :return: Annotation instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @staticmethod
     @abstractmethod
@@ -189,7 +205,7 @@ class Annotation(ABC):

         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @property
     def state_id(self) -> str:
@@ -290,7 +306,12 @@ class CategoryAnnotation(Annotation):
         """

         if sub_category_name in self.sub_categories:
-            raise
+            raise AnnotationError(
+                f"sub category {sub_category_name} already defined: "
+                f"annotation_id: {self.annotation_id}, "
+                f"category_name: {self.category_name}, "
+                f"category_id: {self.category_id}"
+            )

         if self._annotation_id is not None:
             if annotation._annotation_id is None:  # pylint: disable=W0212
@@ -333,7 +354,7 @@ class CategoryAnnotation(Annotation):
         :param annotation_id: An annotation id
         """
         if not is_uuid_like(annotation_id):
-            raise
+            raise UUIDError("Annotation_id must be uuid")

         key_type = get_type(key)
         if key not in self.relationships:
@@ -369,7 +390,7 @@ class CategoryAnnotation(Annotation):
             try:
                 self.relationships[key].remove(ann_id)
             except ValueError:
-                logger.warning("Relationship
+                logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
         else:
             self.relationships[key].clear()

@@ -436,14 +457,14 @@ class ImageAnnotation(CategoryAnnotation):
         box = self.bounding_box
         if box:
             return box
-        raise
+        raise AnnotationError(f"bounding_box has not been initialized for {self.annotation_id}")

     def get_summary(self, key: ObjectTypes) -> CategoryAnnotation:
         """Get summary sub categories from `image`. Raises `ValueError` if `key` is not available"""
         if self.image:
             if self.image.summary:
                 return self.image.summary.get_sub_category(key)
-        raise
+        raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")


 @dataclass
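Note: the three new provenance fields are plain optional dataclass fields that `ann_from_dict` now round-trips. A construction sketch (treating `CategoryAnnotation`'s `category_name`/`category_id` keywords and the example string values as assumptions):

    from deepdoctection.datapoint.annotation import CategoryAnnotation

    ann = CategoryAnnotation(
        category_name="table",
        category_id="1",
        service_id="image_layout",  # pipeline component that produced the annotation
        model_id="layout_model",    # hypothetical model name
    )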
deepdoctection/datapoint/box.py
CHANGED
@@ -28,8 +28,9 @@ import numpy.typing as npt
 from numpy import float32

 from ..utils.detection_types import ImageType
+from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger

 if cocotools_available():
     import pycocotools.mask as coco_mask
@@ -140,10 +141,6 @@ def iou(boxes1: npt.NDArray[float32], boxes2: npt.NDArray[float32]) -> npt.NDArr
     return np_iou(boxes1, boxes2)


-class BoundingBoxError(BaseException):
-    """Special exception only for `BoundingBox`"""
-
-
 @dataclass
 class BoundingBox:
     """
@@ -558,6 +555,12 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
     :param boxes_2: sequence of n BoundingBox
     :return: list of at most mxn BoundingBox
     """
+    if not boxes_1 and boxes_2:
+        return boxes_2
+    if not boxes_2 and boxes_1:
+        return boxes_1
+    if not boxes_1 and not boxes_2:
+        return []
     if boxes_1[0].absolute_coords != boxes_2[0].absolute_coords:
         raise ValueError("absolute_coords of boxes_1 and boxes_2 mus be equal")
     absolute_coords = boxes_1[0].absolute_coords
@@ -596,6 +599,6 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
             "height": np_boxes_output[idx][3],
         }

-    logger.warning("intersection_boxes
+    logger.warning(LoggingRecord("intersection_boxes", log_dict))  # type: ignore

     return boxes_output
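Note: the new guards in `intersection_boxes` make empty inputs explicit; notably, when one side is empty the other side is returned unchanged rather than an empty intersection. Sketch (keyword construction of `BoundingBox` is an assumption):

    from deepdoctection.datapoint.box import BoundingBox, intersection_boxes

    box = BoundingBox(absolute_coords=True, ulx=0, uly=0, lrx=10, lry=10)
    assert intersection_boxes([], []) == []
    boxes = intersection_boxes([], [box])  # returns [box], not []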
deepdoctection/datapoint/convert.py
CHANGED

@@ -32,6 +32,7 @@ from pypdf import PdfReader

 from ..utils.detection_types import ImageType
 from ..utils.develop import deprecated
+from ..utils.error import DependencyError
 from ..utils.pdf_utils import pdf_to_np_array
 from ..utils.viz import viz_handler

@@ -121,7 +122,8 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
     """
     from pdf2image import convert_from_bytes  # type: ignore # pylint: disable=C0415, E0401

-
+    if which("pdftoppm") is None:
+        raise DependencyError("convert_pdf_bytes_to_np_array requires poppler to be installed")

     with BytesIO(pdf_bytes) as pdf_file:
         pdf = PdfReader(pdf_file).pages[0]