PyPI - deepdoctection - Versions diffs - 0.31__py3-none-any.whl → 0.33__py3-none-any.whl - Mend

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (131)

deepdoctection/__init__.py +16 -29
deepdoctection/analyzer/dd.py +70 -59
deepdoctection/configs/conf_dd_one.yaml +34 -31
deepdoctection/dataflow/common.py +9 -5
deepdoctection/dataflow/custom.py +5 -5
deepdoctection/dataflow/custom_serialize.py +75 -18
deepdoctection/dataflow/parallel_map.py +3 -3
deepdoctection/dataflow/serialize.py +4 -4
deepdoctection/dataflow/stats.py +3 -3
deepdoctection/datapoint/annotation.py +41 -56
deepdoctection/datapoint/box.py +9 -8
deepdoctection/datapoint/convert.py +6 -6
deepdoctection/datapoint/image.py +56 -44
deepdoctection/datapoint/view.py +245 -150
deepdoctection/datasets/__init__.py +1 -4
deepdoctection/datasets/adapter.py +35 -26
deepdoctection/datasets/base.py +14 -12
deepdoctection/datasets/dataflow_builder.py +3 -3
deepdoctection/datasets/info.py +24 -26
deepdoctection/datasets/instances/doclaynet.py +51 -51
deepdoctection/datasets/instances/fintabnet.py +46 -46
deepdoctection/datasets/instances/funsd.py +25 -24
deepdoctection/datasets/instances/iiitar13k.py +13 -10
deepdoctection/datasets/instances/layouttest.py +4 -3
deepdoctection/datasets/instances/publaynet.py +5 -5
deepdoctection/datasets/instances/pubtables1m.py +24 -21
deepdoctection/datasets/instances/pubtabnet.py +32 -30
deepdoctection/datasets/instances/rvlcdip.py +30 -30
deepdoctection/datasets/instances/xfund.py +26 -26
deepdoctection/datasets/save.py +6 -6
deepdoctection/eval/__init__.py +1 -4
deepdoctection/eval/accmetric.py +32 -33
deepdoctection/eval/base.py +8 -9
deepdoctection/eval/cocometric.py +15 -13
deepdoctection/eval/eval.py +41 -37
deepdoctection/eval/tedsmetric.py +30 -23
deepdoctection/eval/tp_eval_callback.py +16 -19
deepdoctection/extern/__init__.py +2 -7
deepdoctection/extern/base.py +339 -134
deepdoctection/extern/d2detect.py +85 -113
deepdoctection/extern/deskew.py +14 -11
deepdoctection/extern/doctrocr.py +141 -130
deepdoctection/extern/fastlang.py +27 -18
deepdoctection/extern/hfdetr.py +71 -62
deepdoctection/extern/hflayoutlm.py +504 -211
deepdoctection/extern/hflm.py +230 -0
deepdoctection/extern/model.py +488 -302
deepdoctection/extern/pdftext.py +23 -19
deepdoctection/extern/pt/__init__.py +1 -3
deepdoctection/extern/pt/nms.py +6 -2
deepdoctection/extern/pt/ptutils.py +29 -19
deepdoctection/extern/tessocr.py +39 -38
deepdoctection/extern/texocr.py +18 -18
deepdoctection/extern/tp/tfutils.py +57 -9
deepdoctection/extern/tp/tpcompat.py +21 -14
deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
deepdoctection/extern/tpdetect.py +45 -53
deepdoctection/mapper/__init__.py +3 -8
deepdoctection/mapper/cats.py +27 -29
deepdoctection/mapper/cocostruct.py +10 -10
deepdoctection/mapper/d2struct.py +27 -26
deepdoctection/mapper/hfstruct.py +13 -8
deepdoctection/mapper/laylmstruct.py +178 -37
deepdoctection/mapper/maputils.py +12 -11
deepdoctection/mapper/match.py +2 -2
deepdoctection/mapper/misc.py +11 -9
deepdoctection/mapper/pascalstruct.py +4 -4
deepdoctection/mapper/prodigystruct.py +5 -5
deepdoctection/mapper/pubstruct.py +84 -92
deepdoctection/mapper/tpstruct.py +5 -5
deepdoctection/mapper/xfundstruct.py +33 -33
deepdoctection/pipe/__init__.py +1 -1
deepdoctection/pipe/anngen.py +12 -14
deepdoctection/pipe/base.py +52 -106
deepdoctection/pipe/common.py +72 -59
deepdoctection/pipe/concurrency.py +16 -11
deepdoctection/pipe/doctectionpipe.py +24 -21
deepdoctection/pipe/language.py +20 -25
deepdoctection/pipe/layout.py +20 -16
deepdoctection/pipe/lm.py +75 -105
deepdoctection/pipe/order.py +194 -89
deepdoctection/pipe/refine.py +111 -124
deepdoctection/pipe/segment.py +156 -161
deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
deepdoctection/pipe/text.py +37 -36
deepdoctection/pipe/transform.py +19 -16
deepdoctection/train/__init__.py +6 -12
deepdoctection/train/d2_frcnn_train.py +48 -41
deepdoctection/train/hf_detr_train.py +41 -30
deepdoctection/train/hf_layoutlm_train.py +153 -135
deepdoctection/train/tp_frcnn_train.py +32 -31
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +13 -6
deepdoctection/utils/develop.py +4 -4
deepdoctection/utils/env_info.py +87 -125
deepdoctection/utils/file_utils.py +6 -11
deepdoctection/utils/fs.py +22 -18
deepdoctection/utils/identifier.py +2 -2
deepdoctection/utils/logger.py +16 -15
deepdoctection/utils/metacfg.py +7 -7
deepdoctection/utils/mocks.py +93 -0
deepdoctection/utils/pdf_utils.py +11 -11
deepdoctection/utils/settings.py +185 -181
deepdoctection/utils/tqdm.py +1 -1
deepdoctection/utils/transform.py +14 -9
deepdoctection/utils/types.py +104 -0
deepdoctection/utils/utils.py +7 -7
deepdoctection/utils/viz.py +74 -72
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
deepdoctection-0.33.dist-info/RECORD +146 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
deepdoctection/utils/detection_types.py +0 -68
deepdoctection-0.31.dist-info/RECORD +0 -144
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0

deepdoctection/dataflow/custom_serialize.py CHANGED Viewed

@@ -19,23 +19,25 @@
 Methods that convert incoming data to dataflows.
 """
+from __future__ import annotations
 import itertools
 import json
 import os
 from collections import defaultdict
 from pathlib import Path
-from typing import DefaultDict, Dict, List, Optional, Sequence, Union
+from typing import Any, DefaultDict, Dict, Iterator, List, Optional, Sequence, TextIO, Union
 from jsonlines import Reader, Writer
 from tabulate import tabulate
 from termcolor import colored
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, Pathlike
 from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
+from ..utils.types import JsonDict, PathLikeOrStr
 from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
@@ -53,6 +55,59 @@ def _reset_df_and_get_length(df: DataFlow) -> int:
     return length
+class FileClosingIterator:
+    """
+    A custom iterator that closes the file object once the iteration is complete.
+    This iterator is used to ensure that the file object is properly closed after
+    reading the data from it. It is used in the context of reading data from a file
+    in a streaming manner, where the data is not loaded into memory all at once.
+    **Example:**
+        file = open(path, "r")
+        iterator = Reader(file)
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+        df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints) # set up a dataflow
+    """
+    def __init__(self, file_obj: TextIO, iterator: Iterator[Any]):
+        """
+        Initializes the FileClosingIterator with a file object and its iterator.
+        :param file_obj (TextIO): The file object to read data from.
+        :param     iterator (Iterator): The actual iterator of the file object.
+        """
+        self.file_obj = file_obj
+        self.iterator = iterator
+    def __iter__(self) -> FileClosingIterator:
+        """
+        Returns the iterator object itself.
+        :return:  FileClosingIterator: The instance of the class itself.
+        """
+        return self
+    def __next__(self) -> Any:
+        """
+        Returns the next item from the file object's iterator.
+        Closes the file object if the iteration is finished.
+        :return: The next item from the file object's iterator.
+        Raises:
+            StopIteration: If there are no more items to return.
+        """
+        try:
+            return next(self.iterator)
+        except StopIteration as exc:
+            self.file_obj.close()
+            raise StopIteration from exc
 class SerializerJsonlines:
     """
     Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
@@ -66,7 +121,7 @@ class SerializerJsonlines:
     """
     @staticmethod
-    def load(path: Pathlike, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
         """
         :param path: a path to a .jsonl file.
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
@@ -75,10 +130,11 @@ class SerializerJsonlines:
         """
         file = open(path, "r")  # pylint: disable=W1514,R1732
         iterator = Reader(file)
-        return CustomDataFromIterable(iterator, max_datapoints=max_datapoints)
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+        return CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
     @staticmethod
-    def save(df: DataFlow, path: Pathlike, file_name: str, max_datapoints: Optional[int] = None) -> None:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -120,7 +176,7 @@ class SerializerTabsepFiles:
     """
     @staticmethod
-    def load(path: Pathlike, max_datapoins: Optional[int] = None) -> CustomDataFromList:
+    def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
         """
         :param path: a path to a .txt file.
         :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
@@ -133,7 +189,7 @@ class SerializerTabsepFiles:
         return CustomDataFromList(file_list, max_datapoints=max_datapoins)
     @staticmethod
-    def save(df: DataFlow, path: Pathlike, file_name: str, max_datapoints: Optional[int] = None) -> None:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .txt file. Every datapoint must be a string.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -168,7 +224,7 @@ class SerializerFiles:
     @staticmethod
     def load(
-        path: Pathlike,
+        path: PathLikeOrStr,
         file_type: Union[str, Sequence[str]],
         max_datapoints: Optional[int] = None,
         shuffle: Optional[bool] = False,
@@ -190,15 +246,14 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow
-        if isinstance(path, str):
-            path = Path(path)
+        path = Path(path)
         if not path.exists():
             raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
         if shuffle:
             sort = False
-        it1 = os.walk(path, topdown=False)
-        it2 = os.walk(path, topdown=False)
+        it1 = os.walk(os.fspath(path), topdown=False)
+        it2 = os.walk(os.fspath(path), topdown=False)
         df1 = CustomDataFromIterable(it1)
         df2 = CustomDataFromIterable(it2)
         df1 = MapData(df1, lambda dp: None if len(dp[2]) == 0 else dp)
@@ -237,7 +292,7 @@ class CocoParser:
     :param annotation_file: location of annotation file
     """
-    def __init__(self, annotation_file: Optional[Pathlike] = None) -> None:
+    def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
         self.dataset: JsonDict = {}
         self.anns: Dict[int, JsonDict] = {}
         self.cats: Dict[int, JsonDict] = {}
@@ -465,7 +520,7 @@ class SerializerCoco:
     """
     @staticmethod
-    def load(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads a .json file and generates a dataflow.
@@ -478,7 +533,7 @@ class SerializerCoco:
                 {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
-            for each single image id.
+            for each image id. We use the type hint CocoDatapointDict to describe this dictionary
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
         :param path: a path to a .json file.
@@ -525,7 +580,7 @@ class SerializerPdfDoc:
     """
     @staticmethod
-    def load(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads the document page wise and returns a dataflow accordingly.
@@ -552,14 +607,16 @@ class SerializerPdfDoc:
         return df
     @staticmethod
-    def save(path: Pathlike) -> None:
+    def save(path: PathLikeOrStr) -> None:
         """
         Not implemented
         """
         raise NotImplementedError()
     @staticmethod
-    def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
+    def split(
+        path: PathLikeOrStr, path_target: Optional[PathLikeOrStr] = None, max_datapoint: Optional[int] = None
+    ) -> None:
         """
         Split a document into single pages.
         """

deepdoctection/dataflow/parallel_map.py CHANGED Viewed

@@ -23,7 +23,7 @@ import uuid
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Callable, Iterator, List, no_type_check
+from typing import Any, Callable, Iterator, no_type_check
 import zmq
@@ -236,7 +236,7 @@ class MultiThreadMapData(_ParallelMapData):
         self._strict = strict
         self.num_thread = num_thread
         self.map_func = map_func
-        self._threads: List[Any] = []
+        self._threads: list[Any] = []
         self._evt = None
     def reset_state(self) -> None:
@@ -284,7 +284,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
         if os.name == "nt":
             raise EnvironmentError("ZMQ IPC doesn't support windows")
         self._reset_done = False
-        self._procs: List[Any] = []
+        self._procs: list[Any] = []
         self.context = None
         self.socket = None

deepdoctection/dataflow/serialize.py CHANGED Viewed

@@ -12,7 +12,7 @@ Some DataFlow classes for serialization. Many classes have been taken from
 import pickle
 from copy import copy
-from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Iterable, Iterator, Optional, Union
 import numpy as np
@@ -23,7 +23,7 @@ from .base import DataFlow, RNGDataFlow
 class DataFromList(RNGDataFlow):
     """Wrap a list of datapoints to a DataFlow"""
-    def __init__(self, lst: List[Any], shuffle: bool = True) -> None:
+    def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
         """
         :param lst: input list. Each element is a datapoint.
         :param shuffle: shuffle data.
@@ -79,11 +79,11 @@ class FakeData(RNGDataFlow):
     def __init__(
         self,
-        shapes: List[Union[List[Any], Tuple[Any]]],
+        shapes: list[Union[list[Any], tuple[Any]]],
         size: int = 1000,
         random: bool = True,
         dtype: str = "float32",
-        domain: Tuple[Union[float, int], Union[float, int]] = (0, 1),
+        domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
     ):
         """
         :param  shapes: a list of lists/tuples. Shapes of each component.

deepdoctection/dataflow/stats.py CHANGED Viewed

@@ -18,7 +18,7 @@
 """
 Dataflows for calculating statistical values of the underlying dataset
 """
-from typing import Any, Optional, Tuple, Union
+from typing import Any, Optional, Union
 import numpy as np
 import numpy.typing as npt
@@ -45,7 +45,7 @@ class MeanFromDataFlow(ProxyDataFlow):
     def __init__(
         self,
         df: DataFlow,
-        axis: Optional[Union[int, Tuple[int], Tuple[int, int], Tuple[int, int, int]]] = None,
+        axis: Optional[Union[int, tuple[int], tuple[int, int], tuple[int, int, int]]] = None,
         key: Optional[str] = None,
         max_datapoints: Optional[int] = None,
     ):
@@ -165,7 +165,7 @@ class StdFromDataFlow(ProxyDataFlow):
     def __init__(
         self,
         df: DataFlow,
-        axis: Optional[Union[int, Tuple[int], Tuple[int, int], Tuple[int, int, int]]] = None,
+        axis: Optional[Union[int, tuple[int], tuple[int, int], tuple[int, int, int]]] = None,
         key: Optional[str] = None,
         max_datapoints: Optional[int] = None,
     ):

deepdoctection/datapoint/annotation.py CHANGED Viewed

@@ -18,34 +18,38 @@
 """
 Dataclass for annotations and their derived classes.
 """
+from __future__ import annotations
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union, no_type_check
+from typing import Optional, Union, no_type_check
-from ..utils.detection_types import JsonDict
 from ..utils.error import AnnotationError, UUIDError
 from ..utils.identifier import get_uuid, is_uuid_like
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import DefaultType, ObjectTypes, SummaryType, TypeOrStr, get_type
+from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import AnnotationDict
 from .box import BoundingBox
 from .convert import as_dict
 @no_type_check
-def ann_from_dict(cls, **kwargs):
+def ann_from_dict(cls, **kwargs: AnnotationDict):
     """
     A factory function to create subclasses of annotations from a given dict
     """
     _init_kwargs = {
         "external_id": kwargs.get("external_id"),
         "category_name": kwargs.get("category_name"),
-        "category_id": kwargs.get("category_id"),
+        "category_id": kwargs.get("category_id", DEFAULT_CATEGORY_ID),
         "score": kwargs.get("score"),
         "service_id": kwargs.get("service_id"),
         "model_id": kwargs.get("model_id"),
         "session_id": kwargs.get("session_id"),
     }
+    _init_kwargs["category_id"] = (
+        int(_init_kwargs["category_id"]) if (_init_kwargs)["category_id"] not in ("None", "") else DEFAULT_CATEGORY_ID
+    )
     ann = cls(**_init_kwargs)
     ann.active = kwargs.get("active")
     ann._annotation_id = kwargs.get("_annotation_id")  # pylint: disable=W0212
@@ -134,7 +138,7 @@ class Annotation(ABC):
             raise AnnotationError("Annotation_id must be uuid3 string")
     @abstractmethod
-    def get_defining_attributes(self) -> List[str]:
+    def get_defining_attributes(self) -> list[str]:
         """
         Defining attributes of an annotation instance are attributes, of which you think that they uniquely
         describe the annotation object. If you do not provide an external id, only the defining attributes will be used
@@ -151,7 +155,7 @@ class Annotation(ABC):
                 raise AnnotationError(f"Attribute {attr} must have __str__ method")
     @staticmethod
-    def set_annotation_id(annotation: "CategoryAnnotation", *container_id_context: Optional[str]) -> str:
+    def set_annotation_id(annotation: CategoryAnnotation, *container_id_context: Optional[str]) -> str:
         """
         Defines the `annotation_id` by attributes of the annotation class as well as by external parameters given by a
         tuple or list of container id contexts.
@@ -167,7 +171,7 @@ class Annotation(ABC):
         attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
         return get_uuid(*attributes_values, *container_id_context)  # type: ignore
-    def as_dict(self) -> Dict[str, Any]:
+    def as_dict(self) -> AnnotationDict:
         """
         Returning the full dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes defined by
         `remove_keys`.
@@ -187,7 +191,7 @@ class Annotation(ABC):
     @classmethod
     @abstractmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "Annotation":
+    def from_dict(cls, **kwargs: AnnotationDict) -> Annotation:
         """
         Method to initialize a derived class from dict.
@@ -199,7 +203,7 @@ class Annotation(ABC):
     @staticmethod
     @abstractmethod
-    def get_state_attributes() -> List[str]:
+    def get_state_attributes() -> list[str]:
         """
         Similar to `get_defining_attributes` but for `state_id`
@@ -242,6 +246,9 @@ class Annotation(ABC):
         return get_uuid(self.annotation_id, *container_ids)
+DEFAULT_CATEGORY_ID = -1
 @dataclass
 class CategoryAnnotation(Annotation):
     """
@@ -268,12 +275,12 @@ class CategoryAnnotation(Annotation):
     `dump_relationship` instead.
     """
-    category_name: TypeOrStr = field(default=DefaultType.default_type)
-    _category_name: ObjectTypes = field(default=DefaultType.default_type, init=False)
-    category_id: str = field(default="")
+    category_name: TypeOrStr = field(default=DefaultType.DEFAULT_TYPE)
+    _category_name: ObjectTypes = field(default=DefaultType.DEFAULT_TYPE, init=False)
+    category_id: int = field(default=DEFAULT_CATEGORY_ID)
     score: Optional[float] = field(default=None)
-    sub_categories: Dict[ObjectTypes, "CategoryAnnotation"] = field(default_factory=dict, init=False, repr=True)
-    relationships: Dict[ObjectTypes, List[str]] = field(default_factory=dict, init=False, repr=True)
+    sub_categories: dict[ObjectTypes, CategoryAnnotation] = field(default_factory=dict, init=False, repr=True)
+    relationships: dict[ObjectTypes, list[str]] = field(default_factory=dict, init=False, repr=True)
     @property  # type: ignore
     def category_name(self) -> ObjectTypes:
@@ -287,13 +294,11 @@ class CategoryAnnotation(Annotation):
             self._category_name = get_type(category_name)
     def __post_init__(self) -> None:
-        self.category_id = str(self.category_id)
-        assert self.category_name
         self._assert_attributes_have_str(state_id=True)
         super().__post_init__()
     def dump_sub_category(
-        self, sub_category_name: TypeOrStr, annotation: "CategoryAnnotation", *container_id_context: Optional[str]
+        self, sub_category_name: TypeOrStr, annotation: CategoryAnnotation, *container_id_context: Optional[str]
     ) -> None:
         """
         Storage of sub-categories. As sub-categories usually only depend on very few attributes and the parent
@@ -324,7 +329,7 @@ class CategoryAnnotation(Annotation):
                 )
         self.sub_categories[get_type(sub_category_name)] = annotation
-    def get_sub_category(self, sub_category_name: ObjectTypes) -> "CategoryAnnotation":
+    def get_sub_category(self, sub_category_name: ObjectTypes) -> CategoryAnnotation:
         """
         Return a sub category by its key.
@@ -362,7 +367,7 @@ class CategoryAnnotation(Annotation):
         if annotation_id not in self.relationships[key_type]:
             self.relationships[key_type].append(annotation_id)
-    def get_relationship(self, key: ObjectTypes) -> List[str]:
+    def get_relationship(self, key: ObjectTypes) -> list[str]:
         """
         Returns a list of annotation ids stored with a given relationship key.
@@ -373,7 +378,7 @@ class CategoryAnnotation(Annotation):
             return self.relationships[key]
         return []
-    def remove_relationship(self, key: ObjectTypes, annotation_ids: Optional[Union[List[str], str]] = None) -> None:
+    def remove_relationship(self, key: ObjectTypes, annotation_ids: Optional[Union[list[str], str]] = None) -> None:
         """
         Remove relationship by some given keys and ids. If no annotation ids are provided all relationship according
         to the key will be removed.
@@ -394,25 +399,25 @@ class CategoryAnnotation(Annotation):
         else:
             self.relationships[key].clear()
-    def get_defining_attributes(self) -> List[str]:
+    def get_defining_attributes(self) -> list[str]:
         return ["category_name", "category_id"]
     @staticmethod
-    def remove_keys() -> List[str]:
+    def remove_keys() -> list[str]:
         """
         A list of attributes to suspend from as_dict creation.
-        :return: List of attributes.
+        :return: list of attributes.
         """
         return []
     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "CategoryAnnotation":
+    def from_dict(cls, **kwargs: AnnotationDict) -> CategoryAnnotation:
         category_ann = ann_from_dict(cls, **kwargs)
         return category_ann
     @staticmethod
-    def get_state_attributes() -> List[str]:
+    def get_state_attributes() -> list[str]:
         return ["active", "sub_categories", "relationships"]
@@ -432,20 +437,20 @@ class ImageAnnotation(CategoryAnnotation):
     """
     bounding_box: Optional[BoundingBox] = field(default=None)
-    image: Optional["Image"] = field(default=None, init=False, repr=False)  # type: ignore
+    image: Optional[Image] = field(default=None, init=False, repr=False)  # type: ignore  # pylint: disable=E0602
-    def get_defining_attributes(self) -> List[str]:
+    def get_defining_attributes(self) -> list[str]:
         return ["category_name", "bounding_box"]
     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "ImageAnnotation":
+    def from_dict(cls, **kwargs: AnnotationDict) -> ImageAnnotation:
         image_ann = ann_from_dict(cls, **kwargs)
         if box_kwargs := kwargs.get("bounding_box"):
             image_ann.bounding_box = BoundingBox.from_dict(**box_kwargs)
         return image_ann
     @staticmethod
-    def get_state_attributes() -> List[str]:
+    def get_state_attributes() -> list[str]:
         return ["active", "sub_categories", "relationships", "image"]
     def get_bounding_box(self, image_id: Optional[str] = None) -> BoundingBox:
@@ -462,31 +467,10 @@ class ImageAnnotation(CategoryAnnotation):
     def get_summary(self, key: ObjectTypes) -> CategoryAnnotation:
         """Get summary sub categories from `image`. Raises `ValueError` if `key` is not available"""
         if self.image:
-            if self.image.summary:
-                return self.image.summary.get_sub_category(key)
+            return self.image.summary.get_sub_category(key)
         raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
-@dataclass
-class SummaryAnnotation(CategoryAnnotation):
-    """
-    A dataclass for adding summaries. The various summaries can be stored as sub categories.
-    Summary annotations should be stored in the attribute provided: `image.Image.summary`  and should not be
-    dumped as a category.
-    """
-    def __post_init__(self) -> None:
-        self._category_name = SummaryType.summary
-        super().__post_init__()
-    @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
-        summary_ann = ann_from_dict(cls, **kwargs)
-        summary_ann.category_name = SummaryType.summary
-        return summary_ann
 @dataclass
 class ContainerAnnotation(CategoryAnnotation):
     """
@@ -496,13 +480,14 @@ class ContainerAnnotation(CategoryAnnotation):
      value: Attribute to store the value. Use strings.
     """
-    value: Optional[Union[List[str], str]] = field(default=None)
+    value: Optional[Union[list[str], str]] = field(default=None)
-    def get_defining_attributes(self) -> List[str]:
+    def get_defining_attributes(self) -> list[str]:
         return ["category_name", "value"]
     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
+    def from_dict(cls, **kwargs: AnnotationDict) -> ContainerAnnotation:
         container_ann = ann_from_dict(cls, **kwargs)
-        container_ann.value = kwargs.get("value")
+        value = kwargs.get("value", "")
+        container_ann.value = value if isinstance(value, str) else list(value)
         return container_ann

deepdoctection/datapoint/box.py CHANGED Viewed

@@ -21,18 +21,19 @@ Implementation of BoundingBox class and related methods
 from dataclasses import dataclass
 from math import ceil, floor
-from typing import List, Optional, Sequence, no_type_check
+from typing import Optional, Sequence, no_type_check
 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import
 from numpy import float32
-from ..utils.detection_types import ImageType
 from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
 from ..utils.logger import LoggingRecord, logger
+from ..utils.types import PixelValues
-if cocotools_available():
+with try_import() as import_guard:
     import pycocotools.mask as coco_mask
@@ -220,7 +221,7 @@ class BoundingBox:
         return self.uly + 0.5 * self.height
     @property
-    def center(self) -> List[float]:
+    def center(self) -> list[float]:
         """
         Bounding box center [x,y]
         """
@@ -263,7 +264,7 @@ class BoundingBox:
             * np_poly_scale
         )
-    def to_list(self, mode: str, scale_x: float = 1.0, scale_y: float = 1.0) -> List[float]:
+    def to_list(self, mode: str, scale_x: float = 1.0, scale_y: float = 1.0) -> list[float]:
         """
         Returns the coordinates as list
@@ -344,7 +345,7 @@ class BoundingBox:
         return f"Bounding Box ulx: {self.ulx}, uly: {self.uly}, lrx: {self.lrx}, lry: {self.lry}"
     @staticmethod
-    def remove_keys() -> List[str]:
+    def remove_keys() -> list[str]:
         """
         A list of attributes to suspend from as_dict creation.
         """
@@ -397,8 +398,8 @@ def intersection_box(
 def crop_box_from_image(
-    np_image: ImageType, crop_box: BoundingBox, width: Optional[float] = None, height: Optional[float] = None
-) -> ImageType:
+    np_image: PixelValues, crop_box: BoundingBox, width: Optional[float] = None, height: Optional[float] = None
+) -> PixelValues:
     """
     Crop a box (the crop_box) from a np_image. Will floor the left  and ceil the right coordinate point.

deepdoctection/datapoint/convert.py CHANGED Viewed

@@ -30,10 +30,10 @@ from numpy import uint8
 from numpy.typing import NDArray
 from pypdf import PdfReader
-from ..utils.detection_types import ImageType
 from ..utils.develop import deprecated
 from ..utils.error import DependencyError
 from ..utils.pdf_utils import pdf_to_np_array
+from ..utils.types import PixelValues
 from ..utils.viz import viz_handler
 __all__ = [
@@ -75,7 +75,7 @@ def as_dict(obj: Any, dict_factory) -> Union[Any]:  # type: ignore
     return copy.deepcopy(obj)
-def convert_b64_to_np_array(image: str) -> ImageType:
+def convert_b64_to_np_array(image: str) -> PixelValues:
     """
     Converts an image in base4 string encoding representation to a numpy array of shape (width,height,channel).
@@ -86,7 +86,7 @@ def convert_b64_to_np_array(image: str) -> ImageType:
     return viz_handler.convert_b64_to_np(image).astype(uint8)
-def convert_np_array_to_b64(np_image: ImageType) -> str:
+def convert_np_array_to_b64(np_image: PixelValues) -> str:
     """
     Converts an image from numpy array into a base64 string encoding representation
@@ -97,7 +97,7 @@ def convert_np_array_to_b64(np_image: ImageType) -> str:
 @no_type_check
-def convert_np_array_to_b64_b(np_image: ImageType) -> bytes:
+def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
     """
     Converts an image from numpy array into a base64 bytes encoding representation
@@ -108,7 +108,7 @@ def convert_np_array_to_b64_b(np_image: ImageType) -> bytes:
 @deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
-def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> ImageType:
+def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
     """
     Converts a pdf passed as bytes into a numpy array. Note, that this method expects poppler to be installed.
     Please check the installation guides at https://poppler.freedesktop.org/ . If no value for dpi is provided
@@ -143,7 +143,7 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
     return np_array.astype(uint8)
-def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = None) -> ImageType:
+def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
     """
     Converts a pdf passed as bytes into a numpy array. Note, that this method expects poppler to be installed. This
     function, however does not rely on the wrapper pdf2image but uses a function of this lib which calls poppler

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl