deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py
CHANGED

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """
 
-from
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py
CHANGED

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.
 
 from typing import Any, Callable, Iterator, Mapping, Optional, Union
 
-from
+from lazy_imports import try_import
+
+from ..dataflow import CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
-from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
+from ..utils.types import DP, JsonDict
 from .registry import get_dataset
 
-
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore
 
 
 class DatasetAdapter(IterableDataset):  # type: ignore
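The guard on `pytorch_available()` is gone: the torch import is now attempted through the `lazy-imports` package and falls back to a mock when torch is missing. A minimal sketch of the same pattern, assuming only that `lazy-imports` is installed; the fallback class here is a stand-in for `deepdoctection.utils.mocks.IterableDataset`:

# Sketch of the lazy-import pattern used above.
from lazy_imports import try_import

with try_import() as import_guard:
    # Only resolved if torch is installed; ImportError is swallowed by the guard.
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    # Placeholder so class definitions that inherit from IterableDataset still import.
    class IterableDataset:  # type: ignore
        pass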
@@ -54,6 +57,7 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         cache_dataset: bool,
         image_to_framework_func: Optional[Callable[[DP], Optional[JsonDict]]] = None,
         use_token_tag: bool = True,
+        number_repetitions: int = -1,
         **build_kwargs: str,
     ) -> None:
         """
@@ -66,6 +70,12 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                               `WordType.token_class`.
        :param build_kwargs: optional parameters for defining the dataflow.
        """
+       if number_repetitions == -1 and not cache_dataset:
+           raise ValueError(
+               "Number of repetitions cannot be infinite when not caching the dataset. Instead try to"
+               " set a high number of repetitions"
+           )
+
        if isinstance(name_or_dataset, str):
            self.dataset = get_dataset(name_or_dataset)
        else:
@@ -75,22 +85,22 @@ class DatasetAdapter(IterableDataset):  # type: ignore
 
        if cache_dataset:
            logger.info(LoggingRecord("Yielding dataflow into memory and create torch dataset"))
-           categories: Mapping[
+           categories: Mapping[int, ObjectTypes] = {}
            _data_statistics = True
-           if self.dataset.dataset_info.type in (DatasetType.
+           if self.dataset.dataset_info.type in (DatasetType.OBJECT_DETECTION, DatasetType.SEQUENCE_CLASSIFICATION):
                categories = self.dataset.dataflow.categories.get_categories(filtered=True)
-           elif self.dataset.dataset_info.type in (DatasetType.
+           elif self.dataset.dataset_info.type in (DatasetType.TOKEN_CLASSIFICATION,):
                if use_token_tag:
                    categories = self.dataset.dataflow.categories.get_sub_categories(
-                       categories=LayoutType.
-                       sub_categories={LayoutType.
+                       categories=LayoutType.WORD,
+                       sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG]},
                        keys=False,
                        values_as_dict=True,
-                   )[LayoutType.
+                   )[LayoutType.WORD][WordType.TOKEN_TAG]
                else:
                    categories = self.dataset.dataflow.categories.get_sub_categories(
-                       categories=LayoutType.
-                   )[LayoutType.
+                       categories=LayoutType.WORD, sub_categories={LayoutType.WORD: [WordType.TOKEN_CLASS]}, keys=False
+                   )[LayoutType.WORD][WordType.TOKEN_CLASS]
            else:
                logger.info(
                    LoggingRecord(f"dataset is of type {self.dataset.dataset_info.type}. Cannot generate statistics.")
@@ -118,19 +128,19 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                    "images when needed and reduce memory costs!!!",
                    "warn",
                )
-           if self.dataset.dataset_info.type == DatasetType.
+           if self.dataset.dataset_info.type == DatasetType.OBJECT_DETECTION:
                anns = dp.get_annotation()
-               cat_ids = [
+               cat_ids = [ann.category_id for ann in anns]
 
-           elif self.dataset.dataset_info.type == DatasetType.
-               cat_ids = dp.summary.get_sub_category(PageType.
+           elif self.dataset.dataset_info.type == DatasetType.SEQUENCE_CLASSIFICATION:
+               cat_ids = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id
 
-           elif self.dataset.dataset_info.type == DatasetType.
-               anns = dp.get_annotation(category_names=LayoutType.
+           elif self.dataset.dataset_info.type == DatasetType.TOKEN_CLASSIFICATION:
+               anns = dp.get_annotation(category_names=LayoutType.WORD)
                if use_token_tag:
-                   cat_ids = [ann.get_sub_category(WordType.
+                   cat_ids = [ann.get_sub_category(WordType.TOKEN_TAG).category_id for ann in anns]
                else:
-                   cat_ids = [ann.get_sub_category(WordType.
+                   cat_ids = [ann.get_sub_category(WordType.TOKEN_CLASS).category_id for ann in anns]
 
            if _data_statistics:
                summarizer.dump(cat_ids)
@@ -141,14 +151,13 @@ class DatasetAdapter(IterableDataset):  # type: ignore
            if _data_statistics:
                summarizer.print_summary_histogram()
            self.number_datapoints = len(datapoints)
+           if not self.number_datapoints:
+               raise ValueError("DatasetAdapter receives no datapoints. Please check your dataflow build config.")
 
            df = CustomDataFromList(datapoints, shuffle=True)
-
-
-
-           df_list = CacheData(df).get_cache()
-           df = CustomDataFromList(df_list, shuffle=True)
-           df = RepeatedData(df, -1)
+           df = RepeatedData(df, number_repetitions)
+       else:
+           df = RepeatedData(df, number_repetitions)
 
        if image_to_framework_func:
            df = MapData(df, image_to_framework_func)
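The new `number_repetitions` argument replaces the hard-coded `RepeatedData(df, -1)`, and the constructor now rejects infinite repetition when the dataset is not cached. A hedged usage sketch; the dataset name and the build kwargs (`split`, `max_datapoints`) are illustrative assumptions, not taken from this diff:

# Hedged sketch of the new constructor argument.
from deepdoctection.datasets.adapter import DatasetAdapter

adapter = DatasetAdapter(
    name_or_dataset="doclaynet",   # assumed registry name
    cache_dataset=True,            # required if number_repetitions stays at -1 (infinite)
    number_repetitions=-1,         # with cache_dataset=False, pass a finite count instead
    split="train",                 # assumed build kwarg
    max_datapoints="1000",         # assumed build kwarg (build_kwargs are typed as str)
)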
deepdoctection/datasets/base.py
CHANGED
@@ -18,6 +18,8 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations
+
 import json
 import os
 import pprint
@@ -25,15 +27,15 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from inspect import signature
 from pathlib import Path
-from typing import Any,
+from typing import Any, Mapping, Optional, Sequence, Type, Union
 
 import numpy as np
 
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
 from ..datapoint.image import Image
-from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import PathLikeOrStr
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories
 
@@ -136,14 +138,14 @@ class SplitDataFlow(DataFlowBaseBuilder):
     Dataflow builder for splitting datasets
     """
 
-    def __init__(self, train:
+    def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
        """
        :param train: Cached train split
        :param val: Cached val split
        :param test: Cached test split
        """
        super().__init__(location="")
-       self.split_cache:
+       self.split_cache: dict[str, list[Image]]
        if test is None:
            self.split_cache = {"train": train, "val": val}
        else:
@@ -213,8 +215,8 @@ class MergeDataset(DatasetBase):
        :param datasets: An arbitrary number of datasets
        """
        self.datasets = datasets
-       self.dataflows: Optional[
-       self.datapoint_list: Optional[
+       self.dataflows: Optional[tuple[DataFlow, ...]] = None
+       self.datapoint_list: Optional[list[Image]] = None
        super().__init__()
        self._dataset_info.type = datasets[0].dataset_info.type
        self._dataset_info.name = "merge_" + "_".join([dataset.dataset_info.name for dataset in self.datasets])
@@ -237,7 +239,7 @@ class MergeDataset(DatasetBase):
    def __init__(self, *dataflow_builders: DataFlowBaseBuilder):
        super().__init__("")
        self.dataflow_builders = dataflow_builders
-       self.dataflows: Optional[
+       self.dataflows: Optional[tuple[DataFlow, ...]] = None
 
    def build(self, **kwargs: Union[str, int]) -> DataFlow:
        """
@@ -325,7 +327,7 @@ class MergeDataset(DatasetBase):
        self._dataflow_builder = SplitDataFlow(train_dataset, val_dataset, test_dataset)
        self._dataflow_builder.categories = self._categories()
 
-    def get_ids_by_split(self) ->
+    def get_ids_by_split(self) -> dict[str, list[str]]:
        """
        To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
        the image ids contained in the split.
@@ -387,7 +389,7 @@ class CustomDataset(DatasetBase):
        self,
        name: str,
        dataset_type: TypeOrStr,
-       location:
+       location: PathLikeOrStr,
        init_categories: Sequence[ObjectTypes],
        dataflow_builder: Type[DataFlowBaseBuilder],
        init_sub_categories: Optional[Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]] = None,
@@ -423,7 +425,7 @@ class CustomDataset(DatasetBase):
        """
 
        self.name = name
-       self.type = get_type(dataset_type)
+       self.type: DatasetType = get_type(dataset_type)  # type: ignore
        self.location = location
        self.init_categories = init_categories
        if init_sub_categories is None:
@@ -449,7 +451,7 @@ class CustomDataset(DatasetBase):
        return self.dataflow_builder
 
    @staticmethod
-    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) ->
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
        """
        This static method creates a CustomDataset instance from a dataset card.
 
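The built-in generics in the new annotations (`list[Image]`, `dict[str, list[str]]`, `tuple[DataFlow, ...]`) rely on the `from __future__ import annotations` line added at the top of the module, which keeps annotations as strings at runtime and therefore valid on older Python versions. A small self-contained illustration of the same style:

# Illustration of the annotation style adopted above (not deepdoctection code).
from __future__ import annotations

from typing import Optional


def split_ids(train: list[str], val: list[str], test: Optional[list[str]] = None) -> dict[str, list[str]]:
    # With the future import, list[str]/dict[str, ...] work as annotations even on Python 3.8.
    splits = {"train": train, "val": val}
    if test is not None:
        splits["test"] = test
    return splits


print(split_ids(["a"], ["b"]))  # {'train': ['a'], 'val': ['b']}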
deepdoctection/datasets/dataflow_builder.py
CHANGED

@@ -24,8 +24,8 @@ from pathlib import Path
 from typing import Mapping, Optional, Sequence, Union
 
 from ..dataflow import DataFlow
-from ..utils.detection_types import Pathlike
 from ..utils.fs import get_dataset_dir_path
+from ..utils.types import PathLikeOrStr
 from .info import DatasetCategories
 
 
@@ -44,7 +44,7 @@ class DataFlowBaseBuilder(ABC):
 
    def __init__(
        self,
-       location:
+       location: PathLikeOrStr,
        annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
    ):
        """
@@ -100,7 +100,7 @@ class DataFlowBaseBuilder(ABC):
 
        :return: local workdir
        """
-       return get_dataset_dir_path() / self.location
+       return Path(get_dataset_dir_path()) / self.location
 
    @abstractmethod
    def build(self, **kwargs: Union[str, int]) -> DataFlow:
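`DataFlowBaseBuilder` now takes `location: PathLikeOrStr`, and `get_workdir()` wraps the configured dataset directory in `Path(...)`, so path arithmetic keeps working even when the directory is configured as a plain string. A hedged sketch of a minimal custom builder against this interface; the folder layout and datapoint format are assumptions for illustration:

# Hedged sketch of a custom dataflow builder (illustrative, not from this diff).
from typing import Union

from deepdoctection.dataflow import CustomDataFromList, DataFlow
from deepdoctection.datasets.dataflow_builder import DataFlowBaseBuilder


class TinyImageFolderBuilder(DataFlowBaseBuilder):
    """Hypothetical builder that lists PNG files under the dataset workdir."""

    def build(self, **kwargs: Union[str, int]) -> DataFlow:
        # get_workdir() now always returns a Path, so "/" composition is safe here.
        image_dir = self.get_workdir() / "images"
        datapoints = [{"file_name": str(path)} for path in sorted(image_dir.glob("*.png"))]
        return CustomDataFromList(datapoints)


builder = TinyImageFolderBuilder(location="my_dataset")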
deepdoctection/datasets/info.py
CHANGED
@@ -22,34 +22,34 @@ Module for storing dataset info (e.g. general meta data or categories)
 from copy import copy
 from dataclasses import dataclass, field
 from itertools import chain
-from typing import Any,
+from typing import Any, Literal, Mapping, Optional, Sequence, Union, no_type_check, overload
 
-from ..utils.settings import
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once
 
 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> dict[ObjectTypes, int]:
    ...
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> dict[int, ObjectTypes]:
    ...
 
 
 @overload
 def _get_dict(
    l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = ...
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
    ...
 
 
 def _get_dict(
    l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = 1
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
    """
    Converts a list into a dict, where keys/values are the list indices.
 
@@ -59,8 +59,8 @@ def _get_dict(
    :return: A dictionary of list indices/list elements.
    """
    if name_as_key:
-       return {v:
-   return
+       return {v: k for k, v in enumerate(l, starts_with)}
+   return dict(enumerate(l, starts_with))
 
 
 @dataclass
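The rewritten `_get_dict` builds its two mappings directly from `enumerate(l, starts_with)`. A short worked example of what the two branches return:

# Worked example of the mapping logic above: enumerate(..., 1) yields
# (1, "table"), (2, "text"), (3, "title").
categories = ["table", "text", "title"]

id_to_name = dict(enumerate(categories, 1))
# {1: 'table', 2: 'text', 3: 'title'}

name_to_id = {v: k for k, v in enumerate(categories, 1)}
# {'table': 1, 'text': 2, 'title': 3}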
@@ -89,7 +89,7 @@ class DatasetInfo:
    license: str = field(default="")
    url: Union[str, Sequence[str]] = field(default="")
    splits: Mapping[str, str] = field(default_factory=dict)
-   type:
+   type: DatasetType = field(default=DatasetType.DEFAULT)
 
    def get_split(self, key: str) -> str:
        """
@@ -143,13 +143,13 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, *, name_as_key: Literal[True], init: bool = ..., filtered: bool = ...
-   ) -> Mapping[ObjectTypes,
+   ) -> Mapping[ObjectTypes, int]:
        ...
 
    @overload
    def get_categories(
        self, *, name_as_key: Literal[False] = ..., init: bool = ..., filtered: bool = ...
-   ) -> Mapping[
+   ) -> Mapping[int, ObjectTypes]:
        ...
 
    @overload
@@ -161,12 +161,12 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, as_dict: Literal[True] = ..., name_as_key: bool = False, init: bool = False, filtered: bool = False
-   ) -> Union[Mapping[ObjectTypes,
+   ) -> Union[Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        ...
 
    def get_categories(
        self, as_dict: bool = True, name_as_key: bool = False, init: bool = False, filtered: bool = False
-   ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes,
+   ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        """
        Get categories of a dataset. The returned value also respects modifications of the inventory like filtered
        categories of replaced categories with sub categories. However, you must correctly pass arguments to return the
@@ -229,7 +229,7 @@ class DatasetCategories:
        if sub_categories is None:
            sub_categories = {}
 
-       sub_cat:
+       sub_cat: dict[ObjectTypes, Union[ObjectTypes, list[ObjectTypes]]] = {}
        for cat in _categories:
            assert cat in self.get_categories(  # pylint: disable=E1135
                as_dict=False, filtered=True
@@ -254,9 +254,9 @@ class DatasetCategories:
        for category, value in sub_cat.items():
            if category not in sub_categories:
                continue
-           sub_cat_tmp:
+           sub_cat_tmp: dict[str, Union[dict[int, ObjectTypes], dict[ObjectTypes, int], Sequence[str]]] = {}
            sub_categories_list: Union[
-               ObjectTypes, str,
+               ObjectTypes, str, list[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
            ]
            if isinstance(sub_categories[category], ObjectTypes):
                sub_categories_list = [sub_categories[category]]
@@ -267,14 +267,12 @@ class DatasetCategories:
                    continue
                if values_as_dict:
                    if not name_as_key:
-                       sub_cat_tmp[sub_cat_key] =
-
-
-                       }
+                       sub_cat_tmp[sub_cat_key] = dict(
+                           enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                       )
                    else:
                        sub_cat_tmp[sub_cat_key] = {
-                           v:
-                           for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                           v: k for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
                        }
                else:
                    sub_cat_tmp[sub_cat_key] = self.init_sub_categories[category][get_type(sub_cat_key)]
@@ -284,7 +282,7 @@ class DatasetCategories:
        return sub_cat
 
    @call_only_once
-   def set_cat_to_sub_cat(self, cat_to_sub_cat:
+   def set_cat_to_sub_cat(self, cat_to_sub_cat: dict[TypeOrStr, TypeOrStr]) -> None:
        """
        Change category representation if sub-categories are available. Pass a dictionary of the main category
        and the requested sub-category. This will change the dictionary of categories and the category names
@@ -323,7 +321,7 @@ class DatasetCategories:
        self._categories_update = _categories_update_list
 
    @call_only_once
-   def filter_categories(self, categories: Union[TypeOrStr,
+   def filter_categories(self, categories: Union[TypeOrStr, list[TypeOrStr]]) -> None:
        """
        Filter categories of a dataset. This will keep all the categories chosen and remove all others.
        This method can only be called once per object.
@@ -415,7 +413,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # form a set of possible sub category values. To get a list of all values from all dataset, take the union
    intersect_init_sub_cat_values = {}
    for sub_cat_key in intersect_sub_cat_per_key:
-       val:
+       val: set[ObjectTypes] = set()
        for cat in categories:
            val.update(cat.init_sub_categories[key][sub_cat_key])
        intersect_init_sub_cat_values[sub_cat_key] = list(val)
@@ -425,7 +423,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # construction is not deterministic but guarantees for unique values in all sub categories. Now we build the
    # ensemble dict of sub categories where we guarantee unique values on one hand side and always maintain the
    # same arrangements for all category/ sub category lists
-   init_sub_cat:
+   init_sub_cat: dict[ObjectTypes, Any] = {}
    for category in categories:
        for cat in intersect_sub_cat_keys:
            for sub_cat_key in category.init_sub_categories[cat]:
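With the sharpened return types, `get_categories` yields either an `int -> ObjectTypes` or an `ObjectTypes -> int` mapping depending on `name_as_key`. A hedged sketch of how the typed `DatasetCategories` calls read in practice; the registry lookup and the concrete category names are assumptions borrowed from the DocLayNet section below:

# Hedged sketch of the DatasetCategories API typed above.
from deepdoctection.datasets import get_dataset

doclaynet = get_dataset("doclaynet")          # assumed registry name
cats = doclaynet.dataflow.categories

cats.filter_categories(["table", "text", "title"])                  # Union[TypeOrStr, list[TypeOrStr]]
name_to_id = cats.get_categories(name_as_key=True, filtered=True)   # Mapping[ObjectTypes, int]
id_to_name = cats.get_categories(filtered=True)                     # Mapping[int, ObjectTypes]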
deepdoctection/datasets/instances/doclaynet.py
CHANGED

@@ -25,19 +25,20 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations
 
 import os
 from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MapDataComponent, SerializerCoco
-from ...datapoint.annotation import CategoryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import add_summary, cat_to_sub_cat, filter_cat, filter_summary
 from ...mapper.cocostruct import coco_to_image
 from ...mapper.maputils import curry
-from ...utils.detection_types import JsonDict
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, SummaryType, TypeOrStr
+from ...utils.types import CocoDatapointDict
 from ..base import DatasetBase
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo
@@ -63,36 +64,36 @@ _DESCRIPTION = (
 _LICENSE = "CDLA-Permissive"
 _URL = "https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip"
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "DocLayNet_core"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "COCO/train.json", "val": "COCO/val.json", "test": "COCO/test.json"}
 _INIT_CATEGORIES = [
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION,
+    LayoutType.FOOTNOTE,
+    LayoutType.FORMULA,
+    LayoutType.LIST,
+    LayoutType.PAGE_FOOTER,
+    LayoutType.PAGE_HEADER,
+    LayoutType.FIGURE,
+    LayoutType.SECTION_HEADER,
+    LayoutType.TABLE,
+    LayoutType.TEXT,
+    LayoutType.TITLE,
 ]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]] = {
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FOOTNOTE: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FORMULA: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.LIST: {DatasetType.PUBLAYNET: [LayoutType.LIST]},
+    LayoutType.PAGE_FOOTER: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.PAGE_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.FIGURE: {DatasetType.PUBLAYNET: [LayoutType.FIGURE]},
+    LayoutType.SECTION_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.TABLE: {DatasetType.PUBLAYNET: [LayoutType.TABLE]},
+    LayoutType.TEXT: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.TITLE: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
 }
 
 
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-   def _builder(self) ->
+   def _builder(self) -> DocLayNetBuilder:
        return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -161,7 +162,7 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
                    filter_empty_image=True,
                    fake_score=fake_score,
                    coarse_mapping={1: 10, 2: 10, 3: 10, 4: 4, 5: 10, 6: 11, 7: 7, 8: 11, 9: 9, 10: 10, 11: 11},
-                   coarse_sub_cat_name=DatasetType.
+                   coarse_sub_cat_name=DatasetType.PUBLAYNET,
                ),
            )
 
@@ -185,14 +186,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
 
 
 _NAME_SEQ = "doclaynet-seq"
-_TYPE_SEQ = DatasetType.
+_TYPE_SEQ = DatasetType.SEQUENCE_CLASSIFICATION
 _INIT_CATEGORIES_SEQ = [
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
+    DocumentType.FINANCIAL_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.LAWS_AND_REGULATIONS,
+    DocumentType.GOVERNMENT_TENDERS,
+    DocumentType.MANUALS,
+    DocumentType.PATENTS,
 ]
 
 
@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)
 
-   def _builder(self) ->
+   def _builder(self) -> DocLayNetSeqBuilder:
        return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -244,22 +245,22 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        df = MapDataComponent(df, lambda dp: self.get_workdir() / "PNG" / dp, "file_name")
 
        @curry
-       def _map_to_image(dp:
+       def _map_to_image(dp: CocoDatapointDict, load_img: bool) -> Image:
            image = Image(location=dp["file_name"], file_name=os.path.split(dp["file_name"])[1])
            image.image = load_image_from_file(image.location)
-           summary =
+           summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
            label_to_category_name = {
-               "financial_reports": DocumentType.
-               "scientific_articles": DocumentType.
-               "laws_and_regulations": DocumentType.
-               "government_tenders": DocumentType.
-               "manuals": DocumentType.
-               "patents": DocumentType.
+               "financial_reports": DocumentType.FINANCIAL_REPORT,
+               "scientific_articles": DocumentType.SCIENTIFIC_PUBLICATION,
+               "laws_and_regulations": DocumentType.LAWS_AND_REGULATIONS,
+               "government_tenders": DocumentType.GOVERNMENT_TENDERS,
+               "manuals": DocumentType.MANUALS,
+               "patents": DocumentType.PATENTS,
            }
            categories_dict = self.categories.get_categories(init=True, name_as_key=True)
            category_name = label_to_category_name[dp["doc_category"]]
            summary.dump_sub_category(
-               PageType.
+               PageType.DOCUMENT_TYPE,
               CategoryAnnotation(category_name=category_name, category_id=categories_dict[category_name]),
           )
           image.summary = summary
@@ -273,15 +274,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        if self.categories.is_filtered():
            df = MapData(
                df,
-               filter_summary({PageType.
+               filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
            )
 
        @curry
-       def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr,
-           if dp.summary:
-
-
-               summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+       def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+           if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+               summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+               summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
            return dp
 
        df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
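`DocLayNetSeqBuilder` stores the document class as a `PageType.DOCUMENT_TYPE` sub-category of the page summary, which is exactly what `DatasetAdapter` reads back for sequence-classification statistics. A hedged sketch that streams the `doclaynet-seq` dataflow and reads that sub-category; the build kwargs (`split`, `load_image`) are assumptions:

# Hedged sketch: read the document type written into the summary above.
from deepdoctection.datasets import get_dataset
from deepdoctection.utils.settings import PageType

doclaynet_seq = get_dataset("doclaynet-seq")
df = doclaynet_seq.dataflow.build(split="val", load_image=True)  # build kwargs assumed
df.reset_state()

for dp in df:
    doc_type = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
    print(dp.file_name, doc_type.category_name, doc_type.category_id)
    break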