deepdoctection 0.32-py3-none-any.whl → 0.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/adapter.py

@@ -24,14 +24,14 @@ from typing import Any, Callable, Iterator, Mapping, Optional, Union

 from lazy_imports import try_import

-from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
+from ..dataflow import CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
-from ..utils.detection_types import DP, JsonDict
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
+from ..utils.types import DP, JsonDict
 from .registry import get_dataset

 with try_import() as import_guard:

@@ -57,6 +57,7 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         cache_dataset: bool,
         image_to_framework_func: Optional[Callable[[DP], Optional[JsonDict]]] = None,
         use_token_tag: bool = True,
+        number_repetitions: int = -1,
         **build_kwargs: str,
     ) -> None:
         """

@@ -69,6 +70,12 @@ class DatasetAdapter(IterableDataset):  # type: ignore
               `WordType.token_class`.
         :param build_kwargs: optional parameters for defining the dataflow.
         """
+        if number_repetitions == -1 and not cache_dataset:
+            raise ValueError(
+                "Number of repetitions cannot be infinite when not caching the dataset. Instead try to"
+                " set a high number of repetitions"
+            )
+
         if isinstance(name_or_dataset, str):
             self.dataset = get_dataset(name_or_dataset)
         else:

@@ -78,22 +85,22 @@ class DatasetAdapter(IterableDataset):  # type: ignore

         if cache_dataset:
             logger.info(LoggingRecord("Yielding dataflow into memory and create torch dataset"))
-            categories: Mapping[str, ObjectTypes] = {}
+            categories: Mapping[int, ObjectTypes] = {}
             _data_statistics = True
-            if self.dataset.dataset_info.type in (DatasetType.object_detection, DatasetType.sequence_classification):
+            if self.dataset.dataset_info.type in (DatasetType.OBJECT_DETECTION, DatasetType.SEQUENCE_CLASSIFICATION):
                 categories = self.dataset.dataflow.categories.get_categories(filtered=True)
-            elif self.dataset.dataset_info.type in (DatasetType.token_classification,):
+            elif self.dataset.dataset_info.type in (DatasetType.TOKEN_CLASSIFICATION,):
                 if use_token_tag:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.word,
-                        sub_categories={LayoutType.word: [WordType.token_tag]},
+                        categories=LayoutType.WORD,
+                        sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG]},
                         keys=False,
                         values_as_dict=True,
-                    )[LayoutType.word][WordType.token_tag]
+                    )[LayoutType.WORD][WordType.TOKEN_TAG]
                 else:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.word, sub_categories={LayoutType.word: [WordType.token_class]}, keys=False
-                    )[LayoutType.word][WordType.token_class]
+                        categories=LayoutType.WORD, sub_categories={LayoutType.WORD: [WordType.TOKEN_CLASS]}, keys=False
+                    )[LayoutType.WORD][WordType.TOKEN_CLASS]
             else:
                 logger.info(
                     LoggingRecord(f"dataset is of type {self.dataset.dataset_info.type}. Cannot generate statistics.")

@@ -121,19 +128,19 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                     "images when needed and reduce memory costs!!!",
                     "warn",
                 )
-                if self.dataset.dataset_info.type == DatasetType.object_detection:
+                if self.dataset.dataset_info.type == DatasetType.OBJECT_DETECTION:
                     anns = dp.get_annotation()
-                    cat_ids = [int(ann.category_id) for ann in anns]
+                    cat_ids = [ann.category_id for ann in anns]

-                elif self.dataset.dataset_info.type == DatasetType.sequence_classification:
-                    cat_ids = dp.summary.get_sub_category(PageType.document_type).category_id
+                elif self.dataset.dataset_info.type == DatasetType.SEQUENCE_CLASSIFICATION:
+                    cat_ids = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id

-                elif self.dataset.dataset_info.type == DatasetType.token_classification:
-                    anns = dp.get_annotation(category_names=LayoutType.word)
+                elif self.dataset.dataset_info.type == DatasetType.TOKEN_CLASSIFICATION:
+                    anns = dp.get_annotation(category_names=LayoutType.WORD)
                     if use_token_tag:
-                        cat_ids = [ann.get_sub_category(WordType.token_tag).category_id for ann in anns]
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_TAG).category_id for ann in anns]
                     else:
-                        cat_ids = [ann.get_sub_category(WordType.token_class).category_id for ann in anns]
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_CLASS).category_id for ann in anns]

                 if _data_statistics:
                     summarizer.dump(cat_ids)

@@ -144,14 +151,13 @@ class DatasetAdapter(IterableDataset):  # type: ignore
             if _data_statistics:
                 summarizer.print_summary_histogram()
             self.number_datapoints = len(datapoints)
+            if not self.number_datapoints:
+                raise ValueError("DatasetAdapter receives no datapoints. Please check your dataflow build config.")

             df = CustomDataFromList(datapoints, shuffle=True)
-            if not image_to_framework_func:
-                df = RepeatedData(df, -1)
-            else:
-                df_list = CacheData(df).get_cache()
-                df = CustomDataFromList(df_list, shuffle=True)
-                df = RepeatedData(df, -1)
+            df = RepeatedData(df, number_repetitions)
+        else:
+            df = RepeatedData(df, number_repetitions)

         if image_to_framework_func:
             df = MapData(df, image_to_framework_func)
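
The DatasetAdapter hunks above replace the hard-coded infinite RepeatedData with an explicit number_repetitions argument and now reject infinite repetition when the dataflow is not cached. A minimal sketch of how the new constructor might be called; the dataset name "publaynet" and the split build kwarg are illustrative, not taken from this diff:

    from deepdoctection.datasets.adapter import DatasetAdapter

    # number_repetitions is new in 0.34; -1 keeps the old "repeat forever" behaviour,
    # but -1 together with cache_dataset=False now raises a ValueError (see the hunk above).
    adapter = DatasetAdapter(
        name_or_dataset="publaynet",   # illustrative dataset name
        cache_dataset=True,
        number_repetitions=2,
        split="train",                 # illustrative build kwarg
    )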
deepdoctection/datasets/base.py

@@ -27,15 +27,15 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from inspect import signature
 from pathlib import Path
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+from typing import Any, Mapping, Optional, Sequence, Type, Union

 import numpy as np

 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
 from ..datapoint.image import Image
-from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import PathLikeOrStr
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories

@@ -138,14 +138,14 @@ class SplitDataFlow(DataFlowBaseBuilder):
     Dataflow builder for splitting datasets
     """

-    def __init__(self, train: List[Image], val: List[Image], test: Optional[List[Image]]):
+    def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
         """
         :param train: Cached train split
         :param val: Cached val split
         :param test: Cached test split
         """
         super().__init__(location="")
-        self.split_cache: Dict[str, List[Image]]
+        self.split_cache: dict[str, list[Image]]
         if test is None:
             self.split_cache = {"train": train, "val": val}
         else:

@@ -215,8 +215,8 @@ class MergeDataset(DatasetBase):
         :param datasets: An arbitrary number of datasets
         """
         self.datasets = datasets
-        self.dataflows: Optional[Tuple[DataFlow, ...]] = None
-        self.datapoint_list: Optional[List[Image]] = None
+        self.dataflows: Optional[tuple[DataFlow, ...]] = None
+        self.datapoint_list: Optional[list[Image]] = None
         super().__init__()
         self._dataset_info.type = datasets[0].dataset_info.type
         self._dataset_info.name = "merge_" + "_".join([dataset.dataset_info.name for dataset in self.datasets])

@@ -239,7 +239,7 @@
     def __init__(self, *dataflow_builders: DataFlowBaseBuilder):
         super().__init__("")
         self.dataflow_builders = dataflow_builders
-        self.dataflows: Optional[Tuple[DataFlow, ...]] = None
+        self.dataflows: Optional[tuple[DataFlow, ...]] = None

     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """

@@ -327,7 +327,7 @@ class MergeDataset(DatasetBase):
         self._dataflow_builder = SplitDataFlow(train_dataset, val_dataset, test_dataset)
         self._dataflow_builder.categories = self._categories()

-    def get_ids_by_split(self) -> Dict[str, List[str]]:
+    def get_ids_by_split(self) -> dict[str, list[str]]:
         """
         To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
         the image ids contained in the split.

@@ -389,7 +389,7 @@ class CustomDataset(DatasetBase):
         self,
         name: str,
         dataset_type: TypeOrStr,
-        location: Pathlike,
+        location: PathLikeOrStr,
         init_categories: Sequence[ObjectTypes],
         dataflow_builder: Type[DataFlowBaseBuilder],
         init_sub_categories: Optional[Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]] = None,

@@ -451,7 +451,7 @@
         return self.dataflow_builder

     @staticmethod
-    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
+    def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
         """
         This static method creates a CustomDataset instance from a dataset card.
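
The base.py changes are mostly typing modernisation: typing.Dict/List/Tuple/Set give way to built-in generics, and the removed Pathlike alias is replaced by PathLikeOrStr from the new utils.types module. One user-visible effect is that CustomDataset.from_dataset_card is now annotated to accept any path-like value, not only a str. A hedged sketch, in which the dataset card path and the builder class are placeholders:

    from pathlib import Path

    from deepdoctection.dataflow import DataFlow
    from deepdoctection.datasets.base import CustomDataset
    from deepdoctection.datasets.dataflow_builder import DataFlowBaseBuilder

    class MyBuilder(DataFlowBaseBuilder):        # placeholder dataflow builder
        def build(self, **kwargs) -> DataFlow:
            raise NotImplementedError            # the real builder would return a DataFlow here

    # file_path may now be a Path object, matching the PathLikeOrStr annotation
    dataset = CustomDataset.from_dataset_card(
        file_path=Path("my_dataset/card.json"),  # placeholder path
        dataflow_builder=MyBuilder,
    )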
 
deepdoctection/datasets/dataflow_builder.py

@@ -24,8 +24,8 @@ from pathlib import Path
 from typing import Mapping, Optional, Sequence, Union

 from ..dataflow import DataFlow
-from ..utils.detection_types import Pathlike
 from ..utils.fs import get_dataset_dir_path
+from ..utils.types import PathLikeOrStr
 from .info import DatasetCategories

@@ -44,7 +44,7 @@ class DataFlowBaseBuilder(ABC):

     def __init__(
         self,
-        location: Pathlike,
+        location: PathLikeOrStr,
         annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
     ):
         """

@@ -100,7 +100,7 @@

         :return: local workdir
         """
-        return get_dataset_dir_path() / self.location
+        return Path(get_dataset_dir_path()) / self.location

     @abstractmethod
     def build(self, **kwargs: Union[str, int]) -> DataFlow:
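
The only behavioural change in dataflow_builder.py is that get_workdir now wraps get_dataset_dir_path() in Path() before joining self.location, so the join also works when the dataset directory comes back as a plain string. A tiny illustration of the changed expression; the directory value is an assumption, not taken from this diff:

    from pathlib import Path

    dataset_dir = "/home/user/.cache/deepdoctection/datasets"  # assumed return value of get_dataset_dir_path()
    workdir = Path(dataset_dir) / "DocLayNet_core"             # mirrors Path(get_dataset_dir_path()) / self.location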
deepdoctection/datasets/info.py

@@ -22,7 +22,7 @@ Module for storing dataset info (e.g. general meta data or categories)
 from copy import copy
 from dataclasses import dataclass, field
 from itertools import chain
-from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Set, Union, no_type_check, overload
+from typing import Any, Literal, Mapping, Optional, Sequence, Union, no_type_check, overload

 from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once

@@ -31,25 +31,25 @@ __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]


 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> Dict[ObjectTypes, str]:
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> dict[ObjectTypes, int]:
     ...


 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> Dict[str, ObjectTypes]:
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> dict[int, ObjectTypes]:
     ...


 @overload
 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = ...
-) -> Union[Dict[ObjectTypes, str], Dict[str, ObjectTypes]]:
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     ...


 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = 1
-) -> Union[Dict[ObjectTypes, str], Dict[str, ObjectTypes]]:
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     """
     Converts a list into a dict, where keys/values are the list indices.

@@ -59,8 +59,8 @@ def _get_dict(
     :return: A dictionary of list indices/list elements.
     """
     if name_as_key:
-        return {v: str(k) for k, v in enumerate(l, starts_with)}
-    return {str(k): v for k, v in enumerate(l, starts_with)}
+        return {v: k for k, v in enumerate(l, starts_with)}
+    return dict(enumerate(l, starts_with))


 @dataclass

@@ -89,7 +89,7 @@ class DatasetInfo:
     license: str = field(default="")
     url: Union[str, Sequence[str]] = field(default="")
     splits: Mapping[str, str] = field(default_factory=dict)
-    type: DatasetType = field(default=DatasetType.default)
+    type: DatasetType = field(default=DatasetType.DEFAULT)

     def get_split(self, key: str) -> str:
         """

@@ -143,13 +143,13 @@ class DatasetCategories:
     @overload
     def get_categories(
         self, *, name_as_key: Literal[True], init: bool = ..., filtered: bool = ...
-    ) -> Mapping[ObjectTypes, str]:
+    ) -> Mapping[ObjectTypes, int]:
         ...

     @overload
     def get_categories(
         self, *, name_as_key: Literal[False] = ..., init: bool = ..., filtered: bool = ...
-    ) -> Mapping[str, ObjectTypes]:
+    ) -> Mapping[int, ObjectTypes]:
         ...

     @overload

@@ -161,12 +161,12 @@
     @overload
     def get_categories(
         self, as_dict: Literal[True] = ..., name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Mapping[ObjectTypes, str], Mapping[str, ObjectTypes]]:
+    ) -> Union[Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
         ...

     def get_categories(
         self, as_dict: bool = True, name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, str], Mapping[str, ObjectTypes]]:
+    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
         """
         Get categories of a dataset. The returned value also respects modifications of the inventory like filtered
         categories of replaced categories with sub categories. However, you must correctly pass arguments to return the

@@ -229,7 +229,7 @@
         if sub_categories is None:
             sub_categories = {}

-        sub_cat: Dict[ObjectTypes, Union[ObjectTypes, List[ObjectTypes]]] = {}
+        sub_cat: dict[ObjectTypes, Union[ObjectTypes, list[ObjectTypes]]] = {}
         for cat in _categories:
             assert cat in self.get_categories(  # pylint: disable=E1135
                 as_dict=False, filtered=True

@@ -254,9 +254,9 @@
         for category, value in sub_cat.items():
             if category not in sub_categories:
                 continue
-            sub_cat_tmp: Dict[str, Union[Dict[str, str], Sequence[str]]] = {}
+            sub_cat_tmp: dict[str, Union[dict[int, ObjectTypes], dict[ObjectTypes, int], Sequence[str]]] = {}
             sub_categories_list: Union[
-                ObjectTypes, str, List[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
+                ObjectTypes, str, list[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
             ]
             if isinstance(sub_categories[category], ObjectTypes):
                 sub_categories_list = [sub_categories[category]]

@@ -267,14 +267,12 @@
                     continue
                 if values_as_dict:
                     if not name_as_key:
-                        sub_cat_tmp[sub_cat_key] = {
-                            str(k): v
-                            for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
-                        }
+                        sub_cat_tmp[sub_cat_key] = dict(
+                            enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                        )
                     else:
                         sub_cat_tmp[sub_cat_key] = {
-                            v: str(k)
-                            for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                            v: k for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
                         }
                 else:
                     sub_cat_tmp[sub_cat_key] = self.init_sub_categories[category][get_type(sub_cat_key)]

@@ -284,7 +282,7 @@
         return sub_cat

     @call_only_once
-    def set_cat_to_sub_cat(self, cat_to_sub_cat: Dict[TypeOrStr, TypeOrStr]) -> None:
+    def set_cat_to_sub_cat(self, cat_to_sub_cat: dict[TypeOrStr, TypeOrStr]) -> None:
         """
         Change category representation if sub-categories are available. Pass a dictionary of the main category
         and the requested sub-category. This will change the dictionary of categories and the category names

@@ -323,7 +321,7 @@
         self._categories_update = _categories_update_list

     @call_only_once
-    def filter_categories(self, categories: Union[TypeOrStr, List[TypeOrStr]]) -> None:
+    def filter_categories(self, categories: Union[TypeOrStr, list[TypeOrStr]]) -> None:
         """
         Filter categories of a dataset. This will keep all the categories chosen and remove all others.
         This method can only be called once per object.

@@ -415,7 +413,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
         # form a set of possible sub category values. To get a list of all values from all dataset, take the union
         intersect_init_sub_cat_values = {}
         for sub_cat_key in intersect_sub_cat_per_key:
-            val: Set[ObjectTypes] = set()
+            val: set[ObjectTypes] = set()
             for cat in categories:
                 val.update(cat.init_sub_categories[key][sub_cat_key])
             intersect_init_sub_cat_values[sub_cat_key] = list(val)

@@ -425,7 +423,7 @@
     # construction is not deterministic but guarantees for unique values in all sub categories. Now we build the
     # ensemble dict of sub categories where we guarantee unique values on one hand side and always maintain the
     # same arrangements for all category/ sub category lists
-    init_sub_cat: Dict[ObjectTypes, Any] = {}
+    init_sub_cat: dict[ObjectTypes, Any] = {}
     for category in categories:
         for cat in intersect_sub_cat_keys:
             for sub_cat_key in category.init_sub_categories[cat]:
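
Throughout info.py the category ids switch from string ids ("1", "2", ...) to plain ints: _get_dict now enumerates directly into dict[int, ObjectTypes] or dict[ObjectTypes, int], and the get_categories and get_sub_categories overloads are retyped to match. A hedged sketch of the retyped mappings, assuming `categories` is a DatasetCategories instance built from the DocLayNet layout types listed further down in this diff:

    # id -> name, the default direction (was Mapping[str, ObjectTypes] in 0.32)
    id_to_name = categories.get_categories(filtered=True)
    # e.g. {1: LayoutType.CAPTION, 2: LayoutType.FOOTNOTE, ...}

    # name -> id (was Mapping[ObjectTypes, str] in 0.32)
    name_to_id = categories.get_categories(filtered=True, name_as_key=True)
    # e.g. {LayoutType.CAPTION: 1, LayoutType.FOOTNOTE: 2, ...}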
deepdoctection/datasets/instances/doclaynet.py

@@ -31,14 +31,14 @@ import os
 from typing import Mapping, Sequence, Union

 from ...dataflow import DataFlow, MapData, MapDataComponent, SerializerCoco
-from ...datapoint.annotation import CategoryAnnotation, SummaryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import add_summary, cat_to_sub_cat, filter_cat, filter_summary
 from ...mapper.cocostruct import coco_to_image
 from ...mapper.maputils import curry
-from ...utils.detection_types import JsonDict
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, SummaryType, TypeOrStr
+from ...utils.types import CocoDatapointDict
 from ..base import DatasetBase
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo

@@ -64,36 +64,36 @@ _DESCRIPTION = (
 _LICENSE = "CDLA-Permissive"
 _URL = "https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip"
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION

 _LOCATION = "DocLayNet_core"

 _ANNOTATION_FILES: Mapping[str, str] = {"train": "COCO/train.json", "val": "COCO/val.json", "test": "COCO/test.json"}
 _INIT_CATEGORIES = [
-    LayoutType.caption,
-    LayoutType.footnote,
-    LayoutType.formula,
-    LayoutType.list,
-    LayoutType.page_footer,
-    LayoutType.page_header,
-    LayoutType.figure,
-    LayoutType.section_header,
-    LayoutType.table,
-    LayoutType.text,
-    LayoutType.title,
+    LayoutType.CAPTION,
+    LayoutType.FOOTNOTE,
+    LayoutType.FORMULA,
+    LayoutType.LIST,
+    LayoutType.PAGE_FOOTER,
+    LayoutType.PAGE_HEADER,
+    LayoutType.FIGURE,
+    LayoutType.SECTION_HEADER,
+    LayoutType.TABLE,
+    LayoutType.TEXT,
+    LayoutType.TITLE,
 ]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]] = {
-    LayoutType.caption: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.footnote: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.formula: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.list: {DatasetType.publaynet: [LayoutType.list]},
-    LayoutType.page_footer: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.page_header: {DatasetType.publaynet: [LayoutType.title]},
-    LayoutType.figure: {DatasetType.publaynet: [LayoutType.figure]},
-    LayoutType.section_header: {DatasetType.publaynet: [LayoutType.title]},
-    LayoutType.table: {DatasetType.publaynet: [LayoutType.table]},
-    LayoutType.text: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.title: {DatasetType.publaynet: [LayoutType.title]},
+    LayoutType.CAPTION: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FOOTNOTE: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FORMULA: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.LIST: {DatasetType.PUBLAYNET: [LayoutType.LIST]},
+    LayoutType.PAGE_FOOTER: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.PAGE_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.FIGURE: {DatasetType.PUBLAYNET: [LayoutType.FIGURE]},
+    LayoutType.SECTION_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.TABLE: {DatasetType.PUBLAYNET: [LayoutType.TABLE]},
+    LayoutType.TEXT: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.TITLE: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
 }

@@ -162,7 +162,7 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
                 filter_empty_image=True,
                 fake_score=fake_score,
                 coarse_mapping={1: 10, 2: 10, 3: 10, 4: 4, 5: 10, 6: 11, 7: 7, 8: 11, 9: 9, 10: 10, 11: 11},
-                coarse_sub_cat_name=DatasetType.publaynet,
+                coarse_sub_cat_name=DatasetType.PUBLAYNET,
             ),
         )

@@ -186,14 +186,14 @@


 _NAME_SEQ = "doclaynet-seq"
-_TYPE_SEQ = DatasetType.sequence_classification
+_TYPE_SEQ = DatasetType.SEQUENCE_CLASSIFICATION
 _INIT_CATEGORIES_SEQ = [
-    DocumentType.financial_report,
-    DocumentType.scientific_publication,
-    DocumentType.laws_and_regulations,
-    DocumentType.government_tenders,
-    DocumentType.manuals,
-    DocumentType.patents,
+    DocumentType.FINANCIAL_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.LAWS_AND_REGULATIONS,
+    DocumentType.GOVERNMENT_TENDERS,
+    DocumentType.MANUALS,
+    DocumentType.PATENTS,
 ]

@@ -245,22 +245,22 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
         df = MapDataComponent(df, lambda dp: self.get_workdir() / "PNG" / dp, "file_name")

         @curry
-        def _map_to_image(dp: JsonDict, load_img: bool) -> Image:
+        def _map_to_image(dp: CocoDatapointDict, load_img: bool) -> Image:
             image = Image(location=dp["file_name"], file_name=os.path.split(dp["file_name"])[1])
             image.image = load_image_from_file(image.location)
-            summary = SummaryAnnotation()
+            summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
             label_to_category_name = {
-                "financial_reports": DocumentType.financial_report,
-                "scientific_articles": DocumentType.scientific_publication,
-                "laws_and_regulations": DocumentType.laws_and_regulations,
-                "government_tenders": DocumentType.government_tenders,
-                "manuals": DocumentType.manuals,
-                "patents": DocumentType.patents,
+                "financial_reports": DocumentType.FINANCIAL_REPORT,
+                "scientific_articles": DocumentType.SCIENTIFIC_PUBLICATION,
+                "laws_and_regulations": DocumentType.LAWS_AND_REGULATIONS,
+                "government_tenders": DocumentType.GOVERNMENT_TENDERS,
+                "manuals": DocumentType.MANUALS,
+                "patents": DocumentType.PATENTS,
             }
             categories_dict = self.categories.get_categories(init=True, name_as_key=True)
             category_name = label_to_category_name[dp["doc_category"]]
             summary.dump_sub_category(
-                PageType.document_type,
+                PageType.DOCUMENT_TYPE,
                 CategoryAnnotation(category_name=category_name, category_id=categories_dict[category_name]),
             )
             image.summary = summary

@@ -274,15 +274,14 @@
         if self.categories.is_filtered():
             df = MapData(
                 df,
-                filter_summary({PageType.document_type: self.categories.get_categories(as_dict=False, filtered=True)}),
+                filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
             )

         @curry
-        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, str]) -> Image:
-            if dp.summary:
-                if PageType.document_type in dp.summary.sub_categories:
-                    summary_cat = dp.summary.get_sub_category(PageType.document_type)
-                    summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+            if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+                summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+                summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
             return dp

         df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
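
The doclaynet.py hunks show two release-wide API shifts in one place: ObjectTypes enum members are now spelled in upper case (LayoutType.TEXT, DatasetType.OBJECT_DETECTION, ...), and the removed SummaryAnnotation class is replaced by a plain CategoryAnnotation carrying SummaryType.SUMMARY. A minimal sketch of the new summary construction, following the added lines above; the concrete document type, category id and file name are illustrative:

    from deepdoctection.datapoint.annotation import CategoryAnnotation
    from deepdoctection.datapoint.image import Image
    from deepdoctection.utils.settings import DocumentType, PageType, SummaryType

    image = Image(location="page.png", file_name="page.png")  # illustrative datapoint

    # 0.32 used: summary = SummaryAnnotation()
    summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
    summary.dump_sub_category(
        PageType.DOCUMENT_TYPE,
        CategoryAnnotation(category_name=DocumentType.PATENTS, category_id=6),  # illustrative id
    )
    image.summary = summary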