deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """
 
-from ..utils.file_utils import pytorch_available
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.
 
 from typing import Any, Callable, Iterator, Mapping, Optional, Union
 
-from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
+from lazy_imports import try_import
+
+from ..dataflow import CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
-from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
+from ..utils.types import DP, JsonDict
 from .registry import get_dataset
 
-if pytorch_available():
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore
 
 
 class DatasetAdapter(IterableDataset):  # type: ignore
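The `pytorch_available()` switch is replaced by the `lazy_imports` guard, so the module now imports cleanly even when torch is missing and only falls back to a mock base class. A minimal sketch of the pattern, assuming only the `try_import` API used in the hunk above; the fallback class here is a placeholder for `deepdoctection.utils.mocks.IterableDataset`:

from lazy_imports import try_import

with try_import() as torch_guard:
    # heavy optional dependency; an ImportError is swallowed by the guard
    from torch.utils.data import IterableDataset

if not torch_guard.is_successful():
    # stand-in base class so class definitions below still work without torch
    class IterableDataset:  # type: ignore
        pass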
@@ -54,6 +57,7 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         cache_dataset: bool,
         image_to_framework_func: Optional[Callable[[DP], Optional[JsonDict]]] = None,
         use_token_tag: bool = True,
+        number_repetitions: int = -1,
         **build_kwargs: str,
     ) -> None:
         """
@@ -66,6 +70,12 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                          `WordType.token_class`.
         :param build_kwargs: optional parameters for defining the dataflow.
         """
+        if number_repetitions == -1 and not cache_dataset:
+            raise ValueError(
+                "Number of repetitions cannot be infinite when not caching the dataset. Instead try to"
+                " set a high number of repetitions"
+            )
+
         if isinstance(name_or_dataset, str):
             self.dataset = get_dataset(name_or_dataset)
         else:
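The constructor now exposes how often the cached dataflow is repeated, and an infinite stream (`-1`) is only allowed together with caching. A hypothetical call for illustration only; the parameter names come from the hunk above, while the dataset name and the `split` build kwarg are assumptions:

from deepdoctection.datasets import DatasetAdapter  # re-exported after the __init__ change above

adapter = DatasetAdapter(
    name_or_dataset="doclaynet",   # assumed registry name
    cache_dataset=True,            # caching is required for an infinite stream
    number_repetitions=-1,         # -1 repeats the cached datapoints indefinitely
    use_token_tag=True,
    split="train",                 # assumed build kwarg forwarded to the dataflow builder
)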
@@ -75,22 +85,22 @@ class DatasetAdapter(IterableDataset):  # type: ignore
 
         if cache_dataset:
             logger.info(LoggingRecord("Yielding dataflow into memory and create torch dataset"))
-            categories: Mapping[str, ObjectTypes] = {}
+            categories: Mapping[int, ObjectTypes] = {}
             _data_statistics = True
-            if self.dataset.dataset_info.type in (DatasetType.object_detection, DatasetType.sequence_classification):
+            if self.dataset.dataset_info.type in (DatasetType.OBJECT_DETECTION, DatasetType.SEQUENCE_CLASSIFICATION):
                 categories = self.dataset.dataflow.categories.get_categories(filtered=True)
-            elif self.dataset.dataset_info.type in (DatasetType.token_classification,):
+            elif self.dataset.dataset_info.type in (DatasetType.TOKEN_CLASSIFICATION,):
                 if use_token_tag:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.word,
-                        sub_categories={LayoutType.word: [WordType.token_tag]},
+                        categories=LayoutType.WORD,
+                        sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG]},
                         keys=False,
                         values_as_dict=True,
-                    )[LayoutType.word][WordType.token_tag]
+                    )[LayoutType.WORD][WordType.TOKEN_TAG]
                 else:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.word, sub_categories={LayoutType.word: [WordType.token_class]}, keys=False
-                    )[LayoutType.word][WordType.token_class]
+                        categories=LayoutType.WORD, sub_categories={LayoutType.WORD: [WordType.TOKEN_CLASS]}, keys=False
+                    )[LayoutType.WORD][WordType.TOKEN_CLASS]
             else:
                 logger.info(
                     LoggingRecord(f"dataset is of type {self.dataset.dataset_info.type}. Cannot generate statistics.")
@@ -118,19 +128,19 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                        "images when needed and reduce memory costs!!!",
                        "warn",
                    )
-                if self.dataset.dataset_info.type == DatasetType.object_detection:
+                if self.dataset.dataset_info.type == DatasetType.OBJECT_DETECTION:
                     anns = dp.get_annotation()
-                    cat_ids = [int(ann.category_id) for ann in anns]
+                    cat_ids = [ann.category_id for ann in anns]
 
-                elif self.dataset.dataset_info.type == DatasetType.sequence_classification:
-                    cat_ids = dp.summary.get_sub_category(PageType.document_type).category_id
+                elif self.dataset.dataset_info.type == DatasetType.SEQUENCE_CLASSIFICATION:
+                    cat_ids = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id
 
-                elif self.dataset.dataset_info.type == DatasetType.token_classification:
-                    anns = dp.get_annotation(category_names=LayoutType.word)
+                elif self.dataset.dataset_info.type == DatasetType.TOKEN_CLASSIFICATION:
+                    anns = dp.get_annotation(category_names=LayoutType.WORD)
                     if use_token_tag:
-                        cat_ids = [ann.get_sub_category(WordType.token_tag).category_id for ann in anns]
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_TAG).category_id for ann in anns]
                     else:
-                        cat_ids = [ann.get_sub_category(WordType.token_class).category_id for ann in anns]
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_CLASS).category_id for ann in anns]
 
                 if _data_statistics:
                     summarizer.dump(cat_ids)
@@ -141,14 +151,13 @@ class DatasetAdapter(IterableDataset):  # type: ignore
             if _data_statistics:
                 summarizer.print_summary_histogram()
             self.number_datapoints = len(datapoints)
+            if not self.number_datapoints:
+                raise ValueError("DatasetAdapter receives no datapoints. Please check your dataflow build config.")
 
             df = CustomDataFromList(datapoints, shuffle=True)
-            if not image_to_framework_func:
-                df = RepeatedData(df, -1)
-            else:
-                df_list = CacheData(df).get_cache()
-                df = CustomDataFromList(df_list, shuffle=True)
-                df = RepeatedData(df, -1)
+            df = RepeatedData(df, number_repetitions)
+        else:
+            df = RepeatedData(df, number_repetitions)
 
         if image_to_framework_func:
             df = MapData(df, image_to_framework_func)
deepdoctection/datasets/base.py

@@ -18,6 +18,8 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations
+
 import json
 import os
 import pprint
@@ -25,15 +27,15 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from inspect import signature
 from pathlib import Path
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+from typing import Any, Mapping, Optional, Sequence, Type, Union
 
 import numpy as np
 
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
 from ..datapoint.image import Image
-from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import PathLikeOrStr
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories
 
@@ -136,14 +138,14 @@ class SplitDataFlow(DataFlowBaseBuilder):
     Dataflow builder for splitting datasets
     """
 
-    def __init__(self, train: List[Image], val: List[Image], test: Optional[List[Image]]):
+    def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
         """
         :param train: Cached train split
         :param val: Cached val split
         :param test: Cached test split
         """
         super().__init__(location="")
-        self.split_cache: Dict[str, List[Image]]
+        self.split_cache: dict[str, list[Image]]
         if test is None:
             self.split_cache = {"train": train, "val": val}
         else:
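The `List`/`Dict`/`Tuple` to `list`/`dict`/`tuple` swaps throughout this file only work on older interpreters because of the `from __future__ import annotations` added at the top of the module: annotations are then stored as strings and never evaluated at runtime (PEP 563). A small self-contained sketch of the idea, with illustrative names:

from __future__ import annotations  # postpone evaluation of annotations

from typing import Optional


def make_split(train: list[str], test: Optional[list[str]] = None) -> dict[str, list[str]]:
    # list[str] / dict[...] in annotations are fine even before Python 3.9,
    # because the annotation is never evaluated here
    return {"train": train, "test": test or []}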
@@ -213,8 +215,8 @@ class MergeDataset(DatasetBase):
         :param datasets: An arbitrary number of datasets
         """
         self.datasets = datasets
-        self.dataflows: Optional[Tuple[DataFlow, ...]] = None
-        self.datapoint_list: Optional[List[Image]] = None
+        self.dataflows: Optional[tuple[DataFlow, ...]] = None
+        self.datapoint_list: Optional[list[Image]] = None
         super().__init__()
         self._dataset_info.type = datasets[0].dataset_info.type
         self._dataset_info.name = "merge_" + "_".join([dataset.dataset_info.name for dataset in self.datasets])
@@ -237,7 +239,7 @@ class MergeDataset(DatasetBase):
        def __init__(self, *dataflow_builders: DataFlowBaseBuilder):
            super().__init__("")
            self.dataflow_builders = dataflow_builders
-            self.dataflows: Optional[Tuple[DataFlow, ...]] = None
+            self.dataflows: Optional[tuple[DataFlow, ...]] = None
 
        def build(self, **kwargs: Union[str, int]) -> DataFlow:
            """
@@ -325,7 +327,7 @@ class MergeDataset(DatasetBase):
         self._dataflow_builder = SplitDataFlow(train_dataset, val_dataset, test_dataset)
         self._dataflow_builder.categories = self._categories()
 
-    def get_ids_by_split(self) -> Dict[str, List[str]]:
+    def get_ids_by_split(self) -> dict[str, list[str]]:
         """
         To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
         the image ids contained in the split.
@@ -387,7 +389,7 @@ class CustomDataset(DatasetBase):
         self,
         name: str,
         dataset_type: TypeOrStr,
-        location: Pathlike,
+        location: PathLikeOrStr,
         init_categories: Sequence[ObjectTypes],
         dataflow_builder: Type[DataFlowBaseBuilder],
         init_sub_categories: Optional[Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]] = None,
@@ -423,7 +425,7 @@ class CustomDataset(DatasetBase):
         """
 
         self.name = name
-        self.type = get_type(dataset_type)
+        self.type: DatasetType = get_type(dataset_type)  # type: ignore
         self.location = location
         self.init_categories = init_categories
         if init_sub_categories is None:
@@ -449,7 +451,7 @@ class CustomDataset(DatasetBase):
         return self.dataflow_builder
 
     @staticmethod
-    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> "CustomDataset":
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
         """
         This static method creates a CustomDataset instance from a dataset card.
 
deepdoctection/datasets/dataflow_builder.py

@@ -24,8 +24,8 @@ from pathlib import Path
 from typing import Mapping, Optional, Sequence, Union
 
 from ..dataflow import DataFlow
-from ..utils.detection_types import Pathlike
 from ..utils.fs import get_dataset_dir_path
+from ..utils.types import PathLikeOrStr
 from .info import DatasetCategories
 
 
@@ -44,7 +44,7 @@ class DataFlowBaseBuilder(ABC):
 
     def __init__(
         self,
-        location: Pathlike,
+        location: PathLikeOrStr,
         annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
     ):
         """
@@ -100,7 +100,7 @@ class DataFlowBaseBuilder(ABC):
 
         :return: local workdir
         """
-        return get_dataset_dir_path() / self.location
+        return Path(get_dataset_dir_path()) / self.location
 
     @abstractmethod
     def build(self, **kwargs: Union[str, int]) -> DataFlow:
deepdoctection/datasets/info.py

@@ -22,34 +22,34 @@ Module for storing dataset info (e.g. general meta data or categories)
 from copy import copy
 from dataclasses import dataclass, field
 from itertools import chain
-from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Set, Union, no_type_check, overload
+from typing import Any, Literal, Mapping, Optional, Sequence, Union, no_type_check, overload
 
-from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once
 
 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> Dict[ObjectTypes, str]:
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> dict[ObjectTypes, int]:
     ...
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> Dict[str, ObjectTypes]:
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> dict[int, ObjectTypes]:
     ...
 
 
 @overload
 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = ...
-) -> Union[Dict[ObjectTypes, str], Dict[str, ObjectTypes]]:
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     ...
 
 
 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = 1
-) -> Union[Dict[ObjectTypes, str], Dict[str, ObjectTypes]]:
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     """
     Converts a list into a dict, where keys/values are the list indices.
 
@@ -59,8 +59,8 @@ def _get_dict(
     :return: A dictionary of list indices/list elements.
     """
     if name_as_key:
-        return {v: str(k) for k, v in enumerate(l, starts_with)}
-    return {str(k): v for k, v in enumerate(l, starts_with)}
+        return {v: k for k, v in enumerate(l, starts_with)}
+    return dict(enumerate(l, starts_with))
 
 
 @dataclass
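The net effect across info.py is that category ids are now plain ints rather than stringified indices. A quick illustration with a stand-in enum (the real ObjectTypes members live in utils/settings.py, which this hunk does not show):

from enum import Enum


class Layout(str, Enum):  # stand-in for an ObjectTypes enum, for illustration only
    TABLE = "table"
    TEXT = "text"


def _get_dict(l, name_as_key, starts_with=1):
    # same body as the 0.33 implementation above
    if name_as_key:
        return {v: k for k, v in enumerate(l, starts_with)}
    return dict(enumerate(l, starts_with))


print(_get_dict([Layout.TABLE, Layout.TEXT], name_as_key=False))
# 0.33: {1: <Layout.TABLE: 'table'>, 2: <Layout.TEXT: 'text'>}
# 0.31 returned string keys instead: {'1': ..., '2': ...}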
@@ -89,7 +89,7 @@ class DatasetInfo:
    license: str = field(default="")
    url: Union[str, Sequence[str]] = field(default="")
    splits: Mapping[str, str] = field(default_factory=dict)
-    type: ObjectTypes = field(default=DefaultType.default_type)
+    type: DatasetType = field(default=DatasetType.DEFAULT)
 
    def get_split(self, key: str) -> str:
        """
@@ -143,13 +143,13 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, *, name_as_key: Literal[True], init: bool = ..., filtered: bool = ...
-    ) -> Mapping[ObjectTypes, str]:
+    ) -> Mapping[ObjectTypes, int]:
        ...
 
    @overload
    def get_categories(
        self, *, name_as_key: Literal[False] = ..., init: bool = ..., filtered: bool = ...
-    ) -> Mapping[str, ObjectTypes]:
+    ) -> Mapping[int, ObjectTypes]:
        ...
 
    @overload
@@ -161,12 +161,12 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, as_dict: Literal[True] = ..., name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Mapping[ObjectTypes, str], Mapping[str, ObjectTypes]]:
+    ) -> Union[Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        ...
 
    def get_categories(
        self, as_dict: bool = True, name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, str], Mapping[str, ObjectTypes]]:
+    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        """
        Get categories of a dataset. The returned value also respects modifications of the inventory like filtered
        categories of replaced categories with sub categories. However, you must correctly pass arguments to return the
@@ -229,7 +229,7 @@ class DatasetCategories:
        if sub_categories is None:
            sub_categories = {}
 
-        sub_cat: Dict[ObjectTypes, Union[ObjectTypes, List[ObjectTypes]]] = {}
+        sub_cat: dict[ObjectTypes, Union[ObjectTypes, list[ObjectTypes]]] = {}
        for cat in _categories:
            assert cat in self.get_categories(  # pylint: disable=E1135
                as_dict=False, filtered=True
@@ -254,9 +254,9 @@ class DatasetCategories:
        for category, value in sub_cat.items():
            if category not in sub_categories:
                continue
-            sub_cat_tmp: Dict[str, Union[Dict[str, str], Sequence[str]]] = {}
+            sub_cat_tmp: dict[str, Union[dict[int, ObjectTypes], dict[ObjectTypes, int], Sequence[str]]] = {}
            sub_categories_list: Union[
-                ObjectTypes, str, List[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
+                ObjectTypes, str, list[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
            ]
            if isinstance(sub_categories[category], ObjectTypes):
                sub_categories_list = [sub_categories[category]]
@@ -267,14 +267,12 @@ class DatasetCategories:
                    continue
                if values_as_dict:
                    if not name_as_key:
-                        sub_cat_tmp[sub_cat_key] = {
-                            str(k): v
-                            for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
-                        }
+                        sub_cat_tmp[sub_cat_key] = dict(
+                            enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                        )
                    else:
                        sub_cat_tmp[sub_cat_key] = {
-                            v: str(k)
-                            for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                            v: k for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
                        }
                else:
                    sub_cat_tmp[sub_cat_key] = self.init_sub_categories[category][get_type(sub_cat_key)]
@@ -284,7 +282,7 @@ class DatasetCategories:
        return sub_cat
 
    @call_only_once
-    def set_cat_to_sub_cat(self, cat_to_sub_cat: Dict[TypeOrStr, TypeOrStr]) -> None:
+    def set_cat_to_sub_cat(self, cat_to_sub_cat: dict[TypeOrStr, TypeOrStr]) -> None:
        """
        Change category representation if sub-categories are available. Pass a dictionary of the main category
        and the requested sub-category. This will change the dictionary of categories and the category names
@@ -323,7 +321,7 @@ class DatasetCategories:
        self._categories_update = _categories_update_list
 
    @call_only_once
-    def filter_categories(self, categories: Union[TypeOrStr, List[TypeOrStr]]) -> None:
+    def filter_categories(self, categories: Union[TypeOrStr, list[TypeOrStr]]) -> None:
        """
        Filter categories of a dataset. This will keep all the categories chosen and remove all others.
        This method can only be called once per object.
@@ -415,7 +413,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # form a set of possible sub category values. To get a list of all values from all dataset, take the union
    intersect_init_sub_cat_values = {}
    for sub_cat_key in intersect_sub_cat_per_key:
-        val: Set[ObjectTypes] = set()
+        val: set[ObjectTypes] = set()
        for cat in categories:
            val.update(cat.init_sub_categories[key][sub_cat_key])
        intersect_init_sub_cat_values[sub_cat_key] = list(val)
@@ -425,7 +423,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # construction is not deterministic but guarantees for unique values in all sub categories. Now we build the
    # ensemble dict of sub categories where we guarantee unique values on one hand side and always maintain the
    # same arrangements for all category/ sub category lists
-    init_sub_cat: Dict[ObjectTypes, Any] = {}
+    init_sub_cat: dict[ObjectTypes, Any] = {}
    for category in categories:
        for cat in intersect_sub_cat_keys:
            for sub_cat_key in category.init_sub_categories[cat]:
deepdoctection/datasets/instances/doclaynet.py

@@ -25,19 +25,20 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations
 
 import os
 from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MapDataComponent, SerializerCoco
-from ...datapoint.annotation import CategoryAnnotation, SummaryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import add_summary, cat_to_sub_cat, filter_cat, filter_summary
 from ...mapper.cocostruct import coco_to_image
 from ...mapper.maputils import curry
-from ...utils.detection_types import JsonDict
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, SummaryType, TypeOrStr
+from ...utils.types import CocoDatapointDict
 from ..base import DatasetBase
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo
@@ -63,36 +64,36 @@ _DESCRIPTION = (
 _LICENSE = "CDLA-Permissive"
 _URL = "https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip"
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "DocLayNet_core"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "COCO/train.json", "val": "COCO/val.json", "test": "COCO/test.json"}
 _INIT_CATEGORIES = [
-    LayoutType.caption,
-    LayoutType.footnote,
-    LayoutType.formula,
-    LayoutType.list,
-    LayoutType.page_footer,
-    LayoutType.page_header,
-    LayoutType.figure,
-    LayoutType.section_header,
-    LayoutType.table,
-    LayoutType.text,
-    LayoutType.title,
+    LayoutType.CAPTION,
+    LayoutType.FOOTNOTE,
+    LayoutType.FORMULA,
+    LayoutType.LIST,
+    LayoutType.PAGE_FOOTER,
+    LayoutType.PAGE_HEADER,
+    LayoutType.FIGURE,
+    LayoutType.SECTION_HEADER,
+    LayoutType.TABLE,
+    LayoutType.TEXT,
+    LayoutType.TITLE,
 ]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]] = {
-    LayoutType.caption: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.footnote: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.formula: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.list: {DatasetType.publaynet: [LayoutType.list]},
-    LayoutType.page_footer: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.page_header: {DatasetType.publaynet: [LayoutType.title]},
-    LayoutType.figure: {DatasetType.publaynet: [LayoutType.figure]},
-    LayoutType.section_header: {DatasetType.publaynet: [LayoutType.title]},
-    LayoutType.table: {DatasetType.publaynet: [LayoutType.table]},
-    LayoutType.text: {DatasetType.publaynet: [LayoutType.text]},
-    LayoutType.title: {DatasetType.publaynet: [LayoutType.title]},
+    LayoutType.CAPTION: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FOOTNOTE: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FORMULA: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.LIST: {DatasetType.PUBLAYNET: [LayoutType.LIST]},
+    LayoutType.PAGE_FOOTER: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.PAGE_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.FIGURE: {DatasetType.PUBLAYNET: [LayoutType.FIGURE]},
+    LayoutType.SECTION_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.TABLE: {DatasetType.PUBLAYNET: [LayoutType.TABLE]},
+    LayoutType.TEXT: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.TITLE: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
 }
 
 
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-    def _builder(self) -> "DocLayNetBuilder":
+    def _builder(self) -> DocLayNetBuilder:
        return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -161,7 +162,7 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
                    filter_empty_image=True,
                    fake_score=fake_score,
                    coarse_mapping={1: 10, 2: 10, 3: 10, 4: 4, 5: 10, 6: 11, 7: 7, 8: 11, 9: 9, 10: 10, 11: 11},
-                    coarse_sub_cat_name=DatasetType.publaynet,
+                    coarse_sub_cat_name=DatasetType.PUBLAYNET,
                ),
            )
 
@@ -185,14 +186,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
 
 
 _NAME_SEQ = "doclaynet-seq"
-_TYPE_SEQ = DatasetType.sequence_classification
+_TYPE_SEQ = DatasetType.SEQUENCE_CLASSIFICATION
 _INIT_CATEGORIES_SEQ = [
-    DocumentType.financial_report,
-    DocumentType.scientific_publication,
-    DocumentType.laws_and_regulations,
-    DocumentType.government_tenders,
-    DocumentType.manuals,
-    DocumentType.patents,
+    DocumentType.FINANCIAL_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.LAWS_AND_REGULATIONS,
+    DocumentType.GOVERNMENT_TENDERS,
+    DocumentType.MANUALS,
+    DocumentType.PATENTS,
 ]
 
 
@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)
 
-    def _builder(self) -> "DocLayNetSeqBuilder":
+    def _builder(self) -> DocLayNetSeqBuilder:
        return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -244,22 +245,22 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        df = MapDataComponent(df, lambda dp: self.get_workdir() / "PNG" / dp, "file_name")
 
        @curry
-        def _map_to_image(dp: JsonDict, load_img: bool) -> Image:
+        def _map_to_image(dp: CocoDatapointDict, load_img: bool) -> Image:
            image = Image(location=dp["file_name"], file_name=os.path.split(dp["file_name"])[1])
            image.image = load_image_from_file(image.location)
-            summary = SummaryAnnotation()
+            summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
            label_to_category_name = {
-                "financial_reports": DocumentType.financial_report,
-                "scientific_articles": DocumentType.scientific_publication,
-                "laws_and_regulations": DocumentType.laws_and_regulations,
-                "government_tenders": DocumentType.government_tenders,
-                "manuals": DocumentType.manuals,
-                "patents": DocumentType.patents,
+                "financial_reports": DocumentType.FINANCIAL_REPORT,
+                "scientific_articles": DocumentType.SCIENTIFIC_PUBLICATION,
+                "laws_and_regulations": DocumentType.LAWS_AND_REGULATIONS,
+                "government_tenders": DocumentType.GOVERNMENT_TENDERS,
+                "manuals": DocumentType.MANUALS,
+                "patents": DocumentType.PATENTS,
            }
            categories_dict = self.categories.get_categories(init=True, name_as_key=True)
            category_name = label_to_category_name[dp["doc_category"]]
            summary.dump_sub_category(
-                PageType.document_type,
+                PageType.DOCUMENT_TYPE,
                CategoryAnnotation(category_name=category_name, category_id=categories_dict[category_name]),
            )
            image.summary = summary
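The removed SummaryAnnotation class is thus replaced by an ordinary CategoryAnnotation carrying the SummaryType.SUMMARY name. A condensed sketch of the new summary construction using the imports shown above; the category id here is illustrative, whereas the builder takes it from categories_dict:

from deepdoctection.datapoint.annotation import CategoryAnnotation
from deepdoctection.utils.settings import DocumentType, PageType, SummaryType

summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
summary.dump_sub_category(
    PageType.DOCUMENT_TYPE,
    CategoryAnnotation(category_name=DocumentType.PATENTS, category_id=6),  # illustrative id
)
# image.summary = summary  # attach to the Image datapoint, as in _map_to_image above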
@@ -273,15 +274,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        if self.categories.is_filtered():
            df = MapData(
                df,
-                filter_summary({PageType.document_type: self.categories.get_categories(as_dict=False, filtered=True)}),
+                filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
            )
 
        @curry
-        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, str]) -> Image:
-            if dp.summary:
-                if PageType.document_type in dp.summary.sub_categories:
-                    summary_cat = dp.summary.get_sub_category(PageType.document_type)
-                    summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+            if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+                summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+                summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
            return dp
 
        df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
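One pattern running through the whole diff (adapter.py, info.py, doclaynet.py and the other instance modules) is the renaming of all ObjectTypes members from lowercase to uppercase. A hedged migration sketch; whether string lookups keep resolving to the same members depends on the enum values in utils/settings.py, which this diff does not show:

from deepdoctection.utils.settings import LayoutType, get_type

layout = LayoutType.TABLE        # 0.31 code would have written LayoutType.table
maybe_same = get_type("table")   # assumption: the registered string value is unchanged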