deepdoctection 0.32-py3-none-any.whl → 0.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0

deepdoctection/datasets/instances/fintabnet.py

@@ -33,7 +33,7 @@ Module for Fintabnet dataset. Place the dataset as follows
 from __future__ import annotations
 
 from pathlib import Path
-from typing import List, Mapping, Sequence, Union
+from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MultiProcessMapData
 from ...dataflow.common import FlattenData
@@ -43,10 +43,10 @@ from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.maputils import curry
 from ...mapper.misc import image_ann_to_image, maybe_ann_to_sub_image
 from ...mapper.pubstruct import pub_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.file_utils import set_mp_spawn
 from ...utils.logger import LoggingRecord, logger
 from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType
+from ...utils.types import PubtabnetDict
 from ...utils.utils import to_bool
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
@@ -83,38 +83,38 @@ _URL = (
     "fintabnet.tar.gz?_ga=2.17492593.994196051.1634564576-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "fintabnet"
 _ANNOTATION_FILES: Mapping[str, str] = {
     "train": "FinTabNet_1.0.0_table_train.jsonl",
     "test": "FinTabNet_1.0.0_table_test.jsonl",
     "val": "FinTabNet_1.0.0_table_val.jsonl",
 }
-_INIT_CATEGORIES = [LayoutType.table, LayoutType.cell, TableType.item]
+_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.CELL, TableType.ITEM]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    LayoutType.cell: {
-        CellType.header: [CellType.header, CellType.body],
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    LayoutType.CELL: {
+        CellType.HEADER: [CellType.HEADER, CellType.BODY],
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
-    TableType.item: {TableType.item: [LayoutType.row, LayoutType.column]},
-    CellType.header: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
+    CellType.HEADER: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
-    CellType.body: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    CellType.BODY: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
 }
 
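Aside: the bulk of this diff is mechanical. Between 0.32 and 0.34 the settings enums (`DatasetType`, `LayoutType`, `CellType`, `TableType`, …) had their members renamed from lowercase to uppercase. A minimal migration sketch for downstream code; the uppercase names come straight from this diff, while the claim that the underlying string values stay lowercase is an assumption the diff does not show:

from deepdoctection.utils.settings import DatasetType, LayoutType

# 0.32 spelling: DatasetType.object_detection, LayoutType.table
# 0.34 spelling: uppercase member names
_TYPE = DatasetType.OBJECT_DETECTION
assert LayoutType.TABLE.value == "table"  # assumption: only member names changed, not values
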
@@ -201,18 +201,17 @@ class FintabnetBuilder(DataFlowBaseBuilder):
 
         # Map
         @curry
-        def _map_filename(dp: JsonDict, workdir: Path) -> JsonDict:
+        def _map_filename(dp: PubtabnetDict, workdir: Path) -> PubtabnetDict:
             dp["filename"] = workdir / "pdf" / dp["filename"]
             return dp
 
-        map_filename = _map_filename(self.get_workdir())  # pylint: disable=E1120 # 259
-        df = MapData(df, map_filename)
+        df = MapData(df, _map_filename(self.get_workdir()))
 
         buffer_size = 200 if max_datapoints is None else min(max_datapoints, 200) - 1
 
         pub_mapper = pub_to_image(
-            self.categories.get_categories(name_as_key=True, init=True),
-            load_image,
+            categories_name_as_key=self.categories.get_categories(name_as_key=True, init=True),
+            load_image=load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
             dd_pipe_like=False,
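
The `@curry` mappers above are first bound to their configuration and only then handed to `MapData`, which applies them per datapoint. A self-contained sketch of that pattern; the real decorator lives in `deepdoctection.mapper.maputils` and is not part of this diff, so this is an assumed, simplified equivalent:

from functools import wraps
from typing import Any, Callable

def curry(func: Callable[..., Any]) -> Callable[..., Callable[[Any], Any]]:
    # Calling the decorated mapper with everything except the datapoint
    # returns a one-argument function suitable for MapData.
    @wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Callable[[Any], Any]:
        def mapper(dp: Any) -> Any:
            return func(dp, *args, **kwargs)
        return mapper
    return wrapper

@curry
def _map_filename(dp: dict, workdir: str) -> dict:
    dp["filename"] = f"{workdir}/pdf/{dp['filename']}"
    return dp

map_fn = _map_filename("/data/fintabnet")  # bind the configuration
print(map_fn({"filename": "doc.pdf"}))     # {'filename': '/data/fintabnet/pdf/doc.pdf'}
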
@@ -233,39 +232,39 @@ class FintabnetBuilder(DataFlowBaseBuilder):
         if build_mode == "table":
 
             @curry
-            def _crop_and_add_image(dp: Image, category_names: List[str]) -> Image:
+            def _crop_and_add_image(dp: Image, category_names: list[str]) -> Image:
                 return image_ann_to_image(dp, category_names=category_names)
 
             df = MapData(
                 df,
                 _crop_and_add_image(  # pylint: disable=E1120
                     category_names=[
-                        LayoutType.table,
-                        LayoutType.cell,
-                        CellType.header,
-                        CellType.body,
-                        TableType.item,
-                        LayoutType.row,
-                        LayoutType.column,
+                        LayoutType.TABLE,
+                        LayoutType.CELL,
+                        CellType.HEADER,
+                        CellType.BODY,
+                        TableType.ITEM,
+                        LayoutType.ROW,
+                        LayoutType.COLUMN,
                     ]
                 ),
             )
             df = MapData(
                 df,
                 maybe_ann_to_sub_image(  # pylint: disable=E1120 # 259
-                    category_names_sub_image=LayoutType.table,
+                    category_names_sub_image=LayoutType.TABLE,
                     category_names=[
-                        LayoutType.cell,
-                        CellType.header,
-                        CellType.body,
-                        TableType.item,
-                        LayoutType.row,
-                        LayoutType.column,
+                        LayoutType.CELL,
+                        CellType.HEADER,
+                        CellType.BODY,
+                        TableType.ITEM,
+                        LayoutType.ROW,
+                        LayoutType.COLUMN,
                     ],
                     add_summary=True,
                 ),
             )
-            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.table)])
+            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
             df = FlattenData(df)
             df = MapData(df, lambda dp: dp[0])
 
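The last `MapData` call also reflects an API rename: `Image.get_annotation_iter` is gone in 0.34 and `Image.get_annotation` takes its place. A usage sketch inferred only from the call above; anything beyond the `category_names` filter is an assumption:

from deepdoctection.datapoint.image import Image
from deepdoctection.utils.settings import LayoutType

def table_sub_images(dp: Image) -> list:
    # 0.32: dp.get_annotation_iter(category_names=LayoutType.table)
    return [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)]
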

deepdoctection/datasets/instances/funsd.py

@@ -41,16 +41,16 @@ from ...dataflow import DataFlow, MapData, SerializerFiles
 from ...datasets.info import DatasetInfo
 from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.xfundstruct import xfund_to_image
-from ...utils.detection_types import JsonDict, Pathlike
 from ...utils.fs import load_json
 from ...utils.settings import BioTag, DatasetType, LayoutType, ObjectTypes, TokenClasses, TokenClassWithTag, WordType
+from ...utils.types import FunsdDict, PathLikeOrStr
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
 from ..registry import dataset_registry
 
 
-def load_file(path_ann: Pathlike) -> JsonDict:
+def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
     """
     Loading json file
 
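The import churn here and in the other modules stems from `deepdoctection/utils/detection_types.py` being deleted (file 108 in the list above) in favour of the new `deepdoctection/utils/types.py` (file 102), which renames `Pathlike` to `PathLikeOrStr` and introduces dataset-specific dict aliases such as `FunsdDict` and `PubtabnetDict`. A migration sketch for downstream imports, assuming the aliases are otherwise drop-in:

# 0.32 (module removed in 0.34):
# from deepdoctection.utils.detection_types import JsonDict, Pathlike
# 0.34:
from deepdoctection.utils.types import JsonDict, PathLikeOrStr
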
@@ -80,28 +80,28 @@ _LICENSE = (
 
 _URL = "https://guillaumejaume.github.io/FUNSD/download/"
 _SPLITS: Mapping[str, str] = {"train": "training_data", "test": "testing_data"}
-_TYPE = DatasetType.token_classification
+_TYPE = DatasetType.TOKEN_CLASSIFICATION
 _LOCATION = "funsd"
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "annotations", "test": "annotations"}
 
-_INIT_CATEGORIES = [LayoutType.word, LayoutType.text]
+_INIT_CATEGORIES = [LayoutType.WORD, LayoutType.TEXT]
 _SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    LayoutType.word: {
-        WordType.token_class: [TokenClasses.other, TokenClasses.question, TokenClasses.answer, TokenClasses.header],
-        WordType.tag: [BioTag.inside, BioTag.outside, BioTag.begin],
-        WordType.token_tag: [
-            TokenClassWithTag.b_answer,
-            TokenClassWithTag.b_header,
-            TokenClassWithTag.b_question,
-            TokenClassWithTag.i_answer,
-            TokenClassWithTag.i_header,
-            TokenClassWithTag.i_question,
-            BioTag.outside,
+    LayoutType.WORD: {
+        WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER],
+        WordType.TAG: [BioTag.INSIDE, BioTag.OUTSIDE, BioTag.BEGIN],
+        WordType.TOKEN_TAG: [
+            TokenClassWithTag.B_ANSWER,
+            TokenClassWithTag.B_HEADER,
+            TokenClassWithTag.B_QUESTION,
+            TokenClassWithTag.I_ANSWER,
+            TokenClassWithTag.I_HEADER,
+            TokenClassWithTag.I_QUESTION,
+            BioTag.OUTSIDE,
         ],
     },
-    LayoutType.text: {
-        WordType.token_class: [TokenClasses.other, TokenClasses.question, TokenClasses.answer, TokenClasses.header]
+    LayoutType.TEXT: {
+        WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER]
     },
 }
 
@@ -159,14 +159,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
         # Map
         categories_name_as_key = self.categories.get_categories(init=True, name_as_key=True)
         category_names_mapping = {
-            "other": TokenClasses.other,
-            "question": TokenClasses.question,
-            "answer": TokenClasses.answer,
-            "header": TokenClasses.header,
+            "other": TokenClasses.OTHER,
+            "question": TokenClasses.QUESTION,
+            "answer": TokenClasses.ANSWER,
+            "header": TokenClasses.HEADER,
         }
         ner_token_to_id_mapping = self.categories.get_sub_categories(
-            categories=LayoutType.word,
-            sub_categories={LayoutType.word: [WordType.token_tag, WordType.tag, WordType.token_class]},
+            categories=LayoutType.WORD,
+            sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
             keys=False,
             values_as_dict=True,
             name_as_key=True,

deepdoctection/datasets/instances/iiitar13k.py

@@ -47,10 +47,10 @@ from ...datasets.info import DatasetInfo
 from ...mapper.maputils import curry
 from ...mapper.misc import xml_to_dict
 from ...mapper.pascalstruct import pascal_voc_dict_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.file_utils import lxml_available
 from ...utils.fs import get_package_path
 from ...utils.settings import DatasetType, LayoutType
+from ...utils.types import JsonDict
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
@@ -76,7 +76,7 @@ _LICENSE = "NN"
 _URL = "http://cvit.iiit.ac.in/usodi/iiitar13k.php"
 
 _SPLITS: Mapping[str, str] = {"train": "training_images", "val": "validation_images", "test": "test_images"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "iiitar13k"
 _ANNOTATION_FILES: Mapping[str, str] = {
     "train": "training_xml",
@@ -84,7 +84,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
     "test": "test_xml",
 }
 
-_INIT_CATEGORIES = [LayoutType.table, LayoutType.logo, LayoutType.figure, LayoutType.signature]
+_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.LOGO, LayoutType.FIGURE, LayoutType.SIGNATURE]
 
 
 @dataset_registry.register("iiitar13k")
@@ -176,11 +176,11 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
                 filter_empty_image=True,
                 fake_score=fake_score,
                 category_name_mapping={
-                    "natural_image": LayoutType.figure,
-                    "figure": LayoutType.figure,
-                    "logo": LayoutType.logo,
-                    "signature": LayoutType.signature,
-                    "table": LayoutType.table,
+                    "natural_image": LayoutType.FIGURE,
+                    "figure": LayoutType.FIGURE,
+                    "logo": LayoutType.LOGO,
+                    "signature": LayoutType.SIGNATURE,
+                    "table": LayoutType.FIGURE,
                 },
             ),
         )

deepdoctection/datasets/instances/layouttest.py

@@ -52,7 +52,7 @@ _LICENSE = (
 )
 
 _SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "testlayout"
 
 _ANNOTATION_FILES: Mapping[str, str] = {
@@ -60,7 +60,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
     "predict": "xrf_layout_test_predict.jsonl",
 }
 
-_INIT_CATEGORIES = [LayoutType.text, LayoutType.title, LayoutType.list, LayoutType.table, LayoutType.figure]
+_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
 
 
 @dataset_registry.register("testlayout")

deepdoctection/datasets/instances/publaynet.py

@@ -61,12 +61,12 @@ _URL = (
     "publaynet.tar.gz?_ga=2.23017467.1796315263.1628754613-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "publaynet"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "train.json", "val": "val.json"}
-_INIT_CATEGORIES = [LayoutType.text, LayoutType.title, LayoutType.list, LayoutType.table, LayoutType.figure]
+_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
 
 
 @dataset_registry.register("publaynet")
@@ -120,7 +120,7 @@ class PublaynetBuilder(DataFlowBaseBuilder):
         df = SerializerCoco.load(path, max_datapoints=max_datapoints)
 
         # Map
-        df = MapDataComponent(df, lambda dp: self.get_workdir() / self.get_split(split) / dp, "file_name")
+        df = MapDataComponent(df, lambda dp: (self.get_workdir() / self.get_split(split) / dp).as_posix(), "file_name")
         coco_mapper = coco_to_image(  # pylint: disable=E1120 # 259
             self.categories.get_categories(init=True),
             load_image,
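
The added `.as_posix()` turns the assembled `pathlib.Path` back into a plain string before it is stored in the `file_name` component, presumably so downstream consumers keep receiving a `str`. The effect in isolation:

from pathlib import Path

file_name = (Path("publaynet") / "train" / "PMC123.png").as_posix()
print(type(file_name).__name__, file_name)  # str publaynet/train/PMC123.png
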

deepdoctection/datasets/instances/pubtables1m.py

@@ -50,10 +50,10 @@ from ...mapper.cats import filter_cat
 from ...mapper.maputils import curry
 from ...mapper.misc import xml_to_dict
 from ...mapper.pascalstruct import pascal_voc_dict_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.file_utils import lxml_available
 from ...utils.fs import get_package_path
 from ...utils.settings import CellType, DatasetType, LayoutType
+from ...utils.types import JsonDict
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
@@ -80,14 +80,14 @@ _LICENSE = "Community Data License Agreement – Permissive, Version 1.0"
 _URL = "https://msropendata.com/datasets/505fcbe3-1383-42b1-913a-f651b8b712d3"
 
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "PubTables1M"
 _ANNOTATION_FILES: Mapping[str, str] = {
     "train": "PubTables1M-Detection-PASCAL-VOC/train",
     "val": "PubTables1M-Detection-PASCAL-VOC/val",
     "test": "PubTables1M-Detection-PASCAL-VOC/test",
 }
-_INIT_CATEGORIES_DET = [LayoutType.table, LayoutType.table_rotated]
+_INIT_CATEGORIES_DET = [LayoutType.TABLE, LayoutType.TABLE_ROTATED]
 
 
 @dataset_registry.register("pubtables1m_det")
@@ -180,7 +180,7 @@ class Pubtables1MBuilder(DataFlowBaseBuilder):
                 load_image,
                 filter_empty_image=True,
                 fake_score=fake_score,
-                category_name_mapping={"table": LayoutType.table, "table rotated": LayoutType.table_rotated},
+                category_name_mapping={"table": LayoutType.TABLE, "table rotated": LayoutType.TABLE_ROTATED},
             ),
         )
 
@@ -195,13 +195,13 @@ _ANNOTATION_FILES_STRUCT: Mapping[str, str] = {
 }
 
 _INIT_CATEGORIES_STRUCT = [
-    LayoutType.table,
-    LayoutType.row,
-    LayoutType.column,
-    CellType.spanning,
-    CellType.row_header,
-    CellType.column_header,
-    CellType.projected_row_header,
+    LayoutType.TABLE,
+    LayoutType.ROW,
+    LayoutType.COLUMN,
+    CellType.SPANNING,
+    CellType.ROW_HEADER,
+    CellType.COLUMN_HEADER,
+    CellType.PROJECTED_ROW_HEADER,
 ]
 
 _IMAGES: Mapping[str, str] = {
@@ -302,13 +302,13 @@ class Pubtables1MBuilderStruct(DataFlowBaseBuilder):
                 filter_empty_image=True,
                 fake_score=fake_score,
                 category_name_mapping={
-                    "table": LayoutType.table,
-                    "table spanning cell": CellType.spanning,
-                    "table row": LayoutType.row,
-                    "table row header": CellType.row_header,
-                    "table projected row header": CellType.projected_row_header,
-                    "table column": LayoutType.column,
-                    "table column header": CellType.column_header,
+                    "table": LayoutType.TABLE,
+                    "table spanning cell": CellType.SPANNING,
+                    "table row": LayoutType.ROW,
+                    "table row header": CellType.ROW_HEADER,
+                    "table projected row header": CellType.PROJECTED_ROW_HEADER,
+                    "table column": LayoutType.COLUMN,
+                    "table column header": CellType.COLUMN_HEADER,
                 },
             ),
         )

deepdoctection/datasets/instances/pubtabnet.py

@@ -29,16 +29,16 @@ Module for Pubtabnet dataset. Place the dataset as follows
 """
 from __future__ import annotations
 
-from typing import Dict, List, Mapping, Union
+from typing import Mapping, Union
 
 from ...dataflow import DataFlow, MapData
 from ...dataflow.custom_serialize import SerializerJsonlines
 from ...datasets.info import DatasetInfo
 from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.pubstruct import pub_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.logger import LoggingRecord, logger
 from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType, WordType
+from ...utils.types import PubtabnetDict
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
@@ -70,38 +70,38 @@ _URL = (
     "pubtabnet.tar.gz?_ga=2.267291150.146828643.1629125962-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "pubtabnet"
 _ANNOTATION_FILES: Mapping[str, str] = {"all": "PubTabNet_2.0.0.jsonl"}
 
-_INIT_CATEGORIES = [LayoutType.cell, TableType.item, LayoutType.table, LayoutType.word]
-_SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
+_INIT_CATEGORIES = [LayoutType.CELL, TableType.ITEM, LayoutType.TABLE, LayoutType.WORD]
+_SUB_CATEGORIES: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    TableType.item: {TableType.item: [LayoutType.row, LayoutType.column]},
-    LayoutType.cell: {
-        CellType.header: [CellType.header, CellType.body],
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
+    LayoutType.CELL: {
+        CellType.HEADER: [CellType.HEADER, CellType.BODY],
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    CellType.header: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    CellType.HEADER: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    CellType.body: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    CellType.BODY: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    LayoutType.table: {TableType.html: [TableType.html]},
-    LayoutType.word: {WordType.characters: [WordType.characters]},
+    LayoutType.TABLE: {TableType.HTML: [TableType.HTML]},
+    LayoutType.WORD: {WordType.CHARACTERS: [WordType.CHARACTERS]},
 }
 
 
@@ -170,7 +170,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         df = SerializerJsonlines.load(path, max_datapoints=max_datapoints)
 
         # Map
-        def replace_filename(dp: JsonDict) -> JsonDict:
+        def replace_filename(dp: PubtabnetDict) -> PubtabnetDict:
             dp["filename"] = self.get_workdir() / dp["split"] / dp["filename"]
             return dp
 
@@ -178,7 +178,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         df = MapData(df, lambda dp: dp if dp["split"] == split else None)
         pub_mapper = pub_to_image(
             self.categories.get_categories(name_as_key=True, init=True),
-            load_image,
+            load_image=load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
             dd_pipe_like=dd_pipe_like,
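
The split filter above (`dp if dp["split"] == split else None`) relies on the usual dataflow convention that `MapData` drops datapoints for which the mapper returns `None`. A toy stand-in illustrating that convention; this is not the real `MapData` implementation:

def map_data(stream, fn):
    # Yield fn(dp) for each datapoint, silently skipping None results.
    for dp in stream:
        out = fn(dp)
        if out is not None:
            yield out

rows = [{"split": "train"}, {"split": "val"}]
print(list(map_data(rows, lambda dp: dp if dp["split"] == "val" else None)))
# [{'split': 'val'}]
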
@@ -187,6 +187,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         )
 
         df = MapData(df, pub_mapper)
+
         if self.categories.is_cat_to_sub_cat():
             df = MapData(
                 df,

deepdoctection/datasets/instances/rvlcdip.py

@@ -36,12 +36,12 @@ from typing import Mapping, Union
 
 from ...dataflow import DataFlow, MapData
 from ...dataflow.custom_serialize import SerializerTabsepFiles
-from ...datapoint.annotation import CategoryAnnotation, SummaryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import filter_summary
 from ...mapper.maputils import curry
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, PageType, SummaryType, TypeOrStr
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo
@@ -64,27 +64,27 @@ _LICENSE = (
 _URL = "https://www.cs.cmu.edu/~aharley/rvl-cdip/"
 
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.sequence_classification
+_TYPE = DatasetType.SEQUENCE_CLASSIFICATION
 _LOCATION = "rvl-cdip"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "labels/train.txt", "val": "labels/val.txt", "test": "labels/test.txt"}
 _INIT_CATEGORIES = [
-    DocumentType.letter,
-    DocumentType.form,
-    DocumentType.email,
-    DocumentType.handwritten,
-    DocumentType.advertisement,
-    DocumentType.scientific_report,
-    DocumentType.scientific_publication,
-    DocumentType.specification,
-    DocumentType.file_folder,
-    DocumentType.news_article,
-    DocumentType.budget,
-    DocumentType.invoice,
-    DocumentType.presentation,
-    DocumentType.questionnaire,
-    DocumentType.resume,
-    DocumentType.memo,
+    DocumentType.LETTER,
+    DocumentType.FORM,
+    DocumentType.EMAIL,
+    DocumentType.HANDWRITTEN,
+    DocumentType.ADVERTISEMENT,
+    DocumentType.SCIENTIFIC_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.SPECIFICATION,
+    DocumentType.FILE_FOLDER,
+    DocumentType.NEWS_ARTICLE,
+    DocumentType.BUDGET,
+    DocumentType.INVOICE,
+    DocumentType.PRESENTATION,
+    DocumentType.QUESTIONNAIRE,
+    DocumentType.RESUME,
+    DocumentType.MEMO,
 ]
 
@@ -139,15 +139,15 @@ class RvlcdipBuilder(DataFlowBaseBuilder):
 
         @curry
        def _map_str_to_image(dp: str, load_img: bool) -> Image:
-            location, label = dp.split()[0], dp.split()[1]
-            label = str(int(label) + 1)
+            location, label_str = dp.split()[0], dp.split()[1]
+            label = int(label_str) + 1
             file_name = os.path.split(location)[1]
             image = Image(location=(self.get_workdir() / "images" / location).as_posix(), file_name=file_name)
             image.image = load_image_from_file(image.location)
-            summary = SummaryAnnotation()
+            summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
             categories_dict = self.categories.get_categories(init=True)
             summary.dump_sub_category(
-                PageType.document_type, CategoryAnnotation(category_name=categories_dict[label], category_id=str(label))
+                PageType.DOCUMENT_TYPE, CategoryAnnotation(category_name=categories_dict[label], category_id=label)
             )
             image.summary = summary
             if not load_img:
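
Two API changes meet in this hunk: `SummaryAnnotation` is retired, so a summary is now an ordinary `CategoryAnnotation` named `SummaryType.SUMMARY`, and `category_id` becomes an `int` instead of a `str`. A construction sketch using only the calls visible above:

from deepdoctection.datapoint.annotation import CategoryAnnotation
from deepdoctection.utils.settings import DocumentType, PageType, SummaryType

summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)  # was: SummaryAnnotation()
summary.dump_sub_category(
    PageType.DOCUMENT_TYPE,
    CategoryAnnotation(category_name=DocumentType.LETTER, category_id=1),  # int id in 0.34
)
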
@@ -159,15 +159,14 @@ class RvlcdipBuilder(DataFlowBaseBuilder):
         if self.categories.is_filtered():
             df = MapData(
                 df,
-                filter_summary({PageType.document_type: self.categories.get_categories(as_dict=False, filtered=True)}),
+                filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
             )
 
         @curry
-        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, str]) -> Image:
-            if dp.summary:
-                if PageType.document_type in dp.summary.sub_categories:
-                    summary_cat = dp.summary.get_sub_category(PageType.document_type)
-                    summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+            if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+                summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+                summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
             return dp
 
         df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
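
The final hunk makes the same `int` switch for the remapped ids (`Mapping[TypeOrStr, int]`) and drops the `if dp.summary:` guard, implying `Image.summary` is always present in 0.34. A toy illustration of the remap after filtering, assuming ids are the 1-based positions within the remaining categories:

# Under the full 16-class scheme above, "invoice" is category 12.
filtered_name_as_key = {"letter": 1, "invoice": 2}  # hypothetical filtered result
category_name, category_id = "invoice", 12
category_id = filtered_name_as_key[category_name]   # -> 2 under the filtered scheme
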