deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.
Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/instances/fintabnet.py

@@ -30,9 +30,10 @@ Module for Fintabnet dataset. Place the dataset as follows
 ├── FinTabNet_1.0.0_table_train.jsonl
 ├── FinTabNet_1.0.0_table_val.jsonl
 """
+from __future__ import annotations
 
 from pathlib import Path
-from typing import List, Mapping, Sequence, Union
+from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MultiProcessMapData
 from ...dataflow.common import FlattenData
@@ -42,10 +43,10 @@ from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.maputils import curry
 from ...mapper.misc import image_ann_to_image, maybe_ann_to_sub_image
 from ...mapper.pubstruct import pub_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.file_utils import set_mp_spawn
 from ...utils.logger import LoggingRecord, logger
 from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType
+from ...utils.types import PubtabnetDict
 from ...utils.utils import to_bool
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
@@ -82,38 +83,38 @@ _URL = (
     "fintabnet.tar.gz?_ga=2.17492593.994196051.1634564576-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "fintabnet"
 _ANNOTATION_FILES: Mapping[str, str] = {
     "train": "FinTabNet_1.0.0_table_train.jsonl",
     "test": "FinTabNet_1.0.0_table_test.jsonl",
     "val": "FinTabNet_1.0.0_table_val.jsonl",
 }
-_INIT_CATEGORIES = [LayoutType.table, LayoutType.cell, TableType.item]
+_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.CELL, TableType.ITEM]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    LayoutType.cell: {
-        CellType.header: [CellType.header, CellType.body],
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    LayoutType.CELL: {
+        CellType.HEADER: [CellType.HEADER, CellType.BODY],
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
-    TableType.item: {TableType.item: [LayoutType.row, LayoutType.column]},
-    CellType.header: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
+    CellType.HEADER: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
-    CellType.body: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning, LayoutType.cell],
+    CellType.BODY: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
     },
 }
 
@@ -133,7 +134,7 @@ class Fintabnet(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-    def _builder(self) -> "FintabnetBuilder":
+    def _builder(self) -> FintabnetBuilder:
         return FintabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -200,18 +201,17 @@ class FintabnetBuilder(DataFlowBaseBuilder):
 
         # Map
        @curry
-        def _map_filename(dp: JsonDict, workdir: Path) -> JsonDict:
+        def _map_filename(dp: PubtabnetDict, workdir: Path) -> PubtabnetDict:
             dp["filename"] = workdir / "pdf" / dp["filename"]
             return dp
 
-        map_filename = _map_filename(self.get_workdir())  # pylint: disable=E1120 # 259
-        df = MapData(df, map_filename)
+        df = MapData(df, _map_filename(self.get_workdir()))
 
         buffer_size = 200 if max_datapoints is None else min(max_datapoints, 200) - 1
 
         pub_mapper = pub_to_image(
-            self.categories.get_categories(name_as_key=True, init=True),
-            load_image,
+            categories_name_as_key=self.categories.get_categories(name_as_key=True, init=True),
+            load_image=load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
             dd_pipe_like=False,
@@ -232,39 +232,39 @@
         if build_mode == "table":
 
             @curry
-            def _crop_and_add_image(dp: Image, category_names: List[str]) -> Image:
+            def _crop_and_add_image(dp: Image, category_names: list[str]) -> Image:
                 return image_ann_to_image(dp, category_names=category_names)
 
             df = MapData(
                 df,
                 _crop_and_add_image(  # pylint: disable=E1120
                     category_names=[
-                        LayoutType.table,
-                        LayoutType.cell,
-                        CellType.header,
-                        CellType.body,
-                        TableType.item,
-                        LayoutType.row,
-                        LayoutType.column,
+                        LayoutType.TABLE,
+                        LayoutType.CELL,
+                        CellType.HEADER,
+                        CellType.BODY,
+                        TableType.ITEM,
+                        LayoutType.ROW,
+                        LayoutType.COLUMN,
                     ]
                 ),
             )
             df = MapData(
                 df,
                 maybe_ann_to_sub_image(  # pylint: disable=E1120 # 259
-                    category_names_sub_image=LayoutType.table,
+                    category_names_sub_image=LayoutType.TABLE,
                     category_names=[
-                        LayoutType.cell,
-                        CellType.header,
-                        CellType.body,
-                        TableType.item,
-                        LayoutType.row,
-                        LayoutType.column,
+                        LayoutType.CELL,
+                        CellType.HEADER,
+                        CellType.BODY,
+                        TableType.ITEM,
+                        LayoutType.ROW,
+                        LayoutType.COLUMN,
                     ],
                     add_summary=True,
                 ),
             )
-            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.table)])
+            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.TABLE)])
             df = FlattenData(df)
             df = MapData(df, lambda dp: dp[0])
 
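The `_map_filename` hunk above collapses a two-step partial application into one call: `@curry`-decorated mappers reserve the first parameter for the datapoint, so binding the remaining arguments yields a one-argument callable that `MapData` can drive. A minimal sketch of the pattern, with a hypothetical stand-in for `deepdoctection.mapper.maputils.curry` (the shipped decorator may differ in detail):

from functools import wraps
from typing import Any, Callable, TypeVar

T = TypeVar("T")

def curry(func: Callable[..., T]) -> Callable[..., Callable[[Any], T]]:
    # Hypothetical stand-in: bind everything except the datapoint up front.
    @wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Callable[[Any], T]:
        def mapper(dp: Any) -> T:
            return func(dp, *args, **kwargs)
        return mapper
    return wrapper

@curry
def _map_filename(dp: dict, workdir: str) -> dict:
    dp["filename"] = f"{workdir}/pdf/{dp['filename']}"
    return dp

# As in the hunk above: binding workdir yields the mapper MapData expects.
mapper = _map_filename("/data/fintabnet")
print(mapper({"filename": "doc.pdf"})["filename"])  # /data/fintabnet/pdf/doc.pdf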
deepdoctection/datasets/instances/funsd.py

@@ -32,6 +32,7 @@ Module for Funsd dataset. Install the dataset following the folder structure
 │ ├── images
 │ │ ├── ...
 """
+from __future__ import annotations
 
 import os
 from typing import Dict, List, Mapping, Union
@@ -40,16 +41,16 @@ from ...dataflow import DataFlow, MapData, SerializerFiles
 from ...datasets.info import DatasetInfo
 from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.xfundstruct import xfund_to_image
-from ...utils.detection_types import JsonDict, Pathlike
 from ...utils.fs import load_json
 from ...utils.settings import BioTag, DatasetType, LayoutType, ObjectTypes, TokenClasses, TokenClassWithTag, WordType
+from ...utils.types import FunsdDict, PathLikeOrStr
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
 from ..registry import dataset_registry
 
 
-def load_file(path_ann: Pathlike) -> JsonDict:
+def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
     """
     Loading json file
 
@@ -79,28 +80,28 @@ _LICENSE = (
 
 _URL = "https://guillaumejaume.github.io/FUNSD/download/"
 _SPLITS: Mapping[str, str] = {"train": "training_data", "test": "testing_data"}
-_TYPE = DatasetType.token_classification
+_TYPE = DatasetType.TOKEN_CLASSIFICATION
 _LOCATION = "funsd"
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "annotations", "test": "annotations"}
 
-_INIT_CATEGORIES = [LayoutType.word, LayoutType.text]
+_INIT_CATEGORIES = [LayoutType.WORD, LayoutType.TEXT]
 _SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    LayoutType.word: {
-        WordType.token_class: [TokenClasses.other, TokenClasses.question, TokenClasses.answer, TokenClasses.header],
-        WordType.tag: [BioTag.inside, BioTag.outside, BioTag.begin],
-        WordType.token_tag: [
-            TokenClassWithTag.b_answer,
-            TokenClassWithTag.b_header,
-            TokenClassWithTag.b_question,
-            TokenClassWithTag.i_answer,
-            TokenClassWithTag.i_header,
-            TokenClassWithTag.i_question,
-            BioTag.outside,
+    LayoutType.WORD: {
+        WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER],
+        WordType.TAG: [BioTag.INSIDE, BioTag.OUTSIDE, BioTag.BEGIN],
+        WordType.TOKEN_TAG: [
+            TokenClassWithTag.B_ANSWER,
+            TokenClassWithTag.B_HEADER,
+            TokenClassWithTag.B_QUESTION,
+            TokenClassWithTag.I_ANSWER,
+            TokenClassWithTag.I_HEADER,
+            TokenClassWithTag.I_QUESTION,
+            BioTag.OUTSIDE,
         ],
     },
-    LayoutType.text: {
-        WordType.token_class: [TokenClasses.other, TokenClasses.question, TokenClasses.answer, TokenClasses.header]
+    LayoutType.TEXT: {
+        WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER]
     },
 }
 
@@ -120,7 +121,7 @@ class Funsd(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-    def _builder(self) -> "FunsdBuilder":
+    def _builder(self) -> FunsdBuilder:
         return FunsdBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -158,14 +159,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
         # Map
         categories_name_as_key = self.categories.get_categories(init=True, name_as_key=True)
         category_names_mapping = {
-            "other": TokenClasses.other,
-            "question": TokenClasses.question,
-            "answer": TokenClasses.answer,
-            "header": TokenClasses.header,
+            "other": TokenClasses.OTHER,
+            "question": TokenClasses.QUESTION,
+            "answer": TokenClasses.ANSWER,
+            "header": TokenClasses.HEADER,
        }
         ner_token_to_id_mapping = self.categories.get_sub_categories(
-            categories=LayoutType.word,
-            sub_categories={LayoutType.word: [WordType.token_tag, WordType.tag, WordType.token_class]},
+            categories=LayoutType.WORD,
+            sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
             keys=False,
             values_as_dict=True,
             name_as_key=True,
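As the import hunks show, `deepdoctection.utils.detection_types` is removed in 0.33 (entry 128 in the file list above) and its aliases resurface in the new `deepdoctection.utils.types` (entry 122): `Pathlike` becomes `PathLikeOrStr`, and format-specific dicts such as `FunsdDict` and `PubtabnetDict` replace the catch-all `JsonDict` in signatures. A sketch of the downstream migration, mirroring the updated `load_file` above (the body is illustrative; only the moved annotations matter here):

# 0.31 — module removed in 0.33:
# from deepdoctection.utils.detection_types import JsonDict, Pathlike

# 0.33:
from deepdoctection.utils.fs import load_json
from deepdoctection.utils.types import FunsdDict, PathLikeOrStr

def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
    # Same shape as the diffed signature; load_json is imported above in the hunk.
    return load_json(path_ann)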
deepdoctection/datasets/instances/iiitar13k.py

@@ -35,25 +35,28 @@ Module for IIITar13K dataset. Install the dataset following the folder structure
 │ ├── ...
 
 """
+from __future__ import annotations
 
 import os
 from typing import Mapping, Union
 
+from lazy_imports import try_import
+
 from ...dataflow import DataFlow, MapData, SerializerFiles
 from ...datasets.info import DatasetInfo
 from ...mapper.maputils import curry
 from ...mapper.misc import xml_to_dict
 from ...mapper.pascalstruct import pascal_voc_dict_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.file_utils import lxml_available
 from ...utils.fs import get_package_path
 from ...utils.settings import DatasetType, LayoutType
+from ...utils.types import JsonDict
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
 from ..registry import dataset_registry
 
-if lxml_available():
+with try_import() as import_guard:
     from lxml import etree
 
 _NAME = "iiitar13k"
@@ -73,7 +76,7 @@ _LICENSE = "NN"
 _URL = "http://cvit.iiit.ac.in/usodi/iiitar13k.php"
 
 _SPLITS: Mapping[str, str] = {"train": "training_images", "val": "validation_images", "test": "test_images"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "iiitar13k"
 _ANNOTATION_FILES: Mapping[str, str] = {
     "train": "training_xml",
@@ -81,7 +84,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
     "test": "test_xml",
 }
 
-_INIT_CATEGORIES = [LayoutType.table, LayoutType.logo, LayoutType.figure, LayoutType.signature]
+_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.LOGO, LayoutType.FIGURE, LayoutType.SIGNATURE]
 
 
 @dataset_registry.register("iiitar13k")
@@ -99,7 +102,7 @@ class IIITar13K(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES)
 
-    def _builder(self) -> "IIITar13KBuilder":
+    def _builder(self) -> IIITar13KBuilder:
         return IIITar13KBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -173,11 +176,11 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
                 filter_empty_image=True,
                 fake_score=fake_score,
                 category_name_mapping={
-                    "natural_image": LayoutType.figure,
-                    "figure": LayoutType.figure,
-                    "logo": LayoutType.logo,
-                    "signature": LayoutType.signature,
-                    "table": LayoutType.table,
+                    "natural_image": LayoutType.FIGURE,
+                    "figure": LayoutType.FIGURE,
+                    "logo": LayoutType.LOGO,
+                    "signature": LayoutType.SIGNATURE,
+                    "table": LayoutType.TABLE,
                 },
             ),
         )
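Here and in pubtables1m.py below, the module-level `if lxml_available():` guard gives way to `try_import` from the lazy-imports package: the context manager swallows a failing import at module load and re-raises an informative error only when the optional dependency is actually needed. A minimal sketch of the pattern (the `check()` call on the guard object is the documented lazy-imports idiom; how deepdoctection invokes it internally is assumed, not shown in this diff):

from lazy_imports import try_import

with try_import() as import_guard:
    from lxml import etree  # no ImportError raised here, even if lxml is absent

def parse_annotation(raw: bytes):
    # Surfaces the deferred ImportError now, if lxml never became available.
    import_guard.check()
    return etree.fromstring(raw)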
deepdoctection/datasets/instances/layouttest.py

@@ -24,6 +24,7 @@ Module for Testlayout dataset. Install the dataset following the folder structur
 ├── test
 │ ├── xrf_layout_test.jsonl
 """
+from __future__ import annotations
 
 from typing import Mapping, Union
 
@@ -51,7 +52,7 @@ _LICENSE = (
 )
 
 _SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "testlayout"
 
 _ANNOTATION_FILES: Mapping[str, str] = {
@@ -59,7 +60,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
     "predict": "xrf_layout_test_predict.jsonl",
 }
 
-_INIT_CATEGORIES = [LayoutType.text, LayoutType.title, LayoutType.list, LayoutType.table, LayoutType.figure]
+_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
 
 
 @dataset_registry.register("testlayout")
@@ -77,7 +78,7 @@ class LayoutTest(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES)
 
-    def _builder(self) -> "LayoutTestBuilder":
+    def _builder(self) -> LayoutTestBuilder:
         return LayoutTestBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
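The recurring `-> "LayoutTestBuilder":` → `-> LayoutTestBuilder:` change is what the newly added `from __future__ import annotations` buys: with postponed evaluation of annotations (PEP 563) every annotation is kept as a string at runtime, so forward references need no manual quoting, and `list[str]`-style generics (seen in fintabnet.py above) no longer error in annotations on Pythons predating 3.9. A small self-contained illustration:

from __future__ import annotations

class LayoutTest:
    # LayoutTestBuilder is defined further down, yet no quotes are required.
    def _builder(self) -> LayoutTestBuilder:
        return LayoutTestBuilder(location="testlayout")

class LayoutTestBuilder:
    def __init__(self, location: str) -> None:
        self.location = location

print(LayoutTest()._builder().location)  # testlayout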
deepdoctection/datasets/instances/publaynet.py

@@ -28,7 +28,7 @@ Module for Publaynet dataset. Place the dataset as follows
 ├── train.json
 ├── val.json
 """
-
+from __future__ import annotations
 
 from typing import Mapping, Union
 
@@ -61,12 +61,12 @@ _URL = (
     "publaynet.tar.gz?_ga=2.23017467.1796315263.1628754613-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "publaynet"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "train.json", "val": "val.json"}
-_INIT_CATEGORIES = [LayoutType.text, LayoutType.title, LayoutType.list, LayoutType.table, LayoutType.figure]
+_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
 
 
 @dataset_registry.register("publaynet")
@@ -84,7 +84,7 @@ class Publaynet(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES)
 
-    def _builder(self) -> "PublaynetBuilder":
+    def _builder(self) -> PublaynetBuilder:
         return PublaynetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -120,7 +120,7 @@ class PublaynetBuilder(DataFlowBaseBuilder):
         df = SerializerCoco.load(path, max_datapoints=max_datapoints)
 
         # Map
-        df = MapDataComponent(df, lambda dp: self.get_workdir() / self.get_split(split) / dp, "file_name")
+        df = MapDataComponent(df, lambda dp: (self.get_workdir() / self.get_split(split) / dp).as_posix(), "file_name")
         coco_mapper = coco_to_image(  # pylint: disable=E1120 # 259
             self.categories.get_categories(init=True),
             load_image,
37
37
  ├── PubTables-1M-Structure_Annotations_Test
38
38
  ├── PubTables-1M-Structure_Images_Test
39
39
  """
40
+ from __future__ import annotations
40
41
 
41
42
  import os
42
43
  from typing import Mapping, Union
43
44
 
45
+ from lazy_imports import try_import
46
+
44
47
  from ...dataflow import DataFlow, MapData, SerializerFiles
45
48
  from ...datasets.info import DatasetInfo
46
49
  from ...mapper.cats import filter_cat
47
50
  from ...mapper.maputils import curry
48
51
  from ...mapper.misc import xml_to_dict
49
52
  from ...mapper.pascalstruct import pascal_voc_dict_to_image
50
- from ...utils.detection_types import JsonDict
51
53
  from ...utils.file_utils import lxml_available
52
54
  from ...utils.fs import get_package_path
53
55
  from ...utils.settings import CellType, DatasetType, LayoutType
56
+ from ...utils.types import JsonDict
54
57
  from ..base import _BuiltInDataset
55
58
  from ..dataflow_builder import DataFlowBaseBuilder
56
59
  from ..info import DatasetCategories
57
60
  from ..registry import dataset_registry
58
61
 
59
- if lxml_available():
62
+ with try_import() as import_guard:
60
63
  from lxml import etree
61
64
 
62
65
  _NAME = "pubtables1m_det"
@@ -77,14 +80,14 @@ _LICENSE = "Community Data License Agreement – Permissive, Version 1.0"
77
80
  _URL = "https://msropendata.com/datasets/505fcbe3-1383-42b1-913a-f651b8b712d3"
78
81
 
79
82
  _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
80
- _TYPE = DatasetType.object_detection
83
+ _TYPE = DatasetType.OBJECT_DETECTION
81
84
  _LOCATION = "PubTables1M"
82
85
  _ANNOTATION_FILES: Mapping[str, str] = {
83
86
  "train": "PubTables1M-Detection-PASCAL-VOC/train",
84
87
  "val": "PubTables1M-Detection-PASCAL-VOC/val",
85
88
  "test": "PubTables1M-Detection-PASCAL-VOC/test",
86
89
  }
87
- _INIT_CATEGORIES_DET = [LayoutType.table, LayoutType.table_rotated]
90
+ _INIT_CATEGORIES_DET = [LayoutType.TABLE, LayoutType.TABLE_ROTATED]
88
91
 
89
92
 
90
93
  @dataset_registry.register("pubtables1m_det")
@@ -102,7 +105,7 @@ class Pubtables1MDet(_BuiltInDataset):
102
105
  def _categories(self) -> DatasetCategories:
103
106
  return DatasetCategories(init_categories=_INIT_CATEGORIES_DET)
104
107
 
105
- def _builder(self) -> "Pubtables1MBuilder":
108
+ def _builder(self) -> Pubtables1MBuilder:
106
109
  return Pubtables1MBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
107
110
 
108
111
 
@@ -177,7 +180,7 @@ class Pubtables1MBuilder(DataFlowBaseBuilder):
177
180
  load_image,
178
181
  filter_empty_image=True,
179
182
  fake_score=fake_score,
180
- category_name_mapping={"table": LayoutType.table, "table rotated": LayoutType.table_rotated},
183
+ category_name_mapping={"table": LayoutType.TABLE, "table rotated": LayoutType.TABLE_ROTATED},
181
184
  ),
182
185
  )
183
186
 
@@ -192,13 +195,13 @@ _ANNOTATION_FILES_STRUCT: Mapping[str, str] = {
192
195
  }
193
196
 
194
197
  _INIT_CATEGORIES_STRUCT = [
195
- LayoutType.table,
196
- LayoutType.row,
197
- LayoutType.column,
198
- CellType.spanning,
199
- CellType.row_header,
200
- CellType.column_header,
201
- CellType.projected_row_header,
198
+ LayoutType.TABLE,
199
+ LayoutType.ROW,
200
+ LayoutType.COLUMN,
201
+ CellType.SPANNING,
202
+ CellType.ROW_HEADER,
203
+ CellType.COLUMN_HEADER,
204
+ CellType.PROJECTED_ROW_HEADER,
202
205
  ]
203
206
 
204
207
  _IMAGES: Mapping[str, str] = {
@@ -225,7 +228,7 @@ class Pubtables1MStruct(_BuiltInDataset):
225
228
  def _categories(self) -> DatasetCategories:
226
229
  return DatasetCategories(init_categories=_INIT_CATEGORIES_STRUCT)
227
230
 
228
- def _builder(self) -> "Pubtables1MBuilderStruct":
231
+ def _builder(self) -> Pubtables1MBuilderStruct:
229
232
  return Pubtables1MBuilderStruct(location=_LOCATION, annotation_files=_ANNOTATION_FILES_STRUCT)
230
233
 
231
234
 
@@ -299,13 +302,13 @@ class Pubtables1MBuilderStruct(DataFlowBaseBuilder):
299
302
  filter_empty_image=True,
300
303
  fake_score=fake_score,
301
304
  category_name_mapping={
302
- "table": LayoutType.table,
303
- "table spanning cell": CellType.spanning,
304
- "table row": LayoutType.row,
305
- "table row header": CellType.row_header,
306
- "table projected row header": CellType.projected_row_header,
307
- "table column": LayoutType.column,
308
- "table column header": CellType.column_header,
305
+ "table": LayoutType.TABLE,
306
+ "table spanning cell": CellType.SPANNING,
307
+ "table row": LayoutType.ROW,
308
+ "table row header": CellType.ROW_HEADER,
309
+ "table projected row header": CellType.PROJECTED_ROW_HEADER,
310
+ "table column": LayoutType.COLUMN,
311
+ "table column header": CellType.COLUMN_HEADER,
309
312
  },
310
313
  ),
311
314
  )
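The bulk of these hunks is the 0.33-wide rename of enum members from lowercase to uppercase (`LayoutType.table` → `LayoutType.TABLE`, and likewise for `CellType`, `TableType`, `DatasetType`, `WordType`, `TokenClasses`, `BioTag`, `TokenClassWithTag`). Downstream code written against 0.31 member names therefore fails on attribute access. If the members remain string-valued enums with unchanged values (assumed here, not verifiable from this diff alone), lookup by value is a version-tolerant bridge:

from deepdoctection.utils.settings import LayoutType

# 0.31 spelling: LayoutType.table — gone in 0.33.
table = LayoutType.TABLE

# Version-tolerant alternative, assuming member values such as "table" are stable:
table_compat = LayoutType("table")
assert table is table_compat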
deepdoctection/datasets/instances/pubtabnet.py

@@ -27,17 +27,18 @@ Module for Pubtabnet dataset. Place the dataset as follows
 │ ├── PMC3.png
 ├── PubTabNet_2.0.0.jsonl
 """
+from __future__ import annotations
 
-from typing import Dict, List, Mapping, Union
+from typing import Mapping, Union
 
 from ...dataflow import DataFlow, MapData
 from ...dataflow.custom_serialize import SerializerJsonlines
 from ...datasets.info import DatasetInfo
 from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.pubstruct import pub_to_image
-from ...utils.detection_types import JsonDict
 from ...utils.logger import LoggingRecord, logger
 from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType, WordType
+from ...utils.types import PubtabnetDict
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories
@@ -69,38 +70,38 @@ _URL = (
     "pubtabnet.tar.gz?_ga=2.267291150.146828643.1629125962-1173244232.1625045842"
 )
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.object_detection
+_TYPE = DatasetType.OBJECT_DETECTION
 _LOCATION = "pubtabnet"
 _ANNOTATION_FILES: Mapping[str, str] = {"all": "PubTabNet_2.0.0.jsonl"}
 
-_INIT_CATEGORIES = [LayoutType.cell, TableType.item, LayoutType.table, LayoutType.word]
-_SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
+_INIT_CATEGORIES = [LayoutType.CELL, TableType.ITEM, LayoutType.TABLE, LayoutType.WORD]
+_SUB_CATEGORIES: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]]
 _SUB_CATEGORIES = {
-    TableType.item: {TableType.item: [LayoutType.row, LayoutType.column]},
-    LayoutType.cell: {
-        CellType.header: [CellType.header, CellType.body],
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
+    LayoutType.CELL: {
+        CellType.HEADER: [CellType.HEADER, CellType.BODY],
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    CellType.header: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    CellType.HEADER: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    CellType.body: {
-        CellType.row_number: [],
-        CellType.column_number: [],
-        CellType.row_span: [],
-        CellType.column_span: [],
-        CellType.spanning: [CellType.spanning],
+    CellType.BODY: {
+        CellType.ROW_NUMBER: [],
+        CellType.COLUMN_NUMBER: [],
+        CellType.ROW_SPAN: [],
+        CellType.COLUMN_SPAN: [],
+        CellType.SPANNING: [CellType.SPANNING],
     },
-    LayoutType.table: {TableType.html: [TableType.html]},
-    LayoutType.word: {WordType.characters: [WordType.characters]},
+    LayoutType.TABLE: {TableType.HTML: [TableType.HTML]},
+    LayoutType.WORD: {WordType.CHARACTERS: [WordType.CHARACTERS]},
 }
 
 
@@ -119,7 +120,7 @@ class Pubtabnet(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-    def _builder(self) -> "PubtabnetBuilder":
+    def _builder(self) -> PubtabnetBuilder:
         return PubtabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -169,7 +170,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         df = SerializerJsonlines.load(path, max_datapoints=max_datapoints)
 
         # Map
-        def replace_filename(dp: JsonDict) -> JsonDict:
+        def replace_filename(dp: PubtabnetDict) -> PubtabnetDict:
             dp["filename"] = self.get_workdir() / dp["split"] / dp["filename"]
             return dp
 
@@ -177,7 +178,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         df = MapData(df, lambda dp: dp if dp["split"] == split else None)
         pub_mapper = pub_to_image(
             self.categories.get_categories(name_as_key=True, init=True),
-            load_image,
+            load_image=load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
             dd_pipe_like=dd_pipe_like,
@@ -186,6 +187,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         )
 
         df = MapData(df, pub_mapper)
+
         if self.categories.is_cat_to_sub_cat():
             df = MapData(
                 df,
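Finally, the `pub_to_image` call sites switch positional arguments to keywords (`load_image=load_image` here, `categories_name_as_key=...` in fintabnet.py), which keeps the curried mapper calls unambiguous as parameters are added. A hedged end-to-end sketch of consuming the rebuilt dataflow, assuming the public registry API (`get_dataset`) and the builder kwargs shown in these hunks are otherwise unchanged in 0.33:

import deepdoctection as dd

# Stream a few PubTabNet datapoints through the rebuilt PubtabnetBuilder;
# build() forwards kwargs such as split, load_image and max_datapoints.
pubtabnet = dd.get_dataset("pubtabnet")
df = pubtabnet.dataflow.build(split="train", load_image=True, max_datapoints=5)
df.reset_state()
for dp in df:
    print(dp.file_name)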