deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -48,9 +48,13 @@ class MappingContextManager:
  self, dp_name: Optional[str] = None, filter_level: str = "image", **kwargs: dict[str, Optional[str]]
  ) -> None:
  """
- :param dp_name: A name for the datapoint to be mapped
- :param filter_level: Indicates if the `MappingContextManager` is use on datapoint level,
- annotation level etc. Filter level will only be used for logging
+ Args:
+ dp_name: A name for the datapoint to be mapped.
+ filter_level: Indicates if the `MappingContextManager` is used on datapoint level, annotation level etc.
+ `filter_level` will only be used for logging.
+
+ Note:
+ Use this context manager to catch and log exceptions during mapping.
  """
  self.dp_name = dp_name if dp_name is not None else ""
  self.filter_level = filter_level
@@ -59,7 +63,10 @@ class MappingContextManager:

  def __enter__(self) -> MappingContextManager:
  """
- context enter
+ Context enter.
+
+ Returns:
+ The `MappingContextManager` instance.
  """
  return self

@@ -70,7 +77,15 @@ class MappingContextManager:
  exc_tb: Optional[TracebackType],
  ) -> Optional[bool]:
  """
- context exit
+ Context exit.
+
+ Args:
+ exc_type: The exception type.
+ exc_val: The exception value.
+ exc_tb: The traceback object.
+
+ Returns:
+ `True` if the exception was handled, otherwise `None`.
  """
  if (
  exc_type
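The new `Note` in this docstring describes the intended use: wrap a mapping step so that a failure is logged and swallowed instead of aborting the whole dataflow. A minimal, self-contained sketch of that pattern (class and attribute names below are illustrative, not the library's actual API):

```python
from types import TracebackType
from typing import Optional, Type


class SimpleMappingContext:
    """Stand-in for the pattern described above: log the error, do not re-raise."""

    def __init__(self, dp_name: str = "", filter_level: str = "image") -> None:
        self.dp_name = dp_name
        self.filter_level = filter_level
        self.context_error = False  # hypothetical flag, not the library attribute

    def __enter__(self) -> "SimpleMappingContext":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> Optional[bool]:
        if exc_type is not None:
            print(f"mapping of {self.filter_level} {self.dp_name!r} failed: {exc_val}")
            self.context_error = True
            return True  # returning True suppresses the exception
        return None


with SimpleMappingContext(dp_name="doc_001.png") as ctx:
    raise ValueError("broken annotation")
if ctx.context_error:
    pass  # e.g. drop this datapoint and continue the dataflow
```

Returning `True` from `__exit__` is what keeps the surrounding dataflow running past a broken datapoint.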
@@ -121,9 +136,10 @@ class DefaultMapper:

  def __init__(self, func: Callable[[DP, S], T], *args: Any, **kwargs: Any) -> None:
  """
- :param func: A mapping function
- :param args: Default args to pass to the function
- :param kwargs: Default kwargs to pass to the function
+ Args:
+ func: A mapping function
+ args: Default `args` to pass to the function
+ kwargs: Default `kwargs` to pass to the function
  """
  self.func = func
  self.argument_args = args
@@ -131,34 +147,42 @@ class DefaultMapper:

  def __call__(self, dp: Any) -> Any:
  """
- :param dp: datapoint within a dataflow
- :return: The return value of the invoked function with default arguments.
+ Call the wrapped function with the given datapoint and default arguments.
+
+ Args:
+ dp: Datapoint within a dataflow.
+
+ Returns:
+ The return value of the invoked function with default arguments.
  """
  return self.func(dp, *self.argument_args, **self.argument_kwargs)


  def curry(func: Callable[..., T]) -> Callable[..., Callable[[DP], T]]:
  """
- Decorator for converting functions that maps
+ Decorator for converting functions that map

- dps: Union[JsonDict,Image] -> Union[JsonDict,Image]
+ ```python
+ dps: Union[JsonDict, Image] -> Union[JsonDict, Image]
+ ```

- to `DefaultMapper`s. They will be initialized with all arguments except dp and can be called later with only the
+ to `DefaultMapper`s. They will be initialized with all arguments except `dp` and can be called later with only the
  datapoint as argument. This setting is useful when incorporating the function within a dataflow.

- **Example:**
-
- @curry
- def json_to_image(dp, config_arg_1, config_arg_2,...) -> Image:
+ Example:
+ ```python
+ @curry
+ def json_to_image(dp, config_arg_1, config_arg_2, ...) -> Image:
  ...
+ df = ...
+ df = MapData(df, json_to_image(config_arg_1=val_1, config_arg_2=val_2))
+ ```

- can be applied like:
-
- df = ...
- df = MapData(df,json_to_image(config_arg_1=val_1,config_arg_2=val_2))
+ Args:
+ func: A callable [[`Image`], [Any]] -> [`Image`]

- :param func: A callable [[`Image`],[Any]] -> [`Image`]
- :return: A DefaultMapper
+ Returns:
+ A `DefaultMapper`.
  """

  @functools.wraps(func)
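The `Example` block above shows how a curried mapper is configured once and then applied per datapoint. For reviewers, a runnable, generic sketch of such a decorator, using a plain closure instead of the library's `DefaultMapper` class (`scale` and `factor` are made-up names):

```python
import functools
from typing import Any, Callable, TypeVar

T = TypeVar("T")


def curry(func: Callable[..., T]) -> Callable[..., Callable[[Any], T]]:
    """Bind every argument except the datapoint; call the result with the datapoint only."""

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Callable[[Any], T]:
        # Freeze the configuration arguments now, accept the datapoint later.
        return lambda dp: func(dp, *args, **kwargs)

    return wrapper


@curry
def scale(dp: float, factor: float) -> float:
    return dp * factor


mapper = scale(factor=2.0)  # configured once, e.g. when building a dataflow
print(mapper(3.0))          # applied per datapoint -> 6.0
```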
@@ -170,10 +194,13 @@ def curry(func: Callable[..., T]) -> Callable[..., Callable[[DP], T]]:

  def maybe_get_fake_score(add_fake_score: bool) -> Optional[float]:
  """
- Returns a fake score, if add_fake_score = True. Will otherwise return None
+ Returns a fake score, if `add_fake_score` is `True`. Will otherwise return `None`.
+
+ Args:
+ add_fake_score: Boolean.

- :param add_fake_score: boolean
- :return: A uniform random variable in (0,1)
+ Returns:
+ A uniform random variable in `(0,1)` or `None`.
  """
  if add_fake_score:
  return np.random.uniform(0.0, 1.0, 1)[0]
@@ -182,20 +209,24 @@ def maybe_get_fake_score(add_fake_score: bool) -> Optional[float]:

  class LabelSummarizer:
  """
- A class for generating label statistics. Useful, when mapping and generating a SummaryAnnotation.
-
- summarizer = LabelSummarizer({"1": "label_1","2":"label_2"})
+ A class for generating label statistics. Useful when mapping and generating a `SummaryAnnotation`.

+ Example:
+ ```python
+ summarizer = LabelSummarizer({"1": "label_1", "2": "label_2"})
  for dp in some_dataflow:
  summarizer.dump(dp["label_id"])
-
  summarizer.print_summary_histogram()
+ ```

+ Args:
+ categories: A dict of categories as given as in `categories.get_categories()`.
  """

  def __init__(self, categories: Mapping[int, ObjectTypes]) -> None:
  """
- :param categories: A dict of categories as given as in categories.get_categories().
+ Args:
+ categories: A dict of categories as given as in `categories.get_categories()`.
  """
  self.categories = categories
  cat_numbers = len(self.categories.keys())
@@ -204,16 +235,20 @@ class LabelSummarizer:

  def dump(self, item: Union[Sequence[Union[str, int]], str, int]) -> None:
  """
- Dump a category number
+ Dump a category number.

- :param item: A category number.
+ Args:
+ item: A category number.
  """
  np_item = np.asarray(item, dtype="int8")
  self.summary += np.histogram(np_item, bins=self.hist_bins)[0]

  def get_summary(self) -> dict[int, int]:
  """
- Get a dictionary with category ids and the number dumped
+ Get a dictionary with category ids and the number dumped.
+
+ Returns:
+ A dictionary mapping category ids to counts.
  """
  return dict(list(zip(self.categories.keys(), self.summary.tolist())))

@@ -221,7 +256,8 @@ class LabelSummarizer:
  """
  Prints a summary from all dumps.

- :param dd_logic: Follow dd category convention when printing histogram (last background bucket omitted).
+ Args:
+ dd_logic: Follow dd category convention when printing histogram (last background bucket omitted).
  """
  if dd_logic:
  data = list(itertools.chain(*[[self.categories[i].value, v] for i, v in enumerate(self.summary, 1)]))
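`dump`/`get_summary` above amount to a histogram accumulator over category ids. A small NumPy sketch of that idea (the bin construction is an assumption for illustration, not the library's exact `hist_bins`):

```python
import numpy as np

categories = {1: "label_1", 2: "label_2"}          # mirrors the docstring example

# Bin edges chosen so that category id k falls into bin k (assumed construction).
hist_bins = np.arange(1, len(categories) + 2)      # -> [1, 2, 3]
summary = np.zeros(len(categories), dtype="int64")

for label_id in [1, 2, 2, 1, 2]:                   # stand-in for dp["label_id"] values
    summary += np.histogram(np.asarray(label_id), bins=hist_bins)[0]

print(dict(zip(categories.keys(), summary.tolist())))  # {1: 2, 2: 3}
```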
@@ -46,55 +46,64 @@ def match_anns_by_intersection(
  max_parent_only: bool = False,
  ) -> tuple[Any, Any, Sequence[ImageAnnotation], Sequence[ImageAnnotation]]:
  """
- Generates an iou/ioa-matrix for parent_ann_categories and child_ann_categories and returns pairs of child/parent
- indices that are above some intersection threshold. It will also return a list of all pre selected parent and child
+ Generates an iou/ioa-matrix for `parent_ann_categories` and `child_ann_categories` and returns pairs of child/parent
+ indices that are above some intersection threshold. It will also return a list of all pre-selected parent and child
  annotations.

- **Example:**
-
- Let `p_i, c_j` be annotations ids of parent and children according to some category names.
-
- |**ioa**|**c_1**|**c_2**|
- |-------|-------|-------|
- |**p_1**| 0.3 | 0.8 |
- |**p_2**| 0.4 | 0.1 |
- |**p_3**| 1. | 0.4 |
-
- With `ioa_threshold = 0.5` it will return:
-
- `[[2],[0]], [[1],[],[1]], [c_1,c_2], [p_1,p_2,p_3]`.
-
- For each child the sum of all ioas with all parents sum up to 1. Hence, the ioa with one parent will in general
- decrease if one child intersects with more parents. Take two childs one matching two parents with an ioa of 0.5 each
- while the second matching four parents with an ioa of 0.25 each. In this situation it is difficult to assign
- children according to a given threshold and one also has to take into account the number of parental intersection
- for each child. Setting `use_weighted_intersections` to True will multiply each ioa with the number of intersection
- making it easier to work with an absolute threshold.
-
- In some situation you want to assign to each child at most one parent. Setting `max_parent_only` to `True` it will
- select the parent with the highest ioa. Note, there is currently no implementation for iou.
-
- :param dp: image datapoint
- :param parent_ann_category_names: single str or list of category names
- :param child_ann_category_names: single str or list of category names
- :param matching_rule: intersection measure type, either "iou" or "ioa"
- :param threshold: Threshold, for mat given matching rule. Will assign every child ann with iou/ioa above the
- threshold to the parental annotation.
- :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using
- the ioa_matrix it will use mat weighted ioa in order to take into account that
- intersections with more cells will likely decrease the ioa value. By multiplying
- the ioa with the number of all intersection for each child this value calibrate
- the ioa.
- :param parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent candi-
- dates which are not in the list.
- :param child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other children
- candidates which are not in the list.
- :param parent_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other
- parent candidates which are not in the list.
- :param child_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other
- children candidates which are not in the list.
- :param max_parent_only: Will assign to each child at most one parent with maximum ioa
- :return: child indices, parent indices (see Example), list of parent ids and list of children ids.
+ Example:
+ ```python
+ match_anns_by_intersection()
+ ```
+
+ Let `p_i, c_j` be annotation ids of parent and children according to some category names.
+
+ | ioa | c_1 | c_2 |
+ |-------|-----|-----|
+ | p_1 | 0.3 | 0.8 |
+ | p_2 | 0.4 | 0.1 |
+ | p_3 | 1.0 | 0.4 |
+
+ With `ioa_threshold = 0.5` it will return:
+
+ `[[2],[0]], [[1],[],[1]], [c_1,c_2], [p_1,p_2,p_3]`.
+
+ For each child, the sum of all ioas with all parents sum up to 1. Hence, the ioa with one parent will in general
+ decrease if one child intersects with more parents. Take two children, one matching two parents with an ioa of
+ 0.5 each, while the second matches four parents with an ioa of 0.25 each. In this situation, it is difficult to
+ assign children according to a given threshold and one also has to take into account the number of parental
+ intersections for each child.
+
+ Note:
+ Setting `use_weighted_intersections` to True will multiply each ioa with the number of intersections,
+ making it easier to work with an absolute threshold.
+
+ Note:
+ In some situations, you want to assign to each child at most one parent. Setting `max_parent_only` to `True`
+ will select the parent with the highest ioa. There is currently no implementation for iou.
+
+ Args:
+ dp: Image datapoint.
+ matching_rule: Intersection measure type, either `iou` or `ioa`.
+ threshold: Threshold for the given matching rule. Will assign every child annotation with iou/ioa above the
+ threshold to the parental annotation.
+ use_weighted_intersections: This is currently only implemented for matching_rule `ioa`. Instead of using
+ the ioa_matrix, it will use a weighted ioa in order to take into account that intersections with more cells
+ will likely decrease the ioa value. By multiplying the ioa with the number of all intersections for each
+ child, this value calibrates the ioa.
+ parent_ann_category_names: Single str or list of category names.
+ child_ann_category_names: Single str or list of category names.
+ parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent
+ candidates which are not in the list.
+ child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other children
+ candidates which are not in the list.
+ parent_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other parent
+ candidates which are not in the list.
+ child_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other children
+ candidates which are not in the list.
+ max_parent_only: Will assign to each child at most one parent with maximum ioa.
+
+ Returns:
+ child indices, parent indices (see Example), list of parent ids and list of children ids.
  """

  assert matching_rule in ["iou", "ioa"], "matching rule must be either iou or ioa"
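To make the tabulated example concrete, here is a NumPy sketch of the plain (unweighted) ioa thresholding described above; the matrix is the docstring's own example, the variable names are not the library's:

```python
import numpy as np

# ioa matrix from the docstring: rows are parents p_1..p_3, columns are children c_1, c_2.
ioa = np.array([
    [0.3, 0.8],
    [0.4, 0.1],
    [1.0, 0.4],
])

threshold = 0.5
parent_idx, child_idx = np.where(ioa >= threshold)
print(list(zip(parent_idx.tolist(), child_idx.tolist())))
# [(0, 1), (2, 0)] -> p_1 matches c_2, p_3 matches c_1, as in the docstring's result

# `max_parent_only` idea: keep only the parent with the highest ioa for each child.
print(ioa.argmax(axis=0).tolist())  # [2, 0] -> c_1 -> p_3, c_2 -> p_1
```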
@@ -166,21 +175,26 @@ def match_anns_by_distance(
  ) -> list[tuple[ImageAnnotation, ImageAnnotation]]:
  """
  Generates pairs of parent and child annotations by calculating the euclidean distance between the centers of the
- parent and child bounding boxes. It will return the closest child for each parent. Note, that a child can be
- assigned multiple times to different parents.
-
- :param dp: image datapoint
- :param parent_ann_category_names: single str or list of category names
- :param child_ann_category_names: single str or list of category names
- :param parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent candi-
- dates which are not in the list.
- :param child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other children
- candidates which are not in the list.
- :param parent_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other
- parent candidates which are not in the list.
- :param child_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other
- children candidates which are not in the list.
- :return:
+ parent and child bounding boxes. It will return the closest child for each parent.
+
+ Note:
+ A child can be assigned multiple times to different parents.
+
+ Args:
+ dp: Image datapoint.
+ parent_ann_category_names: Single str or list of category names.
+ child_ann_category_names: Single str or list of category names.
+ parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent
+ candidates which are not in the list.
+ child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other children
+ candidates which are not in the list.
+ parent_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other parent
+ candidates which are not in the list.
+ child_ann_service_ids: Additional filter condition. If some ids are selected, it will ignore all other children
+ candidates which are not in the list.
+
+ Returns:
+ List of tuples of parent and child annotations.
  """

  parent_anns = dp.get_annotation(
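A compact sketch of the center-distance matching described above (coordinates are made up; the real function operates on `ImageAnnotation` bounding boxes):

```python
import numpy as np

parent_centers = np.array([[10.0, 10.0], [50.0, 12.0]])              # (x, y) box centers
child_centers = np.array([[48.0, 11.0], [9.0, 30.0], [100.0, 5.0]])

# Pairwise euclidean distances, shape (n_parents, n_children).
dists = np.linalg.norm(parent_centers[:, None, :] - child_centers[None, :, :], axis=-1)

# Closest child per parent; note a child may be picked by several parents.
pairs = [(p, int(c)) for p, c in enumerate(dists.argmin(axis=1))]
print(pairs)  # [(0, 1), (1, 0)]
```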
@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for small mapping functions
+ Small mapping functions
  """

  from __future__ import annotations
@@ -45,14 +45,19 @@ def to_image(
  height: Optional[int] = None,
  ) -> Optional[Image]:
  """
- Mapping an input from `dataflow.SerializerFiles` or similar to an Image
-
- :param dp: Image
- :param dpi: dot per inch definition for pdf resolution when converting to numpy array
- :param width: target width of the image. This option does only work when using Poppler as PDF renderer
- :param height: target width of the image. This option does only work when using Poppler as PDF renderer
- :param height: target height of the image
- :return: Image
+ Maps an input from `dataflow.SerializerFiles` or similar to an `Image`.
+
+ Args:
+ dp: Image.
+ dpi: Dot per inch definition for PDF resolution when converting to `np.array`.
+ width: Target width of the image. This option only works when using Poppler as PDF renderer.
+ height: Target height of the image. This option only works when using Poppler as PDF renderer.
+
+ Returns:
+ Image
+
+ Raises:
+ TypeError: If `dp` is not of the expected type for converting to image.
  """

  file_name: Optional[str]
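The `dpi` argument controls the rasterization resolution when the input is a PDF page. A quick back-of-the-envelope check of what that means in pixels (plain arithmetic, nothing deepdoctection-specific):

```python
# Approximate pixel size of an A4 page (8.27 x 11.69 inches) rendered at a given dpi.
dpi = 300
a4_inches = (8.27, 11.69)
print(tuple(round(side * dpi) for side in a4_inches))  # (2481, 3507)
```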
@@ -101,10 +106,13 @@ def to_image(

  def maybe_load_image(dp: Image) -> Image:
  """
- If `image` is None will load the image.
+ If `image` is `None`, loads the image.

- :param dp: An Image
- :return: Image with attr: image not None
+ Args:
+ dp: An `Image`.
+
+ Returns:
+ Image with attribute `image` not `None`.
  """

  if dp.image is None:
@@ -116,10 +124,13 @@ def maybe_load_image(dp: Image) -> Image:

  def maybe_remove_image(dp: Image) -> Image:
  """
- Remove `image` if a location is provided.
+ Removes `image` if a location is provided.
+
+ Args:
+ dp: An `Image`.

- :param dp: An Image
- :return: Image with None attr: image
+ Returns:
+ Image with attribute `image` set to `None`.
  """

  if dp.location is not None:
@@ -130,11 +141,14 @@ def maybe_remove_image(dp: Image) -> Image:
  @curry
  def maybe_remove_image_from_category(dp: Image, category_names: Optional[Union[str, Sequence[str]]] = None) -> Image:
  """
- Removes image from image annotation for some category names
+ Removes `image` from image annotation for some `category_name`s.
+
+ Args:
+ dp: An `Image`.
+ category_names: Category names.

- :param dp: An Image
- :param category_names: category names
- :return: Image with image attributes from image annotations removed
+ Returns:
+ Image with `image` attributes from image annotations removed.
  """
  if category_names is None:
  category_names = []
@@ -151,12 +165,15 @@ def maybe_remove_image_from_category(dp: Image, category_names: Optional[Union[s

  def image_ann_to_image(dp: Image, category_names: Union[str, list[str]], crop_image: bool = True) -> Image:
  """
- Adds `image` to annotations with given category names
+ Adds `image` to annotations with given category names.

- :param dp: Image
- :param category_names: A single or a list of category names
- :param crop_image: Will add numpy array to `image.image`
- :return: Image
+ Args:
+ dp: `Image`.
+ category_names: A single or a list of category names.
+ crop_image: If `True`, will add `np.array` to `image.image`.
+
+ Returns:
+ Image
  """

  img_anns = dp.get_annotation(category_names=category_names)
@@ -171,15 +188,18 @@ def maybe_ann_to_sub_image(
  dp: Image, category_names_sub_image: Union[str, list[str]], category_names: Union[str, list[str]], add_summary: bool
  ) -> Image:
  """
- Assigns to sub image with given category names all annotations with given category names whose bounding box lie
+ Assigns to sub image with given category names all annotations with given category names whose bounding box lies
  within the bounding box of the sub image.

- :param dp: Image
- :param category_names_sub_image: A single or a list of category names that will form a sub image.
- :param category_names: A single or a list of category names that will may be assigned to a sub image, conditioned
- on the bounding box lying within the sub image.
- :param add_summary: will add the whole summary annotation to the sub image
- :return: Image
+ Args:
+ dp: `Image`.
+ category_names_sub_image: A single or a list of category names that will form a sub image.
+ category_names: A single or a list of category names that may be assigned to a sub image, conditioned on the
+ bounding box lying within the sub image.
+ add_summary: If `True`, will add the whole summary annotation to the sub image.
+
+ Returns:
+ Image
  """

  anns = dp.get_annotation(category_names=category_names_sub_image)
@@ -194,19 +214,23 @@ def maybe_ann_to_sub_image(
  @curry
  def xml_to_dict(dp: JsonDict, xslt_obj: etree.XSLT) -> JsonDict:
  """
- Convert a xml object into a dict using a xsl style sheet.
-
- **Example:**
-
- with open(path_xslt) as xsl_file:
- xslt_file = xsl_file.read().encode('utf-8')
- xml_obj = etree.XML(xslt_file, parser=etree.XMLParser(encoding='utf-8'))
- xslt_obj = etree.XSLT(xml_obj)
- df = MapData(df, xml_to_dict(xslt_obj))
-
- :param dp: string representing the xml
- :param xslt_obj: xslt object to parse the string
- :return: parsed xml
+ Converts an XML object into a dict using an XSL style sheet.
+
+ Example:
+ ```python
+ with open(path_xslt) as xsl_file:
+ xslt_file = xsl_file.read().encode('utf-8')
+ xml_obj = etree.XML(xslt_file, parser=etree.XMLParser(encoding='utf-8'))
+ xslt_obj = etree.XSLT(xml_obj)
+ df = MapData(df, xml_to_dict(xslt_obj))
+ ```
+
+ Args:
+ dp: String representing the XML.
+ xslt_obj: XSLT object to parse the string.
+
+ Returns:
+ Parsed XML as a dict.
  """

  output = str(xslt_obj(dp["xml"]))
@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for mapping annotations in iiitar13k style structure
+ Mapping for PASCAL VOC dataset structure to `Image` format.
  """

  import os
@@ -41,17 +41,18 @@ def pascal_voc_dict_to_image(
  category_name_mapping: Optional[dict[str, str]] = None,
  ) -> Optional[Image]:
  """
- Map a dataset in a structure equivalent to iiitar13k annotation style to image format
-
- :param dp: a datapoint in serialized iiitar13k format. Note that another conversion from xml to
- a dict structure is required.
- :param categories_name_as_key: A dict of categories, e.g. DatasetCategories.get_categories(name_as_key=True)
- :param load_image: If 'True' it will load image to attr: Image.image
- :param filter_empty_image: Will return None, if datapoint has no annotations
- :param fake_score: If dp does not contain a score, a fake score with uniform random variables in (0,1)
- will be added.
- :param category_name_mapping: Map incoming category names, e.g. {"source_name":"target_name"}
- :return: Image
+ Maps a dataset in a structure equivalent to the PASCAL VOC annotation style to the `Image` format.
+
+ Args:
+ dp: A datapoint in PASCAL VOC format. Note that another conversion from XML to a dict structure is required.
+ categories_name_as_key: A dict of categories, e.g. `DatasetCategories.get_categories(name_as_key=True)`.
+ load_image: If `True`, it will load the image to the attribute `Image.image`.
+ filter_empty_image: Will return `None` if the datapoint has no annotations.
+ fake_score: If `dp` does not contain a score, a fake score with uniform random variables in (0,1) will be added.
+ category_name_mapping: Map incoming category names, e.g. `{"source_name": "target_name"}`.
+
+ Returns:
+ `Image` or `None`.
  """

  anns = dp.get("objects", [])
@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for mapping annotations to and from prodigy data structure
+ Module for mapping annotations to and from prodigy data structure.
  """

  import os
@@ -41,19 +41,24 @@ def prodigy_to_image(
  category_name_mapping: Optional[Mapping[str, str]] = None,
  ) -> Optional[Image]:
  """
- Map a datapoint of annotation structure as given as from Prodigy database to an Image
- structure.
-
- :param dp: A datapoint in dict structure as returned from Prodigy database
- :param categories_name_as_key: A dict of categories, e.g. DatasetCategories.get_categories(name_as_key=True)
- :param load_image: If 'True' it will load image to attr:`Image.image`
- :param fake_score: If dp does not contain a score, a fake score with uniform random variables in (0,1)
- will be added.
- :param path_reference_ds: A path to a reference-dataset. It must point to the basedir where the file
- of the datapoint can be found.
- :param accept_only_answer: Filter every datapoint that has the answer 'reject' or 'ignore'.
- :param category_name_mapping: Map incoming category names, e.g. {"source_name":"target_name"}
- :return: Image
+ Maps a datapoint of annotation structure from Prodigy database to an `Image` structure.
+
+ Args:
+ dp: A datapoint in dict structure as returned from Prodigy database.
+ categories_name_as_key: A dict of categories, e.g. `DatasetCategories.get_categories(name_as_key=True)`.
+ load_image: If `True`, it will load image to `Image.image`.
+ fake_score: If `dp` does not contain a score, a fake score with uniform random variables in (0,1) will be added.
+ path_reference_ds: A path to a reference-dataset. It must point to the basedir where the file of the datapoint
+ can be found.
+ accept_only_answer: Filter every datapoint that has the answer `reject` or `ignore`.
+ category_name_mapping: Map incoming category names, e.g. `{"source_name":"target_name"}`.
+
+ Returns:
+ `Image`
+
+ Note:
+ If `accept_only_answer` is `True`, only datapoints with the answer `accept` will be processed.
+
  """

  if accept_only_answer and dp.get("answer") != "accept":
@@ -147,12 +152,21 @@ def prodigy_to_image(
  @curry
  def image_to_prodigy(dp: Image, category_names: Optional[Sequence[ObjectTypes]] = None) -> JsonDict:
  """
- The mapper to transform the normalized image representation of datasets into the format
- for visualising the annotation components in Prodigy.
+ Transforms the normalized image representation of datasets into the format for visualizing the annotation
+ components in Prodigy.
+
+ Args:
+ dp: An `Image`.
+ category_names: A list of category names to filter the annotations.
+
+ Returns:
+ A dictionary with compulsory keys: `text` and `spans`.
+
+ Example:
+ ```python
+ image_to_prodigy(image_instance)
+ ```

- :param dp: An image
- :param category_names: A list of category names to filter the annotations
- :return: A dictionary with compulsory keys: "text" and "spans"
  """

  output: JsonDict = {}
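The docstring above fixes `text` and `spans` as the compulsory keys of the returned dict. Purely for illustration, a dict of that general shape built from hypothetical box annotations; the exact span fields deepdoctection emits are not shown in this diff:

```python
annotations = [
    {"category_name": "table", "box": (10, 20, 200, 180)},   # (ulx, uly, lrx, lry), made up
    {"category_name": "title", "box": (10, 2, 200, 18)},
]

task = {
    "text": "report_page_1.png",   # identifier displayed by Prodigy
    "spans": [
        {
            "label": ann["category_name"],        # field names here are assumptions
            "x": ann["box"][0],
            "y": ann["box"][1],
            "width": ann["box"][2] - ann["box"][0],
            "height": ann["box"][3] - ann["box"][1],
        }
        for ann in annotations
    ],
}
print(task["spans"][0]["label"], task["spans"][0]["width"], task["spans"][0]["height"])  # table 190 160
```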