dtlpy 1.113.10-py3-none-any.whl → 1.114.13-py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registry.
Files changed (243)
  1. dtlpy/__init__.py +488 -488
  2. dtlpy/__version__.py +1 -1
  3. dtlpy/assets/__init__.py +26 -26
  4. dtlpy/assets/__pycache__/__init__.cpython-38.pyc +0 -0
  5. dtlpy/assets/code_server/config.yaml +2 -2
  6. dtlpy/assets/code_server/installation.sh +24 -24
  7. dtlpy/assets/code_server/launch.json +13 -13
  8. dtlpy/assets/code_server/settings.json +2 -2
  9. dtlpy/assets/main.py +53 -53
  10. dtlpy/assets/main_partial.py +18 -18
  11. dtlpy/assets/mock.json +11 -11
  12. dtlpy/assets/model_adapter.py +83 -83
  13. dtlpy/assets/package.json +61 -61
  14. dtlpy/assets/package_catalog.json +29 -29
  15. dtlpy/assets/package_gitignore +307 -307
  16. dtlpy/assets/service_runners/__init__.py +33 -33
  17. dtlpy/assets/service_runners/converter.py +96 -96
  18. dtlpy/assets/service_runners/multi_method.py +49 -49
  19. dtlpy/assets/service_runners/multi_method_annotation.py +54 -54
  20. dtlpy/assets/service_runners/multi_method_dataset.py +55 -55
  21. dtlpy/assets/service_runners/multi_method_item.py +52 -52
  22. dtlpy/assets/service_runners/multi_method_json.py +52 -52
  23. dtlpy/assets/service_runners/single_method.py +37 -37
  24. dtlpy/assets/service_runners/single_method_annotation.py +43 -43
  25. dtlpy/assets/service_runners/single_method_dataset.py +43 -43
  26. dtlpy/assets/service_runners/single_method_item.py +41 -41
  27. dtlpy/assets/service_runners/single_method_json.py +42 -42
  28. dtlpy/assets/service_runners/single_method_multi_input.py +45 -45
  29. dtlpy/assets/voc_annotation_template.xml +23 -23
  30. dtlpy/caches/base_cache.py +32 -32
  31. dtlpy/caches/cache.py +473 -473
  32. dtlpy/caches/dl_cache.py +201 -201
  33. dtlpy/caches/filesystem_cache.py +89 -89
  34. dtlpy/caches/redis_cache.py +84 -84
  35. dtlpy/dlp/__init__.py +20 -20
  36. dtlpy/dlp/cli_utilities.py +367 -367
  37. dtlpy/dlp/command_executor.py +764 -764
  38. dtlpy/dlp/dlp +1 -1
  39. dtlpy/dlp/dlp.bat +1 -1
  40. dtlpy/dlp/dlp.py +128 -128
  41. dtlpy/dlp/parser.py +651 -651
  42. dtlpy/entities/__init__.py +83 -83
  43. dtlpy/entities/analytic.py +311 -311
  44. dtlpy/entities/annotation.py +1879 -1879
  45. dtlpy/entities/annotation_collection.py +699 -699
  46. dtlpy/entities/annotation_definitions/__init__.py +20 -20
  47. dtlpy/entities/annotation_definitions/base_annotation_definition.py +100 -100
  48. dtlpy/entities/annotation_definitions/box.py +195 -195
  49. dtlpy/entities/annotation_definitions/classification.py +67 -67
  50. dtlpy/entities/annotation_definitions/comparison.py +72 -72
  51. dtlpy/entities/annotation_definitions/cube.py +204 -204
  52. dtlpy/entities/annotation_definitions/cube_3d.py +149 -149
  53. dtlpy/entities/annotation_definitions/description.py +32 -32
  54. dtlpy/entities/annotation_definitions/ellipse.py +124 -124
  55. dtlpy/entities/annotation_definitions/free_text.py +62 -62
  56. dtlpy/entities/annotation_definitions/gis.py +69 -69
  57. dtlpy/entities/annotation_definitions/note.py +139 -139
  58. dtlpy/entities/annotation_definitions/point.py +117 -117
  59. dtlpy/entities/annotation_definitions/polygon.py +182 -182
  60. dtlpy/entities/annotation_definitions/polyline.py +111 -111
  61. dtlpy/entities/annotation_definitions/pose.py +92 -92
  62. dtlpy/entities/annotation_definitions/ref_image.py +86 -86
  63. dtlpy/entities/annotation_definitions/segmentation.py +240 -240
  64. dtlpy/entities/annotation_definitions/subtitle.py +34 -34
  65. dtlpy/entities/annotation_definitions/text.py +85 -85
  66. dtlpy/entities/annotation_definitions/undefined_annotation.py +74 -74
  67. dtlpy/entities/app.py +220 -220
  68. dtlpy/entities/app_module.py +107 -107
  69. dtlpy/entities/artifact.py +174 -174
  70. dtlpy/entities/assignment.py +399 -399
  71. dtlpy/entities/base_entity.py +214 -214
  72. dtlpy/entities/bot.py +113 -113
  73. dtlpy/entities/codebase.py +296 -296
  74. dtlpy/entities/collection.py +38 -38
  75. dtlpy/entities/command.py +169 -169
  76. dtlpy/entities/compute.py +442 -442
  77. dtlpy/entities/dataset.py +1285 -1285
  78. dtlpy/entities/directory_tree.py +44 -44
  79. dtlpy/entities/dpk.py +470 -470
  80. dtlpy/entities/driver.py +222 -222
  81. dtlpy/entities/execution.py +397 -397
  82. dtlpy/entities/feature.py +124 -124
  83. dtlpy/entities/feature_set.py +145 -145
  84. dtlpy/entities/filters.py +641 -641
  85. dtlpy/entities/gis_item.py +107 -107
  86. dtlpy/entities/integration.py +184 -184
  87. dtlpy/entities/item.py +953 -953
  88. dtlpy/entities/label.py +123 -123
  89. dtlpy/entities/links.py +85 -85
  90. dtlpy/entities/message.py +175 -175
  91. dtlpy/entities/model.py +694 -691
  92. dtlpy/entities/node.py +1005 -1005
  93. dtlpy/entities/ontology.py +803 -803
  94. dtlpy/entities/organization.py +287 -287
  95. dtlpy/entities/package.py +657 -657
  96. dtlpy/entities/package_defaults.py +5 -5
  97. dtlpy/entities/package_function.py +185 -185
  98. dtlpy/entities/package_module.py +113 -113
  99. dtlpy/entities/package_slot.py +118 -118
  100. dtlpy/entities/paged_entities.py +290 -267
  101. dtlpy/entities/pipeline.py +593 -593
  102. dtlpy/entities/pipeline_execution.py +279 -279
  103. dtlpy/entities/project.py +394 -394
  104. dtlpy/entities/prompt_item.py +499 -499
  105. dtlpy/entities/recipe.py +301 -301
  106. dtlpy/entities/reflect_dict.py +102 -102
  107. dtlpy/entities/resource_execution.py +138 -138
  108. dtlpy/entities/service.py +958 -958
  109. dtlpy/entities/service_driver.py +117 -117
  110. dtlpy/entities/setting.py +294 -294
  111. dtlpy/entities/task.py +491 -491
  112. dtlpy/entities/time_series.py +143 -143
  113. dtlpy/entities/trigger.py +426 -426
  114. dtlpy/entities/user.py +118 -118
  115. dtlpy/entities/webhook.py +124 -124
  116. dtlpy/examples/__init__.py +19 -19
  117. dtlpy/examples/add_labels.py +135 -135
  118. dtlpy/examples/add_metadata_to_item.py +21 -21
  119. dtlpy/examples/annotate_items_using_model.py +65 -65
  120. dtlpy/examples/annotate_video_using_model_and_tracker.py +75 -75
  121. dtlpy/examples/annotations_convert_to_voc.py +9 -9
  122. dtlpy/examples/annotations_convert_to_yolo.py +9 -9
  123. dtlpy/examples/convert_annotation_types.py +51 -51
  124. dtlpy/examples/converter.py +143 -143
  125. dtlpy/examples/copy_annotations.py +22 -22
  126. dtlpy/examples/copy_folder.py +31 -31
  127. dtlpy/examples/create_annotations.py +51 -51
  128. dtlpy/examples/create_video_annotations.py +83 -83
  129. dtlpy/examples/delete_annotations.py +26 -26
  130. dtlpy/examples/filters.py +113 -113
  131. dtlpy/examples/move_item.py +23 -23
  132. dtlpy/examples/play_video_annotation.py +13 -13
  133. dtlpy/examples/show_item_and_mask.py +53 -53
  134. dtlpy/examples/triggers.py +49 -49
  135. dtlpy/examples/upload_batch_of_items.py +20 -20
  136. dtlpy/examples/upload_items_and_custom_format_annotations.py +55 -55
  137. dtlpy/examples/upload_items_with_modalities.py +43 -43
  138. dtlpy/examples/upload_segmentation_annotations_from_mask_image.py +44 -44
  139. dtlpy/examples/upload_yolo_format_annotations.py +70 -70
  140. dtlpy/exceptions.py +125 -125
  141. dtlpy/miscellaneous/__init__.py +20 -20
  142. dtlpy/miscellaneous/dict_differ.py +95 -95
  143. dtlpy/miscellaneous/git_utils.py +217 -217
  144. dtlpy/miscellaneous/json_utils.py +14 -14
  145. dtlpy/miscellaneous/list_print.py +105 -105
  146. dtlpy/miscellaneous/zipping.py +130 -130
  147. dtlpy/ml/__init__.py +20 -20
  148. dtlpy/ml/base_feature_extractor_adapter.py +27 -27
  149. dtlpy/ml/base_model_adapter.py +945 -940
  150. dtlpy/ml/metrics.py +461 -461
  151. dtlpy/ml/predictions_utils.py +274 -274
  152. dtlpy/ml/summary_writer.py +57 -57
  153. dtlpy/ml/train_utils.py +60 -60
  154. dtlpy/new_instance.py +252 -252
  155. dtlpy/repositories/__init__.py +56 -56
  156. dtlpy/repositories/analytics.py +85 -85
  157. dtlpy/repositories/annotations.py +916 -916
  158. dtlpy/repositories/apps.py +383 -383
  159. dtlpy/repositories/artifacts.py +452 -452
  160. dtlpy/repositories/assignments.py +599 -599
  161. dtlpy/repositories/bots.py +213 -213
  162. dtlpy/repositories/codebases.py +559 -559
  163. dtlpy/repositories/collections.py +332 -348
  164. dtlpy/repositories/commands.py +158 -158
  165. dtlpy/repositories/compositions.py +61 -61
  166. dtlpy/repositories/computes.py +434 -406
  167. dtlpy/repositories/datasets.py +1291 -1291
  168. dtlpy/repositories/downloader.py +895 -895
  169. dtlpy/repositories/dpks.py +433 -433
  170. dtlpy/repositories/drivers.py +266 -266
  171. dtlpy/repositories/executions.py +817 -817
  172. dtlpy/repositories/feature_sets.py +226 -226
  173. dtlpy/repositories/features.py +238 -238
  174. dtlpy/repositories/integrations.py +484 -484
  175. dtlpy/repositories/items.py +909 -915
  176. dtlpy/repositories/messages.py +94 -94
  177. dtlpy/repositories/models.py +877 -867
  178. dtlpy/repositories/nodes.py +80 -80
  179. dtlpy/repositories/ontologies.py +511 -511
  180. dtlpy/repositories/organizations.py +525 -525
  181. dtlpy/repositories/packages.py +1941 -1941
  182. dtlpy/repositories/pipeline_executions.py +448 -448
  183. dtlpy/repositories/pipelines.py +642 -642
  184. dtlpy/repositories/projects.py +539 -539
  185. dtlpy/repositories/recipes.py +399 -399
  186. dtlpy/repositories/resource_executions.py +137 -137
  187. dtlpy/repositories/schema.py +120 -120
  188. dtlpy/repositories/service_drivers.py +213 -213
  189. dtlpy/repositories/services.py +1704 -1704
  190. dtlpy/repositories/settings.py +339 -339
  191. dtlpy/repositories/tasks.py +1124 -1124
  192. dtlpy/repositories/times_series.py +278 -278
  193. dtlpy/repositories/triggers.py +536 -536
  194. dtlpy/repositories/upload_element.py +257 -257
  195. dtlpy/repositories/uploader.py +651 -651
  196. dtlpy/repositories/webhooks.py +249 -249
  197. dtlpy/services/__init__.py +22 -22
  198. dtlpy/services/aihttp_retry.py +131 -131
  199. dtlpy/services/api_client.py +1782 -1782
  200. dtlpy/services/api_reference.py +40 -40
  201. dtlpy/services/async_utils.py +133 -133
  202. dtlpy/services/calls_counter.py +44 -44
  203. dtlpy/services/check_sdk.py +68 -68
  204. dtlpy/services/cookie.py +115 -115
  205. dtlpy/services/create_logger.py +156 -156
  206. dtlpy/services/events.py +84 -84
  207. dtlpy/services/logins.py +235 -235
  208. dtlpy/services/reporter.py +256 -256
  209. dtlpy/services/service_defaults.py +91 -91
  210. dtlpy/utilities/__init__.py +20 -20
  211. dtlpy/utilities/annotations/__init__.py +16 -16
  212. dtlpy/utilities/annotations/annotation_converters.py +269 -269
  213. dtlpy/utilities/base_package_runner.py +264 -264
  214. dtlpy/utilities/converter.py +1650 -1650
  215. dtlpy/utilities/dataset_generators/__init__.py +1 -1
  216. dtlpy/utilities/dataset_generators/dataset_generator.py +670 -670
  217. dtlpy/utilities/dataset_generators/dataset_generator_tensorflow.py +23 -23
  218. dtlpy/utilities/dataset_generators/dataset_generator_torch.py +21 -21
  219. dtlpy/utilities/local_development/__init__.py +1 -1
  220. dtlpy/utilities/local_development/local_session.py +179 -179
  221. dtlpy/utilities/reports/__init__.py +2 -2
  222. dtlpy/utilities/reports/figures.py +343 -343
  223. dtlpy/utilities/reports/report.py +71 -71
  224. dtlpy/utilities/videos/__init__.py +17 -17
  225. dtlpy/utilities/videos/video_player.py +598 -598
  226. dtlpy/utilities/videos/videos.py +470 -470
  227. {dtlpy-1.113.10.data → dtlpy-1.114.13.data}/scripts/dlp +1 -1
  228. dtlpy-1.114.13.data/scripts/dlp.bat +2 -0
  229. {dtlpy-1.113.10.data → dtlpy-1.114.13.data}/scripts/dlp.py +128 -128
  230. {dtlpy-1.113.10.dist-info → dtlpy-1.114.13.dist-info}/LICENSE +200 -200
  231. {dtlpy-1.113.10.dist-info → dtlpy-1.114.13.dist-info}/METADATA +172 -172
  232. dtlpy-1.114.13.dist-info/RECORD +240 -0
  233. {dtlpy-1.113.10.dist-info → dtlpy-1.114.13.dist-info}/WHEEL +1 -1
  234. tests/features/environment.py +551 -550
  235. dtlpy-1.113.10.data/scripts/dlp.bat +0 -2
  236. dtlpy-1.113.10.dist-info/RECORD +0 -244
  237. tests/assets/__init__.py +0 -0
  238. tests/assets/models_flow/__init__.py +0 -0
  239. tests/assets/models_flow/failedmain.py +0 -52
  240. tests/assets/models_flow/main.py +0 -62
  241. tests/assets/models_flow/main_model.py +0 -54
  242. {dtlpy-1.113.10.dist-info → dtlpy-1.114.13.dist-info}/entry_points.txt +0 -0
  243. {dtlpy-1.113.10.dist-info → dtlpy-1.114.13.dist-info}/top_level.txt +0 -0
dtlpy/utilities/dataset_generators/dataset_generator.py
@@ -1,670 +1,670 @@
1
- from concurrent.futures import ThreadPoolExecutor
2
- from pathlib import Path
3
- from PIL import Image
4
- import collections.abc
5
- import numpy as np
6
- import collections
7
- import logging
8
- import shutil
9
- import json
10
- import copy
11
- import tqdm
12
- import sys
13
- import os
14
- import re
15
- from ... import entities
16
-
17
- logger = logging.getLogger(name='dtlpy')
18
-
19
-
20
- class DataItem(dict):
21
- def __init__(self, *args, **kwargs):
22
- super(DataItem, self).__init__(*args, **kwargs)
23
-
24
- @property
25
- def image_filepath(self):
26
- return self['image_filepath']
27
-
28
- @image_filepath.setter
29
- def image_filepath(self, val):
30
- self['image_filepath'] = val
31
-
32
-
33
- class DatasetGenerator:
34
-
35
- def __init__(self,
36
- dataset_entity: entities.Dataset,
37
- annotation_type: entities.AnnotationType,
38
- item_type: list = None,
39
- filters: entities.Filters = None,
40
- data_path=None,
41
- overwrite=False,
42
- id_to_label_map=None,
43
- label_to_id_map=None,
44
- transforms=None,
45
- transforms_callback=None,
46
- num_workers=0,
47
- batch_size=None,
48
- collate_fn=None,
49
- shuffle=True,
50
- seed=None,
51
- to_categorical=False,
52
- to_mask=False,
53
- class_balancing=False,
54
- # debug flags
55
- return_originals=False,
56
- ignore_empty=True
57
- ) -> None:
58
- """
59
- Base Dataset Generator to build and iterate over images and annotations
60
-
61
- * Mapping Labels *
62
- To set a label mapping from labels to id you can use the `label_to_id_map` or `id_to_label_map`.
63
- NOTE: if they are not i.i.d you'll need to input both.
64
- In semantic, a `$default` label should be added so that the background (and all unlabeled pixels) will be
65
- mapped to the model's inputs
66
-
67
- label_to_id_map = {'cat': 1,
68
- 'dog': 1,
69
- '$default': 0}
70
- id_to_label_map = {1: 'cats_and_dogs',
71
- 0: 'background'}
72
-
73
- :param dataset_entity: dl.Dataset entity
74
- :param annotation_type: dl.AnnotationType - type of annotation to load from the annotated dataset
75
- :param item_type: list of file extension to load. default: ['jpg', 'jpeg', 'png', 'bmp']
76
- :param filters: dl.Filters - filtering entity to filter the dataset items
77
- :param data_path: Path to Dataloop annotations (root to "item" and "json").
78
- :param overwrite:
79
- :param dict id_to_label_map: Optional, {id: label_string} dictionary, default taken from dataset
80
- :param dict label_to_id_map: Optional, {label_string: id} dictionary
81
- :param transforms: Optional transform to be applied on a sample. list, imgaug.Sequence or torchvision.transforms.Compose
82
- :param transforms_callback: Optional function to handle the callback of each batch.
83
- look at default_transforms_callback for more information. available: imgaug_transforms_callback, torchvision_transforms_callback
84
- :param num_workers: Optional - number of separate threads to load the images
85
- :param batch_size: (int, optional): how many samples per batch to load, if not none - items will always be a list
86
- :param collate_fn: Optional - merges a list of samples to form a mini-batch of Tensor(s).
87
- :param shuffle: Whether to shuffle the data (default: True) If set to False, sorts the data in alphanumeric order.
88
- :param seed: Optional random seed for shuffling and transformations.
89
- :param to_categorical: convert label id to categorical format
90
- :param to_mask: convert annotations to an instance mask (will be true for SEGMENTATION)
91
- :param class_balancing: if True - performing random over-sample with class ids as the target to balance training data
92
- :param return_originals: bool - If True, return ALSO images and annotations before transformations (for debug)
93
- :param ignore_empty: bool - If True, generator will NOT collect items without annotations
94
- """
95
- self._dataset_entity = dataset_entity
96
-
97
- # default item types (extension for now)
98
- if item_type is None:
99
- item_type = ['jpg', 'jpeg', 'png', 'bmp']
100
- if not isinstance(item_type, list):
101
- item_type = [item_type]
102
- self.item_type = item_type
103
-
104
- # id labels mapping
105
- if label_to_id_map is None and id_to_label_map is None:
106
- # if both are None - take from dataset
107
- label_to_id_map = dataset_entity.instance_map
108
- id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
109
- else:
110
- # one or both is NOT None
111
- if label_to_id_map is None:
112
- # set label_to_id_map from the other
113
- label_to_id_map = {v: int(k) for k, v in id_to_label_map.items()}
114
- if id_to_label_map is None:
115
- # set id_to_label_map from the other
116
- id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
117
- # put it on the local ontology for the annotations download
118
- dataset_entity._get_ontology().instance_map = label_to_id_map
119
- self.id_to_label_map = id_to_label_map
120
- self.label_to_id_map = label_to_id_map
121
-
122
- # if annotation type is segmentation - to_mask must be True
123
- if annotation_type == entities.AnnotationType.SEGMENTATION:
124
- to_mask = True
125
-
126
- if data_path is None:
127
- data_path = os.path.join(os.path.expanduser('~'),
128
- '.dataloop',
129
- 'datasets',
130
- "{}_{}".format(dataset_entity.name,
131
- dataset_entity.id))
132
- download = False
133
- if os.path.isdir(data_path):
134
- if overwrite:
135
- logger.warning('overwrite flag is True! deleting and overwriting')
136
- shutil.rmtree(data_path)
137
- download = True
138
- else:
139
- download = True
140
- if download:
141
- annotation_options = [entities.ViewAnnotationOptions.JSON]
142
- if to_mask is True:
143
- annotation_options.append(entities.ViewAnnotationOptions.INSTANCE)
144
- _ = dataset_entity.items.download(filters=filters,
145
- local_path=data_path,
146
- thickness=-1,
147
- annotation_options=annotation_options)
148
- self.root_dir = data_path
149
- self._items_path = Path(self.root_dir).joinpath('items')
150
- self._json_path = Path(self.root_dir).joinpath('json')
151
- self._mask_path = Path(self.root_dir).joinpath('instance')
152
- self._transforms = transforms
153
- self._transforms_callback = transforms_callback
154
- if self._transforms is not None and self._transforms_callback is None:
155
- # use default callback
156
- self._transforms_callback = default_transforms_callback
157
-
158
- self.annotation_type = annotation_type
159
- self.num_workers = num_workers
160
- self.to_categorical = to_categorical
161
- self.num_classes = len(label_to_id_map)
162
- self.shuffle = shuffle
163
- self.seed = seed
164
- self.to_mask = to_mask
165
- self.batch_size = batch_size
166
- self.collate_fn = collate_fn
167
- self.class_balancing = class_balancing
168
- # inits
169
- self.data_items = list()
170
- # flags
171
- self.return_originals = return_originals
172
- self.ignore_empty = ignore_empty
173
-
174
- ####################
175
- # Load annotations #
176
- ####################
177
- self.load_annotations()
178
-
179
- @property
180
- def dataset_entity(self):
181
- assert isinstance(self._dataset_entity, entities.Dataset)
182
- return self._dataset_entity
183
-
184
- @dataset_entity.setter
185
- def dataset_entity(self, val):
186
- assert isinstance(val, entities.Dataset)
187
- self._dataset_entity = val
188
-
189
- @property
190
- def n_items(self):
191
- return len(self.data_items)
192
-
193
- def _load_single(self, image_filepath, pbar=None):
194
- try:
195
- is_empty = False
196
- item_info = DataItem()
197
- # add image path
198
- item_info.image_filepath = str(image_filepath)
199
- if os.stat(image_filepath).st_size < 5:
200
- logger.warning('IGNORING corrupted image: {!r}'.format(image_filepath))
201
- return None, True
202
- # get "platform" path
203
- rel_path = image_filepath.relative_to(self._items_path)
204
- # replace suffix to JSON
205
- rel_path_wo_png_ext = rel_path.with_suffix('.json')
206
- # create local path
207
- annotation_filepath = Path(self._json_path, rel_path_wo_png_ext)
208
-
209
- if os.path.isfile(annotation_filepath):
210
- with open(annotation_filepath, 'r') as f:
211
- data = json.load(f)
212
- if 'id' in data:
213
- item_id = data.get('id')
214
- elif '_id' in data:
215
- item_id = data.get('_id')
216
- annotations = entities.AnnotationCollection.from_json(data)
217
- else:
218
- item_id = ''
219
- annotations = None
220
- item_info.update(item_id=item_id)
221
- if self.annotation_type is not None:
222
- # add item id from json
223
- polygon_coordinates = list()
224
- box_coordinates = list()
225
- classes_ids = list()
226
- labels = list()
227
- if annotations is not None:
228
- for annotation in annotations:
229
- if 'user' in annotation.metadata and \
230
- 'model' in annotation.metadata['user']:
231
- # and 'name' in annotation.metadata['user']['model']:
232
- # Do not use prediction annotations in the data generator
233
- continue
234
- if annotation.type == self.annotation_type:
235
- if annotation.label not in self.label_to_id_map:
236
- logger.warning(
237
- 'Missing label {!r} in label_to_id_map. Skipping.. Use label_to_id_map for other behaviour'.format(
238
- annotation.label))
239
- else:
240
- classes_ids.append(self.label_to_id_map[annotation.label])
241
- labels.append(annotation.label)
242
- box_coordinates.append(np.asarray([annotation.left,
243
- annotation.top,
244
- annotation.right,
245
- annotation.bottom]))
246
- if self.annotation_type == entities.AnnotationType.POLYGON:
247
- polygon_coordinates.append(annotation.geo)
248
- if annotation.type not in [entities.AnnotationType.CLASSIFICATION,
249
- entities.AnnotationType.SEGMENTATION,
250
- entities.AnnotationType.BOX,
251
- entities.AnnotationType.POLYGON]:
252
- raise ValueError('unsupported annotation type: {}'.format(annotation.type))
253
- dtype = object if self.annotation_type == entities.AnnotationType.POLYGON else None
254
- # reorder for output
255
- item_info.update({entities.AnnotationType.BOX.value: np.asarray(box_coordinates).astype(float),
256
- entities.AnnotationType.CLASSIFICATION.value: np.asarray(classes_ids),
257
- entities.AnnotationType.POLYGON.value: np.asarray(polygon_coordinates, dtype=dtype),
258
- 'labels': labels})
259
- if len(item_info[entities.AnnotationType.CLASSIFICATION.value]) == 0:
260
- logger.debug('Empty annotation (nothing matched label_to_id_map) for image filename: {}'.format(
261
- image_filepath))
262
- is_empty = True
263
- if self.to_mask:
264
- # get "platform" path
265
- rel_path = image_filepath.relative_to(self._items_path)
266
- # replace suffix to PNG
267
- rel_path_wo_png_ext = rel_path.with_suffix('.png')
268
- # create local path
269
- mask_filepath = Path(self._mask_path, rel_path_wo_png_ext)
270
- if not os.path.isfile(mask_filepath):
271
- logger.debug('Empty annotation for image filename: {}'.format(image_filepath))
272
- is_empty = True
273
- item_info.update({entities.AnnotationType.SEGMENTATION.value: str(mask_filepath)})
274
- item_info.update(annotation_filepath=str(annotation_filepath))
275
- return item_info, is_empty
276
- except Exception:
277
- logger.exception('failed loading item in generator! {!r}'.format(image_filepath))
278
- return None, True
279
- finally:
280
- if pbar is not None:
281
- pbar.update()
282
-
283
- def load_annotations(self):
284
- logger.info(f"Collecting items with the following extensions: {self.item_type}")
285
- files = list()
286
- for ext in self.item_type:
287
- # build regex to ignore extension case
288
- regex = '*.{}'.format(''.join(['[{}{}]'.format(letter.lower(), letter.upper()) for letter in ext]))
289
- files.extend(self._items_path.rglob(regex))
290
-
291
- pool = ThreadPoolExecutor(max_workers=32)
292
- jobs = list()
293
- pbar = tqdm.tqdm(total=len(files),
294
- desc='Loading Data Generator',
295
- disable=self.dataset_entity._client_api.verbose.disable_progress_bar,
296
- file=sys.stdout)
297
- for image_filepath in files:
298
- jobs.append(pool.submit(self._load_single,
299
- image_filepath=image_filepath,
300
- pbar=pbar))
301
- outputs = [job.result() for job in jobs]
302
- pbar.close()
303
-
304
- n_items = len(outputs)
305
- n_empty_items = sum([1 for _, is_empty in outputs if is_empty is True])
306
-
307
- output_msg = 'Done loading items. Total items loaded: {}.'.format(n_items)
308
- if n_empty_items > 0:
309
- output_msg += '{action} {n_empty_items} items without annotations'.format(
310
- action='IGNORING' if self.ignore_empty else 'INCLUDING',
311
- n_empty_items=n_empty_items)
312
-
313
- if self.ignore_empty:
314
- # take ONLY non-empty
315
- data_items = [data_item for data_item, is_empty in outputs if is_empty is False]
316
- else:
317
- # take all
318
- data_items = [data_item for data_item, is_empty in outputs]
319
-
320
- self.data_items = data_items
321
- if len(self.data_items) == 0:
322
- logger.warning(output_msg)
323
- else:
324
- logger.info(output_msg)
325
- ###################
326
- # class balancing #
327
- ###################
328
- labels = [label for item in self.data_items for label in item.get('labels', list())]
329
- logger.info(f"Data Generator labels balance statistics: {collections.Counter(labels)}")
330
- if self.class_balancing:
331
- try:
332
- from imblearn.over_sampling import RandomOverSampler
333
- except Exception:
334
- logger.error(
335
- 'Class balancing is ON but missing "imbalanced-learn". run "pip install -U imbalanced-learn" and try again')
336
- raise
337
- logger.info('Class balance is on!')
338
- class_ids = [class_id for item in self.data_items for class_id in item['class']]
339
- dummy_inds = [i_item for i_item, item in enumerate(self.data_items) for _ in item['class']]
340
- over_sampler = RandomOverSampler(random_state=42)
341
- X_res, y_res = over_sampler.fit_resample(np.asarray(dummy_inds).reshape(-1, 1), np.asarray(class_ids))
342
- over_sampled_data_items = [self.data_items[i] for i in X_res.flatten()]
343
- oversampled_labels = [label for item in over_sampled_data_items for label in item['labels']]
344
- logger.info(f"Data Generator labels after oversampling: {collections.Counter(oversampled_labels)}")
345
- self.data_items = over_sampled_data_items
346
-
347
- if self.shuffle:
348
- if self.seed is None:
349
- self.seed = 256
350
- np.random.seed(self.seed)
351
- np.random.shuffle(self.data_items)
352
-
353
- def transform(self, image, target=None):
354
- if self._transforms is not None:
355
- image, target = self._transforms_callback(transforms=self._transforms,
356
- image=image,
357
- target=target,
358
- annotation_type=self.annotation_type)
359
- return image, target
360
-
361
- def _to_dtlpy(self, targets, labels=None):
362
- annotations = entities.AnnotationCollection(item=None)
363
- annotations._dataset = self._dataset_entity
364
- if labels is None:
365
- labels = [None] * len(targets)
366
- if self.to_mask is True:
367
- for label, label_ind in self.label_to_id_map.items():
368
- target = targets == label_ind
369
- if np.any(target):
370
- annotations.add(annotation_definition=entities.Segmentation(geo=target,
371
- label=label))
372
- elif self.annotation_type == entities.AnnotationType.BOX:
373
- for target, label in zip(targets, labels):
374
- annotations.add(annotation_definition=entities.Box(left=target[0],
375
- top=target[1],
376
- right=target[2],
377
- bottom=target[3],
378
- label=label))
379
- elif self.annotation_type == entities.AnnotationType.CLASSIFICATION:
380
- for target, label in zip(targets, labels):
381
- annotations.add(annotation_definition=entities.Classification(label=label))
382
- elif self.annotation_type == entities.AnnotationType.POLYGON:
383
- for target, label in zip(targets, labels):
384
- annotations.add(annotation_definition=entities.Polygon(label=label,
385
- geo=target.astype(float)))
386
- else:
387
- raise ValueError('unsupported annotation type: {}'.format(self.annotation_type))
388
- # set dataset for color
389
- for annotation in annotations:
390
- annotation._dataset = self._dataset_entity
391
- return annotations
392
-
393
- def visualize(self, idx=None, return_output=False, plot=True):
394
- if not self.__len__():
395
- raise ValueError('no items selected, cannot preform visualization')
396
- import matplotlib.pyplot as plt
397
- if idx is None:
398
- idx = np.random.randint(self.__len__())
399
- if self.batch_size is not None:
400
- raise ValueError('can visualize only of batch_size in None')
401
- data_item = self.__getitem__(idx)
402
- image = Image.fromarray(data_item.get('image'))
403
- labels = data_item.get('labels')
404
- targets = data_item.get('annotations')
405
- annotations = self._to_dtlpy(targets=targets, labels=labels)
406
- mask = Image.fromarray(annotations.show(height=image.size[1],
407
- width=image.size[0],
408
- alpha=0.8))
409
- image.paste(mask, (0, 0), mask)
410
- marked_image = np.asarray(image)
411
- if plot:
412
- plt.figure()
413
- plt.imshow(marked_image)
414
- if return_output:
415
- return marked_image, annotations
416
-
417
- def __getsingleitem__(self, idx):
418
- data_item = copy.deepcopy(self.data_items[idx])
419
-
420
- image_filename = data_item.get('image_filepath')
421
- image = np.asarray(Image.open(image_filename))
422
- data_item.update({'image': image})
423
-
424
- annotations = data_item.get(self.annotation_type)
425
- if self.to_mask is True:
426
- # if segmentation - read from file
427
- mask_filepath = data_item.get(entities.AnnotationType.SEGMENTATION)
428
- annotations = np.asarray(Image.open(mask_filepath).convert('L'))
429
- if self.to_categorical:
430
- onehot = np.zeros((annotations.size, self.num_classes + 1))
431
- onehot[np.arange(annotations.size), annotations] = 1
432
- annotations = onehot
433
- data_item.update({'annotations': annotations})
434
-
435
- if self.return_originals is True:
436
- annotations = []
437
- if self.annotation_type is not None:
438
- annotations = data_item.get('annotations')
439
- data_item.update({'orig_image': image.copy(),
440
- 'orig_annotations': annotations.copy()})
441
-
442
- ###########################
443
- # perform transformations #
444
- ###########################
445
- if self._transforms is not None:
446
- annotations = data_item.get('annotations')
447
- image, annotations = self.transform(image, annotations)
448
- data_item.update({'image': image,
449
- 'annotations': annotations})
450
- return data_item
451
-
452
- def __iter__(self):
453
- """Create a generator that iterate over the Sequence."""
454
- for item in (self[i] for i in range(len(self))):
455
- yield item
456
-
457
- def __len__(self):
458
- factor = self.batch_size
459
- if factor is None:
460
- factor = 1
461
- return int(np.ceil(self.n_items / factor))
462
-
463
- def __getitem__(self, idx):
464
- """
465
- Support single index or a slice.
466
- Uses ThreadPoolExecutor is num_workers != 0
467
- """
468
- to_return = None
469
- if isinstance(idx, int):
470
- if self.batch_size is None:
471
- to_return = self.__getsingleitem__(idx)
472
- else:
473
- # if batch_size is define, convert idx to batches
474
- idx = slice(idx * self.batch_size, min((idx + 1) * self.batch_size, len(self.data_items)))
475
-
476
- if isinstance(idx, slice):
477
- to_return = list()
478
- idxs = list(range(idx.start, idx.stop,
479
- idx.step if idx.step else 1))
480
- if self.num_workers == 0:
481
- for dx in idxs:
482
- to_return.append(self.__getsingleitem__(dx))
483
- else:
484
- with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
485
- for sample in executor.map(lambda i: self.__getsingleitem__(i), idxs):
486
- to_return.append(sample)
487
-
488
- if to_return is None:
489
- raise TypeError('unsupported indexing: list indices must be integers or slices, not {}'.format(type(idx)))
490
-
491
- if self.collate_fn is not None:
492
- to_return = self.collate_fn(to_return)
493
- return to_return
494
-
495
-
496
- np_str_obj_array_pattern = re.compile(r'[SaUO]')
497
-
498
- default_collate_err_msg_format = (
499
- "default_collate: batch must contain tensors, numpy arrays, numbers, "
500
- "dicts or lists; found {}")
501
-
502
-
503
- def default_transforms_callback(transforms, image, target, annotation_type):
504
- """
505
- Recursive call to perform the augmentations in "transforms"
506
-
507
- :param transforms:
508
- :param image:
509
- :param target:
510
- :param annotation_type:
511
- :return:
512
- """
513
- # get the type string without importing any other package
514
- transforms_type = type(transforms)
515
-
516
- ############
517
- # Handle compositions and lists of augmentations with a recursive call
518
- if transforms_type.__module__ == 'torchvision.transforms.transforms' and transforms_type.__name__ == 'Compose':
519
- # torchvision compose - convert to list
520
- image, target = default_transforms_callback(transforms.transforms, image, target, annotation_type)
521
- return image, target
522
-
523
- if transforms_type.__module__ == 'imgaug.augmenters.meta' and transforms_type.__name__ == 'Sequential':
524
- # imgaug sequential - convert to list
525
- image, target = default_transforms_callback(list(transforms), image, target, annotation_type)
526
- return image, target
527
-
528
- if isinstance(transforms, list):
529
- for t in transforms:
530
- image, target = default_transforms_callback(t, image, target, annotation_type)
531
- return image, target
532
-
533
- ##############
534
- # Handle single annotations
535
- if 'imgaug.augmenters' in transforms_type.__module__:
536
- # handle single imgaug augmentation
537
- if target is not None and annotation_type is not None:
538
- # works for batch but running on a single image
539
- if annotation_type == entities.AnnotationType.BOX:
540
- image, target = transforms(images=[image], bounding_boxes=[target])
541
- target = target[0]
542
- elif annotation_type == entities.AnnotationType.SEGMENTATION:
543
- # expending to HxWx1 for the imgaug function to work
544
- target = target[..., None]
545
- image, target = transforms(images=[image], segmentation_maps=[target])
546
- target = target[0][:, :, 0]
547
- elif annotation_type == entities.AnnotationType.POLYGON:
548
- image, target = transforms(images=[image], polygons=[target])
549
- target = target[0]
550
- elif annotation_type == entities.AnnotationType.CLASSIFICATION:
551
- image = transforms(images=[image])
552
- else:
553
- raise ValueError('unsupported annotations type for image augmentations: {}'.format(annotation_type))
554
- image = image[0]
555
- else:
556
- image = transforms(images=[image])
557
- image = image[0]
558
- else:
559
- image = transforms(image)
560
-
561
- return image, target
562
-
563
-
564
- def collate_default(batch):
565
- r"""Puts each data field into a tensor with outer dimension batch size"""
566
- elem = batch[0]
567
- elem_type = type(elem)
568
- if isinstance(elem, np.ndarray):
569
- return np.stack(batch, axis=0)
570
- elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
571
- if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
572
- # array of string classes and object
573
- if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
574
- raise TypeError(default_collate_err_msg_format.format(elem.dtype))
575
- return batch
576
- # return [tf.convert_to_tensor(b) for b in batch]
577
- elif elem.shape == (): # scalars
578
- return batch
579
- elif isinstance(elem, float):
580
- return batch
581
- elif isinstance(elem, int):
582
- return batch
583
- elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
584
- return batch
585
- elif isinstance(elem, collections.abc.Mapping):
586
- return {key: collate_default([d[key] for d in batch]) for key in elem}
587
- elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
588
- return elem_type(*(collate_default(samples) for samples in zip(*batch)))
589
- elif isinstance(elem, collections.abc.Sequence):
590
- transposed = zip(*batch)
591
- return transposed
592
- raise TypeError(default_collate_err_msg_format.format(elem_type))
593
-
594
-
595
- def collate_torch(batch):
596
- r"""Puts each data field into a tensor with outer dimension batch size"""
597
- import torch
598
- elem = batch[0]
599
- elem_type = type(elem)
600
- if isinstance(elem, torch.Tensor):
601
- out = None
602
- if torch.utils.data.get_worker_info() is not None:
603
- # If we're in a background process, concatenate directly into a
604
- # shared memory tensor to avoid an extra copy
605
- numel = sum(x.numel() for x in batch)
606
- storage = elem.storage()._new_shared(numel)
607
- out = elem.new(storage)
608
- return torch.stack(batch, 0, out=out)
609
- elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
610
- if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
611
- # array of string classes and object
612
- if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
613
- raise TypeError(default_collate_err_msg_format.format(elem.dtype))
614
- try:
615
- return torch.stack([torch.as_tensor(b) for b in batch])
616
- except RuntimeError:
617
- return batch
618
- elif elem.shape == (): # scalars
619
- return torch.as_tensor(batch)
620
- elif isinstance(elem, float):
621
- return torch.tensor(batch, dtype=torch.float64)
622
- elif isinstance(elem, int):
623
- return torch.tensor(batch)
624
- elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
625
- return batch
626
- elif isinstance(elem, collections.abc.Mapping):
627
- return {key: collate_torch([d[key] for d in batch]) for key in elem}
628
- elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
629
- return elem_type(*(collate_torch(samples) for samples in zip(*batch)))
630
- elif isinstance(elem, collections.abc.Sequence):
631
- transposed = zip(*batch)
632
- return transposed
633
-
634
- raise TypeError(default_collate_err_msg_format.format(elem_type))
635
-
636
-
637
- def collate_tf(batch):
638
- r"""Puts each data field into a tensor with outer dimension batch size"""
639
- import tensorflow as tf
640
- elem = batch[0]
641
- elem_type = type(elem)
642
- if isinstance(elem, tf.Tensor):
643
- return tf.stack(batch, axis=0)
644
- elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
645
- if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
646
- # array of string classes and object
647
- if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
648
- raise TypeError(default_collate_err_msg_format.format(elem.dtype))
649
- try:
650
- return tf.convert_to_tensor(batch)
651
- except ValueError:
652
- # failed on orig_image because of a mismatch in the shape (not resizing all the images so cannot stack)
653
- return batch
654
- # return [tf.convert_to_tensor(b) for b in batch]
655
- elif elem.shape == (): # scalars
656
- return tf.convert_to_tensor(batch)
657
- elif isinstance(elem, float):
658
- return tf.convert_to_tensor(batch, dtype=tf.float64)
659
- elif isinstance(elem, int):
660
- return tf.convert_to_tensor(batch)
661
- elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
662
- return batch
663
- elif isinstance(elem, collections.abc.Mapping):
664
- return {key: collate_tf([d[key] for d in batch]) for key in elem}
665
- elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
666
- return elem_type(*(collate_tf(samples) for samples in zip(*batch)))
667
- elif isinstance(elem, collections.abc.Sequence):
668
- transposed = zip(*batch)
669
- return transposed
670
- raise TypeError(default_collate_err_msg_format.format(elem_type))
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import collections.abc
5
+ import numpy as np
6
+ import collections
7
+ import logging
8
+ import shutil
9
+ import json
10
+ import copy
11
+ import tqdm
12
+ import sys
13
+ import os
14
+ import re
15
+ from ... import entities
16
+
17
+ logger = logging.getLogger(name='dtlpy')
18
+
19
+
20
+ class DataItem(dict):
21
+ def __init__(self, *args, **kwargs):
22
+ super(DataItem, self).__init__(*args, **kwargs)
23
+
24
+ @property
25
+ def image_filepath(self):
26
+ return self['image_filepath']
27
+
28
+ @image_filepath.setter
29
+ def image_filepath(self, val):
30
+ self['image_filepath'] = val
31
+
32
+
33
+ class DatasetGenerator:
34
+
35
+ def __init__(self,
36
+ dataset_entity: entities.Dataset,
37
+ annotation_type: entities.AnnotationType,
38
+ item_type: list = None,
39
+ filters: entities.Filters = None,
40
+ data_path=None,
41
+ overwrite=False,
42
+ id_to_label_map=None,
43
+ label_to_id_map=None,
44
+ transforms=None,
45
+ transforms_callback=None,
46
+ num_workers=0,
47
+ batch_size=None,
48
+ collate_fn=None,
49
+ shuffle=True,
50
+ seed=None,
51
+ to_categorical=False,
52
+ to_mask=False,
53
+ class_balancing=False,
54
+ # debug flags
55
+ return_originals=False,
56
+ ignore_empty=True
57
+ ) -> None:
58
+ """
59
+ Base Dataset Generator to build and iterate over images and annotations
60
+
61
+ * Mapping Labels *
62
+ To set a label mapping from labels to id you can use the `label_to_id_map` or `id_to_label_map`.
63
+ NOTE: if they are not i.i.d you'll need to input both.
64
+ In semantic, a `$default` label should be added so that the background (and all unlabeled pixels) will be
65
+ mapped to the model's inputs
66
+
67
+ label_to_id_map = {'cat': 1,
68
+ 'dog': 1,
69
+ '$default': 0}
70
+ id_to_label_map = {1: 'cats_and_dogs',
71
+ 0: 'background'}
72
+
73
+ :param dataset_entity: dl.Dataset entity
74
+ :param annotation_type: dl.AnnotationType - type of annotation to load from the annotated dataset
75
+ :param item_type: list of file extension to load. default: ['jpg', 'jpeg', 'png', 'bmp']
76
+ :param filters: dl.Filters - filtering entity to filter the dataset items
77
+ :param data_path: Path to Dataloop annotations (root to "item" and "json").
78
+ :param overwrite:
79
+ :param dict id_to_label_map: Optional, {id: label_string} dictionary, default taken from dataset
80
+ :param dict label_to_id_map: Optional, {label_string: id} dictionary
81
+ :param transforms: Optional transform to be applied on a sample. list, imgaug.Sequence or torchvision.transforms.Compose
82
+ :param transforms_callback: Optional function to handle the callback of each batch.
83
+ look at default_transforms_callback for more information. available: imgaug_transforms_callback, torchvision_transforms_callback
84
+ :param num_workers: Optional - number of separate threads to load the images
85
+ :param batch_size: (int, optional): how many samples per batch to load, if not none - items will always be a list
86
+ :param collate_fn: Optional - merges a list of samples to form a mini-batch of Tensor(s).
87
+ :param shuffle: Whether to shuffle the data (default: True) If set to False, sorts the data in alphanumeric order.
88
+ :param seed: Optional random seed for shuffling and transformations.
89
+ :param to_categorical: convert label id to categorical format
90
+ :param to_mask: convert annotations to an instance mask (will be true for SEGMENTATION)
91
+ :param class_balancing: if True - performing random over-sample with class ids as the target to balance training data
92
+ :param return_originals: bool - If True, return ALSO images and annotations before transformations (for debug)
93
+ :param ignore_empty: bool - If True, generator will NOT collect items without annotations
94
+ """
95
+ self._dataset_entity = dataset_entity
96
+
97
+ # default item types (extension for now)
98
+ if item_type is None:
99
+ item_type = ['jpg', 'jpeg', 'png', 'bmp']
100
+ if not isinstance(item_type, list):
101
+ item_type = [item_type]
102
+ self.item_type = item_type
103
+
104
+ # id labels mapping
105
+ if label_to_id_map is None and id_to_label_map is None:
106
+ # if both are None - take from dataset
107
+ label_to_id_map = dataset_entity.instance_map
108
+ id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
109
+ else:
110
+ # one or both is NOT None
111
+ if label_to_id_map is None:
112
+ # set label_to_id_map from the other
113
+ label_to_id_map = {v: int(k) for k, v in id_to_label_map.items()}
114
+ if id_to_label_map is None:
115
+ # set id_to_label_map from the other
116
+ id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
117
+ # put it on the local ontology for the annotations download
118
+ dataset_entity._get_ontology().instance_map = label_to_id_map
119
+ self.id_to_label_map = id_to_label_map
120
+ self.label_to_id_map = label_to_id_map
121
+
122
+ # if annotation type is segmentation - to_mask must be True
123
+ if annotation_type == entities.AnnotationType.SEGMENTATION:
124
+ to_mask = True
125
+
126
+ if data_path is None:
127
+ data_path = os.path.join(os.path.expanduser('~'),
128
+ '.dataloop',
129
+ 'datasets',
130
+ "{}_{}".format(dataset_entity.name,
131
+ dataset_entity.id))
132
+ download = False
133
+ if os.path.isdir(data_path):
134
+ if overwrite:
135
+ logger.warning('overwrite flag is True! deleting and overwriting')
136
+ shutil.rmtree(data_path)
137
+ download = True
138
+ else:
139
+ download = True
140
+ if download:
141
+ annotation_options = [entities.ViewAnnotationOptions.JSON]
142
+ if to_mask is True:
143
+ annotation_options.append(entities.ViewAnnotationOptions.INSTANCE)
144
+ _ = dataset_entity.items.download(filters=filters,
145
+ local_path=data_path,
146
+ thickness=-1,
147
+ annotation_options=annotation_options)
148
+ self.root_dir = data_path
149
+ self._items_path = Path(self.root_dir).joinpath('items')
150
+ self._json_path = Path(self.root_dir).joinpath('json')
151
+ self._mask_path = Path(self.root_dir).joinpath('instance')
152
+ self._transforms = transforms
153
+ self._transforms_callback = transforms_callback
154
+ if self._transforms is not None and self._transforms_callback is None:
155
+ # use default callback
156
+ self._transforms_callback = default_transforms_callback
157
+
158
+ self.annotation_type = annotation_type
159
+ self.num_workers = num_workers
160
+ self.to_categorical = to_categorical
161
+ self.num_classes = len(label_to_id_map)
162
+ self.shuffle = shuffle
163
+ self.seed = seed
164
+ self.to_mask = to_mask
165
+ self.batch_size = batch_size
166
+ self.collate_fn = collate_fn
167
+ self.class_balancing = class_balancing
168
+ # inits
169
+ self.data_items = list()
170
+ # flags
171
+ self.return_originals = return_originals
172
+ self.ignore_empty = ignore_empty
173
+
174
+ ####################
175
+ # Load annotations #
176
+ ####################
177
+ self.load_annotations()
178
+
179
+ @property
180
+ def dataset_entity(self):
181
+ assert isinstance(self._dataset_entity, entities.Dataset)
182
+ return self._dataset_entity
183
+
184
+ @dataset_entity.setter
185
+ def dataset_entity(self, val):
186
+ assert isinstance(val, entities.Dataset)
187
+ self._dataset_entity = val
188
+
189
+ @property
190
+ def n_items(self):
191
+ return len(self.data_items)
192
+
193
+ def _load_single(self, image_filepath, pbar=None):
194
+ try:
195
+ is_empty = False
196
+ item_info = DataItem()
197
+ # add image path
198
+ item_info.image_filepath = str(image_filepath)
199
+ if os.stat(image_filepath).st_size < 5:
200
+ logger.warning('IGNORING corrupted image: {!r}'.format(image_filepath))
201
+ return None, True
202
+ # get "platform" path
203
+ rel_path = image_filepath.relative_to(self._items_path)
204
+ # replace suffix to JSON
205
+ rel_path_wo_png_ext = rel_path.with_suffix('.json')
206
+ # create local path
207
+ annotation_filepath = Path(self._json_path, rel_path_wo_png_ext)
208
+
209
+ if os.path.isfile(annotation_filepath):
210
+ with open(annotation_filepath, 'r') as f:
211
+ data = json.load(f)
212
+ if 'id' in data:
213
+ item_id = data.get('id')
214
+ elif '_id' in data:
215
+ item_id = data.get('_id')
216
+ annotations = entities.AnnotationCollection.from_json(data)
217
+ else:
218
+ item_id = ''
219
+ annotations = None
220
+ item_info.update(item_id=item_id)
221
+ if self.annotation_type is not None:
222
+ # add item id from json
223
+ polygon_coordinates = list()
224
+ box_coordinates = list()
225
+ classes_ids = list()
226
+ labels = list()
227
+ if annotations is not None:
228
+ for annotation in annotations:
229
+ if 'user' in annotation.metadata and \
230
+ 'model' in annotation.metadata['user']:
231
+ # and 'name' in annotation.metadata['user']['model']:
232
+ # Do not use prediction annotations in the data generator
233
+ continue
234
+ if annotation.type == self.annotation_type:
235
+ if annotation.label not in self.label_to_id_map:
236
+ logger.warning(
237
+ 'Missing label {!r} in label_to_id_map. Skipping.. Use label_to_id_map for other behaviour'.format(
238
+ annotation.label))
239
+ else:
240
+ classes_ids.append(self.label_to_id_map[annotation.label])
241
+ labels.append(annotation.label)
242
+ box_coordinates.append(np.asarray([annotation.left,
243
+ annotation.top,
244
+ annotation.right,
245
+ annotation.bottom]))
246
+ if self.annotation_type == entities.AnnotationType.POLYGON:
247
+ polygon_coordinates.append(annotation.geo)
248
+ if annotation.type not in [entities.AnnotationType.CLASSIFICATION,
249
+ entities.AnnotationType.SEGMENTATION,
250
+ entities.AnnotationType.BOX,
251
+ entities.AnnotationType.POLYGON]:
252
+ raise ValueError('unsupported annotation type: {}'.format(annotation.type))
253
+ dtype = object if self.annotation_type == entities.AnnotationType.POLYGON else None
254
+ # reorder for output
255
+ item_info.update({entities.AnnotationType.BOX.value: np.asarray(box_coordinates).astype(float),
256
+ entities.AnnotationType.CLASSIFICATION.value: np.asarray(classes_ids),
257
+ entities.AnnotationType.POLYGON.value: np.asarray(polygon_coordinates, dtype=dtype),
258
+ 'labels': labels})
259
+ if len(item_info[entities.AnnotationType.CLASSIFICATION.value]) == 0:
260
+ logger.debug('Empty annotation (nothing matched label_to_id_map) for image filename: {}'.format(
261
+ image_filepath))
262
+ is_empty = True
263
+ if self.to_mask:
264
+ # get "platform" path
265
+ rel_path = image_filepath.relative_to(self._items_path)
266
+ # replace suffix to PNG
267
+ rel_path_wo_png_ext = rel_path.with_suffix('.png')
268
+ # create local path
269
+ mask_filepath = Path(self._mask_path, rel_path_wo_png_ext)
270
+ if not os.path.isfile(mask_filepath):
271
+ logger.debug('Empty annotation for image filename: {}'.format(image_filepath))
272
+ is_empty = True
273
+ item_info.update({entities.AnnotationType.SEGMENTATION.value: str(mask_filepath)})
274
+ item_info.update(annotation_filepath=str(annotation_filepath))
275
+ return item_info, is_empty
276
+ except Exception:
277
+ logger.exception('failed loading item in generator! {!r}'.format(image_filepath))
278
+ return None, True
279
+ finally:
280
+ if pbar is not None:
281
+ pbar.update()
282
+
283
+ def load_annotations(self):
284
+ logger.info(f"Collecting items with the following extensions: {self.item_type}")
285
+ files = list()
286
+ for ext in self.item_type:
287
+ # build regex to ignore extension case
288
+ regex = '*.{}'.format(''.join(['[{}{}]'.format(letter.lower(), letter.upper()) for letter in ext]))
289
+ files.extend(self._items_path.rglob(regex))
290
+
291
+ pool = ThreadPoolExecutor(max_workers=32)
292
+ jobs = list()
293
+ pbar = tqdm.tqdm(total=len(files),
294
+ desc='Loading Data Generator',
295
+ disable=self.dataset_entity._client_api.verbose.disable_progress_bar,
296
+ file=sys.stdout)
297
+ for image_filepath in files:
298
+ jobs.append(pool.submit(self._load_single,
299
+ image_filepath=image_filepath,
300
+ pbar=pbar))
301
+ outputs = [job.result() for job in jobs]
302
+ pbar.close()
303
+
304
+ n_items = len(outputs)
305
+ n_empty_items = sum([1 for _, is_empty in outputs if is_empty is True])
306
+
307
+ output_msg = 'Done loading items. Total items loaded: {}.'.format(n_items)
308
+ if n_empty_items > 0:
309
+ output_msg += '{action} {n_empty_items} items without annotations'.format(
310
+ action='IGNORING' if self.ignore_empty else 'INCLUDING',
311
+ n_empty_items=n_empty_items)
312
+
313
+ if self.ignore_empty:
314
+ # take ONLY non-empty
315
+ data_items = [data_item for data_item, is_empty in outputs if is_empty is False]
316
+ else:
317
+ # take all
318
+ data_items = [data_item for data_item, is_empty in outputs]
319
+
320
+ self.data_items = data_items
321
+ if len(self.data_items) == 0:
322
+ logger.warning(output_msg)
323
+ else:
324
+ logger.info(output_msg)
325
+ ###################
326
+ # class balancing #
327
+ ###################
328
+ labels = [label for item in self.data_items for label in item.get('labels', list())]
329
+ logger.info(f"Data Generator labels balance statistics: {collections.Counter(labels)}")
330
+ if self.class_balancing:
331
+ try:
332
+ from imblearn.over_sampling import RandomOverSampler
333
+ except Exception:
334
+ logger.error(
335
+ 'Class balancing is ON but missing "imbalanced-learn". run "pip install -U imbalanced-learn" and try again')
336
+ raise
337
+ logger.info('Class balance is on!')
338
+ class_ids = [class_id for item in self.data_items for class_id in item['class']]
339
+ dummy_inds = [i_item for i_item, item in enumerate(self.data_items) for _ in item['class']]
340
+ over_sampler = RandomOverSampler(random_state=42)
341
+ X_res, y_res = over_sampler.fit_resample(np.asarray(dummy_inds).reshape(-1, 1), np.asarray(class_ids))
342
+ over_sampled_data_items = [self.data_items[i] for i in X_res.flatten()]
343
+ oversampled_labels = [label for item in over_sampled_data_items for label in item['labels']]
344
+ logger.info(f"Data Generator labels after oversampling: {collections.Counter(oversampled_labels)}")
345
+ self.data_items = over_sampled_data_items
346
+
347
+ if self.shuffle:
348
+ if self.seed is None:
349
+ self.seed = 256
350
+ np.random.seed(self.seed)
351
+ np.random.shuffle(self.data_items)
352
+
353
+ def transform(self, image, target=None):
354
+ if self._transforms is not None:
355
+ image, target = self._transforms_callback(transforms=self._transforms,
356
+ image=image,
357
+ target=target,
358
+ annotation_type=self.annotation_type)
359
+ return image, target
360
+
361
+ def _to_dtlpy(self, targets, labels=None):
362
+ annotations = entities.AnnotationCollection(item=None)
363
+ annotations._dataset = self._dataset_entity
364
+ if labels is None:
365
+ labels = [None] * len(targets)
366
+ if self.to_mask is True:
367
+ for label, label_ind in self.label_to_id_map.items():
368
+ target = targets == label_ind
369
+ if np.any(target):
370
+ annotations.add(annotation_definition=entities.Segmentation(geo=target,
371
+ label=label))
372
+ elif self.annotation_type == entities.AnnotationType.BOX:
373
+ for target, label in zip(targets, labels):
374
+ annotations.add(annotation_definition=entities.Box(left=target[0],
375
+ top=target[1],
376
+ right=target[2],
377
+ bottom=target[3],
378
+ label=label))
379
+ elif self.annotation_type == entities.AnnotationType.CLASSIFICATION:
380
+ for target, label in zip(targets, labels):
381
+ annotations.add(annotation_definition=entities.Classification(label=label))
382
+ elif self.annotation_type == entities.AnnotationType.POLYGON:
383
+ for target, label in zip(targets, labels):
384
+ annotations.add(annotation_definition=entities.Polygon(label=label,
385
+ geo=target.astype(float)))
386
+ else:
387
+ raise ValueError('unsupported annotation type: {}'.format(self.annotation_type))
388
+ # set dataset for color
389
+ for annotation in annotations:
390
+ annotation._dataset = self._dataset_entity
391
+ return annotations
392
+
+     def visualize(self, idx=None, return_output=False, plot=True):
+         if not self.__len__():
+             raise ValueError('no items selected, cannot perform visualization')
+         import matplotlib.pyplot as plt
+         if idx is None:
+             idx = np.random.randint(self.__len__())
+         if self.batch_size is not None:
+             raise ValueError('can only visualize when batch_size is None')
+         data_item = self.__getitem__(idx)
+         image = Image.fromarray(data_item.get('image'))
+         labels = data_item.get('labels')
+         targets = data_item.get('annotations')
+         annotations = self._to_dtlpy(targets=targets, labels=labels)
+         mask = Image.fromarray(annotations.show(height=image.size[1],
+                                                 width=image.size[0],
+                                                 alpha=0.8))
+         image.paste(mask, (0, 0), mask)
+         marked_image = np.asarray(image)
+         if plot:
+             plt.figure()
+             plt.imshow(marked_image)
+         if return_output:
+             return marked_image, annotations
+
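A short usage sketch, assuming the generator was built without batching (visualize raises otherwise), matplotlib is installed, and `datagen` is a hypothetical generator instance:

marked_image, annotations = datagen.visualize(idx=0, return_output=True, plot=True)
# plots item 0 with its annotations overlaid and also returns the marked image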
+     def __getsingleitem__(self, idx):
+         data_item = copy.deepcopy(self.data_items[idx])
+
+         image_filename = data_item.get('image_filepath')
+         image = np.asarray(Image.open(image_filename))
+         data_item.update({'image': image})
+
+         annotations = data_item.get(self.annotation_type)
+         if self.to_mask is True:
+             # if segmentation - read from file
+             mask_filepath = data_item.get(entities.AnnotationType.SEGMENTATION)
+             annotations = np.asarray(Image.open(mask_filepath).convert('L'))
+             if self.to_categorical:
+                 onehot = np.zeros((annotations.size, self.num_classes + 1))
+                 onehot[np.arange(annotations.size), annotations] = 1
+                 annotations = onehot
+         data_item.update({'annotations': annotations})
+
+         if self.return_originals is True:
+             annotations = []
+             if self.annotation_type is not None:
+                 annotations = data_item.get('annotations')
+             data_item.update({'orig_image': image.copy(),
+                               'orig_annotations': annotations.copy()})
+
+         ###########################
+         # perform transformations #
+         ###########################
+         if self._transforms is not None:
+             annotations = data_item.get('annotations')
+             image, annotations = self.transform(image, annotations)
+             data_item.update({'image': image,
+                               'annotations': annotations})
+         return data_item
+
+     def __iter__(self):
+         """Create a generator that iterates over the Sequence."""
+         for item in (self[i] for i in range(len(self))):
+             yield item
+
+     def __len__(self):
+         factor = self.batch_size
+         if factor is None:
+             factor = 1
+         return int(np.ceil(self.n_items / factor))
+
+     def __getitem__(self, idx):
+         """
+         Support a single index or a slice.
+         Uses a ThreadPoolExecutor if num_workers != 0
+         """
+         to_return = None
+         if isinstance(idx, int):
+             if self.batch_size is None:
+                 to_return = self.__getsingleitem__(idx)
+             else:
+                 # if batch_size is defined, convert idx to a batch slice
+                 idx = slice(idx * self.batch_size, min((idx + 1) * self.batch_size, len(self.data_items)))
+
+         if isinstance(idx, slice):
+             to_return = list()
+             idxs = list(range(idx.start, idx.stop,
+                               idx.step if idx.step else 1))
+             if self.num_workers == 0:
+                 for dx in idxs:
+                     to_return.append(self.__getsingleitem__(dx))
+             else:
+                 with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+                     for sample in executor.map(lambda i: self.__getsingleitem__(i), idxs):
+                         to_return.append(sample)
+
+         if to_return is None:
+             raise TypeError('unsupported indexing: list indices must be integers or slices, not {}'.format(type(idx)))
+
+         if self.collate_fn is not None:
+             to_return = self.collate_fn(to_return)
+         return to_return
+
+
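A small sketch of the batching arithmetic used by __len__ and __getitem__ above; the numbers are arbitrary:

import numpy as np

batch_size, n_items = 4, 10
n_batches = int(np.ceil(n_items / batch_size))                               # __len__() -> 3
idx = 2                                                                      # last (partial) batch
batch_slice = slice(idx * batch_size, min((idx + 1) * batch_size, n_items))
print(n_batches, list(range(batch_slice.start, batch_slice.stop)))          # 3 [8, 9]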
+ np_str_obj_array_pattern = re.compile(r'[SaUO]')
+
+ default_collate_err_msg_format = (
+     "default_collate: batch must contain tensors, numpy arrays, numbers, "
+     "dicts or lists; found {}")
+
+
+ def default_transforms_callback(transforms, image, target, annotation_type):
+     """
+     Recursive call to perform the augmentations in "transforms"
+
+     :param transforms: a single augmentation, a list of augmentations, or a torchvision/imgaug composition
+     :param image: input image (numpy array)
+     :param target: matching annotations (or None)
+     :param annotation_type: entities.AnnotationType of the target
+     :return: the augmented (image, target) pair
+     """
+     # get the type string without importing any other package
+     transforms_type = type(transforms)
+
+     ############
+     # Handle compositions and lists of augmentations with a recursive call
+     if transforms_type.__module__ == 'torchvision.transforms.transforms' and transforms_type.__name__ == 'Compose':
+         # torchvision compose - convert to list
+         image, target = default_transforms_callback(transforms.transforms, image, target, annotation_type)
+         return image, target
+
+     if transforms_type.__module__ == 'imgaug.augmenters.meta' and transforms_type.__name__ == 'Sequential':
+         # imgaug sequential - convert to list
+         image, target = default_transforms_callback(list(transforms), image, target, annotation_type)
+         return image, target
+
+     if isinstance(transforms, list):
+         for t in transforms:
+             image, target = default_transforms_callback(t, image, target, annotation_type)
+         return image, target
+
+     ##############
+     # Handle single annotations
+     if 'imgaug.augmenters' in transforms_type.__module__:
+         # handle single imgaug augmentation
+         if target is not None and annotation_type is not None:
+             # the augmenter expects batches, so wrap the single image (and target) in lists
+             if annotation_type == entities.AnnotationType.BOX:
+                 image, target = transforms(images=[image], bounding_boxes=[target])
+                 target = target[0]
+             elif annotation_type == entities.AnnotationType.SEGMENTATION:
+                 # expanding to HxWx1 for the imgaug function to work
+                 target = target[..., None]
+                 image, target = transforms(images=[image], segmentation_maps=[target])
+                 target = target[0][:, :, 0]
+             elif annotation_type == entities.AnnotationType.POLYGON:
+                 image, target = transforms(images=[image], polygons=[target])
+                 target = target[0]
+             elif annotation_type == entities.AnnotationType.CLASSIFICATION:
+                 image = transforms(images=[image])
+             else:
+                 raise ValueError('unsupported annotations type for image augmentations: {}'.format(annotation_type))
+             image = image[0]
+         else:
+             image = transforms(images=[image])
+             image = image[0]
+     else:
+         image = transforms(image)
+
+     return image, target
+
+
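A hedged example of the callback above with an imgaug Sequential and no annotations; the augmenters are only illustrative and assume imgaug >= 0.4:

import numpy as np
import imgaug.augmenters as iaa

transforms = iaa.Sequential([iaa.Fliplr(p=1.0),
                             iaa.Resize({"height": 128, "width": 128})])
image = np.zeros((64, 64, 3), dtype=np.uint8)
# the Sequential is unrolled into a list and each augmenter is applied in turn
aug_image, _ = default_transforms_callback(transforms, image, target=None, annotation_type=None)
print(aug_image.shape)  # (128, 128, 3)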
+ def collate_default(batch):
+     r"""Puts each data field into a tensor with outer dimension batch size"""
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, np.ndarray):
+         return np.stack(batch, axis=0)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             return batch
+             # return [tf.convert_to_tensor(b) for b in batch]
+         elif elem.shape == ():  # scalars
+             return batch
+     elif isinstance(elem, float):
+         return batch
+     elif isinstance(elem, int):
+         return batch
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_default([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_default(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         transposed = zip(*batch)
+         return transposed
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
+
+
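A minimal sketch of what the default collate does to a batch of generator items (dicts of numpy arrays and strings); the field values are made up:

import numpy as np

batch = [{'image': np.zeros((4, 4, 3)), 'image_filepath': 'a.jpg'},
         {'image': np.ones((4, 4, 3)), 'image_filepath': 'b.jpg'}]
collated = collate_default(batch)
print(collated['image'].shape)       # (2, 4, 4, 3) - arrays are stacked on a new batch axis
print(collated['image_filepath'])    # ['a.jpg', 'b.jpg'] - strings stay a plain list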
+ def collate_torch(batch):
+     r"""Puts each data field into a tensor with outer dimension batch size"""
+     import torch
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, torch.Tensor):
+         out = None
+         if torch.utils.data.get_worker_info() is not None:
+             # If we're in a background process, concatenate directly into a
+             # shared memory tensor to avoid an extra copy
+             numel = sum(x.numel() for x in batch)
+             storage = elem.storage()._new_shared(numel)
+             out = elem.new(storage)
+         return torch.stack(batch, 0, out=out)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             try:
+                 return torch.stack([torch.as_tensor(b) for b in batch])
+             except RuntimeError:
+                 return batch
+         elif elem.shape == ():  # scalars
+             return torch.as_tensor(batch)
+     elif isinstance(elem, float):
+         return torch.tensor(batch, dtype=torch.float64)
+     elif isinstance(elem, int):
+         return torch.tensor(batch)
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_torch([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_torch(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         transposed = zip(*batch)
+         return transposed
+
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
+
+
+ def collate_tf(batch):
+     r"""Puts each data field into a tensor with outer dimension batch size"""
+     import tensorflow as tf
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, tf.Tensor):
+         return tf.stack(batch, axis=0)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             try:
+                 return tf.convert_to_tensor(batch)
+             except ValueError:
+                 # failed on orig_image because of a mismatch in the shape (not resizing all the images so cannot stack)
+                 return batch
+             # return [tf.convert_to_tensor(b) for b in batch]
+         elif elem.shape == ():  # scalars
+             return tf.convert_to_tensor(batch)
+     elif isinstance(elem, float):
+         return tf.convert_to_tensor(batch, dtype=tf.float64)
+     elif isinstance(elem, int):
+         return tf.convert_to_tensor(batch)
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_tf([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_tf(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         transposed = zip(*batch)
+         return transposed
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
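A hedged sketch of wiring one of the collate functions above to the generator; `datagen` is a hypothetical generator instance created without batching so that indexing returns single items:

# pick the collate that matches the training framework
collate_fn = collate_torch          # or collate_tf / collate_default
items = [datagen[i] for i in range(4)]
batch = collate_fn(items)
# batch['image'] becomes a stacked torch tensor (tf tensor / numpy array for the other
# collates), provided all images share a shape, e.g. after a resize transform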