dtlpy 1.114.17__py3-none-any.whl → 1.116.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. dtlpy/__init__.py +491 -491
  2. dtlpy/__version__.py +1 -1
  3. dtlpy/assets/__init__.py +26 -26
  4. dtlpy/assets/code_server/config.yaml +2 -2
  5. dtlpy/assets/code_server/installation.sh +24 -24
  6. dtlpy/assets/code_server/launch.json +13 -13
  7. dtlpy/assets/code_server/settings.json +2 -2
  8. dtlpy/assets/main.py +53 -53
  9. dtlpy/assets/main_partial.py +18 -18
  10. dtlpy/assets/mock.json +11 -11
  11. dtlpy/assets/model_adapter.py +83 -83
  12. dtlpy/assets/package.json +61 -61
  13. dtlpy/assets/package_catalog.json +29 -29
  14. dtlpy/assets/package_gitignore +307 -307
  15. dtlpy/assets/service_runners/__init__.py +33 -33
  16. dtlpy/assets/service_runners/converter.py +96 -96
  17. dtlpy/assets/service_runners/multi_method.py +49 -49
  18. dtlpy/assets/service_runners/multi_method_annotation.py +54 -54
  19. dtlpy/assets/service_runners/multi_method_dataset.py +55 -55
  20. dtlpy/assets/service_runners/multi_method_item.py +52 -52
  21. dtlpy/assets/service_runners/multi_method_json.py +52 -52
  22. dtlpy/assets/service_runners/single_method.py +37 -37
  23. dtlpy/assets/service_runners/single_method_annotation.py +43 -43
  24. dtlpy/assets/service_runners/single_method_dataset.py +43 -43
  25. dtlpy/assets/service_runners/single_method_item.py +41 -41
  26. dtlpy/assets/service_runners/single_method_json.py +42 -42
  27. dtlpy/assets/service_runners/single_method_multi_input.py +45 -45
  28. dtlpy/assets/voc_annotation_template.xml +23 -23
  29. dtlpy/caches/base_cache.py +32 -32
  30. dtlpy/caches/cache.py +473 -473
  31. dtlpy/caches/dl_cache.py +201 -201
  32. dtlpy/caches/filesystem_cache.py +89 -89
  33. dtlpy/caches/redis_cache.py +84 -84
  34. dtlpy/dlp/__init__.py +20 -20
  35. dtlpy/dlp/cli_utilities.py +367 -367
  36. dtlpy/dlp/command_executor.py +764 -764
  37. dtlpy/dlp/dlp +1 -1
  38. dtlpy/dlp/dlp.bat +1 -1
  39. dtlpy/dlp/dlp.py +128 -128
  40. dtlpy/dlp/parser.py +651 -651
  41. dtlpy/entities/__init__.py +83 -83
  42. dtlpy/entities/analytic.py +347 -311
  43. dtlpy/entities/annotation.py +1879 -1879
  44. dtlpy/entities/annotation_collection.py +699 -699
  45. dtlpy/entities/annotation_definitions/__init__.py +20 -20
  46. dtlpy/entities/annotation_definitions/base_annotation_definition.py +100 -100
  47. dtlpy/entities/annotation_definitions/box.py +195 -195
  48. dtlpy/entities/annotation_definitions/classification.py +67 -67
  49. dtlpy/entities/annotation_definitions/comparison.py +72 -72
  50. dtlpy/entities/annotation_definitions/cube.py +204 -204
  51. dtlpy/entities/annotation_definitions/cube_3d.py +149 -149
  52. dtlpy/entities/annotation_definitions/description.py +32 -32
  53. dtlpy/entities/annotation_definitions/ellipse.py +124 -124
  54. dtlpy/entities/annotation_definitions/free_text.py +62 -62
  55. dtlpy/entities/annotation_definitions/gis.py +69 -69
  56. dtlpy/entities/annotation_definitions/note.py +139 -139
  57. dtlpy/entities/annotation_definitions/point.py +117 -117
  58. dtlpy/entities/annotation_definitions/polygon.py +182 -182
  59. dtlpy/entities/annotation_definitions/polyline.py +111 -111
  60. dtlpy/entities/annotation_definitions/pose.py +92 -92
  61. dtlpy/entities/annotation_definitions/ref_image.py +86 -86
  62. dtlpy/entities/annotation_definitions/segmentation.py +240 -240
  63. dtlpy/entities/annotation_definitions/subtitle.py +34 -34
  64. dtlpy/entities/annotation_definitions/text.py +85 -85
  65. dtlpy/entities/annotation_definitions/undefined_annotation.py +74 -74
  66. dtlpy/entities/app.py +220 -220
  67. dtlpy/entities/app_module.py +107 -107
  68. dtlpy/entities/artifact.py +174 -174
  69. dtlpy/entities/assignment.py +399 -399
  70. dtlpy/entities/base_entity.py +214 -214
  71. dtlpy/entities/bot.py +113 -113
  72. dtlpy/entities/codebase.py +292 -296
  73. dtlpy/entities/collection.py +38 -38
  74. dtlpy/entities/command.py +169 -169
  75. dtlpy/entities/compute.py +449 -442
  76. dtlpy/entities/dataset.py +1299 -1285
  77. dtlpy/entities/directory_tree.py +44 -44
  78. dtlpy/entities/dpk.py +470 -470
  79. dtlpy/entities/driver.py +235 -223
  80. dtlpy/entities/execution.py +397 -397
  81. dtlpy/entities/feature.py +124 -124
  82. dtlpy/entities/feature_set.py +145 -145
  83. dtlpy/entities/filters.py +798 -645
  84. dtlpy/entities/gis_item.py +107 -107
  85. dtlpy/entities/integration.py +184 -184
  86. dtlpy/entities/item.py +959 -953
  87. dtlpy/entities/label.py +123 -123
  88. dtlpy/entities/links.py +85 -85
  89. dtlpy/entities/message.py +175 -175
  90. dtlpy/entities/model.py +684 -684
  91. dtlpy/entities/node.py +1005 -1005
  92. dtlpy/entities/ontology.py +810 -803
  93. dtlpy/entities/organization.py +287 -287
  94. dtlpy/entities/package.py +657 -657
  95. dtlpy/entities/package_defaults.py +5 -5
  96. dtlpy/entities/package_function.py +185 -185
  97. dtlpy/entities/package_module.py +113 -113
  98. dtlpy/entities/package_slot.py +118 -118
  99. dtlpy/entities/paged_entities.py +299 -299
  100. dtlpy/entities/pipeline.py +624 -624
  101. dtlpy/entities/pipeline_execution.py +279 -279
  102. dtlpy/entities/project.py +394 -394
  103. dtlpy/entities/prompt_item.py +505 -499
  104. dtlpy/entities/recipe.py +301 -301
  105. dtlpy/entities/reflect_dict.py +102 -102
  106. dtlpy/entities/resource_execution.py +138 -138
  107. dtlpy/entities/service.py +963 -958
  108. dtlpy/entities/service_driver.py +117 -117
  109. dtlpy/entities/setting.py +294 -294
  110. dtlpy/entities/task.py +495 -495
  111. dtlpy/entities/time_series.py +143 -143
  112. dtlpy/entities/trigger.py +426 -426
  113. dtlpy/entities/user.py +118 -118
  114. dtlpy/entities/webhook.py +124 -124
  115. dtlpy/examples/__init__.py +19 -19
  116. dtlpy/examples/add_labels.py +135 -135
  117. dtlpy/examples/add_metadata_to_item.py +21 -21
  118. dtlpy/examples/annotate_items_using_model.py +65 -65
  119. dtlpy/examples/annotate_video_using_model_and_tracker.py +75 -75
  120. dtlpy/examples/annotations_convert_to_voc.py +9 -9
  121. dtlpy/examples/annotations_convert_to_yolo.py +9 -9
  122. dtlpy/examples/convert_annotation_types.py +51 -51
  123. dtlpy/examples/converter.py +143 -143
  124. dtlpy/examples/copy_annotations.py +22 -22
  125. dtlpy/examples/copy_folder.py +31 -31
  126. dtlpy/examples/create_annotations.py +51 -51
  127. dtlpy/examples/create_video_annotations.py +83 -83
  128. dtlpy/examples/delete_annotations.py +26 -26
  129. dtlpy/examples/filters.py +113 -113
  130. dtlpy/examples/move_item.py +23 -23
  131. dtlpy/examples/play_video_annotation.py +13 -13
  132. dtlpy/examples/show_item_and_mask.py +53 -53
  133. dtlpy/examples/triggers.py +49 -49
  134. dtlpy/examples/upload_batch_of_items.py +20 -20
  135. dtlpy/examples/upload_items_and_custom_format_annotations.py +55 -55
  136. dtlpy/examples/upload_items_with_modalities.py +43 -43
  137. dtlpy/examples/upload_segmentation_annotations_from_mask_image.py +44 -44
  138. dtlpy/examples/upload_yolo_format_annotations.py +70 -70
  139. dtlpy/exceptions.py +125 -125
  140. dtlpy/miscellaneous/__init__.py +20 -20
  141. dtlpy/miscellaneous/dict_differ.py +95 -95
  142. dtlpy/miscellaneous/git_utils.py +217 -217
  143. dtlpy/miscellaneous/json_utils.py +14 -14
  144. dtlpy/miscellaneous/list_print.py +105 -105
  145. dtlpy/miscellaneous/zipping.py +130 -130
  146. dtlpy/ml/__init__.py +20 -20
  147. dtlpy/ml/base_feature_extractor_adapter.py +27 -27
  148. dtlpy/ml/base_model_adapter.py +1257 -1086
  149. dtlpy/ml/metrics.py +461 -461
  150. dtlpy/ml/predictions_utils.py +274 -274
  151. dtlpy/ml/summary_writer.py +57 -57
  152. dtlpy/ml/train_utils.py +60 -60
  153. dtlpy/new_instance.py +252 -252
  154. dtlpy/repositories/__init__.py +56 -56
  155. dtlpy/repositories/analytics.py +85 -85
  156. dtlpy/repositories/annotations.py +916 -916
  157. dtlpy/repositories/apps.py +383 -383
  158. dtlpy/repositories/artifacts.py +452 -452
  159. dtlpy/repositories/assignments.py +599 -599
  160. dtlpy/repositories/bots.py +213 -213
  161. dtlpy/repositories/codebases.py +559 -559
  162. dtlpy/repositories/collections.py +332 -332
  163. dtlpy/repositories/commands.py +152 -158
  164. dtlpy/repositories/compositions.py +61 -61
  165. dtlpy/repositories/computes.py +439 -435
  166. dtlpy/repositories/datasets.py +1504 -1291
  167. dtlpy/repositories/downloader.py +976 -903
  168. dtlpy/repositories/dpks.py +433 -433
  169. dtlpy/repositories/drivers.py +482 -470
  170. dtlpy/repositories/executions.py +815 -817
  171. dtlpy/repositories/feature_sets.py +226 -226
  172. dtlpy/repositories/features.py +255 -238
  173. dtlpy/repositories/integrations.py +484 -484
  174. dtlpy/repositories/items.py +912 -909
  175. dtlpy/repositories/messages.py +94 -94
  176. dtlpy/repositories/models.py +1000 -988
  177. dtlpy/repositories/nodes.py +80 -80
  178. dtlpy/repositories/ontologies.py +511 -511
  179. dtlpy/repositories/organizations.py +525 -525
  180. dtlpy/repositories/packages.py +1941 -1941
  181. dtlpy/repositories/pipeline_executions.py +451 -451
  182. dtlpy/repositories/pipelines.py +640 -640
  183. dtlpy/repositories/projects.py +539 -539
  184. dtlpy/repositories/recipes.py +419 -399
  185. dtlpy/repositories/resource_executions.py +137 -137
  186. dtlpy/repositories/schema.py +120 -120
  187. dtlpy/repositories/service_drivers.py +213 -213
  188. dtlpy/repositories/services.py +1704 -1704
  189. dtlpy/repositories/settings.py +339 -339
  190. dtlpy/repositories/tasks.py +1477 -1477
  191. dtlpy/repositories/times_series.py +278 -278
  192. dtlpy/repositories/triggers.py +536 -536
  193. dtlpy/repositories/upload_element.py +257 -257
  194. dtlpy/repositories/uploader.py +661 -651
  195. dtlpy/repositories/webhooks.py +249 -249
  196. dtlpy/services/__init__.py +22 -22
  197. dtlpy/services/aihttp_retry.py +131 -131
  198. dtlpy/services/api_client.py +1785 -1782
  199. dtlpy/services/api_reference.py +40 -40
  200. dtlpy/services/async_utils.py +133 -133
  201. dtlpy/services/calls_counter.py +44 -44
  202. dtlpy/services/check_sdk.py +68 -68
  203. dtlpy/services/cookie.py +115 -115
  204. dtlpy/services/create_logger.py +156 -156
  205. dtlpy/services/events.py +84 -84
  206. dtlpy/services/logins.py +235 -235
  207. dtlpy/services/reporter.py +256 -256
  208. dtlpy/services/service_defaults.py +91 -91
  209. dtlpy/utilities/__init__.py +20 -20
  210. dtlpy/utilities/annotations/__init__.py +16 -16
  211. dtlpy/utilities/annotations/annotation_converters.py +269 -269
  212. dtlpy/utilities/base_package_runner.py +285 -264
  213. dtlpy/utilities/converter.py +1650 -1650
  214. dtlpy/utilities/dataset_generators/__init__.py +1 -1
  215. dtlpy/utilities/dataset_generators/dataset_generator.py +670 -670
  216. dtlpy/utilities/dataset_generators/dataset_generator_tensorflow.py +23 -23
  217. dtlpy/utilities/dataset_generators/dataset_generator_torch.py +21 -21
  218. dtlpy/utilities/local_development/__init__.py +1 -1
  219. dtlpy/utilities/local_development/local_session.py +179 -179
  220. dtlpy/utilities/reports/__init__.py +2 -2
  221. dtlpy/utilities/reports/figures.py +343 -343
  222. dtlpy/utilities/reports/report.py +71 -71
  223. dtlpy/utilities/videos/__init__.py +17 -17
  224. dtlpy/utilities/videos/video_player.py +598 -598
  225. dtlpy/utilities/videos/videos.py +470 -470
  226. {dtlpy-1.114.17.data → dtlpy-1.116.6.data}/scripts/dlp +1 -1
  227. dtlpy-1.116.6.data/scripts/dlp.bat +2 -0
  228. {dtlpy-1.114.17.data → dtlpy-1.116.6.data}/scripts/dlp.py +128 -128
  229. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/METADATA +186 -183
  230. dtlpy-1.116.6.dist-info/RECORD +239 -0
  231. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/WHEEL +1 -1
  232. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/licenses/LICENSE +200 -200
  233. tests/features/environment.py +551 -551
  234. dtlpy/assets/__pycache__/__init__.cpython-310.pyc +0 -0
  235. dtlpy-1.114.17.data/scripts/dlp.bat +0 -2
  236. dtlpy-1.114.17.dist-info/RECORD +0 -240
  237. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/entry_points.txt +0 -0
  238. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/top_level.txt +0 -0
@@ -1,670 +1,670 @@
- from concurrent.futures import ThreadPoolExecutor
- from pathlib import Path
- from PIL import Image
- import collections.abc
- import numpy as np
- import collections
- import logging
- import shutil
- import json
- import copy
- import tqdm
- import sys
- import os
- import re
- from ... import entities
-
- logger = logging.getLogger(name='dtlpy')
-
-
- class DataItem(dict):
-     def __init__(self, *args, **kwargs):
-         super(DataItem, self).__init__(*args, **kwargs)
-
-     @property
-     def image_filepath(self):
-         return self['image_filepath']
-
-     @image_filepath.setter
-     def image_filepath(self, val):
-         self['image_filepath'] = val
-
-
- class DatasetGenerator:
-
-     def __init__(self,
-                  dataset_entity: entities.Dataset,
-                  annotation_type: entities.AnnotationType,
-                  item_type: list = None,
-                  filters: entities.Filters = None,
-                  data_path=None,
-                  overwrite=False,
-                  id_to_label_map=None,
-                  label_to_id_map=None,
-                  transforms=None,
-                  transforms_callback=None,
-                  num_workers=0,
-                  batch_size=None,
-                  collate_fn=None,
-                  shuffle=True,
-                  seed=None,
-                  to_categorical=False,
-                  to_mask=False,
-                  class_balancing=False,
-                  # debug flags
-                  return_originals=False,
-                  ignore_empty=True
-                  ) -> None:
-         """
-         Base Dataset Generator to build and iterate over images and annotations
-
-         * Mapping Labels *
-         To set a label mapping from labels to id you can use the `label_to_id_map` or `id_to_label_map`.
-         NOTE: if they are not i.i.d you'll need to input both.
-         In semantic, a `$default` label should be added so that the background (and all unlabeled pixels) will be
-         mapped to the model's inputs
-
-         label_to_id_map = {'cat': 1,
-                            'dog': 1,
-                            '$default': 0}
-         id_to_label_map = {1: 'cats_and_dogs',
-                            0: 'background'}
-
-         :param dataset_entity: dl.Dataset entity
-         :param annotation_type: dl.AnnotationType - type of annotation to load from the annotated dataset
-         :param item_type: list of file extension to load. default: ['jpg', 'jpeg', 'png', 'bmp']
-         :param filters: dl.Filters - filtering entity to filter the dataset items
-         :param data_path: Path to Dataloop annotations (root to "item" and "json").
-         :param overwrite:
-         :param dict id_to_label_map: Optional, {id: label_string} dictionary, default taken from dataset
-         :param dict label_to_id_map: Optional, {label_string: id} dictionary
-         :param transforms: Optional transform to be applied on a sample. list, imgaug.Sequence or torchvision.transforms.Compose
-         :param transforms_callback: Optional function to handle the callback of each batch.
-             look at default_transforms_callback for more information. available: imgaug_transforms_callback, torchvision_transforms_callback
-         :param num_workers: Optional - number of separate threads to load the images
-         :param batch_size: (int, optional): how many samples per batch to load, if not none - items will always be a list
-         :param collate_fn: Optional - merges a list of samples to form a mini-batch of Tensor(s).
-         :param shuffle: Whether to shuffle the data (default: True) If set to False, sorts the data in alphanumeric order.
-         :param seed: Optional random seed for shuffling and transformations.
-         :param to_categorical: convert label id to categorical format
-         :param to_mask: convert annotations to an instance mask (will be true for SEGMENTATION)
-         :param class_balancing: if True - performing random over-sample with class ids as the target to balance training data
-         :param return_originals: bool - If True, return ALSO images and annotations before transformations (for debug)
-         :param ignore_empty: bool - If True, generator will NOT collect items without annotations
-         """
-         self._dataset_entity = dataset_entity
-
-         # default item types (extension for now)
-         if item_type is None:
-             item_type = ['jpg', 'jpeg', 'png', 'bmp']
-         if not isinstance(item_type, list):
-             item_type = [item_type]
-         self.item_type = item_type
-
-         # id labels mapping
-         if label_to_id_map is None and id_to_label_map is None:
-             # if both are None - take from dataset
-             label_to_id_map = dataset_entity.instance_map
-             id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
-         else:
-             # one or both is NOT None
-             if label_to_id_map is None:
-                 # set label_to_id_map from the other
-                 label_to_id_map = {v: int(k) for k, v in id_to_label_map.items()}
-             if id_to_label_map is None:
-                 # set id_to_label_map from the other
-                 id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
-             # put it on the local ontology for the annotations download
-             dataset_entity._get_ontology().instance_map = label_to_id_map
-         self.id_to_label_map = id_to_label_map
-         self.label_to_id_map = label_to_id_map
-
-         # if annotation type is segmentation - to_mask must be True
-         if annotation_type == entities.AnnotationType.SEGMENTATION:
-             to_mask = True
-
-         if data_path is None:
-             data_path = os.path.join(os.path.expanduser('~'),
-                                      '.dataloop',
-                                      'datasets',
-                                      "{}_{}".format(dataset_entity.name,
-                                                     dataset_entity.id))
-         download = False
-         if os.path.isdir(data_path):
-             if overwrite:
-                 logger.warning('overwrite flag is True! deleting and overwriting')
-                 shutil.rmtree(data_path)
-                 download = True
-         else:
-             download = True
-         if download:
-             annotation_options = [entities.ViewAnnotationOptions.JSON]
-             if to_mask is True:
-                 annotation_options.append(entities.ViewAnnotationOptions.INSTANCE)
-             _ = dataset_entity.items.download(filters=filters,
-                                               local_path=data_path,
-                                               thickness=-1,
-                                               annotation_options=annotation_options)
-         self.root_dir = data_path
-         self._items_path = Path(self.root_dir).joinpath('items')
-         self._json_path = Path(self.root_dir).joinpath('json')
-         self._mask_path = Path(self.root_dir).joinpath('instance')
-         self._transforms = transforms
-         self._transforms_callback = transforms_callback
-         if self._transforms is not None and self._transforms_callback is None:
-             # use default callback
-             self._transforms_callback = default_transforms_callback
-
-         self.annotation_type = annotation_type
-         self.num_workers = num_workers
-         self.to_categorical = to_categorical
-         self.num_classes = len(label_to_id_map)
-         self.shuffle = shuffle
-         self.seed = seed
-         self.to_mask = to_mask
-         self.batch_size = batch_size
-         self.collate_fn = collate_fn
-         self.class_balancing = class_balancing
-         # inits
-         self.data_items = list()
-         # flags
-         self.return_originals = return_originals
-         self.ignore_empty = ignore_empty
-
-         ####################
-         # Load annotations #
-         ####################
-         self.load_annotations()
-
-     @property
-     def dataset_entity(self):
-         assert isinstance(self._dataset_entity, entities.Dataset)
-         return self._dataset_entity
-
-     @dataset_entity.setter
-     def dataset_entity(self, val):
-         assert isinstance(val, entities.Dataset)
-         self._dataset_entity = val
-
-     @property
-     def n_items(self):
-         return len(self.data_items)
-
-     def _load_single(self, image_filepath, pbar=None):
-         try:
-             is_empty = False
-             item_info = DataItem()
-             # add image path
-             item_info.image_filepath = str(image_filepath)
-             if os.stat(image_filepath).st_size < 5:
-                 logger.warning('IGNORING corrupted image: {!r}'.format(image_filepath))
-                 return None, True
-             # get "platform" path
-             rel_path = image_filepath.relative_to(self._items_path)
-             # replace suffix to JSON
-             rel_path_wo_png_ext = rel_path.with_suffix('.json')
-             # create local path
-             annotation_filepath = Path(self._json_path, rel_path_wo_png_ext)
-
-             if os.path.isfile(annotation_filepath):
-                 with open(annotation_filepath, 'r') as f:
-                     data = json.load(f)
-                 if 'id' in data:
-                     item_id = data.get('id')
-                 elif '_id' in data:
-                     item_id = data.get('_id')
-                 annotations = entities.AnnotationCollection.from_json(data)
-             else:
-                 item_id = ''
-                 annotations = None
-             item_info.update(item_id=item_id)
-             if self.annotation_type is not None:
-                 # add item id from json
-                 polygon_coordinates = list()
-                 box_coordinates = list()
-                 classes_ids = list()
-                 labels = list()
-                 if annotations is not None:
-                     for annotation in annotations:
-                         if 'user' in annotation.metadata and \
-                                 'model' in annotation.metadata['user']:
-                             # and 'name' in annotation.metadata['user']['model']:
-                             # Do not use prediction annotations in the data generator
-                             continue
-                         if annotation.type == self.annotation_type:
-                             if annotation.label not in self.label_to_id_map:
-                                 logger.warning(
-                                     'Missing label {!r} in label_to_id_map. Skipping.. Use label_to_id_map for other behaviour'.format(
-                                         annotation.label))
-                             else:
-                                 classes_ids.append(self.label_to_id_map[annotation.label])
-                                 labels.append(annotation.label)
-                                 box_coordinates.append(np.asarray([annotation.left,
-                                                                    annotation.top,
-                                                                    annotation.right,
-                                                                    annotation.bottom]))
-                                 if self.annotation_type == entities.AnnotationType.POLYGON:
-                                     polygon_coordinates.append(annotation.geo)
-                         if annotation.type not in [entities.AnnotationType.CLASSIFICATION,
-                                                    entities.AnnotationType.SEGMENTATION,
-                                                    entities.AnnotationType.BOX,
-                                                    entities.AnnotationType.POLYGON]:
-                             raise ValueError('unsupported annotation type: {}'.format(annotation.type))
-                 dtype = object if self.annotation_type == entities.AnnotationType.POLYGON else None
-                 # reorder for output
-                 item_info.update({entities.AnnotationType.BOX.value: np.asarray(box_coordinates).astype(float),
-                                   entities.AnnotationType.CLASSIFICATION.value: np.asarray(classes_ids),
-                                   entities.AnnotationType.POLYGON.value: np.asarray(polygon_coordinates, dtype=dtype),
-                                   'labels': labels})
-                 if len(item_info[entities.AnnotationType.CLASSIFICATION.value]) == 0:
-                     logger.debug('Empty annotation (nothing matched label_to_id_map) for image filename: {}'.format(
-                         image_filepath))
-                     is_empty = True
-             if self.to_mask:
-                 # get "platform" path
-                 rel_path = image_filepath.relative_to(self._items_path)
-                 # replace suffix to PNG
-                 rel_path_wo_png_ext = rel_path.with_suffix('.png')
-                 # create local path
-                 mask_filepath = Path(self._mask_path, rel_path_wo_png_ext)
-                 if not os.path.isfile(mask_filepath):
-                     logger.debug('Empty annotation for image filename: {}'.format(image_filepath))
-                     is_empty = True
-                 item_info.update({entities.AnnotationType.SEGMENTATION.value: str(mask_filepath)})
-             item_info.update(annotation_filepath=str(annotation_filepath))
-             return item_info, is_empty
-         except Exception:
-             logger.exception('failed loading item in generator! {!r}'.format(image_filepath))
-             return None, True
-         finally:
-             if pbar is not None:
-                 pbar.update()
-
-     def load_annotations(self):
-         logger.info(f"Collecting items with the following extensions: {self.item_type}")
-         files = list()
-         for ext in self.item_type:
-             # build regex to ignore extension case
-             regex = '*.{}'.format(''.join(['[{}{}]'.format(letter.lower(), letter.upper()) for letter in ext]))
-             files.extend(self._items_path.rglob(regex))
-
-         pool = ThreadPoolExecutor(max_workers=32)
-         jobs = list()
-         pbar = tqdm.tqdm(total=len(files),
-                          desc='Loading Data Generator',
-                          disable=self.dataset_entity._client_api.verbose.disable_progress_bar,
-                          file=sys.stdout)
-         for image_filepath in files:
-             jobs.append(pool.submit(self._load_single,
-                                     image_filepath=image_filepath,
-                                     pbar=pbar))
-         outputs = [job.result() for job in jobs]
-         pbar.close()
-
-         n_items = len(outputs)
-         n_empty_items = sum([1 for _, is_empty in outputs if is_empty is True])
-
-         output_msg = 'Done loading items. Total items loaded: {}.'.format(n_items)
-         if n_empty_items > 0:
-             output_msg += '{action} {n_empty_items} items without annotations'.format(
-                 action='IGNORING' if self.ignore_empty else 'INCLUDING',
-                 n_empty_items=n_empty_items)
-
-         if self.ignore_empty:
-             # take ONLY non-empty
-             data_items = [data_item for data_item, is_empty in outputs if is_empty is False]
-         else:
-             # take all
-             data_items = [data_item for data_item, is_empty in outputs]
-
-         self.data_items = data_items
-         if len(self.data_items) == 0:
-             logger.warning(output_msg)
-         else:
-             logger.info(output_msg)
-         ###################
-         # class balancing #
-         ###################
-         labels = [label for item in self.data_items for label in item.get('labels', list())]
-         logger.info(f"Data Generator labels balance statistics: {collections.Counter(labels)}")
-         if self.class_balancing:
-             try:
-                 from imblearn.over_sampling import RandomOverSampler
-             except Exception:
-                 logger.error(
-                     'Class balancing is ON but missing "imbalanced-learn". run "pip install -U imbalanced-learn" and try again')
-                 raise
-             logger.info('Class balance is on!')
-             class_ids = [class_id for item in self.data_items for class_id in item['class']]
-             dummy_inds = [i_item for i_item, item in enumerate(self.data_items) for _ in item['class']]
-             over_sampler = RandomOverSampler(random_state=42)
-             X_res, y_res = over_sampler.fit_resample(np.asarray(dummy_inds).reshape(-1, 1), np.asarray(class_ids))
-             over_sampled_data_items = [self.data_items[i] for i in X_res.flatten()]
-             oversampled_labels = [label for item in over_sampled_data_items for label in item['labels']]
-             logger.info(f"Data Generator labels after oversampling: {collections.Counter(oversampled_labels)}")
-             self.data_items = over_sampled_data_items
-
-         if self.shuffle:
-             if self.seed is None:
-                 self.seed = 256
-             np.random.seed(self.seed)
-             np.random.shuffle(self.data_items)
-
-     def transform(self, image, target=None):
-         if self._transforms is not None:
-             image, target = self._transforms_callback(transforms=self._transforms,
-                                                       image=image,
-                                                       target=target,
-                                                       annotation_type=self.annotation_type)
-         return image, target
-
-     def _to_dtlpy(self, targets, labels=None):
-         annotations = entities.AnnotationCollection(item=None)
-         annotations._dataset = self._dataset_entity
-         if labels is None:
-             labels = [None] * len(targets)
-         if self.to_mask is True:
-             for label, label_ind in self.label_to_id_map.items():
-                 target = targets == label_ind
-                 if np.any(target):
-                     annotations.add(annotation_definition=entities.Segmentation(geo=target,
-                                                                                 label=label))
-         elif self.annotation_type == entities.AnnotationType.BOX:
-             for target, label in zip(targets, labels):
-                 annotations.add(annotation_definition=entities.Box(left=target[0],
-                                                                    top=target[1],
-                                                                    right=target[2],
-                                                                    bottom=target[3],
-                                                                    label=label))
-         elif self.annotation_type == entities.AnnotationType.CLASSIFICATION:
-             for target, label in zip(targets, labels):
-                 annotations.add(annotation_definition=entities.Classification(label=label))
-         elif self.annotation_type == entities.AnnotationType.POLYGON:
-             for target, label in zip(targets, labels):
-                 annotations.add(annotation_definition=entities.Polygon(label=label,
-                                                                        geo=target.astype(float)))
-         else:
-             raise ValueError('unsupported annotation type: {}'.format(self.annotation_type))
-         # set dataset for color
-         for annotation in annotations:
-             annotation._dataset = self._dataset_entity
-         return annotations
-
-     def visualize(self, idx=None, return_output=False, plot=True):
-         if not self.__len__():
-             raise ValueError('no items selected, cannot preform visualization')
-         import matplotlib.pyplot as plt
-         if idx is None:
-             idx = np.random.randint(self.__len__())
-         if self.batch_size is not None:
-             raise ValueError('can visualize only of batch_size in None')
-         data_item = self.__getitem__(idx)
-         image = Image.fromarray(data_item.get('image'))
-         labels = data_item.get('labels')
-         targets = data_item.get('annotations')
-         annotations = self._to_dtlpy(targets=targets, labels=labels)
-         mask = Image.fromarray(annotations.show(height=image.size[1],
-                                                 width=image.size[0],
-                                                 alpha=0.8))
-         image.paste(mask, (0, 0), mask)
-         marked_image = np.asarray(image)
-         if plot:
-             plt.figure()
-             plt.imshow(marked_image)
-         if return_output:
-             return marked_image, annotations
-
-     def __getsingleitem__(self, idx):
-         data_item = copy.deepcopy(self.data_items[idx])
-
-         image_filename = data_item.get('image_filepath')
-         image = np.asarray(Image.open(image_filename))
-         data_item.update({'image': image})
-
-         annotations = data_item.get(self.annotation_type)
-         if self.to_mask is True:
-             # if segmentation - read from file
-             mask_filepath = data_item.get(entities.AnnotationType.SEGMENTATION)
-             annotations = np.asarray(Image.open(mask_filepath).convert('L'))
-             if self.to_categorical:
-                 onehot = np.zeros((annotations.size, self.num_classes + 1))
-                 onehot[np.arange(annotations.size), annotations] = 1
-                 annotations = onehot
-         data_item.update({'annotations': annotations})
-
-         if self.return_originals is True:
-             annotations = []
-             if self.annotation_type is not None:
-                 annotations = data_item.get('annotations')
-             data_item.update({'orig_image': image.copy(),
-                               'orig_annotations': annotations.copy()})
-
-         ###########################
-         # perform transformations #
-         ###########################
-         if self._transforms is not None:
-             annotations = data_item.get('annotations')
-             image, annotations = self.transform(image, annotations)
-             data_item.update({'image': image,
-                               'annotations': annotations})
-         return data_item
-
-     def __iter__(self):
-         """Create a generator that iterate over the Sequence."""
-         for item in (self[i] for i in range(len(self))):
-             yield item
-
-     def __len__(self):
-         factor = self.batch_size
-         if factor is None:
-             factor = 1
-         return int(np.ceil(self.n_items / factor))
-
-     def __getitem__(self, idx):
-         """
-         Support single index or a slice.
-         Uses ThreadPoolExecutor is num_workers != 0
-         """
-         to_return = None
-         if isinstance(idx, int):
-             if self.batch_size is None:
-                 to_return = self.__getsingleitem__(idx)
-             else:
-                 # if batch_size is define, convert idx to batches
-                 idx = slice(idx * self.batch_size, min((idx + 1) * self.batch_size, len(self.data_items)))
-
-         if isinstance(idx, slice):
-             to_return = list()
-             idxs = list(range(idx.start, idx.stop,
-                               idx.step if idx.step else 1))
-             if self.num_workers == 0:
-                 for dx in idxs:
-                     to_return.append(self.__getsingleitem__(dx))
-             else:
-                 with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
-                     for sample in executor.map(lambda i: self.__getsingleitem__(i), idxs):
-                         to_return.append(sample)
-
-         if to_return is None:
-             raise TypeError('unsupported indexing: list indices must be integers or slices, not {}'.format(type(idx)))
-
-         if self.collate_fn is not None:
-             to_return = self.collate_fn(to_return)
-         return to_return
-
-
- np_str_obj_array_pattern = re.compile(r'[SaUO]')
-
- default_collate_err_msg_format = (
-     "default_collate: batch must contain tensors, numpy arrays, numbers, "
-     "dicts or lists; found {}")
-
-
- def default_transforms_callback(transforms, image, target, annotation_type):
-     """
-     Recursive call to perform the augmentations in "transforms"
-
-     :param transforms:
-     :param image:
-     :param target:
-     :param annotation_type:
-     :return:
-     """
-     # get the type string without importing any other package
-     transforms_type = type(transforms)
-
-     ############
-     # Handle compositions and lists of augmentations with a recursive call
-     if transforms_type.__module__ == 'torchvision.transforms.transforms' and transforms_type.__name__ == 'Compose':
-         # torchvision compose - convert to list
-         image, target = default_transforms_callback(transforms.transforms, image, target, annotation_type)
-         return image, target
-
-     if transforms_type.__module__ == 'imgaug.augmenters.meta' and transforms_type.__name__ == 'Sequential':
-         # imgaug sequential - convert to list
-         image, target = default_transforms_callback(list(transforms), image, target, annotation_type)
-         return image, target
-
-     if isinstance(transforms, list):
-         for t in transforms:
-             image, target = default_transforms_callback(t, image, target, annotation_type)
-         return image, target
-
-     ##############
-     # Handle single annotations
-     if 'imgaug.augmenters' in transforms_type.__module__:
-         # handle single imgaug augmentation
-         if target is not None and annotation_type is not None:
-             # works for batch but running on a single image
-             if annotation_type == entities.AnnotationType.BOX:
-                 image, target = transforms(images=[image], bounding_boxes=[target])
-                 target = target[0]
-             elif annotation_type == entities.AnnotationType.SEGMENTATION:
-                 # expending to HxWx1 for the imgaug function to work
-                 target = target[..., None]
-                 image, target = transforms(images=[image], segmentation_maps=[target])
-                 target = target[0][:, :, 0]
-             elif annotation_type == entities.AnnotationType.POLYGON:
-                 image, target = transforms(images=[image], polygons=[target])
-                 target = target[0]
-             elif annotation_type == entities.AnnotationType.CLASSIFICATION:
-                 image = transforms(images=[image])
-             else:
-                 raise ValueError('unsupported annotations type for image augmentations: {}'.format(annotation_type))
-             image = image[0]
-         else:
-             image = transforms(images=[image])
-             image = image[0]
-     else:
-         image = transforms(image)
-
-     return image, target
-
-
- def collate_default(batch):
-     r"""Puts each data field into a tensor with outer dimension batch size"""
-     elem = batch[0]
-     elem_type = type(elem)
-     if isinstance(elem, np.ndarray):
-         return np.stack(batch, axis=0)
-     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
-         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
-             # array of string classes and object
-             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
-                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
-             return batch
-             # return [tf.convert_to_tensor(b) for b in batch]
-         elif elem.shape == ():  # scalars
-             return batch
-     elif isinstance(elem, float):
-         return batch
-     elif isinstance(elem, int):
-         return batch
-     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
-         return batch
-     elif isinstance(elem, collections.abc.Mapping):
-         return {key: collate_default([d[key] for d in batch]) for key in elem}
-     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
-         return elem_type(*(collate_default(samples) for samples in zip(*batch)))
-     elif isinstance(elem, collections.abc.Sequence):
-         transposed = zip(*batch)
-         return transposed
-     raise TypeError(default_collate_err_msg_format.format(elem_type))
-
-
- def collate_torch(batch):
-     r"""Puts each data field into a tensor with outer dimension batch size"""
-     import torch
-     elem = batch[0]
-     elem_type = type(elem)
-     if isinstance(elem, torch.Tensor):
-         out = None
-         if torch.utils.data.get_worker_info() is not None:
-             # If we're in a background process, concatenate directly into a
-             # shared memory tensor to avoid an extra copy
-             numel = sum(x.numel() for x in batch)
-             storage = elem.storage()._new_shared(numel)
-             out = elem.new(storage)
-         return torch.stack(batch, 0, out=out)
-     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
-         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
-             # array of string classes and object
-             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
-                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
-             try:
-                 return torch.stack([torch.as_tensor(b) for b in batch])
-             except RuntimeError:
-                 return batch
-         elif elem.shape == ():  # scalars
-             return torch.as_tensor(batch)
-     elif isinstance(elem, float):
-         return torch.tensor(batch, dtype=torch.float64)
-     elif isinstance(elem, int):
-         return torch.tensor(batch)
-     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
-         return batch
-     elif isinstance(elem, collections.abc.Mapping):
-         return {key: collate_torch([d[key] for d in batch]) for key in elem}
-     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
-         return elem_type(*(collate_torch(samples) for samples in zip(*batch)))
-     elif isinstance(elem, collections.abc.Sequence):
-         transposed = zip(*batch)
-         return transposed
-
-     raise TypeError(default_collate_err_msg_format.format(elem_type))
-
-
- def collate_tf(batch):
-     r"""Puts each data field into a tensor with outer dimension batch size"""
-     import tensorflow as tf
-     elem = batch[0]
-     elem_type = type(elem)
-     if isinstance(elem, tf.Tensor):
-         return tf.stack(batch, axis=0)
-     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
-         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
-             # array of string classes and object
-             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
-                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
-             try:
-                 return tf.convert_to_tensor(batch)
-             except ValueError:
-                 # failed on orig_image because of a mismatch in the shape (not resizing all the images so cannot stack)
-                 return batch
-                 # return [tf.convert_to_tensor(b) for b in batch]
-         elif elem.shape == ():  # scalars
-             return tf.convert_to_tensor(batch)
-     elif isinstance(elem, float):
-         return tf.convert_to_tensor(batch, dtype=tf.float64)
-     elif isinstance(elem, int):
-         return tf.convert_to_tensor(batch)
-     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
-         return batch
-     elif isinstance(elem, collections.abc.Mapping):
-         return {key: collate_tf([d[key] for d in batch]) for key in elem}
-     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
-         return elem_type(*(collate_tf(samples) for samples in zip(*batch)))
-     elif isinstance(elem, collections.abc.Sequence):
-         transposed = zip(*batch)
-         return transposed
-     raise TypeError(default_collate_err_msg_format.format(elem_type))
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import collections.abc
5
+ import numpy as np
6
+ import collections
7
+ import logging
8
+ import shutil
9
+ import json
10
+ import copy
11
+ import tqdm
12
+ import sys
13
+ import os
14
+ import re
15
+ from ... import entities
16
+
17
+ logger = logging.getLogger(name='dtlpy')
18
+
19
+
20
+ class DataItem(dict):
21
+ def __init__(self, *args, **kwargs):
22
+ super(DataItem, self).__init__(*args, **kwargs)
23
+
24
+ @property
25
+ def image_filepath(self):
26
+ return self['image_filepath']
27
+
28
+ @image_filepath.setter
29
+ def image_filepath(self, val):
30
+ self['image_filepath'] = val
31
+
32
+
33
+ class DatasetGenerator:
34
+
35
+ def __init__(self,
36
+ dataset_entity: entities.Dataset,
37
+ annotation_type: entities.AnnotationType,
38
+ item_type: list = None,
39
+ filters: entities.Filters = None,
40
+ data_path=None,
41
+ overwrite=False,
42
+ id_to_label_map=None,
43
+ label_to_id_map=None,
44
+ transforms=None,
45
+ transforms_callback=None,
46
+ num_workers=0,
47
+ batch_size=None,
48
+ collate_fn=None,
49
+ shuffle=True,
50
+ seed=None,
51
+ to_categorical=False,
52
+ to_mask=False,
53
+ class_balancing=False,
54
+ # debug flags
55
+ return_originals=False,
56
+ ignore_empty=True
57
+ ) -> None:
58
+ """
59
+ Base Dataset Generator to build and iterate over images and annotations
60
+
61
+ * Mapping Labels *
62
+ To set a label mapping from labels to id you can use the `label_to_id_map` or `id_to_label_map`.
63
+ NOTE: if they are not i.i.d you'll need to input both.
64
+ In semantic, a `$default` label should be added so that the background (and all unlabeled pixels) will be
65
+ mapped to the model's inputs
66
+
67
+ label_to_id_map = {'cat': 1,
68
+ 'dog': 1,
69
+ '$default': 0}
70
+ id_to_label_map = {1: 'cats_and_dogs',
71
+ 0: 'background'}
72
+
73
+ :param dataset_entity: dl.Dataset entity
74
+ :param annotation_type: dl.AnnotationType - type of annotation to load from the annotated dataset
75
+ :param item_type: list of file extension to load. default: ['jpg', 'jpeg', 'png', 'bmp']
76
+ :param filters: dl.Filters - filtering entity to filter the dataset items
77
+ :param data_path: Path to Dataloop annotations (root to "item" and "json").
78
+ :param overwrite:
79
+ :param dict id_to_label_map: Optional, {id: label_string} dictionary, default taken from dataset
80
+ :param dict label_to_id_map: Optional, {label_string: id} dictionary
81
+ :param transforms: Optional transform to be applied on a sample. list, imgaug.Sequence or torchvision.transforms.Compose
82
+ :param transforms_callback: Optional function to handle the callback of each batch.
83
+ look at default_transforms_callback for more information. available: imgaug_transforms_callback, torchvision_transforms_callback
84
+ :param num_workers: Optional - number of separate threads to load the images
85
+ :param batch_size: (int, optional): how many samples per batch to load, if not none - items will always be a list
86
+ :param collate_fn: Optional - merges a list of samples to form a mini-batch of Tensor(s).
87
+ :param shuffle: Whether to shuffle the data (default: True) If set to False, sorts the data in alphanumeric order.
88
+ :param seed: Optional random seed for shuffling and transformations.
89
+ :param to_categorical: convert label id to categorical format
90
+ :param to_mask: convert annotations to an instance mask (will be true for SEGMENTATION)
91
+ :param class_balancing: if True - performing random over-sample with class ids as the target to balance training data
92
+ :param return_originals: bool - If True, return ALSO images and annotations before transformations (for debug)
93
+ :param ignore_empty: bool - If True, generator will NOT collect items without annotations
94
+ """
95
+ self._dataset_entity = dataset_entity
96
+
97
+ # default item types (extension for now)
98
+ if item_type is None:
99
+ item_type = ['jpg', 'jpeg', 'png', 'bmp']
100
+ if not isinstance(item_type, list):
101
+ item_type = [item_type]
102
+ self.item_type = item_type
103
+
104
+ # id labels mapping
105
+ if label_to_id_map is None and id_to_label_map is None:
106
+ # if both are None - take from dataset
107
+ label_to_id_map = dataset_entity.instance_map
108
+ id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
109
+ else:
110
+ # one or both is NOT None
111
+ if label_to_id_map is None:
112
+ # set label_to_id_map from the other
113
+ label_to_id_map = {v: int(k) for k, v in id_to_label_map.items()}
114
+ if id_to_label_map is None:
115
+ # set id_to_label_map from the other
116
+ id_to_label_map = {int(v): k for k, v in label_to_id_map.items()}
117
+ # put it on the local ontology for the annotations download
118
+ dataset_entity._get_ontology().instance_map = label_to_id_map
119
+ self.id_to_label_map = id_to_label_map
120
+ self.label_to_id_map = label_to_id_map
121
+
122
+ # if annotation type is segmentation - to_mask must be True
123
+ if annotation_type == entities.AnnotationType.SEGMENTATION:
124
+ to_mask = True
125
+
126
+ if data_path is None:
127
+ data_path = os.path.join(os.path.expanduser('~'),
128
+ '.dataloop',
129
+ 'datasets',
130
+ "{}_{}".format(dataset_entity.name,
131
+ dataset_entity.id))
132
+ download = False
133
+ if os.path.isdir(data_path):
134
+ if overwrite:
135
+ logger.warning('overwrite flag is True! deleting and overwriting')
136
+ shutil.rmtree(data_path)
137
+ download = True
138
+ else:
139
+ download = True
140
+ if download:
141
+ annotation_options = [entities.ViewAnnotationOptions.JSON]
142
+ if to_mask is True:
143
+ annotation_options.append(entities.ViewAnnotationOptions.INSTANCE)
144
+ _ = dataset_entity.items.download(filters=filters,
145
+ local_path=data_path,
146
+ thickness=-1,
147
+ annotation_options=annotation_options)
148
+ self.root_dir = data_path
149
+ self._items_path = Path(self.root_dir).joinpath('items')
150
+ self._json_path = Path(self.root_dir).joinpath('json')
151
+ self._mask_path = Path(self.root_dir).joinpath('instance')
152
+ self._transforms = transforms
153
+ self._transforms_callback = transforms_callback
154
+ if self._transforms is not None and self._transforms_callback is None:
155
+ # use default callback
156
+ self._transforms_callback = default_transforms_callback
157
+
158
+ self.annotation_type = annotation_type
159
+ self.num_workers = num_workers
160
+ self.to_categorical = to_categorical
161
+ self.num_classes = len(label_to_id_map)
162
+ self.shuffle = shuffle
163
+ self.seed = seed
164
+ self.to_mask = to_mask
165
+ self.batch_size = batch_size
166
+ self.collate_fn = collate_fn
167
+ self.class_balancing = class_balancing
168
+ # inits
169
+ self.data_items = list()
170
+ # flags
171
+ self.return_originals = return_originals
172
+ self.ignore_empty = ignore_empty
173
+
174
+ ####################
175
+ # Load annotations #
176
+ ####################
177
+ self.load_annotations()
178
+
179
+ @property
180
+ def dataset_entity(self):
181
+ assert isinstance(self._dataset_entity, entities.Dataset)
182
+ return self._dataset_entity
183
+
184
+ @dataset_entity.setter
185
+ def dataset_entity(self, val):
186
+ assert isinstance(val, entities.Dataset)
187
+ self._dataset_entity = val
188
+
189
+ @property
190
+ def n_items(self):
191
+ return len(self.data_items)
192
+
193
+ def _load_single(self, image_filepath, pbar=None):
194
+ try:
195
+ is_empty = False
196
+ item_info = DataItem()
197
+ # add image path
198
+ item_info.image_filepath = str(image_filepath)
199
+ if os.stat(image_filepath).st_size < 5:
200
+ logger.warning('IGNORING corrupted image: {!r}'.format(image_filepath))
201
+ return None, True
202
+ # get "platform" path
203
+ rel_path = image_filepath.relative_to(self._items_path)
204
+ # replace suffix to JSON
205
+ rel_path_wo_png_ext = rel_path.with_suffix('.json')
206
+ # create local path
207
+ annotation_filepath = Path(self._json_path, rel_path_wo_png_ext)
208
+
209
+ if os.path.isfile(annotation_filepath):
210
+ with open(annotation_filepath, 'r') as f:
211
+ data = json.load(f)
212
+ if 'id' in data:
213
+ item_id = data.get('id')
214
+ elif '_id' in data:
215
+ item_id = data.get('_id')
216
+ annotations = entities.AnnotationCollection.from_json(data)
217
+ else:
218
+ item_id = ''
219
+ annotations = None
220
+ item_info.update(item_id=item_id)
221
+ if self.annotation_type is not None:
222
+ # add item id from json
223
+ polygon_coordinates = list()
224
+ box_coordinates = list()
225
+ classes_ids = list()
226
+ labels = list()
227
+ if annotations is not None:
228
+ for annotation in annotations:
229
+ if 'user' in annotation.metadata and \
230
+ 'model' in annotation.metadata['user']:
231
+ # and 'name' in annotation.metadata['user']['model']:
232
+ # Do not use prediction annotations in the data generator
233
+ continue
234
+ if annotation.type == self.annotation_type:
235
+ if annotation.label not in self.label_to_id_map:
236
+ logger.warning(
237
+ 'Missing label {!r} in label_to_id_map. Skipping.. Use label_to_id_map for other behaviour'.format(
238
+ annotation.label))
239
+ else:
240
+ classes_ids.append(self.label_to_id_map[annotation.label])
241
+ labels.append(annotation.label)
242
+ box_coordinates.append(np.asarray([annotation.left,
243
+ annotation.top,
244
+ annotation.right,
245
+ annotation.bottom]))
246
+ if self.annotation_type == entities.AnnotationType.POLYGON:
247
+ polygon_coordinates.append(annotation.geo)
248
+ if annotation.type not in [entities.AnnotationType.CLASSIFICATION,
249
+ entities.AnnotationType.SEGMENTATION,
250
+ entities.AnnotationType.BOX,
251
+ entities.AnnotationType.POLYGON]:
252
+ raise ValueError('unsupported annotation type: {}'.format(annotation.type))
253
+ dtype = object if self.annotation_type == entities.AnnotationType.POLYGON else None
254
+ # reorder for output
255
+ item_info.update({entities.AnnotationType.BOX.value: np.asarray(box_coordinates).astype(float),
256
+ entities.AnnotationType.CLASSIFICATION.value: np.asarray(classes_ids),
257
+ entities.AnnotationType.POLYGON.value: np.asarray(polygon_coordinates, dtype=dtype),
258
+ 'labels': labels})
259
+ if len(item_info[entities.AnnotationType.CLASSIFICATION.value]) == 0:
260
+ logger.debug('Empty annotation (nothing matched label_to_id_map) for image filename: {}'.format(
261
+ image_filepath))
262
+ is_empty = True
263
+ if self.to_mask:
264
+ # get "platform" path
265
+ rel_path = image_filepath.relative_to(self._items_path)
266
+ # replace suffix to PNG
267
+ rel_path_wo_png_ext = rel_path.with_suffix('.png')
268
+ # create local path
269
+ mask_filepath = Path(self._mask_path, rel_path_wo_png_ext)
270
+ if not os.path.isfile(mask_filepath):
271
+ logger.debug('Empty annotation for image filename: {}'.format(image_filepath))
272
+ is_empty = True
273
+ item_info.update({entities.AnnotationType.SEGMENTATION.value: str(mask_filepath)})
274
+ item_info.update(annotation_filepath=str(annotation_filepath))
275
+ return item_info, is_empty
276
+ except Exception:
277
+ logger.exception('failed loading item in generator! {!r}'.format(image_filepath))
278
+ return None, True
279
+ finally:
280
+ if pbar is not None:
281
+ pbar.update()
282
+
283
+ def load_annotations(self):
284
+ logger.info(f"Collecting items with the following extensions: {self.item_type}")
285
+ files = list()
286
+ for ext in self.item_type:
287
+ # build regex to ignore extension case
288
+ regex = '*.{}'.format(''.join(['[{}{}]'.format(letter.lower(), letter.upper()) for letter in ext]))
289
+ files.extend(self._items_path.rglob(regex))
290
+
291
+ pool = ThreadPoolExecutor(max_workers=32)
292
+ jobs = list()
293
+ pbar = tqdm.tqdm(total=len(files),
294
+ desc='Loading Data Generator',
295
+ disable=self.dataset_entity._client_api.verbose.disable_progress_bar,
296
+ file=sys.stdout)
297
+ for image_filepath in files:
298
+ jobs.append(pool.submit(self._load_single,
299
+ image_filepath=image_filepath,
300
+ pbar=pbar))
301
+ outputs = [job.result() for job in jobs]
302
+ pbar.close()
303
+
304
+ n_items = len(outputs)
305
+ n_empty_items = sum([1 for _, is_empty in outputs if is_empty is True])
306
+
307
+ output_msg = 'Done loading items. Total items loaded: {}.'.format(n_items)
308
+ if n_empty_items > 0:
309
+ output_msg += '{action} {n_empty_items} items without annotations'.format(
310
+ action='IGNORING' if self.ignore_empty else 'INCLUDING',
311
+ n_empty_items=n_empty_items)
312
+
313
+ if self.ignore_empty:
314
+ # take ONLY non-empty
315
+ data_items = [data_item for data_item, is_empty in outputs if is_empty is False]
316
+ else:
317
+ # take all
318
+ data_items = [data_item for data_item, is_empty in outputs]
319
+
320
+ self.data_items = data_items
321
+ if len(self.data_items) == 0:
322
+ logger.warning(output_msg)
323
+ else:
324
+ logger.info(output_msg)
325
+ ###################
326
+ # class balancing #
327
+ ###################
328
+ labels = [label for item in self.data_items for label in item.get('labels', list())]
329
+ logger.info(f"Data Generator labels balance statistics: {collections.Counter(labels)}")
330
+ if self.class_balancing:
331
+ try:
332
+ from imblearn.over_sampling import RandomOverSampler
333
+ except Exception:
334
+ logger.error(
335
+ 'Class balancing is ON but missing "imbalanced-learn". run "pip install -U imbalanced-learn" and try again')
336
+ raise
337
+ logger.info('Class balance is on!')
338
+ class_ids = [class_id for item in self.data_items for class_id in item['class']]
339
+ dummy_inds = [i_item for i_item, item in enumerate(self.data_items) for _ in item['class']]
340
+ over_sampler = RandomOverSampler(random_state=42)
341
+ X_res, y_res = over_sampler.fit_resample(np.asarray(dummy_inds).reshape(-1, 1), np.asarray(class_ids))
342
+ over_sampled_data_items = [self.data_items[i] for i in X_res.flatten()]
343
+ oversampled_labels = [label for item in over_sampled_data_items for label in item['labels']]
344
+ logger.info(f"Data Generator labels after oversampling: {collections.Counter(oversampled_labels)}")
345
+ self.data_items = over_sampled_data_items
346
+
347
+ if self.shuffle:
348
+ if self.seed is None:
349
+ self.seed = 256
350
+ np.random.seed(self.seed)
351
+ np.random.shuffle(self.data_items)
352
+
353
+ def transform(self, image, target=None):
354
+ if self._transforms is not None:
355
+ image, target = self._transforms_callback(transforms=self._transforms,
356
+ image=image,
357
+ target=target,
358
+ annotation_type=self.annotation_type)
359
+ return image, target
360
+
361
+ def _to_dtlpy(self, targets, labels=None):
362
+ annotations = entities.AnnotationCollection(item=None)
363
+ annotations._dataset = self._dataset_entity
364
+ if labels is None:
365
+ labels = [None] * len(targets)
366
+ if self.to_mask is True:
367
+ for label, label_ind in self.label_to_id_map.items():
368
+ target = targets == label_ind
369
+ if np.any(target):
370
+ annotations.add(annotation_definition=entities.Segmentation(geo=target,
371
+ label=label))
372
+ elif self.annotation_type == entities.AnnotationType.BOX:
373
+ for target, label in zip(targets, labels):
374
+ annotations.add(annotation_definition=entities.Box(left=target[0],
375
+ top=target[1],
376
+ right=target[2],
377
+ bottom=target[3],
378
+ label=label))
379
+ elif self.annotation_type == entities.AnnotationType.CLASSIFICATION:
380
+ for target, label in zip(targets, labels):
381
+ annotations.add(annotation_definition=entities.Classification(label=label))
382
+ elif self.annotation_type == entities.AnnotationType.POLYGON:
383
+ for target, label in zip(targets, labels):
384
+ annotations.add(annotation_definition=entities.Polygon(label=label,
385
+ geo=target.astype(float)))
386
+ else:
387
+ raise ValueError('unsupported annotation type: {}'.format(self.annotation_type))
388
+ # set dataset for color
389
+ for annotation in annotations:
390
+ annotation._dataset = self._dataset_entity
391
+ return annotations
392
+
393
+     def visualize(self, idx=None, return_output=False, plot=True):
+         if not self.__len__():
+             raise ValueError('no items selected, cannot perform visualization')
+         import matplotlib.pyplot as plt
+         if idx is None:
+             idx = np.random.randint(self.__len__())
+         if self.batch_size is not None:
+             raise ValueError('can only visualize when batch_size is None')
+         data_item = self.__getitem__(idx)
+         image = Image.fromarray(data_item.get('image'))
+         labels = data_item.get('labels')
+         targets = data_item.get('annotations')
+         annotations = self._to_dtlpy(targets=targets, labels=labels)
+         mask = Image.fromarray(annotations.show(height=image.size[1],
+                                                 width=image.size[0],
+                                                 alpha=0.8))
+         image.paste(mask, (0, 0), mask)
+         marked_image = np.asarray(image)
+         if plot:
+             plt.figure()
+             plt.imshow(marked_image)
+         if return_output:
+             return marked_image, annotations
+
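For orientation, a hypothetical usage sketch of `visualize()`; the generator class name and constructor arguments below are illustrative assumptions, not the exact API of this module:

```python
# hypothetical construction - the real class name and arguments are defined
# earlier in this module and may differ
gen = DatasetGenerator(dataset_entity=dataset,
                       annotation_type=entities.AnnotationType.BOX,
                       batch_size=None)  # visualize() raises if batch_size is set
marked_image, annotations = gen.visualize(idx=0, plot=False, return_output=True)
```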
+     def __getsingleitem__(self, idx):
+         data_item = copy.deepcopy(self.data_items[idx])
+
+         image_filename = data_item.get('image_filepath')
+         image = np.asarray(Image.open(image_filename))
+         data_item.update({'image': image})
+
+         annotations = data_item.get(self.annotation_type)
+         if self.to_mask is True:
+             # for segmentation - read the mask from file
+             mask_filepath = data_item.get(entities.AnnotationType.SEGMENTATION)
+             annotations = np.asarray(Image.open(mask_filepath).convert('L'))
+             if self.to_categorical:
+                 onehot = np.zeros((annotations.size, self.num_classes + 1))
+                 # flatten so the per-pixel class ids index the rows correctly
+                 onehot[np.arange(annotations.size), annotations.flatten()] = 1
+                 annotations = onehot
+         data_item.update({'annotations': annotations})
+
+         if self.return_originals is True:
+             annotations = []
+             if self.annotation_type is not None:
+                 annotations = data_item.get('annotations')
+             data_item.update({'orig_image': image.copy(),
+                               'orig_annotations': annotations.copy()})
+
+         ###########################
+         # perform transformations #
+         ###########################
+         if self._transforms is not None:
+             annotations = data_item.get('annotations')
+             image, annotations = self.transform(image, annotations)
+             data_item.update({'image': image,
+                               'annotations': annotations})
+         return data_item
+
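The `to_categorical` branch one-hot encodes the mask with a plain numpy fancy-indexing trick (note the `.flatten()` added above so a 2-D mask indexes the rows correctly). In isolation, on a flat array of class ids:

```python
import numpy as np

mask = np.array([0, 2, 1])          # flat class ids, one per pixel
num_classes = 2
onehot = np.zeros((mask.size, num_classes + 1))
onehot[np.arange(mask.size), mask] = 1
print(onehot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]
```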
+     def __iter__(self):
+         """Create a generator that iterates over the Sequence."""
+         for item in (self[i] for i in range(len(self))):
+             yield item
+
+     def __len__(self):
+         factor = self.batch_size
+         if factor is None:
+             factor = 1
+         return int(np.ceil(self.n_items / factor))
+
+     def __getitem__(self, idx):
+         """
+         Support a single index or a slice.
+         Uses a ThreadPoolExecutor if num_workers != 0.
+         """
+         to_return = None
+         if isinstance(idx, int):
+             if self.batch_size is None:
+                 to_return = self.__getsingleitem__(idx)
+             else:
+                 # if batch_size is defined, convert the index to a batch slice
+                 idx = slice(idx * self.batch_size, min((idx + 1) * self.batch_size, len(self.data_items)))
+
+         if isinstance(idx, slice):
+             to_return = list()
+             idxs = list(range(idx.start, idx.stop,
+                               idx.step if idx.step else 1))
+             if self.num_workers == 0:
+                 for dx in idxs:
+                     to_return.append(self.__getsingleitem__(dx))
+             else:
+                 with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+                     for sample in executor.map(lambda i: self.__getsingleitem__(i), idxs):
+                         to_return.append(sample)
+
+         if to_return is None:
+             raise TypeError('unsupported indexing: list indices must be integers or slices, not {}'.format(type(idx)))
+
+         if self.collate_fn is not None:
+             to_return = self.collate_fn(to_return)
+         return to_return
+
+
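With `batch_size` set, `__len__` counts batches as `ceil(n_items / batch_size)` and an integer index is expanded into a slice over items, so the last batch may be short. The arithmetic on its own:

```python
import numpy as np

n_items, batch_size = 10, 4
num_batches = int(np.ceil(n_items / batch_size))        # == 3, matching __len__
last = slice(2 * batch_size, min(3 * batch_size, n_items))
print(num_batches, last)                                 # 3 slice(8, 10, None) - the short final batch
```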
+ np_str_obj_array_pattern = re.compile(r'[SaUO]')
+
+ default_collate_err_msg_format = (
+     "default_collate: batch must contain tensors, numpy arrays, numbers, "
+     "dicts or lists; found {}")
+
+
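The `[SaUO]` pattern matches the numpy dtype kind characters for bytes (S), legacy string (a), unicode (U) and object (O) arrays, which none of the collate functions below can stack into tensors. For example:

```python
import re
import numpy as np

np_str_obj_array_pattern = re.compile(r'[SaUO]')
print(np.asarray(['a', 'b']).dtype.str)                        # '<U1' - matched, rejected
print(np_str_obj_array_pattern.search(np.zeros(2).dtype.str))  # None - float64 passes
```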
+ def default_transforms_callback(transforms, image, target, annotation_type):
+     """
+     Recursively apply the augmentations in "transforms".
+
+     :param transforms: a single augmentation, a list of augmentations, a torchvision Compose or an imgaug Sequential
+     :param image: numpy array of the image
+     :param target: annotations matching `annotation_type` (or None)
+     :param annotation_type: entities.AnnotationType of the target
+     :return: the augmented (image, target) pair
+     """
+     # get the type string without importing any other package
+     transforms_type = type(transforms)
+
+     ############
+     # Handle compositions and lists of augmentations with a recursive call
+     if transforms_type.__module__ == 'torchvision.transforms.transforms' and transforms_type.__name__ == 'Compose':
+         # torchvision Compose - unwrap to a list
+         image, target = default_transforms_callback(transforms.transforms, image, target, annotation_type)
+         return image, target
+
+     if transforms_type.__module__ == 'imgaug.augmenters.meta' and transforms_type.__name__ == 'Sequential':
+         # imgaug Sequential - unwrap to a list
+         image, target = default_transforms_callback(list(transforms), image, target, annotation_type)
+         return image, target
+
+     if isinstance(transforms, list):
+         for t in transforms:
+             image, target = default_transforms_callback(t, image, target, annotation_type)
+         return image, target
+
+     ##############
+     # Handle a single augmentation
+     if 'imgaug.augmenters' in transforms_type.__module__:
+         # handle a single imgaug augmentation
+         if target is not None and annotation_type is not None:
+             # imgaug's batch API, applied to a single image
+             if annotation_type == entities.AnnotationType.BOX:
+                 image, target = transforms(images=[image], bounding_boxes=[target])
+                 target = target[0]
+             elif annotation_type == entities.AnnotationType.SEGMENTATION:
+                 # expanding to HxWx1 for the imgaug function to work
+                 target = target[..., None]
+                 image, target = transforms(images=[image], segmentation_maps=[target])
+                 target = target[0][:, :, 0]
+             elif annotation_type == entities.AnnotationType.POLYGON:
+                 image, target = transforms(images=[image], polygons=[target])
+                 target = target[0]
+             elif annotation_type == entities.AnnotationType.CLASSIFICATION:
+                 image = transforms(images=[image])
+             else:
+                 raise ValueError('unsupported annotations type for image augmentations: {}'.format(annotation_type))
+             image = image[0]
+         else:
+             image = transforms(images=[image])
+             image = image[0]
+     else:
+         image = transforms(image)
+
+     return image, target
+
+
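A runnable sketch of the callback on the classification path (no targets), assuming only that imgaug is installed; `Fliplr` and `Add` are standard imgaug augmenters, and the Sequential is unwrapped recursively exactly as above:

```python
import numpy as np
import imgaug.augmenters as iaa

seq = iaa.Sequential([iaa.Fliplr(1.0), iaa.Add(10)])   # flip, then brighten
image = np.zeros((64, 64, 3), dtype=np.uint8)          # toy image
image_aug, target = default_transforms_callback(seq, image, target=None, annotation_type=None)
assert image_aug.shape == image.shape                  # geometry preserved here
```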
+ def collate_default(batch):
+     r"""Puts each data field into an array with outer dimension batch size"""
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, np.ndarray):
+         return np.stack(batch, axis=0)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             return batch
+         elif elem.shape == ():  # scalars
+             return batch
+     elif isinstance(elem, float):
+         return batch
+     elif isinstance(elem, int):
+         return batch
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_default([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_default(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         # collate each transposed field recursively (a bare zip is a one-shot iterator)
+         return [collate_default(samples) for samples in zip(*batch)]
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
+
+
+ def collate_torch(batch):
+     r"""Puts each data field into a tensor with outer dimension batch size"""
+     import torch
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, torch.Tensor):
+         out = None
+         if torch.utils.data.get_worker_info() is not None:
+             # If we're in a background process, concatenate directly into a
+             # shared memory tensor to avoid an extra copy
+             numel = sum(x.numel() for x in batch)
+             storage = elem.storage()._new_shared(numel)
+             out = elem.new(storage)
+         return torch.stack(batch, 0, out=out)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             try:
+                 return torch.stack([torch.as_tensor(b) for b in batch])
+             except RuntimeError:
+                 # shapes mismatch (e.g. orig_image is not resized) - cannot stack
+                 return batch
+         elif elem.shape == ():  # scalars
+             return torch.as_tensor(batch)
+     elif isinstance(elem, float):
+         return torch.tensor(batch, dtype=torch.float64)
+     elif isinstance(elem, int):
+         return torch.tensor(batch)
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_torch([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_torch(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         # collate each transposed field recursively (a bare zip is a one-shot iterator)
+         return [collate_torch(samples) for samples in zip(*batch)]
+
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
+
+
+ def collate_tf(batch):
+     r"""Puts each data field into a tensor with outer dimension batch size"""
+     import tensorflow as tf
+     elem = batch[0]
+     elem_type = type(elem)
+     if isinstance(elem, tf.Tensor):
+         return tf.stack(batch, axis=0)
+     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' and elem_type.__name__ != 'string_':
+         if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+             # array of string classes and object
+             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+             try:
+                 return tf.convert_to_tensor(batch)
+             except ValueError:
+                 # shapes mismatch (e.g. orig_image is not resized) - cannot stack
+                 return batch
+         elif elem.shape == ():  # scalars
+             return tf.convert_to_tensor(batch)
+     elif isinstance(elem, float):
+         return tf.convert_to_tensor(batch, dtype=tf.float64)
+     elif isinstance(elem, int):
+         return tf.convert_to_tensor(batch)
+     elif isinstance(elem, str) or isinstance(elem, bytes) or elem is None:
+         return batch
+     elif isinstance(elem, collections.abc.Mapping):
+         return {key: collate_tf([d[key] for d in batch]) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(collate_tf(samples) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         # collate each transposed field recursively (a bare zip is a one-shot iterator)
+         return [collate_tf(samples) for samples in zip(*batch)]
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
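To make the collate behavior concrete, a small worked example with the generator-style dict samples that `collate_default` receives (toy data, numpy only): arrays of matching shape are stacked on a new batch axis, while strings pass through as a list.

```python
import numpy as np

batch = [{'image': np.zeros((2, 2, 3)), 'filename': 'a.jpg'},
         {'image': np.ones((2, 2, 3)), 'filename': 'b.jpg'}]
out = collate_default(batch)
print(out['image'].shape)  # (2, 2, 2, 3) - images stacked on a new batch axis
print(out['filename'])     # ['a.jpg', 'b.jpg'] - strings are returned as a list
```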