dtlpy-1.114.17-py3-none-any.whl → dtlpy-1.116.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
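Both wheels are publicly downloadable, so a comparison like this can be regenerated locally. Below is a minimal sketch of one way to rebuild a hunk such as the `dtlpy/repositories/datasets.py` diff rendered at the bottom of this page; the local wheel filenames and the `pip download` commands in the comments are assumptions about where the files were saved, not details taken from this diff.

```python
# Assumes the two wheels were fetched first, e.g.:
#   pip download dtlpy==1.114.17 --no-deps -d .
#   pip download dtlpy==1.116.6 --no-deps -d .
import difflib
import zipfile

OLD_WHEEL = "dtlpy-1.114.17-py3-none-any.whl"   # assumed local filename
NEW_WHEEL = "dtlpy-1.116.6-py3-none-any.whl"    # assumed local filename
MEMBER = "dtlpy/repositories/datasets.py"       # file diffed in the hunk below


def read_lines(wheel_path, member):
    # A wheel is a plain zip archive, so any member can be read directly.
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)


old_lines = read_lines(OLD_WHEEL, MEMBER)
new_lines = read_lines(NEW_WHEEL, MEMBER)

# Emit a unified diff comparable to the rendered hunk.
for line in difflib.unified_diff(old_lines, new_lines,
                                 fromfile=f"{OLD_WHEEL}/{MEMBER}",
                                 tofile=f"{NEW_WHEEL}/{MEMBER}"):
    print(line, end="")
```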
Files changed (238)
  1. dtlpy/__init__.py +491 -491
  2. dtlpy/__version__.py +1 -1
  3. dtlpy/assets/__init__.py +26 -26
  4. dtlpy/assets/code_server/config.yaml +2 -2
  5. dtlpy/assets/code_server/installation.sh +24 -24
  6. dtlpy/assets/code_server/launch.json +13 -13
  7. dtlpy/assets/code_server/settings.json +2 -2
  8. dtlpy/assets/main.py +53 -53
  9. dtlpy/assets/main_partial.py +18 -18
  10. dtlpy/assets/mock.json +11 -11
  11. dtlpy/assets/model_adapter.py +83 -83
  12. dtlpy/assets/package.json +61 -61
  13. dtlpy/assets/package_catalog.json +29 -29
  14. dtlpy/assets/package_gitignore +307 -307
  15. dtlpy/assets/service_runners/__init__.py +33 -33
  16. dtlpy/assets/service_runners/converter.py +96 -96
  17. dtlpy/assets/service_runners/multi_method.py +49 -49
  18. dtlpy/assets/service_runners/multi_method_annotation.py +54 -54
  19. dtlpy/assets/service_runners/multi_method_dataset.py +55 -55
  20. dtlpy/assets/service_runners/multi_method_item.py +52 -52
  21. dtlpy/assets/service_runners/multi_method_json.py +52 -52
  22. dtlpy/assets/service_runners/single_method.py +37 -37
  23. dtlpy/assets/service_runners/single_method_annotation.py +43 -43
  24. dtlpy/assets/service_runners/single_method_dataset.py +43 -43
  25. dtlpy/assets/service_runners/single_method_item.py +41 -41
  26. dtlpy/assets/service_runners/single_method_json.py +42 -42
  27. dtlpy/assets/service_runners/single_method_multi_input.py +45 -45
  28. dtlpy/assets/voc_annotation_template.xml +23 -23
  29. dtlpy/caches/base_cache.py +32 -32
  30. dtlpy/caches/cache.py +473 -473
  31. dtlpy/caches/dl_cache.py +201 -201
  32. dtlpy/caches/filesystem_cache.py +89 -89
  33. dtlpy/caches/redis_cache.py +84 -84
  34. dtlpy/dlp/__init__.py +20 -20
  35. dtlpy/dlp/cli_utilities.py +367 -367
  36. dtlpy/dlp/command_executor.py +764 -764
  37. dtlpy/dlp/dlp +1 -1
  38. dtlpy/dlp/dlp.bat +1 -1
  39. dtlpy/dlp/dlp.py +128 -128
  40. dtlpy/dlp/parser.py +651 -651
  41. dtlpy/entities/__init__.py +83 -83
  42. dtlpy/entities/analytic.py +347 -311
  43. dtlpy/entities/annotation.py +1879 -1879
  44. dtlpy/entities/annotation_collection.py +699 -699
  45. dtlpy/entities/annotation_definitions/__init__.py +20 -20
  46. dtlpy/entities/annotation_definitions/base_annotation_definition.py +100 -100
  47. dtlpy/entities/annotation_definitions/box.py +195 -195
  48. dtlpy/entities/annotation_definitions/classification.py +67 -67
  49. dtlpy/entities/annotation_definitions/comparison.py +72 -72
  50. dtlpy/entities/annotation_definitions/cube.py +204 -204
  51. dtlpy/entities/annotation_definitions/cube_3d.py +149 -149
  52. dtlpy/entities/annotation_definitions/description.py +32 -32
  53. dtlpy/entities/annotation_definitions/ellipse.py +124 -124
  54. dtlpy/entities/annotation_definitions/free_text.py +62 -62
  55. dtlpy/entities/annotation_definitions/gis.py +69 -69
  56. dtlpy/entities/annotation_definitions/note.py +139 -139
  57. dtlpy/entities/annotation_definitions/point.py +117 -117
  58. dtlpy/entities/annotation_definitions/polygon.py +182 -182
  59. dtlpy/entities/annotation_definitions/polyline.py +111 -111
  60. dtlpy/entities/annotation_definitions/pose.py +92 -92
  61. dtlpy/entities/annotation_definitions/ref_image.py +86 -86
  62. dtlpy/entities/annotation_definitions/segmentation.py +240 -240
  63. dtlpy/entities/annotation_definitions/subtitle.py +34 -34
  64. dtlpy/entities/annotation_definitions/text.py +85 -85
  65. dtlpy/entities/annotation_definitions/undefined_annotation.py +74 -74
  66. dtlpy/entities/app.py +220 -220
  67. dtlpy/entities/app_module.py +107 -107
  68. dtlpy/entities/artifact.py +174 -174
  69. dtlpy/entities/assignment.py +399 -399
  70. dtlpy/entities/base_entity.py +214 -214
  71. dtlpy/entities/bot.py +113 -113
  72. dtlpy/entities/codebase.py +292 -296
  73. dtlpy/entities/collection.py +38 -38
  74. dtlpy/entities/command.py +169 -169
  75. dtlpy/entities/compute.py +449 -442
  76. dtlpy/entities/dataset.py +1299 -1285
  77. dtlpy/entities/directory_tree.py +44 -44
  78. dtlpy/entities/dpk.py +470 -470
  79. dtlpy/entities/driver.py +235 -223
  80. dtlpy/entities/execution.py +397 -397
  81. dtlpy/entities/feature.py +124 -124
  82. dtlpy/entities/feature_set.py +145 -145
  83. dtlpy/entities/filters.py +798 -645
  84. dtlpy/entities/gis_item.py +107 -107
  85. dtlpy/entities/integration.py +184 -184
  86. dtlpy/entities/item.py +959 -953
  87. dtlpy/entities/label.py +123 -123
  88. dtlpy/entities/links.py +85 -85
  89. dtlpy/entities/message.py +175 -175
  90. dtlpy/entities/model.py +684 -684
  91. dtlpy/entities/node.py +1005 -1005
  92. dtlpy/entities/ontology.py +810 -803
  93. dtlpy/entities/organization.py +287 -287
  94. dtlpy/entities/package.py +657 -657
  95. dtlpy/entities/package_defaults.py +5 -5
  96. dtlpy/entities/package_function.py +185 -185
  97. dtlpy/entities/package_module.py +113 -113
  98. dtlpy/entities/package_slot.py +118 -118
  99. dtlpy/entities/paged_entities.py +299 -299
  100. dtlpy/entities/pipeline.py +624 -624
  101. dtlpy/entities/pipeline_execution.py +279 -279
  102. dtlpy/entities/project.py +394 -394
  103. dtlpy/entities/prompt_item.py +505 -499
  104. dtlpy/entities/recipe.py +301 -301
  105. dtlpy/entities/reflect_dict.py +102 -102
  106. dtlpy/entities/resource_execution.py +138 -138
  107. dtlpy/entities/service.py +963 -958
  108. dtlpy/entities/service_driver.py +117 -117
  109. dtlpy/entities/setting.py +294 -294
  110. dtlpy/entities/task.py +495 -495
  111. dtlpy/entities/time_series.py +143 -143
  112. dtlpy/entities/trigger.py +426 -426
  113. dtlpy/entities/user.py +118 -118
  114. dtlpy/entities/webhook.py +124 -124
  115. dtlpy/examples/__init__.py +19 -19
  116. dtlpy/examples/add_labels.py +135 -135
  117. dtlpy/examples/add_metadata_to_item.py +21 -21
  118. dtlpy/examples/annotate_items_using_model.py +65 -65
  119. dtlpy/examples/annotate_video_using_model_and_tracker.py +75 -75
  120. dtlpy/examples/annotations_convert_to_voc.py +9 -9
  121. dtlpy/examples/annotations_convert_to_yolo.py +9 -9
  122. dtlpy/examples/convert_annotation_types.py +51 -51
  123. dtlpy/examples/converter.py +143 -143
  124. dtlpy/examples/copy_annotations.py +22 -22
  125. dtlpy/examples/copy_folder.py +31 -31
  126. dtlpy/examples/create_annotations.py +51 -51
  127. dtlpy/examples/create_video_annotations.py +83 -83
  128. dtlpy/examples/delete_annotations.py +26 -26
  129. dtlpy/examples/filters.py +113 -113
  130. dtlpy/examples/move_item.py +23 -23
  131. dtlpy/examples/play_video_annotation.py +13 -13
  132. dtlpy/examples/show_item_and_mask.py +53 -53
  133. dtlpy/examples/triggers.py +49 -49
  134. dtlpy/examples/upload_batch_of_items.py +20 -20
  135. dtlpy/examples/upload_items_and_custom_format_annotations.py +55 -55
  136. dtlpy/examples/upload_items_with_modalities.py +43 -43
  137. dtlpy/examples/upload_segmentation_annotations_from_mask_image.py +44 -44
  138. dtlpy/examples/upload_yolo_format_annotations.py +70 -70
  139. dtlpy/exceptions.py +125 -125
  140. dtlpy/miscellaneous/__init__.py +20 -20
  141. dtlpy/miscellaneous/dict_differ.py +95 -95
  142. dtlpy/miscellaneous/git_utils.py +217 -217
  143. dtlpy/miscellaneous/json_utils.py +14 -14
  144. dtlpy/miscellaneous/list_print.py +105 -105
  145. dtlpy/miscellaneous/zipping.py +130 -130
  146. dtlpy/ml/__init__.py +20 -20
  147. dtlpy/ml/base_feature_extractor_adapter.py +27 -27
  148. dtlpy/ml/base_model_adapter.py +1257 -1086
  149. dtlpy/ml/metrics.py +461 -461
  150. dtlpy/ml/predictions_utils.py +274 -274
  151. dtlpy/ml/summary_writer.py +57 -57
  152. dtlpy/ml/train_utils.py +60 -60
  153. dtlpy/new_instance.py +252 -252
  154. dtlpy/repositories/__init__.py +56 -56
  155. dtlpy/repositories/analytics.py +85 -85
  156. dtlpy/repositories/annotations.py +916 -916
  157. dtlpy/repositories/apps.py +383 -383
  158. dtlpy/repositories/artifacts.py +452 -452
  159. dtlpy/repositories/assignments.py +599 -599
  160. dtlpy/repositories/bots.py +213 -213
  161. dtlpy/repositories/codebases.py +559 -559
  162. dtlpy/repositories/collections.py +332 -332
  163. dtlpy/repositories/commands.py +152 -158
  164. dtlpy/repositories/compositions.py +61 -61
  165. dtlpy/repositories/computes.py +439 -435
  166. dtlpy/repositories/datasets.py +1504 -1291
  167. dtlpy/repositories/downloader.py +976 -903
  168. dtlpy/repositories/dpks.py +433 -433
  169. dtlpy/repositories/drivers.py +482 -470
  170. dtlpy/repositories/executions.py +815 -817
  171. dtlpy/repositories/feature_sets.py +226 -226
  172. dtlpy/repositories/features.py +255 -238
  173. dtlpy/repositories/integrations.py +484 -484
  174. dtlpy/repositories/items.py +912 -909
  175. dtlpy/repositories/messages.py +94 -94
  176. dtlpy/repositories/models.py +1000 -988
  177. dtlpy/repositories/nodes.py +80 -80
  178. dtlpy/repositories/ontologies.py +511 -511
  179. dtlpy/repositories/organizations.py +525 -525
  180. dtlpy/repositories/packages.py +1941 -1941
  181. dtlpy/repositories/pipeline_executions.py +451 -451
  182. dtlpy/repositories/pipelines.py +640 -640
  183. dtlpy/repositories/projects.py +539 -539
  184. dtlpy/repositories/recipes.py +419 -399
  185. dtlpy/repositories/resource_executions.py +137 -137
  186. dtlpy/repositories/schema.py +120 -120
  187. dtlpy/repositories/service_drivers.py +213 -213
  188. dtlpy/repositories/services.py +1704 -1704
  189. dtlpy/repositories/settings.py +339 -339
  190. dtlpy/repositories/tasks.py +1477 -1477
  191. dtlpy/repositories/times_series.py +278 -278
  192. dtlpy/repositories/triggers.py +536 -536
  193. dtlpy/repositories/upload_element.py +257 -257
  194. dtlpy/repositories/uploader.py +661 -651
  195. dtlpy/repositories/webhooks.py +249 -249
  196. dtlpy/services/__init__.py +22 -22
  197. dtlpy/services/aihttp_retry.py +131 -131
  198. dtlpy/services/api_client.py +1785 -1782
  199. dtlpy/services/api_reference.py +40 -40
  200. dtlpy/services/async_utils.py +133 -133
  201. dtlpy/services/calls_counter.py +44 -44
  202. dtlpy/services/check_sdk.py +68 -68
  203. dtlpy/services/cookie.py +115 -115
  204. dtlpy/services/create_logger.py +156 -156
  205. dtlpy/services/events.py +84 -84
  206. dtlpy/services/logins.py +235 -235
  207. dtlpy/services/reporter.py +256 -256
  208. dtlpy/services/service_defaults.py +91 -91
  209. dtlpy/utilities/__init__.py +20 -20
  210. dtlpy/utilities/annotations/__init__.py +16 -16
  211. dtlpy/utilities/annotations/annotation_converters.py +269 -269
  212. dtlpy/utilities/base_package_runner.py +285 -264
  213. dtlpy/utilities/converter.py +1650 -1650
  214. dtlpy/utilities/dataset_generators/__init__.py +1 -1
  215. dtlpy/utilities/dataset_generators/dataset_generator.py +670 -670
  216. dtlpy/utilities/dataset_generators/dataset_generator_tensorflow.py +23 -23
  217. dtlpy/utilities/dataset_generators/dataset_generator_torch.py +21 -21
  218. dtlpy/utilities/local_development/__init__.py +1 -1
  219. dtlpy/utilities/local_development/local_session.py +179 -179
  220. dtlpy/utilities/reports/__init__.py +2 -2
  221. dtlpy/utilities/reports/figures.py +343 -343
  222. dtlpy/utilities/reports/report.py +71 -71
  223. dtlpy/utilities/videos/__init__.py +17 -17
  224. dtlpy/utilities/videos/video_player.py +598 -598
  225. dtlpy/utilities/videos/videos.py +470 -470
  226. {dtlpy-1.114.17.data → dtlpy-1.116.6.data}/scripts/dlp +1 -1
  227. dtlpy-1.116.6.data/scripts/dlp.bat +2 -0
  228. {dtlpy-1.114.17.data → dtlpy-1.116.6.data}/scripts/dlp.py +128 -128
  229. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/METADATA +186 -183
  230. dtlpy-1.116.6.dist-info/RECORD +239 -0
  231. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/WHEEL +1 -1
  232. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/licenses/LICENSE +200 -200
  233. tests/features/environment.py +551 -551
  234. dtlpy/assets/__pycache__/__init__.cpython-310.pyc +0 -0
  235. dtlpy-1.114.17.data/scripts/dlp.bat +0 -2
  236. dtlpy-1.114.17.dist-info/RECORD +0 -240
  237. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/entry_points.txt +0 -0
  238. {dtlpy-1.114.17.dist-info → dtlpy-1.116.6.dist-info}/top_level.txt +0 -0
dtlpy/repositories/datasets.py
@@ -1,1291 +1,1504 @@
1
- """
2
- Datasets Repository
3
- """
4
-
5
- import os
6
- import sys
7
- import time
8
- import copy
9
- import tqdm
10
- import logging
11
- import json
12
- from typing import Union
13
-
14
- from .. import entities, repositories, miscellaneous, exceptions, services, PlatformException, _api_reference
15
- from ..services.api_client import ApiClient
16
-
17
- logger = logging.getLogger(name='dtlpy')
18
-
19
-
20
- class Datasets:
21
- """
22
- Datasets Repository
23
-
24
- The Datasets class allows the user to manage datasets. Read more about datasets in our `documentation <https://dataloop.ai/docs/dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/manage_datasets/chapter/>`_.
25
- """
26
-
27
- def __init__(self, client_api: ApiClient, project: entities.Project = None):
28
- self._client_api = client_api
29
- self._project = project
30
-
31
- ############
32
- # entities #
33
- ############
34
- @property
35
- def project(self) -> entities.Project:
36
- if self._project is None:
37
- # try get checkout
38
- project = self._client_api.state_io.get('project')
39
- if project is not None:
40
- self._project = entities.Project.from_json(_json=project, client_api=self._client_api)
41
- if self._project is None:
42
- raise exceptions.PlatformException(
43
- error='2001',
44
- message='Cannot perform action WITHOUT Project entity in Datasets repository.'
45
- ' Please checkout or set a project')
46
- assert isinstance(self._project, entities.Project)
47
- return self._project
48
-
49
- @project.setter
50
- def project(self, project: entities.Project):
51
- if not isinstance(project, entities.Project):
52
- raise ValueError('Must input a valid Project entity')
53
- self._project = project
54
-
55
- ###########
56
- # methods #
57
- ###########
58
- def __get_from_cache(self) -> entities.Dataset:
59
- dataset = self._client_api.state_io.get('dataset')
60
- if dataset is not None:
61
- dataset = entities.Dataset.from_json(_json=dataset,
62
- client_api=self._client_api,
63
- datasets=self,
64
- project=self._project)
65
- return dataset
66
-
67
- def __get_by_id(self, dataset_id) -> entities.Dataset:
68
- success, response = self._client_api.gen_request(req_type='get',
69
- path='/datasets/{}'.format(dataset_id))
70
- if dataset_id is None or dataset_id == '':
71
- raise exceptions.PlatformException('400', 'Please checkout a dataset')
72
-
73
- if success:
74
- dataset = entities.Dataset.from_json(client_api=self._client_api,
75
- _json=response.json(),
76
- datasets=self,
77
- project=self._project)
78
- else:
79
- raise exceptions.PlatformException(response)
80
- return dataset
81
-
82
- def __get_by_identifier(self, identifier=None) -> entities.Dataset:
83
- datasets = self.list()
84
- datasets_by_name = [dataset for dataset in datasets if identifier in dataset.name or identifier in dataset.id]
85
- if len(datasets_by_name) == 1:
86
- return datasets_by_name[0]
87
- elif len(datasets_by_name) > 1:
88
- raise Exception('Multiple datasets with this name exist')
89
- else:
90
- raise Exception("Dataset not found")
91
-
92
- def _bulid_folder_filter(self, folder_path, filters=None):
93
- if filters is None:
94
- filters = entities.Filters()
95
- filters._user_query = 'false'
96
- if not folder_path.startswith('/'):
97
- folder_path = '/' + folder_path
98
- filters.add(field='dir', values=folder_path, method=entities.FiltersMethod.OR)
99
- if not folder_path.endswith('*'):
100
- if not folder_path.endswith('/'):
101
- folder_path += '/'
102
- filters.add(field='dir', values=folder_path + '*', method=entities.FiltersMethod.OR)
103
- return filters
104
-
105
- def _get_binaries_dataset(self):
106
- filters = entities.Filters(resource=entities.FiltersResource.DATASET)
107
- filters.add(field='name', values='Binaries')
108
- filters.system_space = True
109
- datasets = self.list(filters=filters)
110
- if len(datasets) == 0:
111
- # empty list
112
- raise exceptions.PlatformException('404', 'Dataset not found. Name: "Binaries"')
113
- # dataset = None
114
- elif len(datasets) > 1:
115
- raise exceptions.PlatformException('400', 'More than one dataset with same name.')
116
- else:
117
- dataset = datasets[0]
118
- return dataset
119
-
120
- def _resolve_dataset_id(self, dataset, dataset_name, dataset_id):
121
- if dataset is None and dataset_name is None and dataset_id is None:
122
- raise ValueError('Must provide dataset, dataset name or dataset id')
123
- if dataset_id is None:
124
- if dataset is None:
125
- dataset = self.get(dataset_name=dataset_name)
126
- dataset_id = dataset.id
127
- return dataset_id
128
-
129
- @staticmethod
130
- def _build_payload(filters, include_feature_vectors, include_annotations,
131
- export_type, annotation_filters, feature_vector_filters, dataset_lock, lock_timeout_sec, export_summary):
132
- valid_list = [e.value for e in entities.ExportType]
133
- valid_types = ', '.join(valid_list)
134
- if export_type not in ['json', 'zip']:
135
- raise ValueError('export_type must be one of the following: {}'.format(valid_types))
136
- payload = {'exportType': export_type}
137
- if filters is None:
138
- filters = entities.Filters()
139
-
140
- if isinstance(filters, entities.Filters):
141
- payload['itemsQuery'] = {'filter': filters.prepare()['filter'], 'join': filters.prepare().get("join", {})}
142
- elif isinstance(filters, dict):
143
- payload['itemsQuery'] = filters
144
- else:
145
- raise exceptions.BadRequest(message='filters must be of type dict or Filters', status_code=500)
146
-
147
- payload['itemsVectorQuery'] = {}
148
- if include_feature_vectors:
149
- payload['includeItemVectors'] = True
150
- payload['itemsVectorQuery']['select'] = {"datasetId": 1, 'featureSetId': 1, 'value': 1}
151
-
152
- if feature_vector_filters is not None:
153
- payload['itemsVectorQuery']['filter'] = feature_vector_filters.prepare()['filter']
154
-
155
- payload['annotations'] = {"include": include_annotations, "convertSemantic": False}
156
-
157
- if annotation_filters is not None:
158
- payload['annotationsQuery'] = annotation_filters.prepare()['filter']
159
- payload['annotations']['filter'] = True
160
-
161
- if dataset_lock:
162
- payload['datasetLock'] = dataset_lock
163
-
164
- if export_summary:
165
- payload['summary'] = export_summary
166
-
167
- if lock_timeout_sec:
168
- payload['lockTimeoutSec'] = lock_timeout_sec
169
-
170
- return payload
171
-
172
- def _download_exported_item(self, item_id, export_type, local_path=None):
173
- export_item = repositories.Items(client_api=self._client_api).get(item_id=item_id)
174
- export_item_path = export_item.download(local_path=local_path)
175
-
176
- if export_type == entities.ExportType.ZIP:
177
- # unzipping annotations to directory
178
- if isinstance(export_item_path, list) or not os.path.isfile(export_item_path):
179
- raise exceptions.PlatformException(
180
- error='404',
181
- message='error downloading annotation zip file. see above for more information. item id: {!r}'.format(
182
- export_item.id))
183
- try:
184
- miscellaneous.Zipping.unzip_directory(zip_filename=export_item_path,
185
- to_directory=local_path)
186
- except Exception as e:
187
- logger.warning("Failed to extract zip file error: {}".format(e))
188
- finally:
189
- # cleanup
190
- if isinstance(export_item_path, str) and os.path.isfile(export_item_path):
191
- os.remove(export_item_path)
192
-
193
- @property
194
- def platform_url(self):
195
- return self._client_api._get_resource_url("projects/{}/datasets".format(self.project.id))
196
-
197
- def open_in_web(self,
198
- dataset_name: str = None,
199
- dataset_id: str = None,
200
- dataset: entities.Dataset = None):
201
- """
202
- Open the dataset in web platform.
203
-
204
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
205
-
206
- :param str dataset_name: The Name of the dataset
207
- :param str dataset_id: The Id of the dataset
208
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
209
-
210
- **Example**:
211
-
212
- .. code-block:: python
213
-
214
- project.datasets.open_in_web(dataset_id='dataset_id')
215
- """
216
- if dataset_name is not None:
217
- dataset = self.get(dataset_name=dataset_name)
218
- if dataset is not None:
219
- dataset.open_in_web()
220
- elif dataset_id is not None:
221
- self._client_api._open_in_web(url=f'{self.platform_url}/{dataset_id}/items')
222
- else:
223
- self._client_api._open_in_web(url=self.platform_url)
224
-
225
- def checkout(self,
226
- identifier: str = None,
227
- dataset_name: str = None,
228
- dataset_id: str = None,
229
- dataset: entities.Dataset = None):
230
- """
231
- Checkout (switch) to a dataset to work on it.
232
-
233
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
234
-
235
- You must provide at least ONE of the following params: dataset_id, dataset_name.
236
-
237
- :param str identifier: project name or partial id that you wish to switch
238
- :param str dataset_name: The Name of the dataset
239
- :param str dataset_id: The Id of the dataset
240
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
241
-
242
- **Example**:
243
-
244
- .. code-block:: python
245
-
246
- project.datasets.checkout(dataset_id='dataset_id')
247
- """
248
- if dataset is None:
249
- if dataset_id is not None or dataset_name is not None:
250
- try:
251
- dataset = self.project.datasets.get(dataset_name=dataset_name, dataset_id=dataset_id)
252
- except exceptions.MissingEntity:
253
- dataset = self.get(dataset_id=dataset_id, dataset_name=dataset_name)
254
- elif identifier is not None:
255
- dataset = self.__get_by_identifier(identifier=identifier)
256
- else:
257
- raise exceptions.PlatformException(error='400',
258
- message='Must provide partial/full id/name to checkout')
259
- self._client_api.state_io.put('dataset', dataset.to_json())
260
- logger.info('Checked out to dataset {}'.format(dataset.name))
261
-
262
- @_api_reference.add(path='/datasets/query', method='post')
263
- def list(self, name=None, creator=None, filters: entities.Filters = None) -> miscellaneous.List[entities.Dataset]:
264
- """
265
- List all datasets.
266
-
267
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
268
-
269
- :param str name: list by name
270
- :param str creator: list by
271
- :param dtlpy.entities.filters.Filters filters: Filters entity containing filters parameters
272
- :return: List of datasets
273
- :rtype: list
274
-
275
- **Example**:
276
-
277
- .. code-block:: python
278
- filters = dl.Filters(resource='datasets')
279
- filters.add(field='readonly', values=False)
280
- datasets = project.datasets.list(filters=filters)
281
- """
282
- if filters is None:
283
- filters = entities.Filters(resource=entities.FiltersResource.DATASET)
284
- # assert type filters
285
- elif not isinstance(filters, entities.Filters):
286
- raise exceptions.PlatformException(error='400',
287
- message='Unknown filters type: {!r}'.format(type(filters)))
288
- if filters.resource != entities.FiltersResource.DATASET:
289
- raise exceptions.PlatformException(
290
- error='400',
291
- message='Filters resource must to be FiltersResource.DATASET. Got: {!r}'.format(filters.resource))
292
-
293
- url = '/datasets/query'
294
-
295
- if name is not None:
296
- filters.add(field='name', values=name)
297
- if creator is not None:
298
- filters.add(field='creator', values=creator)
299
- if self._project is not None:
300
- filters.context = {"projects": [self._project.id]}
301
- filters.page_size = 1000
302
- filters.page = 0
303
- datasets = list()
304
- while True:
305
- success, response = self._client_api.gen_request(req_type='POST',
306
- json_req=filters.prepare(),
307
- path=url,
308
- headers={'user_query': filters._user_query})
309
- if success:
310
- pool = self._client_api.thread_pools('entity.create')
311
- datasets_json = response.json()['items']
312
- jobs = [None for _ in range(len(datasets_json))]
313
- # return triggers list
314
- for i_dataset, dataset in enumerate(datasets_json):
315
- jobs[i_dataset] = pool.submit(entities.Dataset._protected_from_json,
316
- **{'client_api': self._client_api,
317
- '_json': dataset,
318
- 'datasets': self,
319
- 'project': self.project})
320
-
321
- # get all results
322
- results = [j.result() for j in jobs]
323
- # log errors
324
- _ = [logger.warning(r[1]) for r in results if r[0] is False]
325
- # return good jobs
326
- datasets.extend([r[1] for r in results if r[0] is True])
327
- if response.json()['hasNextPage'] is True:
328
- filters.page += 1
329
- else:
330
- break
331
- else:
332
- raise exceptions.PlatformException(response)
333
- datasets = miscellaneous.List(datasets)
334
- return datasets
335
-
336
- @_api_reference.add(path='/datasets/{id}', method='get')
337
- def get(self,
338
- dataset_name: str = None,
339
- dataset_id: str = None,
340
- checkout: bool = False,
341
- fetch: bool = None
342
- ) -> entities.Dataset:
343
- """
344
- Get dataset by name or id.
345
-
346
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
347
-
348
- You must provide at least ONE of the following params: dataset_id, dataset_name.
349
-
350
- :param str dataset_name: optional - search by name
351
- :param str dataset_id: optional - search by id
352
- :param bool checkout: set the dataset as a default dataset object (cookies)
353
- :param bool fetch: optional - fetch entity from platform (True), default taken from cookie
354
- :return: Dataset object
355
- :rtype: dtlpy.entities.dataset.Dataset
356
-
357
- **Example**:
358
-
359
- .. code-block:: python
360
-
361
- dataset = project.datasets.get(dataset_id='dataset_id')
362
- """
363
- if fetch is None:
364
- fetch = self._client_api.fetch_entities
365
-
366
- if dataset_id is None and dataset_name is None:
367
- dataset = self.__get_from_cache()
368
- if dataset is None:
369
- raise exceptions.PlatformException(
370
- error='400',
371
- message='No checked-out Dataset was found, must checkout or provide an identifier in inputs')
372
- elif fetch:
373
- if dataset_id is not None and dataset_id != '':
374
- dataset = self.__get_by_id(dataset_id)
375
- # verify input dataset name is same as the given id
376
- if dataset_name is not None and dataset.name != dataset_name:
377
- logger.warning(
378
- "Mismatch found in datasets.get: dataset_name is different then dataset.name: "
379
- "{!r} != {!r}".format(
380
- dataset_name,
381
- dataset.name))
382
- elif dataset_name is not None:
383
- datasets = self.list(name=dataset_name)
384
- if not datasets:
385
- # empty list
386
- raise exceptions.PlatformException('404', 'Dataset not found. Name: {!r}'.format(dataset_name))
387
- # dataset = None
388
- elif len(datasets) > 1:
389
- raise exceptions.PlatformException('400', 'More than one dataset with same name.')
390
- else:
391
- dataset = datasets[0]
392
- else:
393
- raise exceptions.PlatformException(
394
- error='404',
395
- message='No input and no checked-out found')
396
- else:
397
- dataset = entities.Dataset.from_json(_json={'id': dataset_id,
398
- 'name': dataset_id},
399
- client_api=self._client_api,
400
- datasets=self,
401
- project=self._project,
402
- is_fetched=False)
403
- assert isinstance(dataset, entities.Dataset)
404
- if checkout:
405
- self.checkout(dataset=dataset)
406
- return dataset
407
-
408
- @_api_reference.add(path='/datasets/{id}', method='delete')
409
- def delete(self,
410
- dataset_name: str = None,
411
- dataset_id: str = None,
412
- sure: bool = False,
413
- really: bool = False):
414
- """
415
- Delete a dataset forever!
416
-
417
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
418
-
419
- **Example**:
420
-
421
- .. code-block:: python
422
-
423
- is_deleted = project.datasets.delete(dataset_id='dataset_id', sure=True, really=True)
424
-
425
- :param str dataset_name: optional - search by name
426
- :param str dataset_id: optional - search by id
427
- :param bool sure: Are you sure you want to delete?
428
- :param bool really: Really really sure?
429
- :return: True is success
430
- :rtype: bool
431
- """
432
- if sure and really:
433
- dataset = self.get(dataset_name=dataset_name, dataset_id=dataset_id)
434
- success, response = self._client_api.gen_request(req_type='delete',
435
- path='/datasets/{}'.format(dataset.id))
436
- if not success:
437
- raise exceptions.PlatformException(response)
438
- logger.info('Dataset {!r} was deleted successfully'.format(dataset.name))
439
- return True
440
- else:
441
- raise exceptions.PlatformException(
442
- error='403',
443
- message='Cant delete dataset from SDK. Please login to platform to delete')
444
-
445
- @_api_reference.add(path='/datasets/{id}', method='patch')
446
- def update(self,
447
- dataset: entities.Dataset,
448
- system_metadata: bool = False,
449
- patch: dict = None
450
- ) -> entities.Dataset:
451
- """
452
- Update dataset field.
453
-
454
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
455
-
456
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
457
- :param bool system_metadata: True, if you want to change metadata system
458
- :param dict patch: Specific patch request
459
- :return: Dataset object
460
- :rtype: dtlpy.entities.dataset.Dataset
461
-
462
- **Example**:
463
-
464
- .. code-block:: python
465
-
466
- dataset = project.datasets.update(dataset='dataset_entity')
467
- """
468
- url_path = '/datasets/{}'.format(dataset.id)
469
- if system_metadata:
470
- url_path += '?system=true'
471
-
472
- if patch is None:
473
- patch = dataset.to_json()
474
-
475
- success, response = self._client_api.gen_request(req_type='patch',
476
- path=url_path,
477
- json_req=patch)
478
- if success:
479
- logger.info('Dataset was updated successfully')
480
- return dataset
481
- else:
482
- raise exceptions.PlatformException(response)
483
-
484
- @_api_reference.add(path='/datasets/{id}/unlock', method='patch')
485
- def unlock(self, dataset: entities.Dataset ) -> entities.Dataset:
486
- """
487
- Unlock dataset.
488
-
489
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
490
-
491
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
492
- :return: Dataset object
493
- :rtype: dtlpy.entities.dataset.Dataset
494
-
495
- **Example**:
496
-
497
- .. code-block:: python
498
-
499
- dataset = project.datasets.unlock(dataset='dataset_entity')
500
- """
501
- url_path = '/datasets/{}/unlock'.format(dataset.id)
502
-
503
- success, response = self._client_api.gen_request(req_type='patch', path=url_path)
504
- if success:
505
- logger.info('Dataset was unlocked successfully')
506
- return dataset
507
- else:
508
- raise exceptions.PlatformException(response)
509
-
510
- @_api_reference.add(path='/datasets/{id}/directoryTree', method='get')
511
- def directory_tree(self,
512
- dataset: entities.Dataset = None,
513
- dataset_name: str = None,
514
- dataset_id: str = None):
515
- """
516
- Get dataset's directory tree.
517
-
518
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
519
-
520
- You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
521
-
522
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
523
- :param str dataset_name: The Name of the dataset
524
- :param str dataset_id: The Id of the dataset
525
- :return: DirectoryTree
526
-
527
- **Example**:
528
-
529
- .. code-block:: python
530
- directory_tree = dataset.directory_tree
531
- directory_tree = project.datasets.directory_tree(dataset='dataset_entity')
532
- """
533
- dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
534
-
535
- url_path = '/datasets/{}/directoryTree'.format(dataset_id)
536
-
537
- success, response = self._client_api.gen_request(req_type='get',
538
- path=url_path)
539
-
540
- if success:
541
- return entities.DirectoryTree(_json=response.json())
542
- else:
543
- raise exceptions.PlatformException(response)
544
-
545
- @_api_reference.add(path='/datasets/{id}/clone', method='post')
546
- def clone(self,
547
- dataset_id: str,
548
- clone_name: str = None,
549
- filters: entities.Filters = None,
550
- with_items_annotations: bool = True,
551
- with_metadata: bool = True,
552
- with_task_annotations_status: bool = True,
553
- dst_dataset_id: str = None,
554
- target_directory: str = None):
555
- """
556
- Clone a dataset. Read more about cloning datatsets and items in our `documentation <https://dataloop.ai/docs/clone-merge-dataset#cloned-dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_.
557
-
558
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
559
-
560
- :param str dataset_id: id of the dataset you wish to clone
561
- :param str clone_name: new dataset name
562
- :param dtlpy.entities.filters.Filters filters: Filters entity or a query dict
563
- :param bool with_items_annotations: true to clone with items annotations
564
- :param bool with_metadata: true to clone with metadata
565
- :param bool with_task_annotations_status: true to clone with task annotations' status
566
- :param str dst_dataset_id: destination dataset id
567
- :param str target_directory: target directory
568
- :return: dataset object
569
- :rtype: dtlpy.entities.dataset.Dataset
570
-
571
- **Example**:
572
-
573
- .. code-block:: python
574
-
575
- dataset = project.datasets.clone(dataset_id='dataset_id',
576
- clone_name='dataset_clone_name',
577
- with_metadata=True,
578
- with_items_annotations=False,
579
- with_task_annotations_status=False)
580
- """
581
- if clone_name is None and dst_dataset_id is None:
582
- raise exceptions.PlatformException('400', 'Must provide clone name or destination dataset id')
583
- if filters is None:
584
- filters = entities.Filters()
585
- filters._user_query = 'false'
586
- elif not isinstance(filters, entities.Filters):
587
- raise exceptions.PlatformException(
588
- error='400',
589
- message='"filters" must be a dl.Filters entity. got: {!r}'.format(type(filters)))
590
-
591
- copy_filters = copy.deepcopy(filters)
592
- if copy_filters.has_field('hidden'):
593
- copy_filters.pop('hidden')
594
-
595
- if target_directory is not None and not target_directory.startswith('/'):
596
- target_directory = '/' + target_directory
597
-
598
- payload = {
599
- "name": clone_name,
600
- "filter": copy_filters.prepare(),
601
- "cloneDatasetParams": {
602
- "withItemsAnnotations": with_items_annotations,
603
- "withMetadata": with_metadata,
604
- "withTaskAnnotationsStatus": with_task_annotations_status,
605
- "targetDirectory": target_directory
606
- }
607
- }
608
- if dst_dataset_id is not None:
609
- payload['cloneDatasetParams']['targetDatasetId'] = dst_dataset_id
610
- success, response = self._client_api.gen_request(req_type='post',
611
- path='/datasets/{}/clone'.format(dataset_id),
612
- json_req=payload,
613
- headers={'user_query': filters._user_query})
614
-
615
- if not success:
616
- raise exceptions.PlatformException(response)
617
-
618
- command = entities.Command.from_json(_json=response.json(),
619
- client_api=self._client_api)
620
- command = command.wait()
621
-
622
- if 'returnedModelId' not in command.spec:
623
- raise exceptions.PlatformException(error='400',
624
- message="returnedModelId key is missing in command response: {!r}"
625
- .format(response))
626
- return self.get(dataset_id=command.spec['returnedModelId'])
627
-
628
- @_api_reference.add(path='/datasets/{id}/export', method='post')
629
- def export(self,
630
- dataset: entities.Dataset = None,
631
- dataset_name: str = None,
632
- dataset_id: str = None,
633
- local_path: str = None,
634
- filters: Union[dict, entities.Filters] = None,
635
- annotation_filters: entities.Filters = None,
636
- feature_vector_filters: entities.Filters = None,
637
- include_feature_vectors: bool = False,
638
- include_annotations: bool = False,
639
- export_type: entities.ExportType = entities.ExportType.JSON,
640
- timeout: int = 0,
641
- dataset_lock: bool = False,
642
- lock_timeout_sec: int = None,
643
- export_summary: bool = False):
644
- """
645
- Export dataset items and annotations.
646
-
647
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
648
-
649
- You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
650
-
651
- :param dtlpy.entities.dataset.Dataset dataset: Dataset object
652
- :param str dataset_name: The name of the dataset
653
- :param str dataset_id: The ID of the dataset
654
- :param str local_path: Local path to save the exported dataset
655
- :param Union[dict, dtlpy.entities.filters.Filters] filters: Filters entity or a query dictionary
656
- :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
657
- :param dtlpy.entities.filters.Filters feature_vector_filters: Filters entity to filter feature vectors for export
658
- :param bool include_feature_vectors: Include item feature vectors in the export
659
- :param bool include_annotations: Include item annotations in the export
660
- :param bool dataset_lock: Make dataset readonly during the export
661
- :param bool export_summary: Get Summary of the dataset export
662
- :param int lock_timeout_sec: Timeout for locking the dataset during export in seconds
663
- :param entities.ExportType export_type: Type of export ('json' or 'zip')
664
- :param int timeout: Maximum time in seconds to wait for the export to complete
665
- :return: Exported item
666
- :rtype: dtlpy.entities.item.Item
667
-
668
- **Example**:
669
-
670
- .. code-block:: python
671
-
672
- export_item = project.datasets.export(dataset_id='dataset_id',
673
- filters=filters,
674
- include_feature_vectors=True,
675
- include_annotations=True,
676
- export_type=dl.ExportType.JSON,
677
- dataset_lock=True,
678
- lock_timeout_sec=300,
679
- export_summary=False)
680
- """
681
- dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
682
- payload = self._build_payload(filters, include_feature_vectors, include_annotations,
683
- export_type, annotation_filters, feature_vector_filters,
684
- dataset_lock, lock_timeout_sec, export_summary)
685
-
686
- success, response = self._client_api.gen_request(req_type='post', path=f'/datasets/{dataset_id}/export',
687
- json_req=payload)
688
- if not success:
689
- raise exceptions.PlatformException(response)
690
-
691
- command = entities.Command.from_json(_json=response.json(),
692
- client_api=self._client_api)
693
-
694
- time.sleep(2) # as the command have wrong progress in the beginning
695
- command = command.wait(timeout=timeout)
696
- if 'outputItemId' not in command.spec:
697
- raise exceptions.PlatformException(
698
- error='400',
699
- message="outputItemId key is missing in command response: {}".format(response))
700
- item_id = command.spec['outputItemId']
701
- self._download_exported_item(item_id=item_id, export_type=export_type, local_path=local_path)
702
- return local_path
703
-
704
- @_api_reference.add(path='/datasets/merge', method='post')
705
- def merge(self,
706
- merge_name: str,
707
- dataset_ids: list,
708
- project_ids: str,
709
- with_items_annotations: bool = True,
710
- with_metadata: bool = True,
711
- with_task_annotations_status: bool = True,
712
- wait: bool = True):
713
- """
714
- Merge a dataset. See our `SDK docs <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_ for more information.
715
-
716
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
717
-
718
- :param str merge_name: new dataset name
719
- :param list dataset_ids: list id's of the datatsets you wish to merge
720
- :param str project_ids: the project id that include the datasets
721
- :param bool with_items_annotations: true to merge with items annotations
722
- :param bool with_metadata: true to merge with metadata
723
- :param bool with_task_annotations_status: true to merge with task annotations' status
724
- :param bool wait: wait for the command to finish
725
- :return: True if success
726
- :rtype: bool
727
-
728
- **Example**:
729
-
730
- .. code-block:: python
731
-
732
- success = project.datasets.merge(dataset_ids=['dataset_id1','dataset_id2'],
733
- merge_name='dataset_merge_name',
734
- with_metadata=True,
735
- with_items_annotations=False,
736
- with_task_annotations_status=False)
737
- """
738
- payload = {
739
- "name": merge_name,
740
- "datasetsIds": dataset_ids,
741
- "projectIds": project_ids,
742
- "mergeDatasetParams": {
743
- "withItemsAnnotations": with_items_annotations,
744
- "withMetadata": with_metadata,
745
- "withTaskAnnotationsStatus": with_task_annotations_status
746
- },
747
- 'asynced': wait
748
- }
749
- success, response = self._client_api.gen_request(req_type='post',
750
- path='/datasets/merge',
751
- json_req=payload)
752
-
753
- if success:
754
- command = entities.Command.from_json(_json=response.json(),
755
- client_api=self._client_api)
756
- if not wait:
757
- return command
758
- command = command.wait(timeout=0)
759
- if 'mergeDatasetsConfiguration' not in command.spec:
760
- raise exceptions.PlatformException(error='400',
761
- message="mergeDatasetsConfiguration key is missing in command response: {}"
762
- .format(response))
763
- return True
764
- else:
765
- raise exceptions.PlatformException(response)
766
-
767
- @_api_reference.add(path='/datasets/{id}/sync', method='post')
768
- def sync(self, dataset_id: str, wait: bool = True):
769
- """
770
- Sync dataset with external storage.
771
-
772
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
773
-
774
- :param str dataset_id: The Id of the dataset to sync
775
- :param bool wait: wait for the command to finish
776
- :return: True if success
777
- :rtype: bool
778
-
779
- **Example**:
780
-
781
- .. code-block:: python
782
-
783
- success = project.datasets.sync(dataset_id='dataset_id')
784
- """
785
-
786
- success, response = self._client_api.gen_request(req_type='post',
787
- path='/datasets/{}/sync'.format(dataset_id))
788
-
789
- if success:
790
- command = entities.Command.from_json(_json=response.json(),
791
- client_api=self._client_api)
792
- if not wait:
793
- return command
794
- command = command.wait(timeout=0)
795
- if 'datasetId' not in command.spec:
796
- raise exceptions.PlatformException(error='400',
797
- message="datasetId key is missing in command response: {}"
798
- .format(response))
799
- return True
800
- else:
801
- raise exceptions.PlatformException(response)
802
-
803
- @_api_reference.add(path='/datasets', method='post')
804
- def create(self,
805
- dataset_name: str,
806
- labels=None,
807
- attributes=None,
808
- ontology_ids=None,
809
- driver: entities.Driver = None,
810
- driver_id: str = None,
811
- checkout: bool = False,
812
- expiration_options: entities.ExpirationOptions = None,
813
- index_driver: entities.IndexDriver = None,
814
- recipe_id: str = None
815
- ) -> entities.Dataset:
816
- """
817
- Create a new dataset
818
-
819
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
820
-
821
- :param str dataset_name: The Name of the dataset
822
- :param list labels: dictionary of {tag: color} or list of label entities
823
- :param list attributes: dataset's ontology's attributes
824
- :param list ontology_ids: optional - dataset ontology
825
- :param dtlpy.entities.driver.Driver driver: optional - storage driver Driver object or driver name
826
- :param str driver_id: optional - driver id
827
- :param bool checkout: set the dataset as a default dataset object (cookies)
828
- :param ExpirationOptions expiration_options: dl.ExpirationOptions object that contain definitions for dataset like MaxItemDays
829
- :param str index_driver: dl.IndexDriver, dataset driver version
830
- :param str recipe_id: optional - recipe id
831
- :return: Dataset object
832
- :rtype: dtlpy.entities.dataset.Dataset
833
-
834
- **Example**:
835
-
836
- .. code-block:: python
837
-
838
- dataset = project.datasets.create(dataset_name='dataset_name', ontology_ids='ontology_ids')
839
- """
840
- create_default_recipe = True
841
- if any([labels, attributes, ontology_ids, recipe_id]):
842
- create_default_recipe = False
843
-
844
- # labels to list
845
- if labels is not None:
846
- if not isinstance(labels, list):
847
- labels = [labels]
848
- if not all(isinstance(label, entities.Label) for label in labels):
849
- labels = entities.Dataset.serialize_labels(labels)
850
- else:
851
- labels = list()
852
-
853
- # get creator from token
854
- payload = {'name': dataset_name,
855
- 'projects': [self.project.id],
856
- 'createDefaultRecipe': create_default_recipe
857
- }
858
-
859
- if driver_id is None and driver is not None:
860
- if isinstance(driver, entities.Driver):
861
- driver_id = driver.id
862
- elif isinstance(driver, str):
863
- driver_id = self.project.drivers.get(driver_name=driver).id
864
- else:
865
- raise exceptions.PlatformException(
866
- error=400,
867
- message='Input arg "driver" must be Driver object or a string driver name. got type: {!r}'.format(
868
- type(driver)))
869
- if driver_id is not None:
870
- payload['driver'] = driver_id
871
-
872
- if expiration_options:
873
- payload['expirationOptions'] = expiration_options.to_json()
874
- if index_driver is not None:
875
- payload['indexDriver'] = index_driver
876
-
877
- success, response = self._client_api.gen_request(req_type='post',
878
- path='/datasets',
879
- json_req=payload)
880
- if success:
881
- dataset = entities.Dataset.from_json(client_api=self._client_api,
882
- _json=response.json(),
883
- datasets=self,
884
- project=self.project)
885
- # create ontology and recipe
886
- if not create_default_recipe:
887
- if recipe_id is not None:
888
- dataset.switch_recipe(recipe_id=recipe_id)
889
- else:
890
- dataset = dataset.recipes.create(ontology_ids=ontology_ids,
891
- labels=labels,
892
- attributes=attributes).dataset
893
- else:
894
- raise exceptions.PlatformException(response)
895
- logger.info('Dataset was created successfully. Dataset id: {!r}'.format(dataset.id))
896
- assert isinstance(dataset, entities.Dataset)
897
- if checkout:
898
- self.checkout(dataset=dataset)
899
- return dataset
900
-
901
- @staticmethod
902
- def _convert_single(downloader,
903
- item,
904
- img_filepath,
905
- local_path,
906
- overwrite,
907
- annotation_options,
908
- annotation_filters,
909
- thickness,
910
- with_text,
911
- progress,
912
- alpha,
913
- export_version):
914
- # this is to convert the downloaded json files to any other annotation type
915
- try:
916
- if entities.ViewAnnotationOptions.ANNOTATION_ON_IMAGE in annotation_options:
917
- if img_filepath is None:
918
- img_filepath = item.download()
919
- downloader._download_img_annotations(item=item,
920
- img_filepath=img_filepath,
921
- local_path=local_path,
922
- overwrite=overwrite,
923
- annotation_options=annotation_options,
924
- annotation_filters=annotation_filters,
925
- thickness=thickness,
926
- alpha=alpha,
927
- with_text=with_text,
928
- export_version=export_version
929
- )
930
- except Exception:
931
- logger.error('Failed to download annotation for item: {!r}'.format(item.name))
932
- progress.update()
933
-
934
- @staticmethod
935
- def download_annotations(dataset: entities.Dataset,
936
- local_path: str = None,
937
- filters: entities.Filters = None,
938
- annotation_options: entities.ViewAnnotationOptions = None,
939
- annotation_filters: entities.Filters = None,
940
- overwrite: bool = False,
941
- thickness: int = 1,
942
- with_text: bool = False,
943
- remote_path: str = None,
944
- include_annotations_in_output: bool = True,
945
- export_png_files: bool = False,
946
- filter_output_annotations: bool = False,
947
- alpha: float = None,
948
- export_version=entities.ExportVersion.V1,
949
- dataset_lock: bool = False,
950
- lock_timeout_sec: int = None,
951
- export_summary: bool = False,
952
- ) -> str:
953
- """
954
- Download dataset's annotations by filters.
955
-
956
- You may filter the dataset both for items and for annotations and download annotations.
957
-
958
- Optional -- download annotations as: mask, instance, image mask of the item.
959
-
960
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
961
-
962
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
963
- :param str local_path: local folder or filename to save to.
964
- :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
965
- :param list annotation_options: type of download annotations: list(dl.ViewAnnotationOptions)
966
- :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for download
967
- :param bool overwrite: optional - default = False to overwrite the existing files
968
- :param bool dataset_loc: optional - default = False to make the dataset readonly
969
- :param int thickness: optional - line thickness, if -1 annotation will be filled, default =1
970
- :param bool with_text: optional - add text to annotations, default = False
971
- :param str remote_path: DEPRECATED and ignored
972
- :param bool include_annotations_in_output: default - False , if export should contain annotations
973
- :param bool export_png_files: default - if True, semantic annotations should be exported as png files
974
- :param bool filter_output_annotations: default - False, given an export by filter - determine if to filter out annotations
975
- :param float alpha: opacity value [0 1], default 1
976
- :param str export_version: exported items will have original extension in filename, `V1` - no original extension in filenames
977
- :return: local_path of the directory where all the downloaded item
978
- :param bool dataset_lock: optional - default = False
979
- :param bool export_summary: optional - default = False
980
- :param int lock_timeout_sec: optional
981
- :rtype: str
982
-
983
- **Example**:
984
-
985
- .. code-block:: python
986
-
987
- file_path = project.datasets.download_annotations(dataset='dataset_entity',
988
- local_path='local_path',
989
- annotation_options=dl.ViewAnnotationOptions,
990
- overwrite=False,
991
- thickness=1,
992
- with_text=False,
993
- alpha=1,
994
- dataset_lock=False,
995
- lock_timeout_sec=300,
996
- export_summary=False
997
- )
998
- """
999
- if annotation_options is None:
1000
- annotation_options = list()
1001
- elif not isinstance(annotation_options, list):
1002
- annotation_options = [annotation_options]
1003
- for ann_option in annotation_options:
1004
- if not isinstance(ann_option, entities.ViewAnnotationOptions):
1005
- if ann_option not in list(entities.ViewAnnotationOptions):
1006
- raise PlatformException(
1007
- error='400',
1008
- message='Unknown annotation download option: {}, please choose from: {}'.format(
1009
- ann_option, list(entities.ViewAnnotationOptions)))
1010
-
1011
- if remote_path is not None:
1012
- logger.warning(
1013
- '"remote_path" is ignored. Use "filters=dl.Filters(field="dir, values={!r}"'.format(remote_path))
1014
- if local_path is None:
1015
- if dataset.project is None:
1016
- # by dataset name
1017
- local_path = os.path.join(
1018
- services.service_defaults.DATALOOP_PATH,
1019
- "datasets",
1020
- "{}_{}".format(dataset.name, dataset.id),
1021
- )
1022
- else:
1023
- # by dataset and project name
1024
- local_path = os.path.join(
1025
- services.service_defaults.DATALOOP_PATH,
1026
- "projects",
1027
- dataset.project.name,
1028
- "datasets",
1029
- dataset.name,
1030
- )
1031
-
1032
- if filters is None:
1033
- filters = entities.Filters()
1034
- filters._user_query = 'false'
1035
- if annotation_filters is not None:
1036
- for annotation_filter_and in annotation_filters.and_filter_list:
1037
- filters.add_join(field=annotation_filter_and.field,
1038
- values=annotation_filter_and.values,
1039
- operator=annotation_filter_and.operator,
1040
- method=entities.FiltersMethod.AND)
1041
- for annotation_filter_or in annotation_filters.or_filter_list:
1042
- filters.add_join(field=annotation_filter_or.field,
1043
- values=annotation_filter_or.values,
1044
- operator=annotation_filter_or.operator,
1045
- method=entities.FiltersMethod.OR)
1046
-
1047
- downloader = repositories.Downloader(items_repository=dataset.items)
1048
- downloader.download_annotations(dataset=dataset,
1049
- filters=filters,
1050
- annotation_filters=annotation_filters,
1051
- local_path=local_path,
1052
- overwrite=overwrite,
1053
- include_annotations_in_output=include_annotations_in_output,
1054
- export_png_files=export_png_files,
1055
- filter_output_annotations=filter_output_annotations,
1056
- export_version=export_version,
1057
- dataset_lock=dataset_lock,
1058
- lock_timeout_sec=lock_timeout_sec,
1059
- export_summary=export_summary
1060
- )
1061
- if annotation_options:
1062
- pages = dataset.items.list(filters=filters)
1063
- if not isinstance(annotation_options, list):
1064
- annotation_options = [annotation_options]
1065
- # convert all annotations to annotation_options
1066
- pool = dataset._client_api.thread_pools(pool_name='dataset.download')
1067
- jobs = [None for _ in range(pages.items_count)]
1068
- progress = tqdm.tqdm(total=pages.items_count,
1069
- disable=dataset._client_api.verbose.disable_progress_bar_download_annotations,
1070
- file=sys.stdout, desc='Download Annotations')
1071
- i_item = 0
1072
- for page in pages:
1073
- for item in page:
1074
- jobs[i_item] = pool.submit(
1075
- Datasets._convert_single,
1076
- **{
1077
- 'downloader': downloader,
1078
- 'item': item,
1079
- 'img_filepath': None,
1080
- 'local_path': local_path,
1081
- 'overwrite': overwrite,
1082
- 'annotation_options': annotation_options,
1083
- 'annotation_filters': annotation_filters,
1084
- 'thickness': thickness,
1085
- 'with_text': with_text,
1086
- 'progress': progress,
1087
- 'alpha': alpha,
1088
- 'export_version': export_version
1089
- }
1090
- )
1091
- i_item += 1
1092
- # get all results
1093
- _ = [j.result() for j in jobs]
1094
- progress.close()
1095
- return local_path
1096
-
1097
- def _upload_single_item_annotation(self, item, file, pbar):
1098
- try:
1099
- item.annotations.upload(file)
1100
- except Exception as err:
1101
- raise err
1102
- finally:
1103
- pbar.update()
1104
-
1105
- def upload_annotations(self,
1106
- dataset,
1107
- local_path,
1108
- filters: entities.Filters = None,
1109
- clean=False,
1110
- remote_root_path='/',
1111
- export_version=entities.ExportVersion.V1
1112
- ):
1113
- """
1114
- Upload annotations to dataset.
1115
-
1116
- Example for remote_root_path: If the item filepath is "/a/b/item" and remote_root_path is "/a" - the start folder will be b instead of a
1117
-
1118
- **Prerequisites**: You must have a dataset with items that are related to the annotations. The relationship between the dataset and annotations is shown in the name. You must be in the role of an *owner* or *developer*.
1119
-
1120
- :param dtlpy.entities.dataset.Dataset dataset: dataset to upload to
1121
- :param str local_path: str - local folder where the annotations files are
1122
- :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
1123
- :param bool clean: True to remove the old annotations
1124
- :param str remote_root_path: the remote root path to match remote and local items
1125
- :param str export_version: exported items will have original extension in filename, `V1` - no original extension in filenames
1126
-
1127
- **Example**:
1128
-
1129
- .. code-block:: python
1130
-
1131
- project.datasets.upload_annotations(dataset='dataset_entity',
1132
- local_path='local_path',
1133
- clean=False,
1134
- export_version=dl.ExportVersion.V1
1135
- )
1136
- """
1137
- if filters is None:
1138
- filters = entities.Filters()
1139
- filters._user_query = 'false'
1140
- pages = dataset.items.list(filters=filters)
1141
- total_items = pages.items_count
1142
- pbar = tqdm.tqdm(total=total_items, disable=dataset._client_api.verbose.disable_progress_bar_upload_annotations,
1143
- file=sys.stdout, desc='Upload Annotations')
1144
- pool = self._client_api.thread_pools('annotation.upload')
1145
- annotations_uploaded_count = 0
1146
- for item in pages.all():
1147
- if export_version == entities.ExportVersion.V1:
1148
- _, ext = os.path.splitext(item.filename)
1149
- filepath = item.filename.replace(ext, '.json')
1150
- else:
1151
- filepath = item.filename + '.json'
1152
- # make the file path ignore the hierarchy of the files that are in remote_root_path
1153
- filepath = os.path.relpath(filepath, remote_root_path)
1154
- json_file = os.path.join(local_path, filepath)
1155
- if not os.path.isfile(json_file):
1156
- pbar.update()
1157
- continue
1158
- annotations_uploaded_count += 1
1159
- if item.annotated and clean:
1160
- item.annotations.delete(filters=entities.Filters(resource=entities.FiltersResource.ANNOTATION))
1161
- pool.submit(self._upload_single_item_annotation, **{'item': item,
1162
- 'file': json_file,
1163
- 'pbar': pbar})
1164
- pool.shutdown()
1165
- if annotations_uploaded_count == 0:
1166
- logger.warning(msg="No annotations uploaded to dataset! ")
1167
- else:
1168
- logger.info(msg='Found and uploaded {} annotations.'.format(annotations_uploaded_count))
1169
-
1170
- def set_readonly(self, state: bool, dataset: entities.Dataset):
1171
- """
1172
- Set dataset readonly mode.
1173
-
1174
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
1175
-
1176
- :param bool state: state to update readonly mode
1177
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
1178
-
1179
- **Example**:
1180
-
1181
- .. code-block:: python
1182
-
1183
- project.datasets.set_readonly(dataset='dataset_entity', state=True)
1184
- """
1185
- import warnings
1186
- warnings.warn("`readonly` flag on dataset is deprecated, doing nothing.", DeprecationWarning)
1187
-
1188
-
1189
- @_api_reference.add(path='/datasets/{id}/split', method='post')
1190
- def split_ml_subsets(self,
1191
- dataset_id: str,
1192
- items_query: entities.Filters,
1193
- ml_split_list: dict) -> bool:
1194
- """
1195
- Split dataset items into ML subsets.
1196
-
1197
- :param str dataset_id: The ID of the dataset.
1198
- :param dtlpy.entities.filters.Filters items_query: Filters entity to select the items to split.
1199
- :param dict ml_split_list: Dictionary with 'train', 'validation', 'test' keys and integer percentages.
1200
- :return: True if the split operation was successful.
1201
- :rtype: bool
1202
- :raises: PlatformException on failure and ValueError if percentages do not sum to 100 or invalid keys/values.
1203
- """
1204
- # Validate percentages
1205
- if not ml_split_list:
1206
- ml_split_list = {'train': 80, 'validation': 10, 'test': 10}
1207
-
1208
- if not items_query:
1209
- items_query = entities.Filters()
1210
-
1211
- items_query_dict = items_query.prepare()
1212
- required_keys = {'train', 'validation', 'test'}
1213
- if set(ml_split_list.keys()) != required_keys:
1214
- raise ValueError("MLSplitList must have exactly the keys 'train', 'validation', 'test'.")
1215
- total = sum(ml_split_list.values())
1216
- if total != 100:
1217
- raise ValueError(
1218
- "Please set the Train, Validation, and Test subsets percentages to add up to 100%. "
1219
- "For example: 70, 15, 15."
1220
- )
1221
- for key, value in ml_split_list.items():
1222
- if not isinstance(value, int) or value < 0:
1223
- raise ValueError("Percentages must be integers >= 0.")
1224
- payload = {
1225
- 'itemsQuery': items_query_dict,
1226
- 'MLSplitList': ml_split_list
1227
- }
1228
- path = f'/datasets/{dataset_id}/split'
1229
- success, response = self._client_api.gen_request(req_type='post',
1230
- path=path,
1231
- json_req=payload)
1232
- if success:
1233
- # Wait for the split operation to complete
1234
- command = entities.Command.from_json(_json=response.json(),
1235
- client_api=self._client_api)
1236
- command.wait()
1237
- return True
1238
- else:
1239
- raise exceptions.PlatformException(response)
1240
-
1241
-
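A minimal usage sketch for `split_ml_subsets` above, assuming a checked-out project and placeholder names; the three percentages must be non-negative integers that sum to 100:

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')       # placeholder name
    dataset = project.datasets.get(dataset_name='my-dataset')  # placeholder name

    # Split every item in the dataset into train/validation/test ML subsets
    project.datasets.split_ml_subsets(
        dataset_id=dataset.id,
        items_query=dl.Filters(),  # all items
        ml_split_list={'train': 70, 'validation': 15, 'test': 15},
    )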
1242
- @_api_reference.add(path='/datasets/{id}/items/bulk-update-metadata', method='post')
1243
- def bulk_update_ml_subset(self, dataset_id: str, items_query: dict, subset: str = None, deleteTag: bool = False) -> bool:
1244
- """
1245
- Bulk update ML subset assignment for selected items.
1246
- If subset is None and deleteTag is True, the ML subset assignment is removed. Otherwise, the specified subset is assigned.
1247
-
1248
- :param str dataset_id: ID of the dataset
1249
- :param dict items_query: DQLResourceQuery (filters) for selecting items
1250
- :param str subset: 'train', 'validation', 'test' or None to remove all
- :param bool deleteTag: set True to allow removing the ML subset assignment (required when subset is None)
1251
- :return: True if success
1252
- :rtype: bool
1253
- """
1254
- if items_query is None:
1255
- items_query = entities.Filters()
1256
- items_query_dict = items_query.prepare()
1257
- if not deleteTag and subset not in ['train', 'validation', 'test']:
1258
- raise ValueError("subset must be one of: 'train', 'validation', 'test'")
1259
- # Determine tag values based on subset
1260
- tags = {
1261
- 'train': True if subset == 'train' else None,
1262
- 'validation': True if subset == 'validation' else None,
1263
- 'test': True if subset == 'test' else None
1264
- }
1265
-
1266
- payload = {
1267
- "query": items_query_dict,
1268
- "updateQuery": {
1269
- "update": {
1270
- "metadata": {
1271
- "system": {
1272
- "tags": tags
1273
- }
1274
- }
1275
- },
1276
- "systemSpace": True
1277
- }
1278
- }
1279
-
1280
- success, response = self._client_api.gen_request(
1281
- req_type='post',
1282
- path=f'/datasets/{dataset_id}/items/bulk-update-metadata',
1283
- json_req=payload
1284
- )
1285
- if success:
1286
- # Similar to split operation, a command is returned
1287
- command = entities.Command.from_json(_json=response.json(), client_api=self._client_api)
1288
- command.wait()
1289
- return True
1290
- else:
1291
- raise exceptions.PlatformException(response)
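Continuing the sketch above, a hedged example of assigning and then clearing an ML subset with `bulk_update_ml_subset`; the folder path is a placeholder:

.. code-block:: python

    # Assign the 'validation' subset to items under a given folder
    filters = dl.Filters()
    filters.add(field='dir', values='/val')  # placeholder folder
    project.datasets.bulk_update_ml_subset(
        dataset_id=dataset.id,
        items_query=filters,
        subset='validation',
    )

    # Clear the ML subset assignment from the same items
    project.datasets.bulk_update_ml_subset(
        dataset_id=dataset.id,
        items_query=filters,
        subset=None,
        deleteTag=True,
    )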
1
+ """
2
+ Datasets Repository
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import time
8
+ import copy
9
+ import tqdm
10
+ import logging
11
+ import zipfile
12
+ import json
13
+ from typing import Union, Generator, Optional
14
+
15
+ from .. import entities, repositories, miscellaneous, exceptions, services, PlatformException, _api_reference
16
+ from ..services.api_client import ApiClient
17
+ from ..entities.dataset import OutputExportType, ExportType
18
+
19
+ logger = logging.getLogger(name='dtlpy')
20
+
21
+ MAX_ITEMS_PER_SUBSET = 50000
22
+
23
+ class Datasets:
24
+ """
25
+ Datasets Repository
26
+
27
+ The Datasets class allows the user to manage datasets. Read more about datasets in our `documentation <https://dataloop.ai/docs/dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/manage_datasets/chapter/>`_.
28
+ """
29
+
30
+ def __init__(self, client_api: ApiClient, project: entities.Project = None):
31
+ self._client_api = client_api
32
+ self._project = project
33
+
34
+ ############
35
+ # entities #
36
+ ############
37
+ @property
38
+ def project(self) -> entities.Project:
39
+ if self._project is None:
40
+ # try get checkout
41
+ project = self._client_api.state_io.get('project')
42
+ if project is not None:
43
+ self._project = entities.Project.from_json(_json=project, client_api=self._client_api)
44
+ if self._project is None:
45
+ raise exceptions.PlatformException(
46
+ error='2001',
47
+ message='Cannot perform action WITHOUT Project entity in Datasets repository.'
48
+ ' Please checkout or set a project')
49
+ assert isinstance(self._project, entities.Project)
50
+ return self._project
51
+
52
+ @project.setter
53
+ def project(self, project: entities.Project):
54
+ if not isinstance(project, entities.Project):
55
+ raise ValueError('Must input a valid Project entity')
56
+ self._project = project
57
+
58
+ ###########
59
+ # methods #
60
+ ###########
61
+ def __get_from_cache(self) -> entities.Dataset:
62
+ dataset = self._client_api.state_io.get('dataset')
63
+ if dataset is not None:
64
+ dataset = entities.Dataset.from_json(_json=dataset,
65
+ client_api=self._client_api,
66
+ datasets=self,
67
+ project=self._project)
68
+ return dataset
69
+
70
+ def __get_by_id(self, dataset_id) -> entities.Dataset:
71
+ if dataset_id is None or dataset_id == '':
72
+ raise exceptions.PlatformException('400', 'Please checkout a dataset')
73
+ success, response = self._client_api.gen_request(req_type='get',
74
+ path='/datasets/{}'.format(dataset_id))
75
+
76
+ if success:
77
+ dataset = entities.Dataset.from_json(client_api=self._client_api,
78
+ _json=response.json(),
79
+ datasets=self,
80
+ project=self._project)
81
+ else:
82
+ raise exceptions.PlatformException(response)
83
+ return dataset
84
+
85
+ def __get_by_identifier(self, identifier=None) -> entities.Dataset:
86
+ datasets = self.list()
87
+ datasets_by_name = [dataset for dataset in datasets if identifier in dataset.name or identifier in dataset.id]
88
+ if len(datasets_by_name) == 1:
89
+ return datasets_by_name[0]
90
+ elif len(datasets_by_name) > 1:
91
+ raise Exception('Multiple datasets with this name exist')
92
+ else:
93
+ raise Exception("Dataset not found")
94
+
95
+ def _bulid_folder_filter(self, folder_path, filters=None):
96
+ if filters is None:
97
+ filters = entities.Filters()
98
+ filters._user_query = 'false'
99
+ if not folder_path.startswith('/'):
100
+ folder_path = '/' + folder_path
101
+ filters.add(field='dir', values=folder_path, method=entities.FiltersMethod.OR)
102
+ if not folder_path.endswith('*'):
103
+ if not folder_path.endswith('/'):
104
+ folder_path += '/'
105
+ filters.add(field='dir', values=folder_path + '*', method=entities.FiltersMethod.OR)
106
+ return filters
107
+
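For reference, the helper above ORs two `dir` conditions so that both the folder itself and everything beneath it match. A rough hand-built equivalent (the folder name is a placeholder):

.. code-block:: python

    filters = dl.Filters()
    # items directly under /images
    filters.add(field='dir', values='/images', method=dl.FiltersMethod.OR)
    # items in any sub-folder of /images
    filters.add(field='dir', values='/images/*', method=dl.FiltersMethod.OR)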
108
+ def _get_binaries_dataset(self):
109
+ filters = entities.Filters(resource=entities.FiltersResource.DATASET)
110
+ filters.add(field='name', values='Binaries')
111
+ filters.system_space = True
112
+ datasets = self.list(filters=filters)
113
+ if len(datasets) == 0:
114
+ # empty list
115
+ raise exceptions.PlatformException('404', 'Dataset not found. Name: "Binaries"')
116
+ # dataset = None
117
+ elif len(datasets) > 1:
118
+ raise exceptions.PlatformException('400', 'More than one dataset with same name.')
119
+ else:
120
+ dataset = datasets[0]
121
+ return dataset
122
+
123
+ def _resolve_dataset_id(self, dataset, dataset_name, dataset_id):
124
+ if dataset is None and dataset_name is None and dataset_id is None:
125
+ raise ValueError('Must provide dataset, dataset name or dataset id')
126
+ if dataset_id is None:
127
+ if dataset is None:
128
+ dataset = self.get(dataset_name=dataset_name)
129
+ dataset_id = dataset.id
130
+ return dataset_id
131
+
132
+ @staticmethod
133
+ def _build_payload(filters, include_feature_vectors, include_annotations,
134
+ export_type, annotation_filters, feature_vector_filters, dataset_lock, lock_timeout_sec, export_summary):
135
+ valid_list = [e.value for e in entities.ExportType]
136
+ valid_types = ', '.join(valid_list)
137
+ if export_type not in ['json', 'zip']:
138
+ raise ValueError('export_type must be one of the following: {}'.format(valid_types))
139
+ payload = {'exportType': export_type}
140
+ if filters is None:
141
+ filters = entities.Filters()
142
+
143
+ if isinstance(filters, entities.Filters):
144
+ payload['itemsQuery'] = {'filter': filters.prepare()['filter'], 'join': filters.prepare().get("join", {})}
145
+ elif isinstance(filters, dict):
146
+ payload['itemsQuery'] = filters
147
+ else:
148
+ raise exceptions.BadRequest(message='filters must be of type dict or Filters', status_code=500)
149
+
150
+ payload['itemsVectorQuery'] = {}
151
+ if include_feature_vectors:
152
+ payload['includeItemVectors'] = True
153
+ payload['itemsVectorQuery']['select'] = {"datasetId": 1, 'featureSetId': 1, 'value': 1}
154
+
155
+ if feature_vector_filters is not None:
156
+ payload['itemsVectorQuery']['filter'] = feature_vector_filters.prepare()['filter']
157
+
158
+ payload['annotations'] = {"include": include_annotations, "convertSemantic": False}
159
+
160
+ if annotation_filters is not None:
161
+ payload['annotationsQuery'] = annotation_filters.prepare()
162
+
163
+ if dataset_lock:
164
+ payload['datasetLock'] = dataset_lock
165
+
166
+ if export_summary:
167
+ payload['summary'] = export_summary
168
+
169
+ if lock_timeout_sec:
170
+ payload['lockTimeoutSec'] = lock_timeout_sec
171
+
172
+ return payload
173
+
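As a sketch, a plain JSON export of all items with annotations yields a payload roughly shaped as below; the `filter` block is whatever `Filters.prepare()` returns, and the default query shown is an assumption:

.. code-block:: python

    payload = {
        'exportType': 'json',
        'itemsQuery': {'filter': {'$and': [{'hidden': False}, {'type': 'file'}]}, 'join': {}},
        'itemsVectorQuery': {},
        'annotations': {'include': True, 'convertSemantic': False},
    }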
174
+ def _download_exported_item(self, item_id, export_type, local_path=None, unzip=True):
175
+ logger.debug(f"start downloading exported item {item_id} with export_type {export_type} and local_path {local_path} and unzip {unzip}")
176
+ export_item = repositories.Items(client_api=self._client_api).get(item_id=item_id)
177
+ export_item_path = export_item.download(local_path=local_path)
178
+
179
+ # Common validation check for both JSON and other export types
180
+ if isinstance(export_item_path, list) or not os.path.isfile(export_item_path):
181
+ raise exceptions.PlatformException(
182
+ error='404',
183
+ message='error downloading annotation zip file. see above for more information. item id: {!r}'.format(
184
+ export_item.id))
185
+
186
+ result = None
187
+ if unzip is False or export_type == entities.ExportType.JSON:
188
+ result = export_item_path
189
+ else:
190
+ try:
191
+ miscellaneous.Zipping.unzip_directory(zip_filename=export_item_path,
192
+ to_directory=local_path)
193
+ result = local_path
194
+ except Exception as e:
195
+ logger.warning("Failed to extract zip file error: {}".format(e))
196
+ finally:
197
+ # cleanup only for zip files to avoid removing needed results
198
+ if isinstance(export_item_path, str) and os.path.isfile(export_item_path):
199
+ os.remove(export_item_path)
200
+ logger.debug(f"end downloading, result {result}")
201
+ return result
202
+
203
+ @property
204
+ def platform_url(self):
205
+ return self._client_api._get_resource_url("projects/{}/datasets".format(self.project.id))
206
+
207
+ def open_in_web(self,
208
+ dataset_name: str = None,
209
+ dataset_id: str = None,
210
+ dataset: entities.Dataset = None):
211
+ """
212
+ Open the dataset in web platform.
213
+
214
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
215
+
216
+ :param str dataset_name: The Name of the dataset
217
+ :param str dataset_id: The Id of the dataset
218
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
219
+
220
+ **Example**:
221
+
222
+ .. code-block:: python
223
+
224
+ project.datasets.open_in_web(dataset_id='dataset_id')
225
+ """
226
+ if dataset_name is not None:
227
+ dataset = self.get(dataset_name=dataset_name)
228
+ if dataset is not None:
229
+ dataset.open_in_web()
230
+ elif dataset_id is not None:
231
+ self._client_api._open_in_web(url=f'{self.platform_url}/{dataset_id}/items')
232
+ else:
233
+ self._client_api._open_in_web(url=self.platform_url)
234
+
235
+ def checkout(self,
236
+ identifier: str = None,
237
+ dataset_name: str = None,
238
+ dataset_id: str = None,
239
+ dataset: entities.Dataset = None):
240
+ """
241
+ Checkout (switch) to a dataset to work on it.
242
+
243
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
244
+
245
+ You must provide at least ONE of the following params: dataset_id, dataset_name.
246
+
247
+ :param str identifier: project name or partial id that you wish to switch
248
+ :param str dataset_name: The Name of the dataset
249
+ :param str dataset_id: The Id of the dataset
250
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
251
+
252
+ **Example**:
253
+
254
+ .. code-block:: python
255
+
256
+ project.datasets.checkout(dataset_id='dataset_id')
257
+ """
258
+ if dataset is None:
259
+ if dataset_id is not None or dataset_name is not None:
260
+ try:
261
+ dataset = self.project.datasets.get(dataset_name=dataset_name, dataset_id=dataset_id)
262
+ except exceptions.MissingEntity:
263
+ dataset = self.get(dataset_id=dataset_id, dataset_name=dataset_name)
264
+ elif identifier is not None:
265
+ dataset = self.__get_by_identifier(identifier=identifier)
266
+ else:
267
+ raise exceptions.PlatformException(error='400',
268
+ message='Must provide partial/full id/name to checkout')
269
+ self._client_api.state_io.put('dataset', dataset.to_json())
270
+ logger.info('Checked out to dataset {}'.format(dataset.name))
271
+
272
+ @_api_reference.add(path='/datasets/query', method='post')
273
+ def list(self, name=None, creator=None, filters: entities.Filters = None) -> miscellaneous.List[entities.Dataset]:
274
+ """
275
+ List all datasets.
276
+
277
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
278
+
279
+ :param str name: list by name
280
+ :param str creator: list by creator
281
+ :param dtlpy.entities.filters.Filters filters: Filters entity containing filters parameters
282
+ :return: List of datasets
283
+ :rtype: list
284
+
285
+ **Example**:
286
+
287
+ .. code-block:: python
288
+ filters = dl.Filters(resource='datasets')
289
+ filters.add(field='readonly', values=False)
290
+ datasets = project.datasets.list(filters=filters)
291
+ """
292
+ if filters is None:
293
+ filters = entities.Filters(resource=entities.FiltersResource.DATASET)
294
+ # assert type filters
295
+ elif not isinstance(filters, entities.Filters):
296
+ raise exceptions.PlatformException(error='400',
297
+ message='Unknown filters type: {!r}'.format(type(filters)))
298
+ if filters.resource != entities.FiltersResource.DATASET:
299
+ raise exceptions.PlatformException(
300
+ error='400',
301
+ message='Filters resource must to be FiltersResource.DATASET. Got: {!r}'.format(filters.resource))
302
+
303
+ url = '/datasets/query'
304
+
305
+ if name is not None:
306
+ filters.add(field='name', values=name)
307
+ if creator is not None:
308
+ filters.add(field='creator', values=creator)
309
+ if self._project is not None:
310
+ filters.context = {"projects": [self._project.id]}
311
+ filters.page_size = 1000
312
+ filters.page = 0
313
+ datasets = list()
314
+ while True:
315
+ success, response = self._client_api.gen_request(req_type='POST',
316
+ json_req=filters.prepare(),
317
+ path=url,
318
+ headers={'user_query': filters._user_query})
319
+ if success:
320
+ pool = self._client_api.thread_pools('entity.create')
321
+ datasets_json = response.json()['items']
322
+ jobs = [None for _ in range(len(datasets_json))]
323
+ # return triggers list
324
+ for i_dataset, dataset in enumerate(datasets_json):
325
+ jobs[i_dataset] = pool.submit(entities.Dataset._protected_from_json,
326
+ **{'client_api': self._client_api,
327
+ '_json': dataset,
328
+ 'datasets': self,
329
+ 'project': self.project})
330
+
331
+ # get all results
332
+ results = [j.result() for j in jobs]
333
+ # log errors
334
+ _ = [logger.warning(r[1]) for r in results if r[0] is False]
335
+ # return good jobs
336
+ datasets.extend([r[1] for r in results if r[0] is True])
337
+ if response.json()['hasNextPage'] is True:
338
+ filters.page += 1
339
+ else:
340
+ break
341
+ else:
342
+ raise exceptions.PlatformException(response)
343
+ datasets = miscellaneous.List(datasets)
344
+ return datasets
345
+
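A small usage sketch for the optional name/creator shortcuts; the email is a placeholder:

.. code-block:: python

    # List datasets in the project created by a specific user
    datasets = project.datasets.list(creator='user@example.com')
    for dataset in datasets:
        print(dataset.id, dataset.name)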
346
+ @_api_reference.add(path='/datasets/{id}', method='get')
347
+ def get(self,
348
+ dataset_name: str = None,
349
+ dataset_id: str = None,
350
+ checkout: bool = False,
351
+ fetch: bool = None
352
+ ) -> entities.Dataset:
353
+ """
354
+ Get dataset by name or id.
355
+
356
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
357
+
358
+ You must provide at least ONE of the following params: dataset_id, dataset_name.
359
+
360
+ :param str dataset_name: optional - search by name
361
+ :param str dataset_id: optional - search by id
362
+ :param bool checkout: set the dataset as a default dataset object (cookies)
363
+ :param bool fetch: optional - fetch entity from platform (True), default taken from cookie
364
+ :return: Dataset object
365
+ :rtype: dtlpy.entities.dataset.Dataset
366
+
367
+ **Example**:
368
+
369
+ .. code-block:: python
370
+
371
+ dataset = project.datasets.get(dataset_id='dataset_id')
372
+ """
373
+ if fetch is None:
374
+ fetch = self._client_api.fetch_entities
375
+
376
+ if dataset_id is None and dataset_name is None:
377
+ dataset = self.__get_from_cache()
378
+ if dataset is None:
379
+ raise exceptions.PlatformException(
380
+ error='400',
381
+ message='No checked-out Dataset was found, must checkout or provide an identifier in inputs')
382
+ elif fetch:
383
+ if dataset_id is not None and dataset_id != '':
384
+ dataset = self.__get_by_id(dataset_id)
385
+ # verify input dataset name is same as the given id
386
+ if dataset_name is not None and dataset.name != dataset_name:
387
+ logger.warning(
388
+ "Mismatch found in datasets.get: dataset_name is different then dataset.name: "
389
+ "{!r} != {!r}".format(
390
+ dataset_name,
391
+ dataset.name))
392
+ elif dataset_name is not None:
393
+ datasets = self.list(name=dataset_name)
394
+ if not datasets:
395
+ # empty list
396
+ raise exceptions.PlatformException('404', 'Dataset not found. Name: {!r}'.format(dataset_name))
397
+ # dataset = None
398
+ elif len(datasets) > 1:
399
+ raise exceptions.PlatformException('400', 'More than one dataset with same name.')
400
+ else:
401
+ dataset = datasets[0]
402
+ else:
403
+ raise exceptions.PlatformException(
404
+ error='404',
405
+ message='No input and no checked-out found')
406
+ else:
407
+ dataset = entities.Dataset.from_json(_json={'id': dataset_id,
408
+ 'name': dataset_id},
409
+ client_api=self._client_api,
410
+ datasets=self,
411
+ project=self._project,
412
+ is_fetched=False)
413
+ assert isinstance(dataset, entities.Dataset)
414
+ if checkout:
415
+ self.checkout(dataset=dataset)
416
+ return dataset
417
+
418
+ @_api_reference.add(path='/datasets/{id}', method='delete')
419
+ def delete(self,
420
+ dataset_name: str = None,
421
+ dataset_id: str = None,
422
+ sure: bool = False,
423
+ really: bool = False):
424
+ """
425
+ Delete a dataset forever!
426
+
427
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
428
+
429
+ **Example**:
430
+
431
+ .. code-block:: python
432
+
433
+ is_deleted = project.datasets.delete(dataset_id='dataset_id', sure=True, really=True)
434
+
435
+ :param str dataset_name: optional - search by name
436
+ :param str dataset_id: optional - search by id
437
+ :param bool sure: Are you sure you want to delete?
438
+ :param bool really: Really really sure?
439
+ :return: True if success
440
+ :rtype: bool
441
+ """
442
+ if sure and really:
443
+ dataset = self.get(dataset_name=dataset_name, dataset_id=dataset_id)
444
+ success, response = self._client_api.gen_request(req_type='delete',
445
+ path='/datasets/{}'.format(dataset.id))
446
+ if not success:
447
+ raise exceptions.PlatformException(response)
448
+ logger.info('Dataset {!r} was deleted successfully'.format(dataset.name))
449
+ return True
450
+ else:
451
+ raise exceptions.PlatformException(
452
+ error='403',
453
+ message='Cannot delete dataset from SDK. Please login to the platform to delete')
454
+
455
+ @_api_reference.add(path='/datasets/{id}', method='patch')
456
+ def update(self,
457
+ dataset: entities.Dataset,
458
+ system_metadata: bool = False,
459
+ patch: dict = None
460
+ ) -> entities.Dataset:
461
+ """
462
+ Update dataset field.
463
+
464
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
465
+
466
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
467
+ :param bool system_metadata: True, if you want to update the system metadata
468
+ :param dict patch: Specific patch request
469
+ :return: Dataset object
470
+ :rtype: dtlpy.entities.dataset.Dataset
471
+
472
+ **Example**:
473
+
474
+ .. code-block:: python
475
+
476
+ dataset = project.datasets.update(dataset='dataset_entity')
477
+ """
478
+ url_path = '/datasets/{}'.format(dataset.id)
479
+ if system_metadata:
480
+ url_path += '?system=true'
481
+
482
+ if patch is None:
483
+ patch = dataset.to_json()
484
+
485
+ success, response = self._client_api.gen_request(req_type='patch',
486
+ path=url_path,
487
+ json_req=patch)
488
+ if success:
489
+ logger.info('Dataset was updated successfully')
490
+ return dataset
491
+ else:
492
+ raise exceptions.PlatformException(response)
493
+
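A hedged sketch of the two update styles: editing the entity and sending it, or passing an explicit patch body (the patch fields are illustrative, not a documented schema):

.. code-block:: python

    # Update by editing the entity
    dataset = project.datasets.get(dataset_name='my-dataset')  # placeholder name
    dataset.name = 'my-dataset-renamed'
    dataset = project.datasets.update(dataset=dataset)

    # Update by sending a partial patch body directly
    dataset = project.datasets.update(dataset=dataset, patch={'name': 'my-dataset-renamed'})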
494
+ @_api_reference.add(path='/datasets/{id}/unlock', method='patch')
495
+ def unlock(self, dataset: entities.Dataset) -> entities.Dataset:
496
+ """
497
+ Unlock dataset.
498
+
499
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
500
+
501
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
502
+ :return: Dataset object
503
+ :rtype: dtlpy.entities.dataset.Dataset
504
+
505
+ **Example**:
506
+
507
+ .. code-block:: python
508
+
509
+ dataset = project.datasets.unlock(dataset='dataset_entity')
510
+ """
511
+ url_path = '/datasets/{}/unlock'.format(dataset.id)
512
+
513
+ success, response = self._client_api.gen_request(req_type='patch', path=url_path)
514
+ if success:
515
+ logger.info('Dataset was unlocked successfully')
516
+ return dataset
517
+ else:
518
+ raise exceptions.PlatformException(response)
519
+
520
+ @_api_reference.add(path='/datasets/{id}/directoryTree', method='get')
521
+ def directory_tree(self,
522
+ dataset: entities.Dataset = None,
523
+ dataset_name: str = None,
524
+ dataset_id: str = None):
525
+ """
526
+ Get dataset's directory tree.
527
+
528
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
529
+
530
+ You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
531
+
532
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
533
+ :param str dataset_name: The Name of the dataset
534
+ :param str dataset_id: The Id of the dataset
535
+ :return: DirectoryTree
536
+
537
+ **Example**:
538
+
539
+ .. code-block:: python
540
+ directory_tree = dataset.directory_tree
541
+ directory_tree = project.datasets.directory_tree(dataset='dataset_entity')
542
+ """
543
+ dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
544
+
545
+ url_path = '/datasets/{}/directoryTree'.format(dataset_id)
546
+
547
+ success, response = self._client_api.gen_request(req_type='get',
548
+ path=url_path)
549
+
550
+ if success:
551
+ return entities.DirectoryTree(_json=response.json())
552
+ else:
553
+ raise exceptions.PlatformException(response)
554
+
555
+ @_api_reference.add(path='/datasets/{id}/clone', method='post')
556
+ def clone(self,
557
+ dataset_id: str,
558
+ clone_name: str = None,
559
+ filters: entities.Filters = None,
560
+ with_items_annotations: bool = True,
561
+ with_metadata: bool = True,
562
+ with_task_annotations_status: bool = True,
563
+ dst_dataset_id: str = None,
564
+ target_directory: str = None):
565
+ """
566
+ Clone a dataset. Read more about cloning datasets and items in our `documentation <https://dataloop.ai/docs/clone-merge-dataset#cloned-dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_.
567
+
568
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
569
+
570
+ :param str dataset_id: id of the dataset you wish to clone
571
+ :param str clone_name: new dataset name
572
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a query dict
573
+ :param bool with_items_annotations: true to clone with items annotations
574
+ :param bool with_metadata: true to clone with metadata
575
+ :param bool with_task_annotations_status: true to clone with task annotations' status
576
+ :param str dst_dataset_id: destination dataset id
577
+ :param str target_directory: target directory
578
+ :return: dataset object
579
+ :rtype: dtlpy.entities.dataset.Dataset
580
+
581
+ **Example**:
582
+
583
+ .. code-block:: python
584
+
585
+ dataset = project.datasets.clone(dataset_id='dataset_id',
586
+ clone_name='dataset_clone_name',
587
+ with_metadata=True,
588
+ with_items_annotations=False,
589
+ with_task_annotations_status=False)
590
+ """
591
+ if clone_name is None and dst_dataset_id is None:
592
+ raise exceptions.PlatformException('400', 'Must provide clone name or destination dataset id')
593
+ if filters is None:
594
+ filters = entities.Filters()
595
+ filters._user_query = 'false'
596
+ elif not isinstance(filters, entities.Filters):
597
+ raise exceptions.PlatformException(
598
+ error='400',
599
+ message='"filters" must be a dl.Filters entity. got: {!r}'.format(type(filters)))
600
+
601
+ copy_filters = copy.deepcopy(filters)
602
+ if copy_filters.has_field('hidden'):
603
+ copy_filters.pop('hidden')
604
+
605
+ if target_directory is not None and not target_directory.startswith('/'):
606
+ target_directory = '/' + target_directory
607
+
608
+ payload = {
609
+ "name": clone_name,
610
+ "filter": copy_filters.prepare(),
611
+ "cloneDatasetParams": {
612
+ "withItemsAnnotations": with_items_annotations,
613
+ "withMetadata": with_metadata,
614
+ "withTaskAnnotationsStatus": with_task_annotations_status,
615
+ "targetDirectory": target_directory
616
+ }
617
+ }
618
+ if dst_dataset_id is not None:
619
+ payload['cloneDatasetParams']['targetDatasetId'] = dst_dataset_id
620
+ success, response = self._client_api.gen_request(req_type='post',
621
+ path='/datasets/{}/clone'.format(dataset_id),
622
+ json_req=payload,
623
+ headers={'user_query': filters._user_query})
624
+
625
+ if not success:
626
+ raise exceptions.PlatformException(response)
627
+
628
+ command = entities.Command.from_json(_json=response.json(),
629
+ client_api=self._client_api)
630
+ command = command.wait()
631
+
632
+ if 'returnedModelId' not in command.spec:
633
+ raise exceptions.PlatformException(error='400',
634
+ message="returnedModelId key is missing in command response: {!r}"
635
+ .format(response))
636
+ return self.get(dataset_id=command.spec['returnedModelId'])
637
+
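Beyond the docstring example, a hedged sketch of cloning a filtered subset into an existing dataset under a target directory; IDs and the folder are placeholders:

.. code-block:: python

    filters = dl.Filters()
    filters.add(field='dir', values='/train')  # placeholder folder

    cloned = project.datasets.clone(
        dataset_id='source_dataset_id',         # placeholder id
        dst_dataset_id='existing_dataset_id',   # placeholder id
        filters=filters,
        target_directory='/cloned-train',
        with_items_annotations=True,
    )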
638
+ def _export_recursive(
639
+ self,
640
+ dataset: entities.Dataset = None,
641
+ dataset_name: str = None,
642
+ dataset_id: str = None,
643
+ local_path: str = None,
644
+ filters: Union[dict, entities.Filters] = None,
645
+ annotation_filters: entities.Filters = None,
646
+ feature_vector_filters: entities.Filters = None,
647
+ include_feature_vectors: bool = False,
648
+ include_annotations: bool = False,
649
+ timeout: int = 0,
650
+ dataset_lock: bool = False,
651
+ lock_timeout_sec: int = None,
652
+ export_summary: bool = False,
653
+ max_items_per_subset: int = MAX_ITEMS_PER_SUBSET,
654
+ export_type: ExportType = ExportType.JSON,
655
+ output_export_type: OutputExportType = OutputExportType.JSON,
656
+ ) -> Generator[str, None, None]:
657
+ """
658
+ Export dataset items recursively by splitting large datasets into smaller subsets.
659
+
660
+ Args:
661
+ dataset (entities.Dataset, optional): Dataset entity to export
662
+ dataset_name (str, optional): Name of the dataset to export
663
+ dataset_id (str, optional): ID of the dataset to export
664
+ local_path (str, optional): Local path to save the exported data
665
+ filters (Union[dict, entities.Filters], optional): Filters to apply on the items
666
+ annotation_filters (entities.Filters, optional): Filters to apply on the annotations
667
+ feature_vector_filters (entities.Filters, optional): Filters to apply on the feature vectors
668
+ include_feature_vectors (bool, optional): Whether to include feature vectors in export. Defaults to False
669
+ include_annotations (bool, optional): Whether to include annotations in export. Defaults to False
670
+ timeout (int, optional): Timeout in seconds for the export operation. Defaults to 0
671
+ dataset_lock (bool, optional): Whether to lock the dataset during export. Defaults to False
672
+ lock_timeout_sec (int, optional): Timeout for dataset lock in seconds. Defaults to None
673
+ export_summary (bool, optional): Whether to include export summary. Defaults to False
674
+ max_items_per_subset (int, optional): Maximum items per subset for recursive export. Defaults to MAX_ITEMS_PER_SUBSET
675
+ export_type (ExportType, optional): Type of export (JSON or ZIP). Defaults to ExportType.JSON
676
+ output_export_type (OutputExportType, optional): Output format type. Defaults to OutputExportType.JSON
677
+
678
+ Returns:
679
+ Generator[str, None, None]: Generator yielding export paths
680
+
681
+ Raises:
682
+ NotImplementedError: If ZIP export type is used with JSON output type
683
+ exceptions.PlatformException: If API request fails or command response is invalid
684
+ """
685
+ logger.debug(f"exporting dataset with export_type {export_type} and output_export_type {output_export_type}")
686
+ if export_type == ExportType.ZIP and output_export_type == OutputExportType.JSON:
687
+ raise NotImplementedError(
688
+ "Zip export type is not supported for JSON output type.\n"
689
+ "If Json output is required, please use the export_type = JSON"
690
+ )
691
+
692
+ # Get dataset entity for recursive filtering
693
+ dataset_entity = self.get(dataset_id=self._resolve_dataset_id(dataset, dataset_name, dataset_id))
694
+ if export_type != ExportType.JSON:
695
+ filters_list = [filters]
696
+ else:
697
+ # Generate filter subsets using recursive_get_filters
698
+ filters_list = entities.Filters._get_split_filters(
699
+ dataset=dataset_entity, filters=filters, max_items=max_items_per_subset
700
+ )
701
+ # First loop: Make all API requests without waiting
702
+ commands = []
703
+ logger.debug("start making all API requests without waiting")
704
+ for filter_i in filters_list:
705
+ # Build payload for this subset
706
+ payload = self._build_payload(
707
+ filters=filter_i,
708
+ include_feature_vectors=include_feature_vectors,
709
+ include_annotations=include_annotations,
710
+ export_type=export_type,
711
+ annotation_filters=annotation_filters,
712
+ feature_vector_filters=feature_vector_filters,
713
+ dataset_lock=dataset_lock,
714
+ lock_timeout_sec=lock_timeout_sec,
715
+ export_summary=export_summary,
716
+ )
717
+
718
+ # Make API request for this subset
719
+ success, response = self._client_api.gen_request(
720
+ req_type='post', path=f'/datasets/{dataset_entity.id}/export', json_req=payload
721
+ )
722
+
723
+ if not success:
724
+ logger.error(f"failed to make API request /datasets/{dataset_entity.id}/export with payload {payload} response {response}")
725
+ raise exceptions.PlatformException(response)
726
+
727
+ # Handle command execution
728
+ commands.append(entities.Command.from_json(_json=response.json(), client_api=self._client_api))
729
+
730
+ time.sleep(2)  # the command reports wrong progress at the beginning
731
+ logger.debug("start waiting for all commands")
732
+ # Second loop: Wait for all commands and process results
733
+ for command in commands:
734
+ command = command.wait(timeout=timeout)
735
+
736
+ if 'outputItemId' not in command.spec:
737
+ raise exceptions.PlatformException(
738
+ error='400', message="outputItemId key is missing in command response"
739
+ )
740
+
741
+ item_id = command.spec['outputItemId']
742
+ # Download and process the exported item
743
+ yield self._download_exported_item(
744
+ item_id=item_id,
745
+ export_type=export_type,
746
+ local_path=local_path,
747
+ unzip=output_export_type != OutputExportType.ZIP,
748
+ )
749
+
750
+ @_api_reference.add(path='/datasets/{id}/export', method='post')
751
+ def export(
752
+ self,
753
+ dataset: entities.Dataset = None,
754
+ dataset_name: str = None,
755
+ dataset_id: str = None,
756
+ local_path: str = None,
757
+ filters: Union[dict, entities.Filters] = None,
758
+ annotation_filters: entities.Filters = None,
759
+ feature_vector_filters: entities.Filters = None,
760
+ include_feature_vectors: bool = False,
761
+ include_annotations: bool = False,
762
+ export_type: ExportType = ExportType.JSON,
763
+ timeout: int = 0,
764
+ dataset_lock: bool = False,
765
+ lock_timeout_sec: int = None,
766
+ export_summary: bool = False,
767
+ output_export_type: OutputExportType = None,
768
+ ) -> Optional[str]:
769
+ """
770
+ Export dataset items and annotations.
771
+
772
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
773
+
774
+ You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
775
+
776
+ **Export Behavior by Parameter Combination:**
777
+
778
+ The behavior of this method depends on the combination of `export_type` and `output_export_type`:
779
+
780
+ **When export_type = ExportType.JSON:**
781
+
782
+ - **output_export_type = OutputExportType.JSON (default when None):**
783
+ - Exports data in JSON format, split into subsets of at most MAX_ITEMS_PER_SUBSET items each
784
+ - Downloads all subset JSON files and concatenates them into a single `result.json` file
785
+ - Returns the path to the concatenated JSON file
786
+ - Cleans up individual subset files after concatenation
787
+
788
+ - **output_export_type = OutputExportType.ZIP:**
789
+ - Same as JSON export, but zips the final `result.json` file
790
+ - Returns the path to the zipped file (`result.json.zip`)
791
+ - Cleans up the unzipped JSON file after zipping
792
+
793
+ - **output_export_type = OutputExportType.FOLDERS:**
794
+ - Exports data in JSON format, split into subsets of at most MAX_ITEMS_PER_SUBSET items each
795
+ - Downloads all subset JSON files and creates individual JSON files for each item
796
+ - Creates a folder structure mirroring the remote dataset structure
797
+ - Returns the path to the base directory containing the folder structure
798
+ - Each item gets its own JSON file named after the original filename
799
+
800
+ **When export_type = ExportType.ZIP:**
801
+
802
+ - **output_export_type = OutputExportType.ZIP:**
803
+ - Exports data as a ZIP file containing the dataset
804
+ - Returns the downloaded ZIP item directly
805
+ - No additional processing or concatenation
806
+
807
+ - **output_export_type = OutputExportType.JSON:**
808
+ - **NOT SUPPORTED** - Raises NotImplementedError
809
+ - Use export_type=ExportType.JSON instead for JSON output
810
+
811
+ - **output_export_type = OutputExportType.FOLDERS:**
812
+ - **NOT SUPPORTED** - Raises NotImplementedError
813
+ - Use export_type=ExportType.JSON instead for folder output
814
+
815
+ **When output_export_type = None (legacy behavior):**
816
+ - Defaults to OutputExportType.JSON
817
+ - Maintains backward compatibility with existing code
818
+
819
+ :param dtlpy.entities.dataset.Dataset dataset: Dataset object
820
+ :param str dataset_name: The name of the dataset
821
+ :param str dataset_id: The ID of the dataset
822
+ :param str local_path: Local path to save the exported dataset
823
+ :param Union[dict, dtlpy.entities.filters.Filters] filters: Filters entity or a query dictionary
824
+ :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
825
+ :param dtlpy.entities.filters.Filters feature_vector_filters: Filters entity to filter feature vectors for export
826
+ :param bool include_feature_vectors: Include item feature vectors in the export
827
+ :param bool include_annotations: Include item annotations in the export
828
+ :param bool dataset_lock: Make dataset readonly during the export
829
+ :param bool export_summary: Get Summary of the dataset export
830
+ :param int lock_timeout_sec: Timeout for locking the dataset during export in seconds
831
+ :param entities.ExportType export_type: Type of export ('json' or 'zip')
832
+ :param entities.OutputExportType output_export_type: Output format ('json', 'zip', or 'folders'). If None, defaults to 'json'
833
+ :param int timeout: Maximum time in seconds to wait for the export to complete
834
+ :return: Path to exported file/directory, or None if export result is empty
835
+ :rtype: Optional[str]
836
+ """
837
+ export_result = list(
838
+ self._export_recursive(
839
+ dataset=dataset,
840
+ dataset_name=dataset_name,
841
+ dataset_id=dataset_id,
842
+ local_path=local_path,
843
+ filters=filters,
844
+ annotation_filters=annotation_filters,
845
+ feature_vector_filters=feature_vector_filters,
846
+ include_feature_vectors=include_feature_vectors,
847
+ include_annotations=include_annotations,
848
+ timeout=timeout,
849
+ dataset_lock=dataset_lock,
850
+ lock_timeout_sec=lock_timeout_sec,
851
+ export_summary=export_summary,
852
+ export_type=export_type,
853
+ output_export_type=output_export_type,
854
+ )
855
+ )
856
+ if all(x is None for x in export_result):
857
+ logger.error("export result is empty")
858
+ return None
859
+
860
+ if export_type == ExportType.ZIP:
861
+ # if export type is zip, then return the _export_recursive result as is
862
+ return export_result[0]
863
+
864
+ # if user didn't provide output_export_type, keep the previous behavior
865
+ if output_export_type is None:
866
+ output_export_type = OutputExportType.JSON
867
+
868
+ # export type is json:
869
+ # Load all items from subset JSON files and clean them up
870
+ all_items = []
871
+ logger.debug("start loading all items from subset JSON files")
872
+ for json_file in export_result:
873
+ if json_file is None:
874
+ continue
875
+ if os.path.isfile(json_file):
876
+ with open(json_file, 'r') as f:
877
+ items = json.load(f)
878
+ if isinstance(items, list):
879
+ all_items.extend(items)
880
+ os.remove(json_file)
881
+
882
+ base_dir = os.path.dirname(export_result[0])
883
+ if output_export_type != OutputExportType.FOLDERS:
884
+ dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
885
+ result_file_name = f"{dataset_id}.json"
886
+ result_file = os.path.join(base_dir, result_file_name)
887
+ logger.debug(f"start writing all items to result file {result_file}")
888
+ with open(result_file, 'w') as f:
889
+ json.dump(all_items, f)
890
+ if output_export_type == OutputExportType.ZIP:
891
+ # Zip the result file
892
+ zip_filename = result_file + '.zip'
893
+ # Create zip file
894
+ logger.debug(f"start zipping result file {zip_filename}")
895
+ with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
896
+ zf.write(result_file, arcname=os.path.basename(result_file))
897
+
898
+ # Remove original json after zipping
899
+ os.remove(result_file)
900
+ result_file = zip_filename
901
+ return result_file
902
+ logger.debug("start building per-item JSON files under local_path mirroring remote structure")
903
+ # Build per-item JSON files under local_path mirroring remote structure
904
+ for item in all_items:
905
+ rel_json_path = os.path.splitext(item.get('filename'))[0] + '.json'
906
+ # Remove leading slash to make it a relative path
907
+ if rel_json_path.startswith('/'):
908
+ rel_json_path = rel_json_path[1:]
909
+ out_path = os.path.join(base_dir, rel_json_path)
910
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
911
+ try:
912
+ with open(out_path, 'w') as outf:
913
+ json.dump(item, outf)
914
+ except Exception:
915
+ logger.exception(f'Failed writing export item JSON to {out_path}')
916
+ logger.debug("end building per-item JSON files under local_path mirroring remote structure")
917
+ return base_dir
918
+
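Since the docstring above has no example block, here is a hedged usage sketch of the two main combinations; paths are placeholders, and `dl.ExportType`/`dl.OutputExportType` are assumed to be the exposed aliases of the enums imported at the top of this module:

.. code-block:: python

    # Single concatenated JSON file with annotations
    json_path = project.datasets.export(
        dataset_id=dataset.id,
        include_annotations=True,
        export_type=dl.ExportType.JSON,
        output_export_type=dl.OutputExportType.JSON,
        local_path='/tmp/exports',  # placeholder path
    )

    # One ZIP archive, no client-side concatenation
    zip_path = project.datasets.export(
        dataset_id=dataset.id,
        include_annotations=True,
        export_type=dl.ExportType.ZIP,
        output_export_type=dl.OutputExportType.ZIP,
    )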
919
+ @_api_reference.add(path='/datasets/merge', method='post')
920
+ def merge(self,
921
+ merge_name: str,
922
+ dataset_ids: list,
923
+ project_ids: str,
924
+ with_items_annotations: bool = True,
925
+ with_metadata: bool = True,
926
+ with_task_annotations_status: bool = True,
927
+ wait: bool = True):
928
+ """
929
+ Merge a dataset. See our `SDK docs <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_ for more information.
930
+
931
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
932
+
933
+ :param str merge_name: new dataset name
934
+ :param list dataset_ids: list of ids of the datasets you wish to merge
935
+ :param str project_ids: the project id that includes the datasets
936
+ :param bool with_items_annotations: true to merge with items annotations
937
+ :param bool with_metadata: true to merge with metadata
938
+ :param bool with_task_annotations_status: true to merge with task annotations' status
939
+ :param bool wait: wait for the command to finish
940
+ :return: True if success
941
+ :rtype: bool
942
+
943
+ **Example**:
944
+
945
+ .. code-block:: python
946
+
947
+ success = project.datasets.merge(dataset_ids=['dataset_id1','dataset_id2'],
948
+ merge_name='dataset_merge_name',
949
+ with_metadata=True,
950
+ with_items_annotations=False,
951
+ with_task_annotations_status=False)
952
+ """
953
+ payload = {
954
+ "name": merge_name,
955
+ "datasetsIds": dataset_ids,
956
+ "projectIds": project_ids,
957
+ "mergeDatasetParams": {
958
+ "withItemsAnnotations": with_items_annotations,
959
+ "withMetadata": with_metadata,
960
+ "withTaskAnnotationsStatus": with_task_annotations_status
961
+ },
962
+ 'asynced': wait
963
+ }
964
+ success, response = self._client_api.gen_request(req_type='post',
965
+ path='/datasets/merge',
966
+ json_req=payload)
967
+
968
+ if success:
969
+ command = entities.Command.from_json(_json=response.json(),
970
+ client_api=self._client_api)
971
+ if not wait:
972
+ return command
973
+ command = command.wait(timeout=0)
974
+ if 'mergeDatasetsConfiguration' not in command.spec:
975
+ raise exceptions.PlatformException(error='400',
976
+ message="mergeDatasetsConfiguration key is missing in command response: {}"
977
+ .format(response))
978
+ return True
979
+ else:
980
+ raise exceptions.PlatformException(response)
981
+
982
+ @_api_reference.add(path='/datasets/{id}/sync', method='post')
983
+ def sync(self, dataset_id: str, wait: bool = True):
984
+ """
985
+ Sync dataset with external storage.
986
+
987
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
988
+
989
+ :param str dataset_id: The Id of the dataset to sync
990
+ :param bool wait: wait for the command to finish
991
+ :return: True if success
992
+ :rtype: bool
993
+
994
+ **Example**:
995
+
996
+ .. code-block:: python
997
+
998
+ success = project.datasets.sync(dataset_id='dataset_id')
999
+ """
1000
+
1001
+ success, response = self._client_api.gen_request(req_type='post',
1002
+ path='/datasets/{}/sync'.format(dataset_id))
1003
+
1004
+ if success:
1005
+ command = entities.Command.from_json(_json=response.json(),
1006
+ client_api=self._client_api)
1007
+ if not wait:
1008
+ return command
1009
+ command = command.wait(timeout=0)
1010
+ if 'datasetId' not in command.spec:
1011
+ raise exceptions.PlatformException(error='400',
1012
+ message="datasetId key is missing in command response: {}"
1013
+ .format(response))
1014
+ return True
1015
+ else:
1016
+ raise exceptions.PlatformException(response)
1017
+
1018
+ @_api_reference.add(path='/datasets', method='post')
1019
+ def create(self,
1020
+ dataset_name: str,
1021
+ labels=None,
1022
+ attributes=None,
1023
+ ontology_ids=None,
1024
+ driver: entities.Driver = None,
1025
+ driver_id: str = None,
1026
+ checkout: bool = False,
1027
+ expiration_options: entities.ExpirationOptions = None,
1028
+ index_driver: entities.IndexDriver = None,
1029
+ recipe_id: str = None
1030
+ ) -> entities.Dataset:
1031
+ """
1032
+ Create a new dataset
1033
+
1034
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
1035
+
1036
+ :param str dataset_name: The Name of the dataset
1037
+ :param list labels: dictionary of {tag: color} or list of label entities
1038
+ :param list attributes: dataset's ontology's attributes
1039
+ :param list ontology_ids: optional - dataset ontology
1040
+ :param dtlpy.entities.driver.Driver driver: optional - storage driver Driver object or driver name
1041
+ :param str driver_id: optional - driver id
1042
+ :param bool checkout: set the dataset as a default dataset object (cookies)
1043
+ :param ExpirationOptions expiration_options: dl.ExpirationOptions object that contains definitions for the dataset, such as MaxItemDays
1044
+ :param str index_driver: dl.IndexDriver, dataset driver version
1045
+ :param str recipe_id: optional - recipe id
1046
+ :return: Dataset object
1047
+ :rtype: dtlpy.entities.dataset.Dataset
1048
+
1049
+ **Example**:
1050
+
1051
+ .. code-block:: python
1052
+
1053
+ dataset = project.datasets.create(dataset_name='dataset_name', ontology_ids='ontology_ids')
1054
+ """
1055
+ create_default_recipe = True
1056
+ if any([labels, attributes, ontology_ids, recipe_id]):
1057
+ create_default_recipe = False
1058
+
1059
+ # labels to list
1060
+ if labels is not None:
1061
+ if not isinstance(labels, list):
1062
+ labels = [labels]
1063
+ if not all(isinstance(label, entities.Label) for label in labels):
1064
+ labels = entities.Dataset.serialize_labels(labels)
1065
+ else:
1066
+ labels = list()
1067
+
1068
+ # get creator from token
1069
+ payload = {'name': dataset_name,
1070
+ 'projects': [self.project.id],
1071
+ 'createDefaultRecipe': create_default_recipe
1072
+ }
1073
+
1074
+ if driver_id is None and driver is not None:
1075
+ if isinstance(driver, entities.Driver):
1076
+ driver_id = driver.id
1077
+ elif isinstance(driver, str):
1078
+ driver_id = self.project.drivers.get(driver_name=driver).id
1079
+ else:
1080
+ raise exceptions.PlatformException(
1081
+ error=400,
1082
+ message='Input arg "driver" must be Driver object or a string driver name. got type: {!r}'.format(
1083
+ type(driver)))
1084
+ if driver_id is not None:
1085
+ payload['driver'] = driver_id
1086
+
1087
+ if expiration_options:
1088
+ payload['expirationOptions'] = expiration_options.to_json()
1089
+ if index_driver is not None:
1090
+ payload['indexDriver'] = index_driver
1091
+
1092
+ success, response = self._client_api.gen_request(req_type='post',
1093
+ path='/datasets',
1094
+ json_req=payload)
1095
+ if success:
1096
+ dataset = entities.Dataset.from_json(client_api=self._client_api,
1097
+ _json=response.json(),
1098
+ datasets=self,
1099
+ project=self.project)
1100
+ # create ontology and recipe
1101
+ if not create_default_recipe:
1102
+ if recipe_id is not None:
1103
+ dataset.switch_recipe(recipe_id=recipe_id)
1104
+ else:
1105
+ dataset = dataset.recipes.create(ontology_ids=ontology_ids,
1106
+ labels=labels,
1107
+ attributes=attributes).dataset
1108
+ else:
1109
+ raise exceptions.PlatformException(response)
1110
+ logger.info('Dataset was created successfully. Dataset id: {!r}'.format(dataset.id))
1111
+ assert isinstance(dataset, entities.Dataset)
1112
+ if checkout:
1113
+ self.checkout(dataset=dataset)
1114
+ return dataset
1115
+
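A hedged sketch of creating a dataset with an initial label map, following the `{tag: color}` form the docstring describes; names and colors are placeholders:

.. code-block:: python

    dataset = project.datasets.create(
        dataset_name='my-new-dataset',                # placeholder name
        labels={'cat': '#00ff00', 'dog': '#ff0000'},  # {tag: color} per the docstring
        checkout=True,
    )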
1116
+ @staticmethod
1117
+ def _convert_single(downloader,
1118
+ item,
1119
+ img_filepath,
1120
+ local_path,
1121
+ overwrite,
1122
+ annotation_options,
1123
+ annotation_filters,
1124
+ thickness,
1125
+ with_text,
1126
+ progress,
1127
+ alpha,
1128
+ export_version):
1129
+ # this is to convert the downloaded json files to any other annotation type
1130
+ try:
1131
+ if entities.ViewAnnotationOptions.ANNOTATION_ON_IMAGE in annotation_options:
1132
+ if img_filepath is None:
1133
+ img_filepath = item.download()
1134
+ downloader._download_img_annotations(item=item,
1135
+ img_filepath=img_filepath,
1136
+ local_path=local_path,
1137
+ overwrite=overwrite,
1138
+ annotation_options=annotation_options,
1139
+ annotation_filters=annotation_filters,
1140
+ thickness=thickness,
1141
+ alpha=alpha,
1142
+ with_text=with_text,
1143
+ export_version=export_version
1144
+ )
1145
+ except Exception:
1146
+ logger.error('Failed to download annotation for item: {!r}'.format(item.name))
1147
+ progress.update()
1148
+
1149
+ @staticmethod
1150
+ def download_annotations(dataset: entities.Dataset,
1151
+ local_path: str = None,
1152
+ filters: entities.Filters = None,
1153
+ annotation_options: entities.ViewAnnotationOptions = None,
1154
+ annotation_filters: entities.Filters = None,
1155
+ overwrite: bool = False,
1156
+ thickness: int = 1,
1157
+ with_text: bool = False,
1158
+ remote_path: str = None,
1159
+ include_annotations_in_output: bool = True,
1160
+ export_png_files: bool = False,
1161
+ filter_output_annotations: bool = False,
1162
+ alpha: float = None,
1163
+ export_version=entities.ExportVersion.V1,
1164
+ dataset_lock: bool = False,
1165
+ lock_timeout_sec: int = None,
1166
+ export_summary: bool = False,
1167
+ ) -> str:
1168
+ """
1169
+ Download dataset's annotations by filters.
1170
+
1171
+ You may filter the dataset both for items and for annotations and download annotations.
1172
+
1173
+ Optional -- download annotations as: mask, instance, image mask of the item.
1174
+
1175
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
1176
+
1177
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
1178
+ :param str local_path: local folder or filename to save to.
1179
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
1180
+ :param list annotation_options: type of download annotations: list(dl.ViewAnnotationOptions)
1181
+ :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for download
1182
+ :param bool overwrite: optional - default = False to overwrite the existing files
1183
+ :param bool dataset_lock: optional - default = False, lock the dataset (make it readonly) during the export
1184
+ :param int thickness: optional - line thickness, if -1 annotation will be filled, default =1
1185
+ :param bool with_text: optional - add text to annotations, default = False
1186
+ :param str remote_path: DEPRECATED and ignored
1187
+ :param bool include_annotations_in_output: default - True, whether the export should contain annotations
1188
+ :param bool export_png_files: default - False, if True, semantic annotations will also be exported as png files
1189
+ :param bool filter_output_annotations: default - False, given an export by filter, whether to filter the output annotations as well
1190
+ :param float alpha: opacity value [0 1], default 1
1191
+ :param str export_version: exported items will have original extension in filename, `V1` - no original extension in filenames
1192
+ :param bool export_summary: optional - default = False, get a summary of the dataset export
1193
+ :param int lock_timeout_sec: optional - timeout for the dataset lock in seconds
1194
+ :return: local_path of the directory where all the items were downloaded
1195
+ :rtype: str
1197
+
1198
+ **Example**:
1199
+
1200
+ .. code-block:: python
1201
+
1202
+ file_path = project.datasets.download_annotations(dataset='dataset_entity',
1203
+ local_path='local_path',
1204
+ annotation_options=dl.ViewAnnotationOptions,
1205
+ overwrite=False,
1206
+ thickness=1,
1207
+ with_text=False,
1208
+ alpha=1,
1209
+ dataset_lock=False,
1210
+ lock_timeout_sec=300,
1211
+ export_summary=False
1212
+ )
1213
+ """
1214
+ if annotation_options is None:
1215
+ annotation_options = list()
1216
+ elif not isinstance(annotation_options, list):
1217
+ annotation_options = [annotation_options]
1218
+ for ann_option in annotation_options:
1219
+ if not isinstance(ann_option, entities.ViewAnnotationOptions):
1220
+ if ann_option not in list(entities.ViewAnnotationOptions):
1221
+ raise PlatformException(
1222
+ error='400',
1223
+ message='Unknown annotation download option: {}, please choose from: {}'.format(
1224
+ ann_option, list(entities.ViewAnnotationOptions)))
1225
+
1226
+ if remote_path is not None:
1227
+ logger.warning(
1228
+ '"remote_path" is ignored. Use "filters=dl.Filters(field="dir, values={!r}"'.format(remote_path))
1229
+ if local_path is None:
1230
+ if dataset.project is None:
1231
+ # by dataset name
1232
+ local_path = os.path.join(
1233
+ services.service_defaults.DATALOOP_PATH,
1234
+ "datasets",
1235
+ "{}_{}".format(dataset.name, dataset.id),
1236
+ )
1237
+ else:
1238
+ # by dataset and project name
1239
+ local_path = os.path.join(
1240
+ services.service_defaults.DATALOOP_PATH,
1241
+ "projects",
1242
+ dataset.project.name,
1243
+ "datasets",
1244
+ dataset.name,
1245
+ )
1246
+
1247
+ if filters is None:
1248
+ filters = entities.Filters()
1249
+ filters._user_query = 'false'
1250
+ if annotation_filters is not None:
1251
+ for annotation_filter_and in annotation_filters.and_filter_list:
1252
+ filters.add_join(field=annotation_filter_and.field,
1253
+ values=annotation_filter_and.values,
1254
+ operator=annotation_filter_and.operator,
1255
+ method=entities.FiltersMethod.AND)
1256
+ for annotation_filter_or in annotation_filters.or_filter_list:
1257
+ filters.add_join(field=annotation_filter_or.field,
1258
+ values=annotation_filter_or.values,
1259
+ operator=annotation_filter_or.operator,
1260
+ method=entities.FiltersMethod.OR)
1261
+
+ downloader = repositories.Downloader(items_repository=dataset.items)
+ downloader.download_annotations(dataset=dataset,
+ filters=filters,
+ annotation_filters=annotation_filters,
+ local_path=local_path,
+ overwrite=overwrite,
+ include_annotations_in_output=include_annotations_in_output,
+ export_png_files=export_png_files,
+ filter_output_annotations=filter_output_annotations,
+ export_version=export_version,
+ dataset_lock=dataset_lock,
+ lock_timeout_sec=lock_timeout_sec,
+ export_summary=export_summary
+ )
+ if annotation_options:
+ pages = dataset.items.list(filters=filters)
+ if not isinstance(annotation_options, list):
+ annotation_options = [annotation_options]
+ # convert all annotations to annotation_options
+ pool = dataset._client_api.thread_pools(pool_name='dataset.download')
+ jobs = [None for _ in range(pages.items_count)]
+ progress = tqdm.tqdm(total=pages.items_count,
+ disable=dataset._client_api.verbose.disable_progress_bar_download_annotations,
+ file=sys.stdout, desc='Download Annotations')
+ i_item = 0
+ for page in pages:
+ for item in page:
+ jobs[i_item] = pool.submit(
+ Datasets._convert_single,
+ **{
+ 'downloader': downloader,
+ 'item': item,
+ 'img_filepath': None,
+ 'local_path': local_path,
+ 'overwrite': overwrite,
+ 'annotation_options': annotation_options,
+ 'annotation_filters': annotation_filters,
+ 'thickness': thickness,
+ 'with_text': with_text,
+ 'progress': progress,
+ 'alpha': alpha,
+ 'export_version': export_version
+ }
+ )
+ i_item += 1
+ # get all results
+ _ = [j.result() for j in jobs]
+ progress.close()
+ return local_path
+
+ def _upload_single_item_annotation(self, item, file, pbar):
+ try:
+ item.annotations.upload(file)
+ except Exception as err:
+ raise err
+ finally:
+ pbar.update()
+
+ def upload_annotations(self,
+ dataset,
+ local_path,
+ filters: entities.Filters = None,
+ clean=False,
+ remote_root_path='/',
+ export_version=entities.ExportVersion.V1
+ ):
+ """
+ Upload annotations to dataset.
+
+ Example for remote_root_path: if the item filepath is "/a/b/item" and remote_root_path is "/a", the matching local annotation path starts from folder "b" instead of "a"
+
+ **Prerequisites**: You must have a dataset with items that are related to the annotations. The annotation files are matched to the dataset items by their file paths. You must be in the role of an *owner* or *developer*.
+
+ :param dtlpy.entities.dataset.Dataset dataset: dataset to upload to
+ :param str local_path: local folder where the annotation files are stored
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
+ :param bool clean: True to remove the old annotations
+ :param str remote_root_path: the remote root path to match remote and local items
+ :param str export_version: `V2` - exported items will have the original extension in the filename, `V1` - no original extension in filenames
+
+ **Example**:
+
+ .. code-block:: python
+
+ project.datasets.upload_annotations(dataset='dataset_entity',
+ local_path='local_path',
+ clean=False,
+ export_version=dl.ExportVersion.V1
+ )
+ """
+ if filters is None:
+ filters = entities.Filters()
+ filters._user_query = 'false'
+ pages = dataset.items.list(filters=filters)
+ total_items = pages.items_count
+ pbar = tqdm.tqdm(total=total_items, disable=dataset._client_api.verbose.disable_progress_bar_upload_annotations,
+ file=sys.stdout, desc='Upload Annotations')
+ pool = self._client_api.thread_pools('annotation.upload')
+ annotations_uploaded_count = 0
+ for item in pages.all():
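+ # derive the expected local annotation JSON path: V1 replaces the item's extension with .json, V2 appends .json to the full filename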
+ if export_version == entities.ExportVersion.V1:
+ _, ext = os.path.splitext(item.filename)
+ filepath = item.filename.replace(ext, '.json')
+ else:
+ filepath = item.filename + '.json'
+ # make the file path relative to remote_root_path, ignoring the remote folder hierarchy above it
+ filepath = os.path.relpath(filepath, remote_root_path)
+ json_file = os.path.join(local_path, filepath)
+ if not os.path.isfile(json_file):
+ pbar.update()
+ continue
+ annotations_uploaded_count += 1
+ if item.annotated and clean:
+ item.annotations.delete(filters=entities.Filters(resource=entities.FiltersResource.ANNOTATION))
+ pool.submit(self._upload_single_item_annotation, **{'item': item,
+ 'file': json_file,
+ 'pbar': pbar})
+ pool.shutdown()
+ if annotations_uploaded_count == 0:
+ logger.warning(msg="No annotations were uploaded to the dataset!")
+ else:
+ logger.info(msg='Found and uploaded {} annotations.'.format(annotations_uploaded_count))
+
+ def set_readonly(self, state: bool, dataset: entities.Dataset):
+ """
+ Set dataset readonly mode.
+
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
+
+ :param bool state: state to update readonly mode
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
+
+ **Example**:
+
+ .. code-block:: python
+
+ project.datasets.set_readonly(dataset='dataset_entity', state=True)
+ """
+ import warnings
+ warnings.warn("`readonly` flag on dataset is deprecated, doing nothing.", DeprecationWarning)
+
+ @_api_reference.add(path='/datasets/{id}/split', method='post')
+ def split_ml_subsets(self,
+ dataset_id: str,
+ items_query: entities.Filters,
+ ml_split_list: dict) -> bool:
+ """
+ Split dataset items into ML subsets.
+
+ :param str dataset_id: The ID of the dataset.
+ :param dtlpy.entities.filters.Filters items_query: Filters entity selecting the items to split.
+ :param dict ml_split_list: Dictionary with 'train', 'validation', 'test' keys and integer percentages.
+ :return: True if the split operation was successful.
+ :rtype: bool
+ :raises: PlatformException on failure, and ValueError if the percentages do not sum to 100 or the keys/values are invalid.
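+
+ **Example** (a minimal usage sketch; the filter and the percentages shown are illustrative):
+
+ .. code-block:: python
+
+ filters = dl.Filters(field='dir', values='/my-folder')
+ project.datasets.split_ml_subsets(dataset_id='dataset_id',
+ items_query=filters,
+ ml_split_list={'train': 70, 'validation': 15, 'test': 15})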
+ """
+ # Validate percentages
+ if not ml_split_list:
+ ml_split_list = {'train': 80, 'validation': 10, 'test': 10}
+
+ if not items_query:
+ items_query = entities.Filters()
+
+ items_query_dict = items_query.prepare()
+ required_keys = {'train', 'validation', 'test'}
+ if set(ml_split_list.keys()) != required_keys:
+ raise ValueError("MLSplitList must have exactly the keys 'train', 'validation', 'test'.")
+ total = sum(ml_split_list.values())
+ if total != 100:
+ raise ValueError(
+ "Please set the Train, Validation, and Test subsets percentages to add up to 100%. "
+ "For example: 70, 15, 15."
+ )
+ for key, value in ml_split_list.items():
+ if not isinstance(value, int) or value < 0:
+ raise ValueError("Percentages must be integers >= 0.")
+ payload = {
+ 'itemsQuery': items_query_dict,
+ 'MLSplitList': ml_split_list
+ }
+ path = f'/datasets/{dataset_id}/split'
+ success, response = self._client_api.gen_request(req_type='post',
+ path=path,
+ json_req=payload)
+ if success:
+ # Wait for the split operation to complete
+ command = entities.Command.from_json(_json=response.json(),
+ client_api=self._client_api)
+ command.wait()
+ return True
+ else:
+ raise exceptions.PlatformException(response)
+
+ @_api_reference.add(path='/datasets/{id}/items/bulk-update-metadata', method='post')
+ def bulk_update_ml_subset(self, dataset_id: str, items_query: entities.Filters, subset: str = None, deleteTag: bool = False) -> bool:
+ """
+ Bulk update the ML subset assignment of the selected items.
+ If deleteTag is True, the ML subset tags are removed; otherwise the specified subset is assigned.
+
+ :param str dataset_id: ID of the dataset
+ :param dtlpy.entities.filters.Filters items_query: Filters entity selecting the items to update
+ :param str subset: 'train', 'validation', 'test', or None (with deleteTag=True) to remove all subset tags
+ :param bool deleteTag: if True, remove the ML subset tags from the selected items
+ :return: True if success
+ :rtype: bool
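+
+ **Example** (a minimal usage sketch; the filter shown is illustrative):
+
+ .. code-block:: python
+
+ filters = dl.Filters(field='dir', values='/my-folder')
+ project.datasets.bulk_update_ml_subset(dataset_id='dataset_id',
+ items_query=filters,
+ subset='validation')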
+ """
+ if items_query is None:
+ items_query = entities.Filters()
+ items_query_dict = items_query.prepare()
+ if not deleteTag and subset not in ['train', 'validation', 'test']:
+ raise ValueError("subset must be one of: 'train', 'validation', 'test'")
+ # Determine tag values based on subset
+ tags = {
+ 'train': True if subset == 'train' else None,
+ 'validation': True if subset == 'validation' else None,
+ 'test': True if subset == 'test' else None
+ }
+
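+ # unselected subsets are sent as None; with deleteTag=True all three are None, which removes the ML subset assignment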
+ payload = {
+ "query": items_query_dict,
+ "updateQuery": {
+ "update": {
+ "metadata": {
+ "system": {
+ "tags": tags
+ }
+ }
+ },
+ "systemSpace": True
+ }
+ }
+
+ success, response = self._client_api.gen_request(
+ req_type='post',
+ path=f'/datasets/{dataset_id}/items/bulk-update-metadata',
+ json_req=payload
+ )
+ if success:
+ # As with the split operation, a command is returned and awaited
+ command = entities.Command.from_json(_json=response.json(), client_api=self._client_api)
+ command.wait()
+ return True
+ else:
+ raise exceptions.PlatformException(response)