dtlpy 1.115.44__py3-none-any.whl → 1.117.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. dtlpy/__init__.py +491 -491
  2. dtlpy/__version__.py +1 -1
  3. dtlpy/assets/__init__.py +26 -26
  4. dtlpy/assets/code_server/config.yaml +2 -2
  5. dtlpy/assets/code_server/installation.sh +24 -24
  6. dtlpy/assets/code_server/launch.json +13 -13
  7. dtlpy/assets/code_server/settings.json +2 -2
  8. dtlpy/assets/main.py +53 -53
  9. dtlpy/assets/main_partial.py +18 -18
  10. dtlpy/assets/mock.json +11 -11
  11. dtlpy/assets/model_adapter.py +83 -83
  12. dtlpy/assets/package.json +61 -61
  13. dtlpy/assets/package_catalog.json +29 -29
  14. dtlpy/assets/package_gitignore +307 -307
  15. dtlpy/assets/service_runners/__init__.py +33 -33
  16. dtlpy/assets/service_runners/converter.py +96 -96
  17. dtlpy/assets/service_runners/multi_method.py +49 -49
  18. dtlpy/assets/service_runners/multi_method_annotation.py +54 -54
  19. dtlpy/assets/service_runners/multi_method_dataset.py +55 -55
  20. dtlpy/assets/service_runners/multi_method_item.py +52 -52
  21. dtlpy/assets/service_runners/multi_method_json.py +52 -52
  22. dtlpy/assets/service_runners/single_method.py +37 -37
  23. dtlpy/assets/service_runners/single_method_annotation.py +43 -43
  24. dtlpy/assets/service_runners/single_method_dataset.py +43 -43
  25. dtlpy/assets/service_runners/single_method_item.py +41 -41
  26. dtlpy/assets/service_runners/single_method_json.py +42 -42
  27. dtlpy/assets/service_runners/single_method_multi_input.py +45 -45
  28. dtlpy/assets/voc_annotation_template.xml +23 -23
  29. dtlpy/caches/base_cache.py +32 -32
  30. dtlpy/caches/cache.py +473 -473
  31. dtlpy/caches/dl_cache.py +201 -201
  32. dtlpy/caches/filesystem_cache.py +89 -89
  33. dtlpy/caches/redis_cache.py +84 -84
  34. dtlpy/dlp/__init__.py +20 -20
  35. dtlpy/dlp/cli_utilities.py +367 -367
  36. dtlpy/dlp/command_executor.py +764 -764
  37. dtlpy/dlp/dlp +1 -1
  38. dtlpy/dlp/dlp.bat +1 -1
  39. dtlpy/dlp/dlp.py +128 -128
  40. dtlpy/dlp/parser.py +651 -651
  41. dtlpy/entities/__init__.py +83 -83
  42. dtlpy/entities/analytic.py +347 -347
  43. dtlpy/entities/annotation.py +1879 -1879
  44. dtlpy/entities/annotation_collection.py +699 -699
  45. dtlpy/entities/annotation_definitions/__init__.py +20 -20
  46. dtlpy/entities/annotation_definitions/base_annotation_definition.py +100 -100
  47. dtlpy/entities/annotation_definitions/box.py +195 -195
  48. dtlpy/entities/annotation_definitions/classification.py +67 -67
  49. dtlpy/entities/annotation_definitions/comparison.py +72 -72
  50. dtlpy/entities/annotation_definitions/cube.py +204 -204
  51. dtlpy/entities/annotation_definitions/cube_3d.py +149 -149
  52. dtlpy/entities/annotation_definitions/description.py +32 -32
  53. dtlpy/entities/annotation_definitions/ellipse.py +124 -124
  54. dtlpy/entities/annotation_definitions/free_text.py +62 -62
  55. dtlpy/entities/annotation_definitions/gis.py +69 -69
  56. dtlpy/entities/annotation_definitions/note.py +139 -139
  57. dtlpy/entities/annotation_definitions/point.py +117 -117
  58. dtlpy/entities/annotation_definitions/polygon.py +182 -182
  59. dtlpy/entities/annotation_definitions/polyline.py +111 -111
  60. dtlpy/entities/annotation_definitions/pose.py +92 -92
  61. dtlpy/entities/annotation_definitions/ref_image.py +86 -86
  62. dtlpy/entities/annotation_definitions/segmentation.py +240 -240
  63. dtlpy/entities/annotation_definitions/subtitle.py +34 -34
  64. dtlpy/entities/annotation_definitions/text.py +85 -85
  65. dtlpy/entities/annotation_definitions/undefined_annotation.py +74 -74
  66. dtlpy/entities/app.py +220 -220
  67. dtlpy/entities/app_module.py +107 -107
  68. dtlpy/entities/artifact.py +174 -174
  69. dtlpy/entities/assignment.py +399 -399
  70. dtlpy/entities/base_entity.py +214 -214
  71. dtlpy/entities/bot.py +113 -113
  72. dtlpy/entities/codebase.py +292 -292
  73. dtlpy/entities/collection.py +38 -38
  74. dtlpy/entities/command.py +169 -169
  75. dtlpy/entities/compute.py +449 -449
  76. dtlpy/entities/dataset.py +1299 -1299
  77. dtlpy/entities/directory_tree.py +44 -44
  78. dtlpy/entities/dpk.py +470 -470
  79. dtlpy/entities/driver.py +235 -235
  80. dtlpy/entities/execution.py +397 -397
  81. dtlpy/entities/feature.py +124 -124
  82. dtlpy/entities/feature_set.py +152 -145
  83. dtlpy/entities/filters.py +798 -798
  84. dtlpy/entities/gis_item.py +107 -107
  85. dtlpy/entities/integration.py +184 -184
  86. dtlpy/entities/item.py +975 -959
  87. dtlpy/entities/label.py +123 -123
  88. dtlpy/entities/links.py +85 -85
  89. dtlpy/entities/message.py +175 -175
  90. dtlpy/entities/model.py +684 -684
  91. dtlpy/entities/node.py +1005 -1005
  92. dtlpy/entities/ontology.py +810 -803
  93. dtlpy/entities/organization.py +287 -287
  94. dtlpy/entities/package.py +657 -657
  95. dtlpy/entities/package_defaults.py +5 -5
  96. dtlpy/entities/package_function.py +185 -185
  97. dtlpy/entities/package_module.py +113 -113
  98. dtlpy/entities/package_slot.py +118 -118
  99. dtlpy/entities/paged_entities.py +299 -299
  100. dtlpy/entities/pipeline.py +624 -624
  101. dtlpy/entities/pipeline_execution.py +279 -279
  102. dtlpy/entities/project.py +394 -394
  103. dtlpy/entities/prompt_item.py +505 -505
  104. dtlpy/entities/recipe.py +301 -301
  105. dtlpy/entities/reflect_dict.py +102 -102
  106. dtlpy/entities/resource_execution.py +138 -138
  107. dtlpy/entities/service.py +974 -963
  108. dtlpy/entities/service_driver.py +117 -117
  109. dtlpy/entities/setting.py +294 -294
  110. dtlpy/entities/task.py +495 -495
  111. dtlpy/entities/time_series.py +143 -143
  112. dtlpy/entities/trigger.py +426 -426
  113. dtlpy/entities/user.py +118 -118
  114. dtlpy/entities/webhook.py +124 -124
  115. dtlpy/examples/__init__.py +19 -19
  116. dtlpy/examples/add_labels.py +135 -135
  117. dtlpy/examples/add_metadata_to_item.py +21 -21
  118. dtlpy/examples/annotate_items_using_model.py +65 -65
  119. dtlpy/examples/annotate_video_using_model_and_tracker.py +75 -75
  120. dtlpy/examples/annotations_convert_to_voc.py +9 -9
  121. dtlpy/examples/annotations_convert_to_yolo.py +9 -9
  122. dtlpy/examples/convert_annotation_types.py +51 -51
  123. dtlpy/examples/converter.py +143 -143
  124. dtlpy/examples/copy_annotations.py +22 -22
  125. dtlpy/examples/copy_folder.py +31 -31
  126. dtlpy/examples/create_annotations.py +51 -51
  127. dtlpy/examples/create_video_annotations.py +83 -83
  128. dtlpy/examples/delete_annotations.py +26 -26
  129. dtlpy/examples/filters.py +113 -113
  130. dtlpy/examples/move_item.py +23 -23
  131. dtlpy/examples/play_video_annotation.py +13 -13
  132. dtlpy/examples/show_item_and_mask.py +53 -53
  133. dtlpy/examples/triggers.py +49 -49
  134. dtlpy/examples/upload_batch_of_items.py +20 -20
  135. dtlpy/examples/upload_items_and_custom_format_annotations.py +55 -55
  136. dtlpy/examples/upload_items_with_modalities.py +43 -43
  137. dtlpy/examples/upload_segmentation_annotations_from_mask_image.py +44 -44
  138. dtlpy/examples/upload_yolo_format_annotations.py +70 -70
  139. dtlpy/exceptions.py +125 -125
  140. dtlpy/miscellaneous/__init__.py +20 -20
  141. dtlpy/miscellaneous/dict_differ.py +95 -95
  142. dtlpy/miscellaneous/git_utils.py +217 -217
  143. dtlpy/miscellaneous/json_utils.py +14 -14
  144. dtlpy/miscellaneous/list_print.py +105 -105
  145. dtlpy/miscellaneous/zipping.py +130 -130
  146. dtlpy/ml/__init__.py +20 -20
  147. dtlpy/ml/base_feature_extractor_adapter.py +27 -27
  148. dtlpy/ml/base_model_adapter.py +1287 -1230
  149. dtlpy/ml/metrics.py +461 -461
  150. dtlpy/ml/predictions_utils.py +274 -274
  151. dtlpy/ml/summary_writer.py +57 -57
  152. dtlpy/ml/train_utils.py +60 -60
  153. dtlpy/new_instance.py +252 -252
  154. dtlpy/repositories/__init__.py +56 -56
  155. dtlpy/repositories/analytics.py +85 -85
  156. dtlpy/repositories/annotations.py +916 -916
  157. dtlpy/repositories/apps.py +383 -383
  158. dtlpy/repositories/artifacts.py +452 -452
  159. dtlpy/repositories/assignments.py +599 -599
  160. dtlpy/repositories/bots.py +213 -213
  161. dtlpy/repositories/codebases.py +559 -559
  162. dtlpy/repositories/collections.py +332 -332
  163. dtlpy/repositories/commands.py +152 -152
  164. dtlpy/repositories/compositions.py +61 -61
  165. dtlpy/repositories/computes.py +439 -439
  166. dtlpy/repositories/datasets.py +1585 -1504
  167. dtlpy/repositories/downloader.py +1157 -923
  168. dtlpy/repositories/dpks.py +433 -433
  169. dtlpy/repositories/drivers.py +482 -482
  170. dtlpy/repositories/executions.py +815 -815
  171. dtlpy/repositories/feature_sets.py +256 -226
  172. dtlpy/repositories/features.py +255 -255
  173. dtlpy/repositories/integrations.py +484 -484
  174. dtlpy/repositories/items.py +912 -912
  175. dtlpy/repositories/messages.py +94 -94
  176. dtlpy/repositories/models.py +1000 -1000
  177. dtlpy/repositories/nodes.py +80 -80
  178. dtlpy/repositories/ontologies.py +511 -511
  179. dtlpy/repositories/organizations.py +525 -525
  180. dtlpy/repositories/packages.py +1941 -1941
  181. dtlpy/repositories/pipeline_executions.py +451 -451
  182. dtlpy/repositories/pipelines.py +640 -640
  183. dtlpy/repositories/projects.py +539 -539
  184. dtlpy/repositories/recipes.py +429 -399
  185. dtlpy/repositories/resource_executions.py +137 -137
  186. dtlpy/repositories/schema.py +120 -120
  187. dtlpy/repositories/service_drivers.py +213 -213
  188. dtlpy/repositories/services.py +1704 -1704
  189. dtlpy/repositories/settings.py +339 -339
  190. dtlpy/repositories/tasks.py +1477 -1477
  191. dtlpy/repositories/times_series.py +278 -278
  192. dtlpy/repositories/triggers.py +536 -536
  193. dtlpy/repositories/upload_element.py +257 -257
  194. dtlpy/repositories/uploader.py +661 -661
  195. dtlpy/repositories/webhooks.py +249 -249
  196. dtlpy/services/__init__.py +22 -22
  197. dtlpy/services/aihttp_retry.py +131 -131
  198. dtlpy/services/api_client.py +1786 -1785
  199. dtlpy/services/api_reference.py +40 -40
  200. dtlpy/services/async_utils.py +133 -133
  201. dtlpy/services/calls_counter.py +44 -44
  202. dtlpy/services/check_sdk.py +68 -68
  203. dtlpy/services/cookie.py +115 -115
  204. dtlpy/services/create_logger.py +156 -156
  205. dtlpy/services/events.py +84 -84
  206. dtlpy/services/logins.py +235 -235
  207. dtlpy/services/reporter.py +256 -256
  208. dtlpy/services/service_defaults.py +91 -91
  209. dtlpy/utilities/__init__.py +20 -20
  210. dtlpy/utilities/annotations/__init__.py +16 -16
  211. dtlpy/utilities/annotations/annotation_converters.py +269 -269
  212. dtlpy/utilities/base_package_runner.py +285 -264
  213. dtlpy/utilities/converter.py +1650 -1650
  214. dtlpy/utilities/dataset_generators/__init__.py +1 -1
  215. dtlpy/utilities/dataset_generators/dataset_generator.py +670 -670
  216. dtlpy/utilities/dataset_generators/dataset_generator_tensorflow.py +23 -23
  217. dtlpy/utilities/dataset_generators/dataset_generator_torch.py +21 -21
  218. dtlpy/utilities/local_development/__init__.py +1 -1
  219. dtlpy/utilities/local_development/local_session.py +179 -179
  220. dtlpy/utilities/reports/__init__.py +2 -2
  221. dtlpy/utilities/reports/figures.py +343 -343
  222. dtlpy/utilities/reports/report.py +71 -71
  223. dtlpy/utilities/videos/__init__.py +17 -17
  224. dtlpy/utilities/videos/video_player.py +598 -598
  225. dtlpy/utilities/videos/videos.py +470 -470
  226. {dtlpy-1.115.44.data → dtlpy-1.117.6.data}/scripts/dlp +1 -1
  227. dtlpy-1.117.6.data/scripts/dlp.bat +2 -0
  228. {dtlpy-1.115.44.data → dtlpy-1.117.6.data}/scripts/dlp.py +128 -128
  229. {dtlpy-1.115.44.dist-info → dtlpy-1.117.6.dist-info}/METADATA +186 -186
  230. dtlpy-1.117.6.dist-info/RECORD +239 -0
  231. {dtlpy-1.115.44.dist-info → dtlpy-1.117.6.dist-info}/WHEEL +1 -1
  232. {dtlpy-1.115.44.dist-info → dtlpy-1.117.6.dist-info}/licenses/LICENSE +200 -200
  233. tests/features/environment.py +551 -551
  234. dtlpy/assets/__pycache__/__init__.cpython-310.pyc +0 -0
  235. dtlpy-1.115.44.data/scripts/dlp.bat +0 -2
  236. dtlpy-1.115.44.dist-info/RECORD +0 -240
  237. {dtlpy-1.115.44.dist-info → dtlpy-1.117.6.dist-info}/entry_points.txt +0 -0
  238. {dtlpy-1.115.44.dist-info → dtlpy-1.117.6.dist-info}/top_level.txt +0 -0
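The largest single change in this release is dtlpy/repositories/datasets.py (entry 166, +1585 -1504), whose export flow now distinguishes the server-side export_type from the local output_export_type and splits large exports into subsets before merging the results. A minimal usage sketch based on the signature and docstring shown in the hunk below; the top-level dl.ExportType / dl.OutputExportType aliases, the project and dataset names, and the local path are assumptions, not values taken from the diff:

    import dtlpy as dl

    # Assumed identifiers: replace with a real project/dataset.
    project = dl.projects.get(project_name='my-project')
    dataset = project.datasets.get(dataset_name='my-dataset')

    # Server-side JSON export, written locally as one JSON file per item,
    # mirroring the remote folder structure (OutputExportType.FOLDERS).
    export_path = project.datasets.export(
        dataset_id=dataset.id,
        local_path='/tmp/my-dataset-export',              # assumed local path
        include_annotations=True,
        export_type=dl.ExportType.JSON,                   # assumed top-level alias of entities.ExportType
        output_export_type=dl.OutputExportType.FOLDERS,   # assumed alias of entities.OutputExportType
    )
    print(export_path)  # base directory containing the per-item JSON files

Per the export() docstring in the hunk, output_export_type=OutputExportType.ZIP would instead produce a single zipped result file, while export_type=ExportType.ZIP returns the export archive as-is; combining export_type ZIP with JSON or FOLDERS output is not supported.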
dtlpy/repositories/datasets.py
@@ -1,1504 +1,1585 @@
1
- """
2
- Datasets Repository
3
- """
4
-
5
- import os
6
- import sys
7
- import time
8
- import copy
9
- import tqdm
10
- import logging
11
- import zipfile
12
- import json
13
- from typing import Union, Generator, Optional
14
-
15
- from .. import entities, repositories, miscellaneous, exceptions, services, PlatformException, _api_reference
16
- from ..services.api_client import ApiClient
17
- from ..entities.dataset import OutputExportType, ExportType
18
-
19
- logger = logging.getLogger(name='dtlpy')
20
-
21
- MAX_ITEMS_PER_SUBSET = 50000
22
-
23
- class Datasets:
24
- """
25
- Datasets Repository
26
-
27
- The Datasets class allows the user to manage datasets. Read more about datasets in our `documentation <https://dataloop.ai/docs/dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/manage_datasets/chapter/>`_.
28
- """
29
-
30
- def __init__(self, client_api: ApiClient, project: entities.Project = None):
31
- self._client_api = client_api
32
- self._project = project
33
-
34
- ############
35
- # entities #
36
- ############
37
- @property
38
- def project(self) -> entities.Project:
39
- if self._project is None:
40
- # try get checkout
41
- project = self._client_api.state_io.get('project')
42
- if project is not None:
43
- self._project = entities.Project.from_json(_json=project, client_api=self._client_api)
44
- if self._project is None:
45
- raise exceptions.PlatformException(
46
- error='2001',
47
- message='Cannot perform action WITHOUT Project entity in Datasets repository.'
48
- ' Please checkout or set a project')
49
- assert isinstance(self._project, entities.Project)
50
- return self._project
51
-
52
- @project.setter
53
- def project(self, project: entities.Project):
54
- if not isinstance(project, entities.Project):
55
- raise ValueError('Must input a valid Project entity')
56
- self._project = project
57
-
58
- ###########
59
- # methods #
60
- ###########
61
- def __get_from_cache(self) -> entities.Dataset:
62
- dataset = self._client_api.state_io.get('dataset')
63
- if dataset is not None:
64
- dataset = entities.Dataset.from_json(_json=dataset,
65
- client_api=self._client_api,
66
- datasets=self,
67
- project=self._project)
68
- return dataset
69
-
70
- def __get_by_id(self, dataset_id) -> entities.Dataset:
71
- success, response = self._client_api.gen_request(req_type='get',
72
- path='/datasets/{}'.format(dataset_id))
73
- if dataset_id is None or dataset_id == '':
74
- raise exceptions.PlatformException('400', 'Please checkout a dataset')
75
-
76
- if success:
77
- dataset = entities.Dataset.from_json(client_api=self._client_api,
78
- _json=response.json(),
79
- datasets=self,
80
- project=self._project)
81
- else:
82
- raise exceptions.PlatformException(response)
83
- return dataset
84
-
85
- def __get_by_identifier(self, identifier=None) -> entities.Dataset:
86
- datasets = self.list()
87
- datasets_by_name = [dataset for dataset in datasets if identifier in dataset.name or identifier in dataset.id]
88
- if len(datasets_by_name) == 1:
89
- return datasets_by_name[0]
90
- elif len(datasets_by_name) > 1:
91
- raise Exception('Multiple datasets with this name exist')
92
- else:
93
- raise Exception("Dataset not found")
94
-
95
- def _bulid_folder_filter(self, folder_path, filters=None):
96
- if filters is None:
97
- filters = entities.Filters()
98
- filters._user_query = 'false'
99
- if not folder_path.startswith('/'):
100
- folder_path = '/' + folder_path
101
- filters.add(field='dir', values=folder_path, method=entities.FiltersMethod.OR)
102
- if not folder_path.endswith('*'):
103
- if not folder_path.endswith('/'):
104
- folder_path += '/'
105
- filters.add(field='dir', values=folder_path + '*', method=entities.FiltersMethod.OR)
106
- return filters
107
-
108
- def _get_binaries_dataset(self):
109
- filters = entities.Filters(resource=entities.FiltersResource.DATASET)
110
- filters.add(field='name', values='Binaries')
111
- filters.system_space = True
112
- datasets = self.list(filters=filters)
113
- if len(datasets) == 0:
114
- # empty list
115
- raise exceptions.PlatformException('404', 'Dataset not found. Name: "Binaries"')
116
- # dataset = None
117
- elif len(datasets) > 1:
118
- raise exceptions.PlatformException('400', 'More than one dataset with same name.')
119
- else:
120
- dataset = datasets[0]
121
- return dataset
122
-
123
- def _resolve_dataset_id(self, dataset, dataset_name, dataset_id):
124
- if dataset is None and dataset_name is None and dataset_id is None:
125
- raise ValueError('Must provide dataset, dataset name or dataset id')
126
- if dataset_id is None:
127
- if dataset is None:
128
- dataset = self.get(dataset_name=dataset_name)
129
- dataset_id = dataset.id
130
- return dataset_id
131
-
132
- @staticmethod
133
- def _build_payload(filters, include_feature_vectors, include_annotations,
134
- export_type, annotation_filters, feature_vector_filters, dataset_lock, lock_timeout_sec, export_summary):
135
- valid_list = [e.value for e in entities.ExportType]
136
- valid_types = ', '.join(valid_list)
137
- if export_type not in ['json', 'zip']:
138
- raise ValueError('export_type must be one of the following: {}'.format(valid_types))
139
- payload = {'exportType': export_type}
140
- if filters is None:
141
- filters = entities.Filters()
142
-
143
- if isinstance(filters, entities.Filters):
144
- payload['itemsQuery'] = {'filter': filters.prepare()['filter'], 'join': filters.prepare().get("join", {})}
145
- elif isinstance(filters, dict):
146
- payload['itemsQuery'] = filters
147
- else:
148
- raise exceptions.BadRequest(message='filters must be of type dict or Filters', status_code=500)
149
-
150
- payload['itemsVectorQuery'] = {}
151
- if include_feature_vectors:
152
- payload['includeItemVectors'] = True
153
- payload['itemsVectorQuery']['select'] = {"datasetId": 1, 'featureSetId': 1, 'value': 1}
154
-
155
- if feature_vector_filters is not None:
156
- payload['itemsVectorQuery']['filter'] = feature_vector_filters.prepare()['filter']
157
-
158
- payload['annotations'] = {"include": include_annotations, "convertSemantic": False}
159
-
160
- if annotation_filters is not None:
161
- payload['annotationsQuery'] = annotation_filters.prepare()
162
-
163
- if dataset_lock:
164
- payload['datasetLock'] = dataset_lock
165
-
166
- if export_summary:
167
- payload['summary'] = export_summary
168
-
169
- if lock_timeout_sec:
170
- payload['lockTimeoutSec'] = lock_timeout_sec
171
-
172
- return payload
173
-
174
- def _download_exported_item(self, item_id, export_type, local_path=None, unzip=True):
175
- logger.debug(f"start downloading exported item {item_id} with export_type {export_type} and local_path {local_path} and unzip {unzip}")
176
- export_item = repositories.Items(client_api=self._client_api).get(item_id=item_id)
177
- export_item_path = export_item.download(local_path=local_path)
178
-
179
- # Common validation check for both JSON and other export types
180
- if isinstance(export_item_path, list) or not os.path.isfile(export_item_path):
181
- raise exceptions.PlatformException(
182
- error='404',
183
- message='error downloading annotation zip file. see above for more information. item id: {!r}'.format(
184
- export_item.id))
185
-
186
- result = None
187
- if unzip is False or export_type == entities.ExportType.JSON:
188
- result = export_item_path
189
- else:
190
- try:
191
- miscellaneous.Zipping.unzip_directory(zip_filename=export_item_path,
192
- to_directory=local_path)
193
- result = local_path
194
- except Exception as e:
195
- logger.warning("Failed to extract zip file error: {}".format(e))
196
- finally:
197
- # cleanup only for zip files to avoid removing needed results
198
- if isinstance(export_item_path, str) and os.path.isfile(export_item_path):
199
- os.remove(export_item_path)
200
- logger.debug(f"end downloading, result {result}")
201
- return result
202
-
203
- @property
204
- def platform_url(self):
205
- return self._client_api._get_resource_url("projects/{}/datasets".format(self.project.id))
206
-
207
- def open_in_web(self,
208
- dataset_name: str = None,
209
- dataset_id: str = None,
210
- dataset: entities.Dataset = None):
211
- """
212
- Open the dataset in web platform.
213
-
214
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
215
-
216
- :param str dataset_name: The Name of the dataset
217
- :param str dataset_id: The Id of the dataset
218
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
219
-
220
- **Example**:
221
-
222
- .. code-block:: python
223
-
224
- project.datasets.open_in_web(dataset_id='dataset_id')
225
- """
226
- if dataset_name is not None:
227
- dataset = self.get(dataset_name=dataset_name)
228
- if dataset is not None:
229
- dataset.open_in_web()
230
- elif dataset_id is not None:
231
- self._client_api._open_in_web(url=f'{self.platform_url}/{dataset_id}/items')
232
- else:
233
- self._client_api._open_in_web(url=self.platform_url)
234
-
235
- def checkout(self,
236
- identifier: str = None,
237
- dataset_name: str = None,
238
- dataset_id: str = None,
239
- dataset: entities.Dataset = None):
240
- """
241
- Checkout (switch) to a dataset to work on it.
242
-
243
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
244
-
245
- You must provide at least ONE of the following params: dataset_id, dataset_name.
246
-
247
- :param str identifier: project name or partial id that you wish to switch
248
- :param str dataset_name: The Name of the dataset
249
- :param str dataset_id: The Id of the dataset
250
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
251
-
252
- **Example**:
253
-
254
- .. code-block:: python
255
-
256
- project.datasets.checkout(dataset_id='dataset_id')
257
- """
258
- if dataset is None:
259
- if dataset_id is not None or dataset_name is not None:
260
- try:
261
- dataset = self.project.datasets.get(dataset_name=dataset_name, dataset_id=dataset_id)
262
- except exceptions.MissingEntity:
263
- dataset = self.get(dataset_id=dataset_id, dataset_name=dataset_name)
264
- elif identifier is not None:
265
- dataset = self.__get_by_identifier(identifier=identifier)
266
- else:
267
- raise exceptions.PlatformException(error='400',
268
- message='Must provide partial/full id/name to checkout')
269
- self._client_api.state_io.put('dataset', dataset.to_json())
270
- logger.info('Checked out to dataset {}'.format(dataset.name))
271
-
272
- @_api_reference.add(path='/datasets/query', method='post')
273
- def list(self, name=None, creator=None, filters: entities.Filters = None) -> miscellaneous.List[entities.Dataset]:
274
- """
275
- List all datasets.
276
-
277
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
278
-
279
- :param str name: list by name
280
- :param str creator: list by creator
281
- :param dtlpy.entities.filters.Filters filters: Filters entity containing filters parameters
282
- :return: List of datasets
283
- :rtype: list
284
-
285
- **Example**:
286
-
287
- .. code-block:: python
288
- filters = dl.Filters(resource='datasets')
289
- filters.add(field='readonly', values=False)
290
- datasets = project.datasets.list(filters=filters)
291
- """
292
- if filters is None:
293
- filters = entities.Filters(resource=entities.FiltersResource.DATASET)
294
- # assert type filters
295
- elif not isinstance(filters, entities.Filters):
296
- raise exceptions.PlatformException(error='400',
297
- message='Unknown filters type: {!r}'.format(type(filters)))
298
- if filters.resource != entities.FiltersResource.DATASET:
299
- raise exceptions.PlatformException(
300
- error='400',
301
- message='Filters resource must to be FiltersResource.DATASET. Got: {!r}'.format(filters.resource))
302
-
303
- url = '/datasets/query'
304
-
305
- if name is not None:
306
- filters.add(field='name', values=name)
307
- if creator is not None:
308
- filters.add(field='creator', values=creator)
309
- if self._project is not None:
310
- filters.context = {"projects": [self._project.id]}
311
- filters.page_size = 1000
312
- filters.page = 0
313
- datasets = list()
314
- while True:
315
- success, response = self._client_api.gen_request(req_type='POST',
316
- json_req=filters.prepare(),
317
- path=url,
318
- headers={'user_query': filters._user_query})
319
- if success:
320
- pool = self._client_api.thread_pools('entity.create')
321
- datasets_json = response.json()['items']
322
- jobs = [None for _ in range(len(datasets_json))]
323
- # return triggers list
324
- for i_dataset, dataset in enumerate(datasets_json):
325
- jobs[i_dataset] = pool.submit(entities.Dataset._protected_from_json,
326
- **{'client_api': self._client_api,
327
- '_json': dataset,
328
- 'datasets': self,
329
- 'project': self.project})
330
-
331
- # get all results
332
- results = [j.result() for j in jobs]
333
- # log errors
334
- _ = [logger.warning(r[1]) for r in results if r[0] is False]
335
- # return good jobs
336
- datasets.extend([r[1] for r in results if r[0] is True])
337
- if response.json()['hasNextPage'] is True:
338
- filters.page += 1
339
- else:
340
- break
341
- else:
342
- raise exceptions.PlatformException(response)
343
- datasets = miscellaneous.List(datasets)
344
- return datasets
345
-
346
- @_api_reference.add(path='/datasets/{id}', method='get')
347
- def get(self,
348
- dataset_name: str = None,
349
- dataset_id: str = None,
350
- checkout: bool = False,
351
- fetch: bool = None
352
- ) -> entities.Dataset:
353
- """
354
- Get dataset by name or id.
355
-
356
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
357
-
358
- You must provide at least ONE of the following params: dataset_id, dataset_name.
359
-
360
- :param str dataset_name: optional - search by name
361
- :param str dataset_id: optional - search by id
362
- :param bool checkout: set the dataset as a default dataset object (cookies)
363
- :param bool fetch: optional - fetch entity from platform (True), default taken from cookie
364
- :return: Dataset object
365
- :rtype: dtlpy.entities.dataset.Dataset
366
-
367
- **Example**:
368
-
369
- .. code-block:: python
370
-
371
- dataset = project.datasets.get(dataset_id='dataset_id')
372
- """
373
- if fetch is None:
374
- fetch = self._client_api.fetch_entities
375
-
376
- if dataset_id is None and dataset_name is None:
377
- dataset = self.__get_from_cache()
378
- if dataset is None:
379
- raise exceptions.PlatformException(
380
- error='400',
381
- message='No checked-out Dataset was found, must checkout or provide an identifier in inputs')
382
- elif fetch:
383
- if dataset_id is not None and dataset_id != '':
384
- dataset = self.__get_by_id(dataset_id)
385
- # verify input dataset name is same as the given id
386
- if dataset_name is not None and dataset.name != dataset_name:
387
- logger.warning(
388
- "Mismatch found in datasets.get: dataset_name is different then dataset.name: "
389
- "{!r} != {!r}".format(
390
- dataset_name,
391
- dataset.name))
392
- elif dataset_name is not None:
393
- datasets = self.list(name=dataset_name)
394
- if not datasets:
395
- # empty list
396
- raise exceptions.PlatformException('404', 'Dataset not found. Name: {!r}'.format(dataset_name))
397
- # dataset = None
398
- elif len(datasets) > 1:
399
- raise exceptions.PlatformException('400', 'More than one dataset with same name.')
400
- else:
401
- dataset = datasets[0]
402
- else:
403
- raise exceptions.PlatformException(
404
- error='404',
405
- message='No input and no checked-out found')
406
- else:
407
- dataset = entities.Dataset.from_json(_json={'id': dataset_id,
408
- 'name': dataset_id},
409
- client_api=self._client_api,
410
- datasets=self,
411
- project=self._project,
412
- is_fetched=False)
413
- assert isinstance(dataset, entities.Dataset)
414
- if checkout:
415
- self.checkout(dataset=dataset)
416
- return dataset
417
-
418
- @_api_reference.add(path='/datasets/{id}', method='delete')
419
- def delete(self,
420
- dataset_name: str = None,
421
- dataset_id: str = None,
422
- sure: bool = False,
423
- really: bool = False):
424
- """
425
- Delete a dataset forever!
426
-
427
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
428
-
429
- **Example**:
430
-
431
- .. code-block:: python
432
-
433
- is_deleted = project.datasets.delete(dataset_id='dataset_id', sure=True, really=True)
434
-
435
- :param str dataset_name: optional - search by name
436
- :param str dataset_id: optional - search by id
437
- :param bool sure: Are you sure you want to delete?
438
- :param bool really: Really really sure?
439
- :return: True if success
440
- :rtype: bool
441
- """
442
- if sure and really:
443
- dataset = self.get(dataset_name=dataset_name, dataset_id=dataset_id)
444
- success, response = self._client_api.gen_request(req_type='delete',
445
- path='/datasets/{}'.format(dataset.id))
446
- if not success:
447
- raise exceptions.PlatformException(response)
448
- logger.info('Dataset {!r} was deleted successfully'.format(dataset.name))
449
- return True
450
- else:
451
- raise exceptions.PlatformException(
452
- error='403',
453
- message='Cant delete dataset from SDK. Please login to platform to delete')
454
-
455
- @_api_reference.add(path='/datasets/{id}', method='patch')
456
- def update(self,
457
- dataset: entities.Dataset,
458
- system_metadata: bool = False,
459
- patch: dict = None
460
- ) -> entities.Dataset:
461
- """
462
- Update dataset field.
463
-
464
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
465
-
466
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
467
- :param bool system_metadata: True, if you want to change metadata system
468
- :param dict patch: Specific patch request
469
- :return: Dataset object
470
- :rtype: dtlpy.entities.dataset.Dataset
471
-
472
- **Example**:
473
-
474
- .. code-block:: python
475
-
476
- dataset = project.datasets.update(dataset='dataset_entity')
477
- """
478
- url_path = '/datasets/{}'.format(dataset.id)
479
- if system_metadata:
480
- url_path += '?system=true'
481
-
482
- if patch is None:
483
- patch = dataset.to_json()
484
-
485
- success, response = self._client_api.gen_request(req_type='patch',
486
- path=url_path,
487
- json_req=patch)
488
- if success:
489
- logger.info('Dataset was updated successfully')
490
- return dataset
491
- else:
492
- raise exceptions.PlatformException(response)
493
-
494
- @_api_reference.add(path='/datasets/{id}/unlock', method='patch')
495
- def unlock(self, dataset: entities.Dataset ) -> entities.Dataset:
496
- """
497
- Unlock dataset.
498
-
499
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
500
-
501
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
502
- :return: Dataset object
503
- :rtype: dtlpy.entities.dataset.Dataset
504
-
505
- **Example**:
506
-
507
- .. code-block:: python
508
-
509
- dataset = project.datasets.unlock(dataset='dataset_entity')
510
- """
511
- url_path = '/datasets/{}/unlock'.format(dataset.id)
512
-
513
- success, response = self._client_api.gen_request(req_type='patch', path=url_path)
514
- if success:
515
- logger.info('Dataset was unlocked successfully')
516
- return dataset
517
- else:
518
- raise exceptions.PlatformException(response)
519
-
520
- @_api_reference.add(path='/datasets/{id}/directoryTree', method='get')
521
- def directory_tree(self,
522
- dataset: entities.Dataset = None,
523
- dataset_name: str = None,
524
- dataset_id: str = None):
525
- """
526
- Get dataset's directory tree.
527
-
528
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
529
-
530
- You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
531
-
532
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
533
- :param str dataset_name: The Name of the dataset
534
- :param str dataset_id: The Id of the dataset
535
- :return: DirectoryTree
536
-
537
- **Example**:
538
-
539
- .. code-block:: python
540
- directory_tree = dataset.directory_tree
541
- directory_tree = project.datasets.directory_tree(dataset='dataset_entity')
542
- """
543
- dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
544
-
545
- url_path = '/datasets/{}/directoryTree'.format(dataset_id)
546
-
547
- success, response = self._client_api.gen_request(req_type='get',
548
- path=url_path)
549
-
550
- if success:
551
- return entities.DirectoryTree(_json=response.json())
552
- else:
553
- raise exceptions.PlatformException(response)
554
-
555
- @_api_reference.add(path='/datasets/{id}/clone', method='post')
556
- def clone(self,
557
- dataset_id: str,
558
- clone_name: str = None,
559
- filters: entities.Filters = None,
560
- with_items_annotations: bool = True,
561
- with_metadata: bool = True,
562
- with_task_annotations_status: bool = True,
563
- dst_dataset_id: str = None,
564
- target_directory: str = None):
565
- """
566
- Clone a dataset. Read more about cloning datasets and items in our `documentation <https://dataloop.ai/docs/clone-merge-dataset#cloned-dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_.
567
-
568
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
569
-
570
- :param str dataset_id: id of the dataset you wish to clone
571
- :param str clone_name: new dataset name
572
- :param dtlpy.entities.filters.Filters filters: Filters entity or a query dict
573
- :param bool with_items_annotations: true to clone with items annotations
574
- :param bool with_metadata: true to clone with metadata
575
- :param bool with_task_annotations_status: true to clone with task annotations' status
576
- :param str dst_dataset_id: destination dataset id
577
- :param str target_directory: target directory
578
- :return: dataset object
579
- :rtype: dtlpy.entities.dataset.Dataset
580
-
581
- **Example**:
582
-
583
- .. code-block:: python
584
-
585
- dataset = project.datasets.clone(dataset_id='dataset_id',
586
- clone_name='dataset_clone_name',
587
- with_metadata=True,
588
- with_items_annotations=False,
589
- with_task_annotations_status=False)
590
- """
591
- if clone_name is None and dst_dataset_id is None:
592
- raise exceptions.PlatformException('400', 'Must provide clone name or destination dataset id')
593
- if filters is None:
594
- filters = entities.Filters()
595
- filters._user_query = 'false'
596
- elif not isinstance(filters, entities.Filters):
597
- raise exceptions.PlatformException(
598
- error='400',
599
- message='"filters" must be a dl.Filters entity. got: {!r}'.format(type(filters)))
600
-
601
- copy_filters = copy.deepcopy(filters)
602
- if copy_filters.has_field('hidden'):
603
- copy_filters.pop('hidden')
604
-
605
- if target_directory is not None and not target_directory.startswith('/'):
606
- target_directory = '/' + target_directory
607
-
608
- payload = {
609
- "name": clone_name,
610
- "filter": copy_filters.prepare(),
611
- "cloneDatasetParams": {
612
- "withItemsAnnotations": with_items_annotations,
613
- "withMetadata": with_metadata,
614
- "withTaskAnnotationsStatus": with_task_annotations_status,
615
- "targetDirectory": target_directory
616
- }
617
- }
618
- if dst_dataset_id is not None:
619
- payload['cloneDatasetParams']['targetDatasetId'] = dst_dataset_id
620
- success, response = self._client_api.gen_request(req_type='post',
621
- path='/datasets/{}/clone'.format(dataset_id),
622
- json_req=payload,
623
- headers={'user_query': filters._user_query})
624
-
625
- if not success:
626
- raise exceptions.PlatformException(response)
627
-
628
- command = entities.Command.from_json(_json=response.json(),
629
- client_api=self._client_api)
630
- command = command.wait()
631
-
632
- if 'returnedModelId' not in command.spec:
633
- raise exceptions.PlatformException(error='400',
634
- message="returnedModelId key is missing in command response: {!r}"
635
- .format(response))
636
- return self.get(dataset_id=command.spec['returnedModelId'])
637
-
638
- def _export_recursive(
639
- self,
640
- dataset: entities.Dataset = None,
641
- dataset_name: str = None,
642
- dataset_id: str = None,
643
- local_path: str = None,
644
- filters: Union[dict, entities.Filters] = None,
645
- annotation_filters: entities.Filters = None,
646
- feature_vector_filters: entities.Filters = None,
647
- include_feature_vectors: bool = False,
648
- include_annotations: bool = False,
649
- timeout: int = 0,
650
- dataset_lock: bool = False,
651
- lock_timeout_sec: int = None,
652
- export_summary: bool = False,
653
- max_items_per_subset: int = MAX_ITEMS_PER_SUBSET,
654
- export_type: ExportType = ExportType.JSON,
655
- output_export_type: OutputExportType = OutputExportType.JSON,
656
- ) -> Generator[str, None, None]:
657
- """
658
- Export dataset items recursively by splitting large datasets into smaller subsets.
659
-
660
- Args:
661
- dataset (entities.Dataset, optional): Dataset entity to export
662
- dataset_name (str, optional): Name of the dataset to export
663
- dataset_id (str, optional): ID of the dataset to export
664
- local_path (str, optional): Local path to save the exported data
665
- filters (Union[dict, entities.Filters], optional): Filters to apply on the items
666
- annotation_filters (entities.Filters, optional): Filters to apply on the annotations
667
- feature_vector_filters (entities.Filters, optional): Filters to apply on the feature vectors
668
- include_feature_vectors (bool, optional): Whether to include feature vectors in export. Defaults to False
669
- include_annotations (bool, optional): Whether to include annotations in export. Defaults to False
670
- timeout (int, optional): Timeout in seconds for the export operation. Defaults to 0
671
- dataset_lock (bool, optional): Whether to lock the dataset during export. Defaults to False
672
- lock_timeout_sec (int, optional): Timeout for dataset lock in seconds. Defaults to None
673
- export_summary (bool, optional): Whether to include export summary. Defaults to False
674
- max_items_per_subset (int, optional): Maximum items per subset for recursive export. Defaults to MAX_ITEMS_PER_SUBSET
675
- export_type (ExportType, optional): Type of export (JSON or ZIP). Defaults to ExportType.JSON
676
- output_export_type (OutputExportType, optional): Output format type. Defaults to OutputExportType.JSON
677
-
678
- Returns:
679
- Generator[str, None, None]: Generator yielding export paths
680
-
681
- Raises:
682
- NotImplementedError: If ZIP export type is used with JSON output type
683
- exceptions.PlatformException: If API request fails or command response is invalid
684
- """
685
- logger.debug(f"exporting dataset with export_type {export_type} and output_export_type {output_export_type}")
686
- if export_type == ExportType.ZIP and output_export_type == OutputExportType.JSON:
687
- raise NotImplementedError(
688
- "Zip export type is not supported for JSON output type.\n"
689
- "If Json output is required, please use the export_type = JSON"
690
- )
691
-
692
- # Get dataset entity for recursive filtering
693
- dataset_entity = self.get(dataset_id=self._resolve_dataset_id(dataset, dataset_name, dataset_id))
694
- if export_type != ExportType.JSON:
695
- filters_list = [filters]
696
- else:
697
- # Generate filter subsets using recursive_get_filters
698
- filters_list = entities.Filters._get_split_filters(
699
- dataset=dataset_entity, filters=filters, max_items=max_items_per_subset
700
- )
701
- # First loop: Make all API requests without waiting
702
- commands = []
703
- logger.debug("start making all API requests without waiting")
704
- for filter_i in filters_list:
705
- # Build payload for this subset
706
- payload = self._build_payload(
707
- filters=filter_i,
708
- include_feature_vectors=include_feature_vectors,
709
- include_annotations=include_annotations,
710
- export_type=export_type,
711
- annotation_filters=annotation_filters,
712
- feature_vector_filters=feature_vector_filters,
713
- dataset_lock=dataset_lock,
714
- lock_timeout_sec=lock_timeout_sec,
715
- export_summary=export_summary,
716
- )
717
-
718
- # Make API request for this subset
719
- success, response = self._client_api.gen_request(
720
- req_type='post', path=f'/datasets/{dataset_entity.id}/export', json_req=payload
721
- )
722
-
723
- if not success:
724
- logger.error(f"failed to make API request /datasets/{dataset_entity.id}/export with payload {payload} response {response}")
725
- raise exceptions.PlatformException(response)
726
-
727
- # Handle command execution
728
- commands.append( entities.Command.from_json(_json=response.json(), client_api=self._client_api))
729
-
730
- time.sleep(2)  # the command reports the wrong progress at the beginning
731
- logger.debug("start waiting for all commands")
732
- # Second loop: Wait for all commands and process results
733
- for command in commands:
734
- command = command.wait(timeout=timeout)
735
-
736
- if 'outputItemId' not in command.spec:
737
- raise exceptions.PlatformException(
738
- error='400', message="outputItemId key is missing in command response"
739
- )
740
-
741
- item_id = command.spec['outputItemId']
742
- # Download and process the exported item
743
- yield self._download_exported_item(
744
- item_id=item_id,
745
- export_type=export_type,
746
- local_path=local_path,
747
- unzip=output_export_type != OutputExportType.ZIP,
748
- )
749
-
750
- @_api_reference.add(path='/datasets/{id}/export', method='post')
751
- def export(
752
- self,
753
- dataset: entities.Dataset = None,
754
- dataset_name: str = None,
755
- dataset_id: str = None,
756
- local_path: str = None,
757
- filters: Union[dict, entities.Filters] = None,
758
- annotation_filters: entities.Filters = None,
759
- feature_vector_filters: entities.Filters = None,
760
- include_feature_vectors: bool = False,
761
- include_annotations: bool = False,
762
- export_type: ExportType = ExportType.JSON,
763
- timeout: int = 0,
764
- dataset_lock: bool = False,
765
- lock_timeout_sec: int = None,
766
- export_summary: bool = False,
767
- output_export_type: OutputExportType = None,
768
- ) -> Optional[str]:
769
- """
770
- Export dataset items and annotations.
771
-
772
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
773
-
774
- You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
775
-
776
- **Export Behavior by Parameter Combination:**
777
-
778
- The behavior of this method depends on the combination of `export_type` and `output_export_type`:
779
-
780
- **When export_type = ExportType.JSON:**
781
-
782
- - **output_export_type = OutputExportType.JSON (default when None):**
783
- - Exports data in JSON format, split into subsets of max 500 items
784
- - Downloads all subset JSON files and concatenates them into a single `result.json` file
785
- - Returns the path to the concatenated JSON file
786
- - Cleans up individual subset files after concatenation
787
-
788
- - **output_export_type = OutputExportType.ZIP:**
789
- - Same as JSON export, but zips the final `result.json` file
790
- - Returns the path to the zipped file (`result.json.zip`)
791
- - Cleans up the unzipped JSON file after zipping
792
-
793
- - **output_export_type = OutputExportType.FOLDERS:**
794
- - Exports data in JSON format, split into subsets of max 500 items
795
- - Downloads all subset JSON files and creates individual JSON files for each item
796
- - Creates a folder structure mirroring the remote dataset structure
797
- - Returns the path to the base directory containing the folder structure
798
- - Each item gets its own JSON file named after the original filename
799
-
800
- **When export_type = ExportType.ZIP:**
801
-
802
- - **output_export_type = OutputExportType.ZIP:**
803
- - Exports data as a ZIP file containing the dataset
804
- - Returns the downloaded ZIP item directly
805
- - No additional processing or concatenation
806
-
807
- - **output_export_type = OutputExportType.JSON:**
808
- - **NOT SUPPORTED** - Raises NotImplementedError
809
- - Use export_type=ExportType.JSON instead for JSON output
810
-
811
- - **output_export_type = OutputExportType.FOLDERS:**
812
- - **NOT SUPPORTED** - Raises NotImplementedError
813
- - Use export_type=ExportType.JSON instead for folder output
814
-
815
- **When output_export_type = None (legacy behavior):**
816
- - Defaults to OutputExportType.JSON
817
- - Maintains backward compatibility with existing code
818
-
819
- :param dtlpy.entities.dataset.Dataset dataset: Dataset object
820
- :param str dataset_name: The name of the dataset
821
- :param str dataset_id: The ID of the dataset
822
- :param str local_path: Local path to save the exported dataset
823
- :param Union[dict, dtlpy.entities.filters.Filters] filters: Filters entity or a query dictionary
824
- :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
825
- :param dtlpy.entities.filters.Filters feature_vector_filters: Filters entity to filter feature vectors for export
826
- :param bool include_feature_vectors: Include item feature vectors in the export
827
- :param bool include_annotations: Include item annotations in the export
828
- :param bool dataset_lock: Make dataset readonly during the export
829
- :param bool export_summary: Get Summary of the dataset export
830
- :param int lock_timeout_sec: Timeout for locking the dataset during export in seconds
831
- :param entities.ExportType export_type: Type of export ('json' or 'zip')
832
- :param entities.OutputExportType output_export_type: Output format ('json', 'zip', or 'folders'). If None, defaults to 'json'
833
- :param int timeout: Maximum time in seconds to wait for the export to complete
834
- :return: Path to exported file/directory, or None if export result is empty
835
- :rtype: Optional[str]
836
- """
837
- export_result = list(
838
- self._export_recursive(
839
- dataset=dataset,
840
- dataset_name=dataset_name,
841
- dataset_id=dataset_id,
842
- local_path=local_path,
843
- filters=filters,
844
- annotation_filters=annotation_filters,
845
- feature_vector_filters=feature_vector_filters,
846
- include_feature_vectors=include_feature_vectors,
847
- include_annotations=include_annotations,
848
- timeout=timeout,
849
- dataset_lock=dataset_lock,
850
- lock_timeout_sec=lock_timeout_sec,
851
- export_summary=export_summary,
852
- export_type=export_type,
853
- output_export_type=output_export_type,
854
- )
855
- )
856
- if all(x is None for x in export_result):
857
- logger.error("export result is empty")
858
- return None
859
-
860
- if export_type == ExportType.ZIP:
861
- # if export type is zip, then return the _export_recursive result as is
862
- return export_result[0]
863
-
864
- # if user didn't provide output_export_type, keep the previous behavior
865
- if output_export_type is None:
866
- output_export_type = OutputExportType.JSON
867
-
868
- # export type is json:
869
- # Load all items from subset JSON files and clean them up
870
- all_items = []
871
- logger.debug("start loading all items from subset JSON files")
872
- for json_file in export_result:
873
- if json_file is None:
874
- continue
875
- if os.path.isfile(json_file):
876
- with open(json_file, 'r') as f:
877
- items = json.load(f)
878
- if isinstance(items, list):
879
- all_items.extend(items)
880
- os.remove(json_file)
881
-
882
- base_dir = os.path.dirname(export_result[0])
883
- if output_export_type != OutputExportType.FOLDERS:
884
- dataset_id=self._resolve_dataset_id(dataset, dataset_name, dataset_id)
885
- result_file_name = f"{dataset_id}.json"
886
- result_file = os.path.join(base_dir, result_file_name)
887
- logger.debug(f"start writing all items to result file {result_file}")
888
- with open(result_file, 'w') as f:
889
- json.dump(all_items, f)
890
- if output_export_type == OutputExportType.ZIP:
891
- # Zip the result file
892
- zip_filename = result_file + '.zip'
893
- # Create zip file
894
- logger.debug(f"start zipping result file {zip_filename}")
895
- with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
896
- zf.write(result_file, arcname=os.path.basename(result_file))
897
-
898
- # Remove original json after zipping
899
- os.remove(result_file)
900
- result_file = zip_filename
901
- return result_file
902
- logger.debug("start building per-item JSON files under local_path mirroring remote structure")
903
- # Build per-item JSON files under local_path mirroring remote structure
904
- for item in all_items:
905
- rel_json_path = os.path.splitext(item.get('filename'))[0] + '.json'
906
- # Remove leading slash to make it a relative path
907
- if rel_json_path.startswith('/'):
908
- rel_json_path = rel_json_path[1:]
909
- out_path = os.path.join(base_dir, rel_json_path)
910
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
911
- try:
912
- with open(out_path, 'w') as outf:
913
- json.dump(item, outf)
914
- except Exception:
915
- logger.exception(f'Failed writing export item JSON to {out_path}')
916
- logger.debug("end building per-item JSON files under local_path mirroring remote structure")
917
- return base_dir
918
-
919
- @_api_reference.add(path='/datasets/merge', method='post')
920
- def merge(self,
921
- merge_name: str,
922
- dataset_ids: list,
923
- project_ids: str,
924
- with_items_annotations: bool = True,
925
- with_metadata: bool = True,
926
- with_task_annotations_status: bool = True,
927
- wait: bool = True):
928
- """
929
- Merge datasets. See our `SDK docs <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_ for more information.
930
-
931
- **Prerequisites**: You must be an *owner* or *developer* to use this method.
932
-
933
- :param str merge_name: new dataset name
934
- :param list dataset_ids: list of ids of the datasets you wish to merge
935
- :param str project_ids: the id of the project that includes the datasets
936
- :param bool with_items_annotations: true to merge with items annotations
937
- :param bool with_metadata: true to merge with metadata
938
- :param bool with_task_annotations_status: true to merge with task annotations' status
939
- :param bool wait: wait for the command to finish
940
- :return: True if success
941
- :rtype: bool
942
-
943
- **Example**:
944
-
945
- .. code-block:: python
946
-
947
- success = project.datasets.merge(dataset_ids=['dataset_id1','dataset_id2'],
948
- merge_name='dataset_merge_name',
949
- with_metadata=True,
950
- with_items_annotations=False,
951
- with_task_annotations_status=False)
952
- """
953
- payload = {
954
- "name": merge_name,
955
- "datasetsIds": dataset_ids,
956
- "projectIds": project_ids,
957
- "mergeDatasetParams": {
958
- "withItemsAnnotations": with_items_annotations,
959
- "withMetadata": with_metadata,
960
- "withTaskAnnotationsStatus": with_task_annotations_status
961
- },
962
- 'asynced': wait
963
- }
964
- success, response = self._client_api.gen_request(req_type='post',
965
- path='/datasets/merge',
966
- json_req=payload)
967
-
968
- if success:
969
- command = entities.Command.from_json(_json=response.json(),
970
- client_api=self._client_api)
971
- if not wait:
972
- return command
973
- command = command.wait(timeout=0)
974
- if 'mergeDatasetsConfiguration' not in command.spec:
975
- raise exceptions.PlatformException(error='400',
976
- message="mergeDatasetsConfiguration key is missing in command response: {}"
977
- .format(response))
978
- return True
979
- else:
980
- raise exceptions.PlatformException(response)
981
-
982
- @_api_reference.add(path='/datasets/{id}/sync', method='post')
983
- def sync(self, dataset_id: str, wait: bool = True):
984
- """
985
- Sync dataset with external storage.
986
-
987
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
988
-
989
- :param str dataset_id: The Id of the dataset to sync
990
- :param bool wait: wait for the command to finish
991
- :return: True if success
992
- :rtype: bool
993
-
994
- **Example**:
995
-
996
- .. code-block:: python
997
-
998
- success = project.datasets.sync(dataset_id='dataset_id')
999
- """
1000
-
1001
- success, response = self._client_api.gen_request(req_type='post',
1002
- path='/datasets/{}/sync'.format(dataset_id))
1003
-
1004
- if success:
1005
- command = entities.Command.from_json(_json=response.json(),
1006
- client_api=self._client_api)
1007
- if not wait:
1008
- return command
1009
- command = command.wait(timeout=0)
1010
- if 'datasetId' not in command.spec:
1011
- raise exceptions.PlatformException(error='400',
1012
- message="datasetId key is missing in command response: {}"
1013
- .format(response))
1014
- return True
1015
- else:
1016
- raise exceptions.PlatformException(response)
1017
-
1018
- @_api_reference.add(path='/datasets', method='post')
1019
- def create(self,
1020
- dataset_name: str,
1021
- labels=None,
1022
- attributes=None,
1023
- ontology_ids=None,
1024
- driver: entities.Driver = None,
1025
- driver_id: str = None,
1026
- checkout: bool = False,
1027
- expiration_options: entities.ExpirationOptions = None,
1028
- index_driver: entities.IndexDriver = None,
1029
- recipe_id: str = None
1030
- ) -> entities.Dataset:
1031
- """
1032
- Create a new dataset
1033
-
1034
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
1035
-
1036
- :param str dataset_name: The Name of the dataset
1037
- :param list labels: dictionary of {tag: color} or list of label entities
1038
- :param list attributes: dataset's ontology's attributes
1039
- :param list ontology_ids: optional - dataset ontology
1040
- :param dtlpy.entities.driver.Driver driver: optional - storage driver Driver object or driver name
1041
- :param str driver_id: optional - driver id
1042
- :param bool checkout: set the dataset as a default dataset object (cookies)
1043
- :param ExpirationOptions expiration_options: dl.ExpirationOptions object that contains definitions for the dataset, such as MaxItemDays
1044
- :param str index_driver: dl.IndexDriver, dataset driver version
1045
- :param str recipe_id: optional - recipe id
1046
- :return: Dataset object
1047
- :rtype: dtlpy.entities.dataset.Dataset
1048
-
1049
- **Example**:
1050
-
1051
- .. code-block:: python
1052
-
1053
- dataset = project.datasets.create(dataset_name='dataset_name', ontology_ids='ontology_ids')
1054
- """
1055
- create_default_recipe = True
1056
- if any([labels, attributes, ontology_ids, recipe_id]):
1057
- create_default_recipe = False
1058
-
1059
- # labels to list
1060
- if labels is not None:
1061
- if not isinstance(labels, list):
1062
- labels = [labels]
1063
- if not all(isinstance(label, entities.Label) for label in labels):
1064
- labels = entities.Dataset.serialize_labels(labels)
1065
- else:
1066
- labels = list()
1067
-
1068
- # get creator from token
1069
- payload = {'name': dataset_name,
1070
- 'projects': [self.project.id],
1071
- 'createDefaultRecipe': create_default_recipe
1072
- }
1073
-
1074
- if driver_id is None and driver is not None:
1075
- if isinstance(driver, entities.Driver):
1076
- driver_id = driver.id
1077
- elif isinstance(driver, str):
1078
- driver_id = self.project.drivers.get(driver_name=driver).id
1079
- else:
1080
- raise exceptions.PlatformException(
1081
- error=400,
1082
- message='Input arg "driver" must be Driver object or a string driver name. got type: {!r}'.format(
1083
- type(driver)))
1084
- if driver_id is not None:
1085
- payload['driver'] = driver_id
1086
-
1087
- if expiration_options:
1088
- payload['expirationOptions'] = expiration_options.to_json()
1089
- if index_driver is not None:
1090
- payload['indexDriver'] = index_driver
1091
-
1092
- success, response = self._client_api.gen_request(req_type='post',
1093
- path='/datasets',
1094
- json_req=payload)
1095
- if success:
1096
- dataset = entities.Dataset.from_json(client_api=self._client_api,
1097
- _json=response.json(),
1098
- datasets=self,
1099
- project=self.project)
1100
- # create ontology and recipe
1101
- if not create_default_recipe:
1102
- if recipe_id is not None:
1103
- dataset.switch_recipe(recipe_id=recipe_id)
1104
- else:
1105
- dataset = dataset.recipes.create(ontology_ids=ontology_ids,
1106
- labels=labels,
1107
- attributes=attributes).dataset
1108
- else:
1109
- raise exceptions.PlatformException(response)
1110
- logger.info('Dataset was created successfully. Dataset id: {!r}'.format(dataset.id))
1111
- assert isinstance(dataset, entities.Dataset)
1112
- if checkout:
1113
- self.checkout(dataset=dataset)
1114
- return dataset
1115
-
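A minimal usage sketch for the create() flow above (illustrative only; assumes an authenticated dtlpy client, and the project and dataset names are hypothetical):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')  # hypothetical project
    dataset = project.datasets.create(dataset_name='my-dataset',
                                      labels={'person': '#0000FF', 'car': '#FF0000'},
                                      checkout=True)  # sets the new dataset as the default one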
1116
- @staticmethod
1117
- def _convert_single(downloader,
1118
- item,
1119
- img_filepath,
1120
- local_path,
1121
- overwrite,
1122
- annotation_options,
1123
- annotation_filters,
1124
- thickness,
1125
- with_text,
1126
- progress,
1127
- alpha,
1128
- export_version):
1129
- # this is to convert the downloaded json files to any other annotation type
1130
- try:
1131
- if entities.ViewAnnotationOptions.ANNOTATION_ON_IMAGE in annotation_options:
1132
- if img_filepath is None:
1133
- img_filepath = item.download()
1134
- downloader._download_img_annotations(item=item,
1135
- img_filepath=img_filepath,
1136
- local_path=local_path,
1137
- overwrite=overwrite,
1138
- annotation_options=annotation_options,
1139
- annotation_filters=annotation_filters,
1140
- thickness=thickness,
1141
- alpha=alpha,
1142
- with_text=with_text,
1143
- export_version=export_version
1144
- )
1145
- except Exception:
1146
- logger.error('Failed to download annotation for item: {!r}'.format(item.name))
1147
- progress.update()
1148
-
1149
- @staticmethod
1150
- def download_annotations(dataset: entities.Dataset,
1151
- local_path: str = None,
1152
- filters: entities.Filters = None,
1153
- annotation_options: entities.ViewAnnotationOptions = None,
1154
- annotation_filters: entities.Filters = None,
1155
- overwrite: bool = False,
1156
- thickness: int = 1,
1157
- with_text: bool = False,
1158
- remote_path: str = None,
1159
- include_annotations_in_output: bool = True,
1160
- export_png_files: bool = False,
1161
- filter_output_annotations: bool = False,
1162
- alpha: float = None,
1163
- export_version=entities.ExportVersion.V1,
1164
- dataset_lock: bool = False,
1165
- lock_timeout_sec: int = None,
1166
- export_summary: bool = False,
1167
- ) -> str:
1168
- """
1169
- Download dataset's annotations by filters.
1170
-
1171
- You can filter both the items and their annotations, and download only the matching annotations.
1172
-
1173
- Optionally, download the annotations as: mask, instance, or an image of the item with the annotations drawn on it.
1174
-
1175
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
1176
-
1177
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
1178
- :param str local_path: local folder or filename to save to.
1179
- :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
1180
- :param list annotation_options: type of download annotations: list(dl.ViewAnnotationOptions)
1181
- :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for download
1182
- :param bool overwrite: optional - default = False to overwrite the existing files
1183
- :param bool dataset_lock: optional - default = False, lock the dataset (make it read-only) while the download is running
1184
- :param int thickness: optional - line thickness, if -1 annotation will be filled, default =1
1185
- :param bool with_text: optional - add text to annotations, default = False
1186
- :param str remote_path: DEPRECATED and ignored
1187
- :param bool include_annotations_in_output: optional - default = True, whether the export output should contain annotations
1188
- :param bool export_png_files: optional - default = False, if True, semantic annotations are exported as PNG files
1189
- :param bool filter_output_annotations: optional - default = False, when exporting by filter, whether to also filter the output annotations
1190
- :param float alpha: opacity value [0 1], default 1
1191
- :param str export_version: `V2` - annotation files keep the item's original extension in the filename, `V1` - the original extension is dropped
1192
- :return: local_path of the directory where all the downloaded items are saved
1194
- :param bool export_summary: optional - default = False
1195
- :param int lock_timeout_sec: optional
1196
- :rtype: str
1197
-
1198
- **Example**:
1199
-
1200
- .. code-block:: python
1201
-
1202
- file_path = project.datasets.download_annotations(dataset='dataset_entity',
1203
- local_path='local_path',
1204
- annotation_options=dl.ViewAnnotationOptions,
1205
- overwrite=False,
1206
- thickness=1,
1207
- with_text=False,
1208
- alpha=1,
1209
- dataset_lock=False,
1210
- lock_timeout_sec=300,
1211
- export_summary=False
1212
- )
1213
- """
1214
- if annotation_options is None:
1215
- annotation_options = list()
1216
- elif not isinstance(annotation_options, list):
1217
- annotation_options = [annotation_options]
1218
- for ann_option in annotation_options:
1219
- if not isinstance(ann_option, entities.ViewAnnotationOptions):
1220
- if ann_option not in list(entities.ViewAnnotationOptions):
1221
- raise PlatformException(
1222
- error='400',
1223
- message='Unknown annotation download option: {}, please choose from: {}'.format(
1224
- ann_option, list(entities.ViewAnnotationOptions)))
1225
-
1226
- if remote_path is not None:
1227
- logger.warning(
1228
- '"remote_path" is ignored. Use "filters=dl.Filters(field="dir, values={!r}"'.format(remote_path))
1229
- if local_path is None:
1230
- if dataset.project is None:
1231
- # by dataset name
1232
- local_path = os.path.join(
1233
- services.service_defaults.DATALOOP_PATH,
1234
- "datasets",
1235
- "{}_{}".format(dataset.name, dataset.id),
1236
- )
1237
- else:
1238
- # by dataset and project name
1239
- local_path = os.path.join(
1240
- services.service_defaults.DATALOOP_PATH,
1241
- "projects",
1242
- dataset.project.name,
1243
- "datasets",
1244
- dataset.name,
1245
- )
1246
-
1247
- if filters is None:
1248
- filters = entities.Filters()
1249
- filters._user_query = 'false'
1250
- if annotation_filters is not None:
1251
- for annotation_filter_and in annotation_filters.and_filter_list:
1252
- filters.add_join(field=annotation_filter_and.field,
1253
- values=annotation_filter_and.values,
1254
- operator=annotation_filter_and.operator,
1255
- method=entities.FiltersMethod.AND)
1256
- for annotation_filter_or in annotation_filters.or_filter_list:
1257
- filters.add_join(field=annotation_filter_or.field,
1258
- values=annotation_filter_or.values,
1259
- operator=annotation_filter_or.operator,
1260
- method=entities.FiltersMethod.OR)
1261
-
1262
- downloader = repositories.Downloader(items_repository=dataset.items)
1263
- downloader.download_annotations(dataset=dataset,
1264
- filters=filters,
1265
- annotation_filters=annotation_filters,
1266
- local_path=local_path,
1267
- overwrite=overwrite,
1268
- include_annotations_in_output=include_annotations_in_output,
1269
- export_png_files=export_png_files,
1270
- filter_output_annotations=filter_output_annotations,
1271
- export_version=export_version,
1272
- dataset_lock=dataset_lock,
1273
- lock_timeout_sec=lock_timeout_sec,
1274
- export_summary=export_summary
1275
- )
1276
- if annotation_options:
1277
- pages = dataset.items.list(filters=filters)
1278
- if not isinstance(annotation_options, list):
1279
- annotation_options = [annotation_options]
1280
- # convert all annotations to annotation_options
1281
- pool = dataset._client_api.thread_pools(pool_name='dataset.download')
1282
- jobs = [None for _ in range(pages.items_count)]
1283
- progress = tqdm.tqdm(total=pages.items_count,
1284
- disable=dataset._client_api.verbose.disable_progress_bar_download_annotations,
1285
- file=sys.stdout, desc='Download Annotations')
1286
- i_item = 0
1287
- for page in pages:
1288
- for item in page:
1289
- jobs[i_item] = pool.submit(
1290
- Datasets._convert_single,
1291
- **{
1292
- 'downloader': downloader,
1293
- 'item': item,
1294
- 'img_filepath': None,
1295
- 'local_path': local_path,
1296
- 'overwrite': overwrite,
1297
- 'annotation_options': annotation_options,
1298
- 'annotation_filters': annotation_filters,
1299
- 'thickness': thickness,
1300
- 'with_text': with_text,
1301
- 'progress': progress,
1302
- 'alpha': alpha,
1303
- 'export_version': export_version
1304
- }
1305
- )
1306
- i_item += 1
1307
- # get all results
1308
- _ = [j.result() for j in jobs]
1309
- progress.close()
1310
- return local_path
1311
-
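An illustrative sketch of calling download_annotations() with both item and annotation filters (assumes an authenticated dtlpy client; the project/dataset names, folder and label values are hypothetical):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')
    dataset = project.datasets.get(dataset_name='my-dataset')

    item_filters = dl.Filters()
    item_filters.add(field='dir', values='/reviewed')  # hypothetical folder
    ann_filters = dl.Filters(resource=dl.FiltersResource.ANNOTATION)
    ann_filters.add(field='label', values='car')  # hypothetical label

    local_path = project.datasets.download_annotations(
        dataset=dataset,
        local_path='/tmp/annotations',
        filters=item_filters,
        annotation_filters=ann_filters,
        annotation_options=[dl.ViewAnnotationOptions.JSON,
                            dl.ViewAnnotationOptions.ANNOTATION_ON_IMAGE],
        thickness=2,
        alpha=0.6)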
1312
- def _upload_single_item_annotation(self, item, file, pbar):
1313
- try:
1314
- item.annotations.upload(file)
1315
- except Exception as err:
1316
- raise err
1317
- finally:
1318
- pbar.update()
1319
-
1320
- def upload_annotations(self,
1321
- dataset,
1322
- local_path,
1323
- filters: entities.Filters = None,
1324
- clean=False,
1325
- remote_root_path='/',
1326
- export_version=entities.ExportVersion.V1
1327
- ):
1328
- """
1329
- Upload annotations to dataset.
1330
-
1331
- Example for remote_root_path: If the item filepath is "/a/b/item" and remote_root_path is "/a" - the start folder will be b instead of a
1332
-
1333
- **Prerequisites**: You must have a dataset with items that correspond to the annotation files; items and annotation JSONs are matched by filename. You must be in the role of an *owner* or *developer*.
1334
-
1335
- :param dtlpy.entities.dataset.Dataset dataset: dataset to upload to
1336
- :param str local_path: str - local folder where the annotations files are
1337
- :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
1338
- :param bool clean: True to remove the old annotations
1339
- :param str remote_root_path: the remote root path to match remote and local items
1340
- :param str export_version: `V2` - annotation JSON filenames include the item's original extension, `V1` - they do not
1341
-
1342
- **Example**:
1343
-
1344
- .. code-block:: python
1345
-
1346
- project.datasets.upload_annotations(dataset='dataset_entity',
1347
- local_path='local_path',
1348
- clean=False,
1349
- export_version=dl.ExportVersion.V1
1350
- )
1351
- """
1352
- if filters is None:
1353
- filters = entities.Filters()
1354
- filters._user_query = 'false'
1355
- pages = dataset.items.list(filters=filters)
1356
- total_items = pages.items_count
1357
- pbar = tqdm.tqdm(total=total_items, disable=dataset._client_api.verbose.disable_progress_bar_upload_annotations,
1358
- file=sys.stdout, desc='Upload Annotations')
1359
- pool = self._client_api.thread_pools('annotation.upload')
1360
- annotations_uploaded_count = 0
1361
- for item in pages.all():
1362
- if export_version == entities.ExportVersion.V1:
1363
- _, ext = os.path.splitext(item.filename)
1364
- filepath = item.filename.replace(ext, '.json')
1365
- else:
1366
- filepath = item.filename + '.json'
1367
- # make the file path ignore the hierarchy of the files that in remote_root_path
1368
- filepath = os.path.relpath(filepath, remote_root_path)
1369
- json_file = os.path.join(local_path, filepath)
1370
- if not os.path.isfile(json_file):
1371
- pbar.update()
1372
- continue
1373
- annotations_uploaded_count += 1
1374
- if item.annotated and clean:
1375
- item.annotations.delete(filters=entities.Filters(resource=entities.FiltersResource.ANNOTATION))
1376
- pool.submit(self._upload_single_item_annotation, **{'item': item,
1377
- 'file': json_file,
1378
- 'pbar': pbar})
1379
- pool.shutdown()
1380
- if annotations_uploaded_count == 0:
1381
- logger.warning(msg="No annotations uploaded to dataset! ")
1382
- else:
1383
- logger.info(msg='Found and uploaded {} annotations.'.format(annotations_uploaded_count))
1384
-
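A sketch of the matching upload direction (illustrative; local_path is expected to hold one JSON per item, mirroring the items' remote paths; names and paths are hypothetical):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')
    dataset = project.datasets.get(dataset_name='my-dataset')

    project.datasets.upload_annotations(dataset=dataset,
                                        local_path='/tmp/annotations/my-dataset',
                                        clean=True,  # delete existing annotations before uploading
                                        remote_root_path='/',
                                        export_version=dl.ExportVersion.V1)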
1385
- def set_readonly(self, state: bool, dataset: entities.Dataset):
1386
- """
1387
- Set dataset readonly mode.
1388
-
1389
- **Prerequisites**: You must be in the role of an *owner* or *developer*.
1390
-
1391
- :param bool state: state to update readonly mode
1392
- :param dtlpy.entities.dataset.Dataset dataset: dataset object
1393
-
1394
- **Example**:
1395
-
1396
- .. code-block:: python
1397
-
1398
- project.datasets.set_readonly(dataset='dataset_entity', state=True)
1399
- """
1400
- import warnings
1401
- warnings.warn("`readonly` flag on dataset is deprecated, doing nothing.", DeprecationWarning)
1402
-
1403
- @_api_reference.add(path='/datasets/{id}/split', method='post')
1404
- def split_ml_subsets(self,
1405
- dataset_id: str,
1406
- items_query: entities.Filters,
1407
- ml_split_list: dict) -> bool:
1408
- """
1409
- Split dataset items into ML subsets.
1410
-
1411
- :param str dataset_id: The ID of the dataset.
1412
- :param dtlpy.entities.filters.Filters items_query: Filters entity that selects the items to split.
1413
- :param dict ml_split_list: Dictionary with 'train', 'validation', 'test' keys and integer percentages.
1414
- :return: True if the split operation was successful.
1415
- :rtype: bool
1416
- :raises: PlatformException on failure, and ValueError if the percentages do not sum to 100 or the keys/values are invalid.
1417
- """
1418
- # Validate percentages
1419
- if not ml_split_list:
1420
- ml_split_list = {'train': 80, 'validation': 10, 'test': 10}
1421
-
1422
- if not items_query:
1423
- items_query = entities.Filters()
1424
-
1425
- items_query_dict = items_query.prepare()
1426
- required_keys = {'train', 'validation', 'test'}
1427
- if set(ml_split_list.keys()) != required_keys:
1428
- raise ValueError("MLSplitList must have exactly the keys 'train', 'validation', 'test'.")
1429
- total = sum(ml_split_list.values())
1430
- if total != 100:
1431
- raise ValueError(
1432
- "Please set the Train, Validation, and Test subsets percentages to add up to 100%. "
1433
- "For example: 70, 15, 15."
1434
- )
1435
- for key, value in ml_split_list.items():
1436
- if not isinstance(value, int) or value < 0:
1437
- raise ValueError("Percentages must be integers >= 0.")
1438
- payload = {
1439
- 'itemsQuery': items_query_dict,
1440
- 'MLSplitList': ml_split_list
1441
- }
1442
- path = f'/datasets/{dataset_id}/split'
1443
- success, response = self._client_api.gen_request(req_type='post',
1444
- path=path,
1445
- json_req=payload)
1446
- if success:
1447
- # Wait for the split operation to complete
1448
- command = entities.Command.from_json(_json=response.json(),
1449
- client_api=self._client_api)
1450
- command.wait()
1451
- return True
1452
- else:
1453
- raise exceptions.PlatformException(response)
1454
-
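A sketch of splitting a filtered set of items into ML subsets (illustrative; the dataset id and folder are hypothetical, and the percentages must sum to 100):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')
    items_query = dl.Filters()
    items_query.add(field='dir', values='/train-pool')  # hypothetical folder

    success = project.datasets.split_ml_subsets(
        dataset_id='dataset_id',
        items_query=items_query,
        ml_split_list={'train': 70, 'validation': 15, 'test': 15})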
1455
- @_api_reference.add(path='/datasets/{id}/items/bulk-update-metadata', method='post')
1456
- def bulk_update_ml_subset(self, dataset_id: str, items_query: dict, subset: str = None, deleteTag: bool = False) -> bool:
1457
- """
1458
- Bulk update ML subset assignment for selected items.
1459
- If deleteTag is True, the ML subset tags are removed. Otherwise, the specified subset is assigned.
1460
-
1461
- :param str dataset_id: ID of the dataset
1462
- :param dict items_query: DQLResourceQuery (filters) for selecting items
1463
- :param str subset: 'train', 'validation' or 'test'; may be None only when deleteTag is True
- :param bool deleteTag: optional - default = False, set True to remove the ML subset tags from the selected items
1464
- :return: True if success
1465
- :rtype: bool
1466
- """
1467
- if items_query is None:
1468
- items_query = entities.Filters()
1469
- items_query_dict = items_query.prepare()
1470
- if not deleteTag and subset not in ['train', 'validation', 'test']:
1471
- raise ValueError("subset must be one of: 'train', 'validation', 'test'")
1472
- # Determine tag values based on subset
1473
- tags = {
1474
- 'train': True if subset == 'train' else None,
1475
- 'validation': True if subset == 'validation' else None,
1476
- 'test': True if subset == 'test' else None
1477
- }
1478
-
1479
- payload = {
1480
- "query": items_query_dict,
1481
- "updateQuery": {
1482
- "update": {
1483
- "metadata": {
1484
- "system": {
1485
- "tags": tags
1486
- }
1487
- }
1488
- },
1489
- "systemSpace": True
1490
- }
1491
- }
1492
-
1493
- success, response = self._client_api.gen_request(
1494
- req_type='post',
1495
- path=f'/datasets/{dataset_id}/items/bulk-update-metadata',
1496
- json_req=payload
1497
- )
1498
- if success:
1499
- # Similar to split operation, a command is returned
1500
- command = entities.Command.from_json(_json=response.json(), client_api=self._client_api)
1501
- command.wait()
1502
- return True
1503
- else:
1504
- raise exceptions.PlatformException(response)
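A sketch of bulk-assigning (and clearing) ML subset tags for a filtered set of items (illustrative; the dataset id and folder are hypothetical):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')
    filters = dl.Filters()
    filters.add(field='dir', values='/new-batch')  # hypothetical folder

    # assign every matching item to the validation subset
    project.datasets.bulk_update_ml_subset(dataset_id='dataset_id',
                                           items_query=filters,
                                           subset='validation')

    # or clear the subset tags entirely
    project.datasets.bulk_update_ml_subset(dataset_id='dataset_id',
                                           items_query=filters,
                                           subset=None,
                                           deleteTag=True)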
1
+ """
2
+ Datasets Repository
3
+ """
4
+
5
+ import copy
6
+ import json
7
+ import logging
8
+ import os
9
+ import sys
10
+ import tempfile
11
+ import time
12
+ import zipfile
13
+ from pathlib import Path
14
+ from typing import Generator, Optional, Union
15
+
16
+ import tqdm
17
+
18
+ from .. import _api_reference, entities, exceptions, miscellaneous, PlatformException, repositories, services
19
+ from ..entities.dataset import ExportType, OutputExportType
20
+ from ..services import service_defaults
21
+ from ..services.api_client import ApiClient
22
+
23
+ logger = logging.getLogger(name='dtlpy')
24
+
25
+ MAX_ITEMS_PER_SUBSET = 50000
26
+ DOWNLOAD_ANNOTATIONS_MAX_ITEMS_PER_SUBSET = 1000
27
+
28
+ class Datasets:
29
+ """
30
+ Datasets Repository
31
+
32
+ The Datasets class allows the user to manage datasets. Read more about datasets in our `documentation <https://dataloop.ai/docs/dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/manage_datasets/chapter/>`_.
33
+ """
34
+
35
+ def __init__(self, client_api: ApiClient, project: entities.Project = None):
36
+ self._client_api = client_api
37
+ self._project = project
38
+
39
+ ############
40
+ # entities #
41
+ ############
42
+ @property
43
+ def project(self) -> entities.Project:
44
+ if self._project is None:
45
+ # try get checkout
46
+ project = self._client_api.state_io.get('project')
47
+ if project is not None:
48
+ self._project = entities.Project.from_json(_json=project, client_api=self._client_api)
49
+ if self._project is None:
50
+ raise exceptions.PlatformException(
51
+ error='2001',
52
+ message='Cannot perform action WITHOUT Project entity in Datasets repository.'
53
+ ' Please checkout or set a project')
54
+ assert isinstance(self._project, entities.Project)
55
+ return self._project
56
+
57
+ @project.setter
58
+ def project(self, project: entities.Project):
59
+ if not isinstance(project, entities.Project):
60
+ raise ValueError('Must input a valid Project entity')
61
+ self._project = project
62
+
63
+ ###########
64
+ # methods #
65
+ ###########
66
+ def __get_from_cache(self) -> entities.Dataset:
67
+ dataset = self._client_api.state_io.get('dataset')
68
+ if dataset is not None:
69
+ dataset = entities.Dataset.from_json(_json=dataset,
70
+ client_api=self._client_api,
71
+ datasets=self,
72
+ project=self._project)
73
+ return dataset
74
+
75
+ def __get_by_id(self, dataset_id) -> entities.Dataset:
76
+ if dataset_id is None or dataset_id == '':
77
+ raise exceptions.PlatformException('400', 'Please checkout a dataset')
78
+ success, response = self._client_api.gen_request(req_type='get',
79
+ path='/datasets/{}'.format(dataset_id))
80
+
81
+ if success:
82
+ dataset = entities.Dataset.from_json(client_api=self._client_api,
83
+ _json=response.json(),
84
+ datasets=self,
85
+ project=self._project)
86
+ else:
87
+ raise exceptions.PlatformException(response)
88
+ return dataset
89
+
90
+ def __get_by_identifier(self, identifier=None) -> entities.Dataset:
91
+ datasets = self.list()
92
+ datasets_by_name = [dataset for dataset in datasets if identifier in dataset.name or identifier in dataset.id]
93
+ if len(datasets_by_name) == 1:
94
+ return datasets_by_name[0]
95
+ elif len(datasets_by_name) > 1:
96
+ raise Exception('Multiple datasets with this name exist')
97
+ else:
98
+ raise Exception("Dataset not found")
99
+
100
+ def _bulid_folder_filter(self, folder_path, filters=None):
101
+ if filters is None:
102
+ filters = entities.Filters()
103
+ filters._user_query = 'false'
104
+ if not folder_path.startswith('/'):
105
+ folder_path = '/' + folder_path
106
+ filters.add(field='dir', values=folder_path, method=entities.FiltersMethod.OR)
107
+ if not folder_path.endswith('*'):
108
+ if not folder_path.endswith('/'):
109
+ folder_path += '/'
110
+ filters.add(field='dir', values=folder_path + '*', method=entities.FiltersMethod.OR)
111
+ return filters
112
+
113
+ def _get_binaries_dataset(self):
114
+ filters = entities.Filters(resource=entities.FiltersResource.DATASET)
115
+ filters.add(field='name', values='Binaries')
116
+ filters.system_space = True
117
+ datasets = self.list(filters=filters)
118
+ if len(datasets) == 0:
119
+ # empty list
120
+ raise exceptions.PlatformException('404', 'Dataset not found. Name: "Binaries"')
121
+ # dataset = None
122
+ elif len(datasets) > 1:
123
+ raise exceptions.PlatformException('400', 'More than one dataset with same name.')
124
+ else:
125
+ dataset = datasets[0]
126
+ return dataset
127
+
128
+ def _resolve_dataset_id(self, dataset, dataset_name, dataset_id):
129
+ if dataset is None and dataset_name is None and dataset_id is None:
130
+ raise ValueError('Must provide dataset, dataset name or dataset id')
131
+ if dataset_id is None:
132
+ if dataset is None:
133
+ dataset = self.get(dataset_name=dataset_name)
134
+ dataset_id = dataset.id
135
+ return dataset_id
136
+
137
+ @staticmethod
138
+ def _save_item_json_file(item_data, base_path: Path, export_version=None):
139
+ """
140
+ Save a single item's JSON data to a file, creating the directory structure as needed.
141
+
142
+ :param dict item_data: The item data dictionary (must have 'filename' key)
143
+ :param Path base_path: Base directory path where JSON files should be saved
144
+ :param entities.ExportVersion export_version: Optional export version (V1 or V2) affecting filename handling
145
+ :return: Path to the saved JSON file
146
+ :rtype: Path
147
+ """
148
+ # Get filename and remove leading slash
149
+ filename = item_data.get('filename', '')
150
+ if not filename:
151
+ raise ValueError("item_data must have a 'filename' key")
152
+ filename = filename.lstrip('/')
153
+
154
+ # Determine relative JSON path based on export version
155
+ if export_version == entities.ExportVersion.V1:
156
+ # V1: Replace extension with .json (e.g., "file.jpg" -> "file.json")
157
+ rel_json_path = str(Path(filename).with_suffix('.json'))
158
+ elif export_version == entities.ExportVersion.V2:
159
+ # V2: Append .json (e.g., "file.jpg" -> "file.jpg.json")
160
+ rel_json_path = filename + '.json'
161
+ else:
162
+ # Default/None: Replace extension with .json (backward compatible with section 1)
163
+ rel_json_path = os.path.splitext(filename)[0] + '.json'
164
+
165
+ # Remove leading slash if present
166
+ if rel_json_path.startswith('/'):
167
+ rel_json_path = rel_json_path[1:]
168
+
169
+ # Build output path
170
+ out_path = base_path / rel_json_path
171
+
172
+ # Create parent directories
173
+ out_path.parent.mkdir(parents=True, exist_ok=True)
174
+
175
+ # Write JSON file
176
+ try:
177
+ with open(out_path, 'w') as outf:
178
+ json.dump(item_data, outf, indent=2)
179
+ except Exception:
180
+ logger.exception(f'Failed writing export item JSON to {out_path}')
181
+ raise
182
+
183
+ return out_path
184
+
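For reference, a small sketch of the filename handling implemented above (assumes the repository class is importable via dtlpy.repositories, as used elsewhere in this module; paths are hypothetical):

.. code-block:: python

    from pathlib import Path
    from dtlpy import entities, repositories

    item_data = {'filename': '/dogs/0001.jpg', 'annotations': []}
    base = Path('/tmp/export')  # hypothetical output directory

    # V1 replaces the extension: /tmp/export/dogs/0001.json
    path_v1 = repositories.Datasets._save_item_json_file(item_data, base, entities.ExportVersion.V1)
    # V2 appends .json:          /tmp/export/dogs/0001.jpg.json
    path_v2 = repositories.Datasets._save_item_json_file(item_data, base, entities.ExportVersion.V2)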
185
+ @staticmethod
186
+ def _build_payload(filters, include_feature_vectors, include_annotations,
187
+ export_type, annotation_filters, feature_vector_filters, dataset_lock, lock_timeout_sec, export_summary):
188
+ valid_list = [e.value for e in entities.ExportType]
189
+ valid_types = ', '.join(valid_list)
190
+ if export_type not in ['json', 'zip']:
191
+ raise ValueError('export_type must be one of the following: {}'.format(valid_types))
192
+ payload = {'exportType': export_type}
193
+ if filters is None:
194
+ filters = entities.Filters()
195
+
196
+ if isinstance(filters, entities.Filters):
197
+ payload['itemsQuery'] = {'filter': filters.prepare()['filter'], 'join': filters.prepare().get("join", {})}
198
+ elif isinstance(filters, dict):
199
+ payload['itemsQuery'] = filters
200
+ else:
201
+ raise exceptions.BadRequest(message='filters must be of type dict or Filters', status_code=500)
202
+
203
+ payload['itemsVectorQuery'] = {}
204
+ if include_feature_vectors:
205
+ payload['includeItemVectors'] = True
206
+ payload['itemsVectorQuery']['select'] = {"datasetId": 1, 'featureSetId': 1, 'value': 1}
207
+
208
+ if feature_vector_filters is not None:
209
+ payload['itemsVectorQuery']['filter'] = feature_vector_filters.prepare()['filter']
210
+
211
+ payload['annotations'] = {"include": include_annotations, "convertSemantic": False}
212
+
213
+ if annotation_filters is not None:
214
+ payload['annotationsQuery'] = annotation_filters.prepare()
215
+
216
+ if dataset_lock:
217
+ payload['datasetLock'] = dataset_lock
218
+
219
+ if export_summary:
220
+ payload['summary'] = export_summary
221
+
222
+ if lock_timeout_sec:
223
+ payload['lockTimeoutSec'] = lock_timeout_sec
224
+
225
+ return payload
226
+
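For orientation, the payload produced by the helper above for a plain JSON export with annotations looks roughly like this (illustrative sketch; the filter body is elided):

.. code-block:: python

    from dtlpy import entities, repositories

    payload = repositories.Datasets._build_payload(
        filters=entities.Filters(),
        include_feature_vectors=False,
        include_annotations=True,
        export_type='json',
        annotation_filters=None,
        feature_vector_filters=None,
        dataset_lock=False,
        lock_timeout_sec=None,
        export_summary=False)
    # payload == {'exportType': 'json',
    #             'itemsQuery': {'filter': {...}, 'join': {}},
    #             'itemsVectorQuery': {},
    #             'annotations': {'include': True, 'convertSemantic': False}}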
227
+ def _download_exported_item(self, item_id, export_type, local_path=None, unzip=True):
228
+ logger.debug(f"start downloading exported item {item_id} with export_type {export_type} and local_path {local_path} and unzip {unzip}")
229
+ export_item = repositories.Items(client_api=self._client_api).get(item_id=item_id)
230
+ export_item_path = export_item.download(local_path=local_path)
231
+
232
+ # Common validation check for both JSON and other export types
233
+ if isinstance(export_item_path, list) or not os.path.isfile(export_item_path):
234
+ raise exceptions.PlatformException(
235
+ error='404',
236
+ message='error downloading annotation zip file. see above for more information. item id: {!r}'.format(
237
+ export_item.id))
238
+
239
+ result = None
240
+ if unzip is False or export_type == entities.ExportType.JSON:
241
+ result = export_item_path
242
+ else:
243
+ try:
244
+ miscellaneous.Zipping.unzip_directory(zip_filename=export_item_path,
245
+ to_directory=local_path)
246
+ result = local_path
247
+ except Exception as e:
248
+ logger.warning("Failed to extract zip file error: {}".format(e))
249
+ finally:
250
+ # cleanup only for zip files to avoid removing needed results
251
+ if isinstance(export_item_path, str) and os.path.isfile(export_item_path):
252
+ os.remove(export_item_path)
253
+ logger.debug(f"end downloading, result {result}")
254
+ return result
255
+
256
+ @property
257
+ def platform_url(self):
258
+ return self._client_api._get_resource_url("projects/{}/datasets".format(self.project.id))
259
+
260
+ def open_in_web(self,
261
+ dataset_name: str = None,
262
+ dataset_id: str = None,
263
+ dataset: entities.Dataset = None):
264
+ """
265
+ Open the dataset in web platform.
266
+
267
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
268
+
269
+ :param str dataset_name: The Name of the dataset
270
+ :param str dataset_id: The Id of the dataset
271
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
272
+
273
+ **Example**:
274
+
275
+ .. code-block:: python
276
+
277
+ project.datasets.open_in_web(dataset_id='dataset_id')
278
+ """
279
+ if dataset_name is not None:
280
+ dataset = self.get(dataset_name=dataset_name)
281
+ if dataset is not None:
282
+ dataset.open_in_web()
283
+ elif dataset_id is not None:
284
+ self._client_api._open_in_web(url=f'{self.platform_url}/{dataset_id}/items')
285
+ else:
286
+ self._client_api._open_in_web(url=self.platform_url)
287
+
288
+ def checkout(self,
289
+ identifier: str = None,
290
+ dataset_name: str = None,
291
+ dataset_id: str = None,
292
+ dataset: entities.Dataset = None):
293
+ """
294
+ Checkout (switch) to a dataset to work on it.
295
+
296
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
297
+
298
+ You must provide at least ONE of the following params: dataset_id, dataset_name.
299
+
300
+ :param str identifier: project name or partial id that you wish to switch
301
+ :param str dataset_name: The Name of the dataset
302
+ :param str dataset_id: The Id of the dataset
303
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
304
+
305
+ **Example**:
306
+
307
+ .. code-block:: python
308
+
309
+ project.datasets.checkout(dataset_id='dataset_id')
310
+ """
311
+ if dataset is None:
312
+ if dataset_id is not None or dataset_name is not None:
313
+ try:
314
+ dataset = self.project.datasets.get(dataset_name=dataset_name, dataset_id=dataset_id)
315
+ except exceptions.MissingEntity:
316
+ dataset = self.get(dataset_id=dataset_id, dataset_name=dataset_name)
317
+ elif identifier is not None:
318
+ dataset = self.__get_by_identifier(identifier=identifier)
319
+ else:
320
+ raise exceptions.PlatformException(error='400',
321
+ message='Must provide partial/full id/name to checkout')
322
+ self._client_api.state_io.put('dataset', dataset.to_json())
323
+ logger.info('Checked out to dataset {}'.format(dataset.name))
324
+
325
+ @_api_reference.add(path='/datasets/query', method='post')
326
+ def list(self, name=None, creator=None, filters: entities.Filters = None) -> miscellaneous.List[entities.Dataset]:
327
+ """
328
+ List all datasets.
329
+
330
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
331
+
332
+ :param str name: list by name
333
+ :param str creator: list by creator
334
+ :param dtlpy.entities.filters.Filters filters: Filters entity containing filters parameters
335
+ :return: List of datasets
336
+ :rtype: list
337
+
338
+ **Example**:
339
+
340
+ .. code-block:: python
341
+ filters = dl.Filters(resource='datasets')
342
+ filters.add(field='readonly', values=False)
343
+ datasets = project.datasets.list(filters=filters)
344
+ """
345
+ if filters is None:
346
+ filters = entities.Filters(resource=entities.FiltersResource.DATASET)
347
+ # assert type filters
348
+ elif not isinstance(filters, entities.Filters):
349
+ raise exceptions.PlatformException(error='400',
350
+ message='Unknown filters type: {!r}'.format(type(filters)))
351
+ if filters.resource != entities.FiltersResource.DATASET:
352
+ raise exceptions.PlatformException(
353
+ error='400',
354
+ message='Filters resource must be FiltersResource.DATASET. Got: {!r}'.format(filters.resource))
355
+
356
+ url = '/datasets/query'
357
+
358
+ if name is not None:
359
+ filters.add(field='name', values=name)
360
+ if creator is not None:
361
+ filters.add(field='creator', values=creator)
362
+ if self._project is not None:
363
+ filters.context = {"projects": [self._project.id]}
364
+ filters.page_size = 1000
365
+ filters.page = 0
366
+ datasets = list()
367
+ while True:
368
+ success, response = self._client_api.gen_request(req_type='POST',
369
+ json_req=filters.prepare(),
370
+ path=url,
371
+ headers={'user_query': filters._user_query})
372
+ if success:
373
+ pool = self._client_api.thread_pools('entity.create')
374
+ datasets_json = response.json()['items']
375
+ jobs = [None for _ in range(len(datasets_json))]
376
+ # return triggers list
377
+ for i_dataset, dataset in enumerate(datasets_json):
378
+ jobs[i_dataset] = pool.submit(entities.Dataset._protected_from_json,
379
+ **{'client_api': self._client_api,
380
+ '_json': dataset,
381
+ 'datasets': self,
382
+ 'project': self.project})
383
+
384
+ # get all results
385
+ results = [j.result() for j in jobs]
386
+ # log errors
387
+ _ = [logger.warning(r[1]) for r in results if r[0] is False]
388
+ # return good jobs
389
+ datasets.extend([r[1] for r in results if r[0] is True])
390
+ if response.json()['hasNextPage'] is True:
391
+ filters.page += 1
392
+ else:
393
+ break
394
+ else:
395
+ raise exceptions.PlatformException(response)
396
+ datasets = miscellaneous.List(datasets)
397
+ return datasets
398
+
399
+ @_api_reference.add(path='/datasets/{id}', method='get')
400
+ def get(self,
401
+ dataset_name: str = None,
402
+ dataset_id: str = None,
403
+ checkout: bool = False,
404
+ fetch: bool = None
405
+ ) -> entities.Dataset:
406
+ """
407
+ Get dataset by name or id.
408
+
409
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
410
+
411
+ You must provide at least ONE of the following params: dataset_id, dataset_name.
412
+
413
+ :param str dataset_name: optional - search by name
414
+ :param str dataset_id: optional - search by id
415
+ :param bool checkout: set the dataset as a default dataset object (cookies)
416
+ :param bool fetch: optional - fetch entity from platform (True), default taken from cookie
417
+ :return: Dataset object
418
+ :rtype: dtlpy.entities.dataset.Dataset
419
+
420
+ **Example**:
421
+
422
+ .. code-block:: python
423
+
424
+ dataset = project.datasets.get(dataset_id='dataset_id')
425
+ """
426
+ if fetch is None:
427
+ fetch = self._client_api.fetch_entities
428
+
429
+ if dataset_id is None and dataset_name is None:
430
+ dataset = self.__get_from_cache()
431
+ if dataset is None:
432
+ raise exceptions.PlatformException(
433
+ error='400',
434
+ message='No checked-out Dataset was found, must checkout or provide an identifier in inputs')
435
+ elif fetch:
436
+ if dataset_id is not None and dataset_id != '':
437
+ dataset = self.__get_by_id(dataset_id)
438
+ # verify input dataset name is same as the given id
439
+ if dataset_name is not None and dataset.name != dataset_name:
440
+ logger.warning(
441
+ "Mismatch found in datasets.get: dataset_name is different then dataset.name: "
442
+ "{!r} != {!r}".format(
443
+ dataset_name,
444
+ dataset.name))
445
+ elif dataset_name is not None:
446
+ datasets = self.list(name=dataset_name)
447
+ if not datasets:
448
+ # empty list
449
+ raise exceptions.PlatformException('404', 'Dataset not found. Name: {!r}'.format(dataset_name))
450
+ # dataset = None
451
+ elif len(datasets) > 1:
452
+ raise exceptions.PlatformException('400', 'More than one dataset with same name.')
453
+ else:
454
+ dataset = datasets[0]
455
+ else:
456
+ raise exceptions.PlatformException(
457
+ error='404',
458
+ message='No input and no checked-out found')
459
+ else:
460
+ dataset = entities.Dataset.from_json(_json={'id': dataset_id,
461
+ 'name': dataset_id},
462
+ client_api=self._client_api,
463
+ datasets=self,
464
+ project=self._project,
465
+ is_fetched=False)
466
+ assert isinstance(dataset, entities.Dataset)
467
+ if checkout:
468
+ self.checkout(dataset=dataset)
469
+ return dataset
470
+
471
+ @_api_reference.add(path='/datasets/{id}', method='delete')
472
+ def delete(self,
473
+ dataset_name: str = None,
474
+ dataset_id: str = None,
475
+ sure: bool = False,
476
+ really: bool = False):
477
+ """
478
+ Delete a dataset forever!
479
+
480
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
481
+
482
+ **Example**:
483
+
484
+ .. code-block:: python
485
+
486
+ is_deleted = project.datasets.delete(dataset_id='dataset_id', sure=True, really=True)
487
+
488
+ :param str dataset_name: optional - search by name
489
+ :param str dataset_id: optional - search by id
490
+ :param bool sure: Are you sure you want to delete?
491
+ :param bool really: Really really sure?
492
+ :return: True if success
493
+ :rtype: bool
494
+ """
495
+ if sure and really:
496
+ dataset = self.get(dataset_name=dataset_name, dataset_id=dataset_id)
497
+ success, response = self._client_api.gen_request(req_type='delete',
498
+ path='/datasets/{}'.format(dataset.id))
499
+ if not success:
500
+ raise exceptions.PlatformException(response)
501
+ logger.info('Dataset {!r} was deleted successfully'.format(dataset.name))
502
+ return True
503
+ else:
504
+ raise exceptions.PlatformException(
505
+ error='403',
506
+ message="Can't delete dataset from the SDK. Please log in to the platform to delete it")
507
+
508
+ @_api_reference.add(path='/datasets/{id}', method='patch')
509
+ def update(self,
510
+ dataset: entities.Dataset,
511
+ system_metadata: bool = False,
512
+ patch: dict = None
513
+ ) -> entities.Dataset:
514
+ """
515
+ Update dataset field.
516
+
517
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
518
+
519
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
520
+ :param bool system_metadata: True, if you want to update the system metadata as well
521
+ :param dict patch: Specific patch request
522
+ :return: Dataset object
523
+ :rtype: dtlpy.entities.dataset.Dataset
524
+
525
+ **Example**:
526
+
527
+ .. code-block:: python
528
+
529
+ dataset = project.datasets.update(dataset='dataset_entity')
530
+ """
531
+ url_path = '/datasets/{}'.format(dataset.id)
532
+ if system_metadata:
533
+ url_path += '?system=true'
534
+
535
+ if patch is None:
536
+ patch = dataset.to_json()
537
+
538
+ success, response = self._client_api.gen_request(req_type='patch',
539
+ path=url_path,
540
+ json_req=patch)
541
+ if success:
542
+ logger.info('Dataset was updated successfully')
543
+ return dataset
544
+ else:
545
+ raise exceptions.PlatformException(response)
546
+
547
+ @_api_reference.add(path='/datasets/{id}/unlock', method='patch')
548
+ def unlock(self, dataset: entities.Dataset) -> entities.Dataset:
549
+ """
550
+ Unlock dataset.
551
+
552
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
553
+
554
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
555
+ :return: Dataset object
556
+ :rtype: dtlpy.entities.dataset.Dataset
557
+
558
+ **Example**:
559
+
560
+ .. code-block:: python
561
+
562
+ dataset = project.datasets.unlock(dataset='dataset_entity')
563
+ """
564
+ url_path = '/datasets/{}/unlock'.format(dataset.id)
565
+
566
+ success, response = self._client_api.gen_request(req_type='patch', path=url_path)
567
+ if success:
568
+ logger.info('Dataset was unlocked successfully')
569
+ return dataset
570
+ else:
571
+ raise exceptions.PlatformException(response)
572
+
573
+ @_api_reference.add(path='/datasets/{id}/directoryTree', method='get')
574
+ def directory_tree(self,
575
+ dataset: entities.Dataset = None,
576
+ dataset_name: str = None,
577
+ dataset_id: str = None):
578
+ """
579
+ Get dataset's directory tree.
580
+
581
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
582
+
583
+ You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
584
+
585
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
586
+ :param str dataset_name: The Name of the dataset
587
+ :param str dataset_id: The Id of the dataset
588
+ :return: DirectoryTree
589
+
590
+ **Example**:
591
+
592
+ .. code-block:: python
593
+ directory_tree = dataset.directory_tree
594
+ directory_tree = project.datasets.directory_tree(dataset='dataset_entity')
595
+ """
596
+ dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
597
+
598
+ url_path = '/datasets/{}/directoryTree'.format(dataset_id)
599
+
600
+ success, response = self._client_api.gen_request(req_type='get',
601
+ path=url_path)
602
+
603
+ if success:
604
+ return entities.DirectoryTree(_json=response.json())
605
+ else:
606
+ raise exceptions.PlatformException(response)
607
+
608
+ @_api_reference.add(path='/datasets/{id}/clone', method='post')
609
+ def clone(self,
610
+ dataset_id: str,
611
+ clone_name: str = None,
612
+ filters: entities.Filters = None,
613
+ with_items_annotations: bool = True,
614
+ with_metadata: bool = True,
615
+ with_task_annotations_status: bool = True,
616
+ dst_dataset_id: str = None,
617
+ target_directory: str = None):
618
+ """
619
+ Clone a dataset. Read more about cloning datasets and items in our `documentation <https://dataloop.ai/docs/clone-merge-dataset#cloned-dataset>`_ and `SDK documentation <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_.
620
+
621
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
622
+
623
+ :param str dataset_id: id of the dataset you wish to clone
624
+ :param str clone_name: new dataset name
625
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a query dict
626
+ :param bool with_items_annotations: true to clone with items annotations
627
+ :param bool with_metadata: true to clone with metadata
628
+ :param bool with_task_annotations_status: true to clone with task annotations' status
629
+ :param str dst_dataset_id: destination dataset id
630
+ :param str target_directory: target directory
631
+ :return: dataset object
632
+ :rtype: dtlpy.entities.dataset.Dataset
633
+
634
+ **Example**:
635
+
636
+ .. code-block:: python
637
+
638
+ dataset = project.datasets.clone(dataset_id='dataset_id',
639
+ clone_name='dataset_clone_name',
640
+ with_metadata=True,
641
+ with_items_annotations=False,
642
+ with_task_annotations_status=False)
643
+ """
644
+ if clone_name is None and dst_dataset_id is None:
645
+ raise exceptions.PlatformException('400', 'Must provide clone name or destination dataset id')
646
+ if filters is None:
647
+ filters = entities.Filters()
648
+ filters._user_query = 'false'
649
+ elif not isinstance(filters, entities.Filters):
650
+ raise exceptions.PlatformException(
651
+ error='400',
652
+ message='"filters" must be a dl.Filters entity. got: {!r}'.format(type(filters)))
653
+
654
+ copy_filters = copy.deepcopy(filters)
655
+ if copy_filters.has_field('hidden'):
656
+ copy_filters.pop('hidden')
657
+
658
+ if target_directory is not None and not target_directory.startswith('/'):
659
+ target_directory = '/' + target_directory
660
+
661
+ payload = {
662
+ "name": clone_name,
663
+ "filter": copy_filters.prepare(),
664
+ "cloneDatasetParams": {
665
+ "withItemsAnnotations": with_items_annotations,
666
+ "withMetadata": with_metadata,
667
+ "withTaskAnnotationsStatus": with_task_annotations_status,
668
+ "targetDirectory": target_directory
669
+ }
670
+ }
671
+ if dst_dataset_id is not None:
672
+ payload['cloneDatasetParams']['targetDatasetId'] = dst_dataset_id
673
+ success, response = self._client_api.gen_request(req_type='post',
674
+ path='/datasets/{}/clone'.format(dataset_id),
675
+ json_req=payload,
676
+ headers={'user_query': filters._user_query})
677
+
678
+ if not success:
679
+ raise exceptions.PlatformException(response)
680
+
681
+ command = entities.Command.from_json(_json=response.json(),
682
+ client_api=self._client_api)
683
+ command = command.wait()
684
+
685
+ if 'returnedModelId' not in command.spec:
686
+ raise exceptions.PlatformException(error='400',
687
+ message="returnedModelId key is missing in command response: {!r}"
688
+ .format(response))
689
+ return self.get(dataset_id=command.spec['returnedModelId'])
690
+
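Beyond the docstring example above, a sketch of cloning only a filtered subset into an existing destination dataset under a target directory (illustrative; ids, folder and directory names are hypothetical):

.. code-block:: python

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')
    filters = dl.Filters()
    filters.add(field='dir', values='/reviewed')  # hypothetical source folder

    cloned = project.datasets.clone(dataset_id='src_dataset_id',
                                    dst_dataset_id='dst_dataset_id',
                                    filters=filters,
                                    target_directory='/from-src',
                                    with_items_annotations=True)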
691
+ def _export_recursive(
692
+ self,
693
+ dataset: entities.Dataset = None,
694
+ dataset_name: str = None,
695
+ dataset_id: str = None,
696
+ local_path: str = None,
697
+ filters: Union[dict, entities.Filters] = None,
698
+ annotation_filters: entities.Filters = None,
699
+ feature_vector_filters: entities.Filters = None,
700
+ include_feature_vectors: bool = False,
701
+ include_annotations: bool = False,
702
+ timeout: int = 0,
703
+ dataset_lock: bool = False,
704
+ lock_timeout_sec: int = None,
705
+ export_summary: bool = False,
706
+ max_items_per_subset: int = MAX_ITEMS_PER_SUBSET,
707
+ export_type: ExportType = ExportType.JSON,
708
+ output_export_type: OutputExportType = OutputExportType.JSON,
709
+ ) -> Generator[str, None, None]:
710
+ """
711
+ Export dataset items recursively by splitting large datasets into smaller subsets.
712
+
713
+ Args:
714
+ dataset (entities.Dataset, optional): Dataset entity to export
715
+ dataset_name (str, optional): Name of the dataset to export
716
+ dataset_id (str, optional): ID of the dataset to export
717
+ local_path (str, optional): Local path to save the exported data
718
+ filters (Union[dict, entities.Filters], optional): Filters to apply on the items
719
+ annotation_filters (entities.Filters, optional): Filters to apply on the annotations
720
+ feature_vector_filters (entities.Filters, optional): Filters to apply on the feature vectors
721
+ include_feature_vectors (bool, optional): Whether to include feature vectors in export. Defaults to False
722
+ include_annotations (bool, optional): Whether to include annotations in export. Defaults to False
723
+ timeout (int, optional): Timeout in seconds for the export operation. Defaults to 0
724
+ dataset_lock (bool, optional): Whether to lock the dataset during export. Defaults to False
725
+ lock_timeout_sec (int, optional): Timeout for dataset lock in seconds. Defaults to None
726
+ export_summary (bool, optional): Whether to include export summary. Defaults to False
727
+ max_items_per_subset (int, optional): Maximum items per subset for recursive export. Defaults to MAX_ITEMS_PER_SUBSET
728
+ export_type (ExportType, optional): Type of export (JSON or ZIP). Defaults to ExportType.JSON
729
+ output_export_type (OutputExportType, optional): Output format type. Defaults to OutputExportType.JSON
730
+
731
+ Returns:
732
+ Generator[str, None, None]: Generator yielding export paths
733
+
734
+ Raises:
735
+ NotImplementedError: If ZIP export type is used with JSON output type
736
+ exceptions.PlatformException: If API request fails or command response is invalid
737
+ """
738
+ logger.debug(f"exporting dataset with export_type {export_type} and output_export_type {output_export_type}")
739
+ if export_type == ExportType.ZIP and output_export_type == OutputExportType.JSON:
740
+ raise NotImplementedError(
741
+ "Zip export type is not supported for JSON output type.\n"
742
+ "If Json output is required, please use the export_type = JSON"
743
+ )
744
+
745
+ # Get dataset entity for recursive filtering
746
+ dataset_entity = self.get(dataset_id=self._resolve_dataset_id(dataset, dataset_name, dataset_id))
747
+ if export_type != ExportType.JSON:
748
+ filters_list = [filters]
749
+ else:
750
+ # Generate filter subsets using recursive_get_filters
751
+ filters_list = entities.Filters._get_split_filters(
752
+ dataset=dataset_entity, filters=filters, max_items=max_items_per_subset
753
+ )
754
+ # First loop: Make all API requests without waiting
755
+ commands = []
756
+ logger.debug("start making all API requests without waiting")
757
+ for filter_i in filters_list:
758
+ # Build payload for this subset
759
+ payload = self._build_payload(
760
+ filters=filter_i,
761
+ include_feature_vectors=include_feature_vectors,
762
+ include_annotations=include_annotations,
763
+ export_type=export_type,
764
+ annotation_filters=annotation_filters,
765
+ feature_vector_filters=feature_vector_filters,
766
+ dataset_lock=dataset_lock,
767
+ lock_timeout_sec=lock_timeout_sec,
768
+ export_summary=export_summary,
769
+ )
770
+
771
+ # Make API request for this subset
772
+ success, response = self._client_api.gen_request(
773
+ req_type='post', path=f'/datasets/{dataset_entity.id}/export', json_req=payload
774
+ )
775
+
776
+ if not success:
777
+ logger.error(f"failed to make API request /datasets/{dataset_entity.id}/export with payload {payload} response {response}")
778
+ raise exceptions.PlatformException(response)
779
+
780
+ # Handle command execution
781
+ commands.append( entities.Command.from_json(_json=response.json(), client_api=self._client_api))
782
+
783
+ time.sleep(2)  # the command reports wrong progress at the beginning
784
+ logger.debug("start waiting for all commands")
785
+ # Second loop: Wait for all commands and process results
786
+ for command in commands:
787
+ command = command.wait(timeout=timeout)
788
+
789
+ if 'outputItemId' not in command.spec:
790
+ raise exceptions.PlatformException(
791
+ error='400', message="outputItemId key is missing in command response"
792
+ )
793
+
794
+ item_id = command.spec['outputItemId']
795
+ # Download and process the exported item
796
+ yield self._download_exported_item(
797
+ item_id=item_id,
798
+ export_type=export_type,
799
+ local_path=local_path,
800
+ unzip=output_export_type != OutputExportType.ZIP,
801
+ )
802
+
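A sketch of how this internal generator streams one downloaded path per subset as the export commands complete (illustrative; mainly intended for the public export() wrapper below, and names/ids are hypothetical):

.. code-block:: python

    import dtlpy as dl
    from dtlpy.entities.dataset import ExportType, OutputExportType

    project = dl.projects.get(project_name='my-project')
    datasets_repo = project.datasets
    for subset_path in datasets_repo._export_recursive(dataset_id='dataset_id',
                                                       include_annotations=True,
                                                       export_type=ExportType.JSON,
                                                       output_export_type=OutputExportType.JSON):
        print('downloaded subset JSON:', subset_path)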
803
+ @_api_reference.add(path='/datasets/{id}/export', method='post')
804
+ def export(
805
+ self,
806
+ dataset: entities.Dataset = None,
807
+ dataset_name: str = None,
808
+ dataset_id: str = None,
809
+ local_path: str = None,
810
+ filters: Union[dict, entities.Filters] = None,
811
+ annotation_filters: entities.Filters = None,
812
+ feature_vector_filters: entities.Filters = None,
813
+ include_feature_vectors: bool = False,
814
+ include_annotations: bool = False,
815
+ export_type: ExportType = ExportType.JSON,
816
+ timeout: int = 0,
817
+ dataset_lock: bool = False,
818
+ lock_timeout_sec: int = None,
819
+ export_summary: bool = False,
820
+ output_export_type: OutputExportType = None,
821
+ ) -> Optional[str]:
822
+ """
823
+ Export dataset items and annotations.
824
+
825
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
826
+
827
+ You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.
828
+
829
+ **Export Behavior by Parameter Combination:**
830
+
831
+ The behavior of this method depends on the combination of `export_type` and `output_export_type`:
832
+
833
+ **When export_type = ExportType.JSON:**
834
+
835
+ - **output_export_type = OutputExportType.JSON (default when None):**
836
+ - Exports data in JSON format, split into subsets of at most MAX_ITEMS_PER_SUBSET items
838
+ - Downloads all subset JSON files and concatenates them into a single `<dataset_id>.json` file
838
+ - Returns the path to the concatenated JSON file
839
+ - Cleans up individual subset files after concatenation
840
+
841
+ - **output_export_type = OutputExportType.ZIP:**
842
+ - Same as JSON export, but zips the final `<dataset_id>.json` file
844
+ - Returns the path to the zipped file (`<dataset_id>.json.zip`)
844
+ - Cleans up the unzipped JSON file after zipping
845
+
846
+ - **output_export_type = OutputExportType.FOLDERS:**
847
+ - Exports data in JSON format, split into subsets of at most MAX_ITEMS_PER_SUBSET items
848
+ - Downloads all subset JSON files and creates individual JSON files for each item
849
+ - Creates a folder structure mirroring the remote dataset structure
850
+ - Returns the path to the base directory containing the folder structure
851
+ - Each item gets its own JSON file named after the original filename
852
+
853
+ **When export_type = ExportType.ZIP:**
854
+
855
+ - **output_export_type = OutputExportType.ZIP:**
856
+ - Exports data as a ZIP file containing the dataset
857
+ - Returns the downloaded ZIP item directly
858
+ - No additional processing or concatenation
859
+
860
+ - **output_export_type = OutputExportType.JSON:**
861
+ - **NOT SUPPORTED** - Raises NotImplementedError
862
+ - Use export_type=ExportType.JSON instead for JSON output
863
+
864
+ - **output_export_type = OutputExportType.FOLDERS:**
865
+ - **NOT SUPPORTED** - Raises NotImplementedError
866
+ - Use export_type=ExportType.JSON instead for folder output
867
+
868
+ **When output_export_type = None (legacy behavior):**
869
+ - Defaults to OutputExportType.JSON
870
+ - Maintains backward compatibility with existing code
871
+
872
+ :param dtlpy.entities.dataset.Dataset dataset: Dataset object
873
+ :param str dataset_name: The name of the dataset
874
+ :param str dataset_id: The ID of the dataset
875
+ :param str local_path: Local path to save the exported dataset
876
+ :param Union[dict, dtlpy.entities.filters.Filters] filters: Filters entity or a query dictionary
877
+ :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
878
+ :param dtlpy.entities.filters.Filters feature_vector_filters: Filters entity to filter feature vectors for export
879
+ :param bool include_feature_vectors: Include item feature vectors in the export
880
+ :param bool include_annotations: Include item annotations in the export
881
+ :param bool dataset_lock: Make dataset readonly during the export
882
+ :param bool export_summary: Get Summary of the dataset export
883
+ :param int lock_timeout_sec: Timeout for locking the dataset during export in seconds
884
+ :param entities.ExportType export_type: Type of export ('json' or 'zip')
885
+ :param entities.OutputExportType output_export_type: Output format ('json', 'zip', or 'folders'). If None, defaults to 'json'
886
+ :param int timeout: Maximum time in seconds to wait for the export to complete
887
+ :return: Path to exported file/directory, or None if export result is empty
888
+ :rtype: Optional[str]
889
+ """
890
+ export_result = list(
891
+ self._export_recursive(
892
+ dataset=dataset,
893
+ dataset_name=dataset_name,
894
+ dataset_id=dataset_id,
895
+ local_path=local_path,
896
+ filters=filters,
897
+ annotation_filters=annotation_filters,
898
+ feature_vector_filters=feature_vector_filters,
899
+ include_feature_vectors=include_feature_vectors,
900
+ include_annotations=include_annotations,
901
+ timeout=timeout,
902
+ dataset_lock=dataset_lock,
903
+ lock_timeout_sec=lock_timeout_sec,
904
+ export_summary=export_summary,
905
+ export_type=export_type,
906
+ output_export_type=output_export_type,
907
+ )
908
+ )
909
+ if all(x is None for x in export_result):
910
+ logger.error("export result is empty")
911
+ return None
912
+
913
+ if export_type == ExportType.ZIP:
914
+ # if export type is zip, then return the _export_recursive result as is
915
+ return export_result[0]
916
+
917
+ # if user didn't provide output_export_type, keep the previous behavior
918
+ if output_export_type is None:
919
+ output_export_type = OutputExportType.JSON
920
+
921
+ # export type is json:
922
+ # Load all items from subset JSON files and clean them up
923
+ all_items = []
924
+ logger.debug("start loading all items from subset JSON files")
925
+ for json_file in export_result:
926
+ if json_file is None:
927
+ continue
928
+ if os.path.isfile(json_file):
929
+ with open(json_file, 'r') as f:
930
+ items = json.load(f)
931
+ if isinstance(items, list):
932
+ all_items.extend(items)
933
+ os.remove(json_file)
934
+
935
+ base_dir = os.path.dirname(export_result[0])
936
+ if output_export_type != OutputExportType.FOLDERS:
937
+ dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
938
+ result_file_name = f"{dataset_id}.json"
939
+ result_file = os.path.join(base_dir, result_file_name)
940
+ logger.debug(f"start writing all items to result file {result_file}")
941
+ with open(result_file, 'w') as f:
942
+ json.dump(all_items, f)
943
+ if output_export_type == OutputExportType.ZIP:
944
+ # Zip the result file
945
+ zip_filename = result_file + '.zip'
946
+ # Create zip file
947
+ logger.debug(f"start zipping result file {zip_filename}")
948
+ with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
949
+ zf.write(result_file, arcname=os.path.basename(result_file))
950
+
951
+ # Remove original json after zipping
952
+ os.remove(result_file)
953
+ result_file = zip_filename
954
+ return result_file
955
+ logger.debug("start building per-item JSON files under local_path mirroring remote structure")
956
+ # Build per-item JSON files under local_path mirroring remote structure
957
+ for item in all_items:
958
+ self._save_item_json_file(item_data=item, base_path=Path(base_dir), export_version=None)
959
+ logger.debug("end building per-item JSON files under local_path mirroring remote structure")
960
+ return base_dir
961
+
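A sketch covering two of the supported export_type/output_export_type combinations described in the docstring above (illustrative; names and paths are hypothetical, and the enums are assumed to be importable from dtlpy.entities.dataset as in this module's imports):

.. code-block:: python

    import dtlpy as dl
    from dtlpy.entities.dataset import ExportType, OutputExportType

    project = dl.projects.get(project_name='my-project')
    dataset = project.datasets.get(dataset_name='my-dataset')

    # single concatenated <dataset_id>.json file
    json_path = project.datasets.export(dataset=dataset,
                                        include_annotations=True,
                                        local_path='/tmp/export',
                                        export_type=ExportType.JSON,
                                        output_export_type=OutputExportType.JSON)

    # one JSON per item, mirroring the remote folder structure
    folders_path = project.datasets.export(dataset=dataset,
                                           include_annotations=True,
                                           local_path='/tmp/export_folders',
                                           export_type=ExportType.JSON,
                                           output_export_type=OutputExportType.FOLDERS)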
962
+ @_api_reference.add(path='/datasets/merge', method='post')
963
+ def merge(self,
964
+ merge_name: str,
965
+ dataset_ids: list,
966
+ project_ids: str,
967
+ with_items_annotations: bool = True,
968
+ with_metadata: bool = True,
969
+ with_task_annotations_status: bool = True,
970
+ wait: bool = True):
971
+ """
972
+ Merge a dataset. See our `SDK docs <https://developers.dataloop.ai/tutorials/data_management/data_versioning/chapter/>`_ for more information.
973
+
974
+ **Prerequisites**: You must be an *owner* or *developer* to use this method.
975
+
976
+ :param str merge_name: new dataset name
977
+ :param list dataset_ids: list of ids of the datasets you wish to merge
978
+ :param str project_ids: the id of the project that contains the datasets
979
+ :param bool with_items_annotations: true to merge with items annotations
980
+ :param bool with_metadata: true to merge with metadata
981
+ :param bool with_task_annotations_status: true to merge with task annotations' status
982
+ :param bool wait: wait for the command to finish
983
+ :return: True if success
984
+ :rtype: bool
985
+
986
+ **Example**:
987
+
988
+ .. code-block:: python
989
+
990
+ success = project.datasets.merge(dataset_ids=['dataset_id1','dataset_id2'],
991
+ merge_name='dataset_merge_name',
992
+ with_metadata=True,
993
+ with_items_annotations=False,
994
+ with_task_annotations_status=False)
995
+ """
996
+ payload = {
+ "name": merge_name,
+ "datasetsIds": dataset_ids,
+ "projectIds": project_ids,
+ "mergeDatasetParams": {
+ "withItemsAnnotations": with_items_annotations,
+ "withMetadata": with_metadata,
+ "withTaskAnnotationsStatus": with_task_annotations_status
+ },
+ 'asynced': wait
+ }
+ success, response = self._client_api.gen_request(req_type='post',
+ path='/datasets/merge',
+ json_req=payload)
+
+ if success:
+ command = entities.Command.from_json(_json=response.json(),
+ client_api=self._client_api)
+ if not wait:
+ return command
+ command = command.wait(timeout=0)
+ if 'mergeDatasetsConfiguration' not in command.spec:
+ raise exceptions.PlatformException(error='400',
+ message="mergeDatasetsConfiguration key is missing in command response: {}"
+ .format(response))
+ return True
+ else:
+ raise exceptions.PlatformException(response)
+
+ @_api_reference.add(path='/datasets/{id}/sync', method='post')
+ def sync(self, dataset_id: str, wait: bool = True):
+ """
+ Sync dataset with external storage.
+
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
+
+ :param str dataset_id: The Id of the dataset to sync
+ :param bool wait: wait for the command to finish
+ :return: True if success
+ :rtype: bool
+
+ **Example**:
+
+ .. code-block:: python
+
+ success = project.datasets.sync(dataset_id='dataset_id')
+ """
+
+ success, response = self._client_api.gen_request(req_type='post',
+ path='/datasets/{}/sync'.format(dataset_id))
+
+ if success:
+ command = entities.Command.from_json(_json=response.json(),
+ client_api=self._client_api)
+ if not wait:
+ return command
+ command = command.wait(timeout=0)
+ if 'datasetId' not in command.spec:
+ raise exceptions.PlatformException(error='400',
+ message="datasetId key is missing in command response: {}"
+ .format(response))
+ return True
+ else:
+ raise exceptions.PlatformException(response)
+
+ @_api_reference.add(path='/datasets', method='post')
+ def create(self,
+ dataset_name: str,
+ labels=None,
+ attributes=None,
+ ontology_ids=None,
+ driver: entities.Driver = None,
+ driver_id: str = None,
+ checkout: bool = False,
+ expiration_options: entities.ExpirationOptions = None,
+ index_driver: entities.IndexDriver = None,
+ recipe_id: str = None
+ ) -> entities.Dataset:
+ """
+ Create a new dataset.
+
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
+
+ :param str dataset_name: The name of the dataset
+ :param list labels: a dict of {tag: color} or a list of label entities
+ :param list attributes: dataset's ontology's attributes
+ :param list ontology_ids: optional - dataset ontology
+ :param dtlpy.entities.driver.Driver driver: optional - storage driver Driver object or driver name
+ :param str driver_id: optional - driver id
+ :param bool checkout: set the dataset as a default dataset object (cookies)
+ :param ExpirationOptions expiration_options: dl.ExpirationOptions object that contains definitions for the dataset, such as MaxItemDays
+ :param str index_driver: dl.IndexDriver, dataset driver version
+ :param str recipe_id: optional - recipe id
+ :return: Dataset object
+ :rtype: dtlpy.entities.dataset.Dataset
+
+ **Example**:
+
+ .. code-block:: python
+
+ dataset = project.datasets.create(dataset_name='dataset_name', ontology_ids='ontology_ids')
+ """
+ create_default_recipe = True
+ if any([labels, attributes, ontology_ids, recipe_id]):
+ create_default_recipe = False
+
+ # labels to list
+ if labels is not None:
+ if not isinstance(labels, list):
+ labels = [labels]
+ if not all(isinstance(label, entities.Label) for label in labels):
+ labels = entities.Dataset.serialize_labels(labels)
+ else:
+ labels = list()
+
+ # get creator from token
+ payload = {'name': dataset_name,
+ 'projects': [self.project.id],
+ 'createDefaultRecipe': create_default_recipe
+ }
+
+ if driver_id is None and driver is not None:
+ if isinstance(driver, entities.Driver):
+ driver_id = driver.id
+ elif isinstance(driver, str):
+ driver_id = self.project.drivers.get(driver_name=driver).id
+ else:
+ raise exceptions.PlatformException(
+ error=400,
+ message='Input arg "driver" must be Driver object or a string driver name. got type: {!r}'.format(
+ type(driver)))
+ if driver_id is not None:
+ payload['driver'] = driver_id
+
+ if expiration_options:
+ payload['expirationOptions'] = expiration_options.to_json()
+ if index_driver is not None:
+ payload['indexDriver'] = index_driver
+
+ success, response = self._client_api.gen_request(req_type='post',
+ path='/datasets',
+ json_req=payload)
+ if success:
+ dataset = entities.Dataset.from_json(client_api=self._client_api,
+ _json=response.json(),
+ datasets=self,
+ project=self.project)
+ # create ontology and recipe
+ if not create_default_recipe:
+ if recipe_id is not None:
+ dataset.switch_recipe(recipe_id=recipe_id)
+ else:
+ dataset = dataset.recipes.create(ontology_ids=ontology_ids,
+ labels=labels,
+ attributes=attributes).dataset
+ else:
+ raise exceptions.PlatformException(response)
+ logger.info('Dataset was created successfully. Dataset id: {!r}'.format(dataset.id))
+ assert isinstance(dataset, entities.Dataset)
+ if checkout:
+ self.checkout(dataset=dataset)
+ return dataset
+
+ @staticmethod
+ def _convert_single(downloader,
+ item,
+ img_filepath,
+ local_path,
+ overwrite,
+ annotation_options,
+ annotation_filters,
+ thickness,
+ with_text,
+ progress,
+ alpha,
+ export_version):
+ # this is to convert the downloaded json files to any other annotation type
+ try:
+ if entities.ViewAnnotationOptions.ANNOTATION_ON_IMAGE in annotation_options:
+ if img_filepath is None:
+ img_filepath = item.download()
+ downloader._download_img_annotations(item=item,
+ img_filepath=img_filepath,
+ local_path=local_path,
+ overwrite=overwrite,
+ annotation_options=annotation_options,
+ annotation_filters=annotation_filters,
+ thickness=thickness,
+ alpha=alpha,
+ with_text=with_text,
+ export_version=export_version
+ )
+ except Exception:
+ logger.error('Failed to download annotation for item: {!r}'.format(item.name))
+ progress.update()
+
+ @staticmethod
+ def download_annotations(dataset: entities.Dataset,
+ local_path: str = None,
+ filters: entities.Filters = None,
+ annotation_options: entities.ViewAnnotationOptions = None,
+ annotation_filters: entities.Filters = None,
+ overwrite: bool = False,
+ thickness: int = 1,
+ with_text: bool = False,
+ remote_path: str = None,
+ include_annotations_in_output: bool = True,
+ export_png_files: bool = False,
+ filter_output_annotations: bool = False,
+ alpha: float = 1,
+ export_version=entities.ExportVersion.V1,
+ dataset_lock: bool = False,
+ lock_timeout_sec: int = None,
+ export_summary: bool = False,
+ ) -> str:
+ """
1212
+ Download dataset's annotations by filters.
1213
+
1214
+ You may filter the dataset both for items and for annotations and download annotations.
1215
+
1216
+ Optional -- download annotations as: mask, instance, image mask of the item.
1217
+
1218
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
1219
+
1220
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
1221
+ :param str local_path: local folder or filename to save to.
1222
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
1223
+ :param list annotation_options: type of download annotations: list(dl.ViewAnnotationOptions)
1224
+ :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for download
1225
+ :param bool overwrite: optional - default = False to overwrite the existing files
1226
+ :param bool dataset_loc: optional - default = False to make the dataset readonly
1227
+ :param int thickness: optional - line thickness, if -1 annotation will be filled, default =1
1228
+ :param bool with_text: optional - add text to annotations, default = False
1229
+ :param str remote_path: DEPRECATED and ignored
1230
+ :param bool include_annotations_in_output: default - False , if export should contain annotations
1231
+ :param bool export_png_files: default - if True, semantic annotations should be exported as png files
1232
+ :param bool filter_output_annotations: default - False, given an export by filter - determine if to filter out annotations
1233
+ :param float alpha: opacity value [0 1], default 1
1234
+ :param str export_version: exported items will have original extension in filename, `V1` - no original extension in filenames
1235
+ :return: local_path of the directory where all the downloaded item
1236
+ :param bool dataset_lock: optional - default = False
1237
+ :param bool export_summary: optional - default = False
1238
+ :param int lock_timeout_sec: optional
1239
+ :rtype: str
1240
+
1241
+ **Example**:
1242
+
1243
+ .. code-block:: python
1244
+
1245
+ file_path = project.datasets.download_annotations(dataset='dataset_entity',
1246
+ local_path='local_path',
1247
+ annotation_options=dl.ViewAnnotationOptions,
1248
+ overwrite=False,
1249
+ thickness=1,
1250
+ with_text=False,
1251
+ alpha=1,
1252
+ dataset_lock=False,
1253
+ lock_timeout_sec=300,
1254
+ export_summary=False
1255
+ )
1256
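+
+ A sketch of combining item filters with annotation filters (the folder path and label value below are placeholders):
+
+ .. code-block:: python
+
+ item_filters = dl.Filters(field='dir', values='/folder')
+ annotation_filters = dl.Filters(resource=dl.FiltersResource.ANNOTATION,
+ field='label',
+ values='my_label')
+ file_path = project.datasets.download_annotations(dataset=dataset,
+ filters=item_filters,
+ annotation_filters=annotation_filters,
+ annotation_options=[dl.ViewAnnotationOptions.JSON])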
+ """
+ if annotation_options is None:
+ annotation_options = list()
+ elif not isinstance(annotation_options, list):
+ annotation_options = [annotation_options]
+ for ann_option in annotation_options:
+ if ann_option not in entities.ViewAnnotationOptions:
+ raise PlatformException(
+ error='400',
+ message=f'Unknown annotation download option: {ann_option}, please choose from: {list(entities.ViewAnnotationOptions)}',
+ )
+ if remote_path is not None:
+ logger.warning(f'"remote_path" is ignored. Use filters=dl.Filters(field="dir", values={remote_path!r})')
+ if filter_output_annotations is True:
+ logger.warning("'filter_output_annotations' is ignored but kept for legacy support")
+ if include_annotations_in_output is False:
+ logger.warning("include_annotations_in_output was False, but was set to True since this function downloads annotations.")
+ include_annotations_in_output = True
+
+ if local_path is None:
+ if dataset.project is None:
+ # by dataset name
+ local_path = str(Path(service_defaults.DATALOOP_PATH) / "datasets" / f"{dataset.name}_{dataset.id}")
+ else:
+ # by dataset and project name
+ local_path = str(Path(service_defaults.DATALOOP_PATH) / "projects" / dataset.project.name / "datasets" / dataset.name)
+
+ if filters is None:
+ filters = entities.Filters()
+ filters._user_query = 'false'
+ if annotation_filters is not None:
+ for annotation_filter_and in annotation_filters.and_filter_list:
+ filters.add_join(field=annotation_filter_and.field,
+ values=annotation_filter_and.values,
+ operator=annotation_filter_and.operator,
+ method=entities.FiltersMethod.AND)
+ for annotation_filter_or in annotation_filters.or_filter_list:
+ filters.add_join(field=annotation_filter_or.field,
+ values=annotation_filter_or.values,
+ operator=annotation_filter_or.operator,
+ method=entities.FiltersMethod.OR)
+
+ downloader = repositories.Downloader(items_repository=dataset.items)
+
+ # Setup for incremental processing
+ if len(annotation_options) == 0:
+ pool = None
+ progress = None
+ jobs = []
+ else:
+ # Get total count for progress bar
+ filter_copy = copy.deepcopy(filters)
+ filter_copy.page_size = 0
+ pages = dataset.items.list(filters=filter_copy)
+ total_items = pages.items_count
+
+ # Setup thread pool and progress bar
+ pool = dataset._client_api.thread_pools(pool_name='dataset.download')
+ progress = tqdm.tqdm(
+ total=total_items,
+ disable=dataset._client_api.verbose.disable_progress_bar_download_annotations,
+ file=sys.stdout,
+ desc='Download Annotations'
+ )
+ jobs = []
+
+ # Call _export_recursive as generator
+ export_generator = dataset.project.datasets._export_recursive(
+ dataset=dataset,
+ local_path=tempfile.mkdtemp(prefix='annotations_jsons_'),
+ filters=filters,
+ annotation_filters=annotation_filters,
+ include_annotations=True,
+ export_type=ExportType.JSON,
+ dataset_lock=dataset_lock,
+ lock_timeout_sec=lock_timeout_sec,
+ export_summary=export_summary,
+ timeout=0,
+ max_items_per_subset=DOWNLOAD_ANNOTATIONS_MAX_ITEMS_PER_SUBSET
+ )
+
+ # Process each subset JSON file incrementally
+ for subset_json_file in export_generator:
+ if subset_json_file is None or not Path(subset_json_file).is_file():
+ continue
+
+ try:
+ # Open and load the items array
+ with open(subset_json_file, 'r') as f:
+ items_data = json.load(f)
+
+ # Process each item immediately
+ for item_data in items_data:
+ # Split and save individual JSON file
+ Datasets._save_item_json_file(item_data=item_data, base_path=Path(local_path) / 'json', export_version=export_version)
+
+ # If annotation_options are provided, submit to thread pool immediately
+ if annotation_options:
+ # Create Item entity from item_data
+ item = entities.Item.from_json(
+ _json=item_data,
+ client_api=dataset._client_api,
+ dataset=dataset
+ )
+
+ job = pool.submit(
+ Datasets._convert_single,
+ **{
+ 'downloader': downloader,
+ 'item': item,
+ 'img_filepath': None,
+ 'local_path': local_path,
+ 'overwrite': overwrite,
+ 'annotation_options': annotation_options,
+ 'annotation_filters': annotation_filters,
+ 'thickness': thickness,
+ 'with_text': with_text,
+ 'progress': progress,
+ 'alpha': alpha,
+ 'export_version': export_version
+ }
+ )
+ jobs.append(job)
+
+ # Clean up temporary subset JSON file
+ os.remove(subset_json_file)
+ except Exception as e:
+ logger.exception(f'Failed processing subset JSON file {subset_json_file}: {e}')
+
+ # Wait for all thread pool jobs to complete
+ if annotation_options:
+ _ = [j.result() for j in jobs]
+ progress.close()
+
+ return local_path
+
+ def _upload_single_item_annotation(self, item, file, pbar):
+ try:
+ item.annotations.upload(file)
+ except Exception as err:
+ raise err
+ finally:
+ pbar.update()
+
+ def upload_annotations(self,
+ dataset,
+ local_path,
+ filters: entities.Filters = None,
+ clean=False,
+ remote_root_path='/',
+ export_version=entities.ExportVersion.V1
+ ):
+ """
+ Upload annotations to dataset.
+
+ Example for remote_root_path: if the item filepath is "/a/b/item" and remote_root_path is "/a", the start folder will be "b" instead of "a".
+
+ **Prerequisites**: You must have a dataset whose items match the annotation files; items and annotation files are matched by name. You must be in the role of an *owner* or *developer*.
+
+ :param dtlpy.entities.dataset.Dataset dataset: dataset to upload to
+ :param str local_path: local folder where the annotation files are
+ :param dtlpy.entities.filters.Filters filters: Filters entity or a dictionary containing filters parameters
+ :param bool clean: True to remove the old annotations
+ :param str remote_root_path: the remote root path to match remote and local items
+ :param str export_version: exported items will have original extension in filename, `V1` - no original extension in filenames
+
+ **Example**:
+
+ .. code-block:: python
+
+ project.datasets.upload_annotations(dataset='dataset_entity',
+ local_path='local_path',
+ clean=False,
+ export_version=dl.ExportVersion.V1
+ )
+ """
+ if filters is None:
+ filters = entities.Filters()
+ filters._user_query = 'false'
+ pages = dataset.items.list(filters=filters)
+ total_items = pages.items_count
+ pbar = tqdm.tqdm(total=total_items, disable=dataset._client_api.verbose.disable_progress_bar_upload_annotations,
+ file=sys.stdout, desc='Upload Annotations')
+ pool = self._client_api.thread_pools('annotation.upload')
+ annotations_uploaded_count = 0
+ for item in pages.all():
+ if export_version == entities.ExportVersion.V1:
+ _, ext = os.path.splitext(item.filename)
+ filepath = item.filename.replace(ext, '.json')
+ else:
+ filepath = item.filename + '.json'
+ # make the file path ignore the hierarchy of the files that are in remote_root_path
+ filepath = os.path.relpath(filepath, remote_root_path)
+ json_file = os.path.join(local_path, filepath)
+ if not os.path.isfile(json_file):
+ pbar.update()
+ continue
+ annotations_uploaded_count += 1
+ if item.annotated and clean:
+ item.annotations.delete(filters=entities.Filters(resource=entities.FiltersResource.ANNOTATION))
+ pool.submit(self._upload_single_item_annotation, **{'item': item,
+ 'file': json_file,
+ 'pbar': pbar})
+ pool.shutdown()
+ if annotations_uploaded_count == 0:
+ logger.warning(msg="No annotations uploaded to dataset!")
+ else:
+ logger.info(msg='Found and uploaded {} annotations.'.format(annotations_uploaded_count))
+
+ def set_readonly(self, state: bool, dataset: entities.Dataset):
+ """
+ Set dataset readonly mode.
+
+ **Prerequisites**: You must be in the role of an *owner* or *developer*.
+
+ :param bool state: state to update readonly mode
+ :param dtlpy.entities.dataset.Dataset dataset: dataset object
+
+ **Example**:
+
+ .. code-block:: python
+
+ project.datasets.set_readonly(dataset='dataset_entity', state=True)
+ """
+ import warnings
+ warnings.warn("`readonly` flag on dataset is deprecated, doing nothing.", DeprecationWarning)
+
+ @_api_reference.add(path='/datasets/{id}/split', method='post')
+ def split_ml_subsets(self,
+ dataset_id: str,
+ items_query: entities.Filters,
+ ml_split_list: dict) -> bool:
+ """
+ Split dataset items into ML subsets.
+
+ :param str dataset_id: The ID of the dataset.
+ :param dtlpy.entities.filters.Filters items_query: Filters entity selecting the items to split.
+ :param dict ml_split_list: Dictionary with 'train', 'validation', 'test' keys and integer percentages.
+ :return: True if the split operation was successful.
+ :rtype: bool
+ :raises: PlatformException on API failure; ValueError if the percentages do not sum to 100 or the keys/values are invalid.
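+
+ **Example** (a minimal sketch; the dataset id and the 70/15/15 split below are placeholders):
+
+ .. code-block:: python
+
+ items_query = dl.Filters()  # select all items in the dataset
+ success = project.datasets.split_ml_subsets(dataset_id='dataset_id',
+ items_query=items_query,
+ ml_split_list={'train': 70, 'validation': 15, 'test': 15})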
+ """
1499
+ # Validate percentages
1500
+ if not ml_split_list:
1501
+ ml_split_list = {'train': 80, 'validation': 10, 'test': 10}
1502
+
1503
+ if not items_query:
1504
+ items_query = entities.Filters()
1505
+
1506
+ items_query_dict = items_query.prepare()
1507
+ required_keys = {'train', 'validation', 'test'}
1508
+ if set(ml_split_list.keys()) != required_keys:
1509
+ raise ValueError("MLSplitList must have exactly the keys 'train', 'validation', 'test'.")
1510
+ total = sum(ml_split_list.values())
1511
+ if total != 100:
1512
+ raise ValueError(
1513
+ "Please set the Train, Validation, and Test subsets percentages to add up to 100%. "
1514
+ "For example: 70, 15, 15."
1515
+ )
1516
+ for key, value in ml_split_list.items():
1517
+ if not isinstance(value, int) or value < 0:
1518
+ raise ValueError("Percentages must be integers >= 0.")
1519
+ payload = {
1520
+ 'itemsQuery': items_query_dict,
1521
+ 'MLSplitList': ml_split_list
1522
+ }
1523
+ path = f'/datasets/{dataset_id}/split'
1524
+ success, response = self._client_api.gen_request(req_type='post',
1525
+ path=path,
1526
+ json_req=payload)
1527
+ if success:
1528
+ # Wait for the split operation to complete
1529
+ command = entities.Command.from_json(_json=response.json(),
1530
+ client_api=self._client_api)
1531
+ command.wait()
1532
+ return True
1533
+ else:
1534
+ raise exceptions.PlatformException(response)
1535
+
1536
+ @_api_reference.add(path='/datasets/{id}/items/bulk-update-metadata', method='post')
+ def bulk_update_ml_subset(self, dataset_id: str, items_query: entities.Filters, subset: str = None, deleteTag: bool = False) -> bool:
+ """
+ Bulk update ML subset assignment for selected items.
+ If deleteTag is True, existing ML subset tags are removed from the selected items; otherwise, the specified subset is assigned.
+
+ :param str dataset_id: ID of the dataset
+ :param dtlpy.entities.filters.Filters items_query: Filters entity (DQL resource query) selecting the items to update
+ :param str subset: 'train', 'validation' or 'test'; ignored when deleteTag is True
+ :param bool deleteTag: True to remove the existing ML subset tags from the selected items
+ :return: True if success
+ :rtype: bool
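+
+ **Example** (a minimal sketch; the dataset id and filters are placeholders):
+
+ .. code-block:: python
+
+ # assign the 'validation' subset to every item matching the filter
+ success = project.datasets.bulk_update_ml_subset(dataset_id='dataset_id',
+ items_query=dl.Filters(),
+ subset='validation')
+
+ # remove the ML subset tags from the selected items
+ success = project.datasets.bulk_update_ml_subset(dataset_id='dataset_id',
+ items_query=dl.Filters(),
+ deleteTag=True)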
+ """
+ if items_query is None:
+ items_query = entities.Filters()
+ items_query_dict = items_query.prepare()
+ if not deleteTag and subset not in ['train', 'validation', 'test']:
+ raise ValueError("subset must be one of: 'train', 'validation', 'test'")
+ # Determine tag values based on subset
+ tags = {
+ 'train': True if subset == 'train' else None,
+ 'validation': True if subset == 'validation' else None,
+ 'test': True if subset == 'test' else None
+ }
+
+ payload = {
+ "query": items_query_dict,
+ "updateQuery": {
+ "update": {
+ "metadata": {
+ "system": {
+ "tags": tags
+ }
+ }
+ },
+ "systemSpace": True
+ }
+ }
+
+ success, response = self._client_api.gen_request(
+ req_type='post',
+ path=f'/datasets/{dataset_id}/items/bulk-update-metadata',
+ json_req=payload
+ )
+ if success:
+ # Similar to split operation, a command is returned
+ command = entities.Command.from_json(_json=response.json(), client_api=self._client_api)
+ command.wait()
+ return True
+ else:
+ raise exceptions.PlatformException(response)