dtlpy 1.114.17__py3-none-any.whl → 1.115.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtlpy/__init__.py +1 -1
- dtlpy/__version__.py +1 -1
- dtlpy/entities/__init__.py +1 -1
- dtlpy/entities/analytic.py +42 -6
- dtlpy/entities/codebase.py +1 -5
- dtlpy/entities/compute.py +12 -5
- dtlpy/entities/dataset.py +19 -5
- dtlpy/entities/driver.py +14 -2
- dtlpy/entities/filters.py +156 -3
- dtlpy/entities/item.py +9 -3
- dtlpy/entities/prompt_item.py +7 -1
- dtlpy/entities/service.py +5 -0
- dtlpy/ml/base_model_adapter.py +407 -263
- dtlpy/repositories/commands.py +1 -7
- dtlpy/repositories/computes.py +17 -13
- dtlpy/repositories/datasets.py +287 -74
- dtlpy/repositories/downloader.py +23 -3
- dtlpy/repositories/drivers.py +12 -0
- dtlpy/repositories/executions.py +1 -3
- dtlpy/repositories/features.py +31 -14
- dtlpy/repositories/items.py +5 -2
- dtlpy/repositories/models.py +16 -4
- dtlpy/repositories/uploader.py +22 -12
- dtlpy/services/api_client.py +6 -3
- dtlpy/services/reporter.py +1 -1
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/METADATA +15 -12
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/RECORD +34 -34
- {dtlpy-1.114.17.data → dtlpy-1.115.44.data}/scripts/dlp +0 -0
- {dtlpy-1.114.17.data → dtlpy-1.115.44.data}/scripts/dlp.bat +0 -0
- {dtlpy-1.114.17.data → dtlpy-1.115.44.data}/scripts/dlp.py +0 -0
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/WHEEL +0 -0
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/entry_points.txt +0 -0
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/licenses/LICENSE +0 -0
- {dtlpy-1.114.17.dist-info → dtlpy-1.115.44.dist-info}/top_level.txt +0 -0
dtlpy/repositories/commands.py
CHANGED
@@ -113,13 +113,7 @@ class Commands:
             elapsed = time.time() - start
             sleep_time = np.min([timeout - elapsed, backoff_factor * (2 ** num_tries), MAX_SLEEP_TIME])
             num_tries += 1
-            logger.debug(
-                "Command {!r} is running for {:.2f}[s] and now Going to sleep {:.2f}[s]".format(
-                    command.id,
-                    elapsed,
-                    sleep_time
-                )
-            )
+            logger.debug(f"Command {command.id} is running for {elapsed:.2f}[s]. Sleeping for {sleep_time:.2f}[s]")
             if iteration_callback is not None:
                 try:
                     iteration_callback()
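Note: the change above only replaces a multi-line str.format call with a single f-string; the emitted log line is equivalent apart from wording. A standalone sketch with placeholder values (command_id, elapsed and sleep_time stand in for the SDK's command.id and timing variables):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('dtlpy')

    command_id = '64b0c0ffee'          # placeholder command id
    elapsed, sleep_time = 12.34, 4.0   # placeholder timings

    # Old style (removed): multi-line str.format call.
    logger.debug(
        "Command {!r} is running for {:.2f}[s] and now Going to sleep {:.2f}[s]".format(
            command_id, elapsed, sleep_time))

    # New style (added): single f-string.
    logger.debug(f"Command {command_id} is running for {elapsed:.2f}[s]. Sleeping for {sleep_time:.2f}[s]")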
dtlpy/repositories/computes.py
CHANGED
@@ -55,7 +55,8 @@ class Computes:
             wait=True,
             status: entities.ComputeStatus = None,
             settings: entities.ComputeSettings = None,
-            metadata: dict = None
+            metadata: dict = None,
+            deployment_configuration: dict = None
     ):
         """
         Create a new compute
@@ -71,6 +72,7 @@ class Computes:
         :param status: Compute status
         :param settings: Compute settings
         :param metadata: Compute metadata
+        :param deployment_configuration: Compute deployment Configuration
         :return: Compute
         :rtype: dl.entities.compute.Compute
         """
@@ -78,7 +80,8 @@ class Computes:
             metadata = {}
         shared_contexts_json = []
         for shared_context in shared_contexts:
-            src_json = shared_context.to_json() if isinstance(shared_context,
+            src_json = shared_context.to_json() if isinstance(shared_context,
+                                                              entities.ComputeContext) else shared_context
             shared_contexts_json.append(src_json)
         payload = {
             'name': name,
@@ -90,7 +93,8 @@ class Computes:
             'cluster': cluster.to_json(),
             'status': status,
             "settings": settings.to_json() if isinstance(settings, entities.ComputeSettings) else settings,
-            "metadata": metadata
+            "metadata": metadata,
+            "deploymentConfiguration": deployment_configuration
         }

         # request
@@ -171,9 +175,10 @@ class Computes:
             if compute_id not in self.log_cache:
                 self.log_cache[compute_id] = {}
             self.log_cache[compute_id]['validation'] = validation_logs
+
         return func

-    def get(self, compute_id: str, archived
+    def get(self, compute_id: str, archived=False):
         """
         Get a compute

@@ -183,7 +188,7 @@ class Computes:
         :rtype: dl.entities.compute.Compute
         """
         url_path = self._base_url + '/{}'.format(compute_id)
-        params_to_add = {"archived": "true" if archived else "false"
+        params_to_add = {"archived": "true" if archived else "false"}
         parsed_url = urlparse(url_path)
         query_dict = parse_qs(parsed_url.query)
         query_dict.update(params_to_add)
@@ -234,7 +239,7 @@ class Computes:
         :param bool wait: Wait for deletion
         """
         url_path = self._base_url + '/{}'.format(compute_id)
-        params_to_add = {"skipDestroy": "true" if skip_destroy else "false"
+        params_to_add = {"skipDestroy": "true" if skip_destroy else "false"}
         parsed_url = urlparse(url_path)
         query_dict = parse_qs(parsed_url.query)
         query_dict.update(params_to_add)
@@ -315,7 +320,6 @@ class Computes:
         if not success:
             raise exceptions.PlatformException(response)

-
         return response.json()

     @staticmethod
@@ -346,7 +350,7 @@ class Computes:
             }
         )

-    def setup_compute_cluster(self, config, integration, org_id, project=None):
+    def setup_compute_cluster(self, config, integration, org_id, project=None, is_global=False):
         """Set up a compute cluster using the provided configuration and integration."""
         cluster = ComputeCluster.from_setup_json(config, integration)
         project_id = None
@@ -360,11 +364,12 @@ class Computes:
             ComputeType.KUBERNETES,
             status=config['config'].get('status', None),
             settings=config['config'].get('settings', None),
-            [1 line removed; content not rendered in this diff view]
+            deployment_configuration=config['config'].get('deploymentConfiguration', {}),
+            metadata=config['config'].get('metadata', None), is_global=is_global)

         return compute

-    def create_from_config_file(self, config_file_path, org_id, project_name: Optional[str] = None):
+    def create_from_config_file(self, config_file_path, org_id, project_name: Optional[str] = None, is_global=False):
         config = self.decode_and_parse_input(config_file_path)
         project = None
         if project_name is not None:
@@ -373,10 +378,9 @@ class Computes:
         integration_name = ('cluster_integration_test_' + datetime.datetime.now().isoformat().split('.')[0]
                             .replace(':', '_'))
         integration = self.create_integration(org, integration_name, config['authentication'])
-        compute = self.setup_compute_cluster(config, integration, org_id, project)
+        compute = self.setup_compute_cluster(config, integration, org_id, project, is_global=is_global)
         return compute

-
     def _list(self, filters: entities.Filters):
         url = self._base_url + '/query'
         success, response = self._client_api.gen_request(req_type='POST',
@@ -432,4 +436,4 @@ class Computes:
             page_size=filters.page_size,
             client_api=self._client_api)
         paged.get_page()
-        return paged
+        return paged
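Usage sketch for the additions above (deployment_configuration on create, is_global on setup_compute_cluster/create_from_config_file, and the repaired archived default on get). The access path dl.computes, the config file name and the ids are assumptions for illustration only; they are not shown in this diff.

    import dtlpy as dl

    # Assumption: the Computes repository is reachable as dl.computes.
    compute = dl.computes.create_from_config_file(
        config_file_path='cluster_config.json',  # assumed local config file
        org_id='<org-id>',                       # placeholder
        project_name='my-project',               # optional
        is_global=False,                         # new optional flag in 1.115.x
    )

    # get() now has a proper default for `archived`
    compute = dl.computes.get(compute_id=compute.id, archived=False)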
dtlpy/repositories/datasets.py
CHANGED
@@ -8,14 +8,17 @@ import time
 import copy
 import tqdm
 import logging
+import zipfile
 import json
-from typing import Union
+from typing import Union, Generator, Optional

 from .. import entities, repositories, miscellaneous, exceptions, services, PlatformException, _api_reference
 from ..services.api_client import ApiClient
+from ..entities.dataset import OutputExportType, ExportType

 logger = logging.getLogger(name='dtlpy')

+MAX_ITEMS_PER_SUBSET = 50000

 class Datasets:
     """
@@ -155,8 +158,7 @@ class Datasets:
         payload['annotations'] = {"include": include_annotations, "convertSemantic": False}

         if annotation_filters is not None:
-            payload['annotationsQuery'] = annotation_filters.prepare()
-            payload['annotations']['filter'] = True
+            payload['annotationsQuery'] = annotation_filters.prepare()

         if dataset_lock:
             payload['datasetLock'] = dataset_lock
@@ -166,29 +168,37 @@ class Datasets:

         if lock_timeout_sec:
             payload['lockTimeoutSec'] = lock_timeout_sec
-
+
         return payload

-    def _download_exported_item(self, item_id, export_type, local_path=None):
+    def _download_exported_item(self, item_id, export_type, local_path=None, unzip=True):
+        logger.debug(f"start downloading exported item {item_id} with export_type {export_type} and local_path {local_path} and unzip {unzip}")
         export_item = repositories.Items(client_api=self._client_api).get(item_id=item_id)
-        export_item_path = export_item.download(local_path=local_path)
+        export_item_path = export_item.download(local_path=local_path)

-        [7 lines removed; content not rendered in this diff view]
+        # Common validation check for both JSON and other export types
+        if isinstance(export_item_path, list) or not os.path.isfile(export_item_path):
+            raise exceptions.PlatformException(
+                error='404',
+                message='error downloading annotation zip file. see above for more information. item id: {!r}'.format(
+                    export_item.id))
+
+        result = None
+        if unzip is False or export_type == entities.ExportType.JSON:
+            result = export_item_path
+        else:
             try:
                 miscellaneous.Zipping.unzip_directory(zip_filename=export_item_path,
-                [1 line removed; content not rendered in this diff view]
+                                                      to_directory=local_path)
+                result = local_path
             except Exception as e:
                 logger.warning("Failed to extract zip file error: {}".format(e))
             finally:
-                # cleanup
+                # cleanup only for zip files to avoid removing needed results
                 if isinstance(export_item_path, str) and os.path.isfile(export_item_path):
                     os.remove(export_item_path)
+        logger.debug(f"end downloading, result {result}")
+        return result

     @property
     def platform_url(self):
@@ -480,7 +490,7 @@ class Datasets:
             return dataset
         else:
             raise exceptions.PlatformException(response)
-
+
     @_api_reference.add(path='/datasets/{id}/unlock', method='patch')
     def unlock(self, dataset: entities.Dataset ) -> entities.Dataset:
         """
@@ -625,22 +635,137 @@ class Datasets:
                              .format(response))
         return self.get(dataset_id=command.spec['returnedModelId'])

+    def _export_recursive(
+            self,
+            dataset: entities.Dataset = None,
+            dataset_name: str = None,
+            dataset_id: str = None,
+            local_path: str = None,
+            filters: Union[dict, entities.Filters] = None,
+            annotation_filters: entities.Filters = None,
+            feature_vector_filters: entities.Filters = None,
+            include_feature_vectors: bool = False,
+            include_annotations: bool = False,
+            timeout: int = 0,
+            dataset_lock: bool = False,
+            lock_timeout_sec: int = None,
+            export_summary: bool = False,
+            max_items_per_subset: int = MAX_ITEMS_PER_SUBSET,
+            export_type: ExportType = ExportType.JSON,
+            output_export_type: OutputExportType = OutputExportType.JSON,
+    ) -> Generator[str, None, None]:
+        """
+        Export dataset items recursively by splitting large datasets into smaller subsets.
+
+        Args:
+            dataset (entities.Dataset, optional): Dataset entity to export
+            dataset_name (str, optional): Name of the dataset to export
+            dataset_id (str, optional): ID of the dataset to export
+            local_path (str, optional): Local path to save the exported data
+            filters (Union[dict, entities.Filters], optional): Filters to apply on the items
+            annotation_filters (entities.Filters, optional): Filters to apply on the annotations
+            feature_vector_filters (entities.Filters, optional): Filters to apply on the feature vectors
+            include_feature_vectors (bool, optional): Whether to include feature vectors in export. Defaults to False
+            include_annotations (bool, optional): Whether to include annotations in export. Defaults to False
+            timeout (int, optional): Timeout in seconds for the export operation. Defaults to 0
+            dataset_lock (bool, optional): Whether to lock the dataset during export. Defaults to False
+            lock_timeout_sec (int, optional): Timeout for dataset lock in seconds. Defaults to None
+            export_summary (bool, optional): Whether to include export summary. Defaults to False
+            max_items_per_subset (int, optional): Maximum items per subset for recursive export. Defaults to MAX_ITEMS_PER_SUBSET
+            export_type (ExportType, optional): Type of export (JSON or ZIP). Defaults to ExportType.JSON
+            output_export_type (OutputExportType, optional): Output format type. Defaults to OutputExportType.JSON
+
+        Returns:
+            Generator[str, None, None]: Generator yielding export paths
+
+        Raises:
+            NotImplementedError: If ZIP export type is used with JSON output type
+            exceptions.PlatformException: If API request fails or command response is invalid
+        """
+        logger.debug(f"exporting dataset with export_type {export_type} and output_export_type {output_export_type}")
+        if export_type == ExportType.ZIP and output_export_type == OutputExportType.JSON:
+            raise NotImplementedError(
+                "Zip export type is not supported for JSON output type.\n"
+                "If Json output is required, please use the export_type = JSON"
+            )
+
+        # Get dataset entity for recursive filtering
+        dataset_entity = self.get(dataset_id=self._resolve_dataset_id(dataset, dataset_name, dataset_id))
+        if export_type != ExportType.JSON:
+            filters_list = [filters]
+        else:
+            # Generate filter subsets using recursive_get_filters
+            filters_list = entities.Filters._get_split_filters(
+                dataset=dataset_entity, filters=filters, max_items=max_items_per_subset
+            )
+        # First loop: Make all API requests without waiting
+        commands = []
+        logger.debug("start making all API requests without waiting")
+        for filter_i in filters_list:
+            # Build payload for this subset
+            payload = self._build_payload(
+                filters=filter_i,
+                include_feature_vectors=include_feature_vectors,
+                include_annotations=include_annotations,
+                export_type=export_type,
+                annotation_filters=annotation_filters,
+                feature_vector_filters=feature_vector_filters,
+                dataset_lock=dataset_lock,
+                lock_timeout_sec=lock_timeout_sec,
+                export_summary=export_summary,
+            )
+
+            # Make API request for this subset
+            success, response = self._client_api.gen_request(
+                req_type='post', path=f'/datasets/{dataset_entity.id}/export', json_req=payload
+            )
+
+            if not success:
+                logger.error(f"failed to make API request /datasets/{dataset_entity.id}/export with payload {payload} response {response}")
+                raise exceptions.PlatformException(response)
+
+            # Handle command execution
+            commands.append(entities.Command.from_json(_json=response.json(), client_api=self._client_api))
+
+        time.sleep(2)  # as the command have wrong progress in the beginning
+        logger.debug("start waiting for all commands")
+        # Second loop: Wait for all commands and process results
+        for command in commands:
+            command = command.wait(timeout=timeout)
+
+            if 'outputItemId' not in command.spec:
+                raise exceptions.PlatformException(
+                    error='400', message="outputItemId key is missing in command response"
+                )
+
+            item_id = command.spec['outputItemId']
+            # Download and process the exported item
+            yield self._download_exported_item(
+                item_id=item_id,
+                export_type=export_type,
+                local_path=local_path,
+                unzip=output_export_type != OutputExportType.ZIP,
+            )
+
     @_api_reference.add(path='/datasets/{id}/export', method='post')
-    def export(
-    [14 lines removed; content not rendered in this diff view]
+    def export(
+            self,
+            dataset: entities.Dataset = None,
+            dataset_name: str = None,
+            dataset_id: str = None,
+            local_path: str = None,
+            filters: Union[dict, entities.Filters] = None,
+            annotation_filters: entities.Filters = None,
+            feature_vector_filters: entities.Filters = None,
+            include_feature_vectors: bool = False,
+            include_annotations: bool = False,
+            export_type: ExportType = ExportType.JSON,
+            timeout: int = 0,
+            dataset_lock: bool = False,
+            lock_timeout_sec: int = None,
+            export_summary: bool = False,
+            output_export_type: OutputExportType = None,
+    ) -> Optional[str]:
         """
         Export dataset items and annotations.

@@ -648,12 +773,55 @@ class Datasets:

         You must provide at least ONE of the following params: dataset, dataset_name, dataset_id.

+        **Export Behavior by Parameter Combination:**
+
+        The behavior of this method depends on the combination of `export_type` and `output_export_type`:
+
+        **When export_type = ExportType.JSON:**
+
+        - **output_export_type = OutputExportType.JSON (default when None):**
+            - Exports data in JSON format, split into subsets of max 500 items
+            - Downloads all subset JSON files and concatenates them into a single `result.json` file
+            - Returns the path to the concatenated JSON file
+            - Cleans up individual subset files after concatenation
+
+        - **output_export_type = OutputExportType.ZIP:**
+            - Same as JSON export, but zips the final `result.json` file
+            - Returns the path to the zipped file (`result.json.zip`)
+            - Cleans up the unzipped JSON file after zipping
+
+        - **output_export_type = OutputExportType.FOLDERS:**
+            - Exports data in JSON format, split into subsets of max 500 items
+            - Downloads all subset JSON files and creates individual JSON files for each item
+            - Creates a folder structure mirroring the remote dataset structure
+            - Returns the path to the base directory containing the folder structure
+            - Each item gets its own JSON file named after the original filename
+
+        **When export_type = ExportType.ZIP:**
+
+        - **output_export_type = OutputExportType.ZIP:**
+            - Exports data as a ZIP file containing the dataset
+            - Returns the downloaded ZIP item directly
+            - No additional processing or concatenation
+
+        - **output_export_type = OutputExportType.JSON:**
+            - **NOT SUPPORTED** - Raises NotImplementedError
+            - Use export_type=ExportType.JSON instead for JSON output
+
+        - **output_export_type = OutputExportType.FOLDERS:**
+            - **NOT SUPPORTED** - Raises NotImplementedError
+            - Use export_type=ExportType.JSON instead for folder output
+
+        **When output_export_type = None (legacy behavior):**
+        - Defaults to OutputExportType.JSON
+        - Maintains backward compatibility with existing code
+
         :param dtlpy.entities.dataset.Dataset dataset: Dataset object
         :param str dataset_name: The name of the dataset
         :param str dataset_id: The ID of the dataset
-        :param str local_path: Local path to save the exported dataset
+        :param str local_path: Local path to save the exported dataset
         :param Union[dict, dtlpy.entities.filters.Filters] filters: Filters entity or a query dictionary
-        :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
+        :param dtlpy.entities.filters.Filters annotation_filters: Filters entity to filter annotations for export
         :param dtlpy.entities.filters.Filters feature_vector_filters: Filters entity to filter feature vectors for export
         :param bool include_feature_vectors: Include item feature vectors in the export
         :param bool include_annotations: Include item annotations in the export
@@ -661,45 +829,92 @@ class Datasets:
         :param bool export_summary: Get Summary of the dataset export
         :param int lock_timeout_sec: Timeout for locking the dataset during export in seconds
         :param entities.ExportType export_type: Type of export ('json' or 'zip')
+        :param entities.OutputExportType output_export_type: Output format ('json', 'zip', or 'folders'). If None, defaults to 'json'
         :param int timeout: Maximum time in seconds to wait for the export to complete
-        :return:
-        :rtype:
-
-        **Example**:
-
-        .. code-block:: python
-
-            export_item = project.datasets.export(dataset_id='dataset_id',
-                                                  filters=filters,
-                                                  include_feature_vectors=True,
-                                                  include_annotations=True,
-                                                  export_type=dl.ExportType.JSON,
-                                                  dataset_lock=True,
-                                                  lock_timeout_sec=300,
-                                                  export_summary=False)
+        :return: Path to exported file/directory, or None if export result is empty
+        :rtype: Optional[str]
         """
-        [22 lines removed; content not rendered in this diff view]
+        export_result = list(
+            self._export_recursive(
+                dataset=dataset,
+                dataset_name=dataset_name,
+                dataset_id=dataset_id,
+                local_path=local_path,
+                filters=filters,
+                annotation_filters=annotation_filters,
+                feature_vector_filters=feature_vector_filters,
+                include_feature_vectors=include_feature_vectors,
+                include_annotations=include_annotations,
+                timeout=timeout,
+                dataset_lock=dataset_lock,
+                lock_timeout_sec=lock_timeout_sec,
+                export_summary=export_summary,
+                export_type=export_type,
+                output_export_type=output_export_type,
+            )
+        )
+        if all(x is None for x in export_result):
+            logger.error("export result is empty")
+            return None
+
+        if export_type == ExportType.ZIP:
+            # if export type is zip, then return the _export_recursive result as it
+            return export_result[0]
+
+        # if user didn't provide output_export_type, keep the previous behavior
+        if output_export_type is None:
+            output_export_type = OutputExportType.JSON
+
+        # export type is jsos :
+        # Load all items from subset JSON files and clean them up
+        all_items = []
+        logger.debug("start loading all items from subset JSON files")
+        for json_file in export_result:
+            if json_file is None:
+                continue
+            if os.path.isfile(json_file):
+                with open(json_file, 'r') as f:
+                    items = json.load(f)
+                if isinstance(items, list):
+                    all_items.extend(items)
+                os.remove(json_file)
+
+        base_dir = os.path.dirname(export_result[0])
+        if output_export_type != OutputExportType.FOLDERS:
+            dataset_id = self._resolve_dataset_id(dataset, dataset_name, dataset_id)
+            result_file_name = f"{dataset_id}.json"
+            result_file = os.path.join(base_dir, result_file_name)
+            logger.debug(f"start writing all items to result file {result_file}")
+            with open(result_file, 'w') as f:
+                json.dump(all_items, f)
+            if output_export_type == OutputExportType.ZIP:
+                # Zip the result file
+                zip_filename = result_file + '.zip'
+                # Create zip file
+                logger.debug(f"start zipping result file {zip_filename}")
+                with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
+                    zf.write(result_file, arcname=os.path.basename(result_file))
+
+                # Remove original json after zipping
+                os.remove(result_file)
+                result_file = zip_filename
+            return result_file
+        logger.debug("start building per-item JSON files under local_path mirroring remote structure")
+        # Build per-item JSON files under local_path mirroring remote structure
+        for item in all_items:
+            rel_json_path = os.path.splitext(item.get('filename'))[0] + '.json'
+            # Remove leading slash to make it a relative path
+            if rel_json_path.startswith('/'):
+                rel_json_path = rel_json_path[1:]
+            out_path = os.path.join(base_dir, rel_json_path)
+            os.makedirs(os.path.dirname(out_path), exist_ok=True)
+            try:
+                with open(out_path, 'w') as outf:
+                    json.dump(item, outf)
+            except Exception:
+                logger.exception(f'Failed writing export item JSON to {out_path}')
+        logger.debug("end building per-item JSON files under local_path mirroring remote structure")
+        return base_dir

     @_api_reference.add(path='/datasets/merge', method='post')
     def merge(self,
@@ -1185,7 +1400,6 @@ class Datasets:
         import warnings
         warnings.warn("`readonly` flag on dataset is deprecated, doing nothing.", DeprecationWarning)

-
     @_api_reference.add(path='/datasets/{id}/split', method='post')
     def split_ml_subsets(self,
                          dataset_id: str,
@@ -1201,10 +1415,10 @@ class Datasets:
         :rtype: bool
         :raises: PlatformException on failure and ValueError if percentages do not sum to 100 or invalid keys/values.
         """
-
+        # Validate percentages
         if not ml_split_list:
             ml_split_list = {'train': 80, 'validation': 10, 'test': 10}
-
+
         if not items_query:
             items_query = entities.Filters()

@@ -1238,7 +1452,6 @@ class Datasets:
         else:
             raise exceptions.PlatformException(response)

-
     @_api_reference.add(path='/datasets/{id}/items/bulk-update-metadata', method='post')
     def bulk_update_ml_subset(self, dataset_id: str, items_query: dict, subset: str = None, deleteTag: bool = False) -> bool:
         """
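A hedged sketch of the new output_export_type option on Datasets.export(), adapted from the example that was removed from the docstring above. The project lookup, ids and local path are placeholders, and it assumes OutputExportType is reachable as dl.OutputExportType in the same way dl.ExportType is; if not, import it from dtlpy.entities.dataset.

    import dtlpy as dl

    project = dl.projects.get(project_name='my-project')   # placeholder project

    # Legacy behavior (output_export_type=None): subsets are concatenated into a single JSON file.
    json_path = project.datasets.export(dataset_id='dataset_id',
                                        include_annotations=True,
                                        export_type=dl.ExportType.JSON)

    # New in 1.115.x: one JSON file per item, mirroring the remote folder structure.
    folder_path = project.datasets.export(dataset_id='dataset_id',
                                          include_annotations=True,
                                          export_type=dl.ExportType.JSON,
                                          output_export_type=dl.OutputExportType.FOLDERS,
                                          local_path='/tmp/my_export')

    # export_type=ZIP combined with a JSON or FOLDERS output type raises NotImplementedError.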
dtlpy/repositories/downloader.py
CHANGED
@@ -49,7 +49,8 @@ class Downloader:
                  export_version=entities.ExportVersion.V1,
                  dataset_lock=False,
                  lock_timeout_sec=None,
-                 export_summary=False
+                 export_summary=False,
+                 raise_on_error=False
                  ):
         """
         Download dataset by filters.
@@ -78,6 +79,7 @@ class Downloader:
         :param bool dataset_lock: optional - default = False
         :param bool export_summary: optional - default = False
         :param int lock_timeout_sec: optional
+        :param bool raise_on_error: raise an exception if an error occurs
         :return: Output (list)
         """

@@ -313,8 +315,24 @@ class Downloader:
         # log error
         if n_error > 0:
             log_filepath = reporter.generate_log_files()
+            # Get up to 5 error examples for the exception message
+            error_text = ""
+            error_counter = 0
+            if reporter._errors:
+                for _id, error in reporter._errors.items():
+                    error_counter += 1
+                    error_text += f"Item ID: {_id}, Error: {error} | "
+                    if error_counter >= 5:
+                        break
+            error_message = f"Errors in {n_error} files. Errors: {error_text}"
             if log_filepath is not None:
-                [1 line removed; content not rendered in this diff view]
+                error_message += f", see {log_filepath} for full log"
+            if raise_on_error is True:
+                raise PlatformException(
+                    error="400", message=error_message
+                )
+            else:
+                logger.warning(error_message)
         if int(n_download) <= 1 and int(n_exist) <= 1:
             try:
                 return next(reporter.output)
@@ -428,7 +446,7 @@ class Downloader:

         if export_summary:
             payload['summary'] = export_summary
-
+
         if lock_timeout_sec:
             payload['lockTimeoutSec'] = lock_timeout_sec

@@ -753,6 +771,7 @@ class Downloader:
                 if response_output != local_filepath:
                     source_path = os.path.normpath(response_output)
                     shutil.copyfile(source_path, local_filepath)
+                    download_done = True
                 else:
                     try:
                         temp_file_path = local_filepath + '.download'
@@ -806,6 +825,7 @@ class Downloader:
                 source_file = response_output
                 with open(source_file, 'wb') as f:
                     data = f.read()
+                download_done = True
             else:
                 try:
                     for chunk in response.iter_content(chunk_size=chunk_size):
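The error-aggregation logic added to Downloader above is small enough to restate as a standalone sketch: collect up to five (item id, error) pairs, build a single message, then raise or warn depending on raise_on_error. The function name and the generic exception here are illustrative; inside the SDK the same pattern raises PlatformException and reads the errors from the internal reporter.

    import logging

    logger = logging.getLogger('dtlpy')

    def summarize_download_errors(errors: dict, n_error: int, log_filepath=None, raise_on_error=False):
        # Keep at most five examples so the message stays readable.
        error_text = ""
        for i, (_id, error) in enumerate(errors.items()):
            error_text += f"Item ID: {_id}, Error: {error} | "
            if i + 1 >= 5:
                break
        error_message = f"Errors in {n_error} files. Errors: {error_text}"
        if log_filepath is not None:
            error_message += f", see {log_filepath} for full log"
        if raise_on_error:
            raise RuntimeError(error_message)  # the SDK raises PlatformException here
        logger.warning(error_message)

    # Example: warn only (legacy behavior), since raise_on_error defaults to False.
    summarize_download_errors({'item-1': 'HTTP 404'}, n_error=1, raise_on_error=False)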