dtlpy 1.116.6__py3-none-any.whl → 1.118.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,22 @@
+ import copy
+ import io
+ import json
+ import logging
+ import multiprocessing
+ import os
+ import shutil
+ import sys
+ import tempfile
+ import traceback
  from pathlib import Path
- from requests.adapters import HTTPAdapter
- from urllib3.util import Retry
- from PIL import Image
+ from urllib.parse import unquote, urlparse
+

  import numpy as np
- import traceback
- from urllib.parse import urlparse, unquote
  import requests
- import logging
- import shutil
- import json
  import tqdm
- import sys
- import os
- import io
+ from PIL import Image
+ from requests.adapters import HTTPAdapter
+ from urllib3.util import Retry

  from .. import entities, repositories, miscellaneous, PlatformException, exceptions
  from ..services import Reporter
@@ -20,12 +24,257 @@ from ..services import Reporter
  logger = logging.getLogger(name='dtlpy')

  NUM_TRIES = 3  # try to download 3 times before failing on an item
-
+ DOWNLOAD_MAX_ITEMS_PER_SUBSET = 1000

  class Downloader:
      def __init__(self, items_repository):
          self.items_repository = items_repository

+     def _process_download_results(self, reporter, raise_on_error=False):
+         """
+         Process download results and generate a summary report.
+
+         :param reporter: Reporter instance containing download results
+         :param raise_on_error: If True, raise an exception on download errors
+         :return: Output from reporter
+         """
+         # reporting
+         n_download = reporter.status_count(status='download')
+         n_exist = reporter.status_count(status='exist')
+         n_error = reporter.status_count(status='error')
+         logger.info(f"Number of files downloaded: {n_download}")
+         logger.info(f"Number of files that already exist: {n_exist}")
+         logger.info(f"Total number of files: {n_download + n_exist}")
+
+         # log errors
+         if n_error > 0:
+             log_filepath = reporter.generate_log_files()
+             # Get up to 5 error examples for the exception message
+             error_text = ""
+             error_counter = 0
+             if reporter._errors:
+                 for _id, error in reporter._errors.items():
+                     error_counter += 1
+                     error_text += f"Item ID: {_id}, Error: {error} | "
+                     if error_counter >= 5:
+                         break
+             error_message = f"Errors in {n_error} files. Errors: {error_text}"
+             if log_filepath is not None:
+                 error_message += f", see {log_filepath} for the full log"
+             if raise_on_error is True:
+                 raise PlatformException(
+                     error="400", message=error_message
+                 )
+             else:
+                 logger.warning(error_message)
+
+         if int(n_download) <= 1 and int(n_exist) <= 1:
+             try:
+                 return next(reporter.output)
+             except StopIteration:
+                 return None
+         return reporter.output
+
+     def _process_item_json(self, local_path, item_json, reporter, pbar, overwrite=False):
+         """
+         Process a single item JSON for download, saving both the item file and its metadata.
+
+         :param local_path: Local path to save files
+         :param item_json: Item JSON metadata
+         :param reporter: Reporter instance for tracking progress
+         :param pbar: Progress bar instance
+         :param overwrite: Whether to overwrite existing files
+         :return: None; results are recorded on the reporter
+         """
+         err = None
+         trace = None
+         downloaded_filepath = None
+         item_id = item_json['id']
+         filename = item_json['filename'].lstrip('/')
+
+         for i_try in range(NUM_TRIES):
+             try:
+                 # Download the image
+                 image_path = Path(local_path) / 'items' / filename
+                 # Ensure the directory for the image file exists (in case filename has subdirectories)
+                 image_path.parent.mkdir(parents=True, exist_ok=True)
+                 item = entities.Item.from_json(_json=item_json, client_api=self.items_repository._client_api, is_fetched=False)
+                 downloaded_data = self.__thread_download(
+                     item=item,
+                     local_path=str(image_path.parent),
+                     local_filepath=str(image_path),
+                     save_locally=True,
+                     to_array=False,
+                     overwrite=overwrite,
+                     annotation_options=[],
+                     annotation_filters=None,
+                 )
+
+                 if downloaded_data is None:
+                     err = 'Failed to download image'
+                     trace = ''
+                 else:
+                     # Save the item JSON directly
+                     json_filename = Path(filename).stem + '.json'
+                     json_path = Path(local_path) / 'json' / Path(filename).parent / json_filename
+
+                     # Ensure the directory for the JSON file exists (in case filename has subdirectories)
+                     json_path.parent.mkdir(parents=True, exist_ok=True)
+
+                     # Save the original item_json directly
+                     with open(json_path, 'w', encoding='utf-8') as f:
+                         json.dump(item_json, f, indent=2, ensure_ascii=False)
+
+                     downloaded_filepath = str(image_path)
+
+                 if downloaded_filepath is not None:
+                     break
+
+             except Exception as e:
+                 logger.debug(f"Download item: {filename}. Try {i_try + 1}/{NUM_TRIES} failed.")
+                 err = e
+                 trace = traceback.format_exc()
+
+         pbar.update()
+         if downloaded_filepath is None:
+             if err is None:
+                 err = self.items_repository._client_api.platform_exception
+             reporter.set_index(status="error", ref=item_id, success=False, error=f"{err}\n{trace}")
+         else:
+             reporter.set_index(ref=item_id, status="download", output=downloaded_filepath, success=True)
+
+     def _download_recursive(
+             self,
+             local_path=None,
+             filters: entities.Filters = None,
+             annotation_filters: entities.Filters = None,
+             file_types=None,
+             overwrite=False,
+             raise_on_error=False,
+             dataset_lock=False,
+             lock_timeout_sec=None,
+     ):
+         """
+         Download items recursively from a dataset.
+
+         :param local_path: Local path to save downloaded items
+         :param filters: Filters entity to filter items
+         :param annotation_filters: Filters entity to filter annotations
+         :param file_types: List of file types to download
+         :param overwrite: Whether to overwrite existing files
+         :param raise_on_error: Raise an error if a download fails
+         :param dataset_lock: Lock the dataset during download
+         :param lock_timeout_sec: Lock timeout in seconds
+         """
+         filters, annotation_filters = self._prepare_filters(filters=filters, annotation_filters=annotation_filters, file_types=file_types)
+         filter_copy = copy.deepcopy(filters)
+         filter_copy.page_size = 0
+         num_items = self.items_repository.list(filters=filter_copy).items_count
+         if num_items == 0:
+             return list()
+         client_api = self.items_repository._client_api
+         reporter = Reporter(
+             num_workers=num_items,
+             resource=Reporter.ITEMS_DOWNLOAD,
+             print_error_logs=client_api.verbose.print_error_logs,
+             client_api=client_api,
+         )
+
+         # Create directories once using pathlib
+         local_path_obj = Path(local_path)
+         items_dir = local_path_obj / 'items'
+         jsons_dir = local_path_obj / 'json'
+         items_dir.mkdir(parents=True, exist_ok=True)
+         jsons_dir.mkdir(parents=True, exist_ok=True)
+
+         jobs = [None for _ in range(num_items)]
+         # currently keep the thread count at the default.
+         # client_api._thread_pools_names['item.download'] = 5 * multiprocessing.cpu_count()
+         pool = client_api.thread_pools(pool_name='item.download')
+         pbar = tqdm.tqdm(
+             total=num_items,
+             disable=client_api.verbose.disable_progress_bar_download_dataset,
+             file=sys.stdout,
+             desc='Download Items',
+         )
+         try:
+             i_item = 0
+             import time
+             start_time = time.time()
+             for json_file in self.items_repository.dataset.project.datasets._export_recursive(
+                     dataset=self.items_repository.dataset,
+                     local_path=tempfile.mkdtemp(prefix='download_recursive_jsons_'),
+                     max_items_per_subset=DOWNLOAD_MAX_ITEMS_PER_SUBSET,
+                     include_annotations=True,
+                     filters=filters,
+                     annotation_filters=annotation_filters,
+                     dataset_lock=dataset_lock,
+                     lock_timeout_sec=lock_timeout_sec,
+             ):
+                 end_time = time.time()
+                 with open(json_file, 'r') as f:
+                     data = json.load(f)
+                 for item_json in data:
+                     jobs[i_item] = pool.submit(
+                         self._process_item_json,
+                         **{
+                             "local_path": local_path,
+                             "item_json": item_json,
+                             "reporter": reporter,
+                             "pbar": pbar,
+                             "overwrite": overwrite,
+                         },
+                     )
+                     i_item += 1
+         finally:
+             _ = [j.result() for j in jobs if j is not None]
+             pbar.close()
+         return self._process_download_results(reporter=reporter, raise_on_error=raise_on_error)
+
+     @staticmethod
+     def _prepare_filters(filters: entities.Filters = None,
+                          annotation_filters: entities.Filters = None,
+                          file_types=None):
+         """
+         Prepare and merge filters with annotation filters.
+
+         :param filters: Filters entity or None
+         :param annotation_filters: Annotation filters to merge with the item filters
+         :param file_types: List of file types to filter
+         :return: Tuple of the prepared item filters and annotation filters
+         """
+         # filters
+         if filters is None:
+             filters = entities.Filters()
+             filters._user_query = 'false'
+         # file types
+         if file_types is not None:
+             filters.add(field='metadata.system.mimetype', values=file_types, operator=entities.FiltersOperations.IN)
+         if annotation_filters is not None:
+             if len(annotation_filters.and_filter_list) > 0 or len(annotation_filters.or_filter_list) > 0:
+                 for annotation_filter_and in annotation_filters.and_filter_list:
+                     filters.add_join(field=annotation_filter_and.field,
+                                      values=annotation_filter_and.values,
+                                      operator=annotation_filter_and.operator,
+                                      method=entities.FiltersMethod.AND)
+                 for annotation_filter_or in annotation_filters.or_filter_list:
+                     filters.add_join(field=annotation_filter_or.field,
+                                      values=annotation_filter_or.values,
+                                      operator=annotation_filter_or.operator,
+                                      method=entities.FiltersMethod.OR)
+             elif annotation_filters.custom_filter is not None:
+                 annotation_query_dict = annotation_filters.prepare()
+                 items_query_dict = filters.prepare()
+                 items_query_dict["join"] = annotation_query_dict
+                 filters.reset()
+                 filters.custom_filter = items_query_dict
+
+         else:
+             annotation_filters = entities.Filters(resource=entities.FiltersResource.ANNOTATION)
+             filters._user_query = 'false'
+
+         return filters, annotation_filters
+
      def download(self,
                   # filter options
                   filters: entities.Filters = None,
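
Note: the new _prepare_filters helper centralizes the filter-merging logic that previously lived inline in download() (it is removed in the next hunk). A minimal sketch of how this path is typically exercised through the public SDK; the dataset id and filter values are illustrative assumptions, not taken from this diff:

import dtlpy as dl

dataset = dl.datasets.get(dataset_id='my-dataset-id')  # hypothetical id

# item-level filter: only PNG items
filters = dl.Filters()
filters.add(field='metadata.system.mimetype', values='image/png')

# annotation-level filter: only items that have box annotations
annotation_filters = dl.Filters(resource=dl.FiltersResource.ANNOTATION)
annotation_filters.add(field='type', values='box')

# the downloader merges the two via filters.add_join(...), as _prepare_filters shows above
dataset.items.download(local_path='/tmp/out',
                       filters=filters,
                       annotation_filters=annotation_filters)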
@@ -131,35 +380,12 @@ class Downloader:
              items_to_download = [items]
              num_items = len(items)
          else:
-             # filters
-             if filters is None:
-                 filters = entities.Filters()
-                 filters._user_query = 'false'
-             # file types
-             if file_types is not None:
-                 filters.add(field='metadata.system.mimetype', values=file_types, operator=entities.FiltersOperations.IN)
-             if annotation_filters is not None:
-                 if len(annotation_filters.and_filter_list) > 0 or len(annotation_filters.or_filter_list) > 0:
-                     for annotation_filter_and in annotation_filters.and_filter_list:
-                         filters.add_join(field=annotation_filter_and.field,
-                                          values=annotation_filter_and.values,
-                                          operator=annotation_filter_and.operator,
-                                          method=entities.FiltersMethod.AND)
-                     for annotation_filter_or in annotation_filters.or_filter_list:
-                         filters.add_join(field=annotation_filter_or.field,
-                                          values=annotation_filter_or.values,
-                                          operator=annotation_filter_or.operator,
-                                          method=entities.FiltersMethod.OR)
-                 elif annotation_filters.custom_filter is not None:
-                     annotation_query_dict = annotation_filters.prepare()
-                     items_query_dict = filters.prepare()
-                     items_query_dict["join"] = annotation_query_dict
-                     filters.reset()
-                     filters.custom_filter = items_query_dict
-
-             else:
-                 annotation_filters = entities.Filters(resource=entities.FiltersResource.ANNOTATION)
-                 filters._user_query = 'false'
+             # Prepare and merge filters
+             filters, annotation_filters = self._prepare_filters(
+                 filters=filters,
+                 annotation_filters=annotation_filters,
+                 file_types=file_types
+             )

          items_to_download = self.items_repository.list(filters=filters)
          num_items = items_to_download.items_count
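
The hunk above replaces some thirty inline lines with a single _prepare_filters call. For reference, a self-contained sketch of the file_types branch as it behaves after the change, using only calls that appear in this diff:

import dtlpy as dl

filters = None
file_types = ['image/jpeg', 'image/png']

# mirrors _prepare_filters: default filter, then a mimetype IN clause
if filters is None:
    filters = dl.Filters()
if file_types is not None:
    filters.add(field='metadata.system.mimetype', values=file_types,
                operator=dl.FiltersOperations.IN)

print(filters.prepare())  # the resulting DQL query dict sent to the platform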
@@ -234,7 +460,8 @@ class Downloader:
          # pool
          pool = client_api.thread_pools(pool_name='item.download')
          # download
-         pbar = tqdm.tqdm(total=num_items, disable=client_api.verbose.disable_progress_bar_download_dataset, file=sys.stdout,
+         pbar = tqdm.tqdm(total=num_items, disable=client_api.verbose.disable_progress_bar_download_dataset,
+                          file=sys.stdout,
                           desc='Download Items')
          try:
              i_item = 0
@@ -305,41 +532,8 @@ class Downloader:
          finally:
              _ = [j.result() for j in jobs if j is not None]
              pbar.close()
-         # reporting
-         n_download = reporter.status_count(status='download')
-         n_exist = reporter.status_count(status='exist')
-         n_error = reporter.status_count(status='error')
-         logger.info("Number of files downloaded:{}".format(n_download))
-         logger.info("Number of files exists: {}".format(n_exist))
-         logger.info("Total number of files: {}".format(n_download + n_exist))

-         # log error
-         if n_error > 0:
-             log_filepath = reporter.generate_log_files()
-             # Get up to 5 error examples for the exception message
-             error_text = ""
-             error_counter = 0
-             if reporter._errors:
-                 for _id, error in reporter._errors.items():
-                     error_counter += 1
-                     error_text += f"Item ID: {_id}, Error: {error} | "
-                     if error_counter >= 5:
-                         break
-             error_message = f"Errors in {n_error} files. Errors: {error_text}"
-             if log_filepath is not None:
-                 error_message += f", see {log_filepath} for full log"
-             if raise_on_error is True:
-                 raise PlatformException(
-                     error="400", message=error_message
-                 )
-             else:
-                 logger.warning(error_message)
-         if int(n_download) <= 1 and int(n_exist) <= 1:
-             try:
-                 return next(reporter.output)
-             except StopIteration:
-                 return None
-         return reporter.output
+         return self._process_download_results(reporter=reporter, raise_on_error=raise_on_error)

      def __thread_download_wrapper(self, i_item,
                                    # item params
@@ -403,7 +597,7 @@ class Downloader:
                              export_version=entities.ExportVersion.V1,
                              dataset_lock=False,
                              lock_timeout_sec=None,
-                               export_summary=False
+                              export_summary=False
                              ):
          """
          Download annotations json for entire dataset
@@ -633,27 +827,12 @@ class Downloader:
      @staticmethod
      def __get_link_source(item):
          assert isinstance(item, entities.Item)
-         if not item.is_fetched:
-             return item, '', False
-
-         if not item.filename.endswith('.json') or \
-                 item.metadata.get('system', {}).get('shebang', {}).get('dltype', '') != 'link':
-             return item, '', False
-
-         # recursively get next id link item
-         while item.filename.endswith('.json') and \
-                 item.metadata.get('system', {}).get('shebang', {}).get('dltype', '') == 'link' and \
-                 item.metadata.get('system', {}).get('shebang', {}).get('linkInfo', {}).get('type', '') == 'id':
-             item = item.dataset.items.get(item_id=item.metadata['system']['shebang']['linkInfo']['ref'])
-
-         # check if link
-         if item.filename.endswith('.json') and \
-                 item.metadata.get('system', {}).get('shebang', {}).get('dltype', '') == 'link' and \
-                 item.metadata.get('system', {}).get('shebang', {}).get('linkInfo', {}).get('type', '') == 'url':
-             url = item.metadata['system']['shebang']['linkInfo']['ref']
-             return item, url, True
-         else:
-             return item, '', False
+         is_url = False
+         url = item.resolved_stream
+         if item.metadata.get('system', {}).get('shebang', {}).get('linkInfo', {}).get('type', '') == 'url':
+             is_url = True
+
+         return item, url, is_url, url.startswith('file://')

      def __file_validation(self, item, downloaded_file):
          res = False
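
The rewritten __get_link_source delegates link resolution to item.resolved_stream and additionally returns whether the target is a local file:// link. A sketch of the shebang metadata shape implied by the old and new checks (values are illustrative):

# Shape of a URL link item's metadata, as implied by the checks above
link_item_metadata = {
    'system': {
        'shebang': {
            'dltype': 'link',
            'linkInfo': {
                'type': 'url',                       # or 'id' for item-to-item links
                'ref': 'https://example.com/a.jpg',  # target URL (an item id when type == 'id')
            }
        }
    }
}

def is_url_link(metadata: dict) -> bool:
    # mirrors the .get(...) chain used by __get_link_source
    return metadata.get('system', {}).get('shebang', {}).get('linkInfo', {}).get('type', '') == 'url'

assert is_url_link(link_item_metadata)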
@@ -688,7 +867,7 @@ class Downloader:
          """
          Get a single item's binary data
          Calling this method returns the item body itself, for example an image with the proper mimetype.
-
+
          :param item: Item entity to download
          :param save_locally: bool. save to file or return buffer
          :param local_path: item local folder to save to.
@@ -709,8 +888,7 @@ class Downloader:
          if save_locally and os.path.isfile(local_filepath):
              need_to_download = overwrite

-         item, url, is_url = self.__get_link_source(item=item)
-         is_local_link = isinstance(url, str) and url.startswith('file://')
+         item, url, is_url, is_local_link = self.__get_link_source(item=item)

          # save as byte stream
          data = io.BytesIO()
@@ -804,9 +982,11 @@ class Downloader:

                  file_validation = True
                  if not is_url:
-                     file_validation, start_point, chunk_resume = self.__get_next_chunk(item=item,
-                                                                                        download_progress=temp_file_path,
-                                                                                        chunk_resume=chunk_resume)
+                     file_validation, start_point, chunk_resume = self.__get_next_chunk(
+                         item=item,
+                         download_progress=temp_file_path,
+                         chunk_resume=chunk_resume
+                     )
                  if file_validation:
                      shutil.move(temp_file_path, local_filepath)
                      download_done = True
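
The body of __get_next_chunk is not part of this diff; it decides whether the temp file is complete and where a resumed download should continue. A generic sketch of size-based resumption under that reading; the helper name, signature, and chunk_resume shape are hypothetical, not dtlpy's actual implementation:

import os

def get_next_chunk_sketch(expected_size: int, download_progress: str, chunk_resume: dict):
    # resume from however many bytes the temp file already holds (hypothetical logic)
    start_point = os.path.getsize(download_progress) if os.path.isfile(download_progress) else 0
    # consider the file valid only once every expected byte has been written
    file_validation = start_point == expected_size
    chunk_resume[download_progress] = start_point  # remember progress per temp file
    return file_validation, start_point, chunk_resume

A caller would then request the remaining bytes with an HTTP Range header such as "Range: bytes={start_point}-".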
@@ -933,6 +1113,7 @@ class Downloader:
          """
          :param url: source URL; supports http(s):// and file:// schemes
          """
+         response = None

          if url.startswith('file://'):
              parsed = urlparse(url)
@@ -953,24 +1134,24 @@ class Downloader:
              )

              try:
-                 return io.BufferedReader(io.FileIO(path, 'rb'))
+                 response = io.BufferedReader(io.FileIO(path, 'rb'))
              except PermissionError as e:
                  raise PlatformException(
                      error='403',
                      message=f'Permission denied accessing file: {url}'
                  ) from e
-
-         prepared_request = requests.Request(method='GET', url=url).prepare()
-         with requests.Session() as s:
-             retry = Retry(
-                 total=3,
-                 read=3,
-                 connect=3,
-                 backoff_factor=1,
-             )
-             adapter = HTTPAdapter(max_retries=retry)
-             s.mount('http://', adapter)
-             s.mount('https://', adapter)
-             response = s.send(request=prepared_request, stream=True)
+         else:
+             prepared_request = requests.Request(method='GET', url=url).prepare()
+             with requests.Session() as s:
+                 retry = Retry(
+                     total=3,
+                     read=3,
+                     connect=3,
+                     backoff_factor=1,
+                 )
+                 adapter = HTTPAdapter(max_retries=retry)
+                 s.mount('http://', adapter)
+                 s.mount('https://', adapter)
+                 response = s.send(request=prepared_request, stream=True)

          return response
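
The file:// branch above parses the URL before opening it. A stdlib sketch of that conversion; the Windows drive-letter handling is an illustrative assumption, since the method's exact path logic is not fully shown in this diff:

from urllib.parse import unquote, urlparse

def file_url_to_path(url: str) -> str:
    parsed = urlparse(url)       # scheme='file', path='/C:/data/my%20img.png'
    path = unquote(parsed.path)  # decode percent-escapes such as %20
    # on Windows, drop the leading slash before the drive letter: '/C:/..' -> 'C:/..'
    if len(path) > 2 and path[0] == '/' and path[2] == ':':
        path = path[1:]
    return path

print(file_url_to_path('file:///C:/data/my%20img.png'))  # C:/data/my img.png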