supervisely 6.73.343__py3-none-any.whl → 6.73.344__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,16 +31,22 @@ import supervisely as sly
 from supervisely._utils import (
     abs_url,
     batched,
-    generate_free_name,
     get_or_create_event_loop,
     is_development,
+    removesuffix,
     snake_to_human,
 )
 from supervisely.annotation.annotation import ANN_EXT, Annotation, TagCollection
 from supervisely.annotation.obj_class import ObjClass
 from supervisely.annotation.obj_class_collection import ObjClassCollection
-from supervisely.api.api import Api, ApiContext
-from supervisely.api.image_api import ImageInfo
+from supervisely.api.api import Api, ApiContext, ApiField
+from supervisely.api.image_api import (
+    OFFSETS_PKL_BATCH_SIZE,
+    OFFSETS_PKL_SUFFIX,
+    BlobImageInfo,
+    ImageInfo,
+)
+from supervisely.api.project_api import ProjectInfo
 from supervisely.collection.key_indexed_collection import (
     KeyIndexedCollection,
     KeyObject,
@@ -68,7 +74,9 @@ from supervisely.io.json import dump_json_file, dump_json_file_async, load_json_
 from supervisely.project.project_meta import ProjectMeta
 from supervisely.project.project_type import ProjectType
 from supervisely.sly_logger import logger
-from supervisely.task.progress import Progress, tqdm_sly
+from supervisely.task.progress import tqdm_sly
+
+TF_BLOB_DIR = "blob-files"  # directory for project blob files in team files


 class CustomUnpickler(pickle.Unpickler):
@@ -224,6 +232,7 @@ class Dataset(KeyObject):
     seg_dir_name = "seg"
     meta_dir_name = "meta"
     datasets_dir_name = "datasets"
+    blob_dir_name = "blob"

     def __init__(
         self,
@@ -273,6 +282,7 @@ class Dataset(KeyObject):
         self._project_dir = project_dir
         self._name = full_ds_name
         self._short_name = short_ds_name
+        self._blob_offset_paths = []

         if self.dataset_id is not None:
             self._read_api()
@@ -537,6 +547,23 @@ class Dataset(KeyObject):
         """
         return os.path.join(self.directory, self.meta_dir_name)

+    @property
+    def blob_offsets(self):
+        """
+        List of paths to the dataset blob offset files.
+
+        :return: List of paths to the dataset blob offset files.
+        :rtype: :class:`List[str]`
+        """
+        return self._blob_offset_paths
+
+    @blob_offsets.setter
+    def blob_offsets(self, value: List[str]):
+        """
+        Set the list of paths to the dataset blob offset files.
+        """
+        self._blob_offset_paths = value
+
     @classmethod
     def _has_valid_ext(cls, path: str) -> bool:
         """
@@ -563,6 +590,23 @@ class Dataset(KeyObject):
         raw_ann_names = set(os.path.basename(path) for path in raw_ann_paths)
         img_names = [os.path.basename(path) for path in img_paths]

+        blob_offset_paths = list_files(
+            self.directory, filter_fn=lambda x: x.endswith(OFFSETS_PKL_SUFFIX)
+        )
+        has_blob_offsets = len(blob_offset_paths) > 0
+
+        # If we have blob offset files, add the image names from those
+        if has_blob_offsets:
+            self.blob_offsets = blob_offset_paths
+            for offset_file_path in self.blob_offsets:
+                try:
+                    blob_img_info_lists = BlobImageInfo.load_from_pickle_generator(offset_file_path)
+                    for blob_img_info_list in blob_img_info_lists:
+                        for blob_img_info in blob_img_info_list:
+                            img_names.append(blob_img_info.name)
+                except Exception as e:
+                    logger.warning(f"Failed to read blob offset file {offset_file_path}: {str(e)}")
+
         if len(img_names) == 0 and len(raw_ann_names) == 0:
             logger.info("Dataset {!r} is empty".format(self.name))
             # raise RuntimeError("Dataset {!r} is empty".format(self.name))
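The hunk above makes `Dataset` pick up image names from blob offset files. Not part of the diff: a minimal sketch of reading such a file back through the same `BlobImageInfo.load_from_pickle_generator` generator used here (the dataset path is hypothetical):

```python
import supervisely as sly
from supervisely.api.image_api import BlobImageInfo

# Hypothetical local dataset directory that contains a blob offsets .pkl file
dataset = sly.Dataset("/data/my_project/ds0", sly.OpenMode.READ)

for offsets_path in dataset.blob_offsets:  # populated by the new code above
    # The generator yields batches (lists) of BlobImageInfo entries
    for batch in BlobImageInfo.load_from_pickle_generator(offsets_path):
        for info in batch:
            print(info.name, info.offset_start, info.offset_end)
```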
@@ -1308,7 +1352,7 @@ class Dataset(KeyObject):

            img_path = "/home/admin/Pictures/Clouds.jpeg"
            img_np = sly.image.read(img_path)
-            img_bytes = sly.image.write_bytes(img_np, "jpeg")
+            img_bytes = sly.image.write_bytes(img_np, "jpeg")
            coroutine = ds.add_item_raw_bytes_async("IMG_050.jpeg", img_bytes)
            run_coroutine(coroutine)

@@ -1691,7 +1735,7 @@ class Dataset(KeyObject):
                "objects":[],
                "customBigData":{}
            }
-
+
            coroutine = ds.set_ann_dict_async("IMG_8888.jpeg", new_ann_json)
            run_coroutine(coroutine)
        """
@@ -1723,7 +1767,7 @@ class Dataset(KeyObject):

            height, width = 500, 700
            new_ann = sly.Annotation((height, width))
-
+
            coroutine = ds.set_ann_async("IMG_0748.jpeg", new_ann)
            run_coroutine(coroutine)
        """
@@ -2036,6 +2080,7 @@ class Project:
     """

     dataset_class = Dataset
+    blob_dir_name = "blob"

     class DatasetDict(KeyIndexedCollection):
         """
@@ -2075,6 +2120,7 @@ class Project:

         parent_dir, name = Project._parse_path(directory)
         self._parent_dir = parent_dir
+        self._blob_dir = os.path.join(directory, self.blob_dir_name)
         self._api = api
         self.project_id = project_id

@@ -2086,7 +2132,7 @@ class Project:
         self._name = name
         self._datasets = Project.DatasetDict()  # ds_name -> dataset object
         self._meta = None
-
+        self._blob_files = []
         if project_id is not None:
             self._read_api()
         elif mode is OpenMode.READ:
@@ -2138,6 +2184,25 @@ class Project:
         """
         return self._parent_dir

+    @property
+    def blob_dir(self) -> str:
+        """
+        Directory for project blobs.
+        Blobs are .tar files with images. Used for fast data transfer.
+
+        :return: Path to project blob directory
+        :rtype: :class:`str`
+        :Usage example:
+
+         .. code-block:: python
+
+            import supervisely as sly
+            project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
+            print(project.blob_dir)
+            # Output: '/home/admin/work/supervisely/projects/lemons_annotated/blob'
+        """
+        return self._blob_dir
+
     @property
     def name(self) -> str:
         """
@@ -2259,6 +2324,61 @@ class Project:
         """
         return sum(len(ds) for ds in self._datasets)

+    @property
+    def blob_files(self) -> List[str]:
+        """
+        List of blob files.
+
+        :return: List of blob files
+        :rtype: :class:`list`
+        :Usage example:
+
+         .. code-block:: python
+
+            import supervisely as sly
+            project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
+            print(project.blob_files)
+            # Output: []
+        """
+        return self._blob_files
+
+    @blob_files.setter
+    def blob_files(self, blob_files: List[str]) -> None:
+        """
+        Sets blob files to the project.
+
+        :param blob_files: List of blob files.
+        :type
+        :return: None
+        :rtype: NoneType
+        :Usage example:
+
+         .. code-block:: python
+
+            import supervisely as sly
+            project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
+            project.blob_files = ["blob_file.tar"]
+        """
+        self._blob_files = blob_files
+
+    def add_blob_file(self, file_name: str) -> None:
+        """
+        Adds blob file to the project.
+
+        :param file_name: File name.
+        :type file_name: :class:`str`
+        :return: None
+        :rtype: NoneType
+        :Usage example:
+
+         .. code-block:: python
+
+            import supervisely as sly
+            project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
+            project.add_blob_file("blob_file.tar")
+        """
+        self._blob_files.append(file_name)
+
     def get_classes_stats(
         self,
         dataset_names: Optional[List[str]] = None,
@@ -2296,6 +2416,10 @@ class Project:
     def _read(self):
         meta_json = load_json_file(self._get_project_meta_path())
         self._meta = ProjectMeta.from_json(meta_json)
+        if dir_exists(self.blob_dir):
+            self.blob_files = [Path(file).name for file in list_files(self.blob_dir)]
+        else:
+            self.blob_files = []

         ignore_dirs = self.dataset_class.ignorable_dirs()  # dir names that can not be datasets

@@ -2350,6 +2474,7 @@ class Project:
         else:
             mkdir(self.directory)
             self.set_meta(ProjectMeta())
+            self.blob_files = []

     def validate(self):
         # @TODO: remove?
@@ -3085,6 +3210,7 @@ class Project:
         save_images: bool = True,
         save_image_meta: bool = False,
         resume_download: bool = False,
+        **kwargs,
     ) -> None:
         """
         Download project from Supervisely to the given directory.
@@ -3113,6 +3239,9 @@ class Project:
         :type save_images: :class:`bool`, optional
         :param save_image_meta: Download images metadata in JSON format or not.
         :type save_image_meta: :class:`bool`, optional
+        :param download_blob_files: Default is False. It will download images in classic way.
+            If True, it will download blob files, if they are present in the project, to optimize download process.
+        :type download_blob_files: bool, optional
         :return: None
         :rtype: NoneType
         :Usage example:
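Not part of the diff: a minimal sketch of passing the new keyword argument through `Project.download`, which forwards it via `**kwargs` to `_download_project` (IDs and paths are hypothetical):

```python
import supervisely as sly

api = sly.Api.from_env()

# Hypothetical project ID and destination directory
project_id = 12345
dest_dir = "/data/projects/my_project"

# When True and the project contains blob-backed images, the .tar blob files and
# per-dataset offset .pkl files are saved instead of individual image bytes.
sly.Project.download(api, project_id, dest_dir, download_blob_files=True)
```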
@@ -3151,6 +3280,7 @@ class Project:
             save_images=save_images,
             save_image_meta=save_image_meta,
             resume_download=resume_download,
+            **kwargs,
         )

     @staticmethod
@@ -3731,7 +3861,7 @@ class Project:

            project_id = 8888
            save_directory = "/path/to/save/projects"
-
+
            coroutine = sly.Project.download_async(api, project_id, save_directory)
            run_coroutine(coroutine)
        """
@@ -3755,6 +3885,7 @@ class Project:
             save_image_meta=save_image_meta,
             images_ids=images_ids,
             resume_download=resume_download,
+            **kwargs,
         )

     def to_coco(
@@ -4002,9 +4133,13 @@ def _download_project(
     save_image_meta: Optional[bool] = False,
     images_ids: Optional[List[int]] = None,
     resume_download: Optional[bool] = False,
+    **kwargs,
 ):
+    download_blob_files = kwargs.pop("download_blob_files", False)
+
     dataset_ids = set(dataset_ids) if (dataset_ids is not None) else None
     project_fs = None
+
     meta = ProjectMeta.from_json(api.project.get_meta(project_id, with_settings=True))
     if os.path.exists(dest_dir) and resume_download:
         dump_json_file(meta.to_json(), os.path.join(dest_dir, "meta.json"))
@@ -4029,6 +4164,7 @@ def _download_project(

     existing_datasets = {dataset.path: dataset for dataset in project_fs.datasets}
     for parents, dataset in api.dataset.tree(project_id):
+        blob_files_to_download = {}
         dataset_path = Dataset._get_dataset_path(dataset.name, parents)
         dataset_id = dataset.id
         if dataset_ids is not None and dataset_id not in dataset_ids:
@@ -4065,6 +4201,7 @@ def _download_project(
             project_meta=meta,
         ):
             for batch in batched(images, batch_size):
+                batch: List[ImageInfo]
                 image_ids = [image_info.id for image_info in batch]
                 image_names = [image_info.name for image_info in batch]

@@ -4085,18 +4222,97 @@ def _download_project(
                     ):
                         indexes_to_download.append(i)

-                # download images in numpy format
+                # Collect images that was added to the project as offsets from archive in Team Files
+                indexes_with_offsets = []
+                for idx in indexes_to_download:
+                    image_info: ImageInfo = batch[idx]
+                    if image_info.related_data_id is not None:
+                        blob_files_to_download[image_info.related_data_id] = image_info.download_id
+                        indexes_with_offsets.append(idx)
+
+                # Download images in numpy format
                 batch_imgs_bytes = [None] * len(image_ids)
                 if save_images and indexes_to_download:
-                    for index, img in zip(
-                        indexes_to_download,
-                        api.image.download_bytes(
-                            dataset_id,
-                            [image_ids[i] for i in indexes_to_download],
-                            progress_cb=ds_progress,
-                        ),
-                    ):
-                        batch_imgs_bytes[index] = img
+
+                    # For a lot of small files that stored in blob file. Downloads blob files to optimize download process.
+                    if download_blob_files and len(indexes_with_offsets) > 0:
+                        bytes_indexes_to_download = indexes_to_download.copy()
+                        for blob_file_id, download_id in blob_files_to_download.items():
+                            if blob_file_id not in project_fs.blob_files:
+                                api.image.download_blob_file(
+                                    project_id=project_id,
+                                    download_id=download_id,
+                                    path=os.path.join(project_fs.blob_dir, f"{blob_file_id}.tar"),
+                                    log_progress=(
+                                        True if log_progress or progress_cb is not None else False
+                                    ),
+                                )
+                                project_fs.add_blob_file(blob_file_id)
+
+                            # Process blob image offsets
+                            offsets_file_name = f"{blob_file_id}{OFFSETS_PKL_SUFFIX}"
+                            offsets_file_path = os.path.join(
+                                dataset_fs.directory, offsets_file_name
+                            )
+
+                            # Initialize counter for total image offsets for this blob file
+                            total_offsets_count = 0
+                            current_batch = []
+
+                            # Get offsets from image infos
+                            for idx in indexes_with_offsets:
+                                image_info = batch[idx]
+                                if image_info.related_data_id == blob_file_id:
+                                    blob_image_info = BlobImageInfo(
+                                        name=image_info.name,
+                                        offset_start=image_info.offset_start,
+                                        offset_end=image_info.offset_end,
+                                    )
+                                    current_batch.append(blob_image_info)
+                                    bytes_indexes_to_download.remove(idx)
+
+                                    # When batch size is reached, dump to file
+                                    if len(current_batch) >= OFFSETS_PKL_BATCH_SIZE:
+                                        BlobImageInfo.dump_to_pickle(
+                                            current_batch, offsets_file_path
+                                        )
+                                        total_offsets_count += len(current_batch)
+                                        current_batch = []
+                            # Dump any remaining items in the last batch
+                            if len(current_batch) > 0:
+                                BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
+                                total_offsets_count += len(current_batch)
+
+                            if total_offsets_count > 0:
+                                logger.debug(
+                                    f"Saved {total_offsets_count} image offsets for {blob_file_id} to {offsets_file_path} in {(total_offsets_count + OFFSETS_PKL_BATCH_SIZE - 1) // OFFSETS_PKL_BATCH_SIZE} batches"
+                                )
+                                ds_progress(total_offsets_count)
+
+                        image_ids_to_download = [
+                            image_ids[i] for i in bytes_indexes_to_download
+                        ]
+                        for index, img in zip(
+                            bytes_indexes_to_download,
+                            api.image.download_bytes(
+                                dataset_id,
+                                image_ids_to_download,
+                                progress_cb=ds_progress,
+                            ),
+                        ):
+                            batch_imgs_bytes[index] = img
+                    # If you want to download images in classic way
+                    else:
+                        image_ids_to_download = [image_ids[i] for i in indexes_to_download]
+                        for index, img in zip(
+                            indexes_to_download,
+                            api.image.download_bytes(
+                                dataset_id,
+                                image_ids_to_download,
+                                progress_cb=ds_progress,
+                            ),
+                        ):
+                            batch_imgs_bytes[index] = img

                 if ds_progress is not None:
                     ds_progress(len(batch) - len(indexes_to_download))
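Not part of the diff: the batch count reported by the `logger.debug` call above is plain ceiling division; a standalone sketch (the batch size values are illustrative, the real one is `OFFSETS_PKL_BATCH_SIZE`):

```python
def num_offset_batches(total_offsets: int, batch_size: int) -> int:
    # Ceiling division, matching (total + OFFSETS_PKL_BATCH_SIZE - 1) // OFFSETS_PKL_BATCH_SIZE
    return (total_offsets + batch_size - 1) // batch_size

assert num_offset_batches(10_000, 10_000) == 1
assert num_offset_batches(10_001, 10_000) == 2
```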
@@ -4160,7 +4376,11 @@ def _download_project(
             if item_name not in items_names_set:
                 dataset_fs.delete_item(item_name)
     try:
-        create_readme(dest_dir, project_id, api)
+        if download_blob_files:
+            project_info = api.project.get_info_by_id(project_id)
+            create_blob_readme(project_fs=project_fs, project_info=project_info)
+        else:
+            create_readme(dest_dir, project_id, api)
     except Exception as e:
         logger.info(f"There was an error while creating README: {e}")

@@ -4172,15 +4392,20 @@ def upload_project(
     project_name: Optional[str] = None,
     log_progress: bool = True,
     progress_cb: Optional[Union[tqdm, Callable]] = None,
+    project_id: Optional[int] = None,
 ) -> Tuple[int, str]:
     project_fs = read_single_project(dir)
-    if project_name is None:
-        project_name = project_fs.name

-    if api.project.exists(workspace_id, project_name):
-        project_name = api.project.get_free_name(workspace_id, project_name)
+    if not project_id:
+        if project_name is None:
+            project_name = project_fs.name
+
+        if api.project.exists(workspace_id, project_name):
+            project_name = api.project.get_free_name(workspace_id, project_name)

-    project = api.project.create(workspace_id, project_name, change_name_if_conflict=True)
+        project = api.project.create(workspace_id, project_name, change_name_if_conflict=True)
+    else:
+        project = api.project.get_info_by_id(project_id)
     updated_meta = api.project.update_meta(project.id, project_fs.meta.to_json())

     if progress_cb is not None:
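Not part of the diff: a minimal sketch of the new `project_id` parameter of `upload_project`; when it is set, the local project is uploaded into an existing project instead of creating a new one (workspace, IDs, and paths are hypothetical):

```python
import supervisely as sly

api = sly.Api.from_env()

local_dir = "/data/projects/my_project"  # hypothetical local project
workspace_id = 42                        # hypothetical workspace

# Previous behaviour: create a new project (name deduplicated if it already exists)
new_id, new_name = sly.upload_project(local_dir, api, workspace_id, project_name="my_project")

# New behaviour: upload into an already existing project
existing_id, existing_name = sly.upload_project(local_dir, api, workspace_id, project_id=777)
```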
@@ -4189,6 +4414,29 @@ def upload_project(
     # image_id_dct, anns_paths_dct = {}, {}
     dataset_map = {}

+    total_blob_size = 0
+    upload_blob_progress = None
+    src_paths = []
+    dst_paths = []
+    for blob_file in project_fs.blob_files:
+        if log_progress:
+            total_blob_size += os.path.getsize(os.path.join(project_fs.blob_dir, blob_file))
+        src_paths.append(os.path.join(project_fs.blob_dir, blob_file))
+        dst_paths.append(os.path.join(f"/{TF_BLOB_DIR}", blob_file))
+    if log_progress and len(src_paths) > 0:
+        upload_blob_progress = tqdm_sly(
+            desc="Uploading blob files", total=total_blob_size, unit="B", unit_scale=True
+        )
+    if len(src_paths) > 0:
+        blob_file_infos = api.file.upload_bulk(
+            team_id=project.team_id,
+            src_paths=src_paths,
+            dst_paths=dst_paths,
+            progress_cb=upload_blob_progress,
+        )
+    else:
+        blob_file_infos = []
+
     for ds_fs in project_fs.datasets:
         if len(ds_fs.parents) > 0:
             parent = f"{os.path.sep}".join(ds_fs.parents)
4221
4469
  else:
4222
4470
  img_infos.append(None)
4223
4471
 
4224
- img_paths = list(filter(lambda x: os.path.isfile(x), img_paths))
4472
+ # img_paths = list(filter(lambda x: os.path.isfile(x), img_paths))
4473
+ source_img_paths_len = len(img_paths)
4474
+ valid_indices = []
4475
+ valid_paths = []
4476
+ offset_indices = []
4477
+ for i, path in enumerate(img_paths):
4478
+ if os.path.isfile(path):
4479
+ valid_indices.append(i)
4480
+ valid_paths.append(path)
4481
+ else:
4482
+ offset_indices.append(i)
4483
+ img_paths = valid_paths
4225
4484
  ann_paths = list(filter(lambda x: os.path.isfile(x), ann_paths))
4485
+ # Create a mapping from name to index position for quick lookups
4486
+ offset_name_to_idx = {names[i]: i for i in offset_indices}
4226
4487
  metas = [{} for _ in names]
4227
4488
 
4228
4489
  img_infos_count = sum(1 for item in img_infos if item is not None)
4229
4490
 
4230
- if len(img_paths) == 0 and img_infos_count == 0:
4491
+ if len(img_paths) == 0 and img_infos_count == 0 and len(offset_indices) == 0:
4231
4492
  # Dataset is empty
4232
4493
  continue
4233
4494
 
@@ -4258,56 +4519,57 @@ def upload_project(
                 merged_metas.append(merged_meta)
             metas = merged_metas

-        if len(img_paths) != 0:
-            uploaded_img_infos = api.image.upload_paths(
-                dataset.id, names, img_paths, ds_progress, metas=metas
+        if len(img_paths) != 0 or len(offset_indices) != 0:
+
+            uploaded_img_infos = [None] * source_img_paths_len
+            uploaded_img_infos_paths = api.image.upload_paths(
+                dataset_id=dataset.id,
+                names=[name for i, name in enumerate(names) if i in valid_indices],
+                paths=img_paths,
+                progress_cb=ds_progress,
+                metas=[metas[i] for i in valid_indices],
             )
+            for i, img_info in zip(valid_indices, uploaded_img_infos_paths):
+                uploaded_img_infos[i] = img_info
+            for blob_offsets in ds_fs.blob_offsets:
+                blob_file = None
+                for blob_file_info in blob_file_infos:
+                    if Path(blob_file_info.name).stem == removesuffix(
+                        Path(blob_offsets).name, OFFSETS_PKL_SUFFIX
+                    ):
+                        blob_file = blob_file_info
+                        break
+
+                if blob_file is None:
+                    raise ValueError(
+                        f"Cannot find blob file for offsets: {blob_offsets}. "
+                        f"Check the Team File directory '{TF_BLOB_DIR}', corresponding blob file should be uploaded."
+                    )
+                uploaded_img_infos_offsets = api.image.upload_by_offsets_generator(
+                    dataset=dataset,
+                    team_file_id=blob_file.id,
+                    offsets_file_path=blob_offsets,
+                    progress_cb=ds_progress,
+                    metas={names[i]: metas[i] for i in offset_indices},
+                )
+                for img_info_batch in uploaded_img_infos_offsets:
+                    for img_info in img_info_batch:
+                        idx = offset_name_to_idx.get(img_info.name)
+                        if idx is not None:
+                            uploaded_img_infos[idx] = img_info
         elif img_infos_count != 0:
             if img_infos_count != len(names):
                 raise ValueError(
                     f"Cannot upload Project: image info files count ({img_infos_count}) doesn't match with images count ({len(names)}) that are going to be uploaded. "
                     "Check the directory structure, all annotation files should have corresponding image info files."
                 )
-            # uploading links and hashes (the code from api.image.upload_ids)
-            links, links_names, links_order, links_metas = [], [], [], []
-            hashes, hashes_names, hashes_order, hashes_metas = [], [], [], []
-            dataset_id = dataset.id
-            for idx, (name, info, meta) in enumerate(zip(names, img_infos, metas)):
-                if info.link is not None:
-                    links.append(info.link)
-                    links_names.append(name)
-                    links_order.append(idx)
-                    links_metas.append(meta)
-                else:
-                    hashes.append(info.hash)
-                    hashes_names.append(name)
-                    hashes_order.append(idx)
-                    hashes_metas.append(meta)
-
-            result = [None] * len(names)
-            if len(links) > 0:
-                res_infos_links = api.image.upload_links(
-                    dataset_id,
-                    links_names,
-                    links,
-                    ds_progress,
-                    metas=links_metas,
-                )
-                for info, pos in zip(res_infos_links, links_order):
-                    result[pos] = info
-
-            if len(hashes) > 0:
-                res_infos_hashes = api.image.upload_hashes(
-                    dataset_id,
-                    hashes_names,
-                    hashes,
-                    ds_progress,
-                    metas=hashes_metas,
-                )
-                for info, pos in zip(res_infos_hashes, hashes_order):
-                    result[pos] = info
-
-            uploaded_img_infos = result
+            uploaded_img_infos = api.image.upload_ids(
+                dataset_id=dataset.id,
+                names=names,
+                ids=[img_info.id for img_info in img_infos],
+                progress_cb=ds_progress,
+                metas=metas,
+            )
         else:
             raise ValueError(
                 "Cannot upload Project: img_paths is empty and img_infos_paths is empty"
@@ -4343,6 +4605,7 @@ def download_project(
     save_image_meta: bool = False,
     images_ids: Optional[List[int]] = None,
     resume_download: Optional[bool] = False,
+    **kwargs,
 ) -> None:
     """
     Download image project to the local directory.
@@ -4353,7 +4616,7 @@ def download_project(
     :type project_id: int
     :param dest_dir: Destination path to local directory.
     :type dest_dir: str
-    :param dataset_ids: Specified list of Dataset IDs which will be downloaded. Datasets could be downloaded from different projects but with the same data type.
+    :param dataset_ids: Specified list of Dataset IDs which will be downloaded.
     :type dataset_ids: list(int), optional
     :param log_progress: Show downloading logs in the output. By default, it is True.
     :type log_progress: bool, optional
@@ -4375,6 +4638,9 @@ def download_project(
     :type images_ids: list(int), optional
     :param resume_download: Resume download enables to download only missing files avoiding erase of existing files.
     :type resume_download: bool, optional
+    :param download_blob_files: Default is False. It will download images in classic way.
+        If True, it will download blob files, if they are present in the project, to optimize download process.
+    :type download_blob_files: bool, optional
     :return: None.
     :rtype: NoneType
     :Usage example:
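Not part of the diff: a minimal sketch of inspecting the result of a blob-enabled download through the new `blob_files` / `blob_dir` properties (IDs and paths are hypothetical, and `sly.download_project` is assumed to be the exported module-level function documented here):

```python
import supervisely as sly

api = sly.Api.from_env()

sly.download_project(api, project_id=123, dest_dir="/data/lemons", download_blob_files=True)

# Blob .tar files are stored in <dest_dir>/blob and tracked on the Project object
project = sly.Project("/data/lemons", sly.OpenMode.READ)
print(project.blob_dir)    # '/data/lemons/blob'
print(project.blob_files)  # e.g. ['<blob_id>.tar'] if the project had blob-backed images
```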
@@ -4426,6 +4692,7 @@ def download_project(
             save_image_meta=save_image_meta,
             images_ids=images_ids,
             resume_download=resume_download,
+            **kwargs,
         )
     else:
         _download_project_optimized(
@@ -4440,6 +4707,7 @@ def download_project(
             save_images=save_images,
             log_progress=log_progress,
             images_ids=images_ids,
+            **kwargs,
         )


@@ -4455,6 +4723,7 @@ def _download_project_optimized(
     save_images=True,
     log_progress=True,
     images_ids: List[int] = None,
+    **kwargs,
 ):
     project_info = api.project.get_info_by_id(project_id)
     project_id = project_info.id
@@ -4736,6 +5005,209 @@ def create_readme(
     return readme_path


+def _dataset_blob_structure_md(
+    project_fs: Project,
+    project_info: sly.ProjectInfo,
+    entity_limit: Optional[int] = 2,
+) -> str:
+    """Creates a markdown string with the dataset structure of the project.
+    Supports only images and videos projects.
+
+    :project_fs: Project file system.
+    :type project_fs: :class:`Project<supervisely.project.project.Project>`
+    :param project_info: Project information.
+    :type project_info: :class:`ProjectInfo<supervisely.project.project_info.ProjectInfo>`
+    :param entity_limit: The maximum number of entities to display in the README.
+    :type entity_limit: int, optional
+    :return: Markdown string with the dataset structure of the project.
+    :rtype: str
+    """
+    supported_project_types = [sly.ProjectType.IMAGES.value]
+    if project_info.type not in supported_project_types:
+        return ""
+
+    entity_icons = {
+        "images": " 🏞️ ",
+        "blob_files": " 📦 ",
+        "pkl_files": " 📄 ",
+        "annotations": " 📝 ",
+    }
+    dataset_icon = " 📂 "
+    folder_icon = " 📁 "
+
+    result_md = f"🗂️ {project_info.name}<br>"
+
+    # Add project-level blob files
+    if os.path.exists(project_fs.blob_dir) and project_fs.blob_files:
+        result_md += "┣" + folder_icon + f"{Project.blob_dir_name}<br>"
+        blob_files = [entry.name for entry in os.scandir(project_fs.blob_dir) if entry.is_file()]
+
+        for idx, blob_file in enumerate(blob_files):
+            if idx == entity_limit and len(blob_files) > entity_limit:
+                result_md += "┃ ┗ ... " + str(len(blob_files) - entity_limit) + " more<br>"
+                break
+            symbol = "┗" if idx == len(blob_files) - 1 or idx == entity_limit - 1 else "┣"
+            result_md += "┃ " + symbol + entity_icons["blob_files"] + blob_file + "<br>"
+
+    # Build a dataset hierarchy tree
+    dataset_tree = {}
+    root_datasets = []
+
+    # First pass: create nodes for all datasets
+    for dataset in project_fs.datasets:
+        dataset_tree[dataset.directory] = {
+            "dataset": dataset,
+            "children": [],
+            "parent_dir": os.path.dirname(dataset.directory) if dataset.parents else None,
+        }
+
+    # Second pass: build parent-child relationships
+    for dir_path, node in dataset_tree.items():
+        parent_dir = node["parent_dir"]
+        if parent_dir in dataset_tree:
+            dataset_tree[parent_dir]["children"].append(dir_path)
+        else:
+            root_datasets.append(dir_path)
+
+    # Function to recursively render the dataset tree
+    def render_tree(dir_path, prefix=""):
+        nonlocal result_md
+        node = dataset_tree[dir_path]
+        dataset = node["dataset"]
+        children = node["children"]
+
+        # Create dataset display with proper path
+        dataset_path = Dataset._get_dataset_path(dataset.name, dataset.parents)
+        result_md += prefix + "┣" + dataset_icon + f"[{dataset.name}]({dataset_path})<br>"
+
+        # Set indentation for dataset content
+        content_prefix = prefix + "┃ "
+
+        # Add pkl files at the dataset level
+        offset_files = [
+            entry.name
+            for entry in os.scandir(dataset.directory)
+            if entry.is_file() and entry.name.endswith(".pkl")
+        ]
+
+        if offset_files:
+            for idx, pkl_file in enumerate(offset_files):
+                last_file = idx == len(offset_files) - 1
+                has_more_content = (
+                    os.path.exists(dataset.img_dir) or os.path.exists(dataset.ann_dir) or children
+                )
+                symbol = "┗" if last_file and not has_more_content else "┣"
+                result_md += content_prefix + symbol + entity_icons["pkl_files"] + pkl_file + "<br>"
+
+        # Add img directory
+        if os.path.exists(dataset.img_dir):
+            has_ann_dir = os.path.exists(dataset.ann_dir)
+            has_more_content = has_ann_dir or children
+            symbol = "┣" if has_more_content else "┗"
+            result_md += content_prefix + symbol + folder_icon + "img<br>"
+
+            # Add image files
+            entities = [entry.name for entry in os.scandir(dataset.img_dir) if entry.is_file()]
+            entities = sorted(entities)
+            selected_entities = entities[: min(len(entities), entity_limit)]
+
+            img_prefix = content_prefix + "┃ "
+            for idx, entity in enumerate(selected_entities):
+                last_img = idx == len(selected_entities) - 1
+                symbol = "┗" if last_img and len(entities) <= entity_limit else "┣"
+                result_md += img_prefix + symbol + entity_icons["images"] + entity + "<br>"
+
+            if len(entities) > entity_limit:
+                result_md += img_prefix + "┗ ... " + str(len(entities) - entity_limit) + " more<br>"
+
+        # Add ann directory
+        if os.path.exists(dataset.ann_dir):
+            has_more_content = bool(children)
+            symbol = "┣"
+            result_md += content_prefix + "┣" + folder_icon + "ann<br>"
+
+            anns = [entry.name for entry in os.scandir(dataset.ann_dir) if entry.is_file()]
+            anns = sorted(anns)
+
+            # Try to match annotations with displayed images
+            possible_anns = [f"{entity}.json" for entity in selected_entities]
+            matched_anns = [pa for pa in possible_anns if pa in anns]
+
+            # Add additional annotations if we haven't reached the limit
+            if len(matched_anns) < min(entity_limit, len(anns)):
+                for ann in anns:
+                    if ann not in matched_anns and len(matched_anns) < entity_limit:
+                        matched_anns.append(ann)
+
+            ann_prefix = content_prefix + "┃ "
+            for idx, ann in enumerate(matched_anns):
+                last_ann = idx == len(matched_anns) - 1
+                symbol = "┗" if last_ann and len(anns) <= entity_limit else "┣"
+                result_md += ann_prefix + symbol + entity_icons["annotations"] + ann + "<br>"
+
+            if len(anns) > entity_limit:
+                result_md += ann_prefix + "┗ ... " + str(len(anns) - entity_limit) + " more<br>"
+
+            if not has_more_content:
+                result_md += content_prefix + "...<br>"
+        # Recursively render child datasets
+        for idx, child_dir in enumerate(children):
+            render_tree(child_dir, content_prefix)
+
+    # Start rendering from root datasets
+    for root_dir in sorted(root_datasets):
+        render_tree(root_dir)
+
+    return result_md
+
+
+def create_blob_readme(
+    project_fs: Project,
+    project_info: ProjectInfo,
+) -> str:
+    """Creates a README.md file using the template, adds general information
+    about the project and creates a dataset structure section.
+
+    :param project_fs: Project file system.
+    :type project_fs: :class:`Project<supervisely.project.project.Project>`
+    :param project_info: Project information.
+    :type project_info: :class:`ProjectInfo<supervisely.project.project_info.ProjectInfo>`
+    :return: Path to the created README.md file.
+    :rtype: str
+
+    :Usage example:
+
+     .. code-block:: python
+
+        import supervisely as sly
+
+        api = sly.Api.from_env()
+
+        project_id = 123
+        project_dir = "/path/to/project"
+
+        readme_path = sly.create_readme(project_dir, project_id, api)
+
+        print(f"README.md file was created at {readme_path}")
+    """
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    template_path = os.path.join(current_path, "readme_template.md")
+    with open(template_path, "r") as file:
+        template = file.read()
+
+    readme_path = os.path.join(project_fs.directory, "README.md")
+
+    template = template.replace("{{general_info}}", _project_info_md(project_info))
+
+    template = template.replace(
+        "{{dataset_structure_info}}", _dataset_blob_structure_md(project_fs, project_info)
+    )
+
+    with open(readme_path, "w") as f:
+        f.write(template)
+    return readme_path
+
+
 def _project_info_md(project_info: sly.ProjectInfo) -> str:
     """Creates a markdown string with general information about the project
     using the fields of the ProjectInfo NamedTuple.
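Not part of the diff: a minimal sketch of calling the new `create_blob_readme` helper added above directly, assuming this module is `supervisely.project.project` (IDs and paths are hypothetical):

```python
import supervisely as sly
from supervisely.project.project import create_blob_readme

api = sly.Api.from_env()

# Hypothetical project downloaded with download_blob_files=True
project_fs = sly.Project("/data/lemons", sly.OpenMode.READ)
project_info = api.project.get_info_by_id(123)

readme_path = create_blob_readme(project_fs=project_fs, project_info=project_info)
print(readme_path)  # <project_dir>/README.md
```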
@@ -4784,6 +5256,9 @@ def _dataset_structure_md(
     entity_icons = {
         "images": " 🏞️ ",
         "videos": " 🎥 ",
+        "blob_files": " 📦 ",
+        "pkl_files": " 📄 ",
+        "annotations": " 📝 ",
     }
     dataset_icon = " 📂 "
     list_function = list_functions[project_info.type]
@@ -4791,6 +5266,8 @@ def _dataset_structure_md(

     result_md = f"🗂️ {project_info.name}<br>"

+    # if project_info
+
     for parents, dataset_info in api.dataset.tree(project_info.id):
         # The dataset path is needed to create a clickable link in the README.
         dataset_path = Dataset._get_dataset_path(dataset_info.name, parents)
@@ -4841,6 +5318,8 @@ async def _download_project_async(
     switch_size = kwargs.get("switch_size", 1.28 * 1024 * 1024)
     # batch size for bulk download
     batch_size = kwargs.get("batch_size", 100)
+    # control whether to download blob files
+    download_blob_files = kwargs.get("download_blob_files", False)

     if semaphore is None:
         semaphore = api.get_default_semaphore()
@@ -4890,11 +5369,19 @@ async def _download_project_async(
         small_images = []
         large_images = []
         dataset_images = []
+        blob_files_to_download = {}
+        blob_images = []
+
         async for image_batch in all_images:
             for image in image_batch:
                 if images_ids is None or image.id in images_ids:
                     dataset_images.append(image)
-                    if image.size < switch_size:
+                    # Check for images with blob offsets
+
+                    if download_blob_files and image.related_data_id is not None:
+                        blob_files_to_download[image.related_data_id] = image.download_id
+                        blob_images.append(image)
+                    elif image.size < switch_size:
                         small_images.append(image)
                     else:
                         large_images.append(image)
@@ -4903,7 +5390,7 @@ async def _download_project_async(
         if log_progress is True:
             ds_progress = tqdm_sly(
                 desc="Downloading images from {!r}".format(dataset.name),
-                total=len(small_images) + len(large_images),
+                total=len(small_images) + len(large_images) + len(blob_images),
                 leave=False,
             )

@@ -4939,14 +5426,82 @@ async def _download_project_async(
             )
             return created_tasks

+        # Download blob files if required
+        if download_blob_files and len(blob_files_to_download) > 0:
+            blob_paths = []
+            download_ids = []
+            # Process each blob file
+            for blob_file_id, download_id in blob_files_to_download.items():
+                if blob_file_id not in project_fs.blob_files:
+                    # Download the blob file
+                    blob_paths.append(os.path.join(project_fs.blob_dir, f"{blob_file_id}.tar"))
+                    download_ids.append(download_id)
+            await api.image.download_blob_files_async(
+                project_id=project_id,
+                download_ids=download_ids,
+                paths=blob_paths,
+                semaphore=semaphore,
+                log_progress=(True if log_progress or progress_cb is not None else False),
+            )
+            for blob_file_id, download_id in blob_files_to_download.items():
+                project_fs.add_blob_file(blob_file_id)
+
+                # Process blob image offsets
+                offsets_file_name = f"{blob_file_id}{OFFSETS_PKL_SUFFIX}"
+                offsets_file_path = os.path.join(dataset_fs.directory, offsets_file_name)
+
+                total_offsets_count = 0  # for logging
+                current_batch = []
+                for img in blob_images:
+                    if img.related_data_id == blob_file_id:
+                        blob_image_info = BlobImageInfo(
+                            name=img.name,
+                            offset_start=img.offset_start,
+                            offset_end=img.offset_end,
+                        )
+                        current_batch.append(blob_image_info)
+                        if len(current_batch) >= OFFSETS_PKL_BATCH_SIZE:
+                            BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
+                            total_offsets_count += len(current_batch)
+                            current_batch = []
+                if len(current_batch) > 0:
+                    BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
+                    total_offsets_count += len(current_batch)
+                if total_offsets_count > 0:
+                    logger.debug(
+                        f"Saved {total_offsets_count} image offsets for {blob_file_id} to {offsets_file_path} in {(total_offsets_count + OFFSETS_PKL_BATCH_SIZE - 1) // OFFSETS_PKL_BATCH_SIZE} batches"
+                    )
+            offset_tasks = []
+            # Download annotations for images with offsets
+            for offsets_batch in batched(blob_images, batch_size=batch_size):
+                offset_task = _download_project_items_batch_async(
+                    api=api,
+                    dataset_id=dataset_id,
+                    img_infos=offsets_batch,
+                    meta=meta,
+                    dataset_fs=dataset_fs,
+                    id_to_tagmeta=id_to_tagmeta,
+                    semaphore=semaphore,
+                    save_images=False,
+                    save_image_info=save_image_info,
+                    only_image_tags=only_image_tags,
+                    progress_cb=ds_progress,
+                )
+                offset_tasks.append(offset_task)
+            created_tasks = await run_tasks_with_delay(offset_tasks, 0.05)
+            await asyncio.gather(*created_tasks)

         tasks = []
+        # Check which images need to be downloaded
         small_images = await check_items(small_images)
         large_images = await check_items(large_images)

+        # If only one small image, treat it as a large image for efficiency
         if len(small_images) == 1:
             large_images.append(small_images.pop())
-        for images_batch in batched(small_images, batch_size=batch_size):

+        # Create batch download tasks
+        for images_batch in batched(small_images, batch_size=batch_size):
             task = _download_project_items_batch_async(
                 api=api,
                 dataset_id=dataset_id,
4961
5516
  progress_cb=ds_progress,
4962
5517
  )
4963
5518
  tasks.append(task)
5519
+
5520
+ # Create individual download tasks for large images
4964
5521
  for image in large_images:
4965
5522
  task = _download_project_item_async(
4966
5523
  api=api,
@@ -4995,7 +5552,11 @@ async def _download_project_async(
4995
5552
  dataset_fs.delete_item(item_name)
4996
5553
 
4997
5554
  try:
4998
- create_readme(dest_dir, project_id, api)
5555
+ if download_blob_files:
5556
+ project_info = api.project.get_info_by_id(project_id)
5557
+ create_blob_readme(project_fs=project_fs, project_info=project_info)
5558
+ else:
5559
+ create_readme(dest_dir, project_id, api)
4999
5560
  except Exception as e:
5000
5561
  logger.info(f"There was an error while creating README: {e}")
5001
5562