supervisely 6.73.343__py3-none-any.whl → 6.73.345__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,16 +31,22 @@ import supervisely as sly
31
31
  from supervisely._utils import (
32
32
  abs_url,
33
33
  batched,
34
- generate_free_name,
35
34
  get_or_create_event_loop,
36
35
  is_development,
36
+ removesuffix,
37
37
  snake_to_human,
38
38
  )
39
39
  from supervisely.annotation.annotation import ANN_EXT, Annotation, TagCollection
40
40
  from supervisely.annotation.obj_class import ObjClass
41
41
  from supervisely.annotation.obj_class_collection import ObjClassCollection
42
- from supervisely.api.api import Api, ApiContext
43
- from supervisely.api.image_api import ImageInfo
42
+ from supervisely.api.api import Api, ApiContext, ApiField
43
+ from supervisely.api.image_api import (
44
+ OFFSETS_PKL_BATCH_SIZE,
45
+ OFFSETS_PKL_SUFFIX,
46
+ BlobImageInfo,
47
+ ImageInfo,
48
+ )
49
+ from supervisely.api.project_api import ProjectInfo
44
50
  from supervisely.collection.key_indexed_collection import (
45
51
  KeyIndexedCollection,
46
52
  KeyObject,
@@ -68,7 +74,9 @@ from supervisely.io.json import dump_json_file, dump_json_file_async, load_json_
68
74
  from supervisely.project.project_meta import ProjectMeta
69
75
  from supervisely.project.project_type import ProjectType
70
76
  from supervisely.sly_logger import logger
71
- from supervisely.task.progress import Progress, tqdm_sly
77
+ from supervisely.task.progress import tqdm_sly
78
+
79
+ TF_BLOB_DIR = "blob-files" # directory for project blob files in team files
72
80
 
73
81
 
74
82
  class CustomUnpickler(pickle.Unpickler):
@@ -224,6 +232,7 @@ class Dataset(KeyObject):
224
232
  seg_dir_name = "seg"
225
233
  meta_dir_name = "meta"
226
234
  datasets_dir_name = "datasets"
235
+ blob_dir_name = "blob"
227
236
 
228
237
  def __init__(
229
238
  self,
@@ -273,6 +282,7 @@ class Dataset(KeyObject):
273
282
  self._project_dir = project_dir
274
283
  self._name = full_ds_name
275
284
  self._short_name = short_ds_name
285
+ self._blob_offset_paths = []
276
286
 
277
287
  if self.dataset_id is not None:
278
288
  self._read_api()
@@ -537,6 +547,23 @@ class Dataset(KeyObject):
537
547
  """
538
548
  return os.path.join(self.directory, self.meta_dir_name)
539
549
 
550
+ @property
551
+ def blob_offsets(self):
552
+ """
553
+ List of paths to the dataset blob offset files.
554
+
555
+ :return: List of paths to the dataset blob offset files.
556
+ :rtype: :class:`List[str]`
557
+ """
558
+ return self._blob_offset_paths
559
+
560
+ @blob_offsets.setter
561
+ def blob_offsets(self, value: List[str]):
562
+ """
563
+ Set the list of paths to the dataset blob offset files.
564
+ """
565
+ self._blob_offset_paths = value
566
+
540
567
  @classmethod
541
568
  def _has_valid_ext(cls, path: str) -> bool:
542
569
  """
@@ -552,16 +579,36 @@ class Dataset(KeyObject):
552
579
  Consistency checks. Every item must have an annotation, and the correspondence must be one to one.
553
580
  If not, an exception is raised.
554
581
  """
555
- if not dir_exists(self.item_dir):
582
+ blob_offset_paths = list_files(
583
+ self.directory, filter_fn=lambda x: x.endswith(OFFSETS_PKL_SUFFIX)
584
+ )
585
+ has_blob_offsets = len(blob_offset_paths) > 0
586
+
587
+ if not dir_exists(self.item_dir) and not has_blob_offsets:
556
588
  raise FileNotFoundError("Item directory not found: {!r}".format(self.item_dir))
557
589
  if not dir_exists(self.ann_dir):
558
590
  raise FileNotFoundError("Annotation directory not found: {!r}".format(self.ann_dir))
559
591
 
560
592
  raw_ann_paths = list_files(self.ann_dir, [ANN_EXT])
561
- img_paths = list_files(self.item_dir, filter_fn=self._has_valid_ext)
562
-
563
593
  raw_ann_names = set(os.path.basename(path) for path in raw_ann_paths)
564
- img_names = [os.path.basename(path) for path in img_paths]
594
+
595
+ if dir_exists(self.item_dir):
596
+ img_paths = list_files(self.item_dir, filter_fn=self._has_valid_ext)
597
+ img_names = [os.path.basename(path) for path in img_paths]
598
+ else:
599
+ img_names = []
600
+
601
+ # If we have blob offset files, add the image names from those
602
+ if has_blob_offsets:
603
+ self.blob_offsets = blob_offset_paths
604
+ for offset_file_path in self.blob_offsets:
605
+ try:
606
+ blob_img_info_lists = BlobImageInfo.load_from_pickle_generator(offset_file_path)
607
+ for blob_img_info_list in blob_img_info_lists:
608
+ for blob_img_info in blob_img_info_list:
609
+ img_names.append(blob_img_info.name)
610
+ except Exception as e:
611
+ logger.warning(f"Failed to read blob offset file {offset_file_path}: {str(e)}")
565
612
 
566
613
  if len(img_names) == 0 and len(raw_ann_names) == 0:
567
614
  logger.info("Dataset {!r} is empty".format(self.name))
@@ -1308,7 +1355,7 @@ class Dataset(KeyObject):
1308
1355
 
1309
1356
  img_path = "/home/admin/Pictures/Clouds.jpeg"
1310
1357
  img_np = sly.image.read(img_path)
1311
- img_bytes = sly.image.write_bytes(img_np, "jpeg")
1358
+ img_bytes = sly.image.write_bytes(img_np, "jpeg")
1312
1359
  coroutine = ds.add_item_raw_bytes_async("IMG_050.jpeg", img_bytes)
1313
1360
  run_coroutine(coroutine)
1314
1361
 
@@ -1691,7 +1738,7 @@ class Dataset(KeyObject):
1691
1738
  "objects":[],
1692
1739
  "customBigData":{}
1693
1740
  }
1694
-
1741
+
1695
1742
  coroutine = ds.set_ann_dict_async("IMG_8888.jpeg", new_ann_json)
1696
1743
  run_coroutine(coroutine)
1697
1744
  """
@@ -1723,7 +1770,7 @@ class Dataset(KeyObject):
1723
1770
 
1724
1771
  height, width = 500, 700
1725
1772
  new_ann = sly.Annotation((height, width))
1726
-
1773
+
1727
1774
  coroutine = ds.set_ann_async("IMG_0748.jpeg", new_ann)
1728
1775
  run_coroutine(coroutine)
1729
1776
  """
@@ -2017,6 +2064,84 @@ class Dataset(KeyObject):
2017
2064
  progress_cb=progress_cb,
2018
2065
  )
2019
2066
 
2067
+ def get_blob_img_bytes(self, image_name: str) -> bytes:
2068
+ """
2069
+ Get image bytes from blob file.
2070
+
2071
+ :param image_name: Image name with extension.
2072
+ :type image_name: :class:`str`
2073
+ :return: Bytes of the image.
2074
+ :rtype: :class:`bytes`
2075
+
2076
+ :Usage example:
2077
+
2078
+ .. code-block:: python
2079
+
2080
+ import supervisely as sly
2081
+ dataset_path = "/path/to/project/lemons_annotated/ds1"
2082
+ dataset = sly.Dataset(dataset_path, sly.OpenMode.READ)
2083
+ image_name = "IMG_0748.jpeg"
2084
+
2085
+ img_bytes = dataset.get_blob_img_bytes(image_name)
2086
+ """
2087
+
2088
+ if self.project_dir is None:
2089
+ raise RuntimeError("Project directory is not set. Cannot get blob image bytes.")
2090
+
2091
+ blob_image_info = None
2092
+
2093
+ for offset in self.blob_offsets:
2094
+ for batch in BlobImageInfo.load_from_pickle_generator(offset):
2095
+ for file in batch:
2096
+ if file.name == image_name:
2097
+ blob_image_info = file
2098
+ blob_file_name = removesuffix(Path(offset).name, OFFSETS_PKL_SUFFIX)
2099
+ break
2100
+ if blob_image_info is None:
2101
+ logger.debug(
2102
+ f"Image '{image_name}' not found in blob offsets. "
2103
+ f"Make sure that the image is stored in the blob file."
2104
+ )
2105
+ return None
2106
+
2107
+ blob_file_path = os.path.join(self.project_dir, self.blob_dir_name, blob_file_name + ".tar")
2108
+ if file_exists(blob_file_path):
2109
+ with open(blob_file_path, "rb") as f:
2110
+ f.seek(blob_image_info.offset_start)
2111
+ img_bytes = f.read(blob_image_info.offset_end - blob_image_info.offset_start)
2112
+ else:
2113
+ logger.debug(
2114
+ f"Blob file '{blob_file_path}' not found. "
2115
+ f"Make sure that the blob file exists in the specified directory."
2116
+ )
2117
+ img_bytes = None
2118
+ return img_bytes
2119
+
2120
+ def get_blob_img_np(self, image_name: str) -> np.ndarray:
2121
+ """
2122
+ Get image as numpy array from blob file.
2123
+
2124
+ :param image_name: Image name with extension.
2125
+ :type image_name: :class:`str`
2126
+ :return: Numpy array of the image.
2127
+ :rtype: :class:`numpy.ndarray`
2128
+
2129
+ :Usage example:
2130
+
2131
+ .. code-block:: python
2132
+
2133
+ import supervisely as sly
2134
+ dataset_path = "/path/to/project/lemons_annotated/ds1"
2135
+ dataset = sly.Dataset(dataset_path, sly.OpenMode.READ)
2136
+ image_name = "IMG_0748.jpeg"
2137
+
2138
+ img_np = dataset.get_blob_img_np(image_name)
2139
+ """
2140
+ img_bytes = self.get_blob_img_bytes(image_name)
2141
+ if img_bytes is None:
2142
+ return None
2143
+ return sly_image.read_bytes(img_bytes)
2144
+
2020
2145
 
2021
2146
  class Project:
2022
2147
  """
@@ -2036,6 +2161,7 @@ class Project:
2036
2161
  """
2037
2162
 
2038
2163
  dataset_class = Dataset
2164
+ blob_dir_name = "blob"
2039
2165
 
2040
2166
  class DatasetDict(KeyIndexedCollection):
2041
2167
  """
@@ -2075,6 +2201,7 @@ class Project:
2075
2201
 
2076
2202
  parent_dir, name = Project._parse_path(directory)
2077
2203
  self._parent_dir = parent_dir
2204
+ self._blob_dir = os.path.join(directory, self.blob_dir_name)
2078
2205
  self._api = api
2079
2206
  self.project_id = project_id
2080
2207
 
@@ -2086,7 +2213,7 @@ class Project:
2086
2213
  self._name = name
2087
2214
  self._datasets = Project.DatasetDict() # ds_name -> dataset object
2088
2215
  self._meta = None
2089
-
2216
+ self._blob_files = []
2090
2217
  if project_id is not None:
2091
2218
  self._read_api()
2092
2219
  elif mode is OpenMode.READ:
@@ -2138,6 +2265,25 @@ class Project:
2138
2265
  """
2139
2266
  return self._parent_dir
2140
2267
 
2268
+ @property
2269
+ def blob_dir(self) -> str:
2270
+ """
2271
+ Directory for project blobs.
2272
+ Blobs are .tar files with images. Used for fast data transfer.
2273
+
2274
+ :return: Path to project blob directory
2275
+ :rtype: :class:`str`
2276
+ :Usage example:
2277
+
2278
+ .. code-block:: python
2279
+
2280
+ import supervisely as sly
2281
+ project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
2282
+ print(project.blob_dir)
2283
+ # Output: '/home/admin/work/supervisely/projects/lemons_annotated/blob'
2284
+ """
2285
+ return self._blob_dir
2286
+
2141
2287
  @property
2142
2288
  def name(self) -> str:
2143
2289
  """
@@ -2259,6 +2405,61 @@ class Project:
2259
2405
  """
2260
2406
  return sum(len(ds) for ds in self._datasets)
2261
2407
 
2408
+ @property
2409
+ def blob_files(self) -> List[str]:
2410
+ """
2411
+ List of blob files.
2412
+
2413
+ :return: List of blob files
2414
+ :rtype: :class:`list`
2415
+ :Usage example:
2416
+
2417
+ .. code-block:: python
2418
+
2419
+ import supervisely as sly
2420
+ project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
2421
+ print(project.blob_files)
2422
+ # Output: []
2423
+ """
2424
+ return self._blob_files
2425
+
2426
+ @blob_files.setter
2427
+ def blob_files(self, blob_files: List[str]) -> None:
2428
+ """
2429
+ Sets blob files to the project.
2430
+
2431
+ :param blob_files: List of blob files.
2432
+ :type
2433
+ :return: None
2434
+ :rtype: NoneType
2435
+ :Usage example:
2436
+
2437
+ .. code-block:: python
2438
+
2439
+ import supervisely as sly
2440
+ project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
2441
+ project.blob_files = ["blob_file.tar"]
2442
+ """
2443
+ self._blob_files = blob_files
2444
+
2445
+ def add_blob_file(self, file_name: str) -> None:
2446
+ """
2447
+ Adds blob file to the project.
2448
+
2449
+ :param file_name: File name.
2450
+ :type file_name: :class:`str`
2451
+ :return: None
2452
+ :rtype: NoneType
2453
+ :Usage example:
2454
+
2455
+ .. code-block:: python
2456
+
2457
+ import supervisely as sly
2458
+ project = sly.Project("/home/admin/work/supervisely/projects/lemons_annotated", sly.OpenMode.READ)
2459
+ project.add_blob_file("blob_file.tar")
2460
+ """
2461
+ self._blob_files.append(file_name)
2462
+
2262
2463
  def get_classes_stats(
2263
2464
  self,
2264
2465
  dataset_names: Optional[List[str]] = None,
@@ -2296,6 +2497,10 @@ class Project:
2296
2497
  def _read(self):
2297
2498
  meta_json = load_json_file(self._get_project_meta_path())
2298
2499
  self._meta = ProjectMeta.from_json(meta_json)
2500
+ if dir_exists(self.blob_dir):
2501
+ self.blob_files = [Path(file).name for file in list_files(self.blob_dir)]
2502
+ else:
2503
+ self.blob_files = []
2299
2504
 
2300
2505
  ignore_dirs = self.dataset_class.ignorable_dirs() # dir names that can not be datasets
2301
2506
 
@@ -2350,6 +2555,7 @@ class Project:
2350
2555
  else:
2351
2556
  mkdir(self.directory)
2352
2557
  self.set_meta(ProjectMeta())
2558
+ self.blob_files = []
2353
2559
 
2354
2560
  def validate(self):
2355
2561
  # @TODO: remove?
@@ -3085,6 +3291,7 @@ class Project:
3085
3291
  save_images: bool = True,
3086
3292
  save_image_meta: bool = False,
3087
3293
  resume_download: bool = False,
3294
+ **kwargs,
3088
3295
  ) -> None:
3089
3296
  """
3090
3297
  Download project from Supervisely to the given directory.
@@ -3113,6 +3320,9 @@ class Project:
3113
3320
  :type save_images: :class:`bool`, optional
3114
3321
  :param save_image_meta: Download images metadata in JSON format or not.
3115
3322
  :type save_image_meta: :class:`bool`, optional
3323
+ :param download_blob_files: If False (default), images are downloaded one by one in the classic way.
3324
+ If True, blob files present in the project are downloaded instead, which speeds up the process.
3325
+ :type download_blob_files: bool, optional
3116
3326
  :return: None
3117
3327
  :rtype: NoneType
3118
3328
  :Usage example:
@@ -3151,6 +3361,7 @@ class Project:
3151
3361
  save_images=save_images,
3152
3362
  save_image_meta=save_image_meta,
3153
3363
  resume_download=resume_download,
3364
+ **kwargs,
3154
3365
  )
3155
3366
 
3156
3367
  @staticmethod
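The `download_blob_files` option documented above is forwarded through `**kwargs` down to `_download_project`. A minimal, hedged sketch of enabling it from user code; the project ID and destination path are placeholders:

    import supervisely as sly

    api = sly.Api.from_env()
    project_id = 123
    dest_dir = "/path/to/save/project"

    # Blob archives (.tar) and *_offsets.pkl files are saved locally instead of
    # fetching every image individually.
    sly.Project.download(api, project_id, dest_dir, download_blob_files=True)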
@@ -3731,7 +3942,7 @@ class Project:
3731
3942
 
3732
3943
  project_id = 8888
3733
3944
  save_directory = "/path/to/save/projects"
3734
-
3945
+
3735
3946
  coroutine = sly.Project.download_async(api, project_id, save_directory)
3736
3947
  run_coroutine(coroutine)
3737
3948
  """
@@ -3755,6 +3966,7 @@ class Project:
3755
3966
  save_image_meta=save_image_meta,
3756
3967
  images_ids=images_ids,
3757
3968
  resume_download=resume_download,
3969
+ **kwargs,
3758
3970
  )
3759
3971
 
3760
3972
  def to_coco(
@@ -4002,9 +4214,13 @@ def _download_project(
4002
4214
  save_image_meta: Optional[bool] = False,
4003
4215
  images_ids: Optional[List[int]] = None,
4004
4216
  resume_download: Optional[bool] = False,
4217
+ **kwargs,
4005
4218
  ):
4219
+ download_blob_files = kwargs.pop("download_blob_files", False)
4220
+
4006
4221
  dataset_ids = set(dataset_ids) if (dataset_ids is not None) else None
4007
4222
  project_fs = None
4223
+
4008
4224
  meta = ProjectMeta.from_json(api.project.get_meta(project_id, with_settings=True))
4009
4225
  if os.path.exists(dest_dir) and resume_download:
4010
4226
  dump_json_file(meta.to_json(), os.path.join(dest_dir, "meta.json"))
@@ -4029,6 +4245,7 @@ def _download_project(
4029
4245
 
4030
4246
  existing_datasets = {dataset.path: dataset for dataset in project_fs.datasets}
4031
4247
  for parents, dataset in api.dataset.tree(project_id):
4248
+ blob_files_to_download = {}
4032
4249
  dataset_path = Dataset._get_dataset_path(dataset.name, parents)
4033
4250
  dataset_id = dataset.id
4034
4251
  if dataset_ids is not None and dataset_id not in dataset_ids:
@@ -4065,6 +4282,7 @@ def _download_project(
4065
4282
  project_meta=meta,
4066
4283
  ):
4067
4284
  for batch in batched(images, batch_size):
4285
+ batch: List[ImageInfo]
4068
4286
  image_ids = [image_info.id for image_info in batch]
4069
4287
  image_names = [image_info.name for image_info in batch]
4070
4288
 
@@ -4085,18 +4303,97 @@ def _download_project(
4085
4303
  ):
4086
4304
  indexes_to_download.append(i)
4087
4305
 
4088
- # download images in numpy format
4306
+ # Collect images that were added to the project as offsets from an archive in Team Files
4307
+ indexes_with_offsets = []
4308
+ for idx in indexes_to_download:
4309
+ image_info: ImageInfo = batch[idx]
4310
+ if image_info.related_data_id is not None:
4311
+ blob_files_to_download[image_info.related_data_id] = image_info.download_id
4312
+ indexes_with_offsets.append(idx)
4313
+
4314
+ # Download images in numpy format
4089
4315
  batch_imgs_bytes = [None] * len(image_ids)
4090
4316
  if save_images and indexes_to_download:
4091
- for index, img in zip(
4092
- indexes_to_download,
4093
- api.image.download_bytes(
4094
- dataset_id,
4095
- [image_ids[i] for i in indexes_to_download],
4096
- progress_cb=ds_progress,
4097
- ),
4098
- ):
4099
- batch_imgs_bytes[index] = img
4317
+
4318
+ # Many small images stored in blob files: download the blob files themselves to speed up the process.
4319
+ if download_blob_files and len(indexes_with_offsets) > 0:
4320
+ bytes_indexes_to_download = indexes_to_download.copy()
4321
+ for blob_file_id, download_id in blob_files_to_download.items():
4322
+ if blob_file_id not in project_fs.blob_files:
4323
+ api.image.download_blob_file(
4324
+ project_id=project_id,
4325
+ download_id=download_id,
4326
+ path=os.path.join(project_fs.blob_dir, f"{blob_file_id}.tar"),
4327
+ log_progress=(
4328
+ True if log_progress or progress_cb is not None else False
4329
+ ),
4330
+ )
4331
+ project_fs.add_blob_file(blob_file_id)
4332
+
4333
+ # Process blob image offsets
4334
+ offsets_file_name = f"{blob_file_id}{OFFSETS_PKL_SUFFIX}"
4335
+ offsets_file_path = os.path.join(
4336
+ dataset_fs.directory, offsets_file_name
4337
+ )
4338
+
4339
+ # Initialize counter for total image offsets for this blob file
4340
+ total_offsets_count = 0
4341
+ current_batch = []
4342
+
4343
+ # Get offsets from image infos
4344
+ for idx in indexes_with_offsets:
4345
+ image_info = batch[idx]
4346
+ if image_info.related_data_id == blob_file_id:
4347
+ blob_image_info = BlobImageInfo(
4348
+ name=image_info.name,
4349
+ offset_start=image_info.offset_start,
4350
+ offset_end=image_info.offset_end,
4351
+ )
4352
+ current_batch.append(blob_image_info)
4353
+ bytes_indexes_to_download.remove(idx)
4354
+
4355
+ # When batch size is reached, dump to file
4356
+ if len(current_batch) >= OFFSETS_PKL_BATCH_SIZE:
4357
+ BlobImageInfo.dump_to_pickle(
4358
+ current_batch, offsets_file_path
4359
+ )
4360
+ total_offsets_count += len(current_batch)
4361
+ current_batch = []
4362
+ # Dump any remaining items in the last batch
4363
+ if len(current_batch) > 0:
4364
+ BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
4365
+ total_offsets_count += len(current_batch)
4366
+
4367
+ if total_offsets_count > 0:
4368
+ logger.debug(
4369
+ f"Saved {total_offsets_count} image offsets for {blob_file_id} to {offsets_file_path} in {(total_offsets_count + OFFSETS_PKL_BATCH_SIZE - 1) // OFFSETS_PKL_BATCH_SIZE} batches"
4370
+ )
4371
+ ds_progress(total_offsets_count)
4372
+
4373
+ image_ids_to_download = [
4374
+ image_ids[i] for i in bytes_indexes_to_download
4375
+ ]
4376
+ for index, img in zip(
4377
+ bytes_indexes_to_download,
4378
+ api.image.download_bytes(
4379
+ dataset_id,
4380
+ image_ids_to_download,
4381
+ progress_cb=ds_progress,
4382
+ ),
4383
+ ):
4384
+ batch_imgs_bytes[index] = img
4385
+ # Otherwise, download images one by one in the classic way
4386
+ else:
4387
+ image_ids_to_download = [image_ids[i] for i in indexes_to_download]
4388
+ for index, img in zip(
4389
+ indexes_to_download,
4390
+ api.image.download_bytes(
4391
+ dataset_id,
4392
+ image_ids_to_download,
4393
+ progress_cb=ds_progress,
4394
+ ),
4395
+ ):
4396
+ batch_imgs_bytes[index] = img
4100
4397
 
4101
4398
  if ds_progress is not None:
4102
4399
  ds_progress(len(batch) - len(indexes_to_download))
@@ -4160,7 +4457,11 @@ def _download_project(
4160
4457
  if item_name not in items_names_set:
4161
4458
  dataset_fs.delete_item(item_name)
4162
4459
  try:
4163
- create_readme(dest_dir, project_id, api)
4460
+ if download_blob_files:
4461
+ project_info = api.project.get_info_by_id(project_id)
4462
+ create_blob_readme(project_fs=project_fs, project_info=project_info)
4463
+ else:
4464
+ create_readme(dest_dir, project_id, api)
4164
4465
  except Exception as e:
4165
4466
  logger.info(f"There was an error while creating README: {e}")
4166
4467
 
@@ -4172,15 +4473,20 @@ def upload_project(
4172
4473
  project_name: Optional[str] = None,
4173
4474
  log_progress: bool = True,
4174
4475
  progress_cb: Optional[Union[tqdm, Callable]] = None,
4476
+ project_id: Optional[int] = None,
4175
4477
  ) -> Tuple[int, str]:
4176
4478
  project_fs = read_single_project(dir)
4177
- if project_name is None:
4178
- project_name = project_fs.name
4179
4479
 
4180
- if api.project.exists(workspace_id, project_name):
4181
- project_name = api.project.get_free_name(workspace_id, project_name)
4480
+ if not project_id:
4481
+ if project_name is None:
4482
+ project_name = project_fs.name
4182
4483
 
4183
- project = api.project.create(workspace_id, project_name, change_name_if_conflict=True)
4484
+ if api.project.exists(workspace_id, project_name):
4485
+ project_name = api.project.get_free_name(workspace_id, project_name)
4486
+
4487
+ project = api.project.create(workspace_id, project_name, change_name_if_conflict=True)
4488
+ else:
4489
+ project = api.project.get_info_by_id(project_id)
4184
4490
  updated_meta = api.project.update_meta(project.id, project_fs.meta.to_json())
4185
4491
 
4186
4492
  if progress_cb is not None:
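A hedged sketch of the new `project_id` parameter of `upload_project`: when it is given, the local project is uploaded into that existing project instead of creating a new one (workspace ID, project ID and path are placeholders; `upload_project` is defined in `supervisely.project.project`):

    import supervisely as sly
    from supervisely.project.project import upload_project

    api = sly.Api.from_env()
    project_id, project_name = upload_project(
        dir="/path/to/local/project",
        api=api,
        workspace_id=10,
        project_id=555,  # reuse an existing project; omit it to create a new one
    )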
@@ -4189,6 +4495,29 @@ def upload_project(
4189
4495
  # image_id_dct, anns_paths_dct = {}, {}
4190
4496
  dataset_map = {}
4191
4497
 
4498
+ total_blob_size = 0
4499
+ upload_blob_progress = None
4500
+ src_paths = []
4501
+ dst_paths = []
4502
+ for blob_file in project_fs.blob_files:
4503
+ if log_progress:
4504
+ total_blob_size += os.path.getsize(os.path.join(project_fs.blob_dir, blob_file))
4505
+ src_paths.append(os.path.join(project_fs.blob_dir, blob_file))
4506
+ dst_paths.append(os.path.join(f"/{TF_BLOB_DIR}", blob_file))
4507
+ if log_progress and len(src_paths) > 0:
4508
+ upload_blob_progress = tqdm_sly(
4509
+ desc="Uploading blob files", total=total_blob_size, unit="B", unit_scale=True
4510
+ )
4511
+ if len(src_paths) > 0:
4512
+ blob_file_infos = api.file.upload_bulk(
4513
+ team_id=project.team_id,
4514
+ src_paths=src_paths,
4515
+ dst_paths=dst_paths,
4516
+ progress_cb=upload_blob_progress,
4517
+ )
4518
+ else:
4519
+ blob_file_infos = []
4520
+
4192
4521
  for ds_fs in project_fs.datasets:
4193
4522
  if len(ds_fs.parents) > 0:
4194
4523
  parent = f"{os.path.sep}".join(ds_fs.parents)
@@ -4221,13 +4550,26 @@ def upload_project(
4221
4550
  else:
4222
4551
  img_infos.append(None)
4223
4552
 
4224
- img_paths = list(filter(lambda x: os.path.isfile(x), img_paths))
4553
+ # img_paths = list(filter(lambda x: os.path.isfile(x), img_paths))
4554
+ source_img_paths_len = len(img_paths)
4555
+ valid_indices = []
4556
+ valid_paths = []
4557
+ offset_indices = []
4558
+ for i, path in enumerate(img_paths):
4559
+ if os.path.isfile(path):
4560
+ valid_indices.append(i)
4561
+ valid_paths.append(path)
4562
+ else:
4563
+ offset_indices.append(i)
4564
+ img_paths = valid_paths
4225
4565
  ann_paths = list(filter(lambda x: os.path.isfile(x), ann_paths))
4566
+ # Create a mapping from name to index position for quick lookups
4567
+ offset_name_to_idx = {names[i]: i for i in offset_indices}
4226
4568
  metas = [{} for _ in names]
4227
4569
 
4228
4570
  img_infos_count = sum(1 for item in img_infos if item is not None)
4229
4571
 
4230
- if len(img_paths) == 0 and img_infos_count == 0:
4572
+ if len(img_paths) == 0 and img_infos_count == 0 and len(offset_indices) == 0:
4231
4573
  # Dataset is empty
4232
4574
  continue
4233
4575
 
@@ -4258,56 +4600,57 @@ def upload_project(
4258
4600
  merged_metas.append(merged_meta)
4259
4601
  metas = merged_metas
4260
4602
 
4261
- if len(img_paths) != 0:
4262
- uploaded_img_infos = api.image.upload_paths(
4263
- dataset.id, names, img_paths, ds_progress, metas=metas
4603
+ if len(img_paths) != 0 or len(offset_indices) != 0:
4604
+
4605
+ uploaded_img_infos = [None] * source_img_paths_len
4606
+ uploaded_img_infos_paths = api.image.upload_paths(
4607
+ dataset_id=dataset.id,
4608
+ names=[name for i, name in enumerate(names) if i in valid_indices],
4609
+ paths=img_paths,
4610
+ progress_cb=ds_progress,
4611
+ metas=[metas[i] for i in valid_indices],
4264
4612
  )
4613
+ for i, img_info in zip(valid_indices, uploaded_img_infos_paths):
4614
+ uploaded_img_infos[i] = img_info
4615
+ for blob_offsets in ds_fs.blob_offsets:
4616
+ blob_file = None
4617
+ for blob_file_info in blob_file_infos:
4618
+ if Path(blob_file_info.name).stem == removesuffix(
4619
+ Path(blob_offsets).name, OFFSETS_PKL_SUFFIX
4620
+ ):
4621
+ blob_file = blob_file_info
4622
+ break
4623
+
4624
+ if blob_file is None:
4625
+ raise ValueError(
4626
+ f"Cannot find blob file for offsets: {blob_offsets}. "
4627
+ f"Check the Team File directory '{TF_BLOB_DIR}', corresponding blob file should be uploaded."
4628
+ )
4629
+ uploaded_img_infos_offsets = api.image.upload_by_offsets_generator(
4630
+ dataset=dataset,
4631
+ team_file_id=blob_file.id,
4632
+ offsets_file_path=blob_offsets,
4633
+ progress_cb=ds_progress,
4634
+ metas={names[i]: metas[i] for i in offset_indices},
4635
+ )
4636
+ for img_info_batch in uploaded_img_infos_offsets:
4637
+ for img_info in img_info_batch:
4638
+ idx = offset_name_to_idx.get(img_info.name)
4639
+ if idx is not None:
4640
+ uploaded_img_infos[idx] = img_info
4265
4641
  elif img_infos_count != 0:
4266
4642
  if img_infos_count != len(names):
4267
4643
  raise ValueError(
4268
4644
  f"Cannot upload Project: image info files count ({img_infos_count}) doesn't match with images count ({len(names)}) that are going to be uploaded. "
4269
4645
  "Check the directory structure, all annotation files should have corresponding image info files."
4270
4646
  )
4271
- # uploading links and hashes (the code from api.image.upload_ids)
4272
- links, links_names, links_order, links_metas = [], [], [], []
4273
- hashes, hashes_names, hashes_order, hashes_metas = [], [], [], []
4274
- dataset_id = dataset.id
4275
- for idx, (name, info, meta) in enumerate(zip(names, img_infos, metas)):
4276
- if info.link is not None:
4277
- links.append(info.link)
4278
- links_names.append(name)
4279
- links_order.append(idx)
4280
- links_metas.append(meta)
4281
- else:
4282
- hashes.append(info.hash)
4283
- hashes_names.append(name)
4284
- hashes_order.append(idx)
4285
- hashes_metas.append(meta)
4286
-
4287
- result = [None] * len(names)
4288
- if len(links) > 0:
4289
- res_infos_links = api.image.upload_links(
4290
- dataset_id,
4291
- links_names,
4292
- links,
4293
- ds_progress,
4294
- metas=links_metas,
4295
- )
4296
- for info, pos in zip(res_infos_links, links_order):
4297
- result[pos] = info
4298
-
4299
- if len(hashes) > 0:
4300
- res_infos_hashes = api.image.upload_hashes(
4301
- dataset_id,
4302
- hashes_names,
4303
- hashes,
4304
- ds_progress,
4305
- metas=hashes_metas,
4306
- )
4307
- for info, pos in zip(res_infos_hashes, hashes_order):
4308
- result[pos] = info
4309
-
4310
- uploaded_img_infos = result
4647
+ uploaded_img_infos = api.image.upload_ids(
4648
+ dataset_id=dataset.id,
4649
+ names=names,
4650
+ ids=[img_info.id for img_info in img_infos],
4651
+ progress_cb=ds_progress,
4652
+ metas=metas,
4653
+ )
4311
4654
  else:
4312
4655
  raise ValueError(
4313
4656
  "Cannot upload Project: img_paths is empty and img_infos_paths is empty"
@@ -4343,6 +4686,7 @@ def download_project(
4343
4686
  save_image_meta: bool = False,
4344
4687
  images_ids: Optional[List[int]] = None,
4345
4688
  resume_download: Optional[bool] = False,
4689
+ **kwargs,
4346
4690
  ) -> None:
4347
4691
  """
4348
4692
  Download image project to the local directory.
@@ -4353,7 +4697,7 @@ def download_project(
4353
4697
  :type project_id: int
4354
4698
  :param dest_dir: Destination path to local directory.
4355
4699
  :type dest_dir: str
4356
- :param dataset_ids: Specified list of Dataset IDs which will be downloaded. Datasets could be downloaded from different projects but with the same data type.
4700
+ :param dataset_ids: Specified list of Dataset IDs which will be downloaded.
4357
4701
  :type dataset_ids: list(int), optional
4358
4702
  :param log_progress: Show downloading logs in the output. By default, it is True.
4359
4703
  :type log_progress: bool, optional
@@ -4375,6 +4719,9 @@ def download_project(
4375
4719
  :type images_ids: list(int), optional
4376
4720
  :param resume_download: Resume download enables to download only missing files avoiding erase of existing files.
4377
4721
  :type resume_download: bool, optional
4722
+ :param download_blob_files: If False (default), images are downloaded one by one in the classic way.
4723
+ If True, blob files present in the project are downloaded instead, which speeds up the process.
4724
+ :type download_blob_files: bool, optional
4378
4725
  :return: None.
4379
4726
  :rtype: NoneType
4380
4727
  :Usage example:
@@ -4426,6 +4773,7 @@ def download_project(
4426
4773
  save_image_meta=save_image_meta,
4427
4774
  images_ids=images_ids,
4428
4775
  resume_download=resume_download,
4776
+ **kwargs,
4429
4777
  )
4430
4778
  else:
4431
4779
  _download_project_optimized(
@@ -4440,6 +4788,7 @@ def download_project(
4440
4788
  save_images=save_images,
4441
4789
  log_progress=log_progress,
4442
4790
  images_ids=images_ids,
4791
+ **kwargs,
4443
4792
  )
4444
4793
 
4445
4794
 
@@ -4455,6 +4804,7 @@ def _download_project_optimized(
4455
4804
  save_images=True,
4456
4805
  log_progress=True,
4457
4806
  images_ids: List[int] = None,
4807
+ **kwargs,
4458
4808
  ):
4459
4809
  project_info = api.project.get_info_by_id(project_id)
4460
4810
  project_id = project_info.id
@@ -4736,6 +5086,209 @@ def create_readme(
4736
5086
  return readme_path
4737
5087
 
4738
5088
 
5089
+ def _dataset_blob_structure_md(
5090
+ project_fs: Project,
5091
+ project_info: sly.ProjectInfo,
5092
+ entity_limit: Optional[int] = 2,
5093
+ ) -> str:
5094
+ """Creates a markdown string with the dataset structure of the project.
5095
+ Supports only image projects.
5096
+
5097
+ :param project_fs: Project file system.
5098
+ :type project_fs: :class:`Project<supervisely.project.project.Project>`
5099
+ :param project_info: Project information.
5100
+ :type project_info: :class:`ProjectInfo<supervisely.project.project_info.ProjectInfo>`
5101
+ :param entity_limit: The maximum number of entities to display in the README.
5102
+ :type entity_limit: int, optional
5103
+ :return: Markdown string with the dataset structure of the project.
5104
+ :rtype: str
5105
+ """
5106
+ supported_project_types = [sly.ProjectType.IMAGES.value]
5107
+ if project_info.type not in supported_project_types:
5108
+ return ""
5109
+
5110
+ entity_icons = {
5111
+ "images": " 🏞️ ",
5112
+ "blob_files": " 📦 ",
5113
+ "pkl_files": " 📄 ",
5114
+ "annotations": " 📝 ",
5115
+ }
5116
+ dataset_icon = " 📂 "
5117
+ folder_icon = " 📁 "
5118
+
5119
+ result_md = f"🗂️ {project_info.name}<br>"
5120
+
5121
+ # Add project-level blob files
5122
+ if os.path.exists(project_fs.blob_dir) and project_fs.blob_files:
5123
+ result_md += "┣" + folder_icon + f"{Project.blob_dir_name}<br>"
5124
+ blob_files = [entry.name for entry in os.scandir(project_fs.blob_dir) if entry.is_file()]
5125
+
5126
+ for idx, blob_file in enumerate(blob_files):
5127
+ if idx == entity_limit and len(blob_files) > entity_limit:
5128
+ result_md += "┃ ┗ ... " + str(len(blob_files) - entity_limit) + " more<br>"
5129
+ break
5130
+ symbol = "┗" if idx == len(blob_files) - 1 or idx == entity_limit - 1 else "┣"
5131
+ result_md += "┃ " + symbol + entity_icons["blob_files"] + blob_file + "<br>"
5132
+
5133
+ # Build a dataset hierarchy tree
5134
+ dataset_tree = {}
5135
+ root_datasets = []
5136
+
5137
+ # First pass: create nodes for all datasets
5138
+ for dataset in project_fs.datasets:
5139
+ dataset_tree[dataset.directory] = {
5140
+ "dataset": dataset,
5141
+ "children": [],
5142
+ "parent_dir": os.path.dirname(dataset.directory) if dataset.parents else None,
5143
+ }
5144
+
5145
+ # Second pass: build parent-child relationships
5146
+ for dir_path, node in dataset_tree.items():
5147
+ parent_dir = node["parent_dir"]
5148
+ if parent_dir in dataset_tree:
5149
+ dataset_tree[parent_dir]["children"].append(dir_path)
5150
+ else:
5151
+ root_datasets.append(dir_path)
5152
+
5153
+ # Function to recursively render the dataset tree
5154
+ def render_tree(dir_path, prefix=""):
5155
+ nonlocal result_md
5156
+ node = dataset_tree[dir_path]
5157
+ dataset = node["dataset"]
5158
+ children = node["children"]
5159
+
5160
+ # Create dataset display with proper path
5161
+ dataset_path = Dataset._get_dataset_path(dataset.name, dataset.parents)
5162
+ result_md += prefix + "┣" + dataset_icon + f"[{dataset.name}]({dataset_path})<br>"
5163
+
5164
+ # Set indentation for dataset content
5165
+ content_prefix = prefix + "┃ "
5166
+
5167
+ # Add pkl files at the dataset level
5168
+ offset_files = [
5169
+ entry.name
5170
+ for entry in os.scandir(dataset.directory)
5171
+ if entry.is_file() and entry.name.endswith(".pkl")
5172
+ ]
5173
+
5174
+ if offset_files:
5175
+ for idx, pkl_file in enumerate(offset_files):
5176
+ last_file = idx == len(offset_files) - 1
5177
+ has_more_content = (
5178
+ os.path.exists(dataset.img_dir) or os.path.exists(dataset.ann_dir) or children
5179
+ )
5180
+ symbol = "┗" if last_file and not has_more_content else "┣"
5181
+ result_md += content_prefix + symbol + entity_icons["pkl_files"] + pkl_file + "<br>"
5182
+
5183
+ # Add img directory
5184
+ if os.path.exists(dataset.img_dir):
5185
+ has_ann_dir = os.path.exists(dataset.ann_dir)
5186
+ has_more_content = has_ann_dir or children
5187
+ symbol = "┣" if has_more_content else "┗"
5188
+ result_md += content_prefix + symbol + folder_icon + "img<br>"
5189
+
5190
+ # Add image files
5191
+ entities = [entry.name for entry in os.scandir(dataset.img_dir) if entry.is_file()]
5192
+ entities = sorted(entities)
5193
+ selected_entities = entities[: min(len(entities), entity_limit)]
5194
+
5195
+ img_prefix = content_prefix + "┃ "
5196
+ for idx, entity in enumerate(selected_entities):
5197
+ last_img = idx == len(selected_entities) - 1
5198
+ symbol = "┗" if last_img and len(entities) <= entity_limit else "┣"
5199
+ result_md += img_prefix + symbol + entity_icons["images"] + entity + "<br>"
5200
+
5201
+ if len(entities) > entity_limit:
5202
+ result_md += img_prefix + "┗ ... " + str(len(entities) - entity_limit) + " more<br>"
5203
+
5204
+ # Add ann directory
5205
+ if os.path.exists(dataset.ann_dir):
5206
+ has_more_content = bool(children)
5207
+ symbol = "┣"
5208
+ result_md += content_prefix + "┣" + folder_icon + "ann<br>"
5209
+
5210
+ anns = [entry.name for entry in os.scandir(dataset.ann_dir) if entry.is_file()]
5211
+ anns = sorted(anns)
5212
+
5213
+ # Try to match annotations with displayed images
5214
+ possible_anns = [f"{entity}.json" for entity in selected_entities]
5215
+ matched_anns = [pa for pa in possible_anns if pa in anns]
5216
+
5217
+ # Add additional annotations if we haven't reached the limit
5218
+ if len(matched_anns) < min(entity_limit, len(anns)):
5219
+ for ann in anns:
5220
+ if ann not in matched_anns and len(matched_anns) < entity_limit:
5221
+ matched_anns.append(ann)
5222
+
5223
+ ann_prefix = content_prefix + "┃ "
5224
+ for idx, ann in enumerate(matched_anns):
5225
+ last_ann = idx == len(matched_anns) - 1
5226
+ symbol = "┗" if last_ann and len(anns) <= entity_limit else "┣"
5227
+ result_md += ann_prefix + symbol + entity_icons["annotations"] + ann + "<br>"
5228
+
5229
+ if len(anns) > entity_limit:
5230
+ result_md += ann_prefix + "┗ ... " + str(len(anns) - entity_limit) + " more<br>"
5231
+
5232
+ if not has_more_content:
5233
+ result_md += content_prefix + "...<br>"
5234
+ # Recursively render child datasets
5235
+ for idx, child_dir in enumerate(children):
5236
+ render_tree(child_dir, content_prefix)
5237
+
5238
+ # Start rendering from root datasets
5239
+ for root_dir in sorted(root_datasets):
5240
+ render_tree(root_dir)
5241
+
5242
+ return result_md
5243
+
5244
+
5245
+ def create_blob_readme(
5246
+ project_fs: Project,
5247
+ project_info: ProjectInfo,
5248
+ ) -> str:
5249
+ """Creates a README.md file using the template, adds general information
5250
+ about the project and creates a dataset structure section.
5251
+
5252
+ :param project_fs: Project file system.
5253
+ :type project_fs: :class:`Project<supervisely.project.project.Project>`
5254
+ :param project_info: Project information.
5255
+ :type project_info: :class:`ProjectInfo<supervisely.project.project_info.ProjectInfo>`
5256
+ :return: Path to the created README.md file.
5257
+ :rtype: str
5258
+
5259
+ :Usage example:
5260
+
5261
+ .. code-block:: python
5262
+
5263
+ import supervisely as sly
5264
+
5265
+ api = sly.Api.from_env()
5266
+
5267
+ project_id = 123
5268
+ project_dir = "/path/to/project"
5269
+
5270
+ project_fs = sly.Project(project_dir, sly.OpenMode.READ)
+ project_info = api.project.get_info_by_id(project_id)
+ readme_path = create_blob_readme(project_fs, project_info)
5271
+
5272
+ print(f"README.md file was created at {readme_path}")
5273
+ """
5274
+ current_path = os.path.dirname(os.path.abspath(__file__))
5275
+ template_path = os.path.join(current_path, "readme_template.md")
5276
+ with open(template_path, "r") as file:
5277
+ template = file.read()
5278
+
5279
+ readme_path = os.path.join(project_fs.directory, "README.md")
5280
+
5281
+ template = template.replace("{{general_info}}", _project_info_md(project_info))
5282
+
5283
+ template = template.replace(
5284
+ "{{dataset_structure_info}}", _dataset_blob_structure_md(project_fs, project_info)
5285
+ )
5286
+
5287
+ with open(readme_path, "w") as f:
5288
+ f.write(template)
5289
+ return readme_path
5290
+
5291
+
4739
5292
  def _project_info_md(project_info: sly.ProjectInfo) -> str:
4740
5293
  """Creates a markdown string with general information about the project
4741
5294
  using the fields of the ProjectInfo NamedTuple.
@@ -4784,6 +5337,9 @@ def _dataset_structure_md(
4784
5337
  entity_icons = {
4785
5338
  "images": " 🏞️ ",
4786
5339
  "videos": " 🎥 ",
5340
+ "blob_files": " 📦 ",
5341
+ "pkl_files": " 📄 ",
5342
+ "annotations": " 📝 ",
4787
5343
  }
4788
5344
  dataset_icon = " 📂 "
4789
5345
  list_function = list_functions[project_info.type]
@@ -4791,6 +5347,8 @@ def _dataset_structure_md(
4791
5347
 
4792
5348
  result_md = f"🗂️ {project_info.name}<br>"
4793
5349
 
5350
+ # if project_info
5351
+
4794
5352
  for parents, dataset_info in api.dataset.tree(project_info.id):
4795
5353
  # The dataset path is needed to create a clickable link in the README.
4796
5354
  dataset_path = Dataset._get_dataset_path(dataset_info.name, parents)
@@ -4841,6 +5399,8 @@ async def _download_project_async(
4841
5399
  switch_size = kwargs.get("switch_size", 1.28 * 1024 * 1024)
4842
5400
  # batch size for bulk download
4843
5401
  batch_size = kwargs.get("batch_size", 100)
5402
+ # control whether to download blob files
5403
+ download_blob_files = kwargs.get("download_blob_files", False)
4844
5404
 
4845
5405
  if semaphore is None:
4846
5406
  semaphore = api.get_default_semaphore()
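The async path reads the same `download_blob_files` flag from `**kwargs`, so blob-aware downloads also work through `Project.download_async`. A hedged sketch in the style of the docstring examples above (`run_coroutine` as used there; project ID and path are placeholders):

    import supervisely as sly

    api = sly.Api.from_env()
    coroutine = sly.Project.download_async(api, 123, "/path/to/save/project", download_blob_files=True)
    run_coroutine(coroutine)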
@@ -4890,11 +5450,19 @@ async def _download_project_async(
4890
5450
  small_images = []
4891
5451
  large_images = []
4892
5452
  dataset_images = []
5453
+ blob_files_to_download = {}
5454
+ blob_images = []
5455
+
4893
5456
  async for image_batch in all_images:
4894
5457
  for image in image_batch:
4895
5458
  if images_ids is None or image.id in images_ids:
4896
5459
  dataset_images.append(image)
4897
- if image.size < switch_size:
5460
+ # Check for images with blob offsets
5461
+
5462
+ if download_blob_files and image.related_data_id is not None:
5463
+ blob_files_to_download[image.related_data_id] = image.download_id
5464
+ blob_images.append(image)
5465
+ elif image.size < switch_size:
4898
5466
  small_images.append(image)
4899
5467
  else:
4900
5468
  large_images.append(image)
@@ -4903,7 +5471,7 @@ async def _download_project_async(
4903
5471
  if log_progress is True:
4904
5472
  ds_progress = tqdm_sly(
4905
5473
  desc="Downloading images from {!r}".format(dataset.name),
4906
- total=len(small_images) + len(large_images),
5474
+ total=len(small_images) + len(large_images) + len(blob_images),
4907
5475
  leave=False,
4908
5476
  )
4909
5477
 
@@ -4939,14 +5507,82 @@ async def _download_project_async(
4939
5507
  )
4940
5508
  return created_tasks
4941
5509
 
5510
+ # Download blob files if required
5511
+ if download_blob_files and len(blob_files_to_download) > 0:
5512
+ blob_paths = []
5513
+ download_ids = []
5514
+ # Process each blob file
5515
+ for blob_file_id, download_id in blob_files_to_download.items():
5516
+ if blob_file_id not in project_fs.blob_files:
5517
+ # Download the blob file
5518
+ blob_paths.append(os.path.join(project_fs.blob_dir, f"{blob_file_id}.tar"))
5519
+ download_ids.append(download_id)
5520
+ await api.image.download_blob_files_async(
5521
+ project_id=project_id,
5522
+ download_ids=download_ids,
5523
+ paths=blob_paths,
5524
+ semaphore=semaphore,
5525
+ log_progress=(True if log_progress or progress_cb is not None else False),
5526
+ )
5527
+ for blob_file_id, download_id in blob_files_to_download.items():
5528
+ project_fs.add_blob_file(blob_file_id)
5529
+
5530
+ # Process blob image offsets
5531
+ offsets_file_name = f"{blob_file_id}{OFFSETS_PKL_SUFFIX}"
5532
+ offsets_file_path = os.path.join(dataset_fs.directory, offsets_file_name)
5533
+
5534
+ total_offsets_count = 0 # for logging
5535
+ current_batch = []
5536
+ for img in blob_images:
5537
+ if img.related_data_id == blob_file_id:
5538
+ blob_image_info = BlobImageInfo(
5539
+ name=img.name,
5540
+ offset_start=img.offset_start,
5541
+ offset_end=img.offset_end,
5542
+ )
5543
+ current_batch.append(blob_image_info)
5544
+ if len(current_batch) >= OFFSETS_PKL_BATCH_SIZE:
5545
+ BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
5546
+ total_offsets_count += len(current_batch)
5547
+ current_batch = []
5548
+ if len(current_batch) > 0:
5549
+ BlobImageInfo.dump_to_pickle(current_batch, offsets_file_path)
5550
+ total_offsets_count += len(current_batch)
5551
+ if total_offsets_count > 0:
5552
+ logger.debug(
5553
+ f"Saved {total_offsets_count} image offsets for {blob_file_id} to {offsets_file_path} in {(total_offsets_count + OFFSETS_PKL_BATCH_SIZE - 1) // OFFSETS_PKL_BATCH_SIZE} batches"
5554
+ )
5555
+ offset_tasks = []
5556
+ # Download annotations for images with offsets
5557
+ for offsets_batch in batched(blob_images, batch_size=batch_size):
5558
+ offset_task = _download_project_items_batch_async(
5559
+ api=api,
5560
+ dataset_id=dataset_id,
5561
+ img_infos=offsets_batch,
5562
+ meta=meta,
5563
+ dataset_fs=dataset_fs,
5564
+ id_to_tagmeta=id_to_tagmeta,
5565
+ semaphore=semaphore,
5566
+ save_images=False,
5567
+ save_image_info=save_image_info,
5568
+ only_image_tags=only_image_tags,
5569
+ progress_cb=ds_progress,
5570
+ )
5571
+ offset_tasks.append(offset_task)
5572
+ created_tasks = await run_tasks_with_delay(offset_tasks, 0.05)
5573
+ await asyncio.gather(*created_tasks)
5574
+
4942
5575
  tasks = []
5576
+ # Check which images need to be downloaded
4943
5577
  small_images = await check_items(small_images)
4944
5578
  large_images = await check_items(large_images)
4945
5579
 
5580
+ # If only one small image, treat it as a large image for efficiency
4946
5581
  if len(small_images) == 1:
4947
5582
  large_images.append(small_images.pop())
4948
- for images_batch in batched(small_images, batch_size=batch_size):
4949
5583
 
5584
+ # Create batch download tasks
5585
+ for images_batch in batched(small_images, batch_size=batch_size):
4950
5586
  task = _download_project_items_batch_async(
4951
5587
  api=api,
4952
5588
  dataset_id=dataset_id,
@@ -4961,6 +5597,8 @@ async def _download_project_async(
4961
5597
  progress_cb=ds_progress,
4962
5598
  )
4963
5599
  tasks.append(task)
5600
+
5601
+ # Create individual download tasks for large images
4964
5602
  for image in large_images:
4965
5603
  task = _download_project_item_async(
4966
5604
  api=api,
@@ -4995,7 +5633,11 @@ async def _download_project_async(
4995
5633
  dataset_fs.delete_item(item_name)
4996
5634
 
4997
5635
  try:
4998
- create_readme(dest_dir, project_id, api)
5636
+ if download_blob_files:
5637
+ project_info = api.project.get_info_by_id(project_id)
5638
+ create_blob_readme(project_fs=project_fs, project_info=project_info)
5639
+ else:
5640
+ create_readme(dest_dir, project_id, api)
4999
5641
  except Exception as e:
5000
5642
  logger.info(f"There was an error while creating README: {e}")
5001
5643