dataverse-sdk 2.1.2__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/PKG-INFO +3 -3
  2. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/README.md +2 -2
  3. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/backend.py +0 -2
  4. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/client.py +228 -97
  5. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/coco.py +19 -9
  6. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/visionai.py +19 -9
  7. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/vqa.py +19 -9
  8. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/yolo.py +19 -9
  9. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/api.py +0 -1
  10. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/client.py +5 -5
  11. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/PKG-INFO +3 -3
  12. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/setup.py +1 -1
  13. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/__init__.py +0 -0
  14. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/__init__.py +0 -0
  15. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/third_party.py +0 -0
  16. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/connections.py +0 -0
  17. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/constants.py +0 -0
  18. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/exceptions/__init__.py +0 -0
  19. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/exceptions/client.py +0 -0
  20. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/__init__.py +0 -0
  21. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/base.py +0 -0
  22. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/constant.py +0 -0
  23. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/exporter.py +0 -0
  24. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/utils.py +0 -0
  25. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/__init__.py +0 -0
  26. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/common.py +0 -0
  27. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/format.py +0 -0
  28. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/utils/__init__.py +0 -0
  29. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/utils/utils.py +0 -0
  30. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/SOURCES.txt +0 -0
  31. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/dependency_links.txt +0 -0
  32. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/requires.txt +0 -0
  33. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/top_level.txt +0 -0
  34. {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataverse-sdk
3
- Version: 2.1.2
3
+ Version: 2.2.0
4
4
  Summary: Dataverse SDK For Python
5
5
  Home-page:
6
6
  Author: LinkerVision
@@ -339,7 +339,6 @@ dataset_data = {
339
339
  "storage_url": "storage/url",
340
340
  "container_name": "azure container name",
341
341
  "data_folder": "datafolder/to/vai_anno",
342
- "sensors": project.sensors,
343
342
  "type": DatasetType.ANNOTATED_DATA,
344
343
  "annotation_format": AnnotationFormat.VISION_AI,
345
344
  "annotations": ["groundtruth"],
@@ -395,10 +394,11 @@ dataset_data2 = {
395
394
  "sensors": project.sensors,
396
395
  "type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
397
396
  "annotation_format": AnnotationFormat.VISION_AI,
398
- "annotations": ["groundtruth"],
397
+ "annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
399
398
  "sequential": False,
400
399
  "generate_metadata": False,
401
400
  "auto_tagging": []
401
+ "sas_token": ""
402
402
  }
403
403
  dataset2 = project.create_dataset(**dataset_data2)
404
404
 
@@ -312,7 +312,6 @@ dataset_data = {
312
312
  "storage_url": "storage/url",
313
313
  "container_name": "azure container name",
314
314
  "data_folder": "datafolder/to/vai_anno",
315
- "sensors": project.sensors,
316
315
  "type": DatasetType.ANNOTATED_DATA,
317
316
  "annotation_format": AnnotationFormat.VISION_AI,
318
317
  "annotations": ["groundtruth"],
@@ -368,10 +367,11 @@ dataset_data2 = {
368
367
  "sensors": project.sensors,
369
368
  "type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
370
369
  "annotation_format": AnnotationFormat.VISION_AI,
371
- "annotations": ["groundtruth"],
370
+ "annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
372
371
  "sequential": False,
373
372
  "generate_metadata": False,
374
373
  "auto_tagging": []
374
+ "sas_token": ""
375
375
  }
376
376
  dataset2 = project.create_dataset(**dataset_data2)
377
377
 
@@ -358,7 +358,6 @@ class BackendAPI:
358
358
  name: str,
359
359
  data_source: str,
360
360
  project_id: int,
361
- sensor_ids: list[int],
362
361
  type: str,
363
362
  annotation_format: str,
364
363
  storage_url: str,
@@ -382,7 +381,6 @@ class BackendAPI:
382
381
  payload_data = {
383
382
  "name": name,
384
383
  "project_id": project_id,
385
- "sensor_ids": sensor_ids,
386
384
  "data_source": data_source,
387
385
  "storage_url": storage_url,
388
386
  "container_name": container_name,
@@ -1,7 +1,9 @@
1
1
  import asyncio
2
+ import json
2
3
  import logging
3
4
  import os
4
- from asyncio import Semaphore
5
+ import platform
6
+ from asyncio import AbstractEventLoop, Semaphore
5
7
  from collections import deque
6
8
  from pathlib import Path
7
9
  from typing import Optional, Union
@@ -53,7 +55,13 @@ from .utils.utils import (
53
55
  get_filepaths,
54
56
  )
55
57
 
56
- MAX_CONCURRENT_FILES = 100
58
+
59
+ def is_macOS():
60
+ return platform.system() == "Darwin"
61
+
62
+
63
+ # to avoid the `Too many open files` error in macOS
64
+ MAX_CONCURRENT_FILES = 70 if is_macOS() else 100
57
65
 
58
66
 
59
67
  def parse_attribute(attr_list: list) -> list:
@@ -1435,10 +1443,7 @@ of this project OR has been added before"
1435
1443
  raise ClientConnectionError(f"Failed to get the dataset: {e}")
1436
1444
 
1437
1445
  project = self.get_project(dataset_data["project"]["id"])
1438
- sensors = [
1439
- Sensor.create(sensor_data) for sensor_data in dataset_data["sensors"]
1440
- ]
1441
- dataset_data.update({"project": project, "sensors": sensors})
1446
+ dataset_data.update({"project": project})
1442
1447
  return Dataset(**dataset_data, client_alias=client_alias)
1443
1448
 
1444
1449
  # TODO: required arguments for different DataSource
@@ -1447,7 +1452,6 @@ of this project OR has been added before"
1447
1452
  name: str,
1448
1453
  data_source: DataSource,
1449
1454
  project: Project,
1450
- sensors: list[Sensor],
1451
1455
  type: DatasetType,
1452
1456
  annotation_format: AnnotationFormat,
1453
1457
  storage_url: str,
@@ -1464,6 +1468,7 @@ of this project OR has been added before"
1464
1468
  client_alias: Optional[str] = None,
1465
1469
  access_key_id: Optional[str] = None,
1466
1470
  secret_access_key: Optional[str] = None,
1471
+ reupload_dataset_uuid: Optional[str] = None,
1467
1472
  **kwargs,
1468
1473
  ) -> Dataset:
1469
1474
  """Create Dataset
@@ -1476,8 +1481,6 @@ of this project OR has been added before"
1476
1481
  the DataSource basemodel of the given dataset
1477
1482
  project : Project
1478
1483
  Project basemodel
1479
- sensors : list[Sensor]
1480
- list of Sensor basemodel
1481
1484
  type : DatasetType
1482
1485
  datasettype (annotation or raw)
1483
1486
  annotation_format : AnnotationFormat
@@ -1545,13 +1548,11 @@ of this project OR has been added before"
1545
1548
  "Import data source must be LOCAL if host is not in DataverseHost."
1546
1549
  )
1547
1550
 
1548
- sensor_ids = [sensor.id for sensor in sensors]
1549
1551
  project_id = project.id
1550
1552
  try:
1551
1553
  raw_dataset_data: dict = DatasetAPISchema(
1552
1554
  name=name,
1553
1555
  project_id=project_id,
1554
- sensor_ids=sensor_ids,
1555
1556
  data_source=data_source,
1556
1557
  type=type,
1557
1558
  annotation_format=annotation_format,
@@ -1576,14 +1577,14 @@ of this project OR has been added before"
1576
1577
 
1577
1578
  if data_source == DataSource.LOCAL:
1578
1579
  create_dataset_uuid = DataverseClient.upload_files_from_local(
1579
- async_api, raw_dataset_data, sensors
1580
+ async_api, api, raw_dataset_data, reupload_dataset_uuid
1580
1581
  )
1581
1582
  raw_dataset_data["create_dataset_uuid"] = create_dataset_uuid
1583
+
1582
1584
  dataset_data = api.create_dataset(**raw_dataset_data)
1583
1585
  dataset_data.update(
1584
1586
  {
1585
1587
  "project": project,
1586
- "sensors": sensors,
1587
1588
  "sequential": sequential,
1588
1589
  "generate_metadata": generate_metadata,
1589
1590
  "auto_tagging": auto_tagging,
@@ -1594,59 +1595,151 @@ of this project OR has been added before"
1594
1595
 
1595
1596
  @staticmethod
1596
1597
  def upload_files_from_local(
1597
- async_api: AsyncBackendAPI, raw_dataset_data: dict, sensors: list
1598
- ) -> dict:
1599
- loop = asyncio.get_event_loop()
1600
- data_folder = raw_dataset_data["data_folder"]
1601
- dataset_type = raw_dataset_data["type"]
1598
+ async_api: AsyncBackendAPI,
1599
+ api: BackendAPI,
1600
+ raw_dataset_data: dict,
1601
+ reupload_dataset_uuid: Optional[str] = None,
1602
+ ) -> str:
1603
+ def run_new_upload_tasks(
1604
+ data_folder: str,
1605
+ dataset_type: DatasetType,
1606
+ async_api_client: AsyncBackendAPI,
1607
+ event_loop: AbstractEventLoop,
1608
+ ):
1609
+ print(f"Uploading new dataset from [{data_folder}]...")
1610
+
1611
+ # check folder structure
1612
+ required_data = DataverseClient._get_format_folders(
1613
+ annotation_format=raw_dataset_data["annotation_format"],
1614
+ dataset_type=dataset_type,
1615
+ project_id=raw_dataset_data["project_id"],
1616
+ api=api,
1617
+ )
1618
+ if required_data:
1619
+ for required_folder_or_file in required_data:
1620
+ path = os.path.join(data_folder, required_folder_or_file)
1621
+ if not os.path.exists(path):
1622
+ raise DataverseExceptionBase(
1623
+ type="",
1624
+ detail=f"Require the file or folder: {path} for {raw_dataset_data['annotation_format']}",
1625
+ )
1602
1626
 
1603
- # check folder structure
1604
- required_data = DataverseClient._get_format_folders(
1605
- annotation_format=raw_dataset_data["annotation_format"],
1606
- dataset_type=dataset_type,
1607
- sensors=sensors,
1608
- )
1609
- if required_data:
1610
- for required_folder_or_file in required_data:
1611
- path = os.path.join(data_folder, required_folder_or_file)
1612
- if not os.path.exists(path):
1613
- raise DataverseExceptionBase(
1614
- type="",
1615
- detail=f"Require the file or folder: {path} for {raw_dataset_data['annotation_format']}",
1627
+ file_paths = DataverseClient._find_all_paths(data_folder)
1628
+ (
1629
+ upload_task_queue,
1630
+ create_dataset_uuid,
1631
+ failed_urls,
1632
+ ) = asyncio.run(
1633
+ DataverseClient.run_generate_presigned_urls(
1634
+ file_paths=file_paths, api=async_api_client, data_folder=data_folder
1635
+ )
1636
+ )
1637
+ if failed_urls:
1638
+ raise ClientConnectionError(
1639
+ f"unable to generate urls for: {failed_urls}"
1640
+ )
1641
+
1642
+ if not create_dataset_uuid:
1643
+ raise ClientConnectionError(
1644
+ "something went wrong, missing create dataset uuid"
1645
+ )
1646
+
1647
+ failed_file_info_batches = asyncio.run(
1648
+ DataverseClient.run_upload_tasks(upload_task_queue)
1649
+ )
1650
+
1651
+ return create_dataset_uuid, failed_file_info_batches
1652
+
1653
+ def run_reupload_tasks(
1654
+ reupload_dataset_uuid: str,
1655
+ provided_data_folder: str,
1656
+ event_loop: AbstractEventLoop,
1657
+ ):
1658
+ print(f"Reuploading dataset from [{provided_data_folder}]...")
1659
+
1660
+ prev_failed_report_path = (
1661
+ Path.cwd() / "report" / reupload_dataset_uuid / "failed_upload.json"
1662
+ )
1663
+
1664
+ if not prev_failed_report_path.exists():
1665
+ raise DataverseExceptionBase(
1666
+ detail=(
1667
+ f"Failed upload report not found at [{prev_failed_report_path}]; "
1668
+ f"cannot proceed with reuploading dataset [{reupload_dataset_uuid}]."
1616
1669
  )
1670
+ )
1617
1671
 
1618
- file_paths = DataverseClient._find_all_paths(data_folder)
1619
- upload_task_queue, create_dataset_uuid, failed_urls = loop.run_until_complete(
1620
- DataverseClient.run_generate_presigned_urls(
1621
- file_paths=file_paths, api=async_api, data_folder=data_folder
1672
+ with open(prev_failed_report_path) as f:
1673
+ failed_report = json.load(f)
1674
+
1675
+ if provided_data_folder != (
1676
+ reupload_local_dataset_folder := failed_report.get(
1677
+ "local_dataset_folder"
1678
+ )
1679
+ ):
1680
+ raise DataverseExceptionBase(
1681
+ detail=(
1682
+ f"The local dataset folder [{reupload_local_dataset_folder}] for the reupload does not match "
1683
+ f"the currently provided '--folder' [{provided_data_folder}].\n"
1684
+ f"To reupload dataset [{reupload_dataset_uuid}], "
1685
+ f"please set '--folder' to [{reupload_local_dataset_folder}]."
1686
+ )
1687
+ )
1688
+
1689
+ failed_file_info_list = failed_report["failed_file_info_list"]
1690
+ upload_task_queue = deque(failed_file_info_list)
1691
+
1692
+ failed_file_info_batches = asyncio.run(
1693
+ DataverseClient.run_upload_tasks(upload_task_queue)
1694
+ )
1695
+ if not failed_file_info_batches:
1696
+ prev_failed_report_path.unlink(missing_ok=True)
1697
+
1698
+ return reupload_dataset_uuid, failed_file_info_batches
1699
+
1700
+ data_folder = raw_dataset_data["data_folder"]
1701
+ loop = asyncio.get_event_loop()
1702
+
1703
+ create_dataset_uuid, failed_file_info_batches = (
1704
+ run_reupload_tasks(reupload_dataset_uuid, data_folder, loop)
1705
+ if reupload_dataset_uuid
1706
+ else run_new_upload_tasks(
1707
+ data_folder, raw_dataset_data["type"], async_api, loop
1622
1708
  )
1623
1709
  )
1624
- if failed_urls:
1625
- raise ClientConnectionError(f"unable to generate urls for: {failed_urls}")
1626
1710
 
1627
- if not create_dataset_uuid:
1628
- raise ClientConnectionError(
1629
- "something went wrong, missing create dataset uuid"
1711
+ if failed_file_info_batches:
1712
+ failed_report_path = (
1713
+ Path.cwd() / "report" / create_dataset_uuid / "failed_upload.json"
1630
1714
  )
1715
+ failed_report_path.parent.mkdir(parents=True, exist_ok=True)
1716
+ report = {
1717
+ "dataset_uuid": create_dataset_uuid,
1718
+ "local_dataset_folder": data_folder,
1719
+ "failed_file_info_list": failed_file_info_batches,
1720
+ }
1631
1721
 
1632
- failed_urls = loop.run_until_complete(
1633
- DataverseClient.run_upload_tasks(upload_task_queue)
1634
- )
1635
- if failed_urls:
1636
- raise ClientConnectionError(f"failed to upload urls: {failed_urls}")
1722
+ with open(failed_report_path, "w") as f:
1723
+ json.dump(report, f)
1724
+
1725
+ raise ClientConnectionError(
1726
+ f"Failed to upload dataset.\n"
1727
+ f"A detailed failure report has been saved at: {failed_report_path}\n"
1728
+ f"To retry, import the dataset with the 'reupload_dataset_id' parameter set to [{create_dataset_uuid}]."
1729
+ )
1637
1730
  return create_dataset_uuid
1638
1731
 
1639
1732
  @staticmethod
1640
1733
  async def run_generate_presigned_urls(
1641
1734
  file_paths: list, api: AsyncBackendAPI, data_folder: str
1642
- ) -> tuple[deque, str, list[str]]:
1643
- max_retry_count, batch_size, max_concurrent_api_calls = 3, 500, 10
1735
+ ) -> tuple[deque[tuple[list[str], list[dict]]], str, list[str]]:
1736
+ max_retry_count, batch_size, max_concurrent_api_calls = 5, 500, 10
1644
1737
  semaphore = asyncio.Semaphore(max_concurrent_api_calls)
1645
1738
 
1646
- failed_urls = []
1647
- upload_task_queue = deque()
1739
+ failed_urls: list[str] = []
1740
+ upload_task_queue: deque[tuple[list[str], list[dict]]] = deque()
1648
1741
 
1649
- data_folder = Path(data_folder).resolve()
1742
+ data_folder_path = Path(data_folder).resolve()
1650
1743
  create_dataset_uuid: str = str(uuid4())
1651
1744
 
1652
1745
  async def generate_presigned_url_task(
@@ -1661,7 +1754,7 @@ of this project OR has been added before"
1661
1754
  # Convert absolute file paths to relative paths
1662
1755
  # i.e <long data folder path>/data/image.jpg -> /data/image.jpg
1663
1756
  filtered_paths = [
1664
- str(Path(path).relative_to(data_folder)).replace("\\", "/")
1757
+ str(Path(path).relative_to(data_folder_path)).replace("\\", "/")
1665
1758
  for path in batched_file_paths
1666
1759
  ]
1667
1760
  async with semaphore:
@@ -1682,6 +1775,7 @@ of this project OR has been added before"
1682
1775
  raise
1683
1776
  except Exception as e:
1684
1777
  logging.warning(f"Retrying batch due to error: {e}")
1778
+ await asyncio.sleep(retry_count**2)
1685
1779
  await generate_presigned_url_task(
1686
1780
  batched_file_paths, retry_count + 1
1687
1781
  )
@@ -1696,56 +1790,89 @@ of this project OR has been added before"
1696
1790
  return upload_task_queue, create_dataset_uuid, failed_urls
1697
1791
 
1698
1792
  @staticmethod
1699
- async def run_upload_tasks(upload_task_queue: deque) -> list[str]:
1793
+ async def run_upload_tasks(upload_task_queue: deque[tuple[list[str], list[dict]]]):
1794
+ async def upload_batch(
1795
+ paths: list[str],
1796
+ upload_infos: list[dict],
1797
+ async_client: AsyncThirdPartyAPI,
1798
+ semaphore: Semaphore,
1799
+ max_retry_count: int,
1800
+ progress_bar: tqdm_asyncio,
1801
+ ) -> tuple[list[str], list[dict[str, str]]] | None:
1802
+ async def upload_file(path: str, info: dict):
1803
+ async with semaphore:
1804
+ try:
1805
+ async with aio_open(path, "rb") as file:
1806
+ file_content = await file.read()
1807
+ await async_client.upload_file(
1808
+ method="PUT",
1809
+ target_url=info["url"],
1810
+ file=file_content,
1811
+ content_type="application/octet-stream",
1812
+ )
1813
+ progress_bar.update(1)
1814
+ except Exception as e:
1815
+ logging.exception(e)
1816
+ return (path, info)
1817
+
1818
+ remaining_files = (file for file in zip(paths, upload_infos, strict=True))
1819
+ attempt_count = 1
1820
+
1821
+ while attempt_count <= max_retry_count:
1822
+ print(f"🔁 Upload file batch ({attempt_count}/{max_retry_count}) ...")
1823
+
1824
+ upload_tasks = (
1825
+ upload_file(path, info) for path, info in remaining_files
1826
+ )
1827
+ failed_files = await asyncio.gather(*upload_tasks)
1828
+ if not any(failed_files):
1829
+ print(
1830
+ f"✅ Upload file batch successful on attempt ({attempt_count}/{max_retry_count})"
1831
+ )
1832
+ return None
1833
+
1834
+ remaining_files = (file for file in failed_files if file)
1835
+ print(
1836
+ f"❌ Upload file batch failed on attempt ({attempt_count}/{max_retry_count})"
1837
+ )
1838
+
1839
+ await asyncio.sleep(attempt_count**2)
1840
+ attempt_count += 1
1841
+
1842
+ failed_files = list(remaining_files)
1843
+ failed_paths = [path for path, _ in failed_files]
1844
+ failed_remote_urls = [{"url": info["url"]} for _, info in failed_files]
1845
+
1846
+ return (failed_paths, failed_remote_urls)
1847
+
1700
1848
  tasks = []
1701
1849
  client = AsyncThirdPartyAPI()
1702
1850
  semaphore = Semaphore(MAX_CONCURRENT_FILES)
1851
+ max_retry_count = 3
1703
1852
  total_files = sum(len(paths) for paths, _ in upload_task_queue)
1704
1853
  progress_bar = tqdm_asyncio(
1705
1854
  total=total_files, desc="Uploading files", unit="file"
1706
1855
  )
1707
- for batched_file_paths, upload_file_infos in upload_task_queue:
1708
-
1709
- async def upload_batch(
1710
- paths: list[str],
1711
- upload_infos: list[dict],
1712
- async_client: AsyncThirdPartyAPI,
1713
- ) -> list[str]:
1714
- failed_urls = []
1715
-
1716
- async def upload_file(path: str, info: dict):
1717
- async with semaphore:
1718
- try:
1719
- async with aio_open(path, "rb") as file:
1720
- file_content = await file.read()
1721
- await async_client.upload_file(
1722
- method=info["method"],
1723
- target_url=info["url"],
1724
- file=file_content,
1725
- content_type=info["content_type"],
1726
- )
1727
- except Exception as e:
1728
- logging.exception(e)
1729
- failed_urls.append(path)
1730
- finally:
1731
- progress_bar.update(1)
1732
-
1733
- upload_tasks = [
1734
- upload_file(path, info) for path, info in zip(paths, upload_infos)
1735
- ]
1736
-
1737
- await asyncio.gather(*upload_tasks)
1738
-
1739
- return failed_urls
1740
1856
 
1741
- tasks.append(upload_batch(batched_file_paths, upload_file_infos, client))
1857
+ for batched_file_paths, upload_file_infos in upload_task_queue:
1858
+ tasks.append(
1859
+ upload_batch(
1860
+ batched_file_paths,
1861
+ upload_file_infos,
1862
+ client,
1863
+ semaphore,
1864
+ max_retry_count,
1865
+ progress_bar,
1866
+ )
1867
+ )
1742
1868
 
1743
- failed_urls = []
1869
+ failed_file_info_list: list[tuple[list[str], list[dict[str, str]]]] = []
1744
1870
  for results in await tqdm_asyncio.gather(*tasks):
1745
- failed_urls.extend(results)
1871
+ if results:
1872
+ failed_file_info_list.append(results)
1746
1873
 
1747
1874
  progress_bar.close()
1748
- return failed_urls
1875
+ return failed_file_info_list
1749
1876
 
1750
1877
  @staticmethod
1751
1878
  def _find_all_paths(*paths) -> list[str]:
@@ -1756,13 +1883,18 @@ of this project OR has been added before"
1756
1883
 
1757
1884
  @staticmethod
1758
1885
  def _get_format_folders(
1759
- annotation_format: AnnotationFormat, dataset_type: DatasetType, sensors: list
1886
+ annotation_format: AnnotationFormat,
1887
+ dataset_type: DatasetType,
1888
+ project_id: int,
1889
+ api: BackendAPI,
1760
1890
  ) -> list[str]:
1761
1891
  if annotation_format == AnnotationFormat.KITTI:
1892
+ project = api.get_project(project_id=project_id)
1893
+ sensors = project["sensors"]
1762
1894
  if dataset_type == DatasetType.RAW_DATA:
1763
1895
  return []
1764
1896
  elif len(sensors) == 1:
1765
- if sensors[0].type == SensorType.LIDAR: # one-lidar case
1897
+ if sensors[0]["type"] == SensorType.LIDAR: # one-lidar case
1766
1898
  return ["label_2", "velodyne"]
1767
1899
  else:
1768
1900
  raise DataverseExceptionBase(
@@ -1791,24 +1923,23 @@ of this project OR has been added before"
1791
1923
 
1792
1924
  class AsyncThirdPartyAPI:
1793
1925
  transport = AsyncHTTPTransport(
1794
- retries=10,
1926
+ retries=5,
1795
1927
  )
1796
1928
 
1797
1929
  def __init__(self):
1798
- self.client = AsyncClient(transport=self.transport, timeout=Timeout(100))
1930
+ self.client = AsyncClient(transport=self.transport, timeout=Timeout(30))
1799
1931
 
1800
1932
  async def async_send_request(self, url: str, method: str, **kwargs) -> Response:
1801
1933
  try:
1802
1934
  resp: Response = await self.client.request(method=method, url=url, **kwargs)
1803
-
1804
- except Exception:
1935
+ except Exception as e:
1805
1936
  logging.exception("async send request error")
1937
+ raise AsyncThirdPartyAPIException(detail="async send request error") from e
1806
1938
 
1807
1939
  if not 200 <= resp.status_code <= 299:
1808
1940
  raise AsyncThirdPartyAPIException(
1809
- status_code=resp.status_code, detail=resp.content
1941
+ status_code=resp.status_code, detail=resp.text
1810
1942
  )
1811
-
1812
1943
  return resp
1813
1944
 
1814
1945
  async def upload_file(
@@ -39,16 +39,26 @@ class ExportCoco(ExportAnnotationBase):
39
39
  url = datarow["url"]
40
40
  file_path = os.path.join(COCO_IMAGE_PATH, datarow["unique_file_name"])
41
41
 
42
- async def download_single(url, file_path):
42
+ async def download_single(url, file_path, max_retries=5, initial_delay=1):
43
43
  async with semaphore:
44
- try:
45
- async with session.get(url) as response:
46
- response.raise_for_status()
47
- img_bytes = await response.read()
48
- return img_bytes, file_path
49
- except Exception as e:
50
- print(f"Error downloading {url}: {e}")
51
- return None
44
+ delay = initial_delay
45
+ for attempt in range(max_retries):
46
+ try:
47
+ async with session.get(url) as response:
48
+ response.raise_for_status()
49
+ img_bytes = await response.read()
50
+ return img_bytes, file_path
51
+ except Exception as e:
52
+ if attempt == max_retries - 1:
53
+ print(
54
+ f"Error downloading {url} after {max_retries} attempts: {e}"
55
+ )
56
+ return None
57
+ print(
58
+ f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
59
+ )
60
+ await asyncio.sleep(delay)
61
+ delay *= 2
52
62
 
53
63
  tasks.append(download_single(url, file_path))
54
64
 
@@ -589,16 +589,26 @@ class ExportVisionAI(ExportAnnotationBase):
589
589
  f"{frame_num:012d}{os.path.splitext(file_name)[-1]}",
590
590
  )
591
591
 
592
- async def download_single(url, file_path):
592
+ async def download_single(url, file_path, max_retries=5, initial_delay=1):
593
593
  async with semaphore:
594
- try:
595
- async with session.get(url) as response:
596
- response.raise_for_status()
597
- img_bytes = await response.read()
598
- return img_bytes, file_path
599
- except Exception as e:
600
- print(f"Error downloading {url}: {e}")
601
- return None
594
+ delay = initial_delay
595
+ for attempt in range(max_retries):
596
+ try:
597
+ async with session.get(url) as response:
598
+ response.raise_for_status()
599
+ img_bytes = await response.read()
600
+ return img_bytes, file_path
601
+ except Exception as e:
602
+ if attempt == max_retries - 1:
603
+ print(
604
+ f"Error downloading {url} after {max_retries} attempts: {e}"
605
+ )
606
+ return None
607
+ print(
608
+ f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
609
+ )
610
+ await asyncio.sleep(delay)
611
+ delay *= 2
602
612
 
603
613
  tasks.append(download_single(url, file_path))
604
614
 
@@ -30,16 +30,26 @@ class ExportVQA(ExportAnnotationBase):
30
30
  url = datarow["url"]
31
31
  file_path = os.path.join("images", datarow["unique_file_name"])
32
32
 
33
- async def download_single(url, file_path):
33
+ async def download_single(url, file_path, max_retries=5, initial_delay=1):
34
34
  async with semaphore:
35
- try:
36
- async with session.get(url) as response:
37
- response.raise_for_status()
38
- img_bytes = await response.read()
39
- return img_bytes, file_path
40
- except Exception as e:
41
- print(f"Error downloading {url}: {e}")
42
- return None
35
+ delay = initial_delay
36
+ for attempt in range(max_retries):
37
+ try:
38
+ async with session.get(url) as response:
39
+ response.raise_for_status()
40
+ img_bytes = await response.read()
41
+ return img_bytes, file_path
42
+ except Exception as e:
43
+ if attempt == max_retries - 1:
44
+ print(
45
+ f"Error downloading {url} after {max_retries} attempts: {e}"
46
+ )
47
+ return None
48
+ print(
49
+ f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
50
+ )
51
+ await asyncio.sleep(delay)
52
+ delay *= 2
43
53
 
44
54
  tasks.append(download_single(url, file_path))
45
55
 
@@ -47,16 +47,26 @@ class ExportYolo(ExportAnnotationBase):
47
47
  )
48
48
  results.append((annot_bytes, anno_path))
49
49
 
50
- async def download_single(url, file_path):
50
+ async def download_single(url, file_path, max_retries=5, initial_delay=1):
51
51
  async with semaphore:
52
- try:
53
- async with session.get(url) as response:
54
- response.raise_for_status()
55
- img_bytes = await response.read()
56
- return img_bytes, file_path
57
- except Exception as e:
58
- print(f"Error downloading {url}: {e}")
59
- return None
52
+ delay = initial_delay
53
+ for attempt in range(max_retries):
54
+ try:
55
+ async with session.get(url) as response:
56
+ response.raise_for_status()
57
+ img_bytes = await response.read()
58
+ return img_bytes, file_path
59
+ except Exception as e:
60
+ if attempt == max_retries - 1:
61
+ print(
62
+ f"Error downloading {url} after {max_retries} attempts: {e}"
63
+ )
64
+ return None
65
+ print(
66
+ f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
67
+ )
68
+ await asyncio.sleep(delay)
69
+ delay *= 2
60
70
 
61
71
  tasks.append(download_single(url, file_path))
62
72
 
@@ -114,7 +114,6 @@ class UpdateQuestionAPISchema(BaseModel):
114
114
  class DatasetAPISchema(BaseModel):
115
115
  name: str
116
116
  project_id: int
117
- sensor_ids: list[int]
118
117
  data_source: DataSource
119
118
  type: DatasetType
120
119
  annotation_format: AnnotationFormat
@@ -292,7 +292,6 @@ class Project(BaseModel):
292
292
  self,
293
293
  name: str,
294
294
  data_source: DataSource,
295
- sensors: list[Sensor],
296
295
  type: DatasetType,
297
296
  annotation_format: AnnotationFormat,
298
297
  storage_url: str,
@@ -307,6 +306,7 @@ class Project(BaseModel):
307
306
  description: Optional[str] = None,
308
307
  access_key_id: Optional[str] = None,
309
308
  secret_access_key: Optional[str] = None,
309
+ reupload_dataset_uuid: Optional[str] = None,
310
310
  **kwargs,
311
311
  ):
312
312
  """Create Dataset From project itself
@@ -317,8 +317,6 @@ class Project(BaseModel):
317
317
  name of dataset
318
318
  data_source : DataSource
319
319
  the DataSource basemodel of the given dataset
320
- sensors : list[Sensor]
321
- list of Sensor basemodel
322
320
  type : DatasetType
323
321
  datasettype (annotation or raw)
324
322
  annotation_format : AnnotationFormat
@@ -347,6 +345,9 @@ class Project(BaseModel):
347
345
  access key id for AWS s3 bucket, by default None
348
346
  secret_access_key : Optional[str], optional
349
347
  secret access key for AWS s3 bucket, by default None
348
+ reupload_dataset_uuid: Optional[str], optional
349
+ dataset UUID of a previously failed local dataset import. If provided, the files that failed to upload
350
+ (as recorded in `failed_upload.json`) will be re-uploaded, by default None
350
351
 
351
352
  Returns
352
353
  -------
@@ -369,7 +370,6 @@ class Project(BaseModel):
369
370
  name=name,
370
371
  data_source=data_source,
371
372
  project=self,
372
- sensors=sensors,
373
373
  type=type,
374
374
  annotation_format=annotation_format,
375
375
  storage_url=storage_url,
@@ -385,6 +385,7 @@ class Project(BaseModel):
385
385
  access_key_id=access_key_id,
386
386
  secret_access_key=secret_access_key,
387
387
  client_alias=self.client_alias,
388
+ reupload_dataset_uuid=reupload_dataset_uuid,
388
389
  **kwargs,
389
390
  )
390
391
  return dataset_output
@@ -393,7 +394,6 @@ class Project(BaseModel):
393
394
  class Dataset(BaseModel):
394
395
  id: Optional[int] = None
395
396
  project: Project
396
- sensors: list[Sensor]
397
397
  name: str
398
398
  type: DatasetType
399
399
  data_source: DataSource
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataverse-sdk
3
- Version: 2.1.2
3
+ Version: 2.2.0
4
4
  Summary: Dataverse SDK For Python
5
5
  Home-page:
6
6
  Author: LinkerVision
@@ -339,7 +339,6 @@ dataset_data = {
339
339
  "storage_url": "storage/url",
340
340
  "container_name": "azure container name",
341
341
  "data_folder": "datafolder/to/vai_anno",
342
- "sensors": project.sensors,
343
342
  "type": DatasetType.ANNOTATED_DATA,
344
343
  "annotation_format": AnnotationFormat.VISION_AI,
345
344
  "annotations": ["groundtruth"],
@@ -395,10 +394,11 @@ dataset_data2 = {
395
394
  "sensors": project.sensors,
396
395
  "type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
397
396
  "annotation_format": AnnotationFormat.VISION_AI,
398
- "annotations": ["groundtruth"],
397
+ "annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
399
398
  "sequential": False,
400
399
  "generate_metadata": False,
401
400
  "auto_tagging": []
401
+ "sas_token": ""
402
402
  }
403
403
  dataset2 = project.create_dataset(**dataset_data2)
404
404
 
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  AUTHOR = "LinkerVision"
4
4
  PACKAGE_NAME = "dataverse-sdk"
5
- PACKAGE_VERSION = "2.1.2"
5
+ PACKAGE_VERSION = "2.2.0"
6
6
  DESC = "Dataverse SDK For Python"
7
7
  with open("README.md", encoding="utf-8") as fh:
8
8
  long_description = fh.read()
File without changes