dataverse-sdk 2.1.2__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/PKG-INFO +3 -3
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/README.md +2 -2
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/backend.py +0 -2
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/client.py +228 -97
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/coco.py +19 -9
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/visionai.py +19 -9
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/vqa.py +19 -9
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/yolo.py +19 -9
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/api.py +0 -1
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/client.py +5 -5
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/PKG-INFO +3 -3
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/setup.py +1 -1
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/apis/third_party.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/connections.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/constants.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/exceptions/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/exceptions/client.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/base.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/constant.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/exporter.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/export/utils.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/common.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/schemas/format.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/utils/__init__.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk/utils/utils.py +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/SOURCES.txt +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/dependency_links.txt +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/requires.txt +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/dataverse_sdk.egg-info/top_level.txt +0 -0
- {dataverse_sdk-2.1.2 → dataverse_sdk-2.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataverse-sdk
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Dataverse SDK For Python
|
|
5
5
|
Home-page:
|
|
6
6
|
Author: LinkerVision
|
|
@@ -339,7 +339,6 @@ dataset_data = {
|
|
|
339
339
|
"storage_url": "storage/url",
|
|
340
340
|
"container_name": "azure container name",
|
|
341
341
|
"data_folder": "datafolder/to/vai_anno",
|
|
342
|
-
"sensors": project.sensors,
|
|
343
342
|
"type": DatasetType.ANNOTATED_DATA,
|
|
344
343
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
345
344
|
"annotations": ["groundtruth"],
|
|
@@ -395,10 +394,11 @@ dataset_data2 = {
|
|
|
395
394
|
"sensors": project.sensors,
|
|
396
395
|
"type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
|
|
397
396
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
398
|
-
"annotations": ["groundtruth"],
|
|
397
|
+
"annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
|
|
399
398
|
"sequential": False,
|
|
400
399
|
"generate_metadata": False,
|
|
401
400
|
"auto_tagging": []
|
|
401
|
+
"sas_token": ""
|
|
402
402
|
}
|
|
403
403
|
dataset2 = project.create_dataset(**dataset_data2)
|
|
404
404
|
|
|
@@ -312,7 +312,6 @@ dataset_data = {
|
|
|
312
312
|
"storage_url": "storage/url",
|
|
313
313
|
"container_name": "azure container name",
|
|
314
314
|
"data_folder": "datafolder/to/vai_anno",
|
|
315
|
-
"sensors": project.sensors,
|
|
316
315
|
"type": DatasetType.ANNOTATED_DATA,
|
|
317
316
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
318
317
|
"annotations": ["groundtruth"],
|
|
@@ -368,10 +367,11 @@ dataset_data2 = {
|
|
|
368
367
|
"sensors": project.sensors,
|
|
369
368
|
"type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
|
|
370
369
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
371
|
-
"annotations": ["groundtruth"],
|
|
370
|
+
"annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
|
|
372
371
|
"sequential": False,
|
|
373
372
|
"generate_metadata": False,
|
|
374
373
|
"auto_tagging": []
|
|
374
|
+
"sas_token": ""
|
|
375
375
|
}
|
|
376
376
|
dataset2 = project.create_dataset(**dataset_data2)
|
|
377
377
|
|
|
@@ -358,7 +358,6 @@ class BackendAPI:
|
|
|
358
358
|
name: str,
|
|
359
359
|
data_source: str,
|
|
360
360
|
project_id: int,
|
|
361
|
-
sensor_ids: list[int],
|
|
362
361
|
type: str,
|
|
363
362
|
annotation_format: str,
|
|
364
363
|
storage_url: str,
|
|
@@ -382,7 +381,6 @@ class BackendAPI:
|
|
|
382
381
|
payload_data = {
|
|
383
382
|
"name": name,
|
|
384
383
|
"project_id": project_id,
|
|
385
|
-
"sensor_ids": sensor_ids,
|
|
386
384
|
"data_source": data_source,
|
|
387
385
|
"storage_url": storage_url,
|
|
388
386
|
"container_name": container_name,
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import json
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
|
-
|
|
5
|
+
import platform
|
|
6
|
+
from asyncio import AbstractEventLoop, Semaphore
|
|
5
7
|
from collections import deque
|
|
6
8
|
from pathlib import Path
|
|
7
9
|
from typing import Optional, Union
|
|
@@ -53,7 +55,13 @@ from .utils.utils import (
|
|
|
53
55
|
get_filepaths,
|
|
54
56
|
)
|
|
55
57
|
|
|
56
|
-
|
|
58
|
+
|
|
59
|
+
def is_macOS():
|
|
60
|
+
return platform.system() == "Darwin"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# to avoid the `Too many open files` error in macOS
|
|
64
|
+
MAX_CONCURRENT_FILES = 70 if is_macOS() else 100
|
|
57
65
|
|
|
58
66
|
|
|
59
67
|
def parse_attribute(attr_list: list) -> list:
|
|
@@ -1435,10 +1443,7 @@ of this project OR has been added before"
|
|
|
1435
1443
|
raise ClientConnectionError(f"Failed to get the dataset: {e}")
|
|
1436
1444
|
|
|
1437
1445
|
project = self.get_project(dataset_data["project"]["id"])
|
|
1438
|
-
|
|
1439
|
-
Sensor.create(sensor_data) for sensor_data in dataset_data["sensors"]
|
|
1440
|
-
]
|
|
1441
|
-
dataset_data.update({"project": project, "sensors": sensors})
|
|
1446
|
+
dataset_data.update({"project": project})
|
|
1442
1447
|
return Dataset(**dataset_data, client_alias=client_alias)
|
|
1443
1448
|
|
|
1444
1449
|
# TODO: required arguments for different DataSource
|
|
@@ -1447,7 +1452,6 @@ of this project OR has been added before"
|
|
|
1447
1452
|
name: str,
|
|
1448
1453
|
data_source: DataSource,
|
|
1449
1454
|
project: Project,
|
|
1450
|
-
sensors: list[Sensor],
|
|
1451
1455
|
type: DatasetType,
|
|
1452
1456
|
annotation_format: AnnotationFormat,
|
|
1453
1457
|
storage_url: str,
|
|
@@ -1464,6 +1468,7 @@ of this project OR has been added before"
|
|
|
1464
1468
|
client_alias: Optional[str] = None,
|
|
1465
1469
|
access_key_id: Optional[str] = None,
|
|
1466
1470
|
secret_access_key: Optional[str] = None,
|
|
1471
|
+
reupload_dataset_uuid: Optional[str] = None,
|
|
1467
1472
|
**kwargs,
|
|
1468
1473
|
) -> Dataset:
|
|
1469
1474
|
"""Create Dataset
|
|
@@ -1476,8 +1481,6 @@ of this project OR has been added before"
|
|
|
1476
1481
|
the DataSource basemodel of the given dataset
|
|
1477
1482
|
project : Project
|
|
1478
1483
|
Project basemodel
|
|
1479
|
-
sensors : list[Sensor]
|
|
1480
|
-
list of Sensor basemodel
|
|
1481
1484
|
type : DatasetType
|
|
1482
1485
|
datasettype (annotation or raw)
|
|
1483
1486
|
annotation_format : AnnotationFormat
|
|
@@ -1545,13 +1548,11 @@ of this project OR has been added before"
|
|
|
1545
1548
|
"Import data source must be LOCAL if host is not in DataverseHost."
|
|
1546
1549
|
)
|
|
1547
1550
|
|
|
1548
|
-
sensor_ids = [sensor.id for sensor in sensors]
|
|
1549
1551
|
project_id = project.id
|
|
1550
1552
|
try:
|
|
1551
1553
|
raw_dataset_data: dict = DatasetAPISchema(
|
|
1552
1554
|
name=name,
|
|
1553
1555
|
project_id=project_id,
|
|
1554
|
-
sensor_ids=sensor_ids,
|
|
1555
1556
|
data_source=data_source,
|
|
1556
1557
|
type=type,
|
|
1557
1558
|
annotation_format=annotation_format,
|
|
@@ -1576,14 +1577,14 @@ of this project OR has been added before"
|
|
|
1576
1577
|
|
|
1577
1578
|
if data_source == DataSource.LOCAL:
|
|
1578
1579
|
create_dataset_uuid = DataverseClient.upload_files_from_local(
|
|
1579
|
-
async_api, raw_dataset_data,
|
|
1580
|
+
async_api, api, raw_dataset_data, reupload_dataset_uuid
|
|
1580
1581
|
)
|
|
1581
1582
|
raw_dataset_data["create_dataset_uuid"] = create_dataset_uuid
|
|
1583
|
+
|
|
1582
1584
|
dataset_data = api.create_dataset(**raw_dataset_data)
|
|
1583
1585
|
dataset_data.update(
|
|
1584
1586
|
{
|
|
1585
1587
|
"project": project,
|
|
1586
|
-
"sensors": sensors,
|
|
1587
1588
|
"sequential": sequential,
|
|
1588
1589
|
"generate_metadata": generate_metadata,
|
|
1589
1590
|
"auto_tagging": auto_tagging,
|
|
@@ -1594,59 +1595,151 @@ of this project OR has been added before"
|
|
|
1594
1595
|
|
|
1595
1596
|
@staticmethod
|
|
1596
1597
|
def upload_files_from_local(
|
|
1597
|
-
async_api: AsyncBackendAPI,
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1598
|
+
async_api: AsyncBackendAPI,
|
|
1599
|
+
api: BackendAPI,
|
|
1600
|
+
raw_dataset_data: dict,
|
|
1601
|
+
reupload_dataset_uuid: Optional[str] = None,
|
|
1602
|
+
) -> str:
|
|
1603
|
+
def run_new_upload_tasks(
|
|
1604
|
+
data_folder: str,
|
|
1605
|
+
dataset_type: DatasetType,
|
|
1606
|
+
async_api_client: AsyncBackendAPI,
|
|
1607
|
+
event_loop: AbstractEventLoop,
|
|
1608
|
+
):
|
|
1609
|
+
print(f"Uploading new dataset from [{data_folder}]...")
|
|
1610
|
+
|
|
1611
|
+
# check folder structure
|
|
1612
|
+
required_data = DataverseClient._get_format_folders(
|
|
1613
|
+
annotation_format=raw_dataset_data["annotation_format"],
|
|
1614
|
+
dataset_type=dataset_type,
|
|
1615
|
+
project_id=raw_dataset_data["project_id"],
|
|
1616
|
+
api=api,
|
|
1617
|
+
)
|
|
1618
|
+
if required_data:
|
|
1619
|
+
for required_folder_or_file in required_data:
|
|
1620
|
+
path = os.path.join(data_folder, required_folder_or_file)
|
|
1621
|
+
if not os.path.exists(path):
|
|
1622
|
+
raise DataverseExceptionBase(
|
|
1623
|
+
type="",
|
|
1624
|
+
detail=f"Require the file or folder: {path} for {raw_dataset_data['annotation_format']}",
|
|
1625
|
+
)
|
|
1602
1626
|
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1627
|
+
file_paths = DataverseClient._find_all_paths(data_folder)
|
|
1628
|
+
(
|
|
1629
|
+
upload_task_queue,
|
|
1630
|
+
create_dataset_uuid,
|
|
1631
|
+
failed_urls,
|
|
1632
|
+
) = asyncio.run(
|
|
1633
|
+
DataverseClient.run_generate_presigned_urls(
|
|
1634
|
+
file_paths=file_paths, api=async_api_client, data_folder=data_folder
|
|
1635
|
+
)
|
|
1636
|
+
)
|
|
1637
|
+
if failed_urls:
|
|
1638
|
+
raise ClientConnectionError(
|
|
1639
|
+
f"unable to generate urls for: {failed_urls}"
|
|
1640
|
+
)
|
|
1641
|
+
|
|
1642
|
+
if not create_dataset_uuid:
|
|
1643
|
+
raise ClientConnectionError(
|
|
1644
|
+
"something went wrong, missing create dataset uuid"
|
|
1645
|
+
)
|
|
1646
|
+
|
|
1647
|
+
failed_file_info_batches = asyncio.run(
|
|
1648
|
+
DataverseClient.run_upload_tasks(upload_task_queue)
|
|
1649
|
+
)
|
|
1650
|
+
|
|
1651
|
+
return create_dataset_uuid, failed_file_info_batches
|
|
1652
|
+
|
|
1653
|
+
def run_reupload_tasks(
|
|
1654
|
+
reupload_dataset_uuid: str,
|
|
1655
|
+
provided_data_folder: str,
|
|
1656
|
+
event_loop: AbstractEventLoop,
|
|
1657
|
+
):
|
|
1658
|
+
print(f"Reuploading dataset from [{provided_data_folder}]...")
|
|
1659
|
+
|
|
1660
|
+
prev_failed_report_path = (
|
|
1661
|
+
Path.cwd() / "report" / reupload_dataset_uuid / "failed_upload.json"
|
|
1662
|
+
)
|
|
1663
|
+
|
|
1664
|
+
if not prev_failed_report_path.exists():
|
|
1665
|
+
raise DataverseExceptionBase(
|
|
1666
|
+
detail=(
|
|
1667
|
+
f"Failed upload report not found at [{prev_failed_report_path}]; "
|
|
1668
|
+
f"cannot proceed with reuploading dataset [{reupload_dataset_uuid}]."
|
|
1616
1669
|
)
|
|
1670
|
+
)
|
|
1617
1671
|
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1672
|
+
with open(prev_failed_report_path) as f:
|
|
1673
|
+
failed_report = json.load(f)
|
|
1674
|
+
|
|
1675
|
+
if provided_data_folder != (
|
|
1676
|
+
reupload_local_dataset_folder := failed_report.get(
|
|
1677
|
+
"local_dataset_folder"
|
|
1678
|
+
)
|
|
1679
|
+
):
|
|
1680
|
+
raise DataverseExceptionBase(
|
|
1681
|
+
detail=(
|
|
1682
|
+
f"The local dataset folder [{reupload_local_dataset_folder}] for the reupload does not match "
|
|
1683
|
+
f"the currently provided '--folder' [{provided_data_folder}].\n"
|
|
1684
|
+
f"To reupload dataset [{reupload_dataset_uuid}], "
|
|
1685
|
+
f"please set '--folder' to [{reupload_local_dataset_folder}]."
|
|
1686
|
+
)
|
|
1687
|
+
)
|
|
1688
|
+
|
|
1689
|
+
failed_file_info_list = failed_report["failed_file_info_list"]
|
|
1690
|
+
upload_task_queue = deque(failed_file_info_list)
|
|
1691
|
+
|
|
1692
|
+
failed_file_info_batches = asyncio.run(
|
|
1693
|
+
DataverseClient.run_upload_tasks(upload_task_queue)
|
|
1694
|
+
)
|
|
1695
|
+
if not failed_file_info_batches:
|
|
1696
|
+
prev_failed_report_path.unlink(missing_ok=True)
|
|
1697
|
+
|
|
1698
|
+
return reupload_dataset_uuid, failed_file_info_batches
|
|
1699
|
+
|
|
1700
|
+
data_folder = raw_dataset_data["data_folder"]
|
|
1701
|
+
loop = asyncio.get_event_loop()
|
|
1702
|
+
|
|
1703
|
+
create_dataset_uuid, failed_file_info_batches = (
|
|
1704
|
+
run_reupload_tasks(reupload_dataset_uuid, data_folder, loop)
|
|
1705
|
+
if reupload_dataset_uuid
|
|
1706
|
+
else run_new_upload_tasks(
|
|
1707
|
+
data_folder, raw_dataset_data["type"], async_api, loop
|
|
1622
1708
|
)
|
|
1623
1709
|
)
|
|
1624
|
-
if failed_urls:
|
|
1625
|
-
raise ClientConnectionError(f"unable to generate urls for: {failed_urls}")
|
|
1626
1710
|
|
|
1627
|
-
if
|
|
1628
|
-
|
|
1629
|
-
|
|
1711
|
+
if failed_file_info_batches:
|
|
1712
|
+
failed_report_path = (
|
|
1713
|
+
Path.cwd() / "report" / create_dataset_uuid / "failed_upload.json"
|
|
1630
1714
|
)
|
|
1715
|
+
failed_report_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1716
|
+
report = {
|
|
1717
|
+
"dataset_uuid": create_dataset_uuid,
|
|
1718
|
+
"local_dataset_folder": data_folder,
|
|
1719
|
+
"failed_file_info_list": failed_file_info_batches,
|
|
1720
|
+
}
|
|
1631
1721
|
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1722
|
+
with open(failed_report_path, "w") as f:
|
|
1723
|
+
json.dump(report, f)
|
|
1724
|
+
|
|
1725
|
+
raise ClientConnectionError(
|
|
1726
|
+
f"Failed to upload dataset.\n"
|
|
1727
|
+
f"A detailed failure report has been saved at: {failed_report_path}\n"
|
|
1728
|
+
f"To retry, import the dataset with the 'reupload_dataset_id' parameter set to [{create_dataset_uuid}]."
|
|
1729
|
+
)
|
|
1637
1730
|
return create_dataset_uuid
|
|
1638
1731
|
|
|
1639
1732
|
@staticmethod
|
|
1640
1733
|
async def run_generate_presigned_urls(
|
|
1641
1734
|
file_paths: list, api: AsyncBackendAPI, data_folder: str
|
|
1642
|
-
) -> tuple[deque, str, list[str]]:
|
|
1643
|
-
max_retry_count, batch_size, max_concurrent_api_calls =
|
|
1735
|
+
) -> tuple[deque[tuple[list[str], list[dict]]], str, list[str]]:
|
|
1736
|
+
max_retry_count, batch_size, max_concurrent_api_calls = 5, 500, 10
|
|
1644
1737
|
semaphore = asyncio.Semaphore(max_concurrent_api_calls)
|
|
1645
1738
|
|
|
1646
|
-
failed_urls = []
|
|
1647
|
-
upload_task_queue = deque()
|
|
1739
|
+
failed_urls: list[str] = []
|
|
1740
|
+
upload_task_queue: deque[tuple[list[str], list[dict]]] = deque()
|
|
1648
1741
|
|
|
1649
|
-
|
|
1742
|
+
data_folder_path = Path(data_folder).resolve()
|
|
1650
1743
|
create_dataset_uuid: str = str(uuid4())
|
|
1651
1744
|
|
|
1652
1745
|
async def generate_presigned_url_task(
|
|
@@ -1661,7 +1754,7 @@ of this project OR has been added before"
|
|
|
1661
1754
|
# Convert absolute file paths to relative paths
|
|
1662
1755
|
# i.e <long data folder path>/data/image.jpg -> /data/image.jpg
|
|
1663
1756
|
filtered_paths = [
|
|
1664
|
-
str(Path(path).relative_to(
|
|
1757
|
+
str(Path(path).relative_to(data_folder_path)).replace("\\", "/")
|
|
1665
1758
|
for path in batched_file_paths
|
|
1666
1759
|
]
|
|
1667
1760
|
async with semaphore:
|
|
@@ -1682,6 +1775,7 @@ of this project OR has been added before"
|
|
|
1682
1775
|
raise
|
|
1683
1776
|
except Exception as e:
|
|
1684
1777
|
logging.warning(f"Retrying batch due to error: {e}")
|
|
1778
|
+
await asyncio.sleep(retry_count**2)
|
|
1685
1779
|
await generate_presigned_url_task(
|
|
1686
1780
|
batched_file_paths, retry_count + 1
|
|
1687
1781
|
)
|
|
@@ -1696,56 +1790,89 @@ of this project OR has been added before"
|
|
|
1696
1790
|
return upload_task_queue, create_dataset_uuid, failed_urls
|
|
1697
1791
|
|
|
1698
1792
|
@staticmethod
|
|
1699
|
-
async def run_upload_tasks(upload_task_queue: deque
|
|
1793
|
+
async def run_upload_tasks(upload_task_queue: deque[tuple[list[str], list[dict]]]):
|
|
1794
|
+
async def upload_batch(
|
|
1795
|
+
paths: list[str],
|
|
1796
|
+
upload_infos: list[dict],
|
|
1797
|
+
async_client: AsyncThirdPartyAPI,
|
|
1798
|
+
semaphore: Semaphore,
|
|
1799
|
+
max_retry_count: int,
|
|
1800
|
+
progress_bar: tqdm_asyncio,
|
|
1801
|
+
) -> tuple[list[str], list[dict[str, str]]] | None:
|
|
1802
|
+
async def upload_file(path: str, info: dict):
|
|
1803
|
+
async with semaphore:
|
|
1804
|
+
try:
|
|
1805
|
+
async with aio_open(path, "rb") as file:
|
|
1806
|
+
file_content = await file.read()
|
|
1807
|
+
await async_client.upload_file(
|
|
1808
|
+
method="PUT",
|
|
1809
|
+
target_url=info["url"],
|
|
1810
|
+
file=file_content,
|
|
1811
|
+
content_type="application/octet-stream",
|
|
1812
|
+
)
|
|
1813
|
+
progress_bar.update(1)
|
|
1814
|
+
except Exception as e:
|
|
1815
|
+
logging.exception(e)
|
|
1816
|
+
return (path, info)
|
|
1817
|
+
|
|
1818
|
+
remaining_files = (file for file in zip(paths, upload_infos, strict=True))
|
|
1819
|
+
attempt_count = 1
|
|
1820
|
+
|
|
1821
|
+
while attempt_count <= max_retry_count:
|
|
1822
|
+
print(f"🔁 Upload file batch ({attempt_count}/{max_retry_count}) ...")
|
|
1823
|
+
|
|
1824
|
+
upload_tasks = (
|
|
1825
|
+
upload_file(path, info) for path, info in remaining_files
|
|
1826
|
+
)
|
|
1827
|
+
failed_files = await asyncio.gather(*upload_tasks)
|
|
1828
|
+
if not any(failed_files):
|
|
1829
|
+
print(
|
|
1830
|
+
f"✅ Upload file batch successful on attempt ({attempt_count}/{max_retry_count})"
|
|
1831
|
+
)
|
|
1832
|
+
return None
|
|
1833
|
+
|
|
1834
|
+
remaining_files = (file for file in failed_files if file)
|
|
1835
|
+
print(
|
|
1836
|
+
f"❌ Upload file batch failed on attempt ({attempt_count}/{max_retry_count})"
|
|
1837
|
+
)
|
|
1838
|
+
|
|
1839
|
+
await asyncio.sleep(attempt_count**2)
|
|
1840
|
+
attempt_count += 1
|
|
1841
|
+
|
|
1842
|
+
failed_files = list(remaining_files)
|
|
1843
|
+
failed_paths = [path for path, _ in failed_files]
|
|
1844
|
+
failed_remote_urls = [{"url": info["url"]} for _, info in failed_files]
|
|
1845
|
+
|
|
1846
|
+
return (failed_paths, failed_remote_urls)
|
|
1847
|
+
|
|
1700
1848
|
tasks = []
|
|
1701
1849
|
client = AsyncThirdPartyAPI()
|
|
1702
1850
|
semaphore = Semaphore(MAX_CONCURRENT_FILES)
|
|
1851
|
+
max_retry_count = 3
|
|
1703
1852
|
total_files = sum(len(paths) for paths, _ in upload_task_queue)
|
|
1704
1853
|
progress_bar = tqdm_asyncio(
|
|
1705
1854
|
total=total_files, desc="Uploading files", unit="file"
|
|
1706
1855
|
)
|
|
1707
|
-
for batched_file_paths, upload_file_infos in upload_task_queue:
|
|
1708
|
-
|
|
1709
|
-
async def upload_batch(
|
|
1710
|
-
paths: list[str],
|
|
1711
|
-
upload_infos: list[dict],
|
|
1712
|
-
async_client: AsyncThirdPartyAPI,
|
|
1713
|
-
) -> list[str]:
|
|
1714
|
-
failed_urls = []
|
|
1715
|
-
|
|
1716
|
-
async def upload_file(path: str, info: dict):
|
|
1717
|
-
async with semaphore:
|
|
1718
|
-
try:
|
|
1719
|
-
async with aio_open(path, "rb") as file:
|
|
1720
|
-
file_content = await file.read()
|
|
1721
|
-
await async_client.upload_file(
|
|
1722
|
-
method=info["method"],
|
|
1723
|
-
target_url=info["url"],
|
|
1724
|
-
file=file_content,
|
|
1725
|
-
content_type=info["content_type"],
|
|
1726
|
-
)
|
|
1727
|
-
except Exception as e:
|
|
1728
|
-
logging.exception(e)
|
|
1729
|
-
failed_urls.append(path)
|
|
1730
|
-
finally:
|
|
1731
|
-
progress_bar.update(1)
|
|
1732
|
-
|
|
1733
|
-
upload_tasks = [
|
|
1734
|
-
upload_file(path, info) for path, info in zip(paths, upload_infos)
|
|
1735
|
-
]
|
|
1736
|
-
|
|
1737
|
-
await asyncio.gather(*upload_tasks)
|
|
1738
|
-
|
|
1739
|
-
return failed_urls
|
|
1740
1856
|
|
|
1741
|
-
|
|
1857
|
+
for batched_file_paths, upload_file_infos in upload_task_queue:
|
|
1858
|
+
tasks.append(
|
|
1859
|
+
upload_batch(
|
|
1860
|
+
batched_file_paths,
|
|
1861
|
+
upload_file_infos,
|
|
1862
|
+
client,
|
|
1863
|
+
semaphore,
|
|
1864
|
+
max_retry_count,
|
|
1865
|
+
progress_bar,
|
|
1866
|
+
)
|
|
1867
|
+
)
|
|
1742
1868
|
|
|
1743
|
-
|
|
1869
|
+
failed_file_info_list: list[tuple[list[str], list[dict[str, str]]]] = []
|
|
1744
1870
|
for results in await tqdm_asyncio.gather(*tasks):
|
|
1745
|
-
|
|
1871
|
+
if results:
|
|
1872
|
+
failed_file_info_list.append(results)
|
|
1746
1873
|
|
|
1747
1874
|
progress_bar.close()
|
|
1748
|
-
return
|
|
1875
|
+
return failed_file_info_list
|
|
1749
1876
|
|
|
1750
1877
|
@staticmethod
|
|
1751
1878
|
def _find_all_paths(*paths) -> list[str]:
|
|
@@ -1756,13 +1883,18 @@ of this project OR has been added before"
|
|
|
1756
1883
|
|
|
1757
1884
|
@staticmethod
|
|
1758
1885
|
def _get_format_folders(
|
|
1759
|
-
annotation_format: AnnotationFormat,
|
|
1886
|
+
annotation_format: AnnotationFormat,
|
|
1887
|
+
dataset_type: DatasetType,
|
|
1888
|
+
project_id: int,
|
|
1889
|
+
api: BackendAPI,
|
|
1760
1890
|
) -> list[str]:
|
|
1761
1891
|
if annotation_format == AnnotationFormat.KITTI:
|
|
1892
|
+
project = api.get_project(project_id=project_id)
|
|
1893
|
+
sensors = project["sensors"]
|
|
1762
1894
|
if dataset_type == DatasetType.RAW_DATA:
|
|
1763
1895
|
return []
|
|
1764
1896
|
elif len(sensors) == 1:
|
|
1765
|
-
if sensors[0]
|
|
1897
|
+
if sensors[0]["type"] == SensorType.LIDAR: # one-lidar case
|
|
1766
1898
|
return ["label_2", "velodyne"]
|
|
1767
1899
|
else:
|
|
1768
1900
|
raise DataverseExceptionBase(
|
|
@@ -1791,24 +1923,23 @@ of this project OR has been added before"
|
|
|
1791
1923
|
|
|
1792
1924
|
class AsyncThirdPartyAPI:
|
|
1793
1925
|
transport = AsyncHTTPTransport(
|
|
1794
|
-
retries=
|
|
1926
|
+
retries=5,
|
|
1795
1927
|
)
|
|
1796
1928
|
|
|
1797
1929
|
def __init__(self):
|
|
1798
|
-
self.client = AsyncClient(transport=self.transport, timeout=Timeout(
|
|
1930
|
+
self.client = AsyncClient(transport=self.transport, timeout=Timeout(30))
|
|
1799
1931
|
|
|
1800
1932
|
async def async_send_request(self, url: str, method: str, **kwargs) -> Response:
|
|
1801
1933
|
try:
|
|
1802
1934
|
resp: Response = await self.client.request(method=method, url=url, **kwargs)
|
|
1803
|
-
|
|
1804
|
-
except Exception:
|
|
1935
|
+
except Exception as e:
|
|
1805
1936
|
logging.exception("async send request error")
|
|
1937
|
+
raise AsyncThirdPartyAPIException(detail="async send request error") from e
|
|
1806
1938
|
|
|
1807
1939
|
if not 200 <= resp.status_code <= 299:
|
|
1808
1940
|
raise AsyncThirdPartyAPIException(
|
|
1809
|
-
status_code=resp.status_code, detail=resp.
|
|
1941
|
+
status_code=resp.status_code, detail=resp.text
|
|
1810
1942
|
)
|
|
1811
|
-
|
|
1812
1943
|
return resp
|
|
1813
1944
|
|
|
1814
1945
|
async def upload_file(
|
|
@@ -39,16 +39,26 @@ class ExportCoco(ExportAnnotationBase):
|
|
|
39
39
|
url = datarow["url"]
|
|
40
40
|
file_path = os.path.join(COCO_IMAGE_PATH, datarow["unique_file_name"])
|
|
41
41
|
|
|
42
|
-
async def download_single(url, file_path):
|
|
42
|
+
async def download_single(url, file_path, max_retries=5, initial_delay=1):
|
|
43
43
|
async with semaphore:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
44
|
+
delay = initial_delay
|
|
45
|
+
for attempt in range(max_retries):
|
|
46
|
+
try:
|
|
47
|
+
async with session.get(url) as response:
|
|
48
|
+
response.raise_for_status()
|
|
49
|
+
img_bytes = await response.read()
|
|
50
|
+
return img_bytes, file_path
|
|
51
|
+
except Exception as e:
|
|
52
|
+
if attempt == max_retries - 1:
|
|
53
|
+
print(
|
|
54
|
+
f"Error downloading {url} after {max_retries} attempts: {e}"
|
|
55
|
+
)
|
|
56
|
+
return None
|
|
57
|
+
print(
|
|
58
|
+
f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
|
|
59
|
+
)
|
|
60
|
+
await asyncio.sleep(delay)
|
|
61
|
+
delay *= 2
|
|
52
62
|
|
|
53
63
|
tasks.append(download_single(url, file_path))
|
|
54
64
|
|
|
@@ -589,16 +589,26 @@ class ExportVisionAI(ExportAnnotationBase):
|
|
|
589
589
|
f"{frame_num:012d}{os.path.splitext(file_name)[-1]}",
|
|
590
590
|
)
|
|
591
591
|
|
|
592
|
-
async def download_single(url, file_path):
|
|
592
|
+
async def download_single(url, file_path, max_retries=5, initial_delay=1):
|
|
593
593
|
async with semaphore:
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
594
|
+
delay = initial_delay
|
|
595
|
+
for attempt in range(max_retries):
|
|
596
|
+
try:
|
|
597
|
+
async with session.get(url) as response:
|
|
598
|
+
response.raise_for_status()
|
|
599
|
+
img_bytes = await response.read()
|
|
600
|
+
return img_bytes, file_path
|
|
601
|
+
except Exception as e:
|
|
602
|
+
if attempt == max_retries - 1:
|
|
603
|
+
print(
|
|
604
|
+
f"Error downloading {url} after {max_retries} attempts: {e}"
|
|
605
|
+
)
|
|
606
|
+
return None
|
|
607
|
+
print(
|
|
608
|
+
f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
|
|
609
|
+
)
|
|
610
|
+
await asyncio.sleep(delay)
|
|
611
|
+
delay *= 2
|
|
602
612
|
|
|
603
613
|
tasks.append(download_single(url, file_path))
|
|
604
614
|
|
|
@@ -30,16 +30,26 @@ class ExportVQA(ExportAnnotationBase):
|
|
|
30
30
|
url = datarow["url"]
|
|
31
31
|
file_path = os.path.join("images", datarow["unique_file_name"])
|
|
32
32
|
|
|
33
|
-
async def download_single(url, file_path):
|
|
33
|
+
async def download_single(url, file_path, max_retries=5, initial_delay=1):
|
|
34
34
|
async with semaphore:
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
35
|
+
delay = initial_delay
|
|
36
|
+
for attempt in range(max_retries):
|
|
37
|
+
try:
|
|
38
|
+
async with session.get(url) as response:
|
|
39
|
+
response.raise_for_status()
|
|
40
|
+
img_bytes = await response.read()
|
|
41
|
+
return img_bytes, file_path
|
|
42
|
+
except Exception as e:
|
|
43
|
+
if attempt == max_retries - 1:
|
|
44
|
+
print(
|
|
45
|
+
f"Error downloading {url} after {max_retries} attempts: {e}"
|
|
46
|
+
)
|
|
47
|
+
return None
|
|
48
|
+
print(
|
|
49
|
+
f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
|
|
50
|
+
)
|
|
51
|
+
await asyncio.sleep(delay)
|
|
52
|
+
delay *= 2
|
|
43
53
|
|
|
44
54
|
tasks.append(download_single(url, file_path))
|
|
45
55
|
|
|
@@ -47,16 +47,26 @@ class ExportYolo(ExportAnnotationBase):
|
|
|
47
47
|
)
|
|
48
48
|
results.append((annot_bytes, anno_path))
|
|
49
49
|
|
|
50
|
-
async def download_single(url, file_path):
|
|
50
|
+
async def download_single(url, file_path, max_retries=5, initial_delay=1):
|
|
51
51
|
async with semaphore:
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
52
|
+
delay = initial_delay
|
|
53
|
+
for attempt in range(max_retries):
|
|
54
|
+
try:
|
|
55
|
+
async with session.get(url) as response:
|
|
56
|
+
response.raise_for_status()
|
|
57
|
+
img_bytes = await response.read()
|
|
58
|
+
return img_bytes, file_path
|
|
59
|
+
except Exception as e:
|
|
60
|
+
if attempt == max_retries - 1:
|
|
61
|
+
print(
|
|
62
|
+
f"Error downloading {url} after {max_retries} attempts: {e}"
|
|
63
|
+
)
|
|
64
|
+
return None
|
|
65
|
+
print(
|
|
66
|
+
f"Attempt {attempt + 1} failed for {url}: {e}. Retrying in {delay} seconds..."
|
|
67
|
+
)
|
|
68
|
+
await asyncio.sleep(delay)
|
|
69
|
+
delay *= 2
|
|
60
70
|
|
|
61
71
|
tasks.append(download_single(url, file_path))
|
|
62
72
|
|
|
@@ -292,7 +292,6 @@ class Project(BaseModel):
|
|
|
292
292
|
self,
|
|
293
293
|
name: str,
|
|
294
294
|
data_source: DataSource,
|
|
295
|
-
sensors: list[Sensor],
|
|
296
295
|
type: DatasetType,
|
|
297
296
|
annotation_format: AnnotationFormat,
|
|
298
297
|
storage_url: str,
|
|
@@ -307,6 +306,7 @@ class Project(BaseModel):
|
|
|
307
306
|
description: Optional[str] = None,
|
|
308
307
|
access_key_id: Optional[str] = None,
|
|
309
308
|
secret_access_key: Optional[str] = None,
|
|
309
|
+
reupload_dataset_uuid: Optional[str] = None,
|
|
310
310
|
**kwargs,
|
|
311
311
|
):
|
|
312
312
|
"""Create Dataset From project itself
|
|
@@ -317,8 +317,6 @@ class Project(BaseModel):
|
|
|
317
317
|
name of dataset
|
|
318
318
|
data_source : DataSource
|
|
319
319
|
the DataSource basemodel of the given dataset
|
|
320
|
-
sensors : list[Sensor]
|
|
321
|
-
list of Sensor basemodel
|
|
322
320
|
type : DatasetType
|
|
323
321
|
datasettype (annotation or raw)
|
|
324
322
|
annotation_format : AnnotationFormat
|
|
@@ -347,6 +345,9 @@ class Project(BaseModel):
|
|
|
347
345
|
access key id for AWS s3 bucket, by default None
|
|
348
346
|
secret_access_key : Optional[str], optional
|
|
349
347
|
secret access key for AWS s3 bucket, by default None
|
|
348
|
+
reupload_dataset_uuid: Optional[str], optional
|
|
349
|
+
dataset UUID of a previously failed local dataset import. If provided, the files that failed to upload
|
|
350
|
+
(as recorded in `failed_upload.json`) will be re-uploaded, by default None
|
|
350
351
|
|
|
351
352
|
Returns
|
|
352
353
|
-------
|
|
@@ -369,7 +370,6 @@ class Project(BaseModel):
|
|
|
369
370
|
name=name,
|
|
370
371
|
data_source=data_source,
|
|
371
372
|
project=self,
|
|
372
|
-
sensors=sensors,
|
|
373
373
|
type=type,
|
|
374
374
|
annotation_format=annotation_format,
|
|
375
375
|
storage_url=storage_url,
|
|
@@ -385,6 +385,7 @@ class Project(BaseModel):
|
|
|
385
385
|
access_key_id=access_key_id,
|
|
386
386
|
secret_access_key=secret_access_key,
|
|
387
387
|
client_alias=self.client_alias,
|
|
388
|
+
reupload_dataset_uuid=reupload_dataset_uuid,
|
|
388
389
|
**kwargs,
|
|
389
390
|
)
|
|
390
391
|
return dataset_output
|
|
@@ -393,7 +394,6 @@ class Project(BaseModel):
|
|
|
393
394
|
class Dataset(BaseModel):
|
|
394
395
|
id: Optional[int] = None
|
|
395
396
|
project: Project
|
|
396
|
-
sensors: list[Sensor]
|
|
397
397
|
name: str
|
|
398
398
|
type: DatasetType
|
|
399
399
|
data_source: DataSource
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataverse-sdk
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Dataverse SDK For Python
|
|
5
5
|
Home-page:
|
|
6
6
|
Author: LinkerVision
|
|
@@ -339,7 +339,6 @@ dataset_data = {
|
|
|
339
339
|
"storage_url": "storage/url",
|
|
340
340
|
"container_name": "azure container name",
|
|
341
341
|
"data_folder": "datafolder/to/vai_anno",
|
|
342
|
-
"sensors": project.sensors,
|
|
343
342
|
"type": DatasetType.ANNOTATED_DATA,
|
|
344
343
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
345
344
|
"annotations": ["groundtruth"],
|
|
@@ -395,10 +394,11 @@ dataset_data2 = {
|
|
|
395
394
|
"sensors": project.sensors,
|
|
396
395
|
"type": DatasetType.ANNOTATED_DATA, # or DatasetType.RAW_DATA for images
|
|
397
396
|
"annotation_format": AnnotationFormat.VISION_AI,
|
|
398
|
-
"annotations": ["groundtruth"],
|
|
397
|
+
"annotations": ["groundtruth"], # remove it when type is DatasetType.RAW_DATA
|
|
399
398
|
"sequential": False,
|
|
400
399
|
"generate_metadata": False,
|
|
401
400
|
"auto_tagging": []
|
|
401
|
+
"sas_token": ""
|
|
402
402
|
}
|
|
403
403
|
dataset2 = project.create_dataset(**dataset_data2)
|
|
404
404
|
|
|
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
|
|
|
2
2
|
|
|
3
3
|
AUTHOR = "LinkerVision"
|
|
4
4
|
PACKAGE_NAME = "dataverse-sdk"
|
|
5
|
-
PACKAGE_VERSION = "2.1.2"
|
|
5
|
+
PACKAGE_VERSION = "2.2.0"
|
|
6
6
|
DESC = "Dataverse SDK For Python"
|
|
7
7
|
with open("README.md", encoding="utf-8") as fh:
|
|
8
8
|
long_description = fh.read()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|