hafnia-0.4.3-py3-none-any.whl → hafnia-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hafnia/dataset/dataset_details_uploader.py +41 -54
- hafnia/dataset/dataset_helpers.py +1 -15
- hafnia/dataset/dataset_names.py +17 -3
- hafnia/dataset/format_conversions/torchvision_datasets.py +6 -3
- hafnia/dataset/hafnia_dataset.py +99 -24
- hafnia/dataset/hafnia_dataset_types.py +3 -1
- hafnia/dataset/operations/dataset_s3_storage.py +211 -0
- hafnia/dataset/operations/table_transformations.py +2 -1
- hafnia/http.py +2 -1
- hafnia/platform/datasets.py +196 -105
- hafnia/platform/s5cmd_utils.py +147 -0
- hafnia/utils.py +4 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.0.dist-info}/METADATA +3 -3
- {hafnia-0.4.3.dist-info → hafnia-0.5.0.dist-info}/RECORD +19 -17
- {hafnia-0.4.3.dist-info → hafnia-0.5.0.dist-info}/WHEEL +1 -1
- hafnia_cli/dataset_cmds.py +18 -0
- hafnia_cli/profile_cmds.py +0 -1
- {hafnia-0.4.3.dist-info → hafnia-0.5.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/operations/dataset_s3_storage.py ADDED

@@ -0,0 +1,211 @@
+import tempfile
+import time
+from pathlib import Path
+from typing import Dict, Optional
+
+import polars as pl
+
+from hafnia.dataset.dataset_helpers import hash_file_xxhash
+from hafnia.dataset.dataset_names import (
+    DatasetVariant,
+    ResourceCredentials,
+    SampleField,
+)
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.log import user_logger
+from hafnia.platform import s5cmd_utils
+from hafnia.platform.datasets import get_upload_credentials
+from hafnia.utils import progress_bar
+from hafnia_cli.config import Config
+
+
+def delete_hafnia_dataset_files_on_platform(
+    dataset_name: str,
+    interactive: bool = True,
+    cfg: Optional[Config] = None,
+) -> bool:
+    cfg = cfg or Config()
+    resource_credentials = get_upload_credentials(dataset_name, cfg=cfg)
+
+    if resource_credentials is None:
+        raise RuntimeError("Failed to get upload credentials from the platform.")
+
+    return delete_hafnia_dataset_files_from_resource_credentials(
+        interactive=interactive,
+        resource_credentials=resource_credentials,
+    )
+
+
+def delete_hafnia_dataset_files_from_resource_credentials(
+    resource_credentials: ResourceCredentials,
+    interactive: bool = True,
+) -> bool:
+    envs = resource_credentials.aws_credentials()
+    bucket_name = resource_credentials.bucket_name()
+    if interactive:
+        confirmation = (
+            input(
+                f"WARNING THIS WILL delete all files stored in 's3://{bucket_name}'.\n"
+                "Meaning that all previous versions of the dataset will be deleted. \n"
+                "Normally this is not needed, but if you have changed the dataset structure or want to start from fresh, "
+                "you can delete all files in the S3 bucket. "
+                "\nDo you really want to delete all files? (yes/NO): "
+            )
+            .strip()
+            .lower()
+        )
+        if confirmation != "yes":
+            user_logger.info("Delete operation cancelled by the user.")
+            return False
+    user_logger.info(f"Deleting all files in S3 bucket '{bucket_name}'...")
+    s5cmd_utils.delete_bucket_content(bucket_prefix=f"s3://{bucket_name}", remove_bucket=True, append_envs=envs)
+    return True
+
+
+def sync_hafnia_dataset_to_s3(
+    dataset: HafniaDataset,
+    bucket_prefix: str,
+    allow_version_overwrite: bool = False,
+    interactive: bool = True,
+    envs: Optional[Dict[str, str]] = None,
+) -> None:
+    t0 = time.time()
+    # bucket_prefix e.g. 's3://bucket-name/sample'
+    remote_paths = []
+    for file_str in progress_bar(dataset.samples[SampleField.FILE_PATH], description="Hashing data files"):
+        path_file = Path(file_str)
+        file_hash = hash_file_xxhash(path_file)
+
+        # Relative path in S3 bucket e.g. 'data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+        relative_path = s3_prefix_from_hash(hash=file_hash, suffix=path_file.suffix)
+
+        # Remote path in S3 bucket e.g. 's3://bucket-name/sample/data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+        remote_path = f"{bucket_prefix}/{relative_path}"
+        remote_paths.append(remote_path)
+
+    dataset.samples = dataset.samples.with_columns(pl.Series(remote_paths).alias(SampleField.REMOTE_PATH))
+
+    user_logger.info(f"Syncing dataset to S3 bucket '{bucket_prefix}'")
+    files_in_s3 = set(s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix, append_envs=envs))
+
+    # Discover data files (images, videos, etc.) missing in s3
+    data_files_missing = dataset.samples.filter(~pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+    files_already_in_s3 = dataset.samples.filter(pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+
+    with tempfile.TemporaryDirectory() as temp_dir:  # Temp folder to store metadata files
+        path_temp = Path(temp_dir)
+        # File paths are dropped when uploading to S3
+        dataset = dataset.update_samples(dataset.samples.drop(SampleField.FILE_PATH))
+        dataset.write_annotations(path_temp)
+
+        # Discover versioned metadata files (e.g. "annotations.jsonl", "dataset_info.json") missing in s3
+        metadata_files_local = []
+        metadata_files_s3 = []
+        for filename in path_temp.iterdir():
+            metadata_files_s3.append(f"{bucket_prefix}/versions/{dataset.info.version}/{filename.name}")
+            metadata_files_local.append(filename.as_posix())
+
+        overwrite_metadata_files = files_in_s3.intersection(set(metadata_files_s3))
+        will_overwrite_metadata_files = len(overwrite_metadata_files) > 0
+
+        n_files_already_in_s3 = len(files_already_in_s3)
+        user_logger.info(f"Sync dataset to {bucket_prefix}")
+        user_logger.info(
+            f"- Found that {n_files_already_in_s3} / {len(dataset.samples)} data files already exist. "
+            f"Meaning {len(data_files_missing)} data files will be uploaded. \n"
+            f"- Will upload {len(metadata_files_local)} metadata files. \n"
+            f"- Total files to upload: {len(data_files_missing) + len(metadata_files_local)}"
+        )
+        if will_overwrite_metadata_files:
+            msg = f"Metadata files for dataset version '{dataset.info.version}' already exist"
+            if allow_version_overwrite:
+                user_logger.warning(
+                    f"- WARNING: {msg}. Version will be overwritten as 'allow_version_overwrite=True' is set."
+                )
+            else:
+                raise ValueError(
+                    f"Upload cancelled. {msg}. \nTo overwrite existing metadata files, "
+                    "you will need to set 'allow_version_overwrite=True' explicitly."
+                )
+
+        has_missing_files = len(data_files_missing) > 0
+        if interactive and (has_missing_files or will_overwrite_metadata_files):
+            print("Please type 'yes' to upload files.")
+            confirmation = input("Do you want to continue? (yes/NO): ").strip().lower()
+
+            if confirmation != "yes":
+                raise RuntimeError("Upload cancelled by user.")
+
+        local_paths = metadata_files_local + data_files_missing[SampleField.FILE_PATH].to_list()
+        s3_paths = metadata_files_s3 + data_files_missing[SampleField.REMOTE_PATH].to_list()
+        s5cmd_utils.fast_copy_files(local_paths, s3_paths, append_envs=envs, description="Uploading files")
+    user_logger.info(f"- Synced dataset in {time.time() - t0:.2f} seconds.")
+
+
+def sync_dataset_files_to_platform(
+    dataset: HafniaDataset,
+    sample_dataset: Optional[HafniaDataset] = None,
+    interactive: bool = True,
+    allow_version_overwrite: bool = False,
+    cfg: Optional[Config] = None,
+) -> None:
+    cfg = cfg or Config()
+    resource_credentials = get_upload_credentials(dataset.info.dataset_name, cfg=cfg)
+
+    if resource_credentials is None:
+        raise RuntimeError("Failed to get upload credentials from the platform.")
+
+    sync_dataset_files_to_platform_from_resource_credentials(
+        dataset=dataset,
+        sample_dataset=sample_dataset,
+        interactive=interactive,
+        allow_version_overwrite=allow_version_overwrite,
+        resource_credentials=resource_credentials,
+    )
+
+
+def sync_dataset_files_to_platform_from_resource_credentials(
+    dataset: HafniaDataset,
+    sample_dataset: Optional[HafniaDataset],
+    interactive: bool,
+    allow_version_overwrite: bool,
+    resource_credentials: ResourceCredentials,
+):
+    envs = resource_credentials.aws_credentials()
+    bucket_name = resource_credentials.bucket_name()
+
+    for dataset_variant_type in [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]:
+        if dataset_variant_type == DatasetVariant.SAMPLE:
+            if sample_dataset is None:
+                dataset_variant = dataset.create_sample_dataset()
+            else:
+                dataset_variant = sample_dataset
+        else:
+            dataset_variant = dataset
+
+        sync_hafnia_dataset_to_s3(
+            dataset=dataset_variant,
+            bucket_prefix=f"s3://{bucket_name}/{dataset_variant_type.value}",
+            interactive=interactive,
+            allow_version_overwrite=allow_version_overwrite,
+            envs=envs,
+        )
+
+
+def s3_prefix_from_hash(hash: str, suffix: str) -> str:
+    """
+    Generate a relative S3 path from a hash value for objects stored in S3.
+
+    This function deliberately uses a hierarchical directory layout based on the
+    hash prefix to avoid putting too many objects in a single S3 prefix, which
+    can run into AWS S3 rate limits and performance issues. For example, for
+    hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4" and suffix ".png", the returned
+    path will be:
+
+        "data/df/e8/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"
+
+    Note: This intentionally differs from when images are stored to disk locally, where
+    a flat path of the form ``data/<hash><suffix>`` is used.
+    """
+    s3_prefix = f"data/{hash[:2]}/{hash[2:4]}/{hash}{suffix}"
+    return s3_prefix
hafnia/dataset/operations/table_transformations.py CHANGED

@@ -45,7 +45,8 @@ def create_primitive_table(
         remove_no_object_frames = remove_no_object_frames.drop(drop_columns_names)
         # Rename columns "height", "width" and "meta" for sample to avoid conflicts with object fields names
         remove_no_object_frames = remove_no_object_frames.rename(
-            {"height": "image.height", "width": "image.width", "meta": "image.meta"}
+            {"height": "image.height", "width": "image.width", "meta": "image.meta"},
+            strict=False,
         )
         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
     else:
hafnia/http.py CHANGED

@@ -24,7 +24,8 @@ def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Union[
     try:
         response = http.request("GET", endpoint, fields=params, headers=headers)
         if response.status != 200:
-
+            error_details = response.data.decode("utf-8")
+            raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")

         return json.loads(response.data.decode("utf-8"))
     finally:
hafnia/platform/datasets.py CHANGED

@@ -1,29 +1,85 @@
-import
+import collections
 import shutil
-import subprocess
-import sys
-import tempfile
-import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional

 import rich
+from packaging.version import Version
 from rich import print as rprint

 from hafnia import http, utils
-from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ResourceCredentials
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
     get_dataset_path_from_recipe,
 )
 from hafnia.dataset.hafnia_dataset import HafniaDataset
-from hafnia.http import fetch
-from hafnia.log import
+from hafnia.http import fetch, post
+from hafnia.log import user_logger
+from hafnia.platform import s5cmd_utils
 from hafnia.platform.download import get_resource_credentials
-from hafnia.utils import
+from hafnia.utils import timed
 from hafnia_cli.config import Config


+@timed("Fetching dataset by name.")
+def get_dataset_by_name(dataset_name: str, cfg: Optional[Config] = None) -> Optional[Dict[str, Any]]:
+    """Get dataset details by name from the Hafnia platform."""
+    cfg = cfg or Config()
+    endpoint_dataset = cfg.get_platform_endpoint("datasets")
+    header = {"Authorization": cfg.api_key}
+    full_url = f"{endpoint_dataset}?name__iexact={dataset_name}"
+    datasets: List[Dict[str, Any]] = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+    if len(datasets) == 0:
+        return None
+
+    if len(datasets) > 1:
+        raise ValueError(f"Multiple datasets found with the name '{dataset_name}'.")
+
+    return datasets[0]
+
+
+@timed("Fetching dataset by ID.")
+def get_dataset_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Optional[Dict[str, Any]]:
+    """Get dataset details by ID from the Hafnia platform."""
+    cfg = cfg or Config()
+    endpoint_dataset = cfg.get_platform_endpoint("datasets")
+    header = {"Authorization": cfg.api_key}
+    full_url = f"{endpoint_dataset}/{dataset_id}"
+    dataset: Dict[str, Any] = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+    if not dataset:
+        return None
+
+    return dataset
+
+
+def get_or_create_dataset(dataset_name: str = "", cfg: Optional[Config] = None) -> Dict[str, Any]:
+    """Create a new dataset on the Hafnia platform."""
+    cfg = cfg or Config()
+    dataset = get_dataset_by_name(dataset_name, cfg)
+
+    if dataset is not None:
+        user_logger.info(f"Dataset '{dataset_name}' already exists on the Hafnia platform.")
+        return dataset
+
+    endpoint_dataset = cfg.get_platform_endpoint("datasets")
+    header = {"Authorization": cfg.api_key}
+    dataset_title = dataset_name.replace("-", " ").title()  # convert dataset-name to title "Dataset Name"
+    payload = {
+        "title": dataset_title,
+        "name": dataset_name,
+        "overview": "No description provided.",
+    }
+
+    dataset = http.post(endpoint_dataset, headers=header, data=payload)  # type: ignore[assignment]
+
+    # TODO: Handle issue when dataset creation fails because name is taken by another user from a different organization
+    if not dataset:
+        raise ValueError("Failed to create dataset on the Hafnia platform. ")
+
+    return dataset
+
+
 @timed("Fetching dataset list.")
 def get_datasets(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
     """List available datasets on the Hafnia platform."""
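
A usage sketch of the new lookup/creation helpers; the dataset name is hypothetical and a configured `Config` profile with a valid API key is assumed:

from hafnia.platform.datasets import get_dataset_by_name, get_or_create_dataset
from hafnia_cli.config import Config

cfg = Config()

existing = get_dataset_by_name("my-dataset", cfg)       # returns None if the name is unknown
dataset = get_or_create_dataset("my-dataset", cfg=cfg)  # creates it with a generated title when missing
print(dataset["id"])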
@@ -50,6 +106,80 @@ def get_dataset_id(dataset_name: str, endpoint: str, api_key: str) -> str:
         raise ValueError("Dataset information is missing or invalid") from e


+@timed("Get upload access credentials")
+def get_upload_credentials(dataset_name: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+    """Get dataset details by name from the Hafnia platform."""
+    cfg = cfg or Config()
+    dataset_response = get_dataset_by_name(dataset_name=dataset_name, cfg=cfg)
+    if dataset_response is None:
+        return None
+
+    return get_upload_credentials_by_id(dataset_response["id"], cfg=cfg)
+
+
+@timed("Get upload access credentials by ID")
+def get_upload_credentials_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+    """Get dataset details by ID from the Hafnia platform."""
+    cfg = cfg or Config()
+
+    endpoint_dataset = cfg.get_platform_endpoint("datasets")
+    header = {"Authorization": cfg.api_key}
+    full_url = f"{endpoint_dataset}/{dataset_id}/temporary-credentials-upload"
+    credentials_response: Dict = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+
+    return ResourceCredentials.fix_naming(credentials_response)
+
+
+@timed("Delete dataset by id")
+def delete_dataset_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Dict:
+    cfg = cfg or Config()
+    endpoint_dataset = cfg.get_platform_endpoint("datasets")
+    header = {"Authorization": cfg.api_key}
+    full_url = f"{endpoint_dataset}/{dataset_id}"
+    return http.delete(full_url, headers=header)  # type: ignore
+
+
+@timed("Delete dataset by name")
+def delete_dataset_by_name(dataset_name: str, cfg: Optional[Config] = None) -> Dict:
+    cfg = cfg or Config()
+    dataset_response = get_dataset_by_name(dataset_name=dataset_name, cfg=cfg)
+    if dataset_response is None:
+        raise ValueError(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
+
+    dataset_id = dataset_response["id"]  # type: ignore[union-attr]
+    response = delete_dataset_by_id(dataset_id=dataset_id, cfg=cfg)
+    user_logger.info(f"Dataset '{dataset_name}' has been deleted from the Hafnia platform.")
+    return response
+
+
+def delete_dataset_completely_by_name(dataset_name: str, interactive: bool = True) -> None:
+    from hafnia.dataset.operations.dataset_s3_storage import delete_hafnia_dataset_files_on_platform
+
+    cfg = Config()
+
+    is_deleted = delete_hafnia_dataset_files_on_platform(
+        dataset_name=dataset_name,
+        interactive=interactive,
+        cfg=cfg,
+    )
+    if not is_deleted:
+        return
+    delete_dataset_by_name(dataset_name, cfg=cfg)
+
+
+@timed("Import dataset details to platform")
+def upload_dataset_details(cfg: Config, data: dict, dataset_name: str) -> dict:
+    dataset_endpoint = cfg.get_platform_endpoint("datasets")
+    dataset_id = get_dataset_id(dataset_name, dataset_endpoint, cfg.api_key)
+
+    import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
+    headers = {"Authorization": cfg.api_key}
+
+    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
+    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+    return response  # type: ignore[return-value]
+
+
 def download_or_get_dataset_path(
     dataset_name: str,
     cfg: Optional[Config] = None,
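
A sketch of fetching temporary upload credentials and turning them into environment variables for s5cmd, as the new sync and delete helpers do; the dataset name is a placeholder:

from hafnia.platform.datasets import get_upload_credentials
from hafnia_cli.config import Config

cfg = Config()
credentials = get_upload_credentials("my-dataset", cfg=cfg)
if credentials is None:
    raise RuntimeError("Dataset not found, so no upload credentials could be issued.")

envs = credentials.aws_credentials()  # AWS env vars passed to s5cmd_utils via append_envs
bucket = credentials.bucket_name()    # used to build prefixes such as f"s3://{bucket}/sample"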
@@ -72,9 +202,11 @@ def download_or_get_dataset_path(
         shutil.rmtree(path_dataset, ignore_errors=True)

     endpoint_dataset = cfg.get_platform_endpoint("datasets")
-
-    if
-
+    dataset_res = get_dataset_by_name(dataset_name, cfg)  # Check if dataset exists
+    if dataset_res is None:
+        raise ValueError(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
+
+    dataset_id = dataset_res.get("id")  # type: ignore[union-attr]

     if utils.is_hafnia_cloud_job():
         credentials_endpoint_suffix = "temporary-credentials-hidden"  # Access to hidden datasets
@@ -95,22 +227,17 @@ def download_dataset_from_access_endpoint(
     endpoint: str,
     api_key: str,
     path_dataset: Path,
+    version: Optional[str] = None,
     download_files: bool = True,
 ) -> None:
-    resource_credentials = get_resource_credentials(endpoint, api_key)
-
-    local_dataset_paths = [(path_dataset / filename).as_posix() for filename in DATASET_FILENAMES_REQUIRED]
-    s3_uri = resource_credentials.s3_uri()
-    s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]
-
-    envs = resource_credentials.aws_credentials()
     try:
-
-
-
-
-
+        resource_credentials = get_resource_credentials(endpoint, api_key)
+        download_annotation_dataset_from_version(
+            version=version,
+            credentials=resource_credentials,
+            path_dataset=path_dataset,
         )
+
     except ValueError as e:
         user_logger.error(f"Failed to download annotations: {e}")
         return
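
A sketch of the updated download entry point with an explicit version pin; the endpoint, API key, and version below are placeholders that would normally come from the platform configuration:

from pathlib import Path

from hafnia.platform.datasets import download_dataset_from_access_endpoint

download_dataset_from_access_endpoint(
    endpoint="<temporary-credentials-endpoint>",  # placeholder
    api_key="<API-KEY>",                          # placeholder
    path_dataset=Path("./data/my-dataset"),
    version="1.2.0",       # pin a specific annotation version; None selects the latest
    download_files=True,
)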
@@ -126,87 +253,6 @@ def download_dataset_from_access_endpoint(
     dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded


-def fast_copy_files_s3(
-    src_paths: List[str],
-    dst_paths: List[str],
-    append_envs: Optional[Dict[str, str]] = None,
-    description: str = "Copying files",
-) -> List[str]:
-    if len(src_paths) != len(dst_paths):
-        raise ValueError("Source and destination paths must have the same length.")
-    cmds = [f"cp {src} {dst}" for src, dst in zip(src_paths, dst_paths)]
-    lines = execute_s5cmd_commands(cmds, append_envs=append_envs, description=description)
-    return lines
-
-
-def find_s5cmd() -> Optional[str]:
-    """Locate the s5cmd executable across different installation methods.
-
-    Searches for s5cmd in:
-    1. System PATH (via shutil.which)
-    2. Python bin directory (Unix-like systems)
-    3. Python executable directory (direct installs)
-
-    Returns:
-        str: Absolute path to s5cmd executable if found, None otherwise.
-    """
-    result = shutil.which("s5cmd")
-    if result:
-        return result
-    python_dir = Path(sys.executable).parent
-    locations = (python_dir / "Scripts" / "s5cmd.exe", python_dir / "bin" / "s5cmd", python_dir / "s5cmd")
-    for loc in locations:
-        if loc.exists():
-            return str(loc)
-    return None
-
-
-def execute_s5cmd_commands(
-    commands: List[str],
-    append_envs: Optional[Dict[str, str]] = None,
-    description: str = "Executing s5cmd commands",
-) -> List[str]:
-    append_envs = append_envs or {}
-    # In Windows default "Temp" directory can not be deleted that is why we need to create a
-    # temporary directory.
-    with tempfile.TemporaryDirectory() as temp_dir:
-        tmp_file_path = Path(temp_dir, f"{uuid.uuid4().hex}.txt")
-        tmp_file_path.write_text("\n".join(commands))
-
-        s5cmd_bin = find_s5cmd()
-        if s5cmd_bin is None:
-            raise ValueError("Can not find s5cmd executable.")
-        run_cmds = [s5cmd_bin, "run", str(tmp_file_path)]
-        sys_logger.debug(run_cmds)
-        envs = os.environ.copy()
-        envs.update(append_envs)
-
-        process = subprocess.Popen(
-            run_cmds,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            universal_newlines=True,
-            env=envs,
-        )
-
-        error_lines = []
-        lines = []
-        for line in progress_bar(process.stdout, total=len(commands), description=description):  # type: ignore[arg-type]
-            if "ERROR" in line or "error" in line:
-                error_lines.append(line.strip())
-            lines.append(line.strip())
-
-        if len(error_lines) > 0:
-            show_n_lines = min(5, len(error_lines))
-            str_error_lines = "\n".join(error_lines[:show_n_lines])
-            user_logger.error(
-                f"Detected {len(error_lines)} errors occurred while executing a total of {len(commands)} "
-                f" commands with s5cmd. The first {show_n_lines} is printed below:\n{str_error_lines}"
-            )
-            raise RuntimeError("Errors occurred during s5cmd execution.")
-    return lines
-
-
 TABLE_FIELDS = {
     "ID": "id",
     "Hidden\nSamples": "hidden.samples",
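
The helpers removed above are now consumed through `hafnia.platform.s5cmd_utils`; a minimal sketch of the call shape used elsewhere in this diff, with placeholder paths and no extra credentials:

from hafnia.platform import s5cmd_utils

# Placeholder paths; real calls pass AWS credentials via append_envs.
src_paths = ["./annotations.jsonl"]
dst_paths = ["s3://example-bucket/versions/1.0.0/annotations.jsonl"]

s5cmd_utils.fast_copy_files(
    src_paths=src_paths,
    dst_paths=dst_paths,
    append_envs=None,
    description="Uploading files",
)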
@@ -241,3 +287,48 @@ def extend_dataset_details(datasets: List[Dict[str, Any]]) -> List[Dict[str, Any
         dataset[f"{variant_type}.samples"] = variant["number_of_data_items"]
         dataset[f"{variant_type}.size"] = utils.size_human_readable(variant["size_bytes"])
     return datasets
+
+
+def download_annotation_dataset_from_version(
+    version: Optional[str],
+    credentials: ResourceCredentials,
+    path_dataset: Path,
+) -> list[str]:
+    path_dataset.mkdir(parents=True, exist_ok=True)
+
+    envs = credentials.aws_credentials()
+    bucket_prefix_sample_versions = f"{credentials.s3_uri()}/versions"
+    all_s3_annotation_files = s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix_sample_versions, append_envs=envs)
+    s3_files = _annotation_files_from_version(version=version, all_annotation_files=all_s3_annotation_files)
+
+    local_paths = [(path_dataset / filename.split("/")[-1]).as_posix() for filename in s3_files]
+    s5cmd_utils.fast_copy_files(
+        src_paths=s3_files,
+        dst_paths=local_paths,
+        append_envs=envs,
+        description="Downloading annotation files",
+    )
+    return local_paths
+
+
+def _annotation_files_from_version(version: Optional[str], all_annotation_files: list[str]) -> list[str]:
+    version_files = collections.defaultdict(list)
+    for metadata_file in all_annotation_files:
+        version_str, filename = metadata_file.split("/")[-2:]
+        if filename not in DATASET_FILENAMES_REQUIRED:
+            continue
+        version_files[version_str].append(metadata_file)
+    available_versions = {v for v, files in version_files.items() if len(files) == len(DATASET_FILENAMES_REQUIRED)}
+
+    if len(available_versions) == 0:
+        raise ValueError("No versions were found in the dataset.")
+
+    if version is None:
+        latest_version = max(Version(ver) for ver in available_versions)
+        version = str(latest_version)
+        user_logger.info(f"No version selected. Using latest version: {version}")
+
+    if version not in available_versions:
+        raise ValueError(f"Selected version '{version}' not found in available versions: {available_versions}")
+
+    return version_files[version]