hafnia 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hafnia/dataset/operations/dataset_s3_storage.py ADDED
@@ -0,0 +1,211 @@
+ import tempfile
+ import time
+ from pathlib import Path
+ from typing import Dict, Optional
+
+ import polars as pl
+
+ from hafnia.dataset.dataset_helpers import hash_file_xxhash
+ from hafnia.dataset.dataset_names import (
+     DatasetVariant,
+     ResourceCredentials,
+     SampleField,
+ )
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
+ from hafnia.log import user_logger
+ from hafnia.platform import s5cmd_utils
+ from hafnia.platform.datasets import get_upload_credentials
+ from hafnia.utils import progress_bar
+ from hafnia_cli.config import Config
+
+
+ def delete_hafnia_dataset_files_on_platform(
+     dataset_name: str,
+     interactive: bool = True,
+     cfg: Optional[Config] = None,
+ ) -> bool:
+     cfg = cfg or Config()
+     resource_credentials = get_upload_credentials(dataset_name, cfg=cfg)
+
+     if resource_credentials is None:
+         raise RuntimeError("Failed to get upload credentials from the platform.")
+
+     return delete_hafnia_dataset_files_from_resource_credentials(
+         interactive=interactive,
+         resource_credentials=resource_credentials,
+     )
+
+
+ def delete_hafnia_dataset_files_from_resource_credentials(
+     resource_credentials: ResourceCredentials,
+     interactive: bool = True,
+ ) -> bool:
+     envs = resource_credentials.aws_credentials()
+     bucket_name = resource_credentials.bucket_name()
+     if interactive:
+         confirmation = (
+             input(
+                 f"WARNING THIS WILL delete all files stored in 's3://{bucket_name}'.\n"
+                 "Meaning that all previous versions of the dataset will be deleted. \n"
+                 "Normally this is not needed, but if you have changed the dataset structure or want to start from fresh, "
+                 "you can delete all files in the S3 bucket. "
+                 "\nDo you really want to delete all files? (yes/NO): "
+             )
+             .strip()
+             .lower()
+         )
+         if confirmation != "yes":
+             user_logger.info("Delete operation cancelled by the user.")
+             return False
+     user_logger.info(f"Deleting all files in S3 bucket '{bucket_name}'...")
+     s5cmd_utils.delete_bucket_content(bucket_prefix=f"s3://{bucket_name}", remove_bucket=True, append_envs=envs)
+     return True
+
+
+ def sync_hafnia_dataset_to_s3(
+     dataset: HafniaDataset,
+     bucket_prefix: str,
+     allow_version_overwrite: bool = False,
+     interactive: bool = True,
+     envs: Optional[Dict[str, str]] = None,
+ ) -> None:
+     t0 = time.time()
+     # bucket_prefix e.g. 's3://bucket-name/sample'
+     remote_paths = []
+     for file_str in progress_bar(dataset.samples[SampleField.FILE_PATH], description="Hashing data files"):
+         path_file = Path(file_str)
+         file_hash = hash_file_xxhash(path_file)
+
+         # Relative path in S3 bucket e.g. 'data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+         relative_path = s3_prefix_from_hash(hash=file_hash, suffix=path_file.suffix)
+
+         # Remote path in S3 bucket e.g. 's3://bucket-name/sample/data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+         remote_path = f"{bucket_prefix}/{relative_path}"
+         remote_paths.append(remote_path)
+
+     dataset.samples = dataset.samples.with_columns(pl.Series(remote_paths).alias(SampleField.REMOTE_PATH))
+
+     user_logger.info(f"Syncing dataset to S3 bucket '{bucket_prefix}'")
+     files_in_s3 = set(s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix, append_envs=envs))
+
+     # Discover data files (images, videos, etc.) missing in s3
+     data_files_missing = dataset.samples.filter(~pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+     files_already_in_s3 = dataset.samples.filter(pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+
+     with tempfile.TemporaryDirectory() as temp_dir:  # Temp folder to store metadata files
+         path_temp = Path(temp_dir)
+         # File paths are dropped when uploading to S3
+         dataset = dataset.update_samples(dataset.samples.drop(SampleField.FILE_PATH))
+         dataset.write_annotations(path_temp)
+
+         # Discover versioned metadata files (e.g. "annotations.jsonl", "dataset_info.json") missing in s3
+         metadata_files_local = []
+         metadata_files_s3 = []
+         for filename in path_temp.iterdir():
+             metadata_files_s3.append(f"{bucket_prefix}/versions/{dataset.info.version}/{filename.name}")
+             metadata_files_local.append(filename.as_posix())
+
+         overwrite_metadata_files = files_in_s3.intersection(set(metadata_files_s3))
+         will_overwrite_metadata_files = len(overwrite_metadata_files) > 0
+
+         n_files_already_in_s3 = len(files_already_in_s3)
+         user_logger.info(f"Sync dataset to {bucket_prefix}")
+         user_logger.info(
+             f"- Found that {n_files_already_in_s3} / {len(dataset.samples)} data files already exist. "
+             f"Meaning {len(data_files_missing)} data files will be uploaded. \n"
+             f"- Will upload {len(metadata_files_local)} metadata files. \n"
+             f"- Total files to upload: {len(data_files_missing) + len(metadata_files_local)}"
+         )
+         if will_overwrite_metadata_files:
+             msg = f"Metadata files for dataset version '{dataset.info.version}' already exist"
+             if allow_version_overwrite:
+                 user_logger.warning(
+                     f"- WARNING: {msg}. Version will be overwritten as 'allow_version_overwrite=True' is set."
+                 )
+             else:
+                 raise ValueError(
+                     f"Upload cancelled. {msg}. \nTo overwrite existing metadata files, "
+                     "you will need to set 'allow_version_overwrite=True' explicitly."
+                 )
+
+         has_missing_files = len(data_files_missing) > 0
+         if interactive and (has_missing_files or will_overwrite_metadata_files):
+             print("Please type 'yes' to upload files.")
+             confirmation = input("Do you want to continue? (yes/NO): ").strip().lower()
+
+             if confirmation != "yes":
+                 raise RuntimeError("Upload cancelled by user.")
+
+         local_paths = metadata_files_local + data_files_missing[SampleField.FILE_PATH].to_list()
+         s3_paths = metadata_files_s3 + data_files_missing[SampleField.REMOTE_PATH].to_list()
+         s5cmd_utils.fast_copy_files(local_paths, s3_paths, append_envs=envs, description="Uploading files")
+     user_logger.info(f"- Synced dataset in {time.time() - t0:.2f} seconds.")
+
+
+ def sync_dataset_files_to_platform(
+     dataset: HafniaDataset,
+     sample_dataset: Optional[HafniaDataset] = None,
+     interactive: bool = True,
+     allow_version_overwrite: bool = False,
+     cfg: Optional[Config] = None,
+ ) -> None:
+     cfg = cfg or Config()
+     resource_credentials = get_upload_credentials(dataset.info.dataset_name, cfg=cfg)
+
+     if resource_credentials is None:
+         raise RuntimeError("Failed to get upload credentials from the platform.")
+
+     sync_dataset_files_to_platform_from_resource_credentials(
+         dataset=dataset,
+         sample_dataset=sample_dataset,
+         interactive=interactive,
+         allow_version_overwrite=allow_version_overwrite,
+         resource_credentials=resource_credentials,
+     )
+
+
+ def sync_dataset_files_to_platform_from_resource_credentials(
+     dataset: HafniaDataset,
+     sample_dataset: Optional[HafniaDataset],
+     interactive: bool,
+     allow_version_overwrite: bool,
+     resource_credentials: ResourceCredentials,
+ ):
+     envs = resource_credentials.aws_credentials()
+     bucket_name = resource_credentials.bucket_name()
+
+     for dataset_variant_type in [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]:
+         if dataset_variant_type == DatasetVariant.SAMPLE:
+             if sample_dataset is None:
+                 dataset_variant = dataset.create_sample_dataset()
+             else:
+                 dataset_variant = sample_dataset
+         else:
+             dataset_variant = dataset
+
+         sync_hafnia_dataset_to_s3(
+             dataset=dataset_variant,
+             bucket_prefix=f"s3://{bucket_name}/{dataset_variant_type.value}",
+             interactive=interactive,
+             allow_version_overwrite=allow_version_overwrite,
+             envs=envs,
+         )
+
+
+ def s3_prefix_from_hash(hash: str, suffix: str) -> str:
+     """
+     Generate a relative S3 path from a hash value for objects stored in S3.
+
+     This function deliberately uses a hierarchical directory layout based on the
+     hash prefix to avoid putting too many objects in a single S3 prefix, which
+     can run into AWS S3 rate limits and performance issues. For example, for
+     hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4" and suffix ".png", the returned
+     path will be:
+
+         "data/df/e8/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"
+
+     Note: This intentionally differs from when images are stored to disk locally, where
+     a flat path of the form ``data/<hash><suffix>`` is used.
+     """
+     s3_prefix = f"data/{hash[:2]}/{hash[2:4]}/{hash}{suffix}"
+     return s3_prefix
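
As an illustrative aside (not part of the diff), the hierarchical key layout described in the docstring of s3_prefix_from_hash can be reproduced with plain Python; the hash below is the example value from the docstring and the bucket prefix is a placeholder:

    # Sketch of the layout produced by s3_prefix_from_hash (placeholder values).
    file_hash = "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4"
    suffix = ".png"
    relative_path = f"data/{file_hash[:2]}/{file_hash[2:4]}/{file_hash}{suffix}"
    assert relative_path == "data/df/e8/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"
    # sync_hafnia_dataset_to_s3 then prepends the variant bucket prefix:
    remote_path = f"s3://bucket-name/sample/{relative_path}"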
@@ -45,7 +45,8 @@ def create_primitive_table(
          remove_no_object_frames = remove_no_object_frames.drop(drop_columns_names)
          # Rename columns "height", "width" and "meta" for sample to avoid conflicts with object fields names
          remove_no_object_frames = remove_no_object_frames.rename(
-             {"height": "image.height", "width": "image.width", "meta": "image.meta"}
+             {"height": "image.height", "width": "image.width", "meta": "image.meta"},
+             strict=False,
          )
          objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
      else:
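
The new strict=False argument makes the rename tolerant of mapping keys that are not present as columns. A minimal sketch of the behavior, assuming a polars version whose DataFrame.rename accepts the strict keyword; the toy frame below is made up:

    import polars as pl

    df = pl.DataFrame({"height": [480], "width": [640]})  # no "meta" column here
    # With strict=False, the missing "meta" key is ignored instead of raising an error.
    renamed = df.rename({"height": "image.height", "width": "image.width", "meta": "image.meta"}, strict=False)
    print(renamed.columns)  # ['image.height', 'image.width']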
hafnia/http.py CHANGED
@@ -24,7 +24,8 @@ def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Union[
      try:
          response = http.request("GET", endpoint, fields=params, headers=headers)
          if response.status != 200:
-             raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}")
+             error_details = response.data.decode("utf-8")
+             raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")

          return json.loads(response.data.decode("utf-8"))
      finally:
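
The updated fetch now includes the decoded response body in the raised error, so platform error messages reach the caller. A hedged, self-contained sketch of the same pattern using urllib3 directly; the URL is a placeholder:

    import urllib3

    http = urllib3.PoolManager()
    response = http.request("GET", "https://example.com/api/does-not-exist")  # placeholder endpoint
    if response.status != 200:
        error_details = response.data.decode("utf-8")
        # The status code alone is far less useful than the body returned by the server.
        raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")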
hafnia/platform/datasets.py CHANGED
@@ -1,29 +1,85 @@
- import os
+ import collections
  import shutil
- import subprocess
- import sys
- import tempfile
- import uuid
  from pathlib import Path
  from typing import Any, Dict, List, Optional

  import rich
+ from packaging.version import Version
  from rich import print as rprint

  from hafnia import http, utils
- from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
+ from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ResourceCredentials
  from hafnia.dataset.dataset_recipe.dataset_recipe import (
      DatasetRecipe,
      get_dataset_path_from_recipe,
  )
  from hafnia.dataset.hafnia_dataset import HafniaDataset
- from hafnia.http import fetch
- from hafnia.log import sys_logger, user_logger
+ from hafnia.http import fetch, post
+ from hafnia.log import user_logger
+ from hafnia.platform import s5cmd_utils
  from hafnia.platform.download import get_resource_credentials
- from hafnia.utils import progress_bar, timed
+ from hafnia.utils import timed
  from hafnia_cli.config import Config


+ @timed("Fetching dataset by name.")
+ def get_dataset_by_name(dataset_name: str, cfg: Optional[Config] = None) -> Optional[Dict[str, Any]]:
+     """Get dataset details by name from the Hafnia platform."""
+     cfg = cfg or Config()
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     full_url = f"{endpoint_dataset}?name__iexact={dataset_name}"
+     datasets: List[Dict[str, Any]] = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+     if len(datasets) == 0:
+         return None
+
+     if len(datasets) > 1:
+         raise ValueError(f"Multiple datasets found with the name '{dataset_name}'.")
+
+     return datasets[0]
+
+
+ @timed("Fetching dataset by ID.")
+ def get_dataset_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Optional[Dict[str, Any]]:
+     """Get dataset details by ID from the Hafnia platform."""
+     cfg = cfg or Config()
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     full_url = f"{endpoint_dataset}/{dataset_id}"
+     dataset: Dict[str, Any] = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+     if not dataset:
+         return None
+
+     return dataset
+
+
+ def get_or_create_dataset(dataset_name: str = "", cfg: Optional[Config] = None) -> Dict[str, Any]:
+     """Create a new dataset on the Hafnia platform."""
+     cfg = cfg or Config()
+     dataset = get_dataset_by_name(dataset_name, cfg)
+
+     if dataset is not None:
+         user_logger.info(f"Dataset '{dataset_name}' already exists on the Hafnia platform.")
+         return dataset
+
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     dataset_title = dataset_name.replace("-", " ").title()  # convert dataset-name to title "Dataset Name"
+     payload = {
+         "title": dataset_title,
+         "name": dataset_name,
+         "overview": "No description provided.",
+     }
+
+     dataset = http.post(endpoint_dataset, headers=header, data=payload)  # type: ignore[assignment]
+
+     # TODO: Handle issue when dataset creation fails because name is taken by another user from a different organization
+     if not dataset:
+         raise ValueError("Failed to create dataset on the Hafnia platform. ")
+
+     return dataset
+
+
  @timed("Fetching dataset list.")
  def get_datasets(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
      """List available datasets on the Hafnia platform."""
@@ -50,6 +106,80 @@ def get_dataset_id(dataset_name: str, endpoint: str, api_key: str) -> str:
          raise ValueError("Dataset information is missing or invalid") from e


+ @timed("Get upload access credentials")
+ def get_upload_credentials(dataset_name: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+     """Get dataset details by name from the Hafnia platform."""
+     cfg = cfg or Config()
+     dataset_response = get_dataset_by_name(dataset_name=dataset_name, cfg=cfg)
+     if dataset_response is None:
+         return None
+
+     return get_upload_credentials_by_id(dataset_response["id"], cfg=cfg)
+
+
+ @timed("Get upload access credentials by ID")
+ def get_upload_credentials_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+     """Get dataset details by ID from the Hafnia platform."""
+     cfg = cfg or Config()
+
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     full_url = f"{endpoint_dataset}/{dataset_id}/temporary-credentials-upload"
+     credentials_response: Dict = http.fetch(full_url, headers=header)  # type: ignore[assignment]
+
+     return ResourceCredentials.fix_naming(credentials_response)
+
+
+ @timed("Delete dataset by id")
+ def delete_dataset_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Dict:
+     cfg = cfg or Config()
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     full_url = f"{endpoint_dataset}/{dataset_id}"
+     return http.delete(full_url, headers=header)  # type: ignore
+
+
+ @timed("Delete dataset by name")
+ def delete_dataset_by_name(dataset_name: str, cfg: Optional[Config] = None) -> Dict:
+     cfg = cfg or Config()
+     dataset_response = get_dataset_by_name(dataset_name=dataset_name, cfg=cfg)
+     if dataset_response is None:
+         raise ValueError(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
+
+     dataset_id = dataset_response["id"]  # type: ignore[union-attr]
+     response = delete_dataset_by_id(dataset_id=dataset_id, cfg=cfg)
+     user_logger.info(f"Dataset '{dataset_name}' has been deleted from the Hafnia platform.")
+     return response
+
+
+ def delete_dataset_completely_by_name(dataset_name: str, interactive: bool = True) -> None:
+     from hafnia.dataset.operations.dataset_s3_storage import delete_hafnia_dataset_files_on_platform
+
+     cfg = Config()
+
+     is_deleted = delete_hafnia_dataset_files_on_platform(
+         dataset_name=dataset_name,
+         interactive=interactive,
+         cfg=cfg,
+     )
+     if not is_deleted:
+         return
+     delete_dataset_by_name(dataset_name, cfg=cfg)
+
+
+ @timed("Import dataset details to platform")
+ def upload_dataset_details(cfg: Config, data: dict, dataset_name: str) -> dict:
+     dataset_endpoint = cfg.get_platform_endpoint("datasets")
+     dataset_id = get_dataset_id(dataset_name, dataset_endpoint, cfg.api_key)
+
+     import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
+     headers = {"Authorization": cfg.api_key}
+
+     user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
+     response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+     return response  # type: ignore[return-value]
+
+
  def download_or_get_dataset_path(
      dataset_name: str,
      cfg: Optional[Config] = None,
@@ -72,9 +202,11 @@ def download_or_get_dataset_path(
      shutil.rmtree(path_dataset, ignore_errors=True)

      endpoint_dataset = cfg.get_platform_endpoint("datasets")
-     dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
-     if dataset_id is None:
-         sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
+     dataset_res = get_dataset_by_name(dataset_name, cfg)  # Check if dataset exists
+     if dataset_res is None:
+         raise ValueError(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
+
+     dataset_id = dataset_res.get("id")  # type: ignore[union-attr]

      if utils.is_hafnia_cloud_job():
          credentials_endpoint_suffix = "temporary-credentials-hidden"  # Access to hidden datasets
@@ -95,22 +227,17 @@ def download_dataset_from_access_endpoint(
      endpoint: str,
      api_key: str,
      path_dataset: Path,
+     version: Optional[str] = None,
      download_files: bool = True,
  ) -> None:
-     resource_credentials = get_resource_credentials(endpoint, api_key)
-
-     local_dataset_paths = [(path_dataset / filename).as_posix() for filename in DATASET_FILENAMES_REQUIRED]
-     s3_uri = resource_credentials.s3_uri()
-     s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]
-
-     envs = resource_credentials.aws_credentials()
      try:
-         fast_copy_files_s3(
-             src_paths=s3_dataset_files,
-             dst_paths=local_dataset_paths,
-             append_envs=envs,
-             description="Downloading annotations",
+         resource_credentials = get_resource_credentials(endpoint, api_key)
+         download_annotation_dataset_from_version(
+             version=version,
+             credentials=resource_credentials,
+             path_dataset=path_dataset,
          )
+
      except ValueError as e:
          user_logger.error(f"Failed to download annotations: {e}")
          return
@@ -126,87 +253,6 @@ def download_dataset_from_access_endpoint(
      dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded


- def fast_copy_files_s3(
-     src_paths: List[str],
-     dst_paths: List[str],
-     append_envs: Optional[Dict[str, str]] = None,
-     description: str = "Copying files",
- ) -> List[str]:
-     if len(src_paths) != len(dst_paths):
-         raise ValueError("Source and destination paths must have the same length.")
-     cmds = [f"cp {src} {dst}" for src, dst in zip(src_paths, dst_paths)]
-     lines = execute_s5cmd_commands(cmds, append_envs=append_envs, description=description)
-     return lines
-
-
- def find_s5cmd() -> Optional[str]:
-     """Locate the s5cmd executable across different installation methods.
-
-     Searches for s5cmd in:
-     1. System PATH (via shutil.which)
-     2. Python bin directory (Unix-like systems)
-     3. Python executable directory (direct installs)
-
-     Returns:
-         str: Absolute path to s5cmd executable if found, None otherwise.
-     """
-     result = shutil.which("s5cmd")
-     if result:
-         return result
-     python_dir = Path(sys.executable).parent
-     locations = (python_dir / "Scripts" / "s5cmd.exe", python_dir / "bin" / "s5cmd", python_dir / "s5cmd")
-     for loc in locations:
-         if loc.exists():
-             return str(loc)
-     return None
-
-
- def execute_s5cmd_commands(
-     commands: List[str],
-     append_envs: Optional[Dict[str, str]] = None,
-     description: str = "Executing s5cmd commands",
- ) -> List[str]:
-     append_envs = append_envs or {}
-     # In Windows default "Temp" directory can not be deleted that is why we need to create a
-     # temporary directory.
-     with tempfile.TemporaryDirectory() as temp_dir:
-         tmp_file_path = Path(temp_dir, f"{uuid.uuid4().hex}.txt")
-         tmp_file_path.write_text("\n".join(commands))
-
-         s5cmd_bin = find_s5cmd()
-         if s5cmd_bin is None:
-             raise ValueError("Can not find s5cmd executable.")
-         run_cmds = [s5cmd_bin, "run", str(tmp_file_path)]
-         sys_logger.debug(run_cmds)
-         envs = os.environ.copy()
-         envs.update(append_envs)
-
-         process = subprocess.Popen(
-             run_cmds,
-             stdout=subprocess.PIPE,
-             stderr=subprocess.STDOUT,
-             universal_newlines=True,
-             env=envs,
-         )
-
-         error_lines = []
-         lines = []
-         for line in progress_bar(process.stdout, total=len(commands), description=description):  # type: ignore[arg-type]
-             if "ERROR" in line or "error" in line:
-                 error_lines.append(line.strip())
-             lines.append(line.strip())
-
-         if len(error_lines) > 0:
-             show_n_lines = min(5, len(error_lines))
-             str_error_lines = "\n".join(error_lines[:show_n_lines])
-             user_logger.error(
-                 f"Detected {len(error_lines)} errors occurred while executing a total of {len(commands)} "
-                 f" commands with s5cmd. The first {show_n_lines} is printed below:\n{str_error_lines}"
-             )
-             raise RuntimeError("Errors occurred during s5cmd execution.")
-     return lines
-
-
  TABLE_FIELDS = {
      "ID": "id",
      "Hidden\nSamples": "hidden.samples",
@@ -241,3 +287,48 @@ def extend_dataset_details(datasets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
          dataset[f"{variant_type}.samples"] = variant["number_of_data_items"]
          dataset[f"{variant_type}.size"] = utils.size_human_readable(variant["size_bytes"])
      return datasets
+
+
+ def download_annotation_dataset_from_version(
+     version: Optional[str],
+     credentials: ResourceCredentials,
+     path_dataset: Path,
+ ) -> list[str]:
+     path_dataset.mkdir(parents=True, exist_ok=True)
+
+     envs = credentials.aws_credentials()
+     bucket_prefix_sample_versions = f"{credentials.s3_uri()}/versions"
+     all_s3_annotation_files = s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix_sample_versions, append_envs=envs)
+     s3_files = _annotation_files_from_version(version=version, all_annotation_files=all_s3_annotation_files)
+
+     local_paths = [(path_dataset / filename.split("/")[-1]).as_posix() for filename in s3_files]
+     s5cmd_utils.fast_copy_files(
+         src_paths=s3_files,
+         dst_paths=local_paths,
+         append_envs=envs,
+         description="Downloading annotation files",
+     )
+     return local_paths
+
+
+ def _annotation_files_from_version(version: Optional[str], all_annotation_files: list[str]) -> list[str]:
+     version_files = collections.defaultdict(list)
+     for metadata_file in all_annotation_files:
+         version_str, filename = metadata_file.split("/")[-2:]
+         if filename not in DATASET_FILENAMES_REQUIRED:
+             continue
+         version_files[version_str].append(metadata_file)
+     available_versions = {v for v, files in version_files.items() if len(files) == len(DATASET_FILENAMES_REQUIRED)}
+
+     if len(available_versions) == 0:
+         raise ValueError("No versions were found in the dataset.")
+
+     if version is None:
+         latest_version = max(Version(ver) for ver in available_versions)
+         version = str(latest_version)
+         user_logger.info(f"No version selected. Using latest version: {version}")
+
+     if version not in available_versions:
+         raise ValueError(f"Selected version '{version}' not found in available versions: {available_versions}")
+
+     return version_files[version]
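
The version selection in _annotation_files_from_version compares versions with packaging.Version rather than plain strings, which matters once a version component reaches two digits. A short sketch with made-up version strings:

    from packaging.version import Version

    available_versions = {"0.2.1", "0.9.0", "0.10.0"}
    # Plain string max() would pick "0.9.0"; Version-aware ordering picks "0.10.0".
    latest = str(max(Version(v) for v in available_versions))
    assert latest == "0.10.0"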