hafnia 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
  import collections
  import json
+ from dataclasses import dataclass
  from datetime import datetime
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Type, Union
@@ -7,12 +8,21 @@ from typing import Any, Dict, List, Optional, Type, Union
  import cv2
  import more_itertools
  import numpy as np
+ import polars as pl
  from packaging.version import Version
  from PIL import Image
  from pydantic import BaseModel, Field, field_serializer, field_validator

  import hafnia
- from hafnia.dataset.dataset_names import SampleField, StorageFormat
+ from hafnia.dataset import dataset_helpers
+ from hafnia.dataset.dataset_helpers import version_from_string
+ from hafnia.dataset.dataset_names import (
+     FILENAME_ANNOTATIONS_JSONL,
+     FILENAME_ANNOTATIONS_PARQUET,
+     FILENAME_DATASET_INFO,
+     SampleField,
+     StorageFormat,
+ )
  from hafnia.dataset.primitives import (
      PRIMITIVE_TYPES,
      Bbox,
@@ -102,7 +112,7 @@ class TaskInfo(BaseModel):

  class DatasetInfo(BaseModel):
      dataset_name: str = Field(description="Name of the dataset, e.g. 'coco'")
-     version: Optional[str] = Field(default=None, description="Version of the dataset")
+     version: str = Field(default="0.0.0", description="Version of the dataset")
      dataset_title: Optional[str] = Field(default=None, description="Optional, human-readable title of the dataset")
      description: Optional[str] = Field(default=None, description="Optional, description of the dataset")
      tasks: List[TaskInfo] = Field(default=None, description="List of tasks in the dataset")
@@ -144,31 +154,21 @@ class DatasetInfo(BaseModel):
      @field_validator("format_version")
      @classmethod
      def _validate_format_version(cls, format_version: str) -> str:
-         try:
-             Version(format_version)
-         except Exception as e:
-             raise ValueError(f"Invalid format_version '{format_version}'. Must be a valid version string.") from e
+         version_casted: Version = dataset_helpers.version_from_string(format_version, raise_error=True)

-         if Version(format_version) > Version(hafnia.__dataset_format_version__):
+         if version_casted > Version(hafnia.__dataset_format_version__):
              user_logger.warning(
                  f"The loaded dataset format version '{format_version}' is newer than the format version "
                  f"'{hafnia.__dataset_format_version__}' used in your version of Hafnia. Please consider "
                  f"updating Hafnia package."
              )
-         return format_version
+         return str(version_casted)

      @field_validator("version")
      @classmethod
      def _validate_version(cls, dataset_version: Optional[str]) -> Optional[str]:
-         if dataset_version is None:
-             return None
-
-         try:
-             Version(dataset_version)
-         except Exception as e:
-             raise ValueError(f"Invalid dataset_version '{dataset_version}'. Must be a valid version string.") from e
-
-         return dataset_version
+         version_casted: Version = dataset_helpers.version_from_string(dataset_version, raise_error=True)
+         return str(version_casted)

      def check_for_duplicate_task_names(self) -> List[TaskInfo]:
          return self._validate_check_for_duplicate_tasks(self.tasks)
@@ -238,7 +238,7 @@ class DatasetInfo(BaseModel):
          meta.update(info1.meta or {})
          return DatasetInfo(
              dataset_name=info0.dataset_name + "+" + info1.dataset_name,
-             version=None,
+             version="0.0.0",
              tasks=list(unique_tasks),
              meta=meta,
              format_version=dataset_format_version,
@@ -477,3 +477,127 @@ class Sample(BaseModel):
          annotations = self.get_annotations()
          annotations_visualized = image_visualizations.draw_annotations(image=image, primitives=annotations)
          return annotations_visualized
+
+
+ @dataclass
+ class DatasetMetadataFilePaths:
+     dataset_info: str  # Use 'str' to also support s3 paths
+     annotations_jsonl: Optional[str]
+     annotations_parquet: Optional[str]
+
+     def as_list(self) -> List[str]:
+         files = [self.dataset_info]
+         if self.annotations_jsonl is not None:
+             files.append(self.annotations_jsonl)
+         if self.annotations_parquet is not None:
+             files.append(self.annotations_parquet)
+         return files
+
+     def read_samples(self) -> pl.DataFrame:
+         if self.annotations_parquet is not None:
+             if not Path(self.annotations_parquet).exists():
+                 raise FileNotFoundError(f"Parquet annotations file '{self.annotations_parquet}' does not exist.")
+             user_logger.info(f"Reading dataset annotations from Parquet file: {self.annotations_parquet}")
+             return pl.read_parquet(self.annotations_parquet)
+
+         if self.annotations_jsonl is not None:
+             if not Path(self.annotations_jsonl).exists():
+                 raise FileNotFoundError(f"JSONL annotations file '{self.annotations_jsonl}' does not exist.")
+             user_logger.info(f"Reading dataset annotations from JSONL file: {self.annotations_jsonl}")
+             return pl.read_ndjson(self.annotations_jsonl)
+
+         raise ValueError(
+             "No annotations file available to read samples from. Dataset is missing both JSONL and Parquet files."
+         )
+
+     @staticmethod
+     def from_path(path_dataset: Path) -> "DatasetMetadataFilePaths":
+         path_dataset = path_dataset.absolute()
+         metadata_files = DatasetMetadataFilePaths(
+             dataset_info=str(path_dataset / FILENAME_DATASET_INFO),
+             annotations_jsonl=str(path_dataset / FILENAME_ANNOTATIONS_JSONL),
+             annotations_parquet=str(path_dataset / FILENAME_ANNOTATIONS_PARQUET),
+         )
+
+         return metadata_files
+
+     @staticmethod
+     def available_versions_from_files_list(files: list[str]) -> Dict[Version, "DatasetMetadataFilePaths"]:
+         versions_and_files: Dict[Version, Dict[str, str]] = collections.defaultdict(dict)
+         for metadata_file in files:
+             version_str, filename = metadata_file.split("/")[-2:]
+             versions_and_files[version_str][filename] = metadata_file
+
+         available_versions: Dict[Version, DatasetMetadataFilePaths] = {}
+         for version_str, version_files in versions_and_files.items():
+             version_casted: Version = dataset_helpers.version_from_string(version_str, raise_error=False)
+             if version_casted is None:
+                 continue
+
+             if FILENAME_DATASET_INFO not in version_files:
+                 continue
+             dataset_metadata_file = DatasetMetadataFilePaths(
+                 dataset_info=version_files[FILENAME_DATASET_INFO],
+                 annotations_jsonl=version_files.get(FILENAME_ANNOTATIONS_JSONL, None),
+                 annotations_parquet=version_files.get(FILENAME_ANNOTATIONS_PARQUET, None),
+             )
+
+             available_versions[version_casted] = dataset_metadata_file
+
+         return available_versions
+
+     def check_version(self, version: str, raise_error: bool = True) -> bool:
+         """
+         Check if the dataset metadata files match the given version.
+         If raise_error is True, raises ValueError if the version does not match.
+         """
+         valid_version = version_from_string(version, raise_error=raise_error)
+         if valid_version is None:
+             return False
+
+         path_dataset_info = Path(self.dataset_info)
+         if not path_dataset_info.exists():
+             raise FileNotFoundError(f"Dataset info file missing '{self.dataset_info}' in dataset folder.")
+
+         dataset_info = json.loads(path_dataset_info.read_text())
+         dataset_version = dataset_info.get("version", None)
+         if dataset_version != version:
+             if raise_error:
+                 raise ValueError(
+                     f"Dataset version mismatch. Expected version '{version}' but found "
+                     f"version '{dataset_version}' in dataset info."
+                 )
+             return False
+
+         return True
+
+     def exists(self, version: Optional[str] = None, raise_error: bool = True) -> bool:
+         """
+         Check if all metadata files exist.
+         Add version to check if it matches the version in dataset info.
+         If raise_error is True, raises FileNotFoundError if any file is missing.
+         """
+         path_dataset_info = Path(self.dataset_info)
+         if not path_dataset_info.exists():
+             if raise_error:
+                 raise FileNotFoundError(f"Dataset info file missing '{self.dataset_info}' in dataset folder.")
+             return False
+
+         if version is not None and self.check_version(version, raise_error=raise_error) is False:
+             return False
+
+         has_jsonl_file = self.annotations_jsonl is not None and Path(self.annotations_jsonl).exists()
+         if has_jsonl_file:
+             return True
+
+         has_parquet_file = self.annotations_parquet is not None and Path(self.annotations_parquet).exists()
+         if has_parquet_file:
+             return True
+
+         if raise_error:
+             raise FileNotFoundError(
+                 f"Missing annotation file. Expected either '{FILENAME_ANNOTATIONS_JSONL}' or "
+                 f"'{FILENAME_ANNOTATIONS_PARQUET}' in dataset folder."
+             )
+
+         return False
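Note: the new DatasetMetadataFilePaths dataclass takes over the role of the read_samples_from_path helper that is removed further down in this diff. A minimal usage sketch, assuming the class lives in hafnia.dataset.hafnia_dataset (the diff does not show file names) and using a hypothetical local dataset folder:

    from pathlib import Path

    from hafnia.dataset.hafnia_dataset import DatasetMetadataFilePaths  # module path assumed

    path_dataset = Path("~/.hafnia/datasets/my-dataset").expanduser()  # hypothetical folder
    metadata = DatasetMetadataFilePaths.from_path(path_dataset)
    print(metadata.as_list())  # absolute paths built from the FILENAME_* constants

    if metadata.exists(raise_error=False):
        samples = metadata.read_samples()  # polars DataFrame; prefers Parquet when a Parquet path is set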
@@ -8,13 +8,13 @@ import polars as pl
  from hafnia.dataset.dataset_helpers import hash_file_xxhash
  from hafnia.dataset.dataset_names import (
      DatasetVariant,
-     ResourceCredentials,
      SampleField,
  )
  from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.log import user_logger
  from hafnia.platform import s5cmd_utils
  from hafnia.platform.datasets import get_upload_credentials
+ from hafnia.platform.s5cmd_utils import ResourceCredentials
  from hafnia.utils import progress_bar
  from hafnia_cli.config import Config

@@ -39,6 +39,7 @@ def delete_hafnia_dataset_files_on_platform(
  def delete_hafnia_dataset_files_from_resource_credentials(
      resource_credentials: ResourceCredentials,
      interactive: bool = True,
+     remove_bucket: bool = True,
  ) -> bool:
      envs = resource_credentials.aws_credentials()
      bucket_name = resource_credentials.bucket_name()
@@ -58,7 +59,11 @@ def delete_hafnia_dataset_files_from_resource_credentials(
          user_logger.info("Delete operation cancelled by the user.")
          return False
      user_logger.info(f"Deleting all files in S3 bucket '{bucket_name}'...")
-     s5cmd_utils.delete_bucket_content(bucket_prefix=f"s3://{bucket_name}", remove_bucket=True, append_envs=envs)
+     s5cmd_utils.delete_bucket_content(
+         bucket_prefix=f"s3://{bucket_name}",
+         remove_bucket=remove_bucket,
+         append_envs=envs,
+     )
      return True

@@ -4,8 +4,6 @@ from typing import List, Optional, Tuple, Type
  import polars as pl

  from hafnia.dataset.dataset_names import (
-     FILENAME_ANNOTATIONS_JSONL,
-     FILENAME_ANNOTATIONS_PARQUET,
      PrimitiveField,
      SampleField,
  )
@@ -204,22 +202,6 @@ def split_primitive_columns_by_task_name(
      return samples_table


- def read_samples_from_path(path: Path) -> pl.DataFrame:
-     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
-     if path_annotations.exists():
-         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
-         return pl.read_parquet(path_annotations)
-
-     path_annotations_jsonl = path / FILENAME_ANNOTATIONS_JSONL
-     if path_annotations_jsonl.exists():
-         user_logger.info(f"Reading dataset annotations from JSONL file: {path_annotations_jsonl}")
-         return pl.read_ndjson(path_annotations_jsonl)
-
-     raise FileNotFoundError(
-         f"Unable to read annotations. No json file '{path_annotations.name}' or Parquet file '{{path_annotations.name}} in in '{path}'."
-     )
-
-
  def check_image_paths(table: pl.DataFrame) -> bool:
      missing_files = []
      org_paths = table[SampleField.FILE_PATH].to_list()
@@ -1,23 +1,13 @@
- import collections
- import shutil
- from pathlib import Path
  from typing import Any, Dict, List, Optional

  import rich
- from packaging.version import Version
  from rich import print as rprint

  from hafnia import http, utils
- from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ResourceCredentials
- from hafnia.dataset.dataset_recipe.dataset_recipe import (
-     DatasetRecipe,
-     get_dataset_path_from_recipe,
- )
- from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.http import fetch, post
  from hafnia.log import user_logger
- from hafnia.platform import s5cmd_utils
  from hafnia.platform.download import get_resource_credentials
+ from hafnia.platform.s5cmd_utils import ResourceCredentials
  from hafnia.utils import timed
  from hafnia_cli.config import Config

@@ -57,7 +47,6 @@ def get_or_create_dataset(dataset_name: str = "", cfg: Optional[Config] = None)
      """Create a new dataset on the Hafnia platform."""
      cfg = cfg or Config()
      dataset = get_dataset_by_name(dataset_name, cfg)
-
      if dataset is not None:
          user_logger.info(f"Dataset '{dataset_name}' already exists on the Hafnia platform.")
          return dataset
@@ -130,6 +119,31 @@ def get_upload_credentials_by_id(dataset_id: str, cfg: Optional[Config] = None)
      return ResourceCredentials.fix_naming(credentials_response)


+ @timed("Get read access credentials by ID")
+ def get_read_credentials_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+     """Get dataset read access credentials by ID from the Hafnia platform."""
+     cfg = cfg or Config()
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     if utils.is_hafnia_cloud_job():
+         credentials_endpoint_suffix = "temporary-credentials-hidden"  # Access to hidden datasets
+     else:
+         credentials_endpoint_suffix = "temporary-credentials"  # Access to sample dataset
+     access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/{credentials_endpoint_suffix}"
+     resource_credentials = get_resource_credentials(access_dataset_endpoint, cfg.api_key)
+     return resource_credentials
+
+
+ @timed("Get read access credentials by name")
+ def get_read_credentials_by_name(dataset_name: str, cfg: Optional[Config] = None) -> Optional[ResourceCredentials]:
+     """Get dataset read access credentials by name from the Hafnia platform."""
+     cfg = cfg or Config()
+     dataset_response = get_dataset_by_name(dataset_name=dataset_name, cfg=cfg)
+     if dataset_response is None:
+         return None
+
+     return get_read_credentials_by_id(dataset_response["id"], cfg=cfg)
+
+
  @timed("Delete dataset by id")
  def delete_dataset_by_id(dataset_id: str, cfg: Optional[Config] = None) -> Dict:
      cfg = cfg or Config()
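Note: the two new read-credential helpers mirror the existing upload-credential path. A minimal usage sketch, assuming this is hafnia.platform.datasets (file names are not shown in this diff), a configured API key in Config, and a hypothetical dataset name:

    from hafnia.platform.datasets import get_read_credentials_by_name  # module path assumed

    creds = get_read_credentials_by_name("my-dataset")  # hypothetical dataset name
    if creds is not None:
        print(creds.s3_uri(), creds.bucket_name())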
@@ -152,10 +166,14 @@ def delete_dataset_by_name(dataset_name: str, cfg: Optional[Config] = None) -> D
      return response


- def delete_dataset_completely_by_name(dataset_name: str, interactive: bool = True) -> None:
+ def delete_dataset_completely_by_name(
+     dataset_name: str,
+     interactive: bool = True,
+     cfg: Optional[Config] = None,
+ ) -> None:
      from hafnia.dataset.operations.dataset_s3_storage import delete_hafnia_dataset_files_on_platform

-     cfg = Config()
+     cfg = cfg or Config()

      is_deleted = delete_hafnia_dataset_files_on_platform(
          dataset_name=dataset_name,
@@ -180,79 +198,6 @@ def upload_dataset_details(cfg: Config, data: dict, dataset_name: str) -> dict:
      return response  # type: ignore[return-value]


- def download_or_get_dataset_path(
-     dataset_name: str,
-     cfg: Optional[Config] = None,
-     path_datasets_folder: Optional[str] = None,
-     force_redownload: bool = False,
-     download_files: bool = True,
- ) -> Path:
-     """Download or get the path of the dataset."""
-     recipe_explicit = DatasetRecipe.from_implicit_form(dataset_name)
-     path_dataset = get_dataset_path_from_recipe(recipe_explicit, path_datasets=path_datasets_folder)
-
-     is_dataset_valid = HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
-     if is_dataset_valid and not force_redownload:
-         user_logger.info("Dataset found locally. Set 'force=True' or add `--force` flag with cli to re-download")
-         return path_dataset
-
-     cfg = cfg or Config()
-     api_key = cfg.api_key
-
-     shutil.rmtree(path_dataset, ignore_errors=True)
-
-     endpoint_dataset = cfg.get_platform_endpoint("datasets")
-     dataset_res = get_dataset_by_name(dataset_name, cfg)  # Check if dataset exists
-     if dataset_res is None:
-         raise ValueError(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
-
-     dataset_id = dataset_res.get("id")  # type: ignore[union-attr]
-
-     if utils.is_hafnia_cloud_job():
-         credentials_endpoint_suffix = "temporary-credentials-hidden"  # Access to hidden datasets
-     else:
-         credentials_endpoint_suffix = "temporary-credentials"  # Access to sample dataset
-     access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/{credentials_endpoint_suffix}"
-
-     download_dataset_from_access_endpoint(
-         endpoint=access_dataset_endpoint,
-         api_key=api_key,
-         path_dataset=path_dataset,
-         download_files=download_files,
-     )
-     return path_dataset
-
-
- def download_dataset_from_access_endpoint(
-     endpoint: str,
-     api_key: str,
-     path_dataset: Path,
-     version: Optional[str] = None,
-     download_files: bool = True,
- ) -> None:
-     try:
-         resource_credentials = get_resource_credentials(endpoint, api_key)
-         download_annotation_dataset_from_version(
-             version=version,
-             credentials=resource_credentials,
-             path_dataset=path_dataset,
-         )
-
-     except ValueError as e:
-         user_logger.error(f"Failed to download annotations: {e}")
-         return
-
-     if not download_files:
-         return
-     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
-     try:
-         dataset = dataset.download_files_aws(path_dataset, aws_credentials=resource_credentials, force_redownload=True)
-     except ValueError as e:
-         user_logger.error(f"Failed to download images: {e}")
-         return
-     dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded
-
-
  TABLE_FIELDS = {
      "ID": "id",
      "Hidden\nSamples": "hidden.samples",
@@ -287,48 +232,3 @@ def extend_dataset_details(datasets: List[Dict[str, Any]]) -> List[Dict[str, Any
              dataset[f"{variant_type}.samples"] = variant["number_of_data_items"]
              dataset[f"{variant_type}.size"] = utils.size_human_readable(variant["size_bytes"])
      return datasets
-
-
- def download_annotation_dataset_from_version(
-     version: Optional[str],
-     credentials: ResourceCredentials,
-     path_dataset: Path,
- ) -> list[str]:
-     path_dataset.mkdir(parents=True, exist_ok=True)
-
-     envs = credentials.aws_credentials()
-     bucket_prefix_sample_versions = f"{credentials.s3_uri()}/versions"
-     all_s3_annotation_files = s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix_sample_versions, append_envs=envs)
-     s3_files = _annotation_files_from_version(version=version, all_annotation_files=all_s3_annotation_files)
-
-     local_paths = [(path_dataset / filename.split("/")[-1]).as_posix() for filename in s3_files]
-     s5cmd_utils.fast_copy_files(
-         src_paths=s3_files,
-         dst_paths=local_paths,
-         append_envs=envs,
-         description="Downloading annotation files",
-     )
-     return local_paths
-
-
- def _annotation_files_from_version(version: Optional[str], all_annotation_files: list[str]) -> list[str]:
-     version_files = collections.defaultdict(list)
-     for metadata_file in all_annotation_files:
-         version_str, filename = metadata_file.split("/")[-2:]
-         if filename not in DATASET_FILENAMES_REQUIRED:
-             continue
-         version_files[version_str].append(metadata_file)
-     available_versions = {v for v, files in version_files.items() if len(files) == len(DATASET_FILENAMES_REQUIRED)}
-
-     if len(available_versions) == 0:
-         raise ValueError("No versions were found in the dataset.")
-
-     if version is None:
-         latest_version = max(Version(ver) for ver in available_versions)
-         version = str(latest_version)
-         user_logger.info(f"No version selected. Using latest version: {version}")
-
-     if version not in available_versions:
-         raise ValueError(f"Selected version '{version}' not found in available versions: {available_versions}")
-
-     return version_files[version]
@@ -5,9 +5,9 @@ import boto3
  from botocore.exceptions import ClientError
  from rich.progress import Progress

- from hafnia.dataset.dataset_names import ResourceCredentials
  from hafnia.http import fetch
  from hafnia.log import sys_logger, user_logger
+ from hafnia.platform.s5cmd_utils import ResourceCredentials


  def get_resource_credentials(endpoint: str, api_key: str) -> ResourceCredentials:
@@ -7,6 +7,10 @@ import uuid
  from pathlib import Path
  from typing import Dict, List, Optional

+ import boto3
+ from botocore.exceptions import UnauthorizedSSOTokenError
+ from pydantic import BaseModel, field_validator
+
  from hafnia.log import sys_logger, user_logger
  from hafnia.utils import progress_bar

@@ -26,7 +30,11 @@ def find_s5cmd() -> Optional[str]:
      if result:
          return result
      python_dir = Path(sys.executable).parent
-     locations = (python_dir / "Scripts" / "s5cmd.exe", python_dir / "bin" / "s5cmd", python_dir / "s5cmd")
+     locations = (
+         python_dir / "Scripts" / "s5cmd.exe",
+         python_dir / "bin" / "s5cmd",
+         python_dir / "s5cmd",
+     )
      for loc in locations:
          if loc.exists():
              return str(loc)
@@ -104,12 +112,17 @@ def delete_bucket_content(
      returns = execute_command(["rm", f"{bucket_prefix}/*"], append_envs=append_envs)

      if returns.returncode != 0:
-         bucket_is_already_deleted = "no object found" in returns.stderr.strip()
-         if bucket_is_already_deleted:
+         bucket_content_is_already_deleted = "no object found" in returns.stderr.strip()
+         bucket_is_already_deleted = "NoSuchBucket" in returns.stderr.strip()
+         if bucket_content_is_already_deleted:
              user_logger.info(f"No action was taken. S3 bucket '{bucket_prefix}' is already empty.")
+         elif bucket_is_already_deleted:
+             user_logger.info(f"No action was taken. S3 bucket '{bucket_prefix}' does not exist.")
+             return
          else:
              user_logger.error("Error during s5cmd rm command:")
              user_logger.error(returns.stdout)
+             user_logger.error(returns.stderr)
              raise RuntimeError(f"Failed to delete all files in S3 bucket '{bucket_prefix}'.")

      if remove_bucket:
@@ -118,6 +131,7 @@
          if returns.returncode != 0:
              user_logger.error("Error during s5cmd rb command:")
              user_logger.error(returns.stdout)
+             user_logger.error(returns.stderr)
              raise RuntimeError(f"Failed to delete S3 bucket '{bucket_prefix}'.")
          user_logger.info(f"S3 bucket '{bucket_prefix}' has been deleted.")

@@ -145,3 +159,108 @@ fast_copy_files(
      cmds = [f"cp {src} {dst}" for src, dst in zip(src_paths, dst_paths)]
      lines = execute_commands(cmds, append_envs=append_envs, description=description)
      return lines
+
+
+ ARN_PREFIX = "arn:aws:s3:::"
+
+
+ class AwsCredentials(BaseModel):
+     access_key: str
+     secret_key: str
+     session_token: str
+     region: Optional[str]
+
+     def aws_credentials(self) -> Dict[str, str]:
+         """
+         Returns the AWS credentials as a dictionary.
+         """
+         environment_vars = {
+             "AWS_ACCESS_KEY_ID": self.access_key,
+             "AWS_SECRET_ACCESS_KEY": self.secret_key,
+             "AWS_SESSION_TOKEN": self.session_token,
+         }
+         if self.region:
+             environment_vars["AWS_REGION"] = self.region
+
+         return environment_vars
+
+     @staticmethod
+     def from_session(session: boto3.Session) -> "AwsCredentials":
+         """
+         Creates AwsCredentials from a Boto3 session.
+         """
+         try:
+             frozen_credentials = session.get_credentials().get_frozen_credentials()
+         except UnauthorizedSSOTokenError as e:
+             raise RuntimeError(
+                 f"Failed to get AWS credentials from the session for profile '{session.profile_name}'.\n"
+                 f"Ensure the profile exists in your AWS config in '~/.aws/config' and that you are logged in via AWS SSO.\n"
+                 f"\tUse 'aws sso login --profile {session.profile_name}' to log in."
+             ) from e
+         return AwsCredentials(
+             access_key=frozen_credentials.access_key,
+             secret_key=frozen_credentials.secret_key,
+             session_token=frozen_credentials.token,
+             region=session.region_name,
+         )
+
+     def to_resource_credentials(self, bucket_name: str) -> "ResourceCredentials":
+         """
+         Converts AwsCredentials to ResourceCredentials by adding the S3 ARN.
+         """
+         payload = self.model_dump()
+         payload["s3_arn"] = f"{ARN_PREFIX}{bucket_name}"
+         return ResourceCredentials(**payload)
+
+
+ class ResourceCredentials(AwsCredentials):
+     s3_arn: str
+
+     @staticmethod
+     def fix_naming(payload: Dict[str, str]) -> "ResourceCredentials":
+         """
+         The endpoint returns a payload with a key called 's3_path', but it
+         is actually an ARN path (starts with arn:aws:s3::). This method renames it to 's3_arn' for consistency.
+         """
+         if "s3_path" in payload and payload["s3_path"].startswith(ARN_PREFIX):
+             payload["s3_arn"] = payload.pop("s3_path")
+
+         if "region" not in payload:
+             payload["region"] = "eu-west-1"
+         return ResourceCredentials(**payload)
+
+     @field_validator("s3_arn")
+     @classmethod
+     def validate_s3_arn(cls, value: str) -> str:
+         """Validate s3_arn to ensure it starts with 'arn:aws:s3:::'"""
+         if not value.startswith("arn:aws:s3:::"):
+             raise ValueError(f"Invalid S3 ARN: {value}. It should start with 'arn:aws:s3:::'")
+         return value
+
+     def s3_path(self) -> str:
+         """
+         Extracts the S3 path from the ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket/my-prefix
+         """
+         return self.s3_arn[len(ARN_PREFIX) :]
+
+     def s3_uri(self) -> str:
+         """
+         Converts the S3 ARN to a URI format.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> s3://my-bucket/my-prefix
+         """
+         return f"s3://{self.s3_path()}"
+
+     def bucket_name(self) -> str:
+         """
+         Extracts the bucket name from the S3 ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket
+         """
+         return self.s3_path().split("/")[0]
+
+     def object_key(self) -> str:
+         """
+         Extracts the object key from the S3 ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-prefix
+         """
+         return "/".join(self.s3_path().split("/")[1:])
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hafnia
- Version: 0.5.0
+ Version: 0.5.1
  Summary: Python SDK for communication with Hafnia platform.
  Author-email: Milestone Systems <hafniaplatform@milestone.dk>
  License-File: LICENSE
@@ -10,7 +10,7 @@ Requires-Dist: click>=8.1.8
  Requires-Dist: emoji>=2.14.1
  Requires-Dist: flatten-dict>=0.4.2
  Requires-Dist: keyring>=25.6.0
- Requires-Dist: mcp>=1.16.0
+ Requires-Dist: mcp>=1.23.0
  Requires-Dist: mlflow>=3.4.0
  Requires-Dist: more-itertools>=10.7.0
  Requires-Dist: opencv-python-headless>=4.11.0.86