hafnia 0.1.27__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cli/__main__.py +2 -2
  2. cli/dataset_cmds.py +60 -0
  3. cli/runc_cmds.py +1 -1
  4. hafnia/data/__init__.py +2 -2
  5. hafnia/data/factory.py +9 -56
  6. hafnia/dataset/dataset_helpers.py +91 -0
  7. hafnia/dataset/dataset_names.py +71 -0
  8. hafnia/dataset/dataset_transformation.py +187 -0
  9. hafnia/dataset/dataset_upload_helper.py +468 -0
  10. hafnia/dataset/hafnia_dataset.py +453 -0
  11. hafnia/dataset/primitives/__init__.py +16 -0
  12. hafnia/dataset/primitives/bbox.py +137 -0
  13. hafnia/dataset/primitives/bitmask.py +182 -0
  14. hafnia/dataset/primitives/classification.py +56 -0
  15. hafnia/dataset/primitives/point.py +25 -0
  16. hafnia/dataset/primitives/polygon.py +100 -0
  17. hafnia/dataset/primitives/primitive.py +44 -0
  18. hafnia/dataset/primitives/segmentation.py +51 -0
  19. hafnia/dataset/primitives/utils.py +51 -0
  20. hafnia/dataset/table_transformations.py +183 -0
  21. hafnia/experiment/hafnia_logger.py +2 -2
  22. hafnia/helper_testing.py +63 -0
  23. hafnia/http.py +5 -3
  24. hafnia/platform/__init__.py +2 -2
  25. hafnia/platform/datasets.py +184 -0
  26. hafnia/platform/download.py +85 -23
  27. hafnia/torch_helpers.py +180 -95
  28. hafnia/utils.py +1 -1
  29. hafnia/visualizations/colors.py +267 -0
  30. hafnia/visualizations/image_visualizations.py +202 -0
  31. {hafnia-0.1.27.dist-info → hafnia-0.2.0.dist-info}/METADATA +212 -99
  32. hafnia-0.2.0.dist-info/RECORD +46 -0
  33. cli/data_cmds.py +0 -53
  34. hafnia-0.1.27.dist-info/RECORD +0 -27
  35. {hafnia-0.1.27.dist-info → hafnia-0.2.0.dist-info}/WHEEL +0 -0
  36. {hafnia-0.1.27.dist-info → hafnia-0.2.0.dist-info}/entry_points.txt +0 -0
  37. {hafnia-0.1.27.dist-info → hafnia-0.2.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/table_transformations.py ADDED
@@ -0,0 +1,183 @@
+ from pathlib import Path
+ from typing import List, Optional, Type
+
+ import polars as pl
+ from tqdm import tqdm
+
+ from hafnia.dataset import table_transformations
+ from hafnia.dataset.dataset_names import (
+     FILENAME_ANNOTATIONS_JSONL,
+     FILENAME_ANNOTATIONS_PARQUET,
+     FieldName,
+ )
+ from hafnia.dataset.primitives import PRIMITIVE_TYPES
+ from hafnia.dataset.primitives.classification import Classification
+ from hafnia.dataset.primitives.primitive import Primitive
+ from hafnia.log import user_logger
+
+
+ def create_primitive_table(
+     samples_table: pl.DataFrame, PrimitiveType: Type[Primitive], keep_sample_data: bool = False
+ ) -> Optional[pl.DataFrame]:
+     """
+     Returns a DataFrame with objects of the specified primitive type.
+     """
+     column_name = PrimitiveType.column_name()
+     has_primitive_column = (column_name in samples_table.columns) and (
+         samples_table[column_name].dtype == pl.List(pl.Struct)
+     )
+     if not has_primitive_column:
+         return None
+
+     # Remove frames without objects
+     remove_no_object_frames = samples_table.filter(pl.col(column_name).list.len() > 0)
+
+     if keep_sample_data:
+         # Drop other primitive columns to avoid conflicts
+         drop_columns = set(PRIMITIVE_TYPES) - {PrimitiveType, Classification}
+         remove_no_object_frames = remove_no_object_frames.drop(*[primitive.column_name() for primitive in drop_columns])
+         # Rename the sample-level "height", "width" and "meta" columns to avoid conflicts with object field names
+         remove_no_object_frames = remove_no_object_frames.rename(
+             {"height": "image.height", "width": "image.width", "meta": "image.meta"}
+         )
+         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
+     else:
+         objects_df = remove_no_object_frames.select(pl.col(column_name).explode().struct.unnest())
+     return objects_df
+
+
+ def filter_table_for_class_names(
+     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
+ ) -> Optional[pl.DataFrame]:
+     table_with_selected_class_names = samples_table.filter(
+         pl.col(PrimitiveType.column_name())
+         .list.eval(pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names))
+         .list.any()
+     )
+
+     return table_with_selected_class_names
+
+
+ def split_primitive_columns_by_task_name(
+     samples_table: pl.DataFrame,
+     coordinate_types: Optional[List[Type[Primitive]]] = None,
+ ) -> pl.DataFrame:
+     """
+     Convert primitive columns such as "objects" (Bbox) into one column per task name.
+     For example, if the "objects" column (containing Bbox objects) has tasks "task1" and "task2",
+     this:
+     ─┬────────────┬─
+      ┆ objects    ┆
+      ┆ ---        ┆
+      ┆ list[struc ┆
+      ┆ t[11]]     ┆
+     ═╪════════════╪═
+     becomes this:
+     ─┬────────────┬────────────┬─
+      ┆ objects.   ┆ objects.   ┆
+      ┆ task1      ┆ task2      ┆
+      ┆ ---        ┆ ---        ┆
+      ┆ list[struc ┆ list[struc ┆
+      ┆ t[11]]     ┆ t[13]]     ┆
+     ═╪════════════╪════════════╪═
+     """
+     coordinate_types = coordinate_types or PRIMITIVE_TYPES
+     for PrimitiveType in coordinate_types:
+         col_name = PrimitiveType.column_name()
+
+         if col_name not in samples_table.columns:
+             continue
+
+         if samples_table[col_name].dtype != pl.List(pl.Struct):
+             continue
+
+         task_names = samples_table[col_name].explode().struct.field(FieldName.TASK_NAME).unique().to_list()
+         samples_table = samples_table.with_columns(
+             [
+                 pl.col(col_name)
+                 .list.filter(pl.element().struct.field(FieldName.TASK_NAME).eq(task_name))
+                 .alias(f"{col_name}.{task_name}")
+                 for task_name in task_names
+             ]
+         )
+         samples_table = samples_table.drop(col_name)
+     return samples_table
+
+
+ def read_table_from_path(path: Path) -> pl.DataFrame:
+     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
+     if path_annotations.exists():
+         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
+         return pl.read_parquet(path_annotations)
+
+     path_annotations_jsonl = path / FILENAME_ANNOTATIONS_JSONL
+     if path_annotations_jsonl.exists():
+         user_logger.info(f"Reading dataset annotations from JSONL file: {path_annotations_jsonl}")
+         return pl.read_ndjson(path_annotations_jsonl)
+
+     raise FileNotFoundError(
+         f"Unable to read annotations. No JSONL file '{path_annotations_jsonl.name}' or Parquet file '{path_annotations.name}' in '{path}'."
+     )
+
+
+ def check_image_paths(table: pl.DataFrame) -> bool:
+     missing_files = []
+     for org_path in tqdm(table["file_name"].to_list(), desc="Check image paths"):
+         org_path = Path(org_path)
+         if not org_path.exists():
+             missing_files.append(org_path)
+
+     if len(missing_files) > 0:
+         user_logger.warning(f"Missing files: {len(missing_files)}. Show first 5:")
+         for missing_file in missing_files[:5]:
+             user_logger.warning(f"  - {missing_file}")
+         raise FileNotFoundError(f"Some files are missing in the dataset: {len(missing_files)} files not found.")
+
+     return True
+
+
+ def unnest_classification_tasks(table: pl.DataFrame, strict: bool = True) -> pl.DataFrame:
+     """
+     Unnest classification tasks in the table.
+     Classification tasks are all stored in the same column of the HafniaDataset table.
+     This function splits them into a separate column per task name.
+
+     The type is converted from a list of structs (pl.List[pl.Struct]) to a struct (pl.Struct) column.
+
+     For example, if the classification column has tasks "task1" and "task2", it converts this:
+     ─┬─────────────────┬─
+      ┆ classifications ┆
+      ┆ ---             ┆
+      ┆ list[struct[6]] ┆
+     ═╪═════════════════╪═
+
+     into this:
+     ─┬──────────────────┬──────────────────┬─
+      ┆ classifications. ┆ classifications. ┆
+      ┆ task1            ┆ task2            ┆
+      ┆ ---              ┆ ---              ┆
+      ┆ struct[6]        ┆ struct[6]        ┆
+     ═╪══════════════════╪══════════════════╪═
+     """
+     coordinate_types = [Classification]
+     table_out = table_transformations.split_primitive_columns_by_task_name(table, coordinate_types=coordinate_types)
+
+     classification_columns = [c for c in table_out.columns if c.startswith(Classification.column_name() + ".")]
+     for classification_column in classification_columns:
+         has_multiple_items_per_sample = all(table_out[classification_column].list.len() > 1)
+         if has_multiple_items_per_sample:
+             if strict:
+                 raise ValueError(
+                     f"Column {classification_column} has multiple items per sample, but expected only one item."
+                 )
+             else:
+                 user_logger.warning(
+                     f"Unnesting of column '{classification_column}' is skipped because it has multiple items per sample."
+                 )
+
+     table_out = table_out.with_columns([pl.col(c).list.first() for c in classification_columns])
+     return table_out
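
To make the list[struct] handling above concrete, here is a minimal sketch (not part of the package) of the class-name filter and the explode/unnest pattern that filter_table_for_class_names and create_primitive_table are built on. The column and field names ("objects", "class_name", "task_name") are illustrative stand-ins for the schema constants used in the module:

import polars as pl

# Toy samples table: one row per image, annotations stored as a list[struct] column.
samples = pl.DataFrame(
    {
        "file_name": ["a.jpg", "b.jpg"],
        "objects": [
            [{"class_name": "car", "task_name": "detection"}],
            [{"class_name": "person", "task_name": "detection"}],
        ],
    }
)

# Keep only frames that contain at least one "car" (same expression shape as filter_table_for_class_names).
cars_only = samples.filter(
    pl.col("objects")
    .list.eval(pl.element().struct.field("class_name").is_in(["car"]))
    .list.any()
)

# Flatten to one row per annotation (the non-keep_sample_data path of create_primitive_table).
objects = cars_only.select(pl.col("objects").explode().struct.unnest())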
hafnia/experiment/hafnia_logger.py CHANGED
@@ -9,10 +9,10 @@ from typing import Dict, Optional, Union
 
  import pyarrow as pa
  import pyarrow.parquet as pq
- from datasets import DatasetDict
  from pydantic import BaseModel, field_validator
 
  from hafnia.data.factory import load_dataset
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.log import sys_logger, user_logger
  from hafnia.utils import is_remote_job, now_as_str
 
@@ -92,7 +92,7 @@ class HafniaLogger:
          self.schema = Entity.create_schema()
          self.log_environment()
 
-     def load_dataset(self, dataset_name: str) -> DatasetDict:
+     def load_dataset(self, dataset_name: str) -> HafniaDataset:
          """
          Load a dataset from the specified path.
          """
hafnia/helper_testing.py ADDED
@@ -0,0 +1,63 @@
+ from pathlib import Path
+
+ from hafnia import utils
+ from hafnia.dataset.dataset_names import FILENAME_ANNOTATIONS_JSONL, DatasetVariant
+ from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample
+
+ MICRO_DATASETS = {
+     "tiny-dataset": utils.PATH_DATASETS / "tiny-dataset",
+     "coco-2017": utils.PATH_DATASETS / "coco-2017",
+ }
+
+
+ def get_path_workspace() -> Path:
+     return Path(__file__).parents[2]
+
+
+ def get_path_expected_images() -> Path:
+     return get_path_workspace() / "tests" / "data" / "expected_images"
+
+
+ def get_path_test_data() -> Path:
+     return get_path_workspace() / "tests" / "data"
+
+
+ def get_path_micro_hafnia_dataset_no_check() -> Path:
+     return get_path_test_data() / "micro_test_datasets"
+
+
+ def get_path_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Path:
+     import pytest
+
+     if dataset_name not in MICRO_DATASETS:
+         raise ValueError(f"Dataset name '{dataset_name}' is not recognized. Available options: {list(MICRO_DATASETS)}")
+     path_dataset = MICRO_DATASETS[dataset_name]
+
+     path_test_dataset = get_path_micro_hafnia_dataset_no_check() / dataset_name
+     path_test_dataset_annotations = path_test_dataset / FILENAME_ANNOTATIONS_JSONL
+     if path_test_dataset_annotations.exists() and not force_update:
+         return path_test_dataset
+
+     hafnia_dataset = HafniaDataset.read_from_path(path_dataset / DatasetVariant.SAMPLE.value)
+     hafnia_dataset = hafnia_dataset.sample(n_samples=3, seed=42)
+     hafnia_dataset.write(path_test_dataset)
+
+     if force_update:
+         pytest.fail(
+             "Sample image and metadata have been updated using 'force_update=True'. Set 'force_update=False' and rerun the test."
+         )
+     pytest.fail("Missing test sample image. Please rerun the test.")
+     return path_test_dataset
+
+
+ def get_sample_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Sample:
+     micro_dataset = get_micro_hafnia_dataset(dataset_name=dataset_name, force_update=force_update)
+     sample_dict = micro_dataset[0]
+     sample = Sample(**sample_dict)
+     return sample
+
+
+ def get_micro_hafnia_dataset(dataset_name: str, force_update: bool = False) -> HafniaDataset:
+     path_dataset = get_path_micro_hafnia_dataset(dataset_name=dataset_name, force_update=force_update)
+     hafnia_dataset = HafniaDataset.read_from_path(path_dataset)
+     return hafnia_dataset
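
A short sketch of how these helpers could back a test; "tiny-dataset" is one of the MICRO_DATASETS keys defined above, and the test body itself is only illustrative:

from hafnia.helper_testing import get_micro_hafnia_dataset, get_sample_micro_hafnia_dataset

def test_micro_dataset_sample():
    # Loads (or rebuilds on first use) the cached 3-sample micro dataset and its first Sample.
    dataset = get_micro_hafnia_dataset("tiny-dataset")
    sample = get_sample_micro_hafnia_dataset("tiny-dataset")
    assert dataset is not None and sample is not None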
hafnia/http.py CHANGED
@@ -31,7 +31,7 @@ def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Dict:
          http.clear()
 
 
- def post(endpoint: str, headers: Dict, data: Union[Path, Dict, bytes], multipart: bool = False) -> Dict:
+ def post(endpoint: str, headers: Dict, data: Union[Path, Dict, bytes, str], multipart: bool = False) -> Dict:
      """Posts data to backend endpoint.
 
      Args:
@@ -64,9 +64,11 @@ def post(endpoint: str, headers: Dict, data: Union[Path, Dict, bytes], multipart
          with open(data, "rb") as f:
              body = f.read()
          response = http.request("POST", endpoint, body=body, headers=headers)
-     elif isinstance(data, dict):
+     elif isinstance(data, (str, dict)):
+         if isinstance(data, dict):
+             data = json.dumps(data)
          headers["Content-Type"] = "application/json"
-         response = http.request("POST", endpoint, body=json.dumps(data), headers=headers)
+         response = http.request("POST", endpoint, body=data, headers=headers)
      elif isinstance(data, bytes):
          response = http.request("POST", endpoint, body=data, headers=headers)
      else:
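
The widened signature means post() now accepts a pre-serialized JSON string in addition to a dict; both calls below send the same body (the endpoint and headers are placeholders):

from hafnia.http import post

headers = {"Authorization": "<api-key>"}
# Dict payloads are serialized by post() itself ...
post("https://api.example.com/items", headers=headers, data={"name": "demo"})
# ... and, as of 0.2.0, an already-serialized JSON string is sent as-is.
post("https://api.example.com/items", headers=headers, data='{"name": "demo"}')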
hafnia/platform/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from hafnia.platform.download import (
      download_resource,
      download_single_object,
-     get_resource_creds,
+     get_resource_credentials,
  )
  from hafnia.platform.experiment import (
      create_experiment,
@@ -17,5 +17,5 @@ __all__ = [
      "create_experiment",
      "download_resource",
      "download_single_object",
-     "get_resource_creds",
+     "get_resource_credentials",
  ]
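
For downstream code this is a plain rename; the old symbol is no longer exported:

# hafnia 0.1.27
from hafnia.platform import get_resource_creds
# hafnia 0.2.0
from hafnia.platform import get_resource_credentials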
hafnia/platform/datasets.py ADDED
@@ -0,0 +1,184 @@
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import rich
+ from tqdm import tqdm
+
+ from cli.config import Config
+ from hafnia import utils
+ from hafnia.dataset import dataset_names
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
+ from hafnia.http import fetch
+ from hafnia.log import user_logger
+ from hafnia.platform import get_dataset_id
+ from hafnia.platform.download import get_resource_credentials
+ from hafnia.utils import timed
+
+
+ @timed("Fetching dataset list.")
+ def dataset_list(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
+     """List available datasets on the Hafnia platform."""
+     cfg = cfg or Config()
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     header = {"Authorization": cfg.api_key}
+     datasets: List[Dict[str, str]] = fetch(endpoint_dataset, headers=header)  # type: ignore
+     if not datasets:
+         raise ValueError("No datasets found on the Hafnia platform.")
+
+     return datasets
+
+
+ def download_or_get_dataset_path(
+     dataset_name: str,
+     cfg: Optional[Config] = None,
+     path_datasets_folder: Optional[str] = None,
+     force_redownload: bool = False,
+ ) -> Path:
+     """Download the dataset or return the path of an existing local copy."""
+     if utils.is_remote_job():
+         return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
+
+     path_datasets_folder = path_datasets_folder or str(utils.PATH_DATASETS)
+     path_dataset = Path(path_datasets_folder).absolute() / dataset_name
+
+     is_dataset_valid = HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
+     if is_dataset_valid and not force_redownload:
+         user_logger.info("Dataset found locally. Set 'force=True' or add the `--force` CLI flag to re-download.")
+         return path_dataset
+
+     cfg = cfg or Config()
+     api_key = cfg.api_key
+
+     shutil.rmtree(path_dataset, ignore_errors=True)
+
+     endpoint_dataset = cfg.get_platform_endpoint("datasets")
+     dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
+     access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"
+
+     download_dataset_from_access_endpoint(
+         endpoint=access_dataset_endpoint,
+         api_key=api_key,
+         path_dataset=path_dataset,
+     )
+     return path_dataset
+
+
+ def download_dataset_from_access_endpoint(endpoint: str, api_key: str, path_dataset: Path) -> None:
+     resource_credentials = get_resource_credentials(endpoint, api_key)
+
+     local_dataset_paths = [str(path_dataset / filename) for filename in dataset_names.DATASET_FILENAMES]
+     s3_uri = resource_credentials.s3_uri()
+     s3_dataset_files = [f"{s3_uri}/{filename}" for filename in dataset_names.DATASET_FILENAMES]
+
+     envs = resource_credentials.aws_credentials()
+     fast_copy_files_s3(
+         src_paths=s3_dataset_files,
+         dst_paths=local_dataset_paths,
+         append_envs=envs,
+         description="Downloading annotations",
+     )
+
+     dataset = HafniaDataset.read_from_path(path_dataset, check_for_images=False)
+     fast_copy_files_s3(
+         src_paths=dataset.samples[dataset_names.ColumnName.REMOTE_PATH].to_list(),
+         dst_paths=dataset.samples[dataset_names.ColumnName.FILE_NAME].to_list(),
+         append_envs=envs,
+         description="Downloading images",
+     )
+
+
+ def fast_copy_files_s3(
+     src_paths: List[str],
+     dst_paths: List[str],
+     append_envs: Optional[Dict[str, str]] = None,
+     description: str = "Copying files",
+ ) -> List[str]:
+     if len(src_paths) != len(dst_paths):
+         raise ValueError("Source and destination paths must have the same length.")
+
+     cmds = [f"cp {src} {dst}" for src, dst in zip(src_paths, dst_paths)]
+     lines = execute_s5cmd_commands(cmds, append_envs=append_envs, description=description)
+     return lines
+
+
+ def execute_s5cmd_commands(
+     commands: List[str],
+     append_envs: Optional[Dict[str, str]] = None,
+     description: str = "Executing s5cmd commands",
+ ) -> List[str]:
+     append_envs = append_envs or {}
+     with tempfile.NamedTemporaryFile(suffix=".txt") as tmp_file:
+         tmp_file_path = Path(tmp_file.name)
+         tmp_file_path.write_text("\n".join(commands))
+         run_cmds = [
+             "s5cmd",
+             "run",
+             str(tmp_file_path),
+         ]
+         envs = os.environ.copy()
+         envs.update(append_envs)
+
+         process = subprocess.Popen(
+             run_cmds,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             universal_newlines=True,
+             env=envs,
+         )
+
+         error_lines = []
+         lines = []
+         for line in tqdm(process.stdout, total=len(commands), desc=description):
+             if "ERROR" in line or "error" in line:
+                 error_lines.append(line.strip())
+             lines.append(line.strip())
+
+         if len(error_lines) > 0:
+             show_n_lines = min(5, len(error_lines))
+             str_error_lines = "\n".join(error_lines[:show_n_lines])
+             user_logger.error(
+                 f"Detected {len(error_lines)} errors while executing a total of {len(commands)} "
+                 f"commands with s5cmd. The first {show_n_lines} are printed below:\n{str_error_lines}"
+             )
+             raise RuntimeError("Errors occurred during s5cmd execution.")
+     return lines
+
+
+ TABLE_FIELDS = {
+     "ID": "id",
+     "Hidden\nSamples": "hidden.samples",
+     "Hidden\nSize": "hidden.size",
+     "Sample\nSamples": "sample.samples",
+     "Sample\nSize": "sample.size",
+     "Name": "name",
+     "Title": "title",
+ }
+
+
+ def create_rich_table_from_dataset(datasets: List[Dict[str, str]]) -> rich.table.Table:
+     datasets = extend_dataset_details(datasets)
+     datasets = sorted(datasets, key=lambda x: x["name"].lower())
+
+     table = rich.table.Table(title="Available Datasets")
+     for i_dataset, dataset in enumerate(datasets):
+         if i_dataset == 0:
+             for column_name, _ in TABLE_FIELDS.items():
+                 table.add_column(column_name, justify="left", style="cyan", no_wrap=True)
+         row = [str(dataset.get(field, "")) for field in TABLE_FIELDS.values()]
+         table.add_row(*row)
+
+     return table
+
+
+ def extend_dataset_details(datasets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Extend dataset details with the number of samples and the size of each variant."""
+     for dataset in datasets:
+         for variant in dataset["dataset_variants"]:
+             variant_type = variant["variant_type"]
+             dataset[f"{variant_type}.samples"] = variant["number_of_data_items"]
+             dataset[f"{variant_type}.size"] = utils.size_human_readable(variant["size_bytes"])
+     return datasets
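
A minimal sketch of how these pieces compose, assuming an API key is already configured for Config: dataset_list() fetches the catalog and create_rich_table_from_dataset renders it with rich:

import rich

from hafnia.platform.datasets import create_rich_table_from_dataset, dataset_list

datasets = dataset_list()  # raises ValueError if the platform returns no datasets
rich.print(create_rich_table_from_dataset(datasets))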
hafnia/platform/download.py CHANGED
@@ -1,15 +1,87 @@
  from pathlib import Path
- from typing import Any, Dict
+ from typing import Dict
 
  import boto3
  from botocore.exceptions import ClientError
+ from pydantic import BaseModel, field_validator
  from tqdm import tqdm
 
  from hafnia.http import fetch
  from hafnia.log import sys_logger, user_logger
 
-
- def get_resource_creds(endpoint: str, api_key: str) -> Dict[str, Any]:
+ ARN_PREFIX = "arn:aws:s3:::"
+
+
+ class ResourceCredentials(BaseModel):
+     access_key: str
+     secret_key: str
+     session_token: str
+     s3_arn: str
+     region: str
+
+     @staticmethod
+     def fix_naming(payload: Dict[str, str]) -> "ResourceCredentials":
+         """
+         The endpoint returns a payload with a key called 's3_path', but the value is
+         actually an ARN (it starts with 'arn:aws:s3:::'). This method renames it to 's3_arn' for consistency.
+         """
+         if "s3_path" in payload and payload["s3_path"].startswith(ARN_PREFIX):
+             payload["s3_arn"] = payload.pop("s3_path")
+
+         if "region" not in payload:
+             payload["region"] = "eu-west-1"
+         return ResourceCredentials(**payload)
+
+     @field_validator("s3_arn")
+     @classmethod
+     def validate_s3_arn(cls, value: str) -> str:
+         """Validate that 's3_arn' starts with 'arn:aws:s3:::'."""
+         if not value.startswith("arn:aws:s3:::"):
+             raise ValueError(f"Invalid S3 ARN: {value}. It should start with 'arn:aws:s3:::'")
+         return value
+
+     def s3_path(self) -> str:
+         """
+         Extracts the S3 path from the ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket/my-prefix
+         """
+         return self.s3_arn[len(ARN_PREFIX) :]
+
+     def s3_uri(self) -> str:
+         """
+         Converts the S3 ARN to a URI format.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> s3://my-bucket/my-prefix
+         """
+         return f"s3://{self.s3_path()}"
+
+     def bucket_name(self) -> str:
+         """
+         Extracts the bucket name from the S3 ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket
+         """
+         return self.s3_path().split("/")[0]
+
+     def object_key(self) -> str:
+         """
+         Extracts the object key from the S3 ARN.
+         Example: arn:aws:s3:::my-bucket/my-prefix -> my-prefix
+         """
+         return "/".join(self.s3_path().split("/")[1:])
+
+     def aws_credentials(self) -> Dict[str, str]:
+         """
+         Returns the AWS credentials as a dictionary of environment variables.
+         """
+         environment_vars = {
+             "AWS_ACCESS_KEY_ID": self.access_key,
+             "AWS_SECRET_ACCESS_KEY": self.secret_key,
+             "AWS_SESSION_TOKEN": self.session_token,
+             "AWS_REGION": self.region,
+         }
+         return environment_vars
+
+
+ def get_resource_credentials(endpoint: str, api_key: str) -> ResourceCredentials:
      """
      Retrieve credentials for accessing the recipe stored in S3 (or another resource)
      by calling a DIP endpoint with the API key.
@@ -18,21 +90,16 @@ def get_resource_creds(endpoint: str, api_key: str) -> Dict[str, Any]:
          endpoint (str): The endpoint URL to fetch credentials from.
 
      Returns:
-         Dict[str, Any]: Dictionary containing the credentials, for example:
-             {
-                 "access_key": str,
-                 "secret_key": str,
-                 "session_token": str,
-                 "s3_path": str
-             }
+         ResourceCredentials
 
      Raises:
          RuntimeError: If the call to fetch the credentials fails for any reason.
      """
      try:
-         creds = fetch(endpoint, headers={"Authorization": api_key, "accept": "application/json"})
+         credentials_dict = fetch(endpoint, headers={"Authorization": api_key, "accept": "application/json"})
+         credentials = ResourceCredentials.fix_naming(credentials_dict)
          sys_logger.debug("Successfully retrieved credentials from DIP endpoint.")
-         return creds
+         return credentials
      except Exception as e:
          sys_logger.error(f"Failed to fetch credentials from endpoint: {e}")
          raise RuntimeError(f"Failed to retrieve credentials: {e}") from e
@@ -76,23 +143,18 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
          ValueError: If the S3 ARN is invalid or no objects found under prefix.
          RuntimeError: If S3 calls fail with an unexpected error.
      """
-     res_creds = get_resource_creds(resource_url, api_key)
-     s3_arn = res_creds["s3_path"]
-     arn_prefix = "arn:aws:s3:::"
-     if not s3_arn.startswith(arn_prefix):
-         raise ValueError(f"Invalid S3 ARN: {s3_arn}")
+     res_credentials = get_resource_credentials(resource_url, api_key)
 
-     s3_path = s3_arn[len(arn_prefix) :]
-     bucket_name, *key_parts = s3_path.split("/")
-     key = "/".join(key_parts)
+     bucket_name = res_credentials.bucket_name()
+     key = res_credentials.object_key()
 
      output_path = Path(destination)
      output_path.mkdir(parents=True, exist_ok=True)
      s3_client = boto3.client(
          "s3",
-         aws_access_key_id=res_creds["access_key"],
-         aws_secret_access_key=res_creds["secret_key"],
-         aws_session_token=res_creds["session_token"],
+         aws_access_key_id=res_credentials.access_key,
+         aws_secret_access_key=res_credentials.secret_key,
+         aws_session_token=res_credentials.session_token,
      )
      downloaded_files = []
      try: