hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. cli/__main__.py +16 -3
  2. cli/config.py +45 -4
  3. cli/consts.py +1 -1
  4. cli/dataset_cmds.py +6 -14
  5. cli/dataset_recipe_cmds.py +78 -0
  6. cli/experiment_cmds.py +226 -43
  7. cli/keychain.py +88 -0
  8. cli/profile_cmds.py +10 -6
  9. cli/runc_cmds.py +5 -5
  10. cli/trainer_package_cmds.py +65 -0
  11. hafnia/__init__.py +2 -0
  12. hafnia/data/factory.py +1 -2
  13. hafnia/dataset/dataset_helpers.py +9 -14
  14. hafnia/dataset/dataset_names.py +10 -5
  15. hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
  16. hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
  17. hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
  18. hafnia/dataset/dataset_upload_helper.py +265 -56
  19. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  20. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  21. hafnia/dataset/hafnia_dataset.py +577 -213
  22. hafnia/dataset/license_types.py +63 -0
  23. hafnia/dataset/operations/dataset_stats.py +259 -3
  24. hafnia/dataset/operations/dataset_transformations.py +332 -7
  25. hafnia/dataset/operations/table_transformations.py +43 -5
  26. hafnia/dataset/primitives/__init__.py +8 -0
  27. hafnia/dataset/primitives/bbox.py +25 -12
  28. hafnia/dataset/primitives/bitmask.py +26 -14
  29. hafnia/dataset/primitives/classification.py +16 -8
  30. hafnia/dataset/primitives/point.py +7 -3
  31. hafnia/dataset/primitives/polygon.py +16 -9
  32. hafnia/dataset/primitives/segmentation.py +10 -7
  33. hafnia/experiment/hafnia_logger.py +111 -8
  34. hafnia/http.py +16 -2
  35. hafnia/platform/__init__.py +9 -3
  36. hafnia/platform/builder.py +12 -10
  37. hafnia/platform/dataset_recipe.py +104 -0
  38. hafnia/platform/datasets.py +47 -9
  39. hafnia/platform/download.py +25 -19
  40. hafnia/platform/experiment.py +51 -56
  41. hafnia/platform/trainer_package.py +57 -0
  42. hafnia/utils.py +81 -13
  43. hafnia/visualizations/image_visualizations.py +4 -4
  44. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
  45. hafnia-0.4.0.dist-info/RECORD +56 -0
  46. cli/recipe_cmds.py +0 -45
  47. hafnia-0.2.4.dist-info/RECORD +0 -49
  48. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  49. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  50. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/primitives/polygon.py CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import cv2
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,21 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name
 
 class Polygon(Primitive):
     # Names should match names in FieldName
-    points: List[Point]
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Polygon tasks in the same dataset. "" defaults to "polygon"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    points: List[Point] = Field(description="List of points defining the polygon")
+    class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the polygon"
+    )
 
     @staticmethod
     def from_list_of_points(
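Note: the change above moves the inline comments into pydantic `Field(description=...)` declarations without changing defaults. A minimal sketch of what that buys, assuming pydantic v2 and that `Point` accepts `x`/`y` keyword arguments (neither detail is shown in this diff):

```python
from hafnia.dataset.primitives.point import Point
from hafnia.dataset.primitives.polygon import Polygon

# Construct a Polygon as before; the optional fields keep their previous defaults.
polygon = Polygon(
    points=[Point(x=0.1, y=0.2), Point(x=0.4, y=0.2), Point(x=0.3, y=0.6)],  # assumed Point signature
    class_name="vehicle",
)

# The Field descriptions now surface in the generated JSON schema (pydantic v2 API).
schema = Polygon.model_json_schema()
print(schema["properties"]["points"]["description"])  # "List of points defining the polygon"
```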
hafnia/dataset/primitives/segmentation.py CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import cv2
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import get_class_name
@@ -9,15 +10,17 @@ from hafnia.visualizations.colors import get_n_colors
 
 
 class Segmentation(Primitive):
-    # mask: np.ndarray
-    class_names: Optional[List[str]] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    # WARNING: Segmentation masks have not been fully implemented yet
+    class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
 
-    # confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    task_name: str = (
-        ""  # Task name to support multiple Segmentation tasks in the same dataset. "" defaults to "segmentation"
+    task_name: str = Field(
+        default="",
+        description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the segmentation"
     )
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
 
     @staticmethod
     def default_task_name() -> str:
hafnia/experiment/hafnia_logger.py CHANGED
@@ -2,6 +2,7 @@ import json
 import os
 import platform
 import sys
+import textwrap
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
@@ -11,11 +12,19 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from pydantic import BaseModel, field_validator
 
-from hafnia.data.factory import load_dataset
-from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
 from hafnia.utils import is_hafnia_cloud_job, now_as_str
 
+try:
+    import mlflow
+    import mlflow.tracking
+    import sagemaker_mlflow  # noqa: F401
+
+    MLFLOW_AVAILABLE = True
+except ImportError:
+    user_logger.warning("MLFlow is not available")
+    MLFLOW_AVAILABLE = False
+
 
 class EntityType(Enum):
     """Types of entities that can be logged."""
@@ -87,17 +96,43 @@ class HafniaLogger:
         for path in create_paths:
             path.mkdir(parents=True, exist_ok=True)
 
+        path_file = self.path_model() / "HOW_TO_STORE_YOUR_MODEL.txt"
+        path_file.write_text(get_instructions_how_to_store_model())
+
         self.dataset_name: Optional[str] = None
         self.log_file = self._path_artifacts() / self.EXPERIMENT_FILE
         self.schema = Entity.create_schema()
+
+        # Initialize MLflow for remote jobs
+        self._mlflow_initialized = False
+        if is_hafnia_cloud_job() and MLFLOW_AVAILABLE:
+            self._init_mlflow()
+
         self.log_environment()
 
-    def load_dataset(self, dataset_name: str) -> HafniaDataset:
-        """
-        Load a dataset from the specified path.
-        """
-        self.dataset_name = dataset_name
-        return load_dataset(dataset_name)
+    def _init_mlflow(self):
+        """Initialize MLflow tracking for remote jobs."""
+        try:
+            # Set MLflow tracking URI from environment variable
+            tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
+            if tracking_uri:
+                mlflow.set_tracking_uri(tracking_uri)
+                user_logger.info(f"MLflow tracking URI set to: {tracking_uri}")
+
+            # Set experiment name from environment variable
+            experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME")
+            if experiment_name:
+                mlflow.set_experiment(experiment_name)
+                user_logger.info(f"MLflow experiment set to: {experiment_name}")
+
+            # Start MLflow run
+            run_name = os.getenv("MLFLOW_RUN_NAME", "undefined")
+            mlflow.start_run(run_name=run_name)
+            self._mlflow_initialized = True
+            user_logger.info("MLflow run started successfully")
+
+        except Exception as e:
+            user_logger.error(f"Failed to initialize MLflow: {e}")
 
     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
@@ -153,6 +188,14 @@ class HafniaLogger:
         )
         self.write_entity(entity)
 
+        # Also log to MLflow if initialized
+        if not self._mlflow_initialized:
+            return
+        try:
+            mlflow.log_metric(name, value, step=step)
+        except Exception as e:
+            user_logger.error(f"Failed to log metric to MLflow: {e}")
+
     def log_configuration(self, configurations: Dict):
         self.log_hparams(configurations, "configuration.json")
 
@@ -166,6 +209,15 @@
             existing_params.update(params)
             file_path.write_text(json.dumps(existing_params, indent=2))
             user_logger.info(f"Saved parameters to {file_path}")
+
+            # Also log to MLflow if initialized
+            if not self._mlflow_initialized:
+                return
+            try:
+                mlflow.log_params(params)
+            except Exception as e:
+                user_logger.error(f"Failed to log params to MLflow: {e}")
+
         except Exception as e:
             user_logger.error(f"Failed to save parameters to {file_path}: {e}")
 
@@ -202,3 +254,54 @@
             pq.write_table(next_table, self.log_file)
         except Exception as e:
             sys_logger.error(f"Failed to flush logs: {e}")
+
+    def end_run(self) -> None:
+        """End the MLflow run if initialized."""
+        if not self._mlflow_initialized:
+            return
+        try:
+            mlflow.end_run()
+            self._mlflow_initialized = False
+            user_logger.info("MLflow run ended successfully")
+        except Exception as e:
+            user_logger.error(f"Failed to end MLflow run: {e}")
+
+    def __del__(self):
+        """Cleanup when logger is destroyed."""
+        self.end_run()
+
+
+def get_instructions_how_to_store_model() -> str:
+    instructions = textwrap.dedent(
+        """\
+        If you, against your expectations, don't see any models in this folder,
+        we have provided a small guide to help.
+
+        The hafnia TaaS framework expects models to be stored in a folder generated
+        by the hafnia logger. You will need to store models in this folder
+        to ensure that they are properly stored and accessible after training.
+
+        Please check your recipe script and ensure that the models are being stored
+        as expected by the TaaS framework.
+
+        Below is also a small example to demonstrate:
+
+        ```python
+        from hafnia.experiment import HafniaLogger
+
+        # Initiate Hafnia logger
+        logger = HafniaLogger()
+
+        # Folder path to store models - generated by the hafnia logger.
+        model_dir = logger.path_model()
+
+        # Example for storing a pytorch based model. Note: the model is stored in 'model_dir'
+        path_pytorch_model = model_dir / "model.pth"
+
+        # Finally save the model to the specified path
+        torch.save(model.state_dict(), path_pytorch_model)
+        ```
+        """
+    )
+
+    return instructions
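A hedged sketch of how the new MLflow mirroring is driven, based only on what this hunk shows: MLflow is initialized when `is_hafnia_cloud_job()` is true and the `mlflow`/`sagemaker_mlflow` imports succeed, and it reads `MLFLOW_TRACKING_URI`, `MLFLOW_EXPERIMENT_NAME` and `MLFLOW_RUN_NAME` from the environment. The no-argument `HafniaLogger()` construction and the `log_metric(name, value, step)` signature are assumptions, not shown in this diff:

```python
import os

from hafnia.experiment import HafniaLogger

# Assumed values; in a Hafnia cloud job these would be injected by the platform.
os.environ["MLFLOW_TRACKING_URI"] = "https://mlflow.example.com"
os.environ["MLFLOW_EXPERIMENT_NAME"] = "my-experiment"
os.environ["MLFLOW_RUN_NAME"] = "baseline"

logger = HafniaLogger()  # assumed construction, mirroring the embedded example above

# Metrics and params are still written to the local parquet log; when _mlflow_initialized
# is set, they are mirrored via mlflow.log_metric / mlflow.log_params.
logger.log_metric("train/loss", 0.42, step=100)  # assumed method signature

# end_run() closes the MLflow run explicitly; __del__ calls it as a best-effort fallback.
logger.end_run()
```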
hafnia/http.py CHANGED
@@ -1,11 +1,11 @@
 import json
 from pathlib import Path
-from typing import Dict, Optional, Union
+from typing import Dict, List, Optional, Union
 
 import urllib3
 
 
-def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Dict:
+def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Union[Dict, List]:
     """Fetches data from the API endpoint.
 
     Args:
@@ -81,3 +81,17 @@ def post(endpoint: str, headers: Dict, data: Union[Path, Dict, bytes, str], mult
         return json.loads(response.data.decode("utf-8"))
     finally:
         http.clear()
+
+
+def delete(endpoint: str, headers: Dict) -> Dict:
+    """Sends a DELETE request to the specified endpoint."""
+    http = urllib3.PoolManager(retries=urllib3.Retry(3))
+    try:
+        response = http.request("DELETE", endpoint, headers=headers)
+
+        if response.status not in (200, 204):
+            error_details = response.data.decode("utf-8")
+            raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")
+        return json.loads(response.data.decode("utf-8")) if response.data else {}
+    finally:
+        http.clear()
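A short usage sketch for the widened `fetch` return type and the new `delete` helper; the endpoint URLs and API key below are placeholders, not values from this diff:

```python
from hafnia import http

headers = {"Authorization": "hafnia_pat_..."}  # placeholder API key

# fetch() may now return either a dict (single object) or a list (collection endpoint).
recipes = http.fetch("https://api.example.com/v1/dataset-recipes", headers=headers)  # placeholder URL

# delete() retries up to 3 times, accepts 200/204, and returns the decoded JSON body
# (or {} when the response has no body, e.g. a 204).
http.delete("https://api.example.com/v1/dataset-recipes/some-id", headers=headers)  # placeholder URL
```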
hafnia/platform/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from hafnia.platform.datasets import get_dataset_id
 from hafnia.platform.download import (
     download_resource,
     download_single_object,
@@ -5,17 +6,22 @@ from hafnia.platform.download import (
 )
 from hafnia.platform.experiment import (
     create_experiment,
-    create_recipe,
-    get_dataset_id,
+    get_environments,
     get_exp_environment_id,
+    pretty_print_training_environments,
 )
+from hafnia.platform.trainer_package import create_trainer_package, get_trainer_package_by_id, get_trainer_packages
 
 __all__ = [
     "get_dataset_id",
-    "create_recipe",
+    "create_trainer_package",
+    "get_trainer_packages",
+    "get_trainer_package_by_id",
     "get_exp_environment_id",
     "create_experiment",
     "download_resource",
     "download_single_object",
     "get_resource_credentials",
+    "pretty_print_training_environments",
+    "get_environments",
 ]
hafnia/platform/builder.py CHANGED
@@ -14,26 +14,28 @@ from hafnia.log import sys_logger, user_logger
 from hafnia.platform import download_resource
 
 
-def validate_recipe_format(path: Path) -> None:
-    """Validate Hafnia Recipe Format submition"""
+def validate_trainer_package_format(path: Path) -> None:
+    """Validate Hafnia Trainer Package Format submission"""
     hrf = zipfile.Path(path) if path.suffix == ".zip" else path
     required = {"src", "scripts", "Dockerfile"}
     errors = 0
     for rp in required:
         if not (hrf / rp).exists():
-            user_logger.error(f"Required path {rp} not found in recipe.")
+            user_logger.error(f"Required path {rp} not found in trainer package.")
             errors += 1
     if errors > 0:
-        raise FileNotFoundError("Wrong recipe structure")
+        raise FileNotFoundError("Wrong trainer package structure")
 
 
-def prepare_recipe(recipe_url: str, output_dir: Path, api_key: str, state_file: Optional[Path] = None) -> Dict:
-    resource = download_resource(recipe_url, output_dir.as_posix(), api_key)
-    recipe_path = Path(resource["downloaded_files"][0])
-    with zipfile.ZipFile(recipe_path, "r") as zip_ref:
+def prepare_trainer_package(
+    trainer_url: str, output_dir: Path, api_key: str, state_file: Optional[Path] = None
+) -> Dict:
+    resource = download_resource(trainer_url, output_dir.as_posix(), api_key)
+    trainer_path = Path(resource["downloaded_files"][0])
+    with zipfile.ZipFile(trainer_path, "r") as zip_ref:
         zip_ref.extractall(output_dir)
 
-    validate_recipe_format(output_dir)
+    validate_trainer_package_format(output_dir)
 
     scripts_dir = output_dir / "scripts"
     if not any(scripts_dir.iterdir()):
@@ -42,7 +44,7 @@ def prepare_recipe(recipe_url: str, output_dir: Path, api_key: str, state_file:
     metadata = {
         "user_data": (output_dir / "src").as_posix(),
         "dockerfile": (output_dir / "Dockerfile").as_posix(),
-        "digest": sha256(recipe_path.read_bytes()).hexdigest()[:8],
+        "digest": sha256(trainer_path.read_bytes()).hexdigest()[:8],
     }
     state_file = state_file if state_file else output_dir / "state.json"
     with open(state_file, "w", encoding="utf-8") as f:
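A minimal sketch of the layout `validate_trainer_package_format` enforces, using a temporary directory as a stand-in for a real trainer package:

```python
import tempfile
from pathlib import Path

from hafnia.platform.builder import validate_trainer_package_format

with tempfile.TemporaryDirectory() as tmp:
    pkg = Path(tmp)
    (pkg / "src").mkdir()         # training code
    (pkg / "scripts").mkdir()     # entry-point scripts (checked for emptiness by prepare_trainer_package)
    (pkg / "Dockerfile").touch()  # build definition

    # Passes silently when 'src', 'scripts' and 'Dockerfile' all exist; otherwise logs each
    # missing path and raises FileNotFoundError("Wrong trainer package structure").
    validate_trainer_package_format(pkg)
```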
hafnia/platform/dataset_recipe.py ADDED
@@ -0,0 +1,104 @@
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from flatten_dict import flatten
+
+from hafnia import http
+from hafnia.log import user_logger
+from hafnia.utils import pretty_print_list_as_table, timed
+
+
+@timed("Get or create dataset recipe")
+def get_or_create_dataset_recipe(
+    recipe: dict,
+    endpoint: str,
+    api_key: str,
+    name: Optional[str] = None,
+    overwrite: bool = False,
+) -> Optional[Dict]:
+    headers = {"Authorization": api_key}
+    data = {"template": {"body": recipe}, "overwrite": overwrite}
+    if name is not None:
+        data["name"] = name  # type: ignore[assignment]
+
+    response = http.post(endpoint, headers=headers, data=data)
+    return response
+
+
+def get_or_create_dataset_recipe_by_dataset_name(dataset_name: str, endpoint: str, api_key: str) -> Dict:
+    return get_or_create_dataset_recipe(recipe=dataset_name, endpoint=endpoint, api_key=api_key)
+
+
+def get_dataset_recipes(endpoint: str, api_key: str) -> List[Dict]:
+    headers = {"Authorization": api_key}
+    dataset_recipes: List[Dict] = http.fetch(endpoint, headers=headers)  # type: ignore[assignment]
+    return dataset_recipes
+
+
+def get_dataset_recipe_by_id(dataset_recipe_id: str, endpoint: str, api_key: str) -> Dict:
+    headers = {"Authorization": api_key}
+    full_url = f"{endpoint}/{dataset_recipe_id}"
+    dataset_recipe_info: Dict = http.fetch(full_url, headers=headers)  # type: ignore[assignment]
+    if not dataset_recipe_info:
+        raise ValueError(f"Dataset recipe with ID '{dataset_recipe_id}' was not found.")
+    return dataset_recipe_info
+
+
+def get_or_create_dataset_recipe_from_path(
+    path_recipe_json: Path, endpoint: str, api_key: str, name: Optional[str] = None
+) -> Dict:
+    path_recipe_json = Path(path_recipe_json)
+    if not path_recipe_json.exists():
+        raise FileNotFoundError(f"Dataset recipe file '{path_recipe_json}' does not exist.")
+    json_dict = json.loads(path_recipe_json.read_text())
+    return get_or_create_dataset_recipe(json_dict, endpoint=endpoint, api_key=api_key, name=name)
+
+
+def delete_dataset_recipe_by_id(id: str, endpoint: str, api_key: str) -> Dict:
+    headers = {"Authorization": api_key}
+    full_url = f"{endpoint}/{id}"
+    response = http.delete(endpoint=full_url, headers=headers)
+    return response
+
+
+@timed("Get dataset recipe")
+def get_dataset_recipe_by_name(name: str, endpoint: str, api_key: str) -> Optional[Dict]:
+    headers = {"Authorization": api_key}
+    full_url = f"{endpoint}?name__iexact={name}"
+    dataset_recipes: List[Dict] = http.fetch(full_url, headers=headers)  # type: ignore[assignment]
+    if len(dataset_recipes) == 0:
+        return None
+
+    if len(dataset_recipes) > 1:
+        user_logger.warning(f"Found {len(dataset_recipes)} dataset recipes called '{name}'. Using the first one.")
+
+    dataset_recipe = dataset_recipes[0]
+    return dataset_recipe
+
+
+def delete_dataset_recipe_by_name(name: str, endpoint: str, api_key: str) -> Optional[Dict]:
+    recipe_response = get_dataset_recipe_by_name(name, endpoint=endpoint, api_key=api_key)
+
+    if recipe_response:
+        return delete_dataset_recipe_by_id(recipe_response["id"], endpoint=endpoint, api_key=api_key)
+    return recipe_response
+
+
+def pretty_print_dataset_recipes(recipes: List[Dict]) -> None:
+    recipes = [flatten(recipe, reducer="dot", max_flatten_depth=2) for recipe in recipes]  # noqa: F821
+    for recipe in recipes:
+        recipe["recipe_json"] = json.dumps(recipe["template.body"])[:20]
+
+    RECIPE_FIELDS = {
+        "ID": "id",
+        "Name": "name",
+        "Recipe": "recipe_json",
+        "Created": "created_at",
+        "IsDataset": "template.is_direct_dataset_reference",
+    }
+    pretty_print_list_as_table(
+        table_title="Available Dataset Recipes",
+        dict_items=recipes,
+        column_name_to_key_mapping=RECIPE_FIELDS,
+    )
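A hedged usage sketch for the new dataset-recipe helpers; the endpoint URL, API key and recipe body are placeholders rather than values taken from this diff:

```python
from hafnia.platform.dataset_recipe import (
    get_dataset_recipe_by_name,
    get_dataset_recipes,
    get_or_create_dataset_recipe,
    pretty_print_dataset_recipes,
)

endpoint = "https://api.example.com/v1/dataset-recipes"  # placeholder endpoint
api_key = "hafnia_pat_..."                               # placeholder key

# Create (or reuse) a recipe from a plain dict body; the real recipe schema lives in
# hafnia/dataset/dataset_recipe/dataset_recipe.py and is not shown here.
created = get_or_create_dataset_recipe(
    recipe={"dataset_name": "mnist"},  # assumed body
    endpoint=endpoint,
    api_key=api_key,
    name="mnist-baseline",
)

# Lookup is case-insensitive (?name__iexact=...); duplicates log a warning and use the first hit.
recipe = get_dataset_recipe_by_name("mnist-baseline", endpoint=endpoint, api_key=api_key)

# Render all recipes as a table via pretty_print_list_as_table.
pretty_print_dataset_recipes(get_dataset_recipes(endpoint=endpoint, api_key=api_key))
```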
hafnia/platform/datasets.py CHANGED
@@ -8,10 +8,11 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 import rich
-from tqdm import tqdm
+from rich import print as rprint
+from rich.progress import track
 
 from cli.config import Config
-from hafnia import utils
+from hafnia import http, utils
 from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
@@ -20,13 +21,12 @@ from hafnia.dataset.dataset_recipe.dataset_recipe import (
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.http import fetch
 from hafnia.log import sys_logger, user_logger
-from hafnia.platform import get_dataset_id
 from hafnia.platform.download import get_resource_credentials
 from hafnia.utils import timed
 
 
 @timed("Fetching dataset list.")
-def dataset_list(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
+def get_datasets(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
     """List available datasets on the Hafnia platform."""
     cfg = cfg or Config()
     endpoint_dataset = cfg.get_platform_endpoint("datasets")
@@ -38,6 +38,19 @@ def dataset_list(cfg: Optional[Config] = None) -> List[Dict[str, str]]:
     return datasets
 
 
+@timed("Fetching dataset info.")
+def get_dataset_id(dataset_name: str, endpoint: str, api_key: str) -> str:
+    headers = {"Authorization": api_key}
+    full_url = f"{endpoint}?name__iexact={dataset_name}"
+    dataset_responses: List[Dict] = http.fetch(full_url, headers=headers)  # type: ignore[assignment]
+    if not dataset_responses:
+        raise ValueError(f"Dataset '{dataset_name}' was not found in the dataset library.")
+    try:
+        return dataset_responses[0]["id"]
+    except (IndexError, KeyError) as e:
+        raise ValueError("Dataset information is missing or invalid") from e
+
+
 def download_or_get_dataset_path(
     dataset_name: str,
     cfg: Optional[Config] = None,
@@ -109,7 +122,7 @@ def download_dataset_from_access_endpoint(
     try:
         fast_copy_files_s3(
             src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
-            dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
+            dst_paths=dataset.samples[ColumnName.FILE_PATH].to_list(),
            append_envs=envs,
            description="Downloading images",
        )
@@ -131,6 +144,28 @@ def fast_copy_files_s3(
     return lines
 
 
+def find_s5cmd() -> Optional[str]:
+    """Locate the s5cmd executable across different installation methods.
+
+    Searches for s5cmd in:
+    1. System PATH (via shutil.which)
+    2. Python bin directory (Unix-like systems)
+    3. Python executable directory (direct installs)
+
+    Returns:
+        str: Absolute path to s5cmd executable if found, None otherwise.
+    """
+    result = shutil.which("s5cmd")
+    if result:
+        return result
+    python_dir = Path(sys.executable).parent
+    locations = (python_dir / "Scripts" / "s5cmd.exe", python_dir / "bin" / "s5cmd", python_dir / "s5cmd")
+    for loc in locations:
+        if loc.exists():
+            return str(loc)
+    return None
+
+
 def execute_s5cmd_commands(
     commands: List[str],
     append_envs: Optional[Dict[str, str]] = None,
@@ -142,7 +177,10 @@ def execute_s5cmd_commands(
     with tempfile.TemporaryDirectory() as temp_dir:
         tmp_file_path = Path(temp_dir, f"{uuid.uuid4().hex}.txt")
         tmp_file_path.write_text("\n".join(commands))
-        s5cmd_bin = (Path(sys.executable).parent / "s5cmd").absolute().as_posix()
+
+        s5cmd_bin = find_s5cmd()
+        if s5cmd_bin is None:
+            raise ValueError("Can not find s5cmd executable.")
         run_cmds = [s5cmd_bin, "run", str(tmp_file_path)]
         sys_logger.debug(run_cmds)
         envs = os.environ.copy()
@@ -158,7 +196,7 @@ def execute_s5cmd_commands(
 
        error_lines = []
        lines = []
-        for line in tqdm(process.stdout, total=len(commands), desc=description):
+        for line in track(process.stdout, total=len(commands), description=description):
            if "ERROR" in line or "error" in line:
                error_lines.append(line.strip())
            lines.append(line.strip())
@@ -185,7 +223,7 @@ TABLE_FIELDS = {
 }
 
 
-def create_rich_table_from_dataset(datasets: List[Dict[str, str]]) -> rich.table.Table:
+def pretty_print_datasets(datasets: List[Dict[str, str]]) -> None:
     datasets = extend_dataset_details(datasets)
     datasets = sorted(datasets, key=lambda x: x["name"].lower())
 
@@ -197,7 +235,7 @@ def create_rich_table_from_dataset(datasets: List[Dict[str, str]]) -> rich.table
        row = [str(dataset.get(field, "")) for field in TABLE_FIELDS.values()]
        table.add_row(*row)
 
-    return table
+    rprint(table)
 
 
 def extend_dataset_details(datasets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
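A short sketch of the new lookup and s5cmd discovery helpers in hafnia.platform.datasets; the endpoint and API key are placeholders:

```python
from hafnia.platform.datasets import find_s5cmd, get_dataset_id

# find_s5cmd() checks PATH first, then the Python bin/Scripts directories;
# execute_s5cmd_commands() now raises ValueError when it returns None.
s5cmd_bin = find_s5cmd()
print(s5cmd_bin or "s5cmd not found")

# get_dataset_id() resolves a dataset name (case-insensitively) to its platform ID.
dataset_id = get_dataset_id(
    dataset_name="mnist",                            # assumed dataset name
    endpoint="https://api.example.com/v1/datasets",  # placeholder endpoint
    api_key="hafnia_pat_...",                        # placeholder key
)
```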
hafnia/platform/download.py CHANGED
@@ -1,10 +1,10 @@
 from pathlib import Path
-from typing import Dict
+from typing import Dict, Optional
 
 import boto3
 from botocore.exceptions import ClientError
 from pydantic import BaseModel, field_validator
-from tqdm import tqdm
+from rich.progress import Progress
 
 from hafnia.http import fetch
 from hafnia.log import sys_logger, user_logger
@@ -96,7 +96,8 @@ def get_resource_credentials(endpoint: str, api_key: str) -> ResourceCredentials
         RuntimeError: If the call to fetch the credentials fails for any reason.
     """
     try:
-        credentials_dict = fetch(endpoint, headers={"Authorization": api_key, "accept": "application/json"})
+        headers = {"Authorization": api_key, "accept": "application/json"}
+        credentials_dict: Dict = fetch(endpoint, headers=headers)  # type: ignore[assignment]
         credentials = ResourceCredentials.fix_naming(credentials_dict)
         sys_logger.debug("Successfully retrieved credentials from DIP endpoint.")
         return credentials
@@ -124,13 +125,15 @@ def download_single_object(s3_client, bucket: str, object_key: str, output_dir:
     return local_path
 
 
-def download_resource(resource_url: str, destination: str, api_key: str) -> Dict:
+def download_resource(resource_url: str, destination: str, api_key: str, prefix: Optional[str] = None) -> Dict:
     """
     Downloads either a single file from S3 or all objects under a prefix.
 
     Args:
         resource_url (str): The URL or identifier used to fetch S3 credentials.
         destination (str): Path to local directory where files will be stored.
+        api_key (str): API key for authentication when fetching credentials.
+        prefix (Optional[str]): If provided, only download objects under this prefix.
 
     Returns:
         Dict[str, Any]: A dictionary containing download info, e.g.:
@@ -146,7 +149,7 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
     res_credentials = get_resource_credentials(resource_url, api_key)
 
     bucket_name = res_credentials.bucket_name()
-    key = res_credentials.object_key()
+    prefix = prefix or res_credentials.object_key()
 
     output_path = Path(destination)
     output_path.mkdir(parents=True, exist_ok=True)
@@ -158,29 +161,32 @@
     )
     downloaded_files = []
     try:
-        s3_client.head_object(Bucket=bucket_name, Key=key)
-        local_file = download_single_object(s3_client, bucket_name, key, output_path)
+        s3_client.head_object(Bucket=bucket_name, Key=prefix)
+        local_file = download_single_object(s3_client, bucket_name, prefix, output_path)
         downloaded_files.append(str(local_file))
         user_logger.info(f"Downloaded single file: {local_file}")
 
     except ClientError as e:
         error_code = e.response.get("Error", {}).get("Code")
         if error_code == "404":
-            sys_logger.debug(f"Object '{key}' not found; trying as a prefix.")
-            response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=key)
+            sys_logger.debug(f"Object '{prefix}' not found; trying as a prefix.")
+            response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
             contents = response.get("Contents", [])
 
             if not contents:
-                raise ValueError(f"No objects found for prefix '{key}' in bucket '{bucket_name}'")
-            pbar = tqdm(contents)
-            for obj in pbar:
-                sub_key = obj["Key"]
-                size_mb = obj.get("Size", 0) / 1024 / 1024
-                pbar.set_description(f"{sub_key} ({size_mb:.2f} MB)")
-                local_file = download_single_object(s3_client, bucket_name, sub_key, output_path)
-                downloaded_files.append(local_file.as_posix())
-
-            user_logger.info(f"Downloaded folder/prefix '{key}' with {len(downloaded_files)} object(s).")
+                raise ValueError(f"No objects found for prefix '{prefix}' in bucket '{bucket_name}'")
+
+            with Progress() as progress:
+                task = progress.add_task("Downloading files", total=len(contents))
+                for obj in contents:
+                    sub_key = obj["Key"]
+                    size_mb = obj.get("Size", 0) / 1024 / 1024
+                    progress.update(task, description=f"Downloading {sub_key} ({size_mb:.2f} MB)")
+                    local_file = download_single_object(s3_client, bucket_name, sub_key, output_path)
+                    downloaded_files.append(local_file.as_posix())
+                    progress.advance(task)
+
+            user_logger.info(f"Downloaded folder/prefix '{prefix}' with {len(downloaded_files)} object(s).")
         else:
             user_logger.error(f"Error checking object or prefix: {e}")
             raise RuntimeError(f"Failed to check or download S3 resource: {e}") from e