hafnia-0.2.0-py3-none-any.whl → hafnia-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/config.py +17 -4
- hafnia/data/factory.py +13 -10
- hafnia/dataset/dataset_names.py +2 -1
- hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
- hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
- hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
- hafnia/dataset/hafnia_dataset.py +202 -31
- hafnia/dataset/operations/dataset_stats.py +15 -0
- hafnia/dataset/operations/dataset_transformations.py +82 -0
- hafnia/dataset/{table_transformations.py → operations/table_transformations.py} +1 -1
- hafnia/experiment/hafnia_logger.py +5 -5
- hafnia/helper_testing.py +48 -3
- hafnia/platform/datasets.py +26 -13
- hafnia/utils.py +20 -1
- hafnia/visualizations/image_visualizations.py +1 -1
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/METADATA +17 -20
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/RECORD +20 -16
- hafnia/dataset/dataset_transformation.py +0 -187
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/hafnia_dataset.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import shutil
 from dataclasses import dataclass
 from pathlib import Path
+from random import Random
 from typing import Any, Dict, List, Optional, Type, Union

 import more_itertools
@@ -16,16 +17,23 @@ from rich import print as rprint
 from rich.table import Table
 from tqdm import tqdm

-from hafnia.dataset import dataset_helpers
+from hafnia.dataset import dataset_helpers
 from hafnia.dataset.dataset_names import (
-
+    DATASET_FILENAMES_REQUIRED,
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
     FILENAME_DATASET_INFO,
+    FILENAME_RECIPE_JSON,
     ColumnName,
     FieldName,
     SplitName,
 )
+from hafnia.dataset.operations import dataset_stats, dataset_transformations
+from hafnia.dataset.operations.table_transformations import (
+    check_image_paths,
+    create_primitive_table,
+    read_table_from_path,
+)
 from hafnia.dataset.primitives import (
     PRIMITIVE_NAME_TO_TYPE,
     PRIMITIVE_TYPES,
@@ -35,11 +43,6 @@ from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.polygon import Polygon
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.dataset.table_transformations import (
-    check_image_paths,
-    create_primitive_table,
-    read_table_from_path,
-)
 from hafnia.log import user_logger

@@ -171,13 +174,33 @@ class HafniaDataset:
         for row in self.samples.iter_rows(named=True):
             yield row

-
-
-
-
-
-
-
+    @staticmethod
+    def from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
+        HafniaDataset.check_dataset_path(path_folder, raise_error=True)
+
+        dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
+        table = read_table_from_path(path_folder)
+
+        # Convert from relative paths to absolute paths
+        table = table.with_columns(
+            pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
+        )
+        if check_for_images:
+            check_image_paths(table)
+        return HafniaDataset(samples=table, info=dataset_info)
+
+    @staticmethod
+    def from_name(name: str, force_redownload: bool = False, download_files: bool = True) -> "HafniaDataset":
+        """
+        Load a dataset by its name. The dataset must be registered in the Hafnia platform.
+        """
+        from hafnia.dataset.hafnia_dataset import HafniaDataset
+        from hafnia.platform.datasets import download_or_get_dataset_path
+
+        dataset_path = download_or_get_dataset_path(
+            dataset_name=name, force_redownload=force_redownload, download_files=download_files
+        )
+        return HafniaDataset.from_path(dataset_path, check_for_images=download_files)

     @staticmethod
     def from_samples_list(samples_list: List, info: DatasetInfo) -> "HafniaDataset":
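The hunk above replaces `read_from_path` with `from_path` and adds `from_name`, which resolves a dataset registered on the Hafnia platform and downloads it before loading. A minimal usage sketch; the dataset name "mnist" and the local path are placeholders, not values taken from this diff:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

# Load a dataset that already exists on disk; relative image paths are made
# absolute and, optionally, checked for existence.
dataset = HafniaDataset.from_path(Path("path/to/dataset"), check_for_images=True)

# Fetch a dataset registered on the Hafnia platform by name. With
# download_files=False only annotations are fetched and the image check is skipped.
dataset = HafniaDataset.from_name("mnist", force_redownload=False, download_files=True)
```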
@@ -194,6 +217,140 @@ class HafniaDataset:

         return HafniaDataset(info=info, samples=table)

+    @staticmethod
+    def from_recipe(dataset_recipe: Any) -> "HafniaDataset":
+        """
+        Load a dataset from a recipe. The recipe can be a string (name of the dataset), a dictionary, or a DataRecipe object.
+        """
+        from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe
+
+        recipe_explicit = DatasetRecipe.from_implicit_form(dataset_recipe)
+
+        return recipe_explicit.build()  # Build dataset from the recipe
+
+    @staticmethod
+    def from_merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+        return HafniaDataset.merge(dataset0, dataset1)
+
+    @staticmethod
+    def from_recipe_with_cache(
+        dataset_recipe: Any,
+        force_redownload: bool = False,
+        path_datasets: Optional[Union[Path, str]] = None,
+    ) -> "HafniaDataset":
+        """
+        Loads a dataset from a recipe and caches it to disk.
+        If the dataset is already cached, it will be loaded from the cache.
+        """
+
+        path_dataset = get_or_create_dataset_path_from_recipe(dataset_recipe, path_datasets=path_datasets)
+        return HafniaDataset.from_path(path_dataset, check_for_images=False)
+
+    @staticmethod
+    def from_merger(
+        datasets: List[HafniaDataset],
+    ) -> "HafniaDataset":
+        """
+        Merges multiple Hafnia datasets into one.
+        """
+        if len(datasets) == 0:
+            raise ValueError("No datasets to merge. Please provide at least one dataset.")
+
+        if len(datasets) == 1:
+            return datasets[0]
+
+        merged_dataset = datasets[0]
+        remaining_datasets = datasets[1:]
+        for dataset in remaining_datasets:
+            merged_dataset = HafniaDataset.merge(merged_dataset, dataset)
+        return merged_dataset
+
+    # Dataset transformations
+    transform_images = dataset_transformations.transform_images
+
+    def shuffle(dataset: HafniaDataset, seed: int = 42) -> HafniaDataset:
+        table = dataset.samples.sample(n=len(dataset), with_replacement=False, seed=seed, shuffle=True)
+        return dataset.update_table(table)
+
+    def select_samples(
+        dataset: "HafniaDataset", n_samples: int, shuffle: bool = True, seed: int = 42, with_replacement: bool = False
+    ) -> "HafniaDataset":
+        if not with_replacement:
+            n_samples = min(n_samples, len(dataset))
+        table = dataset.samples.sample(n=n_samples, with_replacement=with_replacement, seed=seed, shuffle=shuffle)
+        return dataset.update_table(table)
+
+    def splits_by_ratios(dataset: "HafniaDataset", split_ratios: Dict[str, float], seed: int = 42) -> "HafniaDataset":
+        """
+        Divides the dataset into splits based on the provided ratios.
+
+        Example: Defining split ratios and applying the transformation
+
+        >>> dataset = HafniaDataset.read_from_path(Path("path/to/dataset"))
+        >>> split_ratios = {SplitName.TRAIN: 0.8, SplitName.VAL: 0.1, SplitName.TEST: 0.1}
+        >>> dataset_with_splits = splits_by_ratios(dataset, split_ratios, seed=42)
+        Or use the function as a
+        >>> dataset_with_splits = dataset.splits_by_ratios(split_ratios, seed=42)
+        """
+        n_items = len(dataset)
+        split_name_column = dataset_helpers.create_split_name_list_from_ratios(
+            split_ratios=split_ratios, n_items=n_items, seed=seed
+        )
+        table = dataset.samples.with_columns(pl.Series(split_name_column).alias("split"))
+        return dataset.update_table(table)
+
+    def split_into_multiple_splits(
+        dataset: "HafniaDataset",
+        split_name: str,
+        split_ratios: Dict[str, float],
+    ) -> "HafniaDataset":
+        """
+        Divides a dataset split ('split_name') into multiple splits based on the provided split
+        ratios ('split_ratios'). This is especially useful for some open datasets where they have only provide
+        two splits or only provide annotations for two splits. This function allows you to create additional
+        splits based on the provided ratios.
+
+        Example: Defining split ratios and applying the transformation
+        >>> dataset = HafniaDataset.read_from_path(Path("path/to/dataset"))
+        >>> split_name = SplitName.TEST
+        >>> split_ratios = {SplitName.TEST: 0.8, SplitName.VAL: 0.2}
+        >>> dataset_with_splits = split_into_multiple_splits(dataset, split_name, split_ratios)
+        """
+        dataset_split_to_be_divided = dataset.create_split_dataset(split_name=split_name)
+        if len(dataset_split_to_be_divided) == 0:
+            split_counts = dict(dataset.samples.select(pl.col(ColumnName.SPLIT).value_counts()).iter_rows())
+            raise ValueError(f"No samples in the '{split_name}' split to divide into multiple splits. {split_counts=}")
+        assert len(dataset_split_to_be_divided) > 0, f"No samples in the '{split_name}' split!"
+        dataset_split_to_be_divided = dataset_split_to_be_divided.splits_by_ratios(split_ratios=split_ratios, seed=42)
+
+        remaining_data = dataset.samples.filter(pl.col(ColumnName.SPLIT).is_in([split_name]).not_())
+        new_table = pl.concat([remaining_data, dataset_split_to_be_divided.samples], how="vertical")
+        dataset_new = dataset.update_table(new_table)
+        return dataset_new
+
+    def define_sample_set_by_size(dataset: "HafniaDataset", n_samples: int, seed: int = 42) -> "HafniaDataset":
+        is_sample_indices = Random(seed).sample(range(len(dataset)), n_samples)
+        is_sample_column = [False for _ in range(len(dataset))]
+        for idx in is_sample_indices:
+            is_sample_column[idx] = True
+
+        table = dataset.samples.with_columns(pl.Series(is_sample_column).alias("is_sample"))
+        return dataset.update_table(table)
+
+    def merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+        """
+        Merges two Hafnia datasets by concatenating their samples and updating the split names.
+        """
+        ## Currently, only a very naive merging is implemented.
+        # In the future we need to verify that the class and tasks are compatible.
+        # Do they have similar classes and tasks? What to do if they don't?
+        # For now, we just concatenate the samples and keep the split names as they are.
+        merged_samples = pl.concat([dataset0.samples, dataset1.samples], how="vertical")
+        return dataset0.update_table(merged_samples)
+
+    # Dataset stats
+    split_counts = dataset_stats.split_counts
+
     def as_dict_dataset_splits(self) -> Dict[str, "HafniaDataset"]:
         if ColumnName.SPLIT not in self.samples.columns:
             raise ValueError(f"Dataset must contain a '{ColumnName.SPLIT}' column.")
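The hunk above also adds recipe-based constructors (`from_recipe`, `from_recipe_with_cache`, `from_merger`) and chainable transformations (`shuffle`, `select_samples`, `splits_by_ratios`, `split_into_multiple_splits`, `define_sample_set_by_size`, `merge`) directly on the class. A hedged sketch of how they compose; the path, split names, and ratios are illustrative:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

dataset = HafniaDataset.from_path(Path("path/to/dataset"))

# Seeded shuffling and subsampling return new HafniaDataset objects, so calls chain.
subset = dataset.shuffle(seed=42).select_samples(n_samples=100, seed=42)

# Assign splits by ratio, then carve the test split further into test/val.
dataset = dataset.splits_by_ratios(split_ratios={"train": 0.8, "val": 0.1, "test": 0.1}, seed=42)
dataset = dataset.split_into_multiple_splits(split_name="test", split_ratios={"test": 0.8, "val": 0.2})

# Naive merge: samples are concatenated and split names are kept as-is.
combined = HafniaDataset.from_merger([dataset, subset])
```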
@@ -256,21 +413,6 @@ class HafniaDataset:

         return True

-    @staticmethod
-    def read_from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
-        HafniaDataset.check_dataset_path(path_folder, raise_error=True)
-
-        dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
-        table = read_table_from_path(path_folder)
-
-        # Convert from relative paths to absolute paths
-        table = table.with_columns(
-            pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
-        )
-        if check_for_images:
-            check_image_paths(table)
-        return HafniaDataset(samples=table, info=dataset_info)
-
     def write(self, path_folder: Path, name_by_hash: bool = True, add_version: bool = False) -> None:
         user_logger.info(f"Writing dataset to {path_folder}...")
         if not path_folder.exists():
@@ -303,7 +445,7 @@ class HafniaDataset:
         if add_version:
             path_version = path_folder / "versions" / f"{self.info.version}"
             path_version.mkdir(parents=True, exist_ok=True)
-            for filename in …
+            for filename in DATASET_FILENAMES_REQUIRED:
                 shutil.copy2(path_folder / filename, path_version / filename)

     def __eq__(self, value) -> bool:
@@ -363,10 +505,39 @@ class HafniaDataset:


 def check_hafnia_dataset_from_path(path_dataset: Path) -> None:
-    dataset = HafniaDataset.…
+    dataset = HafniaDataset.from_path(path_dataset, check_for_images=True)
     check_hafnia_dataset(dataset)


+def get_or_create_dataset_path_from_recipe(
+    dataset_recipe: Any,
+    force_redownload: bool = False,
+    path_datasets: Optional[Union[Path, str]] = None,
+) -> Path:
+    from hafnia.dataset.dataset_recipe.dataset_recipe import (
+        DatasetRecipe,
+        get_dataset_path_from_recipe,
+    )
+
+    recipe: DatasetRecipe = DatasetRecipe.from_implicit_form(dataset_recipe)
+    path_dataset = get_dataset_path_from_recipe(recipe, path_datasets=path_datasets)
+
+    if force_redownload:
+        shutil.rmtree(path_dataset, ignore_errors=True)
+
+    if HafniaDataset.check_dataset_path(path_dataset, raise_error=False):
+        return path_dataset
+
+    path_dataset.mkdir(parents=True, exist_ok=True)
+    path_recipe_json = path_dataset / FILENAME_RECIPE_JSON
+    path_recipe_json.write_text(recipe.model_dump_json(indent=4))
+
+    dataset: HafniaDataset = recipe.build()
+    dataset.write(path_dataset)
+
+    return path_dataset
+
+
 def check_hafnia_dataset(dataset: HafniaDataset):
     user_logger.info("Checking Hafnia dataset...")
     assert isinstance(dataset.info.version, str) and len(dataset.info.version) > 0
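`get_or_create_dataset_path_from_recipe` gives recipes a disk cache: the recipe is normalized with `DatasetRecipe.from_implicit_form`, mapped to a deterministic folder, and the built dataset plus its recipe JSON are written there; later calls return the cached folder unchanged. A sketch of the intended flow, with a placeholder recipe string:

```python
from hafnia.dataset.hafnia_dataset import (
    HafniaDataset,
    get_or_create_dataset_path_from_recipe,
)

# First call builds the dataset from the recipe and writes it to the cache folder;
# subsequent calls find a valid dataset on disk and return the same path.
path_dataset = get_or_create_dataset_path_from_recipe("mnist")
dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)

# The convenience constructor added in this release wraps exactly this lookup:
dataset = HafniaDataset.from_recipe_with_cache("mnist")
```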
hafnia/dataset/operations/dataset_stats.py
ADDED
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict
+
+from hafnia.dataset.dataset_names import ColumnName
+
+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+def split_counts(dataset: HafniaDataset) -> Dict[str, int]:
+    """
+    Returns a dictionary with the counts of samples in each split of the dataset.
+    """
+    return dict(dataset.samples[ColumnName.SPLIT].value_counts().iter_rows())
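The new stats module currently holds a single helper, which is also bound on the class as `HafniaDataset.split_counts`. Assuming a loaded dataset as in the sketches above:

```python
from hafnia.dataset.operations import dataset_stats

# Module function and bound method return the same mapping of split name to count.
counts = dataset_stats.split_counts(dataset)
counts = dataset.split_counts()
print(counts)  # e.g. {"train": 8000, "val": 1000, "test": 1000} (illustrative values)
```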
hafnia/dataset/operations/dataset_transformations.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Hafnia dataset transformations that takes and returns a HafniaDataset object.
+
+All functions here will have a corresponding function in both the HafniaDataset class
+and a corresponding RecipeTransform class in the `data_recipe/recipe_transformations.py` file.
+
+This allows each function to be used in three ways:
+
+```python
+from hafnia.dataset.operations import dataset_transformations
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.data_recipe.recipe_transformations import SplitByRatios
+
+splits_by_ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
+
+# Option 1: Using the function directly
+dataset = recipe_transformations.splits_by_ratios(dataset, split_ratios=splits_by_ratios)
+
+# Option 2: Using the method of the HafniaDataset class
+dataset = dataset.splits_by_ratios(split_ratios=splits_by_ratios)
+
+# Option 3: Using the RecipeTransform class
+serializable_transform = SplitByRatios(split_ratios=splits_by_ratios)
+dataset = serializable_transform(dataset)
+```
+
+Tests will ensure that all functions in this file will have a corresponding function in the
+HafniaDataset class and a RecipeTransform class in the `data_recipe/recipe_transformations.py` file and
+that the signatures match.
+"""
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable
+
+import cv2
+import numpy as np
+import polars as pl
+from PIL import Image
+from tqdm import tqdm
+
+from hafnia.dataset import dataset_helpers
+
+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+### Image transformations ###
+class AnonymizeByPixelation:
+    def __init__(self, resize_factor: float = 0.10):
+        self.resize_factor = resize_factor
+
+    def __call__(self, frame: np.ndarray) -> np.ndarray:
+        org_size = frame.shape[:2]
+        frame = cv2.resize(frame, (0, 0), fx=self.resize_factor, fy=self.resize_factor)
+        frame = cv2.resize(frame, org_size[::-1], interpolation=cv2.INTER_NEAREST)
+        return frame
+
+
+def transform_images(
+    dataset: "HafniaDataset",
+    transform: Callable[[np.ndarray], np.ndarray],
+    path_output: Path,
+) -> "HafniaDataset":
+    new_paths = []
+    path_image_folder = path_output / "data"
+    path_image_folder.mkdir(parents=True, exist_ok=True)
+
+    for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
+        org_path = Path(org_path)
+        if not org_path.exists():
+            raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
+
+        image = np.array(Image.open(org_path))
+        image_transformed = transform(image)
+        new_path = dataset_helpers.save_image_with_hash_name(image_transformed, path_image_folder)
+
+        if not new_path.exists():
+            raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
+        new_paths.append(str(new_path))
+
+    table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
+    return dataset.update_table(table)
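`transform_images` writes a transformed copy of every image under `path_output / "data"` and returns a dataset whose `file_name` column points at the new files; `AnonymizeByPixelation` is one ready-made frame transform. A sketch, assuming a loaded dataset and an output folder of your choosing:

```python
from pathlib import Path

from hafnia.dataset.operations.dataset_transformations import (
    AnonymizeByPixelation,
    transform_images,
)

# Downscale and re-upscale each frame to pixelate it, then rewrite the file paths.
anonymized = transform_images(
    dataset,
    transform=AnonymizeByPixelation(resize_factor=0.10),
    path_output=Path("path/to/output"),
)

# Equivalently, via the method bound on the class:
anonymized = dataset.transform_images(
    transform=AnonymizeByPixelation(resize_factor=0.10), path_output=Path("path/to/output")
)
```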
hafnia/dataset/{table_transformations.py → operations/table_transformations.py}
RENAMED
@@ -4,12 +4,12 @@ from typing import List, Optional, Type
 import polars as pl
 from tqdm import tqdm

-from hafnia.dataset import table_transformations
 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
     FieldName,
 )
+from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import PRIMITIVE_TYPES
 from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.primitive import Primitive
hafnia/experiment/hafnia_logger.py
CHANGED
@@ -14,7 +14,7 @@ from pydantic import BaseModel, field_validator
 from hafnia.data.factory import load_dataset
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
-from hafnia.utils import …
+from hafnia.utils import is_hafnia_cloud_job, now_as_str


 class EntityType(Enum):
@@ -101,7 +101,7 @@ class HafniaLogger:

     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
-        if …
+        if is_hafnia_cloud_job():
             raise RuntimeError("Cannot access local experiment path in remote job.")
         return self._local_experiment_path

@@ -110,7 +110,7 @@ class HafniaLogger:
         if "MDI_CHECKPOINT_DIR" in os.environ:
             return Path(os.environ["MDI_CHECKPOINT_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/checkpoints")
         return self.path_local_experiment() / "checkpoints"

@@ -119,7 +119,7 @@ class HafniaLogger:
         if "MDI_ARTIFACT_DIR" in os.environ:
             return Path(os.environ["MDI_ARTIFACT_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/output/data")

         return self.path_local_experiment() / "data"
@@ -129,7 +129,7 @@ class HafniaLogger:
         if "MDI_MODEL_DIR" in os.environ:
             return Path(os.environ["MDI_MODEL_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/model")

         return self.path_local_experiment() / "model"
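The logger no longer inlines the remote-job check; it delegates to the new `is_hafnia_cloud_job()` helper, which reads the `HAFNIA_CLOUD` environment variable. A small sketch of the behaviour:

```python
import os

from hafnia.utils import is_hafnia_cloud_job

os.environ["HAFNIA_CLOUD"] = "true"
assert is_hafnia_cloud_job()

os.environ["HAFNIA_CLOUD"] = "false"
assert not is_hafnia_cloud_job()  # also the default when the variable is unset
```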
hafnia/helper_testing.py
CHANGED
@@ -1,4 +1,7 @@
+from inspect import getmembers, isfunction, signature
 from pathlib import Path
+from types import FunctionType
+from typing import Any, Callable, Dict, Union, get_origin

 from hafnia import utils
 from hafnia.dataset.dataset_names import FILENAME_ANNOTATIONS_JSONL, DatasetVariant
@@ -38,8 +41,8 @@ def get_path_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Path
     if path_test_dataset_annotations.exists() and not force_update:
         return path_test_dataset

-    hafnia_dataset = HafniaDataset.…
-    hafnia_dataset = hafnia_dataset.…
+    hafnia_dataset = HafniaDataset.from_path(path_dataset / DatasetVariant.SAMPLE.value)
+    hafnia_dataset = hafnia_dataset.select_samples(n_samples=3, seed=42)
     hafnia_dataset.write(path_test_dataset)

     if force_update:
@@ -59,5 +62,47 @@ def get_sample_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Sa…

 def get_micro_hafnia_dataset(dataset_name: str, force_update: bool = False) -> HafniaDataset:
     path_dataset = get_path_micro_hafnia_dataset(dataset_name=dataset_name, force_update=force_update)
-    hafnia_dataset = HafniaDataset.…
+    hafnia_dataset = HafniaDataset.from_path(path_dataset)
     return hafnia_dataset
+
+
+def is_hafnia_configured() -> bool:
+    """
+    Check if Hafnia is configured by verifying if the API key is set.
+    """
+    from cli.config import Config
+
+    return Config().is_configured()
+
+
+def is_typing_type(annotation: Any) -> bool:
+    return get_origin(annotation) is not None
+
+
+def annotation_as_string(annotation: Union[type, str]) -> str:
+    """Convert type annotation to string."""
+    if isinstance(annotation, str):
+        return annotation.replace("'", "")
+    if is_typing_type(annotation):  # Is using typing types like List, Dict, etc.
+        return str(annotation).replace("typing.", "")
+    if hasattr(annotation, "__name__"):
+        return annotation.__name__
+    return str(annotation)
+
+
+def get_hafnia_functions_from_module(python_module) -> Dict[str, FunctionType]:
+    def dataset_is_first_arg(func: Callable) -> bool:
+        """
+        Check if the function has 'HafniaDataset' as the first parameter.
+        """
+        func_signature = signature(func)
+        params = func_signature.parameters
+        if len(params) == 0:
+            return False
+        first_argument_type = list(params.values())[0]
+
+        annotation_as_str = annotation_as_string(first_argument_type.annotation)
+        return annotation_as_str == "HafniaDataset"
+
+    functions = {func[0]: func[1] for func in getmembers(python_module, isfunction) if dataset_is_first_arg(func[1])}
+    return functions
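`get_hafnia_functions_from_module` collects every function in a module whose first parameter is annotated as `HafniaDataset`, with `annotation_as_string` normalizing string, typing, and class annotations; this appears to back the tests that keep the operations modules, the `HafniaDataset` methods, and the recipe transforms in sync. A sketch of how it could be used:

```python
from hafnia.dataset.operations import dataset_transformations
from hafnia.helper_testing import annotation_as_string, get_hafnia_functions_from_module

# Functions whose first parameter is a HafniaDataset, keyed by name.
functions = get_hafnia_functions_from_module(dataset_transformations)
print(sorted(functions))  # e.g. ['transform_images', ...]

# Annotation normalization used for the comparison.
assert annotation_as_string("'HafniaDataset'") == "HafniaDataset"
assert annotation_as_string(dict) == "dict"
```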
hafnia/platform/datasets.py
CHANGED
@@ -10,10 +10,14 @@ from tqdm import tqdm

 from cli.config import Config
 from hafnia import utils
-from hafnia.dataset import …
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
+from hafnia.dataset.dataset_recipe.dataset_recipe import (
+    DatasetRecipe,
+    get_dataset_path_from_recipe,
+)
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.http import fetch
-from hafnia.log import user_logger
+from hafnia.log import sys_logger, user_logger
 from hafnia.platform import get_dataset_id
 from hafnia.platform.download import get_resource_credentials
 from hafnia.utils import timed
@@ -37,13 +41,11 @@ def download_or_get_dataset_path(
     cfg: Optional[Config] = None,
     path_datasets_folder: Optional[str] = None,
     force_redownload: bool = False,
+    download_files: bool = True,
 ) -> Path:
     """Download or get the path of the dataset."""
-
-
-
-    path_datasets_folder = path_datasets_folder or str(utils.PATH_DATASETS)
-    path_dataset = Path(path_datasets_folder).absolute() / dataset_name
+    recipe_explicit = DatasetRecipe.from_implicit_form(dataset_name)
+    path_dataset = get_dataset_path_from_recipe(recipe_explicit, path_datasets=path_datasets_folder)

     is_dataset_valid = HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
     if is_dataset_valid and not force_redownload:
@@ -57,22 +59,30 @@ def download_or_get_dataset_path(

     endpoint_dataset = cfg.get_platform_endpoint("datasets")
     dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
+    if dataset_id is None:
+        sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
     access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"

     download_dataset_from_access_endpoint(
         endpoint=access_dataset_endpoint,
         api_key=api_key,
         path_dataset=path_dataset,
+        download_files=download_files,
     )
     return path_dataset


-def download_dataset_from_access_endpoint(…
+def download_dataset_from_access_endpoint(
+    endpoint: str,
+    api_key: str,
+    path_dataset: Path,
+    download_files: bool = True,
+) -> None:
     resource_credentials = get_resource_credentials(endpoint, api_key)

-    local_dataset_paths = [str(path_dataset / filename) for filename in …
+    local_dataset_paths = [str(path_dataset / filename) for filename in DATASET_FILENAMES_REQUIRED]
     s3_uri = resource_credentials.s3_uri()
-    s3_dataset_files = [f"{s3_uri}/{filename}" for filename in …
+    s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]

     envs = resource_credentials.aws_credentials()
     fast_copy_files_s3(
@@ -82,10 +92,13 @@ def download_dataset_from_access_endpoint(endpoint: str, api_key: str, path_data…
         description="Downloading annotations",
     )

-
+    if not download_files:
+        return
+
+    dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     fast_copy_files_s3(
-        src_paths=dataset.samples[…
-        dst_paths=dataset.samples[…
+        src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
+        dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
         append_envs=envs,
         description="Downloading images",
     )
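`download_or_get_dataset_path` now resolves its target folder through a `DatasetRecipe` and accepts `download_files`; with `download_files=False` only the required annotation files are copied from S3 and the per-image download is skipped, which is the flag `HafniaDataset.from_name` forwards. A sketch; the dataset name is a placeholder and a configured API key is assumed:

```python
from hafnia.platform.datasets import download_or_get_dataset_path

# Annotations only: skip the per-image S3 copy.
path_dataset = download_or_get_dataset_path("mnist", download_files=False)

# Full download (the default), reusing the cached folder when it is already valid.
path_dataset = download_or_get_dataset_path("mnist", force_redownload=False)
```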
hafnia/utils.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import time
 import zipfile
@@ -132,6 +133,24 @@ def show_recipe_content(recipe_path: Path, style: str = "emoji", depth_limit: in…
     user_logger.info(f"Recipe size: {size_human_readable(os.path.getsize(recipe_path))}. Max size 800 MiB")


-def …
+def is_hafnia_cloud_job() -> bool:
     """Check if the current job is running in HAFNIA cloud environment."""
     return os.getenv("HAFNIA_CLOUD", "false").lower() == "true"
+
+
+def pascal_to_snake_case(name: str) -> str:
+    """
+    Convert PascalCase to snake_case.
+    """
+    return "".join(["_" + char.lower() if char.isupper() else char for char in name]).lstrip("_")
+
+
+def snake_to_pascal_case(name: str) -> str:
+    """
+    Convert snake_case to PascalCase.
+    """
+    return "".join(word.capitalize() for word in name.split("_"))
+
+
+def hash_from_string(s: str) -> str:
+    return hashlib.md5(s.encode("utf-8")).hexdigest()
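The new utility helpers are small but load-bearing for the recipe machinery: the case converters presumably map transform class names (PascalCase) to function names (snake_case) and back, and `hash_from_string` gives a deterministic MD5 hex digest, e.g. for cache keys. Quick checks that follow directly from the implementations above:

```python
from hafnia.utils import hash_from_string, pascal_to_snake_case, snake_to_pascal_case

assert pascal_to_snake_case("SplitByRatios") == "split_by_ratios"
assert snake_to_pascal_case("split_by_ratios") == "SplitByRatios"

digest = hash_from_string("some-recipe-definition")
assert len(digest) == 32  # 32-character MD5 hex digest
```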
hafnia/visualizations/image_visualizations.py
CHANGED
@@ -175,7 +175,7 @@ def save_dataset_sample_set_visualizations(
     draw_settings: Optional[Dict[Type[Primitive], Dict]] = None,
     anonymize_settings: Optional[Dict[Type[Primitive], Dict]] = None,
 ) -> List[Path]:
-    dataset = HafniaDataset.…
+    dataset = HafniaDataset.from_path(path_dataset)
     shutil.rmtree(path_output_folder, ignore_errors=True)
     path_output_folder.mkdir(parents=True)
