hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. cli/__main__.py +16 -3
  2. cli/config.py +45 -4
  3. cli/consts.py +1 -1
  4. cli/dataset_cmds.py +6 -14
  5. cli/dataset_recipe_cmds.py +78 -0
  6. cli/experiment_cmds.py +226 -43
  7. cli/keychain.py +88 -0
  8. cli/profile_cmds.py +10 -6
  9. cli/runc_cmds.py +5 -5
  10. cli/trainer_package_cmds.py +65 -0
  11. hafnia/__init__.py +2 -0
  12. hafnia/data/factory.py +1 -2
  13. hafnia/dataset/dataset_helpers.py +9 -14
  14. hafnia/dataset/dataset_names.py +10 -5
  15. hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
  16. hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
  17. hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
  18. hafnia/dataset/dataset_upload_helper.py +265 -56
  19. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  20. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  21. hafnia/dataset/hafnia_dataset.py +577 -213
  22. hafnia/dataset/license_types.py +63 -0
  23. hafnia/dataset/operations/dataset_stats.py +259 -3
  24. hafnia/dataset/operations/dataset_transformations.py +332 -7
  25. hafnia/dataset/operations/table_transformations.py +43 -5
  26. hafnia/dataset/primitives/__init__.py +8 -0
  27. hafnia/dataset/primitives/bbox.py +25 -12
  28. hafnia/dataset/primitives/bitmask.py +26 -14
  29. hafnia/dataset/primitives/classification.py +16 -8
  30. hafnia/dataset/primitives/point.py +7 -3
  31. hafnia/dataset/primitives/polygon.py +16 -9
  32. hafnia/dataset/primitives/segmentation.py +10 -7
  33. hafnia/experiment/hafnia_logger.py +111 -8
  34. hafnia/http.py +16 -2
  35. hafnia/platform/__init__.py +9 -3
  36. hafnia/platform/builder.py +12 -10
  37. hafnia/platform/dataset_recipe.py +104 -0
  38. hafnia/platform/datasets.py +47 -9
  39. hafnia/platform/download.py +25 -19
  40. hafnia/platform/experiment.py +51 -56
  41. hafnia/platform/trainer_package.py +57 -0
  42. hafnia/utils.py +81 -13
  43. hafnia/visualizations/image_visualizations.py +4 -4
  44. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
  45. hafnia-0.4.0.dist-info/RECORD +56 -0
  46. cli/recipe_cmds.py +0 -45
  47. hafnia-0.2.4.dist-info/RECORD +0 -49
  48. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  49. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  50. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
cli/profile_cmds.py CHANGED
@@ -14,7 +14,7 @@ def profile():
 
 @profile.command("ls")
 @click.pass_obj
-def profile_ls(cfg: Config) -> None:
+def cmd_profile_ls(cfg: Config) -> None:
     """List all available profiles."""
     profiles = cfg.available_profiles
     if not profiles:
@@ -31,7 +31,7 @@ def profile_ls(cfg: Config) -> None:
 @profile.command("use")
 @click.argument("profile_name", required=True)
 @click.pass_obj
-def profile_use(cfg: Config, profile_name: str) -> None:
+def cmd_profile_use(cfg: Config, profile_name: str) -> None:
     """Switch to a different profile."""
     if len(cfg.available_profiles) == 0:
         raise click.ClickException(consts.ERROR_CONFIGURE)
@@ -50,10 +50,13 @@ def profile_use(cfg: Config, profile_name: str) -> None:
 @click.option(
     "--activate/--no-activate", help="Activate the created profile after creation", default=True, show_default=True
 )
+@click.option(
+    "--use-keychain", is_flag=True, help="Store API key in system keychain instead of config file", default=False
+)
 @click.pass_obj
-def profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate: bool) -> None:
+def cmd_profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate: bool, use_keychain: bool) -> None:
     """Create a new profile."""
-    cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key)
+    cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key, use_keychain=use_keychain)
 
     cfg.add_profile(profile_name=name, profile=cfg_profile, set_active=activate)
     profile_show(cfg)
@@ -62,7 +65,7 @@ def profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate:
 @profile.command("rm")
 @click.argument("profile_name", required=True)
 @click.pass_obj
-def profile_rm(cfg: Config, profile_name: str) -> None:
+def cmd_profile_rm(cfg: Config, profile_name: str) -> None:
     """Remove a profile."""
     if len(cfg.available_profiles) == 0:
         raise click.ClickException(consts.ERROR_CONFIGURE)
@@ -80,7 +83,8 @@ def profile_rm(cfg: Config, profile_name: str) -> None:
 
 @profile.command("active")
 @click.pass_obj
-def profile_active(cfg: Config) -> None:
+def cmd_profile_active(cfg: Config) -> None:
+    """Show the currently active profile."""
     try:
         profile_show(cfg)
     except Exception as e:
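
Note: the new `--use-keychain` flag is threaded into `ConfigSchema`, with the actual keychain storage presumably provided by the new `cli/keychain.py` module (+88 lines). A minimal sketch of what the command now does, assuming `Config` and `ConfigSchema` are importable from `cli.config` as elsewhere in this diff; URL and key values are placeholders:

    # Roughly what `hafnia profile create --use-keychain` does per the hunk above (sketch only).
    from cli.config import Config, ConfigSchema

    cfg = Config()
    profile = ConfigSchema(
        platform_url="https://api.example.com",  # placeholder URL
        api_key="hfn_xxx",                       # placeholder key
        use_keychain=True,                       # keep the key out of the config file
    )
    cfg.add_profile(profile_name="work", profile=profile, set_active=True)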
cli/runc_cmds.py CHANGED
@@ -13,7 +13,7 @@ from hafnia.log import sys_logger, user_logger
 
 @click.group(name="runc")
 def runc():
-    """Experiment management commands"""
+    """Creating and running trainer packages locally"""
     pass
 
 
@@ -90,10 +90,10 @@ def launch_local(cfg: Config, exec_cmd: str, dataset: str, image_name: str) -> N
 @click.pass_obj
 def build(cfg: Config, recipe_url: str, state_file: str, repo: str) -> None:
     """Build docker image with a given recipe."""
-    from hafnia.platform.builder import build_image, prepare_recipe
+    from hafnia.platform.builder import build_image, prepare_trainer_package
 
     with TemporaryDirectory() as temp_dir:
-        metadata = prepare_recipe(recipe_url, Path(temp_dir), cfg.api_key)
+        metadata = prepare_trainer_package(recipe_url, Path(temp_dir), cfg.api_key)
         build_image(metadata, repo, state_file=state_file)
 
 
@@ -109,7 +109,7 @@ def build_local(recipe: Path, state_file: str, repo: str) -> None:
     import seedir
 
     from hafnia.platform.builder import build_image
-    from hafnia.utils import filter_recipe_files
+    from hafnia.utils import filter_trainer_package_files
 
     recipe = Path(recipe)
 
@@ -123,7 +123,7 @@ def build_local(recipe: Path, state_file: str, repo: str) -> None:
         with zipfile.ZipFile(recipe.as_posix(), "r") as zip_ref:
             zip_ref.extractall(recipe_dir)
     elif recipe.is_dir():
-        for rf in filter_recipe_files(recipe):
+        for rf in filter_trainer_package_files(recipe):
             src_path = (recipe / rf).absolute()
             target_path = recipe_dir / rf
             target_path.parent.mkdir(parents=True, exist_ok=True)
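
For code calling the builder API directly, the 0.4.0 rename is mechanical. A hedged before/after sketch, assuming the signatures are unchanged apart from the names (the URL, repo, and key are placeholders):

    from pathlib import Path
    from tempfile import TemporaryDirectory

    # 0.2.4: from hafnia.platform.builder import build_image, prepare_recipe
    from hafnia.platform.builder import build_image, prepare_trainer_package

    recipe_url = "https://example.com/trainer.zip"  # placeholder
    with TemporaryDirectory() as temp_dir:
        # 0.2.4: metadata = prepare_recipe(recipe_url, Path(temp_dir), api_key)
        metadata = prepare_trainer_package(recipe_url, Path(temp_dir), "hfn_xxx")
        build_image(metadata, "my-repo", state_file="state.json")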
cli/trainer_package_cmds.py ADDED
@@ -0,0 +1,65 @@
+from pathlib import Path
+from typing import Optional
+
+import click
+
+import cli.consts as consts
+from cli.config import Config
+
+
+@click.group(name="trainer")
+def trainer_package() -> None:
+    """Trainer package commands"""
+    pass
+
+
+@trainer_package.command(name="ls")
+@click.pass_obj
+@click.option("-l", "--limit", type=int, default=None, help="Limit number of listed trainer packages.")
+def cmd_list_trainer_packages(cfg: Config, limit: Optional[int]) -> None:
+    """List available trainer packages on the platform"""
+
+    from hafnia.platform.trainer_package import get_trainer_packages, pretty_print_trainer_packages
+
+    endpoint = cfg.get_platform_endpoint("trainers")
+    trainers = get_trainer_packages(endpoint, cfg.api_key)
+
+    pretty_print_trainer_packages(trainers, limit=limit)
+
+
+@trainer_package.command(name="create-zip")
+@click.argument("source")
+@click.option(
+    "--output",
+    type=click.Path(writable=True),
+    default="./trainer.zip",
+    show_default=True,
+    help="Output trainer package path.",
+)
+def cmd_create_trainer_package_zip(source: str, output: str) -> None:
+    """Create Hafnia trainer package as zip-file from local path"""
+
+    from hafnia.utils import archive_dir
+
+    path_output_zip = Path(output)
+    if path_output_zip.suffix != ".zip":
+        raise click.ClickException(consts.ERROR_TRAINER_PACKAGE_FILE_FORMAT)
+
+    path_source = Path(source)
+    path_output_zip = archive_dir(path_source, path_output_zip)
+
+
+@trainer_package.command(name="view-zip")
+@click.option("--path", type=str, default="./trainer.zip", show_default=True, help="Path of trainer.zip.")
+@click.option("--depth-limit", type=int, default=3, help="Limit the depth of the tree view.", show_default=True)
+def cmd_view_trainer_package_zip(path: str, depth_limit: int) -> None:
+    """View the content of a trainer package zip file."""
+    from hafnia.utils import show_trainer_package_content
+
+    path_trainer_package = Path(path)
+    if not path_trainer_package.exists():
+        raise click.ClickException(
+            f"Trainer package file '{path_trainer_package}' does not exist. Please provide a valid path. "
+            f"To create a trainer package, use the 'hafnia trainer create-zip' command."
+        )
+    show_trainer_package_content(path_trainer_package, depth_limit=depth_limit)
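
This new `trainer` group appears to replace the removed `cli/recipe_cmds.py` (file 46 above). The commands wrap helpers from `hafnia.utils`; a sketch of the same flow without the CLI, using only the signatures visible in the hunk above (paths are placeholders):

    from pathlib import Path

    from hafnia.utils import archive_dir, show_trainer_package_content

    # Equivalent to: hafnia trainer create-zip ./my_trainer --output ./trainer.zip
    path_zip = archive_dir(Path("./my_trainer"), Path("./trainer.zip"))

    # Equivalent to: hafnia trainer view-zip --path ./trainer.zip --depth-limit 3
    show_trainer_package_content(path_zip, depth_limit=3)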
hafnia/__init__.py CHANGED
@@ -2,3 +2,5 @@ from importlib.metadata import version
 
 __package_name__ = "hafnia"
 __version__ = version(__package_name__)
+
+__dataset_format_version__ = "0.1.0"  # Hafnia dataset format version
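
Both constants are importable at runtime, which gives a quick compatibility check:

    import hafnia

    print(hafnia.__version__)                 # package version, e.g. "0.4.0"
    print(hafnia.__dataset_format_version__)  # dataset format version: "0.1.0"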
hafnia/data/factory.py CHANGED
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 from typing import Any
 
@@ -16,7 +15,7 @@ def load_dataset(recipe: Any, force_redownload: bool = False) -> HafniaDataset:
 
 def get_dataset_path(recipe: Any, force_redownload: bool = False) -> Path:
     if utils.is_hafnia_cloud_job():
-        return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
+        return utils.get_dataset_path_in_hafnia_cloud()
 
     path_dataset = get_or_create_dataset_path_from_recipe(recipe, force_redownload=force_redownload)
 
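
The hardcoded `MDI_DATASET_DIR` lookup moves behind `utils.get_dataset_path_in_hafnia_cloud()`, so the cloud-path logic now lives in one place. Callers are unchanged; a usage sketch with a placeholder recipe name:

    from hafnia.data.factory import get_dataset_path, load_dataset

    # On a Hafnia cloud job this resolves to the mounted dataset directory;
    # locally it downloads/caches the dataset described by the recipe.
    path = get_dataset_path("mnist")  # "mnist" is a placeholder recipe/name
    dataset = load_dataset("mnist", force_redownload=False)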
hafnia/dataset/dataset_helpers.py CHANGED
@@ -38,12 +38,19 @@ def hash_from_bytes(data: bytes) -> str:
 
 def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
     pil_image = Image.fromarray(image)
+    path_image = save_pil_image_with_hash_name(pil_image, path_folder)
+    return path_image
+
+
+def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_skip: bool = True) -> Path:
     buffer = io.BytesIO()
-    pil_image.save(buffer, format="PNG")
+    image.save(buffer, format="PNG")
     hash_value = hash_from_bytes(buffer.getvalue())
     path_image = Path(path_folder) / relative_path_from_hash(hash=hash_value, suffix=".png")
+    if allow_skip and path_image.exists():
+        return path_image
     path_image.parent.mkdir(parents=True, exist_ok=True)
-    pil_image.save(path_image)
+    image.save(path_image)
     return path_image
 
 
@@ -110,15 +117,3 @@ def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dic
         raise ValueError("Something is wrong. The split sizes do not match the number of items.")
 
     return split_sizes
-
-
-def select_evenly_across_list(lst: list, num_samples: int):
-    if num_samples >= len(lst):
-        return lst  # No need to sample
-    step = (len(lst) - 1) / (num_samples - 1)
-    indices = [int(round(step * i)) for i in range(num_samples)]  # noqa: RUF046
-    return [lst[index] for index in indices]
-
-
-def prefix_dict(d: dict, prefix: str) -> dict:
-    return {f"{prefix}.{k}": v for k, v in d.items()}
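
`save_image_with_hash_name` is now a thin wrapper over the new `save_pil_image_with_hash_name`, which names files by a hash of their PNG bytes and, with `allow_skip=True`, skips rewriting files that already exist. A sketch of the resulting deduplication, assuming identical pixels encode to identical PNG bytes:

    from pathlib import Path

    import numpy as np
    from PIL import Image

    from hafnia.dataset.dataset_helpers import (
        save_image_with_hash_name,
        save_pil_image_with_hash_name,
    )

    image = np.zeros((64, 64, 3), dtype=np.uint8)
    path_a = save_image_with_hash_name(image, Path("cache/images"))
    # Same content -> same hash -> same path; the second call is a no-op.
    path_b = save_pil_image_with_hash_name(Image.fromarray(image), Path("cache/images"))
    assert path_a == path_b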
hafnia/dataset/dataset_names.py CHANGED
@@ -18,11 +18,14 @@ class DeploymentStage(Enum):
     PRODUCTION = "production"
 
 
+TAG_IS_SAMPLE = "sample"
+
+OPS_REMOVE_CLASS = "__REMOVE__"
+
+
 class FieldName:
     CLASS_NAME: str = "class_name"  # Name of the class this primitive is associated with, e.g. "car" for Bbox
-    CLASS_IDX: str = (
-        "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class
-    )
+    CLASS_IDX: str = "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class  # noqa: E501
     OBJECT_ID: str = "object_id"  # Unique identifier for the object, e.g. "12345123"
     CONFIDENCE: str = "confidence"  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
 
@@ -46,13 +49,15 @@ class FieldName:
 
 class ColumnName:
     SAMPLE_INDEX: str = "sample_index"
-    FILE_NAME: str = "file_name"
+    FILE_PATH: str = "file_path"
     HEIGHT: str = "height"
     WIDTH: str = "width"
     SPLIT: str = "split"
-    IS_SAMPLE: str = "is_sample"
     REMOTE_PATH: str = "remote_path"  # Path to the file in remote storage, e.g. S3
+    ATTRIBUTION: str = "attribution"  # Attribution for the sample (image/video), e.g. creator, license, source, etc.
+    TAGS: str = "tags"
     META: str = "meta"
+    DATASET_NAME: str = "dataset_name"
 
 
 class SplitName:
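
Downstream code keying on these constants needs two renames: `FILE_NAME` ("file_name") becomes `FILE_PATH` ("file_path"), and the boolean `IS_SAMPLE` column is gone, apparently superseded by a `"sample"` entry (see `TAG_IS_SAMPLE`) in the new `TAGS` column. A small illustration of the 0.4.0 constants:

    from hafnia.dataset.dataset_names import TAG_IS_SAMPLE, ColumnName

    print(ColumnName.FILE_PATH)  # "file_path" (was ColumnName.FILE_NAME)
    print(ColumnName.TAGS)       # "tags"
    print(TAG_IS_SAMPLE)         # "sample"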
hafnia/dataset/dataset_recipe/dataset_recipe.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import json
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 from pydantic import (
     field_serializer,
@@ -12,11 +12,13 @@ from pydantic import (
 
 from hafnia import utils
 from hafnia.dataset.dataset_recipe import recipe_transforms
-from hafnia.dataset.dataset_recipe.recipe_types import RecipeCreation, RecipeTransform, Serializable
+from hafnia.dataset.dataset_recipe.recipe_types import (
+    RecipeCreation,
+    RecipeTransform,
+    Serializable,
+)
 from hafnia.dataset.hafnia_dataset import HafniaDataset
-
-if TYPE_CHECKING:
-    from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.primitives.primitive import Primitive
 
 
 class DatasetRecipe(Serializable):
@@ -43,6 +45,17 @@ class DatasetRecipe(Serializable):
         creation = FromName(name=name, force_redownload=force_redownload, download_files=download_files)
         return DatasetRecipe(creation=creation)
 
+    @staticmethod
+    def from_name_public_dataset(
+        name: str, force_redownload: bool = False, n_samples: Optional[int] = None
+    ) -> DatasetRecipe:
+        creation = FromNamePublicDataset(
+            name=name,
+            force_redownload=force_redownload,
+            n_samples=n_samples,
+        )
+        return DatasetRecipe(creation=creation)
+
     @staticmethod
     def from_path(path_folder: Path, check_for_images: bool = True) -> DatasetRecipe:
         creation = FromPath(path_folder=path_folder, check_for_images=check_for_images)
@@ -76,6 +89,42 @@ class DatasetRecipe(Serializable):
         json_str = path_json.read_text(encoding="utf-8")
         return DatasetRecipe.from_json_str(json_str)
 
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> "DatasetRecipe":
+        """Deserialize from a dictionary."""
+        dataset_recipe = Serializable.from_dict(data)
+        return dataset_recipe
+
+    @staticmethod
+    def from_recipe_id(recipe_id: str) -> "DatasetRecipe":
+        """Loads a dataset recipe by id from the hafnia platform."""
+        from cli.config import Config
+        from hafnia.platform.dataset_recipe import get_dataset_recipe_by_id
+
+        cfg = Config()
+        endpoint_dataset = cfg.get_platform_endpoint("dataset_recipes")
+        recipe_dict = get_dataset_recipe_by_id(recipe_id, endpoint=endpoint_dataset, api_key=cfg.api_key)
+        recipe_dict = recipe_dict["template"]["body"]
+        if isinstance(recipe_dict, str):
+            return DatasetRecipe.from_implicit_form(recipe_dict)
+
+        recipe = DatasetRecipe.from_dict(recipe_dict)
+        return recipe
+
+    @staticmethod
+    def from_recipe_name(name: str) -> "DatasetRecipe":
+        """Loads a dataset recipe by name from the hafnia platform"""
+        from cli.config import Config
+        from hafnia.platform.dataset_recipe import get_dataset_recipe_by_name
+
+        cfg = Config()
+        endpoint_dataset = cfg.get_platform_endpoint("dataset_recipes")
+        recipe = get_dataset_recipe_by_name(name=name, endpoint=endpoint_dataset, api_key=cfg.api_key)
+        if not recipe:
+            raise ValueError(f"Dataset recipe '{name}' not found.")
+        recipe_id = recipe["id"]
+        return DatasetRecipe.from_recipe_id(recipe_id)
+
     @staticmethod
     def from_implicit_form(recipe: Any) -> DatasetRecipe:
         """
@@ -152,6 +201,60 @@
 
         raise ValueError(f"Unsupported recipe type: {type(recipe)}")
 
+    ### Upload, store and recipe conversions ###
+    def as_python_code(self, keep_default_fields: bool = False, as_kwargs: bool = True) -> str:
+        str_operations = [self.creation.as_python_code(keep_default_fields=keep_default_fields, as_kwargs=as_kwargs)]
+        if self.operations:
+            for op in self.operations:
+                str_operations.append(op.as_python_code(keep_default_fields=keep_default_fields, as_kwargs=as_kwargs))
+        operations_str = ".".join(str_operations)
+        return operations_str
+
+    def as_short_name(self) -> str:
+        """Return a short name for the transforms."""
+
+        creation_name = self.creation.as_short_name()
+        if self.operations is None or len(self.operations) == 0:
+            return creation_name
+        short_names = [creation_name]
+        for operation in self.operations:
+            short_names.append(operation.as_short_name())
+        transforms_str = ",".join(short_names)
+        return f"Recipe({transforms_str})"
+
+    def as_json_str(self, indent: int = 2) -> str:
+        """Serialize the dataset recipe to a JSON string."""
+        dict_data = self.as_dict()
+        return json.dumps(dict_data, indent=indent, ensure_ascii=False)
+
+    def as_json_file(self, path_json: Path, indent: int = 2) -> None:
+        """Serialize the dataset recipe to a JSON file."""
+        path_json.parent.mkdir(parents=True, exist_ok=True)
+        json_str = self.as_json_str(indent=indent)
+        path_json.write_text(json_str, encoding="utf-8")
+
+    def as_dict(self) -> dict:
+        """Serialize the dataset recipe to a dictionary."""
+        return self.model_dump(mode="json")
+
+    def as_platform_recipe(self, recipe_name: Optional[str], overwrite: bool = False) -> Dict:
+        """Uploads dataset recipe to the hafnia platform."""
+        from cli.config import Config
+        from hafnia.platform.dataset_recipe import get_or_create_dataset_recipe
+
+        recipe = self.as_dict()
+        cfg = Config()
+        endpoint_dataset = cfg.get_platform_endpoint("dataset_recipes")
+        recipe_dict = get_or_create_dataset_recipe(
+            recipe=recipe,
+            endpoint=endpoint_dataset,
+            api_key=cfg.api_key,
+            name=recipe_name,
+            overwrite=overwrite,
+        )
+
+        return recipe_dict
+
     ### Dataset Recipe Transformations ###
     def shuffle(recipe: DatasetRecipe, seed: int = 42) -> DatasetRecipe:
         operation = recipe_transforms.Shuffle(seed=seed)
@@ -159,10 +262,17 @@
         return recipe
 
     def select_samples(
-        recipe: DatasetRecipe, n_samples: int, shuffle: bool = True, seed: int = 42, with_replacement: bool = False
+        recipe: DatasetRecipe,
+        n_samples: int,
+        shuffle: bool = True,
+        seed: int = 42,
+        with_replacement: bool = False,
     ) -> DatasetRecipe:
         operation = recipe_transforms.SelectSamples(
-            n_samples=n_samples, shuffle=shuffle, seed=seed, with_replacement=with_replacement
+            n_samples=n_samples,
+            shuffle=shuffle,
+            seed=seed,
+            with_replacement=with_replacement,
         )
         recipe.append_operation(operation)
@@ -184,37 +294,36 @@
         recipe.append_operation(operation)
         return recipe
 
-    ### Conversions ###
-    def as_python_code(self, keep_default_fields: bool = False, as_kwargs: bool = True) -> str:
-        str_operations = [self.creation.as_python_code(keep_default_fields=keep_default_fields, as_kwargs=as_kwargs)]
-        if self.operations:
-            for op in self.operations:
-                str_operations.append(op.as_python_code(keep_default_fields=keep_default_fields, as_kwargs=as_kwargs))
-        operations_str = ".".join(str_operations)
-        return operations_str
-
-    def as_short_name(self) -> str:
-        """Return a short name for the transforms."""
-
-        creation_name = self.creation.as_short_name()
-        if self.operations is None or len(self.operations) == 0:
-            return creation_name
-        short_names = [creation_name]
-        for operation in self.operations:
-            short_names.append(operation.as_short_name())
-        transforms_str = ",".join(short_names)
-        return f"Recipe({transforms_str})"
+    def class_mapper(
+        recipe: DatasetRecipe,
+        class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
+        method: str = "strict",
+        primitive: Optional[Type[Primitive]] = None,
+        task_name: Optional[str] = None,
+    ) -> DatasetRecipe:
+        operation = recipe_transforms.ClassMapper(
+            class_mapping=class_mapping,
+            method=method,
+            primitive=primitive,
+            task_name=task_name,
+        )
+        recipe.append_operation(operation)
+        return recipe
 
-    def as_json_str(self, indent: int = 2) -> str:
-        """Serialize the dataset recipe to a JSON string."""
-        data = self.model_dump(mode="json")
-        # data = type_as_first_key(data)
-        return json.dumps(data, indent=indent, ensure_ascii=False)
+    def rename_task(recipe: DatasetRecipe, old_task_name: str, new_task_name: str) -> DatasetRecipe:
+        operation = recipe_transforms.RenameTask(old_task_name=old_task_name, new_task_name=new_task_name)
+        recipe.append_operation(operation)
+        return recipe
 
-    def as_json_file(self, path_json: Path, indent: int = 2) -> None:
-        """Serialize the dataset recipe to a JSON file."""
-        json_str = self.as_json_str(indent=indent)
-        path_json.write_text(json_str, encoding="utf-8")
+    def select_samples_by_class_name(
+        recipe: DatasetRecipe,
+        name: Union[List[str], str],
+        task_name: Optional[str] = None,
+        primitive: Optional[Type[Primitive]] = None,
+    ) -> DatasetRecipe:
+        operation = recipe_transforms.SelectSamplesByClassName(name=name, task_name=task_name, primitive=primitive)
+        recipe.append_operation(operation)
+        return recipe
 
     ### Helper methods ###
     def get_dataset_names(self) -> List[str]:
@@ -314,6 +423,22 @@ class FromName(RecipeCreation):
         return [self.name]
 
 
+class FromNamePublicDataset(RecipeCreation):
+    name: str
+    force_redownload: bool = False
+    n_samples: Optional[int] = None
+
+    @staticmethod
+    def get_function() -> Callable[..., "HafniaDataset"]:
+        return HafniaDataset.from_name_public_dataset
+
+    def as_short_name(self) -> str:
+        return f"Torchvision('{self.name}')"
+
+    def get_dataset_names(self) -> List[str]:
+        return []
+
+
 class FromMerge(RecipeCreation):
     recipe0: DatasetRecipe
     recipe1: DatasetRecipe
@@ -328,7 +453,10 @@
 
     def get_dataset_names(self) -> List[str]:
         """Get the dataset names from the merged recipes."""
-        names = [*self.recipe0.creation.get_dataset_names(), *self.recipe1.creation.get_dataset_names()]
+        names = [
+            *self.recipe0.creation.get_dataset_names(),
+            *self.recipe1.creation.get_dataset_names(),
+        ]
         return names
 
 
@@ -353,33 +481,3 @@ class FromMerger(RecipeCreation):
         for recipe in self.recipes:
             names.extend(recipe.creation.get_dataset_names())
         return names
-
-
-def extract_dataset_names_from_json_dict(data: dict) -> list[str]:
-    """
-    Extract dataset names recursively from a JSON dictionary added with 'from_name'.
-
-    Even if the same functionality is achieved with `DatasetRecipe.get_dataset_names()`,
-    we want to keep this function in 'dipdatalib' to extract dataset names from json dictionaries
-    directly.
-    """
-    creation_field = data.get("creation")
-    if creation_field is None:
-        return []
-    if creation_field.get("__type__") == "FromName":
-        return [creation_field["name"]]
-    elif creation_field.get("__type__") == "FromMerge":
-        recipe_names = ["recipe0", "recipe1"]
-        dataset_name = []
-        for recipe_name in recipe_names:
-            recipe = creation_field.get(recipe_name)
-            if recipe is None:
-                continue
-            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
-        return dataset_name
-    elif creation_field.get("__type__") == "FromMerger":
-        dataset_name = []
-        for recipe in creation_field.get("recipes", []):
-            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
-        return dataset_name
-    return []
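
Taken together, the new transform methods keep the fluent recipe style; a sketch chaining them, with a placeholder dataset name and class mapping:

    from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe

    recipe = (
        DatasetRecipe.from_name("traffic-dataset")  # placeholder name
        .class_mapper(class_mapping={"car": "vehicle", "truck": "vehicle"})
        .rename_task(old_task_name="objects", new_task_name="vehicles")
        .select_samples(n_samples=100, shuffle=True, seed=42)
    )

    # Round-trips through the new dict serialization helpers:
    recipe_copy = DatasetRecipe.from_dict(recipe.as_dict())
    print(recipe.as_short_name())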
hafnia/dataset/dataset_recipe/recipe_transforms.py CHANGED
@@ -1,10 +1,10 @@
-from typing import TYPE_CHECKING, Callable, Dict
+from typing import Callable, Dict, List, Optional, Tuple, Type, Union
+
+from pydantic import field_validator
 
 from hafnia.dataset.dataset_recipe.recipe_types import RecipeTransform
 from hafnia.dataset.hafnia_dataset import HafniaDataset
-
-if TYPE_CHECKING:
-    pass
+from hafnia.dataset.primitives.primitive import Primitive
 
 
 class Shuffle(RecipeTransform):
@@ -51,3 +51,47 @@ class DefineSampleSetBySize(RecipeTransform):
     @staticmethod
     def get_function() -> Callable[..., "HafniaDataset"]:
         return HafniaDataset.define_sample_set_by_size
+
+
+class ClassMapper(RecipeTransform):
+    class_mapping: Union[Dict[str, str], List[Tuple[str, str]]]
+    method: str = "strict"
+    primitive: Optional[Type[Primitive]] = None
+    task_name: Optional[str] = None
+
+    @field_validator("class_mapping", mode="after")
+    @classmethod
+    def serialize_class_mapping(cls, value: Union[Dict[str, str], List[Tuple[str, str]]]) -> List[Tuple[str, str]]:
+        # Converts the dictionary class mapping to a list of tuples
+        # e.g. {"old_class": "new_class", } --> [("old_class", "new_class")]
+        # The reason is that storing class mappings as a dictionary does not preserve order of json fields
+        # when stored in a database as a jsonb field (postgres).
+        # Preserving order of class mapping fields is important as it defines the indices of the classes.
+        # So to ensure that class indices are maintained, we preserve order of json fields, by converting the
+        # dictionary to a list of tuples.
+        if isinstance(value, dict):
+            value = list(value.items())
+        return value
+
+    @staticmethod
+    def get_function() -> Callable[..., "HafniaDataset"]:
+        return HafniaDataset.class_mapper
+
+
+class RenameTask(RecipeTransform):
+    old_task_name: str
+    new_task_name: str
+
+    @staticmethod
+    def get_function() -> Callable[..., "HafniaDataset"]:
+        return HafniaDataset.rename_task
+
+
+class SelectSamplesByClassName(RecipeTransform):
+    name: Union[List[str], str]
+    task_name: Optional[str] = None
+    primitive: Optional[Type[Primitive]] = None
+
+    @staticmethod
+    def get_function() -> Callable[..., "HafniaDataset"]:
+        return HafniaDataset.select_samples_by_class_name
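
The `ClassMapper` validator means a recipe defined with a dict and one defined with a list of tuples serialize identically, and order (and therefore class indices) survives a jsonb round-trip. A quick check of the normalization:

    from hafnia.dataset.dataset_recipe.recipe_transforms import ClassMapper

    mapper = ClassMapper(class_mapping={"car": "vehicle", "truck": "vehicle"})
    assert mapper.class_mapping == [("car", "vehicle"), ("truck", "vehicle")]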
hafnia/dataset/dataset_recipe/recipe_types.py CHANGED
@@ -8,7 +8,7 @@ from pydantic import BaseModel, computed_field
 
 from hafnia import utils
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # Using 'TYPE_CHECKING' to avoid circular imports during type checking
     from hafnia.dataset.hafnia_dataset import HafniaDataset
 