hafnia 0.1.27__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cli/__main__.py +2 -2
  2. cli/config.py +17 -4
  3. cli/dataset_cmds.py +60 -0
  4. cli/runc_cmds.py +1 -1
  5. hafnia/data/__init__.py +2 -2
  6. hafnia/data/factory.py +12 -56
  7. hafnia/dataset/dataset_helpers.py +91 -0
  8. hafnia/dataset/dataset_names.py +72 -0
  9. hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
  10. hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
  11. hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
  12. hafnia/dataset/dataset_upload_helper.py +468 -0
  13. hafnia/dataset/hafnia_dataset.py +624 -0
  14. hafnia/dataset/operations/dataset_stats.py +15 -0
  15. hafnia/dataset/operations/dataset_transformations.py +82 -0
  16. hafnia/dataset/operations/table_transformations.py +183 -0
  17. hafnia/dataset/primitives/__init__.py +16 -0
  18. hafnia/dataset/primitives/bbox.py +137 -0
  19. hafnia/dataset/primitives/bitmask.py +182 -0
  20. hafnia/dataset/primitives/classification.py +56 -0
  21. hafnia/dataset/primitives/point.py +25 -0
  22. hafnia/dataset/primitives/polygon.py +100 -0
  23. hafnia/dataset/primitives/primitive.py +44 -0
  24. hafnia/dataset/primitives/segmentation.py +51 -0
  25. hafnia/dataset/primitives/utils.py +51 -0
  26. hafnia/experiment/hafnia_logger.py +7 -7
  27. hafnia/helper_testing.py +108 -0
  28. hafnia/http.py +5 -3
  29. hafnia/platform/__init__.py +2 -2
  30. hafnia/platform/datasets.py +197 -0
  31. hafnia/platform/download.py +85 -23
  32. hafnia/torch_helpers.py +180 -95
  33. hafnia/utils.py +21 -2
  34. hafnia/visualizations/colors.py +267 -0
  35. hafnia/visualizations/image_visualizations.py +202 -0
  36. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/METADATA +209 -99
  37. hafnia-0.2.1.dist-info/RECORD +50 -0
  38. cli/data_cmds.py +0 -53
  39. hafnia-0.1.27.dist-info/RECORD +0 -27
  40. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
  41. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
  42. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
cli/__main__.py CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  import click

- from cli import consts, data_cmds, experiment_cmds, profile_cmds, recipe_cmds, runc_cmds
+ from cli import consts, dataset_cmds, experiment_cmds, profile_cmds, recipe_cmds, runc_cmds
  from cli.config import Config, ConfigSchema


@@ -46,7 +46,7 @@ def clear(cfg: Config) -> None:


  main.add_command(profile_cmds.profile)
- main.add_command(data_cmds.data)
+ main.add_command(dataset_cmds.dataset)
  main.add_command(runc_cmds.runc)
  main.add_command(experiment_cmds.experiment)
  main.add_command(recipe_cmds.recipe)
cli/config.py CHANGED
@@ -80,7 +80,7 @@ class Config:
      def __init__(self, config_path: Optional[Path] = None) -> None:
          self.config_path = self.resolve_config_path(config_path)
          self.config_path.parent.mkdir(parents=True, exist_ok=True)
-         self.config_data = self.load_config()
+         self.config_data = Config.load_config(self.config_path)

      def resolve_config_path(self, path: Optional[Path] = None) -> Path:
          if path:
@@ -111,12 +111,25 @@ class Config:
          endpoint = self.config.platform_url + PLATFORM_API_MAPPING[method]
          return endpoint

-     def load_config(self) -> ConfigFileSchema:
+     @staticmethod
+     def load_config(config_path: Path) -> ConfigFileSchema:
          """Load configuration from file."""
-         if not self.config_path.exists():
+
+         # Environment variables has higher priority than config file
+         HAFNIA_API_KEY = os.getenv("HAFNIA_API_KEY")
+         HAFNIA_PLATFORM_URL = os.getenv("HAFNIA_PLATFORM_URL")
+         if HAFNIA_API_KEY and HAFNIA_PLATFORM_URL:
+             HAFNIA_PROFILE_NAME = os.getenv("HAFNIA_PROFILE_NAME", "default").strip()
+             cfg = ConfigFileSchema(
+                 active_profile=HAFNIA_PROFILE_NAME,
+                 profiles={HAFNIA_PROFILE_NAME: ConfigSchema(platform_url=HAFNIA_PLATFORM_URL, api_key=HAFNIA_API_KEY)},
+             )
+             return cfg
+
+         if not config_path.exists():
              return ConfigFileSchema()
          try:
-             with open(self.config_path.as_posix(), "r") as f:
+             with open(config_path.as_posix(), "r") as f:
                  data = json.load(f)
                  return ConfigFileSchema(**data)
          except json.JSONDecodeError:
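With this change, a profile can be built entirely from environment variables, and the config file is only read when HAFNIA_API_KEY and HAFNIA_PLATFORM_URL are not both set. A minimal sketch of exercising the new static method; the credential values and the config path below are placeholders for illustration, not defaults shipped with the package:

import os
from pathlib import Path

from cli.config import Config

# Placeholder credentials; any non-empty pair triggers the environment-variable branch.
os.environ["HAFNIA_API_KEY"] = "example-api-key"
os.environ["HAFNIA_PLATFORM_URL"] = "https://platform.example.com"
os.environ["HAFNIA_PROFILE_NAME"] = "ci"

# Both variables are set, so load_config() builds a ConfigFileSchema from them
# and never opens the file at the given (hypothetical) path.
cfg_file = Config.load_config(Path.home() / ".hafnia" / "config.json")
print(cfg_file.active_profile)  # "ci"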
cli/dataset_cmds.py ADDED
@@ -0,0 +1,60 @@
+ from pathlib import Path
+ from typing import Optional
+
+ import click
+ from rich import print as rprint
+
+ import cli.consts as consts
+ from cli.config import Config
+ from hafnia import utils
+ from hafnia.platform.datasets import create_rich_table_from_dataset
+
+
+ @click.group()
+ def dataset():
+     """Manage dataset interaction"""
+     pass
+
+
+ @dataset.command("ls")
+ @click.pass_obj
+ def dataset_list(cfg: Config) -> None:
+     """List available datasets on Hafnia platform"""
+
+     from hafnia.platform.datasets import dataset_list
+
+     try:
+         datasets = dataset_list(cfg=cfg)
+     except Exception:
+         raise click.ClickException(consts.ERROR_GET_RESOURCE)
+
+     table = create_rich_table_from_dataset(datasets)
+     rprint(table)
+
+
+ @dataset.command("download")
+ @click.argument("dataset_name")
+ @click.option(
+     "--destination",
+     "-d",
+     default=None,
+     required=False,
+     help=f"Destination folder to save the dataset. Defaults to '{utils.PATH_DATASETS}/<dataset_name>'",
+ )
+ @click.option("--force", "-f", is_flag=True, default=False, help="Flag to enable force redownload")
+ @click.pass_obj
+ def data_download(cfg: Config, dataset_name: str, destination: Optional[click.Path], force: bool) -> Path:
+     """Download dataset from Hafnia platform"""
+
+     from hafnia.platform import datasets
+
+     try:
+         path_dataset = datasets.download_or_get_dataset_path(
+             dataset_name=dataset_name,
+             cfg=cfg,
+             path_datasets_folder=destination,
+             force_redownload=force,
+         )
+     except Exception:
+         raise click.ClickException(consts.ERROR_GET_RESOURCE)
+     return path_dataset
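The old `data` group is replaced by this `dataset` group, so the platform is now driven through `dataset ls` and `dataset download <name>`. A sketch of invoking the group in-process with click's test runner, assuming the active profile already holds valid platform credentials; the dataset name below is a placeholder:

from click.testing import CliRunner

from cli.config import Config
from cli.dataset_cmds import dataset

runner = CliRunner()

# Equivalent of "dataset ls" on the command line: list datasets on the Hafnia platform.
result = runner.invoke(dataset, ["ls"], obj=Config())
print(result.output)

# Equivalent of "dataset download <name> -d <folder> --force".
result = runner.invoke(
    dataset,
    ["download", "some-dataset", "--destination", "./data", "--force"],
    obj=Config(),
)
print(result.exit_code)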
cli/runc_cmds.py CHANGED
@@ -38,7 +38,7 @@ def runc():
  @click.pass_obj
  def launch_local(cfg: Config, exec_cmd: str, dataset: str, image_name: str) -> None:
      """Launch a job within the image."""
-     from hafnia.data.factory import download_or_get_dataset_path
+     from hafnia.platform.datasets import download_or_get_dataset_path

      is_local_dataset = "/" in dataset
      if is_local_dataset:
hafnia/data/__init__.py CHANGED
@@ -1,3 +1,3 @@
- from hafnia.data.factory import load_dataset
+ from hafnia.data.factory import get_dataset_path, load_dataset

- __all__ = ["load_dataset"]
+ __all__ = ["load_dataset", "get_dataset_path"]
hafnia/data/factory.py CHANGED
@@ -1,67 +1,23 @@
  import os
- import shutil
  from pathlib import Path
- from typing import Optional, Union
+ from typing import Any

- from datasets import Dataset, DatasetDict, load_from_disk
-
- from cli.config import Config
  from hafnia import utils
- from hafnia.log import user_logger
- from hafnia.platform import download_resource, get_dataset_id
-
-
- def load_local(dataset_path: Path) -> Union[Dataset, DatasetDict]:
-     """Load a Hugging Face dataset from a local directory path."""
-     if not dataset_path.exists():
-         raise ValueError(f"Can not load dataset, directory does not exist -- {dataset_path}")
-     user_logger.info(f"Loading data from {dataset_path.as_posix()}")
-     return load_from_disk(dataset_path.as_posix())
-
-
- def download_or_get_dataset_path(
-     dataset_name: str,
-     cfg: Optional[Config] = None,
-     output_dir: Optional[str] = None,
-     force_redownload: bool = False,
- ) -> Path:
-     """Download or get the path of the dataset."""
-
-     cfg = cfg or Config()
-     endpoint_dataset = cfg.get_platform_endpoint("datasets")
-     api_key = cfg.api_key
+ from hafnia.dataset.hafnia_dataset import HafniaDataset, get_or_create_dataset_path_from_recipe

-     output_dir = output_dir or str(utils.PATH_DATASET)
-     dataset_path_base = Path(output_dir).absolute() / dataset_name
-     dataset_path_base.mkdir(exist_ok=True, parents=True)
-     dataset_path_sample = dataset_path_base / "sample"

-     if dataset_path_sample.exists() and not force_redownload:
-         user_logger.info("Dataset found locally. Set 'force=True' or add `--force` flag with cli to re-download")
-         return dataset_path_sample
+ def load_dataset(recipe: Any, force_redownload: bool = False) -> HafniaDataset:
+     """Load a dataset either from a local path or from the Hafnia platform."""

-     dataset_id = get_dataset_id(dataset_name, endpoint_dataset, api_key)
-     dataset_access_info_url = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"
+     path_dataset = get_dataset_path(recipe, force_redownload=force_redownload)
+     dataset = HafniaDataset.from_path(path_dataset)
+     return dataset

-     if force_redownload and dataset_path_sample.exists():
-         # Remove old files to avoid old files conflicting with new files
-         shutil.rmtree(dataset_path_sample, ignore_errors=True)
-     status = download_resource(dataset_access_info_url, str(dataset_path_base), api_key)
-     if status:
-         return dataset_path_sample
-     raise RuntimeError("Failed to download dataset")

+ def get_dataset_path(recipe: Any, force_redownload: bool = False) -> Path:
+     if utils.is_hafnia_cloud_job():
+         return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))

- def load_dataset(dataset_name: str, force_redownload: bool = False) -> Union[Dataset, DatasetDict]:
-     """Load a dataset either from a local path or from the Hafnia platform."""
+     path_dataset = get_or_create_dataset_path_from_recipe(recipe, force_redownload=force_redownload)

-     if utils.is_remote_job():
-         path_dataset = Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
-         return load_local(path_dataset)
-
-     path_dataset = download_or_get_dataset_path(
-         dataset_name=dataset_name,
-         force_redownload=force_redownload,
-     )
-     dataset = load_local(path_dataset)
-     return dataset
+     return path_dataset
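`load_dataset` no longer returns a Hugging Face `Dataset`/`DatasetDict`; it resolves a recipe to a local path and wraps it in a `HafniaDataset`, while `get_dataset_path` exposes just the path. A sketch of the new call pattern, assuming a plain dataset name is an accepted recipe form (resolution happens in `get_or_create_dataset_path_from_recipe`, which is not part of this file) and that "mnist" stands in for a real dataset:

from hafnia.data import get_dataset_path, load_dataset

# Resolve (and download if missing) the dataset, then load it as a HafniaDataset.
dataset = load_dataset("mnist", force_redownload=False)

# Or resolve only the local path, e.g. when other tooling reads the files directly.
path_dataset = get_dataset_path("mnist")
print(path_dataset)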
hafnia/dataset/dataset_helpers.py ADDED
@@ -0,0 +1,91 @@
+ import io
+ import math
+ import random
+ from pathlib import Path
+ from typing import Dict, List
+
+ import numpy as np
+ import xxhash
+ from PIL import Image
+
+
+ def create_split_name_list_from_ratios(split_ratios: Dict[str, float], n_items: int, seed: int = 42) -> List[str]:
+     samples_per_split = split_sizes_from_ratios(split_ratios=split_ratios, n_items=n_items)
+
+     split_name_column = []
+     for split_name, n_split_samples in samples_per_split.items():
+         split_name_column.extend([split_name] * n_split_samples)
+     random.Random(seed).shuffle(split_name_column)  # Shuffle the split names
+
+     return split_name_column
+
+
+ def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
+     hasher = xxhash.xxh3_64()
+
+     with open(path, "rb") as f:
+         for chunk in iter(lambda: f.read(chunk_size), b""):  # 8192, 16384, 32768, 65536
+             hasher.update(chunk)
+     return hasher.hexdigest()
+
+
+ def hash_from_bytes(data: bytes) -> str:
+     hasher = xxhash.xxh3_64()
+     hasher.update(data)
+     return hasher.hexdigest()
+
+
+ def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
+     pil_image = Image.fromarray(image)
+     buffer = io.BytesIO()
+     pil_image.save(buffer, format="PNG")
+     hash_value = hash_from_bytes(buffer.getvalue())
+     path_image = Path(path_folder) / f"{hash_value}.png"
+     pil_image.save(path_image)
+     return path_image
+
+
+ def filename_as_hash_from_path(path_image: Path) -> str:
+     hash = hash_file_xxhash(path_image)
+     return f"{hash}{path_image.suffix}"
+
+
+ def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dict[str, int]:
+     summed_ratios = sum(split_ratios.values())
+     abs_tols = 0.0011  # Allow some tolerance for floating point errors {"test": 0.333, "val": 0.333, "train": 0.333}
+     if not math.isclose(summed_ratios, 1.0, abs_tol=abs_tols):  # Allow tolerance to allow e.g. (0.333, 0.333, 0.333)
+         raise ValueError(f"Split ratios must sum to 1.0. The summed values of {split_ratios} is {summed_ratios}")
+
+     # recaculate split sizes
+     split_ratios = {split_name: split_ratio / summed_ratios for split_name, split_ratio in split_ratios.items()}
+     split_sizes = {split_name: int(n_items * split_ratio) for split_name, split_ratio in split_ratios.items()}
+
+     remaining_items = n_items - sum(split_sizes.values())
+     if remaining_items > 0:  # Distribute remaining items evenly across splits
+         for _ in range(remaining_items):
+             # Select name by the largest error from the expected distribution
+             total_size = sum(split_sizes.values())
+             distribution_error = {
+                 split_name: abs(split_ratios[split_name] - (size / total_size))
+                 for split_name, size in split_sizes.items()
+             }
+
+             split_with_largest_error = sorted(distribution_error.items(), key=lambda x: x[1], reverse=True)[0][0]
+             split_sizes[split_with_largest_error] += 1
+
+     if sum(split_sizes.values()) != n_items:
+         raise ValueError("Something is wrong. The split sizes do not match the number of items.")
+
+     return split_sizes
+
+
+ def select_evenly_across_list(lst: list, num_samples: int):
+     if num_samples >= len(lst):
+         return lst  # No need to sample
+     step = (len(lst) - 1) / (num_samples - 1)
+     indices = [int(round(step * i)) for i in range(num_samples)]  # noqa: RUF046
+     return [lst[index] for index in indices]
+
+
+ def prefix_dict(d: dict, prefix: str) -> dict:
+     return {f"{prefix}.{k}": v for k, v in d.items()}
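`split_sizes_from_ratios` normalizes the ratios, rounds each split down, and then hands leftover items to whichever split is furthest below its target, so the sizes always sum to `n_items`. A small worked sketch using the helpers above:

from hafnia.dataset.dataset_helpers import (
    create_split_name_list_from_ratios,
    split_sizes_from_ratios,
)

# 0.333 + 0.333 + 0.333 = 0.999 is within the tolerance and gets renormalized.
ratios = {"train": 0.333, "validation": 0.333, "test": 0.333}

sizes = split_sizes_from_ratios(n_items=10, split_ratios=ratios)
print(sizes)                # e.g. {"train": 4, "validation": 3, "test": 3}
print(sum(sizes.values()))  # 10, always equal to n_items

# Per-sample split labels, shuffled deterministically by the seed.
split_names = create_split_name_list_from_ratios(split_ratios=ratios, n_items=10, seed=42)
print(len(split_names))     # 10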
hafnia/dataset/dataset_names.py ADDED
@@ -0,0 +1,72 @@
+ from enum import Enum
+ from typing import List
+
+ FILENAME_RECIPE_JSON = "recipe.json"
+ FILENAME_DATASET_INFO = "dataset_info.json"
+ FILENAME_ANNOTATIONS_JSONL = "annotations.jsonl"
+ FILENAME_ANNOTATIONS_PARQUET = "annotations.parquet"
+
+ DATASET_FILENAMES_REQUIRED = [
+     FILENAME_DATASET_INFO,
+     FILENAME_ANNOTATIONS_JSONL,
+     FILENAME_ANNOTATIONS_PARQUET,
+ ]
+
+
+ class DeploymentStage(Enum):
+     STAGING = "staging"
+     PRODUCTION = "production"
+
+
+ class FieldName:
+     CLASS_NAME: str = "class_name"  # Name of the class this primitive is associated with, e.g. "car" for Bbox
+     CLASS_IDX: str = (
+         "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class
+     )
+     OBJECT_ID: str = "object_id"  # Unique identifier for the object, e.g. "12345123"
+     CONFIDENCE: str = "confidence"  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
+
+     META: str = "meta"  # Contains metadata about each primitive, e.g. attributes color, occluded, iscrowd, etc.
+     TASK_NAME: str = "task_name"  # Name of the task this primitive is associated with, e.g. "bboxes" for Bbox
+
+     @staticmethod
+     def fields() -> List[str]:
+         """
+         Returns a list of expected field names for primitives.
+         """
+         return [
+             FieldName.CLASS_NAME,
+             FieldName.CLASS_IDX,
+             FieldName.OBJECT_ID,
+             FieldName.CONFIDENCE,
+             FieldName.META,
+             FieldName.TASK_NAME,
+         ]
+
+
+ class ColumnName:
+     SAMPLE_INDEX: str = "sample_index"
+     FILE_NAME: str = "file_name"
+     HEIGHT: str = "height"
+     WIDTH: str = "width"
+     SPLIT: str = "split"
+     IS_SAMPLE: str = "is_sample"
+     REMOTE_PATH: str = "remote_path"  # Path to the file in remote storage, e.g. S3
+     META: str = "meta"
+
+
+ class SplitName:
+     TRAIN = "train"
+     VAL = "validation"
+     TEST = "test"
+     UNDEFINED = "UNDEFINED"
+
+     @staticmethod
+     def valid_splits() -> List[str]:
+         return [SplitName.TRAIN, SplitName.VAL, SplitName.TEST]
+
+
+ class DatasetVariant(Enum):
+     DUMP = "dump"
+     SAMPLE = "sample"
+     HIDDEN = "hidden"
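These constants centralize the file, column, field, and split names used by the new dataset modules. A small sketch of using them to sanity-check a sample row; the row itself is made up for illustration:

from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName, SplitName

row = {  # hypothetical annotation row
    ColumnName.FILE_NAME: "0a1b2c3d4e5f6789.png",
    ColumnName.HEIGHT: 480,
    ColumnName.WIDTH: 640,
    ColumnName.SPLIT: SplitName.TRAIN,
    ColumnName.IS_SAMPLE: True,
}

assert row[ColumnName.SPLIT] in SplitName.valid_splits()
print(DATASET_FILENAMES_REQUIRED)  # files every exported dataset folder is expected to contain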