hafnia 0.1.27__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +2 -2
- cli/config.py +17 -4
- cli/dataset_cmds.py +60 -0
- cli/runc_cmds.py +1 -1
- hafnia/data/__init__.py +2 -2
- hafnia/data/factory.py +12 -56
- hafnia/dataset/dataset_helpers.py +91 -0
- hafnia/dataset/dataset_names.py +72 -0
- hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
- hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
- hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
- hafnia/dataset/dataset_upload_helper.py +468 -0
- hafnia/dataset/hafnia_dataset.py +624 -0
- hafnia/dataset/operations/dataset_stats.py +15 -0
- hafnia/dataset/operations/dataset_transformations.py +82 -0
- hafnia/dataset/operations/table_transformations.py +183 -0
- hafnia/dataset/primitives/__init__.py +16 -0
- hafnia/dataset/primitives/bbox.py +137 -0
- hafnia/dataset/primitives/bitmask.py +182 -0
- hafnia/dataset/primitives/classification.py +56 -0
- hafnia/dataset/primitives/point.py +25 -0
- hafnia/dataset/primitives/polygon.py +100 -0
- hafnia/dataset/primitives/primitive.py +44 -0
- hafnia/dataset/primitives/segmentation.py +51 -0
- hafnia/dataset/primitives/utils.py +51 -0
- hafnia/experiment/hafnia_logger.py +7 -7
- hafnia/helper_testing.py +108 -0
- hafnia/http.py +5 -3
- hafnia/platform/__init__.py +2 -2
- hafnia/platform/datasets.py +197 -0
- hafnia/platform/download.py +85 -23
- hafnia/torch_helpers.py +180 -95
- hafnia/utils.py +21 -2
- hafnia/visualizations/colors.py +267 -0
- hafnia/visualizations/image_visualizations.py +202 -0
- {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/METADATA +209 -99
- hafnia-0.2.1.dist-info/RECORD +50 -0
- cli/data_cmds.py +0 -53
- hafnia-0.1.27.dist-info/RECORD +0 -27
- {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
- {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
cli/__main__.py
CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import click
 
-from cli import consts,
+from cli import consts, dataset_cmds, experiment_cmds, profile_cmds, recipe_cmds, runc_cmds
 from cli.config import Config, ConfigSchema
 
 
@@ -46,7 +46,7 @@ def clear(cfg: Config) -> None:
 
 
 main.add_command(profile_cmds.profile)
-main.add_command(
+main.add_command(dataset_cmds.dataset)
 main.add_command(runc_cmds.runc)
 main.add_command(experiment_cmds.experiment)
 main.add_command(recipe_cmds.recipe)
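This hunk wires the new `dataset` command group into the CLI; the old data command registration disappears along with cli/data_cmds.py, which this release deletes. A quick, hedged way to confirm the wiring is click's test runner; it assumes the top-level group object is named `main`, consistent with the `main.add_command(...)` calls shown above:

```python
# Illustrative only: confirm the `dataset` group is registered on the CLI.
# Assumes the click group in cli/__main__.py is named `main`, matching the
# `main.add_command(...)` calls in the hunk above.
from click.testing import CliRunner

from cli.__main__ import main

result = CliRunner().invoke(main, ["--help"])
print(result.output)  # the listed commands should now include `dataset`
```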
cli/config.py
CHANGED

@@ -80,7 +80,7 @@ class Config:
     def __init__(self, config_path: Optional[Path] = None) -> None:
         self.config_path = self.resolve_config_path(config_path)
         self.config_path.parent.mkdir(parents=True, exist_ok=True)
-        self.config_data =
+        self.config_data = Config.load_config(self.config_path)
 
     def resolve_config_path(self, path: Optional[Path] = None) -> Path:
         if path:
@@ -111,12 +111,25 @@ class Config:
         endpoint = self.config.platform_url + PLATFORM_API_MAPPING[method]
         return endpoint
 
-
+    @staticmethod
+    def load_config(config_path: Path) -> ConfigFileSchema:
         """Load configuration from file."""
-
+
+        # Environment variables has higher priority than config file
+        HAFNIA_API_KEY = os.getenv("HAFNIA_API_KEY")
+        HAFNIA_PLATFORM_URL = os.getenv("HAFNIA_PLATFORM_URL")
+        if HAFNIA_API_KEY and HAFNIA_PLATFORM_URL:
+            HAFNIA_PROFILE_NAME = os.getenv("HAFNIA_PROFILE_NAME", "default").strip()
+            cfg = ConfigFileSchema(
+                active_profile=HAFNIA_PROFILE_NAME,
+                profiles={HAFNIA_PROFILE_NAME: ConfigSchema(platform_url=HAFNIA_PLATFORM_URL, api_key=HAFNIA_API_KEY)},
+            )
+            return cfg
+
+        if not config_path.exists():
             return ConfigFileSchema()
         try:
-            with open(
+            with open(config_path.as_posix(), "r") as f:
                 data = json.load(f)
                 return ConfigFileSchema(**data)
         except json.JSONDecodeError:
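The new `Config.load_config` gives environment variables priority over the config file: when both `HAFNIA_API_KEY` and `HAFNIA_PLATFORM_URL` are set, a profile named by `HAFNIA_PROFILE_NAME` (default `"default"`) is synthesized and the JSON file is never read. A minimal sketch of that behavior; the concrete values are placeholders, and the attribute access assumes `ConfigFileSchema`/`ConfigSchema` expose the fields used as keyword arguments above:

```python
# Minimal sketch, not from the package: exercise the env-var branch of Config.load_config.
import os
from pathlib import Path

from cli.config import Config

os.environ["HAFNIA_API_KEY"] = "dummy-key"                     # placeholder value
os.environ["HAFNIA_PLATFORM_URL"] = "https://example.invalid"  # placeholder value
os.environ["HAFNIA_PROFILE_NAME"] = "ci"

# The path is irrelevant here: with both variables set, load_config returns before reading any file.
cfg_file = Config.load_config(Path("ignored.json"))
print(cfg_file.active_profile)               # "ci"
print(cfg_file.profiles["ci"].platform_url)  # "https://example.invalid"
```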
cli/dataset_cmds.py
ADDED

@@ -0,0 +1,60 @@
+from pathlib import Path
+from typing import Optional
+
+import click
+from rich import print as rprint
+
+import cli.consts as consts
+from cli.config import Config
+from hafnia import utils
+from hafnia.platform.datasets import create_rich_table_from_dataset
+
+
+@click.group()
+def dataset():
+    """Manage dataset interaction"""
+    pass
+
+
+@dataset.command("ls")
+@click.pass_obj
+def dataset_list(cfg: Config) -> None:
+    """List available datasets on Hafnia platform"""
+
+    from hafnia.platform.datasets import dataset_list
+
+    try:
+        datasets = dataset_list(cfg=cfg)
+    except Exception:
+        raise click.ClickException(consts.ERROR_GET_RESOURCE)
+
+    table = create_rich_table_from_dataset(datasets)
+    rprint(table)
+
+
+@dataset.command("download")
+@click.argument("dataset_name")
+@click.option(
+    "--destination",
+    "-d",
+    default=None,
+    required=False,
+    help=f"Destination folder to save the dataset. Defaults to '{utils.PATH_DATASETS}/<dataset_name>'",
+)
+@click.option("--force", "-f", is_flag=True, default=False, help="Flag to enable force redownload")
+@click.pass_obj
+def data_download(cfg: Config, dataset_name: str, destination: Optional[click.Path], force: bool) -> Path:
+    """Download dataset from Hafnia platform"""
+
+    from hafnia.platform import datasets
+
+    try:
+        path_dataset = datasets.download_or_get_dataset_path(
+            dataset_name=dataset_name,
+            cfg=cfg,
+            path_datasets_folder=destination,
+            force_redownload=force,
+        )
+    except Exception:
+        raise click.ClickException(consts.ERROR_GET_RESOURCE)
+    return path_dataset
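The new group adds `ls` and `download` subcommands that wrap `hafnia.platform.datasets`. A hedged sketch of driving them through click's test runner; it needs a configured profile (or the `HAFNIA_*` environment variables from cli/config.py above), and `"some-dataset"` is a placeholder name:

```python
# Illustrative only: invoke the new subcommands without a shell.
from click.testing import CliRunner

from cli.config import Config
from cli.dataset_cmds import dataset

runner = CliRunner()

# Equivalent of the `dataset ls` subcommand.
result = runner.invoke(dataset, ["ls"], obj=Config())
print(result.exit_code, result.output)

# Equivalent of `dataset download some-dataset -d ./data --force`.
result = runner.invoke(dataset, ["download", "some-dataset", "-d", "./data", "--force"], obj=Config())
print(result.exit_code)
```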
cli/runc_cmds.py
CHANGED

@@ -38,7 +38,7 @@ def runc():
 @click.pass_obj
 def launch_local(cfg: Config, exec_cmd: str, dataset: str, image_name: str) -> None:
     """Launch a job within the image."""
-    from hafnia.
+    from hafnia.platform.datasets import download_or_get_dataset_path
 
     is_local_dataset = "/" in dataset
     if is_local_dataset:
hafnia/data/__init__.py
CHANGED

@@ -1,3 +1,3 @@
-from hafnia.data.factory import load_dataset
+from hafnia.data.factory import get_dataset_path, load_dataset
 
-__all__ = ["load_dataset"]
+__all__ = ["load_dataset", "get_dataset_path"]
hafnia/data/factory.py
CHANGED

@@ -1,67 +1,23 @@
 import os
-import shutil
 from pathlib import Path
-from typing import
+from typing import Any
 
-from datasets import Dataset, DatasetDict, load_from_disk
-
-from cli.config import Config
 from hafnia import utils
-from hafnia.
-from hafnia.platform import download_resource, get_dataset_id
-
-
-def load_local(dataset_path: Path) -> Union[Dataset, DatasetDict]:
-    """Load a Hugging Face dataset from a local directory path."""
-    if not dataset_path.exists():
-        raise ValueError(f"Can not load dataset, directory does not exist -- {dataset_path}")
-    user_logger.info(f"Loading data from {dataset_path.as_posix()}")
-    return load_from_disk(dataset_path.as_posix())
-
-
-def download_or_get_dataset_path(
-    dataset_name: str,
-    cfg: Optional[Config] = None,
-    output_dir: Optional[str] = None,
-    force_redownload: bool = False,
-) -> Path:
-    """Download or get the path of the dataset."""
-
-    cfg = cfg or Config()
-    endpoint_dataset = cfg.get_platform_endpoint("datasets")
-    api_key = cfg.api_key
+from hafnia.dataset.hafnia_dataset import HafniaDataset, get_or_create_dataset_path_from_recipe
 
-    output_dir = output_dir or str(utils.PATH_DATASET)
-    dataset_path_base = Path(output_dir).absolute() / dataset_name
-    dataset_path_base.mkdir(exist_ok=True, parents=True)
-    dataset_path_sample = dataset_path_base / "sample"
 
-
-
-        return dataset_path_sample
+def load_dataset(recipe: Any, force_redownload: bool = False) -> HafniaDataset:
+    """Load a dataset either from a local path or from the Hafnia platform."""
 
-
-
+    path_dataset = get_dataset_path(recipe, force_redownload=force_redownload)
+    dataset = HafniaDataset.from_path(path_dataset)
+    return dataset
 
-    if force_redownload and dataset_path_sample.exists():
-        # Remove old files to avoid old files conflicting with new files
-        shutil.rmtree(dataset_path_sample, ignore_errors=True)
-    status = download_resource(dataset_access_info_url, str(dataset_path_base), api_key)
-    if status:
-        return dataset_path_sample
-    raise RuntimeError("Failed to download dataset")
 
+def get_dataset_path(recipe: Any, force_redownload: bool = False) -> Path:
+    if utils.is_hafnia_cloud_job():
+        return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
 
-
-    """Load a dataset either from a local path or from the Hafnia platform."""
+    path_dataset = get_or_create_dataset_path_from_recipe(recipe, force_redownload=force_redownload)
 
-
-    path_dataset = Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
-    return load_local(path_dataset)
-
-    path_dataset = download_or_get_dataset_path(
-        dataset_name=dataset_name,
-        force_redownload=force_redownload,
-    )
-    dataset = load_local(path_dataset)
-    return dataset
+    return path_dataset
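`load_dataset` is now a thin wrapper: `get_dataset_path` short-circuits to `MDI_DATASET_DIR` when running as a Hafnia cloud job and otherwise resolves the dataset through `get_or_create_dataset_path_from_recipe`, and the result is loaded with `HafniaDataset.from_path`. A hedged usage sketch; `"mnist"` is a placeholder, and passing a plain dataset name as the `recipe` argument is an assumption not shown in this hunk:

```python
# Hedged sketch: "mnist" is a placeholder recipe/dataset name.
from hafnia.data import get_dataset_path, load_dataset

path = get_dataset_path("mnist")   # resolve (and download if needed) the local dataset folder
dataset = load_dataset("mnist")    # same resolution, followed by HafniaDataset.from_path(path)
print(type(dataset).__name__, path)
```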
hafnia/dataset/dataset_helpers.py
ADDED

@@ -0,0 +1,91 @@
+import io
+import math
+import random
+from pathlib import Path
+from typing import Dict, List
+
+import numpy as np
+import xxhash
+from PIL import Image
+
+
+def create_split_name_list_from_ratios(split_ratios: Dict[str, float], n_items: int, seed: int = 42) -> List[str]:
+    samples_per_split = split_sizes_from_ratios(split_ratios=split_ratios, n_items=n_items)
+
+    split_name_column = []
+    for split_name, n_split_samples in samples_per_split.items():
+        split_name_column.extend([split_name] * n_split_samples)
+    random.Random(seed).shuffle(split_name_column)  # Shuffle the split names
+
+    return split_name_column
+
+
+def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
+    hasher = xxhash.xxh3_64()
+
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(chunk_size), b""):  # 8192, 16384, 32768, 65536
+            hasher.update(chunk)
+    return hasher.hexdigest()
+
+
+def hash_from_bytes(data: bytes) -> str:
+    hasher = xxhash.xxh3_64()
+    hasher.update(data)
+    return hasher.hexdigest()
+
+
+def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
+    pil_image = Image.fromarray(image)
+    buffer = io.BytesIO()
+    pil_image.save(buffer, format="PNG")
+    hash_value = hash_from_bytes(buffer.getvalue())
+    path_image = Path(path_folder) / f"{hash_value}.png"
+    pil_image.save(path_image)
+    return path_image
+
+
+def filename_as_hash_from_path(path_image: Path) -> str:
+    hash = hash_file_xxhash(path_image)
+    return f"{hash}{path_image.suffix}"
+
+
+def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dict[str, int]:
+    summed_ratios = sum(split_ratios.values())
+    abs_tols = 0.0011  # Allow some tolerance for floating point errors {"test": 0.333, "val": 0.333, "train": 0.333}
+    if not math.isclose(summed_ratios, 1.0, abs_tol=abs_tols):  # Allow tolerance to allow e.g. (0.333, 0.333, 0.333)
+        raise ValueError(f"Split ratios must sum to 1.0. The summed values of {split_ratios} is {summed_ratios}")
+
+    # recaculate split sizes
+    split_ratios = {split_name: split_ratio / summed_ratios for split_name, split_ratio in split_ratios.items()}
+    split_sizes = {split_name: int(n_items * split_ratio) for split_name, split_ratio in split_ratios.items()}
+
+    remaining_items = n_items - sum(split_sizes.values())
+    if remaining_items > 0:  # Distribute remaining items evenly across splits
+        for _ in range(remaining_items):
+            # Select name by the largest error from the expected distribution
+            total_size = sum(split_sizes.values())
+            distribution_error = {
+                split_name: abs(split_ratios[split_name] - (size / total_size))
+                for split_name, size in split_sizes.items()
+            }
+
+            split_with_largest_error = sorted(distribution_error.items(), key=lambda x: x[1], reverse=True)[0][0]
+            split_sizes[split_with_largest_error] += 1
+
+    if sum(split_sizes.values()) != n_items:
+        raise ValueError("Something is wrong. The split sizes do not match the number of items.")
+
+    return split_sizes
+
+
+def select_evenly_across_list(lst: list, num_samples: int):
+    if num_samples >= len(lst):
+        return lst  # No need to sample
+    step = (len(lst) - 1) / (num_samples - 1)
+    indices = [int(round(step * i)) for i in range(num_samples)]  # noqa: RUF046
+    return [lst[index] for index in indices]
+
+
+def prefix_dict(d: dict, prefix: str) -> dict:
+    return {f"{prefix}.{k}": v for k, v in d.items()}
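`split_sizes_from_ratios` normalizes the ratios, floors each split size, and then hands any remaining items to the split with the largest deviation from its target ratio; `create_split_name_list_from_ratios` turns those sizes into a shuffled per-item list of split names. A small worked example (the values follow directly from the code above):

```python
from hafnia.dataset.dataset_helpers import create_split_name_list_from_ratios, split_sizes_from_ratios

ratios = {"train": 0.8, "validation": 0.1, "test": 0.1}

sizes = split_sizes_from_ratios(n_items=10, split_ratios=ratios)
print(sizes)  # {'train': 8, 'validation': 1, 'test': 1}

# One split name per item, shuffled deterministically by the seed.
names = create_split_name_list_from_ratios(split_ratios=ratios, n_items=10, seed=42)
print(len(names), names.count("train"))  # 10 8
```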
hafnia/dataset/dataset_names.py
ADDED

@@ -0,0 +1,72 @@
+from enum import Enum
+from typing import List
+
+FILENAME_RECIPE_JSON = "recipe.json"
+FILENAME_DATASET_INFO = "dataset_info.json"
+FILENAME_ANNOTATIONS_JSONL = "annotations.jsonl"
+FILENAME_ANNOTATIONS_PARQUET = "annotations.parquet"
+
+DATASET_FILENAMES_REQUIRED = [
+    FILENAME_DATASET_INFO,
+    FILENAME_ANNOTATIONS_JSONL,
+    FILENAME_ANNOTATIONS_PARQUET,
+]
+
+
+class DeploymentStage(Enum):
+    STAGING = "staging"
+    PRODUCTION = "production"
+
+
+class FieldName:
+    CLASS_NAME: str = "class_name"  # Name of the class this primitive is associated with, e.g. "car" for Bbox
+    CLASS_IDX: str = (
+        "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class
+    )
+    OBJECT_ID: str = "object_id"  # Unique identifier for the object, e.g. "12345123"
+    CONFIDENCE: str = "confidence"  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
+
+    META: str = "meta"  # Contains metadata about each primitive, e.g. attributes color, occluded, iscrowd, etc.
+    TASK_NAME: str = "task_name"  # Name of the task this primitive is associated with, e.g. "bboxes" for Bbox
+
+    @staticmethod
+    def fields() -> List[str]:
+        """
+        Returns a list of expected field names for primitives.
+        """
+        return [
+            FieldName.CLASS_NAME,
+            FieldName.CLASS_IDX,
+            FieldName.OBJECT_ID,
+            FieldName.CONFIDENCE,
+            FieldName.META,
+            FieldName.TASK_NAME,
+        ]
+
+
+class ColumnName:
+    SAMPLE_INDEX: str = "sample_index"
+    FILE_NAME: str = "file_name"
+    HEIGHT: str = "height"
+    WIDTH: str = "width"
+    SPLIT: str = "split"
+    IS_SAMPLE: str = "is_sample"
+    REMOTE_PATH: str = "remote_path"  # Path to the file in remote storage, e.g. S3
+    META: str = "meta"
+
+
+class SplitName:
+    TRAIN = "train"
+    VAL = "validation"
+    TEST = "test"
+    UNDEFINED = "UNDEFINED"
+
+    @staticmethod
+    def valid_splits() -> List[str]:
+        return [SplitName.TRAIN, SplitName.VAL, SplitName.TEST]
+
+
+class DatasetVariant(Enum):
+    DUMP = "dump"
+    SAMPLE = "sample"
+    HIDDEN = "hidden"