hafnia 0.1.27__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cli/__main__.py +2 -2
  2. cli/config.py +17 -4
  3. cli/dataset_cmds.py +60 -0
  4. cli/runc_cmds.py +1 -1
  5. hafnia/data/__init__.py +2 -2
  6. hafnia/data/factory.py +12 -56
  7. hafnia/dataset/dataset_helpers.py +91 -0
  8. hafnia/dataset/dataset_names.py +72 -0
  9. hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
  10. hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
  11. hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
  12. hafnia/dataset/dataset_upload_helper.py +468 -0
  13. hafnia/dataset/hafnia_dataset.py +624 -0
  14. hafnia/dataset/operations/dataset_stats.py +15 -0
  15. hafnia/dataset/operations/dataset_transformations.py +82 -0
  16. hafnia/dataset/operations/table_transformations.py +183 -0
  17. hafnia/dataset/primitives/__init__.py +16 -0
  18. hafnia/dataset/primitives/bbox.py +137 -0
  19. hafnia/dataset/primitives/bitmask.py +182 -0
  20. hafnia/dataset/primitives/classification.py +56 -0
  21. hafnia/dataset/primitives/point.py +25 -0
  22. hafnia/dataset/primitives/polygon.py +100 -0
  23. hafnia/dataset/primitives/primitive.py +44 -0
  24. hafnia/dataset/primitives/segmentation.py +51 -0
  25. hafnia/dataset/primitives/utils.py +51 -0
  26. hafnia/experiment/hafnia_logger.py +7 -7
  27. hafnia/helper_testing.py +108 -0
  28. hafnia/http.py +5 -3
  29. hafnia/platform/__init__.py +2 -2
  30. hafnia/platform/datasets.py +197 -0
  31. hafnia/platform/download.py +85 -23
  32. hafnia/torch_helpers.py +180 -95
  33. hafnia/utils.py +21 -2
  34. hafnia/visualizations/colors.py +267 -0
  35. hafnia/visualizations/image_visualizations.py +202 -0
  36. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/METADATA +209 -99
  37. hafnia-0.2.1.dist-info/RECORD +50 -0
  38. cli/data_cmds.py +0 -53
  39. hafnia-0.1.27.dist-info/RECORD +0 -27
  40. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
  41. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
  42. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
cli/__main__.py CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  import click

- from cli import consts, data_cmds, experiment_cmds, profile_cmds, recipe_cmds, runc_cmds
+ from cli import consts, dataset_cmds, experiment_cmds, profile_cmds, recipe_cmds, runc_cmds
  from cli.config import Config, ConfigSchema


@@ -46,7 +46,7 @@ def clear(cfg: Config) -> None:


  main.add_command(profile_cmds.profile)
- main.add_command(data_cmds.data)
+ main.add_command(dataset_cmds.dataset)
  main.add_command(runc_cmds.runc)
  main.add_command(experiment_cmds.experiment)
  main.add_command(recipe_cmds.recipe)
cli/config.py CHANGED
@@ -80,7 +80,7 @@ class Config:
      def __init__(self, config_path: Optional[Path] = None) -> None:
          self.config_path = self.resolve_config_path(config_path)
          self.config_path.parent.mkdir(parents=True, exist_ok=True)
-         self.config_data = self.load_config()
+         self.config_data = Config.load_config(self.config_path)

      def resolve_config_path(self, path: Optional[Path] = None) -> Path:
          if path:
@@ -111,12 +111,25 @@ class Config:
          endpoint = self.config.platform_url + PLATFORM_API_MAPPING[method]
          return endpoint

-     def load_config(self) -> ConfigFileSchema:
+     @staticmethod
+     def load_config(config_path: Path) -> ConfigFileSchema:
          """Load configuration from file."""
-         if not self.config_path.exists():
+
+         # Environment variables has higher priority than config file
+         HAFNIA_API_KEY = os.getenv("HAFNIA_API_KEY")
+         HAFNIA_PLATFORM_URL = os.getenv("HAFNIA_PLATFORM_URL")
+         if HAFNIA_API_KEY and HAFNIA_PLATFORM_URL:
+             HAFNIA_PROFILE_NAME = os.getenv("HAFNIA_PROFILE_NAME", "default").strip()
+             cfg = ConfigFileSchema(
+                 active_profile=HAFNIA_PROFILE_NAME,
+                 profiles={HAFNIA_PROFILE_NAME: ConfigSchema(platform_url=HAFNIA_PLATFORM_URL, api_key=HAFNIA_API_KEY)},
+             )
+             return cfg
+
+         if not config_path.exists():
              return ConfigFileSchema()
          try:
-             with open(self.config_path.as_posix(), "r") as f:
+             with open(config_path.as_posix(), "r") as f:
                  data = json.load(f)
                  return ConfigFileSchema(**data)
          except json.JSONDecodeError:
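With this change, a profile can be built entirely from environment variables, and the config file is only read when HAFNIA_API_KEY and HAFNIA_PLATFORM_URL are not both set. A minimal sketch of exercising the new static method; the credential values and the config path below are placeholders for illustration, not defaults shipped with the package:

import os
from pathlib import Path

from cli.config import Config

# Placeholder credentials; any non-empty pair triggers the environment-variable branch.
os.environ["HAFNIA_API_KEY"] = "example-api-key"
os.environ["HAFNIA_PLATFORM_URL"] = "https://platform.example.com"
os.environ["HAFNIA_PROFILE_NAME"] = "ci"

# Both variables are set, so load_config() builds a ConfigFileSchema from them
# and never opens the file at the given (hypothetical) path.
cfg_file = Config.load_config(Path.home() / ".hafnia" / "config.json")
print(cfg_file.active_profile)  # "ci"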
cli/dataset_cmds.py ADDED
@@ -0,0 +1,60 @@
+ from pathlib import Path
+ from typing import Optional
+
+ import click
+ from rich import print as rprint
+
+ import cli.consts as consts
+ from cli.config import Config
+ from hafnia import utils
+ from hafnia.platform.datasets import create_rich_table_from_dataset
+
+
+ @click.group()
+ def dataset():
+     """Manage dataset interaction"""
+     pass
+
+
+ @dataset.command("ls")
+ @click.pass_obj
+ def dataset_list(cfg: Config) -> None:
+     """List available datasets on Hafnia platform"""
+
+     from hafnia.platform.datasets import dataset_list
+
+     try:
+         datasets = dataset_list(cfg=cfg)
+     except Exception:
+         raise click.ClickException(consts.ERROR_GET_RESOURCE)
+
+     table = create_rich_table_from_dataset(datasets)
+     rprint(table)
+
+
+ @dataset.command("download")
+ @click.argument("dataset_name")
+ @click.option(
+     "--destination",
+     "-d",
+     default=None,
+     required=False,
+     help=f"Destination folder to save the dataset. Defaults to '{utils.PATH_DATASETS}/<dataset_name>'",
+ )
+ @click.option("--force", "-f", is_flag=True, default=False, help="Flag to enable force redownload")
+ @click.pass_obj
+ def data_download(cfg: Config, dataset_name: str, destination: Optional[click.Path], force: bool) -> Path:
+     """Download dataset from Hafnia platform"""
+
+     from hafnia.platform import datasets
+
+     try:
+         path_dataset = datasets.download_or_get_dataset_path(
+             dataset_name=dataset_name,
+             cfg=cfg,
+             path_datasets_folder=destination,
+             force_redownload=force,
+         )
+     except Exception:
+         raise click.ClickException(consts.ERROR_GET_RESOURCE)
+     return path_dataset
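The old `data` group is replaced by this `dataset` group, so the platform is now driven through `dataset ls` and `dataset download <name>`. A sketch of invoking the group in-process with click's test runner, assuming the active profile already holds valid platform credentials; the dataset name below is a placeholder:

from click.testing import CliRunner

from cli.config import Config
from cli.dataset_cmds import dataset

runner = CliRunner()

# Equivalent of "dataset ls" on the command line: list datasets on the Hafnia platform.
result = runner.invoke(dataset, ["ls"], obj=Config())
print(result.output)

# Equivalent of "dataset download <name> -d <folder> --force".
result = runner.invoke(
    dataset,
    ["download", "some-dataset", "--destination", "./data", "--force"],
    obj=Config(),
)
print(result.exit_code)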
cli/runc_cmds.py CHANGED
@@ -38,7 +38,7 @@ def runc():
  @click.pass_obj
  def launch_local(cfg: Config, exec_cmd: str, dataset: str, image_name: str) -> None:
      """Launch a job within the image."""
-     from hafnia.data.factory import download_or_get_dataset_path
+     from hafnia.platform.datasets import download_or_get_dataset_path

      is_local_dataset = "/" in dataset
      if is_local_dataset:
hafnia/data/__init__.py CHANGED
@@ -1,3 +1,3 @@
- from hafnia.data.factory import load_dataset
+ from hafnia.data.factory import get_dataset_path, load_dataset

- __all__ = ["load_dataset"]
+ __all__ = ["load_dataset", "get_dataset_path"]
hafnia/data/factory.py CHANGED
@@ -1,67 +1,23 @@
  import os
- import shutil
  from pathlib import Path
- from typing import Optional, Union
+ from typing import Any

- from datasets import Dataset, DatasetDict, load_from_disk
-
- from cli.config import Config
  from hafnia import utils
- from hafnia.log import user_logger
- from hafnia.platform import download_resource, get_dataset_id
-
-
- def load_local(dataset_path: Path) -> Union[Dataset, DatasetDict]:
-     """Load a Hugging Face dataset from a local directory path."""
-     if not dataset_path.exists():
-         raise ValueError(f"Can not load dataset, directory does not exist -- {dataset_path}")
-     user_logger.info(f"Loading data from {dataset_path.as_posix()}")
-     return load_from_disk(dataset_path.as_posix())
-
-
- def download_or_get_dataset_path(
-     dataset_name: str,
-     cfg: Optional[Config] = None,
-     output_dir: Optional[str] = None,
-     force_redownload: bool = False,
- ) -> Path:
-     """Download or get the path of the dataset."""
-
-     cfg = cfg or Config()
-     endpoint_dataset = cfg.get_platform_endpoint("datasets")
-     api_key = cfg.api_key
+ from hafnia.dataset.hafnia_dataset import HafniaDataset, get_or_create_dataset_path_from_recipe

-     output_dir = output_dir or str(utils.PATH_DATASET)
-     dataset_path_base = Path(output_dir).absolute() / dataset_name
-     dataset_path_base.mkdir(exist_ok=True, parents=True)
-     dataset_path_sample = dataset_path_base / "sample"

-     if dataset_path_sample.exists() and not force_redownload:
-         user_logger.info("Dataset found locally. Set 'force=True' or add `--force` flag with cli to re-download")
-         return dataset_path_sample
+ def load_dataset(recipe: Any, force_redownload: bool = False) -> HafniaDataset:
+     """Load a dataset either from a local path or from the Hafnia platform."""

-     dataset_id = get_dataset_id(dataset_name, endpoint_dataset, api_key)
-     dataset_access_info_url = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"
+     path_dataset = get_dataset_path(recipe, force_redownload=force_redownload)
+     dataset = HafniaDataset.from_path(path_dataset)
+     return dataset

-     if force_redownload and dataset_path_sample.exists():
-         # Remove old files to avoid old files conflicting with new files
-         shutil.rmtree(dataset_path_sample, ignore_errors=True)
-     status = download_resource(dataset_access_info_url, str(dataset_path_base), api_key)
-     if status:
-         return dataset_path_sample
-     raise RuntimeError("Failed to download dataset")

+ def get_dataset_path(recipe: Any, force_redownload: bool = False) -> Path:
+     if utils.is_hafnia_cloud_job():
+         return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))

- def load_dataset(dataset_name: str, force_redownload: bool = False) -> Union[Dataset, DatasetDict]:
-     """Load a dataset either from a local path or from the Hafnia platform."""
+     path_dataset = get_or_create_dataset_path_from_recipe(recipe, force_redownload=force_redownload)

-     if utils.is_remote_job():
-         path_dataset = Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
-         return load_local(path_dataset)
-
-     path_dataset = download_or_get_dataset_path(
-         dataset_name=dataset_name,
-         force_redownload=force_redownload,
-     )
-     dataset = load_local(path_dataset)
-     return dataset
+     return path_dataset
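`load_dataset` no longer returns a Hugging Face `Dataset`/`DatasetDict`; it resolves a recipe to a local path and wraps it in a `HafniaDataset`, while `get_dataset_path` exposes just the path. A sketch of the new call pattern, assuming a plain dataset name is an accepted recipe form (resolution happens in `get_or_create_dataset_path_from_recipe`, which is not part of this file) and that "mnist" stands in for a real dataset:

from hafnia.data import get_dataset_path, load_dataset

# Resolve (and download if missing) the dataset, then load it as a HafniaDataset.
dataset = load_dataset("mnist", force_redownload=False)

# Or resolve only the local path, e.g. when other tooling reads the files directly.
path_dataset = get_dataset_path("mnist")
print(path_dataset)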
hafnia/dataset/dataset_helpers.py ADDED
@@ -0,0 +1,91 @@
+ import io
+ import math
+ import random
+ from pathlib import Path
+ from typing import Dict, List
+
+ import numpy as np
+ import xxhash
+ from PIL import Image
+
+
+ def create_split_name_list_from_ratios(split_ratios: Dict[str, float], n_items: int, seed: int = 42) -> List[str]:
+     samples_per_split = split_sizes_from_ratios(split_ratios=split_ratios, n_items=n_items)
+
+     split_name_column = []
+     for split_name, n_split_samples in samples_per_split.items():
+         split_name_column.extend([split_name] * n_split_samples)
+     random.Random(seed).shuffle(split_name_column)  # Shuffle the split names
+
+     return split_name_column
+
+
+ def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
+     hasher = xxhash.xxh3_64()
+
+     with open(path, "rb") as f:
+         for chunk in iter(lambda: f.read(chunk_size), b""):  # 8192, 16384, 32768, 65536
+             hasher.update(chunk)
+     return hasher.hexdigest()
+
+
+ def hash_from_bytes(data: bytes) -> str:
+     hasher = xxhash.xxh3_64()
+     hasher.update(data)
+     return hasher.hexdigest()
+
+
+ def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
+     pil_image = Image.fromarray(image)
+     buffer = io.BytesIO()
+     pil_image.save(buffer, format="PNG")
+     hash_value = hash_from_bytes(buffer.getvalue())
+     path_image = Path(path_folder) / f"{hash_value}.png"
+     pil_image.save(path_image)
+     return path_image
+
+
+ def filename_as_hash_from_path(path_image: Path) -> str:
+     hash = hash_file_xxhash(path_image)
+     return f"{hash}{path_image.suffix}"
+
+
+ def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dict[str, int]:
+     summed_ratios = sum(split_ratios.values())
+     abs_tols = 0.0011  # Allow some tolerance for floating point errors {"test": 0.333, "val": 0.333, "train": 0.333}
+     if not math.isclose(summed_ratios, 1.0, abs_tol=abs_tols):  # Allow tolerance to allow e.g. (0.333, 0.333, 0.333)
+         raise ValueError(f"Split ratios must sum to 1.0. The summed values of {split_ratios} is {summed_ratios}")
+
+     # recaculate split sizes
+     split_ratios = {split_name: split_ratio / summed_ratios for split_name, split_ratio in split_ratios.items()}
+     split_sizes = {split_name: int(n_items * split_ratio) for split_name, split_ratio in split_ratios.items()}
+
+     remaining_items = n_items - sum(split_sizes.values())
+     if remaining_items > 0:  # Distribute remaining items evenly across splits
+         for _ in range(remaining_items):
+             # Select name by the largest error from the expected distribution
+             total_size = sum(split_sizes.values())
+             distribution_error = {
+                 split_name: abs(split_ratios[split_name] - (size / total_size))
+                 for split_name, size in split_sizes.items()
+             }
+
+             split_with_largest_error = sorted(distribution_error.items(), key=lambda x: x[1], reverse=True)[0][0]
+             split_sizes[split_with_largest_error] += 1
+
+     if sum(split_sizes.values()) != n_items:
+         raise ValueError("Something is wrong. The split sizes do not match the number of items.")
+
+     return split_sizes
+
+
+ def select_evenly_across_list(lst: list, num_samples: int):
+     if num_samples >= len(lst):
+         return lst  # No need to sample
+     step = (len(lst) - 1) / (num_samples - 1)
+     indices = [int(round(step * i)) for i in range(num_samples)]  # noqa: RUF046
+     return [lst[index] for index in indices]
+
+
+ def prefix_dict(d: dict, prefix: str) -> dict:
+     return {f"{prefix}.{k}": v for k, v in d.items()}
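`split_sizes_from_ratios` normalizes the ratios, rounds each split down, and then hands leftover items to whichever split is furthest below its target, so the sizes always sum to `n_items`. A small worked sketch using the helpers above:

from hafnia.dataset.dataset_helpers import (
    create_split_name_list_from_ratios,
    split_sizes_from_ratios,
)

# 0.333 + 0.333 + 0.333 = 0.999 is within the tolerance and gets renormalized.
ratios = {"train": 0.333, "validation": 0.333, "test": 0.333}

sizes = split_sizes_from_ratios(n_items=10, split_ratios=ratios)
print(sizes)                # e.g. {"train": 4, "validation": 3, "test": 3}
print(sum(sizes.values()))  # 10, always equal to n_items

# Per-sample split labels, shuffled deterministically by the seed.
split_names = create_split_name_list_from_ratios(split_ratios=ratios, n_items=10, seed=42)
print(len(split_names))     # 10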
hafnia/dataset/dataset_names.py ADDED
@@ -0,0 +1,72 @@
+ from enum import Enum
+ from typing import List
+
+ FILENAME_RECIPE_JSON = "recipe.json"
+ FILENAME_DATASET_INFO = "dataset_info.json"
+ FILENAME_ANNOTATIONS_JSONL = "annotations.jsonl"
+ FILENAME_ANNOTATIONS_PARQUET = "annotations.parquet"
+
+ DATASET_FILENAMES_REQUIRED = [
+     FILENAME_DATASET_INFO,
+     FILENAME_ANNOTATIONS_JSONL,
+     FILENAME_ANNOTATIONS_PARQUET,
+ ]
+
+
+ class DeploymentStage(Enum):
+     STAGING = "staging"
+     PRODUCTION = "production"
+
+
+ class FieldName:
+     CLASS_NAME: str = "class_name"  # Name of the class this primitive is associated with, e.g. "car" for Bbox
+     CLASS_IDX: str = (
+         "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class
+     )
+     OBJECT_ID: str = "object_id"  # Unique identifier for the object, e.g. "12345123"
+     CONFIDENCE: str = "confidence"  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
+
+     META: str = "meta"  # Contains metadata about each primitive, e.g. attributes color, occluded, iscrowd, etc.
+     TASK_NAME: str = "task_name"  # Name of the task this primitive is associated with, e.g. "bboxes" for Bbox
+
+     @staticmethod
+     def fields() -> List[str]:
+         """
+         Returns a list of expected field names for primitives.
+         """
+         return [
+             FieldName.CLASS_NAME,
+             FieldName.CLASS_IDX,
+             FieldName.OBJECT_ID,
+             FieldName.CONFIDENCE,
+             FieldName.META,
+             FieldName.TASK_NAME,
+         ]
+
+
+ class ColumnName:
+     SAMPLE_INDEX: str = "sample_index"
+     FILE_NAME: str = "file_name"
+     HEIGHT: str = "height"
+     WIDTH: str = "width"
+     SPLIT: str = "split"
+     IS_SAMPLE: str = "is_sample"
+     REMOTE_PATH: str = "remote_path"  # Path to the file in remote storage, e.g. S3
+     META: str = "meta"
+
+
+ class SplitName:
+     TRAIN = "train"
+     VAL = "validation"
+     TEST = "test"
+     UNDEFINED = "UNDEFINED"
+
+     @staticmethod
+     def valid_splits() -> List[str]:
+         return [SplitName.TRAIN, SplitName.VAL, SplitName.TEST]
+
+
+ class DatasetVariant(Enum):
+     DUMP = "dump"
+     SAMPLE = "sample"
+     HIDDEN = "hidden"
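These constants centralize the file, column, field, and split names used by the new dataset modules. A small sketch of using them to sanity-check a sample row; the row itself is made up for illustration:

from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName, SplitName

row = {  # hypothetical annotation row
    ColumnName.FILE_NAME: "0a1b2c3d4e5f6789.png",
    ColumnName.HEIGHT: 480,
    ColumnName.WIDTH: 640,
    ColumnName.SPLIT: SplitName.TRAIN,
    ColumnName.IS_SAMPLE: True,
}

assert row[ColumnName.SPLIT] in SplitName.valid_splits()
print(DATASET_FILENAMES_REQUIRED)  # files every exported dataset folder is expected to contain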