hafnia 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ import os
  import shutil
  from dataclasses import dataclass
  from pathlib import Path
+ from random import Random
  from typing import Any, Dict, List, Optional, Type, Union

  import more_itertools
@@ -16,16 +17,23 @@ from rich import print as rprint
  from rich.table import Table
  from tqdm import tqdm

- from hafnia.dataset import dataset_helpers, dataset_transformation
+ from hafnia.dataset import dataset_helpers
  from hafnia.dataset.dataset_names import (
-     DATASET_FILENAMES,
+     DATASET_FILENAMES_REQUIRED,
      FILENAME_ANNOTATIONS_JSONL,
      FILENAME_ANNOTATIONS_PARQUET,
      FILENAME_DATASET_INFO,
+     FILENAME_RECIPE_JSON,
      ColumnName,
      FieldName,
      SplitName,
  )
+ from hafnia.dataset.operations import dataset_stats, dataset_transformations
+ from hafnia.dataset.operations.table_transformations import (
+     check_image_paths,
+     create_primitive_table,
+     read_table_from_path,
+ )
  from hafnia.dataset.primitives import (
      PRIMITIVE_NAME_TO_TYPE,
      PRIMITIVE_TYPES,
@@ -35,11 +43,6 @@ from hafnia.dataset.primitives.bitmask import Bitmask
  from hafnia.dataset.primitives.classification import Classification
  from hafnia.dataset.primitives.polygon import Polygon
  from hafnia.dataset.primitives.primitive import Primitive
- from hafnia.dataset.table_transformations import (
-     check_image_paths,
-     create_primitive_table,
-     read_table_from_path,
- )
  from hafnia.log import user_logger


@@ -171,13 +174,33 @@ class HafniaDataset:
          for row in self.samples.iter_rows(named=True):
              yield row

-     # Dataset transformations
-     apply_image_transform = dataset_transformation.transform_images
-     sample = dataset_transformation.sample
-     shuffle = dataset_transformation.shuffle_dataset
-     split_by_ratios = dataset_transformation.splits_by_ratios
-     divide_split_into_multiple_splits = dataset_transformation.divide_split_into_multiple_splits
-     sample_set_by_size = dataset_transformation.define_sample_set_by_size
+     @staticmethod
+     def from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
+         HafniaDataset.check_dataset_path(path_folder, raise_error=True)
+
+         dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
+         table = read_table_from_path(path_folder)
+
+         # Convert from relative paths to absolute paths
+         table = table.with_columns(
+             pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
+         )
+         if check_for_images:
+             check_image_paths(table)
+         return HafniaDataset(samples=table, info=dataset_info)
+
+     @staticmethod
+     def from_name(name: str, force_redownload: bool = False, download_files: bool = True) -> "HafniaDataset":
+         """
+         Load a dataset by its name. The dataset must be registered in the Hafnia platform.
+         """
+         from hafnia.dataset.hafnia_dataset import HafniaDataset
+         from hafnia.platform.datasets import download_or_get_dataset_path
+
+         dataset_path = download_or_get_dataset_path(
+             dataset_name=name, force_redownload=force_redownload, download_files=download_files
+         )
+         return HafniaDataset.from_path(dataset_path, check_for_images=download_files)

      @staticmethod
      def from_samples_list(samples_list: List, info: DatasetInfo) -> "HafniaDataset":
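
The renamed loaders above replace the old `read_from_path` static method. A minimal usage sketch (the local path and the dataset name "mnist" are placeholders, not names taken from this diff):

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

# Load a dataset folder that already follows the Hafnia layout (dataset info + annotation files).
dataset = HafniaDataset.from_path(Path("path/to/dataset"), check_for_images=True)

# Or resolve a dataset registered on the Hafnia platform; download_files=False fetches only the
# annotation files, skips the image download, and therefore also skips the image-path check.
dataset_by_name = HafniaDataset.from_name("mnist", download_files=False)
```
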
@@ -194,6 +217,140 @@ class HafniaDataset:

          return HafniaDataset(info=info, samples=table)

+     @staticmethod
+     def from_recipe(dataset_recipe: Any) -> "HafniaDataset":
+         """
+         Load a dataset from a recipe. The recipe can be a string (name of the dataset), a dictionary, or a DatasetRecipe object.
+         """
+         from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe
+
+         recipe_explicit = DatasetRecipe.from_implicit_form(dataset_recipe)
+
+         return recipe_explicit.build()  # Build dataset from the recipe
+
+     @staticmethod
+     def from_merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+         return HafniaDataset.merge(dataset0, dataset1)
+
+     @staticmethod
+     def from_recipe_with_cache(
+         dataset_recipe: Any,
+         force_redownload: bool = False,
+         path_datasets: Optional[Union[Path, str]] = None,
+     ) -> "HafniaDataset":
+         """
+         Loads a dataset from a recipe and caches it to disk.
+         If the dataset is already cached, it will be loaded from the cache.
+         """
+
+         path_dataset = get_or_create_dataset_path_from_recipe(dataset_recipe, path_datasets=path_datasets)
+         return HafniaDataset.from_path(path_dataset, check_for_images=False)
+
+     @staticmethod
+     def from_merger(
+         datasets: List[HafniaDataset],
+     ) -> "HafniaDataset":
+         """
+         Merges multiple Hafnia datasets into one.
+         """
+         if len(datasets) == 0:
+             raise ValueError("No datasets to merge. Please provide at least one dataset.")
+
+         if len(datasets) == 1:
+             return datasets[0]
+
+         merged_dataset = datasets[0]
+         remaining_datasets = datasets[1:]
+         for dataset in remaining_datasets:
+             merged_dataset = HafniaDataset.merge(merged_dataset, dataset)
+         return merged_dataset
+
+     # Dataset transformations
+     transform_images = dataset_transformations.transform_images
+
+     def shuffle(dataset: HafniaDataset, seed: int = 42) -> HafniaDataset:
+         table = dataset.samples.sample(n=len(dataset), with_replacement=False, seed=seed, shuffle=True)
+         return dataset.update_table(table)
+
+     def select_samples(
+         dataset: "HafniaDataset", n_samples: int, shuffle: bool = True, seed: int = 42, with_replacement: bool = False
+     ) -> "HafniaDataset":
+         if not with_replacement:
+             n_samples = min(n_samples, len(dataset))
+         table = dataset.samples.sample(n=n_samples, with_replacement=with_replacement, seed=seed, shuffle=shuffle)
+         return dataset.update_table(table)
+
+     def splits_by_ratios(dataset: "HafniaDataset", split_ratios: Dict[str, float], seed: int = 42) -> "HafniaDataset":
+         """
+         Divides the dataset into splits based on the provided ratios.
+
+         Example: Defining split ratios and applying the transformation
+
+         >>> dataset = HafniaDataset.from_path(Path("path/to/dataset"))
+         >>> split_ratios = {SplitName.TRAIN: 0.8, SplitName.VAL: 0.1, SplitName.TEST: 0.1}
+         >>> dataset_with_splits = splits_by_ratios(dataset, split_ratios, seed=42)
+         Or use the function as a method on the dataset:
+         >>> dataset_with_splits = dataset.splits_by_ratios(split_ratios, seed=42)
+         """
+         n_items = len(dataset)
+         split_name_column = dataset_helpers.create_split_name_list_from_ratios(
+             split_ratios=split_ratios, n_items=n_items, seed=seed
+         )
+         table = dataset.samples.with_columns(pl.Series(split_name_column).alias("split"))
+         return dataset.update_table(table)
+
+     def split_into_multiple_splits(
+         dataset: "HafniaDataset",
+         split_name: str,
+         split_ratios: Dict[str, float],
+     ) -> "HafniaDataset":
+         """
+         Divides a dataset split ('split_name') into multiple splits based on the provided split
+         ratios ('split_ratios'). This is especially useful for open datasets that provide only
+         two splits or provide annotations for only two splits. This function allows you to create
+         additional splits based on the provided ratios.
+
+         Example: Defining split ratios and applying the transformation
+         >>> dataset = HafniaDataset.from_path(Path("path/to/dataset"))
+         >>> split_name = SplitName.TEST
+         >>> split_ratios = {SplitName.TEST: 0.8, SplitName.VAL: 0.2}
+         >>> dataset_with_splits = split_into_multiple_splits(dataset, split_name, split_ratios)
+         """
+         dataset_split_to_be_divided = dataset.create_split_dataset(split_name=split_name)
+         if len(dataset_split_to_be_divided) == 0:
+             split_counts = dict(dataset.samples.select(pl.col(ColumnName.SPLIT).value_counts()).iter_rows())
+             raise ValueError(f"No samples in the '{split_name}' split to divide into multiple splits. {split_counts=}")
+         assert len(dataset_split_to_be_divided) > 0, f"No samples in the '{split_name}' split!"
+         dataset_split_to_be_divided = dataset_split_to_be_divided.splits_by_ratios(split_ratios=split_ratios, seed=42)
+
+         remaining_data = dataset.samples.filter(pl.col(ColumnName.SPLIT).is_in([split_name]).not_())
+         new_table = pl.concat([remaining_data, dataset_split_to_be_divided.samples], how="vertical")
+         dataset_new = dataset.update_table(new_table)
+         return dataset_new
+
+     def define_sample_set_by_size(dataset: "HafniaDataset", n_samples: int, seed: int = 42) -> "HafniaDataset":
+         is_sample_indices = Random(seed).sample(range(len(dataset)), n_samples)
+         is_sample_column = [False for _ in range(len(dataset))]
+         for idx in is_sample_indices:
+             is_sample_column[idx] = True
+
+         table = dataset.samples.with_columns(pl.Series(is_sample_column).alias("is_sample"))
+         return dataset.update_table(table)
+
+     def merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+         """
+         Merges two Hafnia datasets by concatenating their samples and updating the split names.
+         """
+         ## Currently, only a very naive merging is implemented.
+         # In the future we need to verify that the classes and tasks are compatible.
+         # Do they have similar classes and tasks? What to do if they don't?
+         # For now, we just concatenate the samples and keep the split names as they are.
+         merged_samples = pl.concat([dataset0.samples, dataset1.samples], how="vertical")
+         return dataset0.update_table(merged_samples)
+
+     # Dataset stats
+     split_counts = dataset_stats.split_counts
+
      def as_dict_dataset_splits(self) -> Dict[str, "HafniaDataset"]:
          if ColumnName.SPLIT not in self.samples.columns:
              raise ValueError(f"Dataset must contain a '{ColumnName.SPLIT}' column.")
@@ -256,21 +413,6 @@ class HafniaDataset:

          return True

-     @staticmethod
-     def read_from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
-         HafniaDataset.check_dataset_path(path_folder, raise_error=True)
-
-         dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
-         table = read_table_from_path(path_folder)
-
-         # Convert from relative paths to absolute paths
-         table = table.with_columns(
-             pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
-         )
-         if check_for_images:
-             check_image_paths(table)
-         return HafniaDataset(samples=table, info=dataset_info)
-
      def write(self, path_folder: Path, name_by_hash: bool = True, add_version: bool = False) -> None:
          user_logger.info(f"Writing dataset to {path_folder}...")
          if not path_folder.exists():
@@ -303,7 +445,7 @@ class HafniaDataset:
          if add_version:
              path_version = path_folder / "versions" / f"{self.info.version}"
              path_version.mkdir(parents=True, exist_ok=True)
-             for filename in DATASET_FILENAMES:
+             for filename in DATASET_FILENAMES_REQUIRED:
                  shutil.copy2(path_folder / filename, path_version / filename)

      def __eq__(self, value) -> bool:
@@ -363,10 +505,39 @@ class HafniaDataset:


  def check_hafnia_dataset_from_path(path_dataset: Path) -> None:
-     dataset = HafniaDataset.read_from_path(path_dataset, check_for_images=True)
+     dataset = HafniaDataset.from_path(path_dataset, check_for_images=True)
      check_hafnia_dataset(dataset)


+ def get_or_create_dataset_path_from_recipe(
+     dataset_recipe: Any,
+     force_redownload: bool = False,
+     path_datasets: Optional[Union[Path, str]] = None,
+ ) -> Path:
+     from hafnia.dataset.dataset_recipe.dataset_recipe import (
+         DatasetRecipe,
+         get_dataset_path_from_recipe,
+     )
+
+     recipe: DatasetRecipe = DatasetRecipe.from_implicit_form(dataset_recipe)
+     path_dataset = get_dataset_path_from_recipe(recipe, path_datasets=path_datasets)
+
+     if force_redownload:
+         shutil.rmtree(path_dataset, ignore_errors=True)
+
+     if HafniaDataset.check_dataset_path(path_dataset, raise_error=False):
+         return path_dataset
+
+     path_dataset.mkdir(parents=True, exist_ok=True)
+     path_recipe_json = path_dataset / FILENAME_RECIPE_JSON
+     path_recipe_json.write_text(recipe.model_dump_json(indent=4))
+
+     dataset: HafniaDataset = recipe.build()
+     dataset.write(path_dataset)
+
+     return path_dataset
+
+
  def check_hafnia_dataset(dataset: HafniaDataset):
      user_logger.info("Checking Hafnia dataset...")
      assert isinstance(dataset.info.version, str) and len(dataset.info.version) > 0
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Dict
+
+ from hafnia.dataset.dataset_names import ColumnName
+
+ if TYPE_CHECKING:
+     from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+ def split_counts(dataset: HafniaDataset) -> Dict[str, int]:
+     """
+     Returns a dictionary with the counts of samples in each split of the dataset.
+     """
+     return dict(dataset.samples[ColumnName.SPLIT].value_counts().iter_rows())
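
A small sketch of the new stats helper in use; it is also wired up as the `HafniaDataset.split_counts` method above (here `dataset` is assumed to be an already loaded `HafniaDataset`):

```python
from hafnia.dataset.operations import dataset_stats

counts = dataset_stats.split_counts(dataset)  # equivalent to dataset.split_counts()
print(counts)  # e.g. {"train": 8000, "val": 1000, "test": 1000}
```
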
@@ -0,0 +1,82 @@
+ """
+ Hafnia dataset transformations that take and return a HafniaDataset object.
+
+ Every function here has a corresponding method on the HafniaDataset class and a corresponding
+ RecipeTransform class in the `data_recipe/recipe_transformations.py` file.
+
+ This allows each function to be used in three ways:
+
+ ```python
+ from hafnia.dataset.operations import dataset_transformations
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
+ from hafnia.dataset.data_recipe.recipe_transformations import SplitByRatios
+
+ splits_by_ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
+
+ # Option 1: Using the function directly
+ dataset = dataset_transformations.splits_by_ratios(dataset, split_ratios=splits_by_ratios)
+
+ # Option 2: Using the method of the HafniaDataset class
+ dataset = dataset.splits_by_ratios(split_ratios=splits_by_ratios)
+
+ # Option 3: Using the RecipeTransform class
+ serializable_transform = SplitByRatios(split_ratios=splits_by_ratios)
+ dataset = serializable_transform(dataset)
+ ```
+
+ Tests ensure that every function in this file has a corresponding method on the HafniaDataset
+ class and a RecipeTransform class in the `data_recipe/recipe_transformations.py` file, and that
+ the signatures match.
+ """
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Callable
+
+ import cv2
+ import numpy as np
+ import polars as pl
+ from PIL import Image
+ from tqdm import tqdm
+
+ from hafnia.dataset import dataset_helpers
+
+ if TYPE_CHECKING:
+     from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+ ### Image transformations ###
+ class AnonymizeByPixelation:
+     def __init__(self, resize_factor: float = 0.10):
+         self.resize_factor = resize_factor
+
+     def __call__(self, frame: np.ndarray) -> np.ndarray:
+         org_size = frame.shape[:2]
+         frame = cv2.resize(frame, (0, 0), fx=self.resize_factor, fy=self.resize_factor)
+         frame = cv2.resize(frame, org_size[::-1], interpolation=cv2.INTER_NEAREST)
+         return frame
+
+
+ def transform_images(
+     dataset: "HafniaDataset",
+     transform: Callable[[np.ndarray], np.ndarray],
+     path_output: Path,
+ ) -> "HafniaDataset":
+     new_paths = []
+     path_image_folder = path_output / "data"
+     path_image_folder.mkdir(parents=True, exist_ok=True)
+
+     for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
+         org_path = Path(org_path)
+         if not org_path.exists():
+             raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
+
+         image = np.array(Image.open(org_path))
+         image_transformed = transform(image)
+         new_path = dataset_helpers.save_image_with_hash_name(image_transformed, path_image_folder)
+
+         if not new_path.exists():
+             raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
+         new_paths.append(str(new_path))
+
+     table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
+     return dataset.update_table(table)
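
A hedged example of the image pipeline this module adds: pixelate every image and write the transformed copies to a new folder. The paths are placeholders, and `transform_images` is also available as the `HafniaDataset.transform_images` method registered above:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset
from hafnia.dataset.operations.dataset_transformations import AnonymizeByPixelation, transform_images

dataset = HafniaDataset.from_path(Path("path/to/dataset"))  # placeholder path
anonymized = transform_images(
    dataset,
    transform=AnonymizeByPixelation(resize_factor=0.05),  # stronger pixelation than the 0.10 default
    path_output=Path("path/to/anonymized"),  # transformed images are written to <path_output>/data
)
anonymized.write(Path("path/to/anonymized"))  # persist the updated annotation table next to the images
```
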
@@ -4,12 +4,12 @@ from typing import List, Optional, Type
  import polars as pl
  from tqdm import tqdm

- from hafnia.dataset import table_transformations
  from hafnia.dataset.dataset_names import (
      FILENAME_ANNOTATIONS_JSONL,
      FILENAME_ANNOTATIONS_PARQUET,
      FieldName,
  )
+ from hafnia.dataset.operations import table_transformations
  from hafnia.dataset.primitives import PRIMITIVE_TYPES
  from hafnia.dataset.primitives.classification import Classification
  from hafnia.dataset.primitives.primitive import Primitive
@@ -14,7 +14,7 @@ from pydantic import BaseModel, field_validator
  from hafnia.data.factory import load_dataset
  from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.log import sys_logger, user_logger
- from hafnia.utils import is_remote_job, now_as_str
+ from hafnia.utils import is_hafnia_cloud_job, now_as_str


  class EntityType(Enum):
@@ -101,7 +101,7 @@ class HafniaLogger:

      def path_local_experiment(self) -> Path:
          """Get the path for local experiment."""
-         if is_remote_job():
+         if is_hafnia_cloud_job():
              raise RuntimeError("Cannot access local experiment path in remote job.")
          return self._local_experiment_path

@@ -110,7 +110,7 @@ class HafniaLogger:
          if "MDI_CHECKPOINT_DIR" in os.environ:
              return Path(os.environ["MDI_CHECKPOINT_DIR"])

-         if is_remote_job():
+         if is_hafnia_cloud_job():
              return Path("/opt/ml/checkpoints")
          return self.path_local_experiment() / "checkpoints"

@@ -119,7 +119,7 @@ class HafniaLogger:
          if "MDI_ARTIFACT_DIR" in os.environ:
              return Path(os.environ["MDI_ARTIFACT_DIR"])

-         if is_remote_job():
+         if is_hafnia_cloud_job():
              return Path("/opt/ml/output/data")

          return self.path_local_experiment() / "data"
@@ -129,7 +129,7 @@ class HafniaLogger:
          if "MDI_MODEL_DIR" in os.environ:
              return Path(os.environ["MDI_MODEL_DIR"])

-         if is_remote_job():
+         if is_hafnia_cloud_job():
              return Path("/opt/ml/model")

          return self.path_local_experiment() / "model"
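
The rename from `is_remote_job` to `is_hafnia_cloud_job` is mechanical, but the resolution order around it is worth spelling out: each path helper prefers an explicit `MDI_*` environment override, then falls back to a fixed `/opt/ml/...` location when `HAFNIA_CLOUD=true`, and otherwise uses the local experiment folder. A standalone illustration of that order (a sketch, not part of the hafnia API):

```python
import os
from pathlib import Path


def resolve_checkpoint_dir(local_experiment_path: Path) -> Path:
    """Mirrors the HafniaLogger.path_checkpoints() resolution order shown above (illustration only)."""
    if "MDI_CHECKPOINT_DIR" in os.environ:  # explicit override wins
        return Path(os.environ["MDI_CHECKPOINT_DIR"])
    if os.getenv("HAFNIA_CLOUD", "false").lower() == "true":  # what is_hafnia_cloud_job() checks
        return Path("/opt/ml/checkpoints")  # fixed location on the Hafnia cloud
    return local_experiment_path / "checkpoints"  # local fallback
```
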
hafnia/helper_testing.py CHANGED
@@ -1,4 +1,7 @@
+ from inspect import getmembers, isfunction, signature
  from pathlib import Path
+ from types import FunctionType
+ from typing import Any, Callable, Dict, Union, get_origin

  from hafnia import utils
  from hafnia.dataset.dataset_names import FILENAME_ANNOTATIONS_JSONL, DatasetVariant
@@ -38,8 +41,8 @@ def get_path_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Path
      if path_test_dataset_annotations.exists() and not force_update:
          return path_test_dataset

-     hafnia_dataset = HafniaDataset.read_from_path(path_dataset / DatasetVariant.SAMPLE.value)
-     hafnia_dataset = hafnia_dataset.sample(n_samples=3, seed=42)
+     hafnia_dataset = HafniaDataset.from_path(path_dataset / DatasetVariant.SAMPLE.value)
+     hafnia_dataset = hafnia_dataset.select_samples(n_samples=3, seed=42)
      hafnia_dataset.write(path_test_dataset)

      if force_update:
@@ -59,5 +62,47 @@ def get_sample_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Sa

  def get_micro_hafnia_dataset(dataset_name: str, force_update: bool = False) -> HafniaDataset:
      path_dataset = get_path_micro_hafnia_dataset(dataset_name=dataset_name, force_update=force_update)
-     hafnia_dataset = HafniaDataset.read_from_path(path_dataset)
+     hafnia_dataset = HafniaDataset.from_path(path_dataset)
      return hafnia_dataset
+
+
+ def is_hafnia_configured() -> bool:
+     """
+     Check if Hafnia is configured by verifying if the API key is set.
+     """
+     from cli.config import Config
+
+     return Config().is_configured()
+
+
+ def is_typing_type(annotation: Any) -> bool:
+     return get_origin(annotation) is not None
+
+
+ def annotation_as_string(annotation: Union[type, str]) -> str:
+     """Convert type annotation to string."""
+     if isinstance(annotation, str):
+         return annotation.replace("'", "")
+     if is_typing_type(annotation):  # Is using typing types like List, Dict, etc.
+         return str(annotation).replace("typing.", "")
+     if hasattr(annotation, "__name__"):
+         return annotation.__name__
+     return str(annotation)
+
+
+ def get_hafnia_functions_from_module(python_module) -> Dict[str, FunctionType]:
+     def dataset_is_first_arg(func: Callable) -> bool:
+         """
+         Check if the function has 'HafniaDataset' as the first parameter.
+         """
+         func_signature = signature(func)
+         params = func_signature.parameters
+         if len(params) == 0:
+             return False
+         first_argument_type = list(params.values())[0]
+
+         annotation_as_str = annotation_as_string(first_argument_type.annotation)
+         return annotation_as_str == "HafniaDataset"
+
+     functions = {func[0]: func[1] for func in getmembers(python_module, isfunction) if dataset_is_first_arg(func[1])}
+     return functions
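
These helpers back the parity tests promised in the `dataset_transformations` docstring: collect every function whose first parameter is annotated as `HafniaDataset` and compare the result against the class. A rough sketch of how such a test might use them (the assertion style is an assumption, not the package's actual test code):

```python
from hafnia.dataset.hafnia_dataset import HafniaDataset
from hafnia.dataset.operations import dataset_transformations
from hafnia.helper_testing import get_hafnia_functions_from_module

functions = get_hafnia_functions_from_module(dataset_transformations)
for name in functions:
    assert hasattr(HafniaDataset, name), f"HafniaDataset has no method named '{name}'"
```
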
@@ -10,10 +10,14 @@ from tqdm import tqdm

  from cli.config import Config
  from hafnia import utils
- from hafnia.dataset import dataset_names
+ from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
+ from hafnia.dataset.dataset_recipe.dataset_recipe import (
+     DatasetRecipe,
+     get_dataset_path_from_recipe,
+ )
  from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.http import fetch
- from hafnia.log import user_logger
+ from hafnia.log import sys_logger, user_logger
  from hafnia.platform import get_dataset_id
  from hafnia.platform.download import get_resource_credentials
  from hafnia.utils import timed
@@ -37,13 +41,11 @@ def download_or_get_dataset_path(
      cfg: Optional[Config] = None,
      path_datasets_folder: Optional[str] = None,
      force_redownload: bool = False,
+     download_files: bool = True,
  ) -> Path:
      """Download or get the path of the dataset."""
-     if utils.is_remote_job():
-         return Path(os.getenv("MDI_DATASET_DIR", "/opt/ml/input/data/training"))
-
-     path_datasets_folder = path_datasets_folder or str(utils.PATH_DATASETS)
-     path_dataset = Path(path_datasets_folder).absolute() / dataset_name
+     recipe_explicit = DatasetRecipe.from_implicit_form(dataset_name)
+     path_dataset = get_dataset_path_from_recipe(recipe_explicit, path_datasets=path_datasets_folder)

      is_dataset_valid = HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
      if is_dataset_valid and not force_redownload:
@@ -57,22 +59,30 @@ def download_or_get_dataset_path(

      endpoint_dataset = cfg.get_platform_endpoint("datasets")
      dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
+     if dataset_id is None:
+         sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
      access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"

      download_dataset_from_access_endpoint(
          endpoint=access_dataset_endpoint,
          api_key=api_key,
          path_dataset=path_dataset,
+         download_files=download_files,
      )
      return path_dataset


- def download_dataset_from_access_endpoint(endpoint: str, api_key: str, path_dataset: Path) -> None:
+ def download_dataset_from_access_endpoint(
+     endpoint: str,
+     api_key: str,
+     path_dataset: Path,
+     download_files: bool = True,
+ ) -> None:
      resource_credentials = get_resource_credentials(endpoint, api_key)

-     local_dataset_paths = [str(path_dataset / filename) for filename in dataset_names.DATASET_FILENAMES]
+     local_dataset_paths = [str(path_dataset / filename) for filename in DATASET_FILENAMES_REQUIRED]
      s3_uri = resource_credentials.s3_uri()
-     s3_dataset_files = [f"{s3_uri}/{filename}" for filename in dataset_names.DATASET_FILENAMES]
+     s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]

      envs = resource_credentials.aws_credentials()
      fast_copy_files_s3(
@@ -82,10 +92,13 @@ def download_dataset_from_access_endpoint(endpoint: str, api_key: str, path_data
          description="Downloading annotations",
      )

-     dataset = HafniaDataset.read_from_path(path_dataset, check_for_images=False)
+     if not download_files:
+         return
+
+     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
      fast_copy_files_s3(
-         src_paths=dataset.samples[dataset_names.ColumnName.REMOTE_PATH].to_list(),
-         dst_paths=dataset.samples[dataset_names.ColumnName.FILE_NAME].to_list(),
+         src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
+         dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
          append_envs=envs,
          description="Downloading images",
      )
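
With the new `download_files` flag, the annotation files can be fetched without pulling the much larger image files; `HafniaDataset.from_name` forwards the same flag. A sketch, assuming a dataset name that exists on the platform ("mnist" is a placeholder):

```python
from hafnia.platform.datasets import download_or_get_dataset_path

# Annotation files only; images are skipped and can be fetched later with download_files=True.
path_dataset = download_or_get_dataset_path("mnist", download_files=False)
print(path_dataset)
```
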
hafnia/utils.py CHANGED
@@ -1,3 +1,4 @@
+ import hashlib
  import os
  import time
  import zipfile
@@ -132,6 +133,24 @@ def show_recipe_content(recipe_path: Path, style: str = "emoji", depth_limit: in
      user_logger.info(f"Recipe size: {size_human_readable(os.path.getsize(recipe_path))}. Max size 800 MiB")


- def is_remote_job() -> bool:
+ def is_hafnia_cloud_job() -> bool:
      """Check if the current job is running in HAFNIA cloud environment."""
      return os.getenv("HAFNIA_CLOUD", "false").lower() == "true"
+
+
+ def pascal_to_snake_case(name: str) -> str:
+     """
+     Convert PascalCase to snake_case.
+     """
+     return "".join(["_" + char.lower() if char.isupper() else char for char in name]).lstrip("_")
+
+
+ def snake_to_pascal_case(name: str) -> str:
+     """
+     Convert snake_case to PascalCase.
+     """
+     return "".join(word.capitalize() for word in name.split("_"))
+
+
+ def hash_from_string(s: str) -> str:
+     return hashlib.md5(s.encode("utf-8")).hexdigest()
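
A quick sanity check of the new string helpers; the expected values follow directly from the implementations above:

```python
from hafnia.utils import hash_from_string, pascal_to_snake_case, snake_to_pascal_case

assert pascal_to_snake_case("SplitByRatios") == "split_by_ratios"
assert snake_to_pascal_case("split_by_ratios") == "SplitByRatios"
print(hash_from_string("anonymize"))  # deterministic 32-character md5 hex digest
```
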
@@ -175,7 +175,7 @@ def save_dataset_sample_set_visualizations(
      draw_settings: Optional[Dict[Type[Primitive], Dict]] = None,
      anonymize_settings: Optional[Dict[Type[Primitive], Dict]] = None,
  ) -> List[Path]:
-     dataset = HafniaDataset.read_from_path(path_dataset)
+     dataset = HafniaDataset.from_path(path_dataset)
      shutil.rmtree(path_output_folder, ignore_errors=True)
      path_output_folder.mkdir(parents=True)