hafnia-0.2.0-py3-none-any.whl → hafnia-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/config.py +17 -4
- hafnia/data/factory.py +13 -10
- hafnia/dataset/dataset_names.py +2 -1
- hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
- hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
- hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
- hafnia/dataset/hafnia_dataset.py +202 -31
- hafnia/dataset/operations/dataset_stats.py +15 -0
- hafnia/dataset/operations/dataset_transformations.py +82 -0
- hafnia/dataset/{table_transformations.py → operations/table_transformations.py} +1 -1
- hafnia/experiment/hafnia_logger.py +5 -5
- hafnia/helper_testing.py +48 -3
- hafnia/platform/datasets.py +26 -13
- hafnia/utils.py +20 -1
- hafnia/visualizations/image_visualizations.py +1 -1
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/METADATA +17 -20
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/RECORD +20 -16
- hafnia/dataset/dataset_transformation.py +0 -187
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.2.0.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/hafnia_dataset.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import shutil
 from dataclasses import dataclass
 from pathlib import Path
+from random import Random
 from typing import Any, Dict, List, Optional, Type, Union

 import more_itertools
@@ -16,16 +17,23 @@ from rich import print as rprint
 from rich.table import Table
 from tqdm import tqdm

-from hafnia.dataset import dataset_helpers
+from hafnia.dataset import dataset_helpers
 from hafnia.dataset.dataset_names import (
-
+    DATASET_FILENAMES_REQUIRED,
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
     FILENAME_DATASET_INFO,
+    FILENAME_RECIPE_JSON,
     ColumnName,
     FieldName,
     SplitName,
 )
+from hafnia.dataset.operations import dataset_stats, dataset_transformations
+from hafnia.dataset.operations.table_transformations import (
+    check_image_paths,
+    create_primitive_table,
+    read_table_from_path,
+)
 from hafnia.dataset.primitives import (
     PRIMITIVE_NAME_TO_TYPE,
     PRIMITIVE_TYPES,
@@ -35,11 +43,6 @@ from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.polygon import Polygon
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.dataset.table_transformations import (
-    check_image_paths,
-    create_primitive_table,
-    read_table_from_path,
-)
 from hafnia.log import user_logger

@@ -171,13 +174,33 @@ class HafniaDataset:
         for row in self.samples.iter_rows(named=True):
             yield row

-
-
-
-
-
-
-
+    @staticmethod
+    def from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
+        HafniaDataset.check_dataset_path(path_folder, raise_error=True)
+
+        dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
+        table = read_table_from_path(path_folder)
+
+        # Convert from relative paths to absolute paths
+        table = table.with_columns(
+            pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
+        )
+        if check_for_images:
+            check_image_paths(table)
+        return HafniaDataset(samples=table, info=dataset_info)
+
+    @staticmethod
+    def from_name(name: str, force_redownload: bool = False, download_files: bool = True) -> "HafniaDataset":
+        """
+        Load a dataset by its name. The dataset must be registered in the Hafnia platform.
+        """
+        from hafnia.dataset.hafnia_dataset import HafniaDataset
+        from hafnia.platform.datasets import download_or_get_dataset_path
+
+        dataset_path = download_or_get_dataset_path(
+            dataset_name=name, force_redownload=force_redownload, download_files=download_files
+        )
+        return HafniaDataset.from_path(dataset_path, check_for_images=download_files)

     @staticmethod
     def from_samples_list(samples_list: List, info: DatasetInfo) -> "HafniaDataset":
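The hunk above replaces `read_from_path` with `from_path` and adds `from_name`, which resolves a dataset registered on the Hafnia platform and downloads it before loading. A minimal usage sketch; the dataset name "mnist" and the local path are placeholders, not values taken from this diff:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

# Load a dataset that already exists on disk; relative image paths are made
# absolute and, optionally, checked for existence.
dataset = HafniaDataset.from_path(Path("path/to/dataset"), check_for_images=True)

# Fetch a dataset registered on the Hafnia platform by name. With
# download_files=False only annotations are fetched and the image check is skipped.
dataset = HafniaDataset.from_name("mnist", force_redownload=False, download_files=True)
```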
@@ -194,6 +217,140 @@ class HafniaDataset:

         return HafniaDataset(info=info, samples=table)

+    @staticmethod
+    def from_recipe(dataset_recipe: Any) -> "HafniaDataset":
+        """
+        Load a dataset from a recipe. The recipe can be a string (name of the dataset), a dictionary, or a DataRecipe object.
+        """
+        from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe
+
+        recipe_explicit = DatasetRecipe.from_implicit_form(dataset_recipe)
+
+        return recipe_explicit.build()  # Build dataset from the recipe
+
+    @staticmethod
+    def from_merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+        return HafniaDataset.merge(dataset0, dataset1)
+
+    @staticmethod
+    def from_recipe_with_cache(
+        dataset_recipe: Any,
+        force_redownload: bool = False,
+        path_datasets: Optional[Union[Path, str]] = None,
+    ) -> "HafniaDataset":
+        """
+        Loads a dataset from a recipe and caches it to disk.
+        If the dataset is already cached, it will be loaded from the cache.
+        """
+
+        path_dataset = get_or_create_dataset_path_from_recipe(dataset_recipe, path_datasets=path_datasets)
+        return HafniaDataset.from_path(path_dataset, check_for_images=False)
+
+    @staticmethod
+    def from_merger(
+        datasets: List[HafniaDataset],
+    ) -> "HafniaDataset":
+        """
+        Merges multiple Hafnia datasets into one.
+        """
+        if len(datasets) == 0:
+            raise ValueError("No datasets to merge. Please provide at least one dataset.")
+
+        if len(datasets) == 1:
+            return datasets[0]
+
+        merged_dataset = datasets[0]
+        remaining_datasets = datasets[1:]
+        for dataset in remaining_datasets:
+            merged_dataset = HafniaDataset.merge(merged_dataset, dataset)
+        return merged_dataset
+
+    # Dataset transformations
+    transform_images = dataset_transformations.transform_images
+
+    def shuffle(dataset: HafniaDataset, seed: int = 42) -> HafniaDataset:
+        table = dataset.samples.sample(n=len(dataset), with_replacement=False, seed=seed, shuffle=True)
+        return dataset.update_table(table)
+
+    def select_samples(
+        dataset: "HafniaDataset", n_samples: int, shuffle: bool = True, seed: int = 42, with_replacement: bool = False
+    ) -> "HafniaDataset":
+        if not with_replacement:
+            n_samples = min(n_samples, len(dataset))
+        table = dataset.samples.sample(n=n_samples, with_replacement=with_replacement, seed=seed, shuffle=shuffle)
+        return dataset.update_table(table)
+
+    def splits_by_ratios(dataset: "HafniaDataset", split_ratios: Dict[str, float], seed: int = 42) -> "HafniaDataset":
+        """
+        Divides the dataset into splits based on the provided ratios.
+
+        Example: Defining split ratios and applying the transformation
+
+        >>> dataset = HafniaDataset.read_from_path(Path("path/to/dataset"))
+        >>> split_ratios = {SplitName.TRAIN: 0.8, SplitName.VAL: 0.1, SplitName.TEST: 0.1}
+        >>> dataset_with_splits = splits_by_ratios(dataset, split_ratios, seed=42)
+        Or use the function as a
+        >>> dataset_with_splits = dataset.splits_by_ratios(split_ratios, seed=42)
+        """
+        n_items = len(dataset)
+        split_name_column = dataset_helpers.create_split_name_list_from_ratios(
+            split_ratios=split_ratios, n_items=n_items, seed=seed
+        )
+        table = dataset.samples.with_columns(pl.Series(split_name_column).alias("split"))
+        return dataset.update_table(table)
+
+    def split_into_multiple_splits(
+        dataset: "HafniaDataset",
+        split_name: str,
+        split_ratios: Dict[str, float],
+    ) -> "HafniaDataset":
+        """
+        Divides a dataset split ('split_name') into multiple splits based on the provided split
+        ratios ('split_ratios'). This is especially useful for some open datasets where they have only provide
+        two splits or only provide annotations for two splits. This function allows you to create additional
+        splits based on the provided ratios.
+
+        Example: Defining split ratios and applying the transformation
+        >>> dataset = HafniaDataset.read_from_path(Path("path/to/dataset"))
+        >>> split_name = SplitName.TEST
+        >>> split_ratios = {SplitName.TEST: 0.8, SplitName.VAL: 0.2}
+        >>> dataset_with_splits = split_into_multiple_splits(dataset, split_name, split_ratios)
+        """
+        dataset_split_to_be_divided = dataset.create_split_dataset(split_name=split_name)
+        if len(dataset_split_to_be_divided) == 0:
+            split_counts = dict(dataset.samples.select(pl.col(ColumnName.SPLIT).value_counts()).iter_rows())
+            raise ValueError(f"No samples in the '{split_name}' split to divide into multiple splits. {split_counts=}")
+        assert len(dataset_split_to_be_divided) > 0, f"No samples in the '{split_name}' split!"
+        dataset_split_to_be_divided = dataset_split_to_be_divided.splits_by_ratios(split_ratios=split_ratios, seed=42)
+
+        remaining_data = dataset.samples.filter(pl.col(ColumnName.SPLIT).is_in([split_name]).not_())
+        new_table = pl.concat([remaining_data, dataset_split_to_be_divided.samples], how="vertical")
+        dataset_new = dataset.update_table(new_table)
+        return dataset_new
+
+    def define_sample_set_by_size(dataset: "HafniaDataset", n_samples: int, seed: int = 42) -> "HafniaDataset":
+        is_sample_indices = Random(seed).sample(range(len(dataset)), n_samples)
+        is_sample_column = [False for _ in range(len(dataset))]
+        for idx in is_sample_indices:
+            is_sample_column[idx] = True
+
+        table = dataset.samples.with_columns(pl.Series(is_sample_column).alias("is_sample"))
+        return dataset.update_table(table)
+
+    def merge(dataset0: "HafniaDataset", dataset1: "HafniaDataset") -> "HafniaDataset":
+        """
+        Merges two Hafnia datasets by concatenating their samples and updating the split names.
+        """
+        ## Currently, only a very naive merging is implemented.
+        # In the future we need to verify that the class and tasks are compatible.
+        # Do they have similar classes and tasks? What to do if they don't?
+        # For now, we just concatenate the samples and keep the split names as they are.
+        merged_samples = pl.concat([dataset0.samples, dataset1.samples], how="vertical")
+        return dataset0.update_table(merged_samples)
+
+    # Dataset stats
+    split_counts = dataset_stats.split_counts
+
     def as_dict_dataset_splits(self) -> Dict[str, "HafniaDataset"]:
         if ColumnName.SPLIT not in self.samples.columns:
             raise ValueError(f"Dataset must contain a '{ColumnName.SPLIT}' column.")
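The hunk above also adds recipe-based constructors (`from_recipe`, `from_recipe_with_cache`, `from_merger`) and chainable transformations (`shuffle`, `select_samples`, `splits_by_ratios`, `split_into_multiple_splits`, `define_sample_set_by_size`, `merge`) directly on the class. A hedged sketch of how they compose; the path, split names, and ratios are illustrative:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

dataset = HafniaDataset.from_path(Path("path/to/dataset"))

# Seeded shuffling and subsampling return new HafniaDataset objects, so calls chain.
subset = dataset.shuffle(seed=42).select_samples(n_samples=100, seed=42)

# Assign splits by ratio, then carve the test split further into test/val.
dataset = dataset.splits_by_ratios(split_ratios={"train": 0.8, "val": 0.1, "test": 0.1}, seed=42)
dataset = dataset.split_into_multiple_splits(split_name="test", split_ratios={"test": 0.8, "val": 0.2})

# Naive merge: samples are concatenated and split names are kept as-is.
combined = HafniaDataset.from_merger([dataset, subset])
```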
@@ -256,21 +413,6 @@ class HafniaDataset:

         return True

-    @staticmethod
-    def read_from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
-        HafniaDataset.check_dataset_path(path_folder, raise_error=True)
-
-        dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
-        table = read_table_from_path(path_folder)
-
-        # Convert from relative paths to absolute paths
-        table = table.with_columns(
-            pl.concat_str([pl.lit(str(path_folder.absolute()) + os.sep), pl.col("file_name")]).alias("file_name")
-        )
-        if check_for_images:
-            check_image_paths(table)
-        return HafniaDataset(samples=table, info=dataset_info)
-
     def write(self, path_folder: Path, name_by_hash: bool = True, add_version: bool = False) -> None:
         user_logger.info(f"Writing dataset to {path_folder}...")
         if not path_folder.exists():
@@ -303,7 +445,7 @@ class HafniaDataset:
         if add_version:
             path_version = path_folder / "versions" / f"{self.info.version}"
             path_version.mkdir(parents=True, exist_ok=True)
-            for filename in …
+            for filename in DATASET_FILENAMES_REQUIRED:
                 shutil.copy2(path_folder / filename, path_version / filename)

     def __eq__(self, value) -> bool:
@@ -363,10 +505,39 @@ class HafniaDataset:


 def check_hafnia_dataset_from_path(path_dataset: Path) -> None:
-    dataset = HafniaDataset.…
+    dataset = HafniaDataset.from_path(path_dataset, check_for_images=True)
     check_hafnia_dataset(dataset)


+def get_or_create_dataset_path_from_recipe(
+    dataset_recipe: Any,
+    force_redownload: bool = False,
+    path_datasets: Optional[Union[Path, str]] = None,
+) -> Path:
+    from hafnia.dataset.dataset_recipe.dataset_recipe import (
+        DatasetRecipe,
+        get_dataset_path_from_recipe,
+    )
+
+    recipe: DatasetRecipe = DatasetRecipe.from_implicit_form(dataset_recipe)
+    path_dataset = get_dataset_path_from_recipe(recipe, path_datasets=path_datasets)
+
+    if force_redownload:
+        shutil.rmtree(path_dataset, ignore_errors=True)
+
+    if HafniaDataset.check_dataset_path(path_dataset, raise_error=False):
+        return path_dataset
+
+    path_dataset.mkdir(parents=True, exist_ok=True)
+    path_recipe_json = path_dataset / FILENAME_RECIPE_JSON
+    path_recipe_json.write_text(recipe.model_dump_json(indent=4))
+
+    dataset: HafniaDataset = recipe.build()
+    dataset.write(path_dataset)
+
+    return path_dataset
+
+
 def check_hafnia_dataset(dataset: HafniaDataset):
     user_logger.info("Checking Hafnia dataset...")
     assert isinstance(dataset.info.version, str) and len(dataset.info.version) > 0
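`get_or_create_dataset_path_from_recipe` gives recipes a disk cache: the recipe is normalized with `DatasetRecipe.from_implicit_form`, mapped to a deterministic folder, and the built dataset plus its recipe JSON are written there; later calls return the cached folder unchanged. A sketch of the intended flow, with a placeholder recipe string:

```python
from hafnia.dataset.hafnia_dataset import (
    HafniaDataset,
    get_or_create_dataset_path_from_recipe,
)

# First call builds the dataset from the recipe and writes it to the cache folder;
# subsequent calls find a valid dataset on disk and return the same path.
path_dataset = get_or_create_dataset_path_from_recipe("mnist")
dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)

# The convenience constructor added in this release wraps exactly this lookup:
dataset = HafniaDataset.from_recipe_with_cache("mnist")
```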
hafnia/dataset/operations/dataset_stats.py
ADDED
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict
+
+from hafnia.dataset.dataset_names import ColumnName
+
+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+def split_counts(dataset: HafniaDataset) -> Dict[str, int]:
+    """
+    Returns a dictionary with the counts of samples in each split of the dataset.
+    """
+    return dict(dataset.samples[ColumnName.SPLIT].value_counts().iter_rows())
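The new stats module currently holds a single helper, which is also bound on the class as `HafniaDataset.split_counts`. Assuming a loaded dataset as in the sketches above:

```python
from hafnia.dataset.operations import dataset_stats

# Module function and bound method return the same mapping of split name to count.
counts = dataset_stats.split_counts(dataset)
counts = dataset.split_counts()
print(counts)  # e.g. {"train": 8000, "val": 1000, "test": 1000} (illustrative values)
```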
hafnia/dataset/operations/dataset_transformations.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Hafnia dataset transformations that takes and returns a HafniaDataset object.
+
+All functions here will have a corresponding function in both the HafniaDataset class
+and a corresponding RecipeTransform class in the `data_recipe/recipe_transformations.py` file.
+
+This allows each function to be used in three ways:
+
+```python
+from hafnia.dataset.operations import dataset_transformations
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.data_recipe.recipe_transformations import SplitByRatios
+
+splits_by_ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
+
+# Option 1: Using the function directly
+dataset = recipe_transformations.splits_by_ratios(dataset, split_ratios=splits_by_ratios)
+
+# Option 2: Using the method of the HafniaDataset class
+dataset = dataset.splits_by_ratios(split_ratios=splits_by_ratios)
+
+# Option 3: Using the RecipeTransform class
+serializable_transform = SplitByRatios(split_ratios=splits_by_ratios)
+dataset = serializable_transform(dataset)
+```
+
+Tests will ensure that all functions in this file will have a corresponding function in the
+HafniaDataset class and a RecipeTransform class in the `data_recipe/recipe_transformations.py` file and
+that the signatures match.
+"""
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable
+
+import cv2
+import numpy as np
+import polars as pl
+from PIL import Image
+from tqdm import tqdm
+
+from hafnia.dataset import dataset_helpers
+
+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+
+### Image transformations ###
+class AnonymizeByPixelation:
+    def __init__(self, resize_factor: float = 0.10):
+        self.resize_factor = resize_factor
+
+    def __call__(self, frame: np.ndarray) -> np.ndarray:
+        org_size = frame.shape[:2]
+        frame = cv2.resize(frame, (0, 0), fx=self.resize_factor, fy=self.resize_factor)
+        frame = cv2.resize(frame, org_size[::-1], interpolation=cv2.INTER_NEAREST)
+        return frame
+
+
+def transform_images(
+    dataset: "HafniaDataset",
+    transform: Callable[[np.ndarray], np.ndarray],
+    path_output: Path,
+) -> "HafniaDataset":
+    new_paths = []
+    path_image_folder = path_output / "data"
+    path_image_folder.mkdir(parents=True, exist_ok=True)
+
+    for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
+        org_path = Path(org_path)
+        if not org_path.exists():
+            raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
+
+        image = np.array(Image.open(org_path))
+        image_transformed = transform(image)
+        new_path = dataset_helpers.save_image_with_hash_name(image_transformed, path_image_folder)
+
+        if not new_path.exists():
+            raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
+        new_paths.append(str(new_path))
+
+    table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
+    return dataset.update_table(table)
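`transform_images` writes a transformed copy of every image under `path_output / "data"` and returns a dataset whose `file_name` column points at the new files; `AnonymizeByPixelation` is one ready-made frame transform. A sketch, assuming a loaded dataset and an output folder of your choosing:

```python
from pathlib import Path

from hafnia.dataset.operations.dataset_transformations import (
    AnonymizeByPixelation,
    transform_images,
)

# Downscale and re-upscale each frame to pixelate it, then rewrite the file paths.
anonymized = transform_images(
    dataset,
    transform=AnonymizeByPixelation(resize_factor=0.10),
    path_output=Path("path/to/output"),
)

# Equivalently, via the method bound on the class:
anonymized = dataset.transform_images(
    transform=AnonymizeByPixelation(resize_factor=0.10), path_output=Path("path/to/output")
)
```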
hafnia/dataset/{table_transformations.py → operations/table_transformations.py}
RENAMED
@@ -4,12 +4,12 @@ from typing import List, Optional, Type
 import polars as pl
 from tqdm import tqdm

-from hafnia.dataset import table_transformations
 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
     FieldName,
 )
+from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import PRIMITIVE_TYPES
 from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.primitive import Primitive
hafnia/experiment/hafnia_logger.py
CHANGED
@@ -14,7 +14,7 @@ from pydantic import BaseModel, field_validator
 from hafnia.data.factory import load_dataset
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
-from hafnia.utils import …
+from hafnia.utils import is_hafnia_cloud_job, now_as_str


 class EntityType(Enum):
@@ -101,7 +101,7 @@ class HafniaLogger:

     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
-        if …
+        if is_hafnia_cloud_job():
             raise RuntimeError("Cannot access local experiment path in remote job.")
         return self._local_experiment_path

@@ -110,7 +110,7 @@ class HafniaLogger:
         if "MDI_CHECKPOINT_DIR" in os.environ:
             return Path(os.environ["MDI_CHECKPOINT_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/checkpoints")
         return self.path_local_experiment() / "checkpoints"

@@ -119,7 +119,7 @@ class HafniaLogger:
         if "MDI_ARTIFACT_DIR" in os.environ:
             return Path(os.environ["MDI_ARTIFACT_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/output/data")

         return self.path_local_experiment() / "data"
@@ -129,7 +129,7 @@ class HafniaLogger:
         if "MDI_MODEL_DIR" in os.environ:
             return Path(os.environ["MDI_MODEL_DIR"])

-        if …
+        if is_hafnia_cloud_job():
             return Path("/opt/ml/model")

         return self.path_local_experiment() / "model"
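The logger no longer inlines the remote-job check; it delegates to the new `is_hafnia_cloud_job()` helper, which reads the `HAFNIA_CLOUD` environment variable. A small sketch of the behaviour:

```python
import os

from hafnia.utils import is_hafnia_cloud_job

os.environ["HAFNIA_CLOUD"] = "true"
assert is_hafnia_cloud_job()

os.environ["HAFNIA_CLOUD"] = "false"
assert not is_hafnia_cloud_job()  # also the default when the variable is unset
```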
hafnia/helper_testing.py
CHANGED
@@ -1,4 +1,7 @@
+from inspect import getmembers, isfunction, signature
 from pathlib import Path
+from types import FunctionType
+from typing import Any, Callable, Dict, Union, get_origin

 from hafnia import utils
 from hafnia.dataset.dataset_names import FILENAME_ANNOTATIONS_JSONL, DatasetVariant
@@ -38,8 +41,8 @@ def get_path_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Path
     if path_test_dataset_annotations.exists() and not force_update:
         return path_test_dataset

-    hafnia_dataset = HafniaDataset.…
-    hafnia_dataset = hafnia_dataset.…
+    hafnia_dataset = HafniaDataset.from_path(path_dataset / DatasetVariant.SAMPLE.value)
+    hafnia_dataset = hafnia_dataset.select_samples(n_samples=3, seed=42)
     hafnia_dataset.write(path_test_dataset)

     if force_update:
@@ -59,5 +62,47 @@ def get_sample_micro_hafnia_dataset(dataset_name: str, force_update=False) -> Sa…

 def get_micro_hafnia_dataset(dataset_name: str, force_update: bool = False) -> HafniaDataset:
     path_dataset = get_path_micro_hafnia_dataset(dataset_name=dataset_name, force_update=force_update)
-    hafnia_dataset = HafniaDataset.…
+    hafnia_dataset = HafniaDataset.from_path(path_dataset)
     return hafnia_dataset
+
+
+def is_hafnia_configured() -> bool:
+    """
+    Check if Hafnia is configured by verifying if the API key is set.
+    """
+    from cli.config import Config
+
+    return Config().is_configured()
+
+
+def is_typing_type(annotation: Any) -> bool:
+    return get_origin(annotation) is not None
+
+
+def annotation_as_string(annotation: Union[type, str]) -> str:
+    """Convert type annotation to string."""
+    if isinstance(annotation, str):
+        return annotation.replace("'", "")
+    if is_typing_type(annotation):  # Is using typing types like List, Dict, etc.
+        return str(annotation).replace("typing.", "")
+    if hasattr(annotation, "__name__"):
+        return annotation.__name__
+    return str(annotation)
+
+
+def get_hafnia_functions_from_module(python_module) -> Dict[str, FunctionType]:
+    def dataset_is_first_arg(func: Callable) -> bool:
+        """
+        Check if the function has 'HafniaDataset' as the first parameter.
+        """
+        func_signature = signature(func)
+        params = func_signature.parameters
+        if len(params) == 0:
+            return False
+        first_argument_type = list(params.values())[0]
+
+        annotation_as_str = annotation_as_string(first_argument_type.annotation)
+        return annotation_as_str == "HafniaDataset"
+
+    functions = {func[0]: func[1] for func in getmembers(python_module, isfunction) if dataset_is_first_arg(func[1])}
+    return functions
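`get_hafnia_functions_from_module` collects every function in a module whose first parameter is annotated as `HafniaDataset`, with `annotation_as_string` normalizing string, typing, and class annotations; this appears to back the tests that keep the operations modules, the `HafniaDataset` methods, and the recipe transforms in sync. A sketch of how it could be used:

```python
from hafnia.dataset.operations import dataset_transformations
from hafnia.helper_testing import annotation_as_string, get_hafnia_functions_from_module

# Functions whose first parameter is a HafniaDataset, keyed by name.
functions = get_hafnia_functions_from_module(dataset_transformations)
print(sorted(functions))  # e.g. ['transform_images', ...]

# Annotation normalization used for the comparison.
assert annotation_as_string("'HafniaDataset'") == "HafniaDataset"
assert annotation_as_string(dict) == "dict"
```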
hafnia/platform/datasets.py
CHANGED
@@ -10,10 +10,14 @@ from tqdm import tqdm

 from cli.config import Config
 from hafnia import utils
-from hafnia.dataset import …
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
+from hafnia.dataset.dataset_recipe.dataset_recipe import (
+    DatasetRecipe,
+    get_dataset_path_from_recipe,
+)
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.http import fetch
-from hafnia.log import user_logger
+from hafnia.log import sys_logger, user_logger
 from hafnia.platform import get_dataset_id
 from hafnia.platform.download import get_resource_credentials
 from hafnia.utils import timed
@@ -37,13 +41,11 @@ def download_or_get_dataset_path(
     cfg: Optional[Config] = None,
     path_datasets_folder: Optional[str] = None,
     force_redownload: bool = False,
+    download_files: bool = True,
 ) -> Path:
     """Download or get the path of the dataset."""
-
-
-
-    path_datasets_folder = path_datasets_folder or str(utils.PATH_DATASETS)
-    path_dataset = Path(path_datasets_folder).absolute() / dataset_name
+    recipe_explicit = DatasetRecipe.from_implicit_form(dataset_name)
+    path_dataset = get_dataset_path_from_recipe(recipe_explicit, path_datasets=path_datasets_folder)

     is_dataset_valid = HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
     if is_dataset_valid and not force_redownload:
@@ -57,22 +59,30 @@ def download_or_get_dataset_path(

     endpoint_dataset = cfg.get_platform_endpoint("datasets")
     dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
+    if dataset_id is None:
+        sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
     access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"

     download_dataset_from_access_endpoint(
         endpoint=access_dataset_endpoint,
         api_key=api_key,
         path_dataset=path_dataset,
+        download_files=download_files,
     )
     return path_dataset


-def download_dataset_from_access_endpoint(…
+def download_dataset_from_access_endpoint(
+    endpoint: str,
+    api_key: str,
+    path_dataset: Path,
+    download_files: bool = True,
+) -> None:
     resource_credentials = get_resource_credentials(endpoint, api_key)

-    local_dataset_paths = [str(path_dataset / filename) for filename in …
+    local_dataset_paths = [str(path_dataset / filename) for filename in DATASET_FILENAMES_REQUIRED]
     s3_uri = resource_credentials.s3_uri()
-    s3_dataset_files = [f"{s3_uri}/{filename}" for filename in …
+    s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]

     envs = resource_credentials.aws_credentials()
     fast_copy_files_s3(
@@ -82,10 +92,13 @@ def download_dataset_from_access_endpoint(endpoint: str, api_key: str, path_data…
         description="Downloading annotations",
     )

-
+    if not download_files:
+        return
+
+    dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     fast_copy_files_s3(
-        src_paths=dataset.samples[…
-        dst_paths=dataset.samples[…
+        src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
+        dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
         append_envs=envs,
         description="Downloading images",
     )
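`download_or_get_dataset_path` now resolves its target folder through a `DatasetRecipe` and accepts `download_files`; with `download_files=False` only the required annotation files are copied from S3 and the per-image download is skipped, which is the flag `HafniaDataset.from_name` forwards. A sketch; the dataset name is a placeholder and a configured API key is assumed:

```python
from hafnia.platform.datasets import download_or_get_dataset_path

# Annotations only: skip the per-image S3 copy.
path_dataset = download_or_get_dataset_path("mnist", download_files=False)

# Full download (the default), reusing the cached folder when it is already valid.
path_dataset = download_or_get_dataset_path("mnist", force_redownload=False)
```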
hafnia/utils.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import time
 import zipfile
@@ -132,6 +133,24 @@ def show_recipe_content(recipe_path: Path, style: str = "emoji", depth_limit: in…
     user_logger.info(f"Recipe size: {size_human_readable(os.path.getsize(recipe_path))}. Max size 800 MiB")


-def …
+def is_hafnia_cloud_job() -> bool:
     """Check if the current job is running in HAFNIA cloud environment."""
     return os.getenv("HAFNIA_CLOUD", "false").lower() == "true"
+
+
+def pascal_to_snake_case(name: str) -> str:
+    """
+    Convert PascalCase to snake_case.
+    """
+    return "".join(["_" + char.lower() if char.isupper() else char for char in name]).lstrip("_")
+
+
+def snake_to_pascal_case(name: str) -> str:
+    """
+    Convert snake_case to PascalCase.
+    """
+    return "".join(word.capitalize() for word in name.split("_"))
+
+
+def hash_from_string(s: str) -> str:
+    return hashlib.md5(s.encode("utf-8")).hexdigest()
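The new utility helpers are small but load-bearing for the recipe machinery: the case converters presumably map transform class names (PascalCase) to function names (snake_case) and back, and `hash_from_string` gives a deterministic MD5 hex digest, e.g. for cache keys. Quick checks that follow directly from the implementations above:

```python
from hafnia.utils import hash_from_string, pascal_to_snake_case, snake_to_pascal_case

assert pascal_to_snake_case("SplitByRatios") == "split_by_ratios"
assert snake_to_pascal_case("split_by_ratios") == "SplitByRatios"

digest = hash_from_string("some-recipe-definition")
assert len(digest) == 32  # 32-character MD5 hex digest
```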
hafnia/visualizations/image_visualizations.py
CHANGED
@@ -175,7 +175,7 @@ def save_dataset_sample_set_visualizations(
     draw_settings: Optional[Dict[Type[Primitive], Dict]] = None,
     anonymize_settings: Optional[Dict[Type[Primitive], Dict]] = None,
 ) -> List[Path]:
-    dataset = HafniaDataset.…
+    dataset = HafniaDataset.from_path(path_dataset)
     shutil.rmtree(path_output_folder, ignore_errors=True)
     path_output_folder.mkdir(parents=True)
