hafnia 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/publish_docker.yaml +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/PKG-INFO +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/examples/example_dataset_recipe.py +15 -11
- {hafnia-0.2.1 → hafnia-0.2.3}/examples/example_hafnia_dataset.py +26 -16
- {hafnia-0.2.1 → hafnia-0.2.3}/pyproject.toml +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/__main__.py +6 -10
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/config.py +19 -5
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/profile_cmds.py +2 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_helpers.py +39 -6
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_recipe/dataset_recipe.py +59 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_recipe/recipe_types.py +4 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/hafnia_dataset.py +7 -21
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/datasets.py +12 -5
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/conftest.py +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/dataset_recipe/test_dataset_recipes.py +47 -4
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/dataset_recipe/test_recipe_transformations.py +1 -1
- hafnia-0.2.3/tests/dataset/operations/test_dataset_transformations.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/operations/test_table_transformations.py +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/test_dataset_helpers.py +10 -5
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/test_hafnia_dataset.py +1 -1
- {hafnia-0.2.1/src/hafnia → hafnia-0.2.3/tests}/helper_testing.py +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_check_example_scripts.py +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_cli.py +38 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_samples.py +8 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_visualizations.py +2 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/uv.lock +1 -1
- {hafnia-0.2.1 → hafnia-0.2.3}/.devcontainer/devcontainer.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.devcontainer/hooks/post_create +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/dependabot.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/Dockerfile +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/build.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/check_release.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/ci_cd.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/lint.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/publish_pypi.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/tests.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.gitignore +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.pre-commit-config.yaml +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.python-version +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.vscode/extensions.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.vscode/launch.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/.vscode/settings.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/LICENSE +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/README.md +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/docs/cli.md +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/docs/release.md +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/examples/example_logger.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/examples/example_torchvision_dataloader.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/consts.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/dataset_cmds.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/experiment_cmds.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/recipe_cmds.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/cli/runc_cmds.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/data/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/data/factory.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_names.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_recipe/recipe_transforms.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_upload_helper.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/operations/dataset_stats.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/operations/dataset_transformations.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/operations/table_transformations.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/bbox.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/bitmask.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/classification.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/point.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/polygon.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/primitive.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/segmentation.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/primitives/utils.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/experiment/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/experiment/hafnia_logger.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/http.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/log.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/builder.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/download.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/experiment.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/torch_helpers.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/utils.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/visualizations/colors.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/visualizations/image_visualizations.py +0 -0
- /hafnia-0.2.1/tests/dataset/operations/test_dataset_transformations.py → /hafnia-0.2.3/tests/__init__.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[caltech-101].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[caltech-256].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[cifar100].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[cifar10].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[coco-2017].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[midwest-vehicle-detection].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[mnist].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_check_dataset[tiny-dataset].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-101].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-256].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar100].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar10].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[coco-2017].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[midwest-vehicle-detection].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[mnist].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[tiny-dataset].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_blur_anonymization[coco-2017].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_blur_anonymization[tiny-dataset].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_draw_annotations[coco-2017].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_draw_annotations[tiny-dataset].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_mask_region[coco-2017].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/expected_images/test_visualizations/test_mask_region[tiny-dataset].png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/annotations.jsonl +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/annotations.parquet +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/data/182a2c0a3ce312cf.jpg +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/data/4e95c6eb6209880a.jpg +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/data/cf86c7a23edb55ce.jpg +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/coco-2017/dataset_info.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/annotations.jsonl +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/annotations.parquet +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/data/222bbd5721a8a86e.png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/data/3251d85443622e4c.png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/data/3657ababa44af9b6.png +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/data/micro_test_datasets/tiny-dataset/dataset_info.json +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/dataset_recipe/test_dataset_recipe_helpers.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/test_colors.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/test_shape_primitives.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_builder.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_hafnia_logger.py +0 -0
- {hafnia-0.2.1 → hafnia-0.2.3}/tests/test_utils.py +0 -0
{hafnia-0.2.1 → hafnia-0.2.3}/.github/workflows/publish_docker.yaml
@@ -47,7 +47,7 @@ jobs:
           echo "aws_region=${{ secrets.STAGE_AWS_REGION }}" >> $GITHUB_OUTPUT
         fi
     - name: Configure AWS credentials
-      uses: aws-actions/configure-aws-credentials@v4.
+      uses: aws-actions/configure-aws-credentials@v4.3.0
      with:
        role-to-assume: arn:aws:iam::${{ steps.env-vars.outputs.aws_account_id }}:role/${{ secrets.AWS_ROLE_NAME }}
        aws-region: ${{ steps.env-vars.outputs.aws_region }}
{hafnia-0.2.1 → hafnia-0.2.3}/examples/example_dataset_recipe.py
@@ -29,20 +29,24 @@ dataset_recipe = DatasetRecipe.from_name(name="mnist").shuffle().select_samples(
 dataset = dataset_recipe.build()
 # Note that the interface is similar, but to actually create the dataset you need to call `build()` on the recipe.
 
-#
-#
-#
-
-
+# Unlike the HafniaDataset, a DatasetRecipe does not execute operations. It only registers
+# the operations applied to the recipe and can be used to build the dataset later.
+# You can print the dataset recipe to the operations that were applied to it.
+rprint(dataset_recipe)
+
+# Or as a JSON string:
+json_str: str = dataset_recipe.as_json_str()
+rprint(json_str)
+
+# This is an important feature of a 'DatasetRecipe' it only registers operations and that the recipe itself
+# - and not the dataset - can be saved as a file and loaded from file.
+# Meaning you can easily save, share, load and build the dataset later or in a different environment.
 # For TaaS, this is the only way to include multiple datasets during training.
 
-# This is how it looks like in practice:
-# 1) Save the dataset recipe to a file
-path_json = Path(".data/tmp/dataset_recipe.json")
-dataset_recipe.as_json_file(path_json)
 
-# 2) The recipe can be loaded from
-dataset_recipe_again = DatasetRecipe.
+# 2) The recipe can be loaded from json string
+dataset_recipe_again: DatasetRecipe = DatasetRecipe.from_json_str(json_str)
+# dataset_recipe_again.build()
 
 # We can verify that the loaded recipe is the same as the original recipe.
 assert dataset_recipe_again == dataset_recipe
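Because only the recipe (not the dataset) is serialized, combining multiple datasets for TaaS amounts to merging recipes and shipping the resulting JSON file. A minimal sketch reusing the calls that appear in this release; the dataset names and output path are placeholders:

from pathlib import Path

from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe

# Register operations only; nothing is downloaded or built here.
merged_recipe = DatasetRecipe.from_merger(
    recipes=[
        DatasetRecipe.from_name(name="mnist"),
        DatasetRecipe.from_name(name="midwest-vehicle-detection").shuffle().select_samples(n_samples=100),
    ]
)

# The recipe file (not the dataset) is what gets saved, shared, and built later.
merged_recipe.as_json_file(Path(".data/tmp/merged_recipe.json"))
# dataset = merged_recipe.build()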
{hafnia-0.2.1 → hafnia-0.2.3}/examples/example_hafnia_dataset.py
@@ -26,11 +26,10 @@ dataset = HafniaDataset.from_path(path_dataset)
 # Alternatively, you can use the 'load_dataset' function
 dataset = load_dataset("midwest-vehicle-detection")
 
-
 # Dataset information is stored in 'dataset.info'
 rprint(dataset.info)
 
-# Annotations are stored in 'dataset.
+# Annotations are stored in 'dataset.samples' as a Polars DataFrame
 dataset.samples.head(2)
 
 # Print dataset information
@@ -49,14 +48,29 @@ shuffled_dataset = dataset.shuffle(seed=42)  # Shuffle the dataset
 split_ratios = {SplitName.TRAIN: 0.8, SplitName.VAL: 0.1, SplitName.TEST: 0.1}
 new_dataset_splits = dataset.splits_by_ratios(split_ratios)
 
+# Support Chaining Operations (load, shuffle, select samples)
+dataset = load_dataset("midwest-vehicle-detection").shuffle(seed=42).select_samples(n_samples=10)
+
+
 # Write dataset to disk
 path_tmp = Path(".data/tmp")
 path_dataset = path_tmp / "hafnia_dataset"
-dataset.write(path_dataset)
+dataset.write(path_dataset)
 
 # Load dataset from disk
 dataset_again = HafniaDataset.from_path(path_dataset)
 
+
+# Want custom dataset transformations or statistics? Use the polars table (dataset.samples) directly
+n_objects = dataset.samples["objects"].list.len().sum()
+n_objects = dataset.samples[Bbox.column_name()].list.len().sum()  # Use Bbox.column_name() to avoid magic variables
+n_classifications = dataset.samples[Classification.column_name()].list.len().sum()
+
+class_counts = dataset.samples[Classification.column_name()].explode().struct.field("class_name").value_counts()
+class_counts = dataset.samples[Bbox.column_name()].explode().struct.field("class_name").value_counts()
+rprint(dict(class_counts.iter_rows()))
+
+
 # Access the first sample in the training split - data is stored in a dictionary
 sample_dict = dataset_train[0]
 
@@ -78,25 +92,15 @@ image: np.ndarray = sample.read_image()
 # Visualize sample and annotations
 image_with_annotations = sample.draw_annotations()
 
-
+# Save the image with annotations to a temporary directory
 path_tmp.mkdir(parents=True, exist_ok=True)
 Image.fromarray(image_with_annotations).save(path_tmp / "sample_with_annotations.png")
 
 
-# Do dataset transformations and statistics on the Polars DataFrame
-n_objects = dataset.samples["objects"].list.len().sum()
-n_objects = dataset.samples[Bbox.column_name()].list.len().sum()  # Use Bbox.column_name() to avoid magic variables
-n_classifications = dataset.samples[Classification.column_name()].list.len().sum()
-
-class_counts = dataset.samples[Classification.column_name()].explode().struct.field("class_name").value_counts()
-class_counts = dataset.samples[Bbox.column_name()].explode().struct.field("class_name").value_counts()
-rprint(dict(class_counts.iter_rows()))
-
-
 ## Bring-your-own-data: Create a new dataset from samples
 fake_samples = []
 for i_fake_sample in range(5):
-    bboxes = [Bbox(top_left_x=
+    bboxes = [Bbox(top_left_x=0.1, top_left_y=0.20, width=0.1, height=0.2, class_name="car")]
     classifications = [Classification(class_name="vehicle", class_idx=0)]
     sample = Sample(
         file_name=f"path/to/image_{i_fake_sample:05}.jpg",
@@ -120,8 +124,14 @@ fake_dataset_info = DatasetInfo(
 )
 fake_dataset = HafniaDataset.from_samples_list(samples_list=fake_samples, info=fake_dataset_info)
 
+# Coming soon! Upload your dataset to the Hafnia Platform
+# fake_dataset.upload_to_hafnia()
+
+# Coming soon! Create your own dataset details page in Hafnia
+# fake_dataset.upload_dataset_details()
 
-## A hafnia dataset can also be used for storing predictions per sample
+## Storing predictions: A hafnia dataset can also be used for storing predictions per sample
+# set 'ground_truth=False' and add 'confidence'.
 bboxes_predictions = [
     Bbox(top_left_x=10, top_left_y=20, width=100, height=200, class_name="car", ground_truth=False, confidence=0.9)
 ]
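The polars chain used above for class counts can be wrapped in a small helper. A minimal sketch; the helper itself is ours, not part of the package, and the Bbox import path is assumed from the package layout:

from typing import Dict

import polars as pl

from hafnia.dataset.primitives.bbox import Bbox  # assumed import path


def class_histogram(samples: pl.DataFrame, column_name: str) -> Dict[str, int]:
    # Explode the list-of-structs annotation column and count class names.
    counts = samples[column_name].explode().struct.field("class_name").value_counts()
    return dict(counts.iter_rows())


# Example: class_histogram(dataset.samples, Bbox.column_name())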
{hafnia-0.2.1 → hafnia-0.2.3}/src/cli/__main__.py
@@ -20,19 +20,15 @@ def configure(cfg: Config) -> None:
 
     profile_name = click.prompt("Profile Name", type=str, default=consts.DEFAULT_PROFILE_NAME)
     profile_name = profile_name.strip()
-
-
-    except ValueError:
-        raise click.ClickException(consts.ERROR_CREATE_PROFILE)
+
+    cfg.check_profile_name(profile_name)
 
     api_key = click.prompt("Hafnia API Key", type=str, hide_input=True)
-
-    cfg.api_key = api_key.strip()
-    except ValueError as e:
-        click.echo(f"Error: {str(e)}", err=True)
-        return
+
     platform_url = click.prompt("Hafnia Platform URL", type=str, default=consts.DEFAULT_API_URL)
-
+
+    cfg_profile = ConfigSchema(api_key=api_key, platform_url=platform_url)
+    cfg.add_profile(profile_name, cfg_profile, set_active=True)
     cfg.save_config()
     profile_cmds.profile_show(cfg)
 
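The refactored prompt flow is roughly equivalent to creating a profile programmatically. A minimal sketch, assuming `Config()` can be constructed with its defaults; the key and URL are placeholders:

from cli.config import Config, ConfigSchema

cfg = Config()  # assumption: the default constructor loads or creates the config file
profile = ConfigSchema(api_key="ApiKey 0123456789abcdef", platform_url="https://example.invalid")
cfg.add_profile("default", profile, set_active=True)
cfg.save_config()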
{hafnia-0.2.1 → hafnia-0.2.3}/src/cli/config.py
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
 from pydantic import BaseModel, field_validator
 
 import cli.consts as consts
-from hafnia.log import user_logger
+from hafnia.log import sys_logger, user_logger
 
 PLATFORM_API_MAPPING = {
     "recipes": "/api/v1/recipes",
@@ -23,9 +23,17 @@ class ConfigSchema(BaseModel):
     api_key: Optional[str] = None
 
     @field_validator("api_key")
-    def validate_api_key(cls, value: str) -> str:
-        if value is
+    def validate_api_key(cls, value: Optional[str]) -> Optional[str]:
+        if value is None:
+            return value
+
+        if len(value) < 10:
             raise ValueError("API key is too short.")
+
+        if not value.startswith("ApiKey "):
+            sys_logger.warning("API key is missing the 'ApiKey ' prefix. Prefix is being added automatically.")
+            value = f"ApiKey {value}"
+
         return value
 
 
@@ -51,6 +59,7 @@ class Config:
         if profile_name not in self.config_data.profiles:
             raise ValueError(f"Profile '{profile_name}' does not exist.")
         self.config_data.active_profile = profile_name
+        self.save_config()
 
     @property
     def config(self) -> ConfigSchema:
@@ -92,13 +101,18 @@ class Config:
 
         return Path.home() / ".hafnia" / "config.json"
 
-    def
-    profile_name
+    def check_profile_name(self, profile_name: str) -> None:
+        if not profile_name or not isinstance(profile_name, str):
+            raise ValueError("Profile name must be a non-empty string.")
+
         if profile_name in self.config_data.profiles:
             user_logger.warning(
                 f"Profile with name '{profile_name}' already exists, it will be overwritten by the new one."
            )
 
+    def add_profile(self, profile_name: str, profile: ConfigSchema, set_active: bool = False) -> None:
+        profile_name = profile_name.strip()
+        self.check_profile_name(profile_name)
         self.config_data.profiles[profile_name] = profile
         if set_active:
             self.config_data.active_profile = profile_name
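A minimal sketch of how the updated validator behaves; the key values are made up:

from cli.config import ConfigSchema

# Long enough but missing the prefix: a warning is logged and "ApiKey " is prepended.
cfg = ConfigSchema(api_key="0123456789abcdef")
assert cfg.api_key == "ApiKey 0123456789abcdef"

# None is passed through untouched; a key shorter than 10 characters raises a validation error.
assert ConfigSchema(api_key=None).api_key is None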
{hafnia-0.2.1 → hafnia-0.2.3}/src/cli/profile_cmds.py
@@ -56,6 +56,7 @@ def profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate:
     cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key)
 
     cfg.add_profile(profile_name=name, profile=cfg_profile, set_active=activate)
+    profile_show(cfg)
 
 
 @profile.command("rm")
@@ -87,7 +88,7 @@ def profile_active(cfg: Config) -> None:
 
 
 def profile_show(cfg: Config) -> None:
-    masked_key = f"{cfg.api_key[:
+    masked_key = f"{cfg.api_key[:11]}...{cfg.api_key[-4:]}" if len(cfg.api_key) > 20 else "****"
     console = Console()
 
     table = Table(title=f"{consts.PROFILE_TABLE_HEADER} {cfg.active_profile}", show_header=False)
{hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_helpers.py
@@ -1,6 +1,7 @@
 import io
 import math
 import random
+import shutil
 from pathlib import Path
 from typing import Dict, List
 
@@ -21,7 +22,7 @@ def create_split_name_list_from_ratios(split_ratios: Dict[str, float], n_items:
 
 
 def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
-    hasher = xxhash.
+    hasher = xxhash.xxh3_128()
 
     with open(path, "rb") as f:
         for chunk in iter(lambda: f.read(chunk_size), b""):  # 8192, 16384, 32768, 65536
@@ -30,7 +31,7 @@ def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
 
 
 def hash_from_bytes(data: bytes) -> str:
-    hasher = xxhash.
+    hasher = xxhash.xxh3_128()
     hasher.update(data)
     return hasher.hexdigest()
 
@@ -40,14 +41,46 @@ def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
     buffer = io.BytesIO()
     pil_image.save(buffer, format="PNG")
     hash_value = hash_from_bytes(buffer.getvalue())
-    path_image = Path(path_folder) /
+    path_image = Path(path_folder) / relative_path_from_hash(hash=hash_value, suffix=".png")
+    path_image.parent.mkdir(parents=True, exist_ok=True)
     pil_image.save(path_image)
     return path_image
 
 
-def
-
-
+def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
+    """
+    Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
+
+    E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
+    'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
+    Notice that the hash is used for both the filename and the subfolder name.
+
+    Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
+    unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
+
+    The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
+    download 3500 files per second from a single folder (prefix) in S3.
+
+    For even a single user, we found that this limit was being reached when files are stored in single folder (prefix)
+    in S3. To support multiple users and concurrent experiments, we are required to separate files into
+    multiple sub-folders (prefixes) in S3 to not hit the rate limit.
+    """
+
+    if not path_source.exists():
+        raise FileNotFoundError(f"Source file {path_source} does not exist.")
+
+    hash_value = hash_file_xxhash(path_source)
+    path_file = path_dataset_root / relative_path_from_hash(hash=hash_value, suffix=path_source.suffix)
+    path_file.parent.mkdir(parents=True, exist_ok=True)
+    if not path_file.exists():
+        shutil.copy2(path_source, path_file)
+
+    return path_file
+
+
+def relative_path_from_hash(hash: str, suffix: str) -> Path:
+    path_file = Path("data") / hash[:3] / f"{hash}{suffix}"
+    return path_file
 
 
 def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dict[str, int]:
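A minimal sketch of the layout these helpers produce; the source file and dataset root below are placeholders:

from pathlib import Path

from hafnia.dataset.dataset_helpers import (
    copy_and_rename_file_to_hash_value,
    hash_file_xxhash,
    relative_path_from_hash,
)

path_source = Path("my_images/image.png")  # placeholder source file
path_dataset_root = Path(".data/tmp/hafnia_dataset")  # placeholder dataset root

# Files land under data/<first-3-hash-chars>/<hash><suffix>, e.g. data/dfe/dfe8...png
hash_value = hash_file_xxhash(path_source)
print(relative_path_from_hash(hash=hash_value, suffix=path_source.suffix))

new_path = copy_and_rename_file_to_hash_value(path_source=path_source, path_dataset_root=path_dataset_root)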
{hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_recipe/dataset_recipe.py
@@ -216,6 +216,16 @@ class DatasetRecipe(Serializable):
         json_str = self.as_json_str(indent=indent)
         path_json.write_text(json_str, encoding="utf-8")
 
+    ### Helper methods ###
+    def get_dataset_names(self) -> List[str]:
+        """
+        Get all dataset names added with 'from_name'.
+        Function recursively gathers dataset names.
+        """
+        if self.creation is None:
+            return []
+        return self.creation.get_dataset_names()
+
     ### Validation and Serialization ###
     @field_validator("creation", mode="plain")
     @classmethod
@@ -282,7 +292,10 @@ class FromPath(RecipeCreation):
         return HafniaDataset.from_path
 
     def as_short_name(self) -> str:
-        return f"'{self.path_folder}'".replace(os.sep, "
+        return f"'{self.path_folder}'".replace(os.sep, "-")
+
+    def get_dataset_names(self) -> List[str]:
+        return []  # Only counts 'from_name' datasets
 
 
 class FromName(RecipeCreation):
@@ -297,6 +310,9 @@ class FromName(RecipeCreation):
     def as_short_name(self) -> str:
         return self.name
 
+    def get_dataset_names(self) -> List[str]:
+        return [self.name]
+
 
 class FromMerge(RecipeCreation):
     recipe0: DatasetRecipe
@@ -310,6 +326,11 @@ class FromMerge(RecipeCreation):
         merger = FromMerger(recipes=[self.recipe0, self.recipe1])
         return merger.as_short_name()
 
+    def get_dataset_names(self) -> List[str]:
+        """Get the dataset names from the merged recipes."""
+        names = [*self.recipe0.creation.get_dataset_names(), *self.recipe1.creation.get_dataset_names()]
+        return names
+
 
 class FromMerger(RecipeCreation):
     recipes: List[DatasetRecipe]
@@ -325,3 +346,40 @@ class FromMerger(RecipeCreation):
 
     def as_short_name(self) -> str:
         return f"Merger({','.join(recipe.as_short_name() for recipe in self.recipes)})"
+
+    def get_dataset_names(self) -> List[str]:
+        """Get the dataset names from the merged recipes."""
+        names = []
+        for recipe in self.recipes:
+            names.extend(recipe.creation.get_dataset_names())
+        return names
+
+
+def extract_dataset_names_from_json_dict(data: dict) -> list[str]:
+    """
+    Extract dataset names recursively from a JSON dictionary added with 'from_name'.
+
+    Even if the same functionality is achieved with `DatasetRecipe.get_dataset_names()`,
+    we want to keep this function in 'dipdatalib' to extract dataset names from json dictionaries
+    directly.
+    """
+    creation_field = data.get("creation")
+    if creation_field is None:
+        return []
+    if creation_field.get("__type__") == "FromName":
+        return [creation_field["name"]]
+    elif creation_field.get("__type__") == "FromMerge":
+        recipe_names = ["recipe0", "recipe1"]
+        dataset_name = []
+        for recipe_name in recipe_names:
+            recipe = creation_field.get(recipe_name)
+            if recipe is None:
+                continue
+            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
+        return dataset_name
+    elif creation_field.get("__type__") == "FromMerger":
+        dataset_name = []
+        for recipe in creation_field.get("recipes", []):
+            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
+        return dataset_name
+    return []
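A minimal usage sketch of the new helpers; the dataset names are placeholders:

import json

from hafnia.dataset.dataset_recipe.dataset_recipe import (
    DatasetRecipe,
    extract_dataset_names_from_json_dict,
)

recipe = DatasetRecipe.from_merge(
    recipe0=DatasetRecipe.from_name(name="mnist"),
    recipe1=DatasetRecipe.from_name(name="coco-2017"),
)

# Recursively collects every dataset added with 'from_name'.
print(recipe.get_dataset_names())  # e.g. ["mnist", "coco-2017"]

# The same names can be read straight from the serialized recipe.
print(extract_dataset_names_from_json_dict(json.loads(recipe.as_json_str())))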
{hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/dataset_recipe/recipe_types.py
@@ -108,6 +108,10 @@ class RecipeCreation(Serializable):
     def get_function() -> Callable[..., "HafniaDataset"]:
         pass
 
+    @abstractmethod
+    def get_dataset_names(self) -> List[str]:
+        pass
+
     def build(self) -> "HafniaDataset":
         from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe
 
{hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/dataset/hafnia_dataset.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 import shutil
 from dataclasses import dataclass
 from pathlib import Path
@@ -182,9 +181,8 @@ class HafniaDataset:
         table = read_table_from_path(path_folder)
 
         # Convert from relative paths to absolute paths
-
-
-        )
+        dataset_root = path_folder.absolute().as_posix() + "/"
+        table = table.with_columns((dataset_root + pl.col("file_name")).alias("file_name"))
         if check_for_images:
             check_image_paths(table)
         return HafniaDataset(samples=table, info=dataset_info)
@@ -413,30 +411,18 @@ class HafniaDataset:
 
         return True
 
-    def write(self, path_folder: Path,
+    def write(self, path_folder: Path, add_version: bool = False) -> None:
         user_logger.info(f"Writing dataset to {path_folder}...")
         if not path_folder.exists():
             path_folder.mkdir(parents=True)
-        path_folder_images = path_folder / "data"
-        path_folder_images.mkdir(parents=True, exist_ok=True)
 
         new_relative_paths = []
         for org_path in tqdm(self.samples["file_name"].to_list(), desc="- Copy images"):
-
-
-
-
-                filename = dataset_helpers.filename_as_hash_from_path(org_path)
-            else:
-                filename = Path(org_path).name
-            new_path = path_folder_images / filename
-            if not new_path.exists():
-                shutil.copy2(org_path, new_path)
-
-            if not new_path.exists():
-                raise FileNotFoundError(f"File {new_path} does not exist in the dataset.")
+            new_path = dataset_helpers.copy_and_rename_file_to_hash_value(
+                path_source=Path(org_path),
+                path_dataset_root=path_folder,
+            )
             new_relative_paths.append(str(new_path.relative_to(path_folder)))
-
         table = self.samples.with_columns(pl.Series(new_relative_paths).alias("file_name"))
         table.write_ndjson(path_folder / FILENAME_ANNOTATIONS_JSONL)  # Json for readability
         table.write_parquet(path_folder / FILENAME_ANNOTATIONS_PARQUET)  # Parquet for speed
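A minimal sketch of the write/load roundtrip with the new hash-based layout; the output path is a placeholder and the `load_dataset` import location is assumed:

from pathlib import Path

from hafnia.data import load_dataset  # assumed import location, as used in the examples
from hafnia.dataset.hafnia_dataset import HafniaDataset

dataset = load_dataset("midwest-vehicle-detection")

# write() copies every image to data/<hash[:3]>/<hash><suffix> and rewrites 'file_name'
# to the new relative paths before writing the annotation tables.
path_out = Path(".data/tmp/hafnia_dataset")
dataset.write(path_out)

# from_path() turns the relative 'file_name' entries back into absolute paths.
dataset_again = HafniaDataset.from_path(path_out)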
{hafnia-0.2.1 → hafnia-0.2.3}/src/hafnia/platform/datasets.py
@@ -2,6 +2,7 @@ import os
 import shutil
 import subprocess
 import tempfile
+import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
@@ -61,7 +62,12 @@ def download_or_get_dataset_path(
     dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
     if dataset_id is None:
         sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
-
+
+    if utils.is_hafnia_cloud_job():
+        credentials_endpoint_suffix = "temporary-credentials-hidden"  # Access to hidden datasets
+    else:
+        credentials_endpoint_suffix = "temporary-credentials"  # Access to sample dataset
+    access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/{credentials_endpoint_suffix}"
 
     download_dataset_from_access_endpoint(
         endpoint=access_dataset_endpoint,
@@ -80,7 +86,7 @@ def download_dataset_from_access_endpoint(
 ) -> None:
     resource_credentials = get_resource_credentials(endpoint, api_key)
 
-    local_dataset_paths = [
+    local_dataset_paths = [(path_dataset / filename).as_posix() for filename in DATASET_FILENAMES_REQUIRED]
     s3_uri = resource_credentials.s3_uri()
     s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]
 
@@ -94,7 +100,6 @@
 
     if not download_files:
         return
-
     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     fast_copy_files_s3(
         src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
@@ -124,8 +129,10 @@ def execute_s5cmd_commands(
     description: str = "Executing s5cmd commands",
 ) -> List[str]:
     append_envs = append_envs or {}
-
-
+    # In Windows default "Temp" directory can not be deleted that is why we need to create a
+    # temporary directory.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        tmp_file_path = Path(temp_dir, f"{uuid.uuid4().hex}.txt")
         tmp_file_path.write_text("\n".join(commands))
         run_cmds = [
             "s5cmd",
@@ -6,8 +6,8 @@ import numpy as np
 import pytest
 from PIL import Image
 
-from hafnia.helper_testing import get_path_expected_images
 from hafnia.visualizations import image_visualizations
+from tests.helper_testing import get_path_expected_images
 
 
 @pytest.fixture
{hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/dataset_recipe/test_dataset_recipes.py
@@ -1,4 +1,5 @@
 import inspect
+import json
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
@@ -9,13 +10,14 @@ import pytest
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
     FromMerger,
+    extract_dataset_names_from_json_dict,
     get_dataset_path_from_recipe,
 )
 from hafnia.dataset.dataset_recipe.recipe_transforms import SelectSamples, Shuffle
 from hafnia.dataset.dataset_recipe.recipe_types import RecipeCreation, RecipeTransform, Serializable
 from hafnia.dataset.hafnia_dataset import HafniaDataset
-from hafnia.helper_testing import annotation_as_string, is_hafnia_configured
 from hafnia.utils import pascal_to_snake_case
+from tests.helper_testing import annotation_as_string, is_hafnia_configured
 
 
 def get_data_recipe() -> DatasetRecipe:
@@ -171,7 +173,7 @@ class IntegrationTestUseCase:
     ),
     IntegrationTestUseCase(
         recipe=DatasetRecipe.from_path(path_folder=Path(".data/datasets/mnist"), check_for_images=False),
-        short_name="'.data
+        short_name="'.data-datasets-mnist'",
     ),
     IntegrationTestUseCase(
         recipe=DatasetRecipe.from_merger(
@@ -180,14 +182,14 @@ class IntegrationTestUseCase:
                 DatasetRecipe.from_path(path_folder=Path(".data/datasets/mnist"), check_for_images=False),
             ]
         ),
-        short_name="Merger(mnist,'.data
+        short_name="Merger(mnist,'.data-datasets-mnist')",
     ),
     IntegrationTestUseCase(
         recipe=DatasetRecipe.from_merge(
            recipe0=DatasetRecipe.from_path(path_folder=Path(".data/datasets/mnist"), check_for_images=False),
            recipe1=DatasetRecipe.from_name(name="mnist", force_redownload=False),
         ),
-        short_name="Merger('.data
+        short_name="Merger('.data-datasets-mnist',mnist)",
     ),
     IntegrationTestUseCase(
         recipe=DatasetRecipe.from_name(name="mnist", force_redownload=False)
@@ -258,3 +260,44 @@ def test_cases_integration_tests(recipe_use_case: IntegrationTestUseCase):
 
     assert isinstance(dataset, HafniaDataset), "Dataset is not an instance of HafniaDataset"
     # assert isinstance(dataset, HafniaDataset), "Dataset is not an instance of HafniaDataset"
+
+
+def test_get_dataset_names():
+    expected_dataset_names = {"dataset0", "dataset1", "dataset2", "dataset3", "dataset4", "dataset5", "dataset6"}
+    nested_recipe = DatasetRecipe.from_merger(
+        recipes=[
+            DatasetRecipe.from_merger(
+                recipes=[
+                    DatasetRecipe.from_name(name="dataset0"),
+                    DatasetRecipe.from_name(name="dataset1"),
+                    DatasetRecipe.from_merge(
+                        recipe0=DatasetRecipe.from_name(name="dataset2"),
+                        recipe1=DatasetRecipe.from_name(name="dataset3"),
+                    ),
+                ]
+            ),
+            DatasetRecipe.from_path(path_folder=Path(".data/datasets/mnist"))
+            .select_samples(n_samples=30)
+            .splits_by_ratios(split_ratios={"train": 0.8, "val": 0.1, "test": 0.1}),
+            DatasetRecipe.from_name(name="dataset4").select_samples(n_samples=20).shuffle(),
+            DatasetRecipe.from_merger(
+                recipes=[
+                    DatasetRecipe.from_name(name="dataset5"),
+                    DatasetRecipe.from_name(name="dataset6"),
+                ]
+            ),
+        ]
+    )
+
+    assert set(nested_recipe.get_dataset_names()) == expected_dataset_names, "Dataset names do not match expected names"
+
+    json_str = nested_recipe.as_json_str()
+    nested_recipe.as_json_file(path_json=Path("nested_recipe.json"))
+    data_dict = json.loads(json_str)
+
+    dataset_names = extract_dataset_names_from_json_dict(data_dict)
+    assert set(dataset_names) == expected_dataset_names, (
+        f"If this function fails, you should be concerned !! The '{extract_dataset_names_from_json_dict.__name__}' "
+        "function is copy/pasted to 'dipdatalib' to extract dataset names from json dictionaries directly. "
+        "If this test fails, please fix the function and copy/paste the function to dipdatalib as well."
+    )
{hafnia-0.2.1 → hafnia-0.2.3}/tests/dataset/dataset_recipe/test_recipe_transformations.py
@@ -15,7 +15,7 @@ from hafnia.dataset.dataset_recipe.recipe_transforms import (
 )
 from hafnia.dataset.dataset_recipe.recipe_types import RecipeTransform
 from hafnia.dataset.hafnia_dataset import HafniaDataset
-from
+from tests.helper_testing import get_micro_hafnia_dataset
 
 
 @dataclass