hafnia 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/build.yaml +1 -1
  2. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/check_release.yaml +1 -1
  3. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/ci_cd.yaml +1 -1
  4. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/lint.yaml +1 -1
  5. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/publish_docker.yaml +3 -3
  6. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/publish_pypi.yaml +1 -1
  7. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/tests.yaml +1 -1
  8. {hafnia-0.2.2 → hafnia-0.2.4}/PKG-INFO +1 -1
  9. {hafnia-0.2.2 → hafnia-0.2.4}/examples/example_dataset_recipe.py +15 -11
  10. {hafnia-0.2.2 → hafnia-0.2.4}/examples/example_hafnia_dataset.py +26 -16
  11. {hafnia-0.2.2 → hafnia-0.2.4}/pyproject.toml +1 -1
  12. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/__main__.py +6 -10
  13. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/config.py +19 -5
  14. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/profile_cmds.py +2 -1
  15. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_helpers.py +39 -6
  16. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_recipe/dataset_recipe.py +59 -1
  17. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_recipe/recipe_types.py +4 -0
  18. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/hafnia_dataset.py +5 -17
  19. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/platform/datasets.py +30 -19
  20. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/dataset_recipe/test_dataset_recipes.py +46 -3
  21. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/test_dataset_helpers.py +10 -5
  22. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_cli.py +38 -1
  23. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_samples.py +7 -0
  24. {hafnia-0.2.2 → hafnia-0.2.4}/uv.lock +911 -877
  25. {hafnia-0.2.2 → hafnia-0.2.4}/.devcontainer/devcontainer.json +0 -0
  26. {hafnia-0.2.2 → hafnia-0.2.4}/.devcontainer/hooks/post_create +0 -0
  27. {hafnia-0.2.2 → hafnia-0.2.4}/.github/dependabot.yaml +0 -0
  28. {hafnia-0.2.2 → hafnia-0.2.4}/.github/workflows/Dockerfile +0 -0
  29. {hafnia-0.2.2 → hafnia-0.2.4}/.gitignore +0 -0
  30. {hafnia-0.2.2 → hafnia-0.2.4}/.pre-commit-config.yaml +0 -0
  31. {hafnia-0.2.2 → hafnia-0.2.4}/.python-version +0 -0
  32. {hafnia-0.2.2 → hafnia-0.2.4}/.vscode/extensions.json +0 -0
  33. {hafnia-0.2.2 → hafnia-0.2.4}/.vscode/launch.json +0 -0
  34. {hafnia-0.2.2 → hafnia-0.2.4}/.vscode/settings.json +0 -0
  35. {hafnia-0.2.2 → hafnia-0.2.4}/LICENSE +0 -0
  36. {hafnia-0.2.2 → hafnia-0.2.4}/README.md +0 -0
  37. {hafnia-0.2.2 → hafnia-0.2.4}/docs/cli.md +0 -0
  38. {hafnia-0.2.2 → hafnia-0.2.4}/docs/release.md +0 -0
  39. {hafnia-0.2.2 → hafnia-0.2.4}/examples/example_logger.py +0 -0
  40. {hafnia-0.2.2 → hafnia-0.2.4}/examples/example_torchvision_dataloader.py +0 -0
  41. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/__init__.py +0 -0
  42. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/consts.py +0 -0
  43. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/dataset_cmds.py +0 -0
  44. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/experiment_cmds.py +0 -0
  45. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/recipe_cmds.py +0 -0
  46. {hafnia-0.2.2 → hafnia-0.2.4}/src/cli/runc_cmds.py +0 -0
  47. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/__init__.py +0 -0
  48. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/data/__init__.py +0 -0
  49. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/data/factory.py +0 -0
  50. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_names.py +0 -0
  51. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_recipe/recipe_transforms.py +0 -0
  52. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/dataset_upload_helper.py +0 -0
  53. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/operations/dataset_stats.py +0 -0
  54. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/operations/dataset_transformations.py +0 -0
  55. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/operations/table_transformations.py +0 -0
  56. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/__init__.py +0 -0
  57. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/bbox.py +0 -0
  58. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/bitmask.py +0 -0
  59. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/classification.py +0 -0
  60. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/point.py +0 -0
  61. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/polygon.py +0 -0
  62. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/primitive.py +0 -0
  63. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/segmentation.py +0 -0
  64. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/dataset/primitives/utils.py +0 -0
  65. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/experiment/__init__.py +0 -0
  66. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/experiment/hafnia_logger.py +0 -0
  67. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/http.py +0 -0
  68. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/log.py +0 -0
  69. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/platform/__init__.py +0 -0
  70. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/platform/builder.py +0 -0
  71. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/platform/download.py +0 -0
  72. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/platform/experiment.py +0 -0
  73. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/torch_helpers.py +0 -0
  74. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/utils.py +0 -0
  75. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/visualizations/colors.py +0 -0
  76. {hafnia-0.2.2 → hafnia-0.2.4}/src/hafnia/visualizations/image_visualizations.py +0 -0
  77. {hafnia-0.2.2 → hafnia-0.2.4}/tests/__init__.py +0 -0
  78. {hafnia-0.2.2 → hafnia-0.2.4}/tests/conftest.py +0 -0
  79. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[caltech-101].png +0 -0
  80. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[caltech-256].png +0 -0
  81. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[cifar100].png +0 -0
  82. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[cifar10].png +0 -0
  83. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[coco-2017].png +0 -0
  84. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[midwest-vehicle-detection].png +0 -0
  85. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[mnist].png +0 -0
  86. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_check_dataset[tiny-dataset].png +0 -0
  87. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-101].png +0 -0
  88. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-256].png +0 -0
  89. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar100].png +0 -0
  90. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar10].png +0 -0
  91. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[coco-2017].png +0 -0
  92. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[midwest-vehicle-detection].png +0 -0
  93. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[mnist].png +0 -0
  94. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[tiny-dataset].png +0 -0
  95. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_blur_anonymization[coco-2017].png +0 -0
  96. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_blur_anonymization[tiny-dataset].png +0 -0
  97. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_draw_annotations[coco-2017].png +0 -0
  98. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_draw_annotations[tiny-dataset].png +0 -0
  99. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_mask_region[coco-2017].png +0 -0
  100. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/expected_images/test_visualizations/test_mask_region[tiny-dataset].png +0 -0
  101. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/annotations.jsonl +0 -0
  102. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/annotations.parquet +0 -0
  103. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/data/182a2c0a3ce312cf.jpg +0 -0
  104. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/data/4e95c6eb6209880a.jpg +0 -0
  105. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/data/cf86c7a23edb55ce.jpg +0 -0
  106. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/coco-2017/dataset_info.json +0 -0
  107. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/annotations.jsonl +0 -0
  108. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/annotations.parquet +0 -0
  109. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/data/222bbd5721a8a86e.png +0 -0
  110. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/data/3251d85443622e4c.png +0 -0
  111. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/data/3657ababa44af9b6.png +0 -0
  112. {hafnia-0.2.2 → hafnia-0.2.4}/tests/data/micro_test_datasets/tiny-dataset/dataset_info.json +0 -0
  113. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/dataset_recipe/test_dataset_recipe_helpers.py +0 -0
  114. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/dataset_recipe/test_recipe_transformations.py +0 -0
  115. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/operations/test_dataset_transformations.py +0 -0
  116. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/operations/test_table_transformations.py +0 -0
  117. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/test_colors.py +0 -0
  118. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/test_hafnia_dataset.py +0 -0
  119. {hafnia-0.2.2 → hafnia-0.2.4}/tests/dataset/test_shape_primitives.py +0 -0
  120. {hafnia-0.2.2 → hafnia-0.2.4}/tests/helper_testing.py +0 -0
  121. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_builder.py +0 -0
  122. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_check_example_scripts.py +0 -0
  123. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_hafnia_logger.py +0 -0
  124. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_utils.py +0 -0
  125. {hafnia-0.2.2 → hafnia-0.2.4}/tests/test_visualizations.py +0 -0
@@ -17,7 +17,7 @@ jobs:
17
17
  outputs:
18
18
  package-version: ${{ steps.extract-version.outputs.package_version }}
19
19
  steps:
20
- - uses: actions/checkout@v4.2.2
20
+ - uses: actions/checkout@v5.0.0
21
21
  - uses: actions/setup-python@v5.6.0
22
22
  with:
23
23
  python-version-file: ${{ inputs.python-version-file }}
@@ -20,7 +20,7 @@ jobs:
20
20
  make_release: ${{ steps.check_release.outputs.make_release }}
21
21
  steps:
22
22
  - name: Download package artifact
23
- uses: actions/download-artifact@v4.3.0
23
+ uses: actions/download-artifact@v5.0.0
24
24
  with:
25
25
  name: python-package
26
26
  path: dist/
@@ -19,7 +19,7 @@ jobs:
19
19
  runs-on: ubuntu-latest
20
20
  needs: lint
21
21
  steps:
22
- - uses: actions/checkout@v4.2.2
22
+ - uses: actions/checkout@v5.0.0
23
23
  - name: Run Trivy vulnerability scanner
24
24
  uses: aquasecurity/trivy-action@0.32.0
25
25
  with:
@@ -10,7 +10,7 @@ jobs:
10
10
  lint:
11
11
  runs-on: ubuntu-latest
12
12
  steps:
13
- - uses: actions/checkout@v4.2.2
13
+ - uses: actions/checkout@v5.0.0
14
14
  - uses: actions/setup-python@v5.6.0
15
15
  with:
16
16
  python-version-file: ${{ inputs.python-version-file }}
@@ -24,14 +24,14 @@ jobs:
24
24
  build:
25
25
  runs-on: ubuntu-latest
26
26
  steps:
27
- - uses: actions/checkout@v4.2.2
27
+ - uses: actions/checkout@v5.0.0
28
28
  - uses: actions/setup-python@v5.6.0
29
29
  id: python
30
30
  with:
31
31
  python-version-file: ${{ inputs.python-version-file }}
32
32
 
33
33
  - name: Download package artifact
34
- uses: actions/download-artifact@v4.3.0
34
+ uses: actions/download-artifact@v5.0.0
35
35
  with:
36
36
  name: python-package
37
37
  path: dist/
@@ -47,7 +47,7 @@ jobs:
47
47
  echo "aws_region=${{ secrets.STAGE_AWS_REGION }}" >> $GITHUB_OUTPUT
48
48
  fi
49
49
  - name: Configure AWS credentials
50
- uses: aws-actions/configure-aws-credentials@v4.2.1
50
+ uses: aws-actions/configure-aws-credentials@v4.3.1
51
51
  with:
52
52
  role-to-assume: arn:aws:iam::${{ steps.env-vars.outputs.aws_account_id }}:role/${{ secrets.AWS_ROLE_NAME }}
53
53
  aws-region: ${{ steps.env-vars.outputs.aws_region }}
@@ -17,7 +17,7 @@ jobs:
17
17
  contents: read
18
18
  steps:
19
19
  - name: Download package artifact
20
- uses: actions/download-artifact@v4.3.0
20
+ uses: actions/download-artifact@v5.0.0
21
21
  with:
22
22
  name: python-package
23
23
  path: dist/
@@ -11,7 +11,7 @@ jobs:
11
11
  test:
12
12
  runs-on: ubuntu-latest
13
13
  steps:
14
- - uses: actions/checkout@v4.2.2
14
+ - uses: actions/checkout@v5.0.0
15
15
  - uses: actions/setup-python@v5.6.0
16
16
  with:
17
17
  python-version-file: ${{ inputs.python-version-file }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hafnia
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Python SDK for communication with Hafnia platform.
5
5
  Author-email: Milestone Systems <hafniaplatform@milestone.dk>
6
6
  License-File: LICENSE
@@ -29,20 +29,24 @@ dataset_recipe = DatasetRecipe.from_name(name="mnist").shuffle().select_samples(
29
29
  dataset = dataset_recipe.build()
30
30
  # Note that the interface is similar, but to actually create the dataset you need to call `build()` on the recipe.
31
31
 
32
- # An important feature of a 'DatasetRecipe' is that the recipe itself - and not the dataset - can be saved as a file
33
- # and loaded from file. Meaning you can easily save, share, load and build the dataset later or in a different
34
- # environment.
35
- # In programming language, the recipe can be serialized to JSON and deserialized back to the original python object
36
- # recipe.
32
+ # Unlike the HafniaDataset, a DatasetRecipe does not execute operations. It only registers
33
+ # the operations applied to the recipe and can be used to build the dataset later.
34
+ # You can print the dataset recipe to see the operations that were applied to it.
35
+ rprint(dataset_recipe)
36
+
37
+ # Or as a JSON string:
38
+ json_str: str = dataset_recipe.as_json_str()
39
+ rprint(json_str)
40
+
41
+ # An important feature of a 'DatasetRecipe' is that it only registers operations, and that the recipe itself
42
+ # - and not the dataset - can be saved as a file and loaded from file.
43
+ # Meaning you can easily save, share, load and build the dataset later or in a different environment.
37
44
  # For TaaS, this is the only way to include multiple datasets during training.
38
45
 
39
- # This is how it looks like in practice:
40
- # 1) Save the dataset recipe to a file
41
- path_json = Path(".data/tmp/dataset_recipe.json")
42
- dataset_recipe.as_json_file(path_json)
43
46
 
44
- # 2) The recipe can be loaded from the file
45
- dataset_recipe_again = DatasetRecipe.from_json_file(path_json)
47
+ # The recipe can be loaded back from a JSON string
48
+ dataset_recipe_again: DatasetRecipe = DatasetRecipe.from_json_str(json_str)
49
+ # dataset_recipe_again.build()
46
50
 
47
51
  # We can verify that the loaded recipe is the same as the original recipe.
48
52
  assert dataset_recipe_again == dataset_recipe
@@ -26,11 +26,10 @@ dataset = HafniaDataset.from_path(path_dataset)
26
26
  # Alternatively, you can use the 'load_dataset' function
27
27
  dataset = load_dataset("midwest-vehicle-detection")
28
28
 
29
-
30
29
  # Dataset information is stored in 'dataset.info'
31
30
  rprint(dataset.info)
32
31
 
33
- # Annotations are stored in 'dataset.table' as a Polars DataFrame
32
+ # Annotations are stored in 'dataset.samples' as a Polars DataFrame
34
33
  dataset.samples.head(2)
35
34
 
36
35
  # Print dataset information
@@ -49,14 +48,29 @@ shuffled_dataset = dataset.shuffle(seed=42) # Shuffle the dataset
49
48
  split_ratios = {SplitName.TRAIN: 0.8, SplitName.VAL: 0.1, SplitName.TEST: 0.1}
50
49
  new_dataset_splits = dataset.splits_by_ratios(split_ratios)
51
50
 
51
+ # Support Chaining Operations (load, shuffle, select samples)
52
+ dataset = load_dataset("midwest-vehicle-detection").shuffle(seed=42).select_samples(n_samples=10)
53
+
54
+
52
55
  # Write dataset to disk
53
56
  path_tmp = Path(".data/tmp")
54
57
  path_dataset = path_tmp / "hafnia_dataset"
55
- dataset.write(path_dataset) # --> Check that data is human readable
58
+ dataset.write(path_dataset)
56
59
 
57
60
  # Load dataset from disk
58
61
  dataset_again = HafniaDataset.from_path(path_dataset)
59
62
 
63
+
64
+ # Want custom dataset transformations or statistics? Use the polars table (dataset.samples) directly
65
+ n_objects = dataset.samples["objects"].list.len().sum()
66
+ n_objects = dataset.samples[Bbox.column_name()].list.len().sum() # Use Bbox.column_name() to avoid magic variables
67
+ n_classifications = dataset.samples[Classification.column_name()].list.len().sum()
68
+
69
+ class_counts = dataset.samples[Classification.column_name()].explode().struct.field("class_name").value_counts()
70
+ class_counts = dataset.samples[Bbox.column_name()].explode().struct.field("class_name").value_counts()
71
+ rprint(dict(class_counts.iter_rows()))
72
+
73
+
60
74
  # Access the first sample in the training split - data is stored in a dictionary
61
75
  sample_dict = dataset_train[0]
62
76
 
@@ -78,25 +92,15 @@ image: np.ndarray = sample.read_image()
78
92
  # Visualize sample and annotations
79
93
  image_with_annotations = sample.draw_annotations()
80
94
 
81
-
95
+ # Save the image with annotations to a temporary directory
82
96
  path_tmp.mkdir(parents=True, exist_ok=True)
83
97
  Image.fromarray(image_with_annotations).save(path_tmp / "sample_with_annotations.png")
84
98
 
85
99
 
86
- # Do dataset transformations and statistics on the Polars DataFrame
87
- n_objects = dataset.samples["objects"].list.len().sum()
88
- n_objects = dataset.samples[Bbox.column_name()].list.len().sum() # Use Bbox.column_name() to avoid magic variables
89
- n_classifications = dataset.samples[Classification.column_name()].list.len().sum()
90
-
91
- class_counts = dataset.samples[Classification.column_name()].explode().struct.field("class_name").value_counts()
92
- class_counts = dataset.samples[Bbox.column_name()].explode().struct.field("class_name").value_counts()
93
- rprint(dict(class_counts.iter_rows()))
94
-
95
-
96
100
  ## Bring-your-own-data: Create a new dataset from samples
97
101
  fake_samples = []
98
102
  for i_fake_sample in range(5):
99
- bboxes = [Bbox(top_left_x=10, top_left_y=20, width=100, height=200, class_name="car")]
103
+ bboxes = [Bbox(top_left_x=0.1, top_left_y=0.20, width=0.1, height=0.2, class_name="car")]
100
104
  classifications = [Classification(class_name="vehicle", class_idx=0)]
101
105
  sample = Sample(
102
106
  file_name=f"path/to/image_{i_fake_sample:05}.jpg",
@@ -120,8 +124,14 @@ fake_dataset_info = DatasetInfo(
120
124
  )
121
125
  fake_dataset = HafniaDataset.from_samples_list(samples_list=fake_samples, info=fake_dataset_info)
122
126
 
127
+ # Coming soon! Upload your dataset to the Hafnia Platform
128
+ # fake_dataset.upload_to_hafnia()
129
+
130
+ # Coming soon! Create your own dataset details page in Hafnia
131
+ # fake_dataset.upload_dataset_details()
123
132
 
124
- ## A hafnia dataset can also be used for storing predictions per sample set 'ground_truth=False' and add 'confidence'.
133
+ ## Storing predictions: A hafnia dataset can also be used for storing predictions per sample
134
+ # by setting 'ground_truth=False' and adding 'confidence'.
125
135
  bboxes_predictions = [
126
136
  Bbox(top_left_x=10, top_left_y=20, width=100, height=200, class_name="car", ground_truth=False, confidence=0.9)
127
137
  ]
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hafnia"
3
- version = "0.2.2"
3
+ version = "0.2.4"
4
4
  description = "Python SDK for communication with Hafnia platform."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -20,19 +20,15 @@ def configure(cfg: Config) -> None:
20
20
 
21
21
  profile_name = click.prompt("Profile Name", type=str, default=consts.DEFAULT_PROFILE_NAME)
22
22
  profile_name = profile_name.strip()
23
- try:
24
- cfg.add_profile(profile_name, ConfigSchema(), set_active=True)
25
- except ValueError:
26
- raise click.ClickException(consts.ERROR_CREATE_PROFILE)
23
+
24
+ cfg.check_profile_name(profile_name)
27
25
 
28
26
  api_key = click.prompt("Hafnia API Key", type=str, hide_input=True)
29
- try:
30
- cfg.api_key = api_key.strip()
31
- except ValueError as e:
32
- click.echo(f"Error: {str(e)}", err=True)
33
- return
27
+
34
28
  platform_url = click.prompt("Hafnia Platform URL", type=str, default=consts.DEFAULT_API_URL)
35
- cfg.platform_url = platform_url.strip()
29
+
30
+ cfg_profile = ConfigSchema(api_key=api_key, platform_url=platform_url)
31
+ cfg.add_profile(profile_name, cfg_profile, set_active=True)
36
32
  cfg.save_config()
37
33
  profile_cmds.profile_show(cfg)
38
34
 
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional
6
6
  from pydantic import BaseModel, field_validator
7
7
 
8
8
  import cli.consts as consts
9
- from hafnia.log import user_logger
9
+ from hafnia.log import sys_logger, user_logger
10
10
 
11
11
  PLATFORM_API_MAPPING = {
12
12
  "recipes": "/api/v1/recipes",
@@ -23,9 +23,17 @@ class ConfigSchema(BaseModel):
23
23
  api_key: Optional[str] = None
24
24
 
25
25
  @field_validator("api_key")
26
- def validate_api_key(cls, value: str) -> str:
27
- if value is not None and len(value) < 10:
26
+ def validate_api_key(cls, value: Optional[str]) -> Optional[str]:
27
+ if value is None:
28
+ return value
29
+
30
+ if len(value) < 10:
28
31
  raise ValueError("API key is too short.")
32
+
33
+ if not value.startswith("ApiKey "):
34
+ sys_logger.warning("API key is missing the 'ApiKey ' prefix. Prefix is being added automatically.")
35
+ value = f"ApiKey {value}"
36
+
29
37
  return value
30
38
 
31
39
 
@@ -51,6 +59,7 @@ class Config:
51
59
  if profile_name not in self.config_data.profiles:
52
60
  raise ValueError(f"Profile '{profile_name}' does not exist.")
53
61
  self.config_data.active_profile = profile_name
62
+ self.save_config()
54
63
 
55
64
  @property
56
65
  def config(self) -> ConfigSchema:
@@ -92,13 +101,18 @@ class Config:
92
101
 
93
102
  return Path.home() / ".hafnia" / "config.json"
94
103
 
95
- def add_profile(self, profile_name: str, profile: ConfigSchema, set_active: bool = False) -> None:
96
- profile_name = profile_name.strip()
104
+ def check_profile_name(self, profile_name: str) -> None:
105
+ if not profile_name or not isinstance(profile_name, str):
106
+ raise ValueError("Profile name must be a non-empty string.")
107
+
97
108
  if profile_name in self.config_data.profiles:
98
109
  user_logger.warning(
99
110
  f"Profile with name '{profile_name}' already exists, it will be overwritten by the new one."
100
111
  )
101
112
 
113
+ def add_profile(self, profile_name: str, profile: ConfigSchema, set_active: bool = False) -> None:
114
+ profile_name = profile_name.strip()
115
+ self.check_profile_name(profile_name)
102
116
  self.config_data.profiles[profile_name] = profile
103
117
  if set_active:
104
118
  self.config_data.active_profile = profile_name
@@ -56,6 +56,7 @@ def profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate:
56
56
  cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key)
57
57
 
58
58
  cfg.add_profile(profile_name=name, profile=cfg_profile, set_active=activate)
59
+ profile_show(cfg)
59
60
 
60
61
 
61
62
  @profile.command("rm")
@@ -87,7 +88,7 @@ def profile_active(cfg: Config) -> None:
87
88
 
88
89
 
89
90
  def profile_show(cfg: Config) -> None:
90
- masked_key = f"{cfg.api_key[:4]}...{cfg.api_key[-4:]}" if len(cfg.api_key) > 8 else "****"
91
+ masked_key = f"{cfg.api_key[:11]}...{cfg.api_key[-4:]}" if len(cfg.api_key) > 20 else "****"
91
92
  console = Console()
92
93
 
93
94
  table = Table(title=f"{consts.PROFILE_TABLE_HEADER} {cfg.active_profile}", show_header=False)
@@ -1,6 +1,7 @@
1
1
  import io
2
2
  import math
3
3
  import random
4
+ import shutil
4
5
  from pathlib import Path
5
6
  from typing import Dict, List
6
7
 
@@ -21,7 +22,7 @@ def create_split_name_list_from_ratios(split_ratios: Dict[str, float], n_items:
21
22
 
22
23
 
23
24
  def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
24
- hasher = xxhash.xxh3_64()
25
+ hasher = xxhash.xxh3_128()
25
26
 
26
27
  with open(path, "rb") as f:
27
28
  for chunk in iter(lambda: f.read(chunk_size), b""): # 8192, 16384, 32768, 65536
@@ -30,7 +31,7 @@ def hash_file_xxhash(path: Path, chunk_size: int = 262144) -> str:
30
31
 
31
32
 
32
33
  def hash_from_bytes(data: bytes) -> str:
33
- hasher = xxhash.xxh3_64()
34
+ hasher = xxhash.xxh3_128()
34
35
  hasher.update(data)
35
36
  return hasher.hexdigest()
36
37
 
@@ -40,14 +41,46 @@ def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
40
41
  buffer = io.BytesIO()
41
42
  pil_image.save(buffer, format="PNG")
42
43
  hash_value = hash_from_bytes(buffer.getvalue())
43
- path_image = Path(path_folder) / f"{hash_value}.png"
44
+ path_image = Path(path_folder) / relative_path_from_hash(hash=hash_value, suffix=".png")
45
+ path_image.parent.mkdir(parents=True, exist_ok=True)
44
46
  pil_image.save(path_image)
45
47
  return path_image
46
48
 
47
49
 
48
- def filename_as_hash_from_path(path_image: Path) -> str:
49
- hash = hash_file_xxhash(path_image)
50
- return f"{hash}{path_image.suffix}"
50
+ def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
51
+ """
52
+ Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
53
+
54
+ E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
55
+ 'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
56
+ Notice that the hash is used for both the filename and the subfolder name.
57
+
58
+ Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
59
+ unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
60
+
61
+ The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
62
+ download 3500 files per second from a single folder (prefix) in S3.
63
+
64
+ For even a single user, we found that this limit was reached when files were stored in a single folder (prefix)
65
+ in S3. To support multiple users and concurrent experiments, we are required to separate files into
66
+ multiple sub-folders (prefixes) in S3 to not hit the rate limit.
67
+ """
68
+
69
+ if not path_source.exists():
70
+ raise FileNotFoundError(f"Source file {path_source} does not exist.")
71
+
72
+ hash_value = hash_file_xxhash(path_source)
73
+ path_file = path_dataset_root / relative_path_from_hash(hash=hash_value, suffix=path_source.suffix)
74
+ path_file.parent.mkdir(parents=True, exist_ok=True)
75
+ if not path_file.exists():
76
+ shutil.copy2(path_source, path_file)
77
+
78
+ return path_file
79
+
80
+
81
+ def relative_path_from_hash(hash: str, suffix: str) -> Path:
82
+ path_file = Path("data") / hash[:3] / f"{hash}{suffix}"
83
+ return path_file
51
84
 
52
85
 
53
86
  def split_sizes_from_ratios(n_items: int, split_ratios: Dict[str, float]) -> Dict[str, int]:
@@ -216,6 +216,16 @@ class DatasetRecipe(Serializable):
216
216
  json_str = self.as_json_str(indent=indent)
217
217
  path_json.write_text(json_str, encoding="utf-8")
218
218
 
219
+ ### Helper methods ###
220
+ def get_dataset_names(self) -> List[str]:
221
+ """
222
+ Get all dataset names added with 'from_name'.
223
+ Function recursively gathers dataset names.
224
+ """
225
+ if self.creation is None:
226
+ return []
227
+ return self.creation.get_dataset_names()
228
+
219
229
  ### Validation and Serialization ###
220
230
  @field_validator("creation", mode="plain")
221
231
  @classmethod
@@ -282,7 +292,10 @@ class FromPath(RecipeCreation):
282
292
  return HafniaDataset.from_path
283
293
 
284
294
  def as_short_name(self) -> str:
285
- return f"'{self.path_folder}'".replace(os.sep, "|")
295
+ return f"'{self.path_folder}'".replace(os.sep, "-")
296
+
297
+ def get_dataset_names(self) -> List[str]:
298
+ return [] # Only counts 'from_name' datasets
286
299
 
287
300
 
288
301
  class FromName(RecipeCreation):
@@ -297,6 +310,9 @@ class FromName(RecipeCreation):
297
310
  def as_short_name(self) -> str:
298
311
  return self.name
299
312
 
313
+ def get_dataset_names(self) -> List[str]:
314
+ return [self.name]
315
+
300
316
 
301
317
  class FromMerge(RecipeCreation):
302
318
  recipe0: DatasetRecipe
@@ -310,6 +326,11 @@ class FromMerge(RecipeCreation):
310
326
  merger = FromMerger(recipes=[self.recipe0, self.recipe1])
311
327
  return merger.as_short_name()
312
328
 
329
+ def get_dataset_names(self) -> List[str]:
330
+ """Get the dataset names from the merged recipes."""
331
+ names = [*self.recipe0.creation.get_dataset_names(), *self.recipe1.creation.get_dataset_names()]
332
+ return names
333
+
313
334
 
314
335
  class FromMerger(RecipeCreation):
315
336
  recipes: List[DatasetRecipe]
@@ -325,3 +346,40 @@ class FromMerger(RecipeCreation):
325
346
 
326
347
  def as_short_name(self) -> str:
327
348
  return f"Merger({','.join(recipe.as_short_name() for recipe in self.recipes)})"
349
+
350
+ def get_dataset_names(self) -> List[str]:
351
+ """Get the dataset names from the merged recipes."""
352
+ names = []
353
+ for recipe in self.recipes:
354
+ names.extend(recipe.creation.get_dataset_names())
355
+ return names
356
+
357
+
358
+ def extract_dataset_names_from_json_dict(data: dict) -> list[str]:
359
+ """
360
+ Extract dataset names recursively from a JSON dictionary added with 'from_name'.
361
+
362
+ Even if the same functionality is achieved with `DatasetRecipe.get_dataset_names()`,
363
+ we want to keep this function in 'dipdatalib' to extract dataset names from json dictionaries
364
+ directly.
365
+ """
366
+ creation_field = data.get("creation")
367
+ if creation_field is None:
368
+ return []
369
+ if creation_field.get("__type__") == "FromName":
370
+ return [creation_field["name"]]
371
+ elif creation_field.get("__type__") == "FromMerge":
372
+ recipe_names = ["recipe0", "recipe1"]
373
+ dataset_name = []
374
+ for recipe_name in recipe_names:
375
+ recipe = creation_field.get(recipe_name)
376
+ if recipe is None:
377
+ continue
378
+ dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
379
+ return dataset_name
380
+ elif creation_field.get("__type__") == "FromMerger":
381
+ dataset_name = []
382
+ for recipe in creation_field.get("recipes", []):
383
+ dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
384
+ return dataset_name
385
+ return []
@@ -108,6 +108,10 @@ class RecipeCreation(Serializable):
108
108
  def get_function() -> Callable[..., "HafniaDataset"]:
109
109
  pass
110
110
 
111
+ @abstractmethod
112
+ def get_dataset_names(self) -> List[str]:
113
+ pass
114
+
111
115
  def build(self) -> "HafniaDataset":
112
116
  from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe
113
117
 
@@ -411,30 +411,18 @@ class HafniaDataset:
411
411
 
412
412
  return True
413
413
 
414
- def write(self, path_folder: Path, name_by_hash: bool = True, add_version: bool = False) -> None:
414
+ def write(self, path_folder: Path, add_version: bool = False) -> None:
415
415
  user_logger.info(f"Writing dataset to {path_folder}...")
416
416
  if not path_folder.exists():
417
417
  path_folder.mkdir(parents=True)
418
- path_folder_images = path_folder / "data"
419
- path_folder_images.mkdir(parents=True, exist_ok=True)
420
418
 
421
419
  new_relative_paths = []
422
420
  for org_path in tqdm(self.samples["file_name"].to_list(), desc="- Copy images"):
423
- org_path = Path(org_path)
424
- if not org_path.exists():
425
- raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
426
- if name_by_hash:
427
- filename = dataset_helpers.filename_as_hash_from_path(org_path)
428
- else:
429
- filename = Path(org_path).name
430
- new_path = path_folder_images / filename
431
- if not new_path.exists():
432
- shutil.copy2(org_path, new_path)
433
-
434
- if not new_path.exists():
435
- raise FileNotFoundError(f"File {new_path} does not exist in the dataset.")
421
+ new_path = dataset_helpers.copy_and_rename_file_to_hash_value(
422
+ path_source=Path(org_path),
423
+ path_dataset_root=path_folder,
424
+ )
436
425
  new_relative_paths.append(str(new_path.relative_to(path_folder)))
437
-
438
426
  table = self.samples.with_columns(pl.Series(new_relative_paths).alias("file_name"))
439
427
  table.write_ndjson(path_folder / FILENAME_ANNOTATIONS_JSONL) # Json for readability
440
428
  table.write_parquet(path_folder / FILENAME_ANNOTATIONS_PARQUET) # Parquet for speed
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  import shutil
3
3
  import subprocess
4
+ import sys
4
5
  import tempfile
5
6
  import uuid
6
7
  from pathlib import Path
@@ -62,7 +63,12 @@ def download_or_get_dataset_path(
62
63
  dataset_id = get_dataset_id(dataset_name=dataset_name, endpoint=endpoint_dataset, api_key=api_key)
63
64
  if dataset_id is None:
64
65
  sys_logger.error(f"Dataset '{dataset_name}' not found on the Hafnia platform.")
65
- access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/temporary-credentials"
66
+
67
+ if utils.is_hafnia_cloud_job():
68
+ credentials_endpoint_suffix = "temporary-credentials-hidden" # Access to hidden datasets
69
+ else:
70
+ credentials_endpoint_suffix = "temporary-credentials" # Access to sample dataset
71
+ access_dataset_endpoint = f"{endpoint_dataset}/{dataset_id}/{credentials_endpoint_suffix}"
66
72
 
67
73
  download_dataset_from_access_endpoint(
68
74
  endpoint=access_dataset_endpoint,
@@ -86,22 +92,30 @@ def download_dataset_from_access_endpoint(
86
92
  s3_dataset_files = [f"{s3_uri}/{filename}" for filename in DATASET_FILENAMES_REQUIRED]
87
93
 
88
94
  envs = resource_credentials.aws_credentials()
89
- fast_copy_files_s3(
90
- src_paths=s3_dataset_files,
91
- dst_paths=local_dataset_paths,
92
- append_envs=envs,
93
- description="Downloading annotations",
94
- )
95
+ try:
96
+ fast_copy_files_s3(
97
+ src_paths=s3_dataset_files,
98
+ dst_paths=local_dataset_paths,
99
+ append_envs=envs,
100
+ description="Downloading annotations",
101
+ )
102
+ except ValueError as e:
103
+ user_logger.error(f"Failed to download annotations: {e}")
104
+ return
95
105
 
96
106
  if not download_files:
97
107
  return
98
108
  dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
99
- fast_copy_files_s3(
100
- src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
101
- dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
102
- append_envs=envs,
103
- description="Downloading images",
104
- )
109
+ try:
110
+ fast_copy_files_s3(
111
+ src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
112
+ dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
113
+ append_envs=envs,
114
+ description="Downloading images",
115
+ )
116
+ except ValueError as e:
117
+ user_logger.error(f"Failed to download images: {e}")
118
+ return
105
119
 
106
120
 
107
121
  def fast_copy_files_s3(
@@ -112,7 +126,6 @@ def fast_copy_files_s3(
112
126
  ) -> List[str]:
113
127
  if len(src_paths) != len(dst_paths):
114
128
  raise ValueError("Source and destination paths must have the same length.")
115
-
116
129
  cmds = [f"cp {src} {dst}" for src, dst in zip(src_paths, dst_paths)]
117
130
  lines = execute_s5cmd_commands(cmds, append_envs=append_envs, description=description)
118
131
  return lines
@@ -129,11 +142,9 @@ def execute_s5cmd_commands(
129
142
  with tempfile.TemporaryDirectory() as temp_dir:
130
143
  tmp_file_path = Path(temp_dir, f"{uuid.uuid4().hex}.txt")
131
144
  tmp_file_path.write_text("\n".join(commands))
132
- run_cmds = [
133
- "s5cmd",
134
- "run",
135
- str(tmp_file_path),
136
- ]
145
+ s5cmd_bin = (Path(sys.executable).parent / "s5cmd").absolute().as_posix()
146
+ run_cmds = [s5cmd_bin, "run", str(tmp_file_path)]
147
+ sys_logger.debug(run_cmds)
137
148
  envs = os.environ.copy()
138
149
  envs.update(append_envs)
139
150