hafnia 0.1.24__tar.gz → 0.1.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/ci_cd.yaml +1 -1
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/publish_docker.yaml +2 -2
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/tests.yaml +1 -1
- {hafnia-0.1.24 → hafnia-0.1.26}/.gitignore +1 -1
- hafnia-0.1.26/.pre-commit-config.yaml +32 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.vscode/launch.json +26 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/LICENSE +1 -1
- hafnia-0.1.26/PKG-INFO +363 -0
- hafnia-0.1.26/README.md +342 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/docs/cli.md +5 -3
- {hafnia-0.1.24 → hafnia-0.1.26}/examples/dataset_builder.py +1 -2
- hafnia-0.1.26/examples/example_load_dataset.py +14 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/pyproject.toml +32 -23
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/__main__.py +6 -10
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/config.py +20 -27
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/consts.py +4 -1
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/data_cmds.py +5 -6
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/experiment_cmds.py +3 -43
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/profile_cmds.py +16 -2
- hafnia-0.1.26/src/cli/recipe_cmds.py +45 -0
- hafnia-0.1.26/src/cli/runc_cmds.py +144 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/data/factory.py +13 -32
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/experiment/hafnia_logger.py +11 -7
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/http.py +2 -2
- hafnia-0.1.26/src/hafnia/log.py +23 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/platform/__init__.py +0 -2
- hafnia-0.1.26/src/hafnia/platform/builder.py +144 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/platform/download.py +8 -8
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/platform/experiment.py +31 -25
- hafnia-0.1.26/src/hafnia/utils.py +137 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/tests/test_builder.py +26 -43
- {hafnia-0.1.24 → hafnia-0.1.26}/tests/test_check_example_scripts.py +1 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/tests/test_cli.py +8 -28
- hafnia-0.1.24/tests/test_mdi_logger.py → hafnia-0.1.26/tests/test_hafnia_logger.py +13 -6
- {hafnia-0.1.24 → hafnia-0.1.26}/tests/test_samples.py +4 -0
- hafnia-0.1.26/tests/test_utils.py +84 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/uv.lock +1264 -1224
- hafnia-0.1.24/.pre-commit-config.yaml +0 -16
- hafnia-0.1.24/PKG-INFO +0 -197
- hafnia-0.1.24/README.md +0 -175
- hafnia-0.1.24/docs/s2m.md +0 -84
- hafnia-0.1.24/examples/example_load_dataset.py +0 -4
- hafnia-0.1.24/examples/script2model/pytorch/Dockerfile +0 -10
- hafnia-0.1.24/examples/script2model/pytorch/src/lib/train_utils.py +0 -252
- hafnia-0.1.24/examples/script2model/pytorch/src/scripts/train.py +0 -60
- hafnia-0.1.24/src/cli/runc_cmds.py +0 -68
- hafnia-0.1.24/src/hafnia/log.py +0 -32
- hafnia-0.1.24/src/hafnia/platform/api.py +0 -12
- hafnia-0.1.24/src/hafnia/platform/builder.py +0 -184
- hafnia-0.1.24/src/hafnia/platform/executor.py +0 -111
- hafnia-0.1.24/src/hafnia/utils.py +0 -83
- hafnia-0.1.24/tests/test_executor.py +0 -84
- {hafnia-0.1.24 → hafnia-0.1.26}/.devcontainer/devcontainer.json +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.devcontainer/hooks/post_create +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/dependabot.yaml +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/Dockerfile +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/build.yaml +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/check_release.yaml +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/lint.yaml +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.github/workflows/publish_pypi.yaml +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.python-version +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.vscode/extensions.json +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/.vscode/settings.json +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/docs/release.md +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/examples/example_logger.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/examples/example_torchvision_dataloader.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/cli/__init__.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/__init__.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/data/__init__.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/experiment/__init__.py +0 -0
- {hafnia-0.1.24 → hafnia-0.1.26}/src/hafnia/torch_helpers.py +0 -0
Workflow changes (GitHub Actions version bumps):

```diff
@@ -47,7 +47,7 @@ jobs:
             echo "aws_region=${{ secrets.STAGE_AWS_REGION }}" >> $GITHUB_OUTPUT
           fi
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4.1
+        uses: aws-actions/configure-aws-credentials@v4.2.1
         with:
           role-to-assume: arn:aws:iam::${{ steps.env-vars.outputs.aws_account_id }}:role/${{ secrets.AWS_ROLE_NAME }}
           aws-region: ${{ steps.env-vars.outputs.aws_region }}
@@ -60,7 +60,7 @@ jobs:
         uses: docker/setup-buildx-action@v3.10.0
 
       - name: Build and push
-        uses: docker/build-push-action@v6.
+        uses: docker/build-push-action@v6.18.0
         env:
           ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
           ECR_REPOSITORY: mdi-runtime
```
hafnia-0.1.26/.pre-commit-config.yaml (new file, +32 lines):

```yaml
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.13
    hooks:
      - id: ruff-check
        types_or: [python, pyi]
        args: [--show-fixes]
        files: ^(src|tests)/
      - id: ruff-format
        types_or: [python, pyi]
        files: ^(src|tests)/
  - repo: https://github.com/astral-sh/uv-pre-commit
    rev: 0.7.13
    hooks:
      # Update the uv lockfile
      - id: uv-lock
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.16.0
    hooks:
      - id: mypy
        args: ["src/", "tests/"]
        pass_filenames: false
  - repo: local
    hooks:
      - id: pytest
        stages: [manual]
        name: pytest
        entry: pytest
        language: system
        types: [python]
        pass_filenames: false
        args: ["-m", "not slow", "--tb=short", "-q"]
```
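For context, these hooks are driven with the standard `pre-commit` CLI; the commands below are a usage sketch (not part of the diff) and assume `pre-commit` is installed in the development environment. Note that the local `pytest` hook is restricted to the manual stage in the config above, so it has to be invoked explicitly.

```bash
# Install the git hook scripts defined in .pre-commit-config.yaml
pre-commit install

# Run the ruff, uv-lock and mypy hooks against the whole repository
pre-commit run --all-files

# The local pytest hook is staged as "manual", so run it on demand
pre-commit run pytest --hook-stage manual
```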
hafnia-0.1.26/.vscode/launch.json (+26 lines, two new debug configurations):

```diff
@@ -21,6 +21,32 @@
                 "ls"
             ],
         },
+        {
+            "name": "cmd: hafnia runc launch-local",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/cli/__main__.py",
+            "args": [
+                "runc",
+                "launch-local",
+                "--dataset",
+                "midwest-vehicle-detection-tiny",
+                "train --config-name yolov4-hafnia.yaml"
+            ],
+        },
+        {
+            "name": "cmd: hafnia runc build-local",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/cli/__main__.py",
+            "args": [
+                "runc",
+                "build-local",
+                "train",
+                "--dataset",
+                "mnist",
+            ],
+        },
         {
             "name": "debug (hafnia data download mnist)",
             "type": "debugpy",
```
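For reference, the two debug configurations above mirror CLI invocations roughly like the following; this is a sketch that assumes the package is installed with its `hafnia` entry point, with the dataset and recipe arguments taken verbatim from the launch configuration:

```bash
# Equivalent of the "cmd: hafnia runc launch-local" configuration
hafnia runc launch-local --dataset midwest-vehicle-detection-tiny "train --config-name yolov4-hafnia.yaml"

# Equivalent of the "cmd: hafnia runc build-local" configuration
hafnia runc build-local train --dataset mnist
```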
hafnia-0.1.26/PKG-INFO (new file, +363 lines):

```
Metadata-Version: 2.4
Name: hafnia
Version: 0.1.26
Summary: Python SDK for communication with Hafnia platform.
Author-email: Milestone Systems <hafniaplatform@milestone.dk>
License-File: LICENSE
Requires-Python: >=3.10
Requires-Dist: boto3>=1.35.91
Requires-Dist: click>=8.1.8
Requires-Dist: datasets>=3.2.0
Requires-Dist: emoji>=2.14.1
Requires-Dist: flatten-dict>=0.4.2
Requires-Dist: pathspec>=0.12.1
Requires-Dist: pillow>=11.1.0
Requires-Dist: pyarrow>=18.1.0
Requires-Dist: pydantic>=2.10.4
Requires-Dist: rich>=13.9.4
Requires-Dist: seedir>=0.5.0
Requires-Dist: tqdm>=4.67.1
Description-Content-Type: text/markdown
```

# Hafnia

The `hafnia` python package is a collection of tools to create and run model training recipes on
the [Hafnia Platform](https://hafnia.milestonesys.com/).

The package includes the following interfaces (example commands follow the list):

- `cli`: A Command Line Interface (CLI) to 1) configure/connect to Hafnia's [Training-aaS](https://hafnia.readme.io/docs/training-as-a-service) and 2) create and
  launch recipe scripts.
- `hafnia`: A python package with helper functions to load and interact with sample datasets and an experiment
  tracker (`HafniaLogger`).

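As a quick orientation, the CLI commands referenced throughout this README fall into a handful of command groups; the listing below is a sketch collecting commands that appear later in this document (configuration, data download and recipe handling), not an exhaustive reference:

```bash
# Configure access to the Hafnia platform (prompts for profile, API key and URL)
hafnia configure

# Download a sample dataset to verify the setup
hafnia data download mnist --force

# Package a training recipe and build/run it locally
hafnia recipe create .
hafnia runc build-local recipe.zip
hafnia runc launch-local --dataset mnist "python scripts/train.py"
```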
## The Concept: Training as a Service (Training-aaS)

`Training-aaS` is the concept of training models on the Hafnia platform on large
and *hidden* datasets. Hidden datasets refer to datasets that can be used for
training, but are not available for download or direct access.

This is a key feature of the Hafnia platform, as a hidden dataset ensures data
privacy and allows models to be trained compliantly and ethically by third parties (you).

The `script2model` approach is a Training-aaS concept, where you package your custom training
script as a *training recipe* and use the recipe to train models on the hidden datasets.

To support local development of a training recipe, we have introduced a **sample dataset**
for each dataset available in the Hafnia [data library](https://hafnia.milestonesys.com/training-aas/datasets). The sample dataset is a small
and anonymized subset of the full dataset and is available for download.

With the sample dataset, you can seamlessly switch between local development and Training-aaS.
Locally, you can create, validate and debug your training recipe. The recipe is then
launched with Training-aaS, where the recipe runs on the full dataset and can be scaled to run on
multiple GPUs and instances if needed.

## Getting started: Configuration

To get started with Hafnia:

1. Install `hafnia` with your favorite python package manager. With pip:

   `pip install hafnia`
1. Sign in to the [Hafnia Platform](https://hafnia.milestonesys.com/).
1. Create an API key for Training-aaS. For more instructions, follow this
   [guide](https://hafnia.readme.io/docs/create-an-api-key).
   Copy the key and save it for later use.
1. From a terminal, configure your machine to access Hafnia:

   ```
   # Start configuration with
   hafnia configure

   # You are then prompted:
   Profile Name [default]: # Press [Enter] or select an optional name
   Hafnia API Key: # Paste your Hafnia API key
   Hafnia Platform URL [https://api.mdi.milestonesys.com]: # Press [Enter]
   ```
1. Download `mnist` from the terminal to verify that your configuration is working.

   ```bash
   hafnia data download mnist --force
   ```

## Getting started: Loading dataset samples

With Hafnia configured on your local machine, you can download
and explore a dataset sample with a python script:

```python
from hafnia.data import load_dataset

dataset_splits = load_dataset("mnist")
```

### Dataset Format

The returned sample dataset is a [Hugging Face dataset](https://huggingface.co/docs/datasets/index)
and contains train, validation and test splits.

```python
print(dataset_splits)

# Output:
>>> DatasetDict({
    train: Dataset({
        features: ['image_id', 'image', 'height', 'width', 'objects', 'Weather', 'Surface Conditions'],
        num_rows: 172
    })
    validation: Dataset({
        features: ['image_id', 'image', 'height', 'width', 'objects', 'Weather', 'Surface Conditions'],
        num_rows: 21
    })
    test: Dataset({
        features: ['image_id', 'image', 'height', 'width', 'objects', 'Weather', 'Surface Conditions'],
        num_rows: 21
    })
})
```

A Hugging Face dataset is a dictionary with splits, where each split is a `Dataset` object.
Each `Dataset` is structured as a table with a set of columns (also called features) and a row for each sample.

The features of the dataset can be viewed with the `features` attribute.

```python
import pprint

# View features of the train split
pprint.pprint(dataset_splits["train"].features)
{'Surface Conditions': ClassLabel(names=['Dry', 'Wet'], id=None),
 'Weather': ClassLabel(names=['Clear', 'Foggy'], id=None),
 'height': Value(dtype='int64', id=None),
 'image': Image(mode=None, decode=True, id=None),
 'image_id': Value(dtype='int64', id=None),
 'objects': Sequence(feature={'bbox': Sequence(feature=Value(dtype='int64', id=None),
                                               length=-1,
                                               id=None),
                              'class_idx': ClassLabel(names=['Vehicle.Bicycle',
                                                             'Vehicle.Motorcycle',
                                                             'Vehicle.Car',
                                                             'Vehicle.Van',
                                                             'Vehicle.RV',
                                                             'Vehicle.Single_Truck',
                                                             'Vehicle.Combo_Truck',
                                                             'Vehicle.Pickup_Truck',
                                                             'Vehicle.Trailer',
                                                             'Vehicle.Emergency_Vehicle',
                                                             'Vehicle.Bus',
                                                             'Vehicle.Heavy_Duty_Vehicle'],
                                                      id=None),
                              'class_name': Value(dtype='string', id=None),
                              'id': Value(dtype='string', id=None)},
                     length=-1,
                     id=None),
 'width': Value(dtype='int64', id=None)}
```

View the first sample in the training set:

```python
# Print a sample from the training set
pprint.pprint(dataset_splits["train"][0])

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1920x1080 at 0x79D6292C5ED0>,
 'image_id': 4920,
 'height': 1080,
 'Weather': 0,
 'Surface Conditions': 0,
 'objects': {'bbox': [[441, 180, 121, 126],
                      [549, 151, 131, 103],
                      [1845, 722, 68, 130],
                      [1810, 571, 110, 149]],
             'class_idx': [7, 7, 2, 2],
             'class_name': ['Vehicle.Pickup_Truck',
                            'Vehicle.Pickup_Truck',
                            'Vehicle.Car',
                            'Vehicle.Car'],
             'id': ['HW6WiLAJ', 'T/ccFpRi', 'CS0O8B6W', 'DKrJGzjp']},
 'width': 1920}
```

For Hafnia-based datasets, we want to standardize how a dataset and its tasks are represented.
We have defined a set of features that are common across all datasets in the Hafnia data library
(a short access example follows the list):

- `image`: The image itself, stored as a PIL image
- `height`: The height of the image in pixels
- `width`: The width of the image in pixels
- `[IMAGE_CLASSIFICATION_TASK]`: [Optional] Image classification tasks are top-level `ClassLabel` features.
  `ClassLabel` is a Hugging Face feature that maps class indices to class names.
  In the above example we have two classification tasks:
  - `Weather`: Classifies the weather conditions in the image, with possible values `Clear` and `Foggy`
  - `Surface Conditions`: Classifies the surface conditions in the image, with possible values `Dry` and `Wet`
- `objects`: A dictionary containing information about objects in the image, including:
  - `bbox`: Bounding boxes for each object, represented as a list of bounding box coordinates
    `[xmin, ymin, bbox_width, bbox_height]`. Each bounding box is defined by a top-left corner coordinate
    `(xmin, ymin)` and a bounding box width and height `(bbox_width, bbox_height)` in pixels.
  - `class_idx`: Class indices for each detected object. This is a
    `ClassLabel` feature that maps to the `class_name` feature.
  - `class_name`: Class names for each detected object
  - `id`: Unique identifiers for each detected object

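To make the feature layout above concrete, here is a minimal sketch (not part of the packaged README) that reads the standardized detection features from one sample. It assumes a detection-style dataset such as `midwest-vehicle-detection` and a configured Hafnia CLI so the sample can be downloaded:

```python
from hafnia.data import load_dataset

# Assumes the sample dataset is accessible via the configured Hafnia profile
dataset_splits = load_dataset("midwest-vehicle-detection")
sample = dataset_splits["train"][0]

# 'bbox' is [xmin, ymin, bbox_width, bbox_height] in pixels; 'class_name' is the
# readable label that the ClassLabel-encoded 'class_idx' maps to.
for bbox, class_name in zip(sample["objects"]["bbox"], sample["objects"]["class_name"]):
    xmin, ymin, bbox_width, bbox_height = bbox
    print(f"{class_name}: x={xmin}, y={ymin}, w={bbox_width}, h={bbox_height}")
```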
### Dataset Locally vs. Training-aaS

An important feature of `load_dataset` is that it returns the full dataset
when loaded with Training-aaS on the Hafnia platform.

This enables seamless switching between running/validating a training script
locally (on the sample dataset) and running full model trainings with Training-aaS (on the full dataset)
without changing code or configuration in the training script.

Available datasets with corresponding sample datasets can be found in the [data library](https://hafnia.milestonesys.com/training-aas/datasets), including metadata and a description for each dataset.

## Getting started: Experiment Tracking with HafniaLogger

The `HafniaLogger` is an important part of the recipe script and enables you to track, log and
reproduce your experiments.

When integrated into your training script, the `HafniaLogger` is responsible for collecting:

- **Trained Model**: The model trained during the experiment
- **Model Checkpoints**: Intermediate model states saved during training
- **Experiment Configurations**: Hyperparameters and other settings used in your experiment
- **Training/Evaluation Metrics**: Performance data such as loss values, accuracy, and custom metrics

### Basic Implementation Example

Here's how to integrate the `HafniaLogger` into your training script:

```python
from hafnia.experiment import HafniaLogger

batch_size = 128
learning_rate = 0.001

# Initialize Hafnia logger
logger = HafniaLogger()

# Log experiment parameters
logger.log_configuration({"batch_size": batch_size, "learning_rate": learning_rate})

# Store checkpoints in this path
ckpt_dir = logger.path_model_checkpoints()

# Store the trained model in this path
model_dir = logger.path_model()

# Log scalar and metric values during training and validation
logger.log_scalar("train/loss", value=0.1, step=100)
logger.log_metric("train/accuracy", value=0.98, step=100)

logger.log_scalar("validation/loss", value=0.1, step=100)
logger.log_metric("validation/accuracy", value=0.95, step=100)
```

Similar to `load_dataset`, the tracker behaves differently when running locally or in the cloud.
Locally, experiment data is stored in a local folder `.data/experiments/{DATE_TIME}`.

In the cloud, the experiment data will be available in the Hafnia platform under
[experiments](https://hafnia.milestonesys.com/training-aas/experiments).

## Example: Torch Dataloader

Commonly for `torch`-based training scripts, a dataset is used in combination
with a dataloader that performs data augmentations and batches the dataset as torch tensors.

To support this, we have provided a torch dataloader example script,
[example_torchvision_dataloader.py](./examples/example_torchvision_dataloader.py).

The script demonstrates how to load a dataset sample, apply data augmentations using
`torchvision.transforms.v2`, and visualize the dataset with `torch_helpers.draw_image_and_targets`.

Note also how `torch_helpers.TorchVisionCollateFn` is used in combination with the `DataLoader` from
`torch.utils.data` to handle the dataset's collate function.

The dataloader and visualization function support computer vision tasks
and datasets available in the data library.

```python
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision.transforms import v2

from hafnia import torch_helpers
from hafnia.data import load_dataset

# Load Hugging Face dataset
dataset_splits = load_dataset("midwest-vehicle-detection")

# Define transforms
train_transforms = v2.Compose(
    [
        v2.RandomResizedCrop(size=(224, 224), antialias=True),
        v2.RandomHorizontalFlip(p=0.5),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
test_transforms = v2.Compose(
    [
        v2.Resize(size=(224, 224), antialias=True),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

keep_metadata = True
train_dataset = torch_helpers.TorchvisionDataset(
    dataset_splits["train"], transforms=train_transforms, keep_metadata=keep_metadata
)
test_dataset = torch_helpers.TorchvisionDataset(
    dataset_splits["test"], transforms=test_transforms, keep_metadata=keep_metadata
)

# Visualize a sample
image, targets = train_dataset[0]
visualize_image = torch_helpers.draw_image_and_targets(image=image, targets=targets)
pil_image = torchvision.transforms.functional.to_pil_image(visualize_image)
pil_image.save("visualized_labels.png")

# Create DataLoaders - using TorchVisionCollateFn
collate_fn = torch_helpers.TorchVisionCollateFn(
    skip_stacking=["objects.bbox", "objects.class_idx"]
)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
```

## Example: Training-aaS

By combining logging and dataset loading, we can now construct our model training recipe.

To demonstrate this, we have provided a recipe project that serves as a template for creating and structuring training recipes:
[recipe-classification](https://github.com/milestone-hafnia/recipe-classification).

The project also contains additional information on how to structure your training recipe, how to use the `HafniaLogger` and the `load_dataset` function, and the different approaches for launching
the training recipe on the Hafnia platform.

## Create, Build and Run `recipe.zip` locally

To test recipe compatibility with the Hafnia cloud, use the following commands to build and
start the job locally.

```bash
# Create 'recipe.zip' from source folder '.'
hafnia recipe create .

# Build the docker image locally from a 'recipe.zip' file
hafnia runc build-local recipe.zip

# Execute the docker image locally with a desired dataset
hafnia runc launch-local --dataset mnist "python scripts/train.py"
```

## Detailed Documentation

For more information, go to our [documentation page](https://hafnia.readme.io/docs/welcome-to-hafnia)
or the markdown pages below.

- [CLI](docs/cli.md) - Detailed guide for the Hafnia command-line interface
- [Release lifecycle](docs/release.md) - Details about the package release lifecycle.

## Development

For development, we use a uv-based virtual Python environment.

Install uv:

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```

Create a virtual environment and install Python dependencies:

```bash
uv sync
```

Run tests:

```bash
uv run pytest tests
```