euler-train 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euler_train-1.3.1/PKG-INFO +22 -0
- euler_train-1.3.1/README.md +465 -0
- euler_train-1.3.1/euler_train/__init__.py +83 -0
- euler_train-1.3.1/euler_train/architecture.py +125 -0
- euler_train-1.3.1/euler_train/environment.py +137 -0
- euler_train-1.3.1/euler_train/git_info.py +43 -0
- euler_train-1.3.1/euler_train/outputs.py +194 -0
- euler_train-1.3.1/euler_train/run.py +1249 -0
- euler_train-1.3.1/euler_train/serialization.py +86 -0
- euler_train-1.3.1/euler_train/slurm.py +27 -0
- euler_train-1.3.1/euler_train.egg-info/PKG-INFO +22 -0
- euler_train-1.3.1/euler_train.egg-info/SOURCES.txt +16 -0
- euler_train-1.3.1/euler_train.egg-info/dependency_links.txt +1 -0
- euler_train-1.3.1/euler_train.egg-info/requires.txt +21 -0
- euler_train-1.3.1/euler_train.egg-info/top_level.txt +1 -0
- euler_train-1.3.1/pyproject.toml +19 -0
- euler_train-1.3.1/setup.cfg +4 -0
- euler_train-1.3.1/tests/test_runlog.py +1431 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: euler-train
|
|
3
|
+
Version: 1.3.1
|
|
4
|
+
Summary: Lightweight file-based experiment logger for PyTorch
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Requires-Dist: numpy
|
|
7
|
+
Provides-Extra: images
|
|
8
|
+
Requires-Dist: Pillow; extra == "images"
|
|
9
|
+
Provides-Extra: gpu
|
|
10
|
+
Requires-Dist: nvidia-ml-py; extra == "gpu"
|
|
11
|
+
Provides-Extra: architecture
|
|
12
|
+
Requires-Dist: onnx; extra == "architecture"
|
|
13
|
+
Requires-Dist: onnxruntime; extra == "architecture"
|
|
14
|
+
Requires-Dist: onnxsim; extra == "architecture"
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest; extra == "dev"
|
|
17
|
+
Requires-Dist: Pillow; extra == "dev"
|
|
18
|
+
Requires-Dist: torch; extra == "dev"
|
|
19
|
+
Requires-Dist: nvidia-ml-py; extra == "dev"
|
|
20
|
+
Requires-Dist: onnx; extra == "dev"
|
|
21
|
+
Requires-Dist: onnxruntime; extra == "dev"
|
|
22
|
+
Requires-Dist: onnxsim; extra == "dev"
|
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
# euler_train
|
|
2
|
+
|
|
3
|
+
Lightweight, file-based experiment logger for PyTorch. No servers, no accounts — just structured files on disk.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e .
|
|
9
|
+
|
|
10
|
+
# with image saving support (Pillow)
|
|
11
|
+
pip install -e ".[images]"
|
|
12
|
+
|
|
13
|
+
# with GPU monitoring (nvidia-ml-py)
|
|
14
|
+
pip install -e ".[gpu]"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick start
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import euler_train
|
|
21
|
+
|
|
22
|
+
run = euler_train.init(
|
|
23
|
+
dir="runs/experiment_01",
|
|
24
|
+
config={"lr": 1e-4, "arch": "unet", "epochs": 50},
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
for epoch in range(50):
|
|
28
|
+
for step, batch in enumerate(train_loader):
|
|
29
|
+
loss = train_step(model, batch)
|
|
30
|
+
run.log({"loss": loss.item(), "lr": scheduler.get_lr()}, step=step, epoch=epoch)
|
|
31
|
+
|
|
32
|
+
metrics = evaluate(model, val_loader)
|
|
33
|
+
run.log(metrics, step=step, epoch=epoch, mode="val")
|
|
34
|
+
|
|
35
|
+
run.save_outputs(
|
|
36
|
+
epoch=epoch, step=step,
|
|
37
|
+
rgb=dict(pred=pred_img, gt=gt_img, input=input_img),
|
|
38
|
+
depth=dict(pred=depth_map, aux=dict(transmission=t_map)),
|
|
39
|
+
)
|
|
40
|
+
run.save_checkpoint(model, epoch=epoch, optimizer=optimizer)
|
|
41
|
+
|
|
42
|
+
run.finish()
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Use the context manager to auto-finish and capture crashes:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
with euler_train.init(dir="runs/exp02", config=cfg) as run:
|
|
49
|
+
... # if an exception is raised, meta.json records status="crashed" + traceback
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Directory structure
|
|
53
|
+
|
|
54
|
+
Each `euler_train.init(dir=...)` call creates a timestamped subdirectory under `{dir}/runs/`:
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
{dir}/
|
|
58
|
+
└── runs/
|
|
59
|
+
└── 2025-01-28_15-30-42_a3f2/ ← auto-generated run ID
|
|
60
|
+
├── meta.json
|
|
61
|
+
├── config.json
|
|
62
|
+
├── code_ref.json
|
|
63
|
+
├── run_environment.json
|
|
64
|
+
├── train.jsonl
|
|
65
|
+
├── val.jsonl
|
|
66
|
+
├── checkpoints/
|
|
67
|
+
│ └── epoch_{N}.pt
|
|
68
|
+
└── outputs/
|
|
69
|
+
└── epoch_{N}_step_{M}/
|
|
70
|
+
└── {output_type}/
|
|
71
|
+
├── pred/
|
|
72
|
+
├── gt/
|
|
73
|
+
├── input/
|
|
74
|
+
└── aux/
|
|
75
|
+
├── transmission/
|
|
76
|
+
└── attention_maps/
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The run ID and directory are available as `run.run_id` and `run.dir`.
|
|
80
|
+
|
|
81
|
+
## API reference
|
|
82
|
+
|
|
83
|
+
### `euler_train.init(dir, config=None, meta=None, output_formats=None, run_id=None, datasets=None, run_name=None, evaluations=None, mode=None) → Run`
|
|
84
|
+
|
|
85
|
+
Creates the run directory and writes `meta.json`, `config.json`, `code_ref.json`, and `run_environment.json`. On resume (`run_id` provided), only `meta.json` and `config.json` are updated.
|
|
86
|
+
|
|
87
|
+
| Parameter | Type | Description |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `dir` | `str \| Path` | Project directory. Each call creates a unique run under `{dir}/runs/{timestamp_id}/`. |
|
|
90
|
+
| `config` | `dict \| str \| Path \| Namespace \| dataclass` | Hyperparameters. Paths to `.json` / `.yaml` files are loaded automatically. |
|
|
91
|
+
| `meta` | `dict \| None` | Extra fields merged into `meta.json` (e.g. `{"tags": ["baseline"]}`). |
|
|
92
|
+
| `output_formats` | `dict[str, str] \| None` | Override format inference (see [Format inference](#format-inference)). |
|
|
93
|
+
| `run_id` | `str \| None` | Resume an existing run at `{dir}/runs/{run_id}` instead of creating a new one. |
|
|
94
|
+
| `datasets` | `dict[str, Any] \| None` | Optional split → dataset map. If a dataset exposes `describe_for_runlog()`, that contract is used directly; otherwise euler_train infers structured modality metadata (`path`, `used_as`, `slot`, `modality_type`, and hierarchical fields), resolving fixed namespaced properties from `properties.euler_loading` and `properties.euler_train` before heuristics. |
|
|
95
|
+
| `run_name` | `str \| None` | Optional human-readable run label stored in `meta.json`. |
|
|
96
|
+
| `evaluations` | `dict[str, dict] \| None` | Optional evaluation key → entry map. See [Evaluations](#evaluations). |
|
|
97
|
+
| `mode` | `str \| None` | Optional process label such as `"train"`, `"val"`, or `"eval"`. When set, lifecycle and crash details are also written under `meta.json["modes"][mode]`. |
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
### `run.log(metrics, *, step, epoch, mode="train")`
|
|
102
|
+
|
|
103
|
+
Appends one JSON line to `train.jsonl` (default) or `val.jsonl`.
|
|
104
|
+
|
|
105
|
+
Fields `step`, `epoch`, and `wall_time` are added automatically. Training records also get `elapsed_sec`. When `nvidia-ml-py` is installed, GPU stats (`gpu_util_pct`, `gpu_mem_util_pct`, `gpu_mem_used_gb`, `gpu_mem_total_gb`) are appended every 100 steps.
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
run.log({"loss": 0.42, "lr": 3e-5, "grad_norm": 1.2}, step=100, epoch=1)
|
|
109
|
+
run.log({"rgb.psnr": 28.3, "depth.mae": 0.03}, step=100, epoch=1, mode="val")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
### `run.save_outputs(*, epoch=None, step=None, **output_types)`
|
|
115
|
+
|
|
116
|
+
Saves arrays/images to `outputs/epoch_{N}_step_{M}/{output_type}/{slot}/`.
|
|
117
|
+
|
|
118
|
+
Each output type is a dict with these slot keys:
|
|
119
|
+
|
|
120
|
+
| Slot | Value |
|
|
121
|
+
|---|---|
|
|
122
|
+
| `pred` | Model prediction |
|
|
123
|
+
| `gt` | Ground truth |
|
|
124
|
+
| `input` | Model input |
|
|
125
|
+
| `aux` | Dict of named auxiliary outputs (each becomes a subdirectory) |
|
|
126
|
+
|
|
127
|
+
Values can be:
|
|
128
|
+
- A single numpy array, torch tensor, or PIL Image
|
|
129
|
+
- A list of the above (saved as `0000.ext`, `0001.ext`, ...)
|
|
130
|
+
- A 4D numpy/torch array (split along dim 0 as a batch)
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
run.save_outputs(
|
|
134
|
+
epoch=1, step=500,
|
|
135
|
+
rgb=dict(pred=pred_rgb, gt=gt_rgb),
|
|
136
|
+
depth=dict(
|
|
137
|
+
pred=depth_map,
|
|
138
|
+
gt=gt_depth,
|
|
139
|
+
aux=dict(transmission=t_map, attention=attn_map),
|
|
140
|
+
),
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Torch tensors in `(C,H,W)` or `(B,C,H,W)` layout are automatically transposed to channels-last before saving.
|
|
145
|
+
|
|
146
|
+
Pass `None` for any slot or output type to skip it.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
### `run.save_checkpoint(model, *, epoch, optimizer=None, **extra) → Path`
|
|
151
|
+
|
|
152
|
+
Saves to `checkpoints/epoch_{N}.pt`. Calls `.state_dict()` on model/optimizer automatically if available. Extra keyword arguments are included in the saved dict.
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
run.save_checkpoint(model, epoch=5, optimizer=opt, best_loss=0.12)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
### `run.finish(status="completed")`
|
|
161
|
+
|
|
162
|
+
Writes final `end_time`, `duration_sec`, and `status` to `meta.json`. Called automatically when using the `with` block. Safe to call multiple times.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
### `run.add_evaluation(key, *, datasets=None, name=None, status=None, checkpoint=None, metadata=None)`
|
|
167
|
+
|
|
168
|
+
Adds or updates a single evaluation entry in `meta.json` under `evaluations[key]`. The `datasets` parameter accepts the same dataset objects as the top-level `datasets` parameter on `init()` and is processed through the same modality-inference pipeline. Flushes to disk immediately.
|
|
169
|
+
|
|
170
|
+
If the key already exists, existing fields are preserved and only the provided fields are updated (merge semantics).
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
run.add_evaluation(
|
|
174
|
+
"eval_rgb",
|
|
175
|
+
datasets={"test": test_ds},
|
|
176
|
+
name="RGB Eval",
|
|
177
|
+
status="running",
|
|
178
|
+
checkpoint={"epoch": 12, "step": 4800},
|
|
179
|
+
)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### `run.finish_evaluation(key, status="completed")`
|
|
185
|
+
|
|
186
|
+
Updates the `status` of an existing evaluation entry and flushes to disk. Raises `KeyError` if the key does not exist.
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
run.finish_evaluation("eval_rgb") # status → "completed"
|
|
190
|
+
run.finish_evaluation("eval_depth", status="crashed") # custom status
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Format inference
|
|
196
|
+
|
|
197
|
+
Arrays are saved as `.png` or `.npy` based on shape and dtype:
|
|
198
|
+
|
|
199
|
+
| Array | Format |
|
|
200
|
+
|---|---|
|
|
201
|
+
| `uint8` with shape `(H,W)` | `.png` (grayscale) |
|
|
202
|
+
| Any dtype with shape `(H,W,1)`, `(H,W,3)`, `(H,W,4)` | `.png` |
|
|
203
|
+
| Float `.png` | clipped to `[0,1]`, scaled to `[0,255]` |
|
|
204
|
+
| Everything else (e.g. `float32 (H,W)`) | `.npy` |
|
|
205
|
+
| PIL Image | `.png` |
|
|
206
|
+
|
|
207
|
+
### Overriding format
|
|
208
|
+
|
|
209
|
+
Pass `output_formats` at init. Keys are resolved most-specific-first:
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
run = euler_train.init(
|
|
213
|
+
dir="runs/exp",
|
|
214
|
+
config=cfg,
|
|
215
|
+
output_formats={
|
|
216
|
+
"depth.pred": "npz", # only depth pred
|
|
217
|
+
"depth": "npy", # all depth slots (unless more specific key matches)
|
|
218
|
+
"transmission": "npz", # any slot/aux named "transmission"
|
|
219
|
+
},
|
|
220
|
+
)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Supported formats: `"png"`, `"npy"`, `"npz"`.
|
|
224
|
+
|
|
225
|
+
## `meta.json` schema
|
|
226
|
+
|
|
227
|
+
Auto-managed, not written to directly.
|
|
228
|
+
|
|
229
|
+
```json
|
|
230
|
+
{
|
|
231
|
+
"run_id": "2025-01-28_15-30-42_a3f2",
|
|
232
|
+
"run_name": "baseline_dehaze",
|
|
233
|
+
"status": "running | completed | crashed | interrupted",
|
|
234
|
+
"start_time": 1706400000.0,
|
|
235
|
+
"start_iso": "2025-01-28T15:30:42",
|
|
236
|
+
"end_time": 1706403600.0,
|
|
237
|
+
"end_iso": "2025-01-28T16:30:42",
|
|
238
|
+
"duration_sec": 3600.0,
|
|
239
|
+
"pid": 12345,
|
|
240
|
+
"python": "3.11.5",
|
|
241
|
+
"command": ["train.py", "--lr", "1e-4"],
|
|
242
|
+
"slurm": {
|
|
243
|
+
"job_id": "123456",
|
|
244
|
+
"job_name": "my_train_job",
|
|
245
|
+
"node": "gpu-node-01",
|
|
246
|
+
"partition": "gpu",
|
|
247
|
+
"gpus": "1",
|
|
248
|
+
"cpus": "8",
|
|
249
|
+
"array_task_id": "0",
|
|
250
|
+
"num_nodes": "1",
|
|
251
|
+
"ntasks": "1",
|
|
252
|
+
"ntasks_per_node": "1",
|
|
253
|
+
"gpus_per_node": "1",
|
|
254
|
+
"mem_per_node": "32000",
|
|
255
|
+
"mem_per_cpu": "4000",
|
|
256
|
+
"stdout_path": "/path/to/slurm-123456.out",
|
|
257
|
+
"stderr_path": "/path/to/slurm-123456.err",
|
|
258
|
+
"submit_dir": "/home/user/project"
|
|
259
|
+
},
|
|
260
|
+
"datasets": {
|
|
261
|
+
"train": {
|
|
262
|
+
"modalities": {
|
|
263
|
+
"hazy_rgb": {
|
|
264
|
+
"path": "/cluster/work/.../vkitti_rgb_hazy",
|
|
265
|
+
"used_as": "input",
|
|
266
|
+
"slot": "dehaze.input.rgb",
|
|
267
|
+
"modality_type": "rgb"
|
|
268
|
+
}
|
|
269
|
+
},
|
|
270
|
+
"hierarchical_modalities": {
|
|
271
|
+
"camera_intrinsics": {
|
|
272
|
+
"path": "/cluster/work/.../vkitti_intrinsics",
|
|
273
|
+
"used_as": "condition",
|
|
274
|
+
"slot": "dehaze.condition.camera_intrinsics",
|
|
275
|
+
"hierarchy_scope": "scene_camera",
|
|
276
|
+
"applies_to": ["hazy_rgb"]
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
},
|
|
281
|
+
"evaluations": {
|
|
282
|
+
"eval_rgb": {
|
|
283
|
+
"name": "RGB Eval",
|
|
284
|
+
"status": "completed",
|
|
285
|
+
"checkpoint": { "epoch": 12, "step": 4800 },
|
|
286
|
+
"metadata": { "runner": "eval_v2" },
|
|
287
|
+
"datasets": {
|
|
288
|
+
"test": {
|
|
289
|
+
"modalities": {
|
|
290
|
+
"rgb_input": { "path": "/mnt/ds/test/rgb", "used_as": "input" },
|
|
291
|
+
"rgb_pred": { "path": "/mnt/ds/preds/rgb", "used_as": "output" }
|
|
292
|
+
},
|
|
293
|
+
"hierarchical_modalities": {}
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
},
|
|
298
|
+
"modes": {
|
|
299
|
+
"train": {
|
|
300
|
+
"status": "completed",
|
|
301
|
+
"start_time": 1706400000.0,
|
|
302
|
+
"start_iso": "2025-01-28T15:30:42",
|
|
303
|
+
"end_time": 1706403000.0,
|
|
304
|
+
"end_iso": "2025-01-28T16:20:42",
|
|
305
|
+
"duration_sec": 3000.0,
|
|
306
|
+
"pid": 12345,
|
|
307
|
+
"command": ["train.py", "--lr", "1e-4"]
|
|
308
|
+
},
|
|
309
|
+
"eval": {
|
|
310
|
+
"status": "crashed",
|
|
311
|
+
"start_time": 1706403200.0,
|
|
312
|
+
"start_iso": "2025-01-28T16:23:20",
|
|
313
|
+
"end_time": 1706403300.0,
|
|
314
|
+
"end_iso": "2025-01-28T16:25:00",
|
|
315
|
+
"duration_sec": 100.0,
|
|
316
|
+
"pid": 12399,
|
|
317
|
+
"command": ["eval.py", "--ckpt", "epoch_12.pt"],
|
|
318
|
+
"error": "RuntimeError: CUDA OOM",
|
|
319
|
+
"traceback": "Traceback (most recent call last):\n ..."
|
|
320
|
+
}
|
|
321
|
+
},
|
|
322
|
+
"error": "RuntimeError: CUDA OOM",
|
|
323
|
+
"traceback": "Traceback (most recent call last):\n ..."
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
- `end_time`, `end_iso`, `duration_sec` are `null` while `status` is `"running"`.
|
|
328
|
+
- `slurm` is `null` when not running under SLURM.
|
|
329
|
+
- `datasets` is only present when `datasets=...` is passed to `euler_train.init`.
|
|
330
|
+
- `evaluations` is only present when evaluations are provided via `evaluations=...` on `init()` or added via `run.add_evaluation()`.
|
|
331
|
+
- `modes` is only present when `mode=...` is passed to `euler_train.init`; each key stores the latest lifecycle snapshot for that mode.
|
|
332
|
+
- `error` is only present when `status` is `"crashed"` (context manager / excepthook) or `"interrupted"` (SIGTERM/SIGINT). `traceback` is only present when `status` is `"crashed"`. When `mode=...` is set, the same fields are mirrored under `modes[mode]`.
|
|
333
|
+
|
|
334
|
+
A formal JSON Schema for `meta.json` is available at [`meta-schema.json`](meta-schema.json).
|
|
335
|
+
|
|
336
|
+
## `code_ref.json` schema
|
|
337
|
+
|
|
338
|
+
Written once when a fresh run is created (not on resume). Captures git repository state at the time of the run.
|
|
339
|
+
|
|
340
|
+
```json
|
|
341
|
+
{
|
|
342
|
+
"repo_url": "git@github.com:user/repo.git",
|
|
343
|
+
"branch": "main",
|
|
344
|
+
"commit_sha": "abc123def456...",
|
|
345
|
+
"is_dirty": true,
|
|
346
|
+
"dirty_diff": "diff --git a/train.py ...",
|
|
347
|
+
"commit_message": "Add learning rate scheduler\n",
|
|
348
|
+
"committed_at": "2025-01-28T15:20:00+01:00"
|
|
349
|
+
}
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
- `is_dirty` is `true` when there are uncommitted changes.
|
|
353
|
+
- `dirty_diff` contains the output of `git diff HEAD` when dirty, `null` otherwise.
|
|
354
|
+
- All fields are `null` if the project is not inside a git repository.
|
|
355
|
+
|
|
356
|
+
## `run_environment.json` schema
|
|
357
|
+
|
|
358
|
+
Written once when a fresh run is created (not on resume). Snapshots the runtime environment.
|
|
359
|
+
|
|
360
|
+
```json
|
|
361
|
+
{
|
|
362
|
+
"name": "gpu-node-01",
|
|
363
|
+
"python_version": "3.11.5",
|
|
364
|
+
"cuda_version": "12.1",
|
|
365
|
+
"gpu_type": "NVIDIA A100-SXM4-80GB",
|
|
366
|
+
"gpu_count": 4,
|
|
367
|
+
"packages_snapshot": {
|
|
368
|
+
"torch": "2.1.0",
|
|
369
|
+
"numpy": "1.26.2",
|
|
370
|
+
"Pillow": "10.1.0"
|
|
371
|
+
},
|
|
372
|
+
"docker_image": null,
|
|
373
|
+
"docker_digest": null,
|
|
374
|
+
"metadata": null
|
|
375
|
+
}
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
- `name` is the hostname of the machine.
|
|
379
|
+
- `cuda_version` is detected from PyTorch, `nvcc`, or the `CUDA_VERSION` env var (first available).
|
|
380
|
+
- `gpu_type` and `gpu_count` are detected via `pynvml` or `nvidia-smi` (first available).
|
|
381
|
+
- `packages_snapshot` is the output of `pip freeze` (or `uv pip freeze`), parsed into a `{name: version}` dict.
|
|
382
|
+
- Fields are `null` when the corresponding tool/library is unavailable.
|
|
383
|
+
|
|
384
|
+
## Evaluations
|
|
385
|
+
|
|
386
|
+
Evaluations record model evaluation runs against test/validation splits, linking each evaluation to a checkpoint and its input/output datasets. They are written into the `evaluations` key of `meta.json` in the object form expected by downstream ingestion services (see `META_JSON_INGEST_README.md`).
|
|
387
|
+
|
|
388
|
+
### Typical usage: resume a trained run for evaluation
|
|
389
|
+
|
|
390
|
+
```python
|
|
391
|
+
import euler_train
|
|
392
|
+
|
|
393
|
+
# Resume the training run by its run_id
|
|
394
|
+
run = euler_train.init(
|
|
395
|
+
dir="runs/experiment_01",
|
|
396
|
+
run_id="2025-01-28_15-30-42_a3f2",
|
|
397
|
+
evaluations={
|
|
398
|
+
"eval_rgb": {
|
|
399
|
+
"datasets": {"test": test_rgb_ds},
|
|
400
|
+
"name": "RGB Eval",
|
|
401
|
+
"status": "running",
|
|
402
|
+
"checkpoint": {"epoch": 12, "step": 4800},
|
|
403
|
+
"metadata": {"runner": "eval_v2"},
|
|
404
|
+
},
|
|
405
|
+
},
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
# ... run evaluation logic ...
|
|
409
|
+
|
|
410
|
+
run.finish_evaluation("eval_rgb") # status → "completed"
|
|
411
|
+
run.finish()
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
### Evaluation entry fields
|
|
415
|
+
|
|
416
|
+
Each evaluation entry (the value under `evaluations[key]`) supports:
|
|
417
|
+
|
|
418
|
+
| Field | Type | Description |
|
|
419
|
+
|---|---|---|
|
|
420
|
+
| `datasets` | `dict[str, dataset]` | Split → dataset map (same objects as top-level `datasets`). Processed through the same modality-inference pipeline. |
|
|
421
|
+
| `name` | `str` | Human-readable evaluation label. |
|
|
422
|
+
| `status` | `str` | Evaluation status (`"running"`, `"completed"`, `"crashed"`, etc.). |
|
|
423
|
+
| `checkpoint` | `dict` | Checkpoint reference. Typically `{"epoch": int, "step": int}`, optionally with `"name"`. |
|
|
424
|
+
| `metadata` | `dict` | Arbitrary metadata (e.g. `{"runner": "eval_v2", "gpu": "A100"}`). |
|
|
425
|
+
|
|
426
|
+
All fields are optional. `datasets` is processed through `_build_datasets_meta` (contract → ds-crawler → heuristics); all other fields are stored as-is.
|
|
427
|
+
|
|
428
|
+
### Adding evaluations incrementally
|
|
429
|
+
|
|
430
|
+
Use `add_evaluation()` to register evaluations one at a time after init. This is useful when running multiple evaluations sequentially:
|
|
431
|
+
|
|
432
|
+
```python
|
|
433
|
+
run = euler_train.init(dir="runs/exp", run_id="2025-01-28_15-30-42_a3f2")
|
|
434
|
+
|
|
435
|
+
for split_name, ds in [("eval_rgb", test_rgb_ds), ("eval_depth", test_depth_ds)]:
|
|
436
|
+
run.add_evaluation(
|
|
437
|
+
split_name,
|
|
438
|
+
datasets={"test": ds},
|
|
439
|
+
status="running",
|
|
440
|
+
checkpoint={"epoch": 12, "step": 4800},
|
|
441
|
+
)
|
|
442
|
+
evaluate(model, ds)
|
|
443
|
+
run.finish_evaluation(split_name)
|
|
444
|
+
|
|
445
|
+
run.finish()
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
Each `add_evaluation()` call flushes `meta.json` immediately. Calling it with an existing key merges fields — existing fields not provided in the update are preserved.
|
|
449
|
+
|
|
450
|
+
### Merge semantics on resume
|
|
451
|
+
|
|
452
|
+
When resuming a run that already has evaluations in its `meta.json`, new evaluations are merged by key:
|
|
453
|
+
|
|
454
|
+
- Existing evaluation keys not present in the new `evaluations` dict are **preserved**.
|
|
455
|
+
- Existing keys present in the new dict are **updated** (field-level merge within each entry).
|
|
456
|
+
- New keys are **added**.
|
|
457
|
+
|
|
458
|
+
This means you can run evaluations across multiple sessions without losing previously recorded results.
|
|
459
|
+
|
|
460
|
+
## Dev
|
|
461
|
+
|
|
462
|
+
```bash
|
|
463
|
+
pip install git+https://github.com/d-rothen/euler-train.git
|
|
464
|
+
uv pip install "euler-train[images,gpu] @ git+https://github.com/d-rothen/euler-train"
|
|
465
|
+
```
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""runlog — lightweight file-based experiment logging."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .run import Run
|
|
5
|
+
from .architecture import export_architecture
|
|
6
|
+
|
|
7
|
+
__all__ = ["init", "Run", "export_architecture"]
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def init(
    dir: str | None = None,
    config=None,
    meta: dict | None = None,
    output_formats: dict[str, str] | None = None,
    run_id: str | None = None,
    datasets: dict | None = None,
    run_name: str | None = None,
    evaluations: dict[str, dict] | None = None,
    mode: str | None = None,
) -> Run:
    """Create a new run — or resume an existing one — and return its handle.

    This is a thin convenience wrapper that forwards every argument to
    :class:`Run` unchanged.

    Parameters
    ----------
    dir:
        Project / output directory.  Each call creates a unique run under
        ``{dir}/runs/{timestamp_id}/``.  When *None* (the default), the
        directory is resolved as:

        1. ``$ET_HOME/<project>`` (if ``$ET_HOME`` is set),
        2. ``~/euler_train/<project>``,

        where ``<project>`` is the git repository name, or the current
        working directory name when not inside a git repo.
    config:
        Hyperparameters — accepts a *dict*, a path to a JSON / YAML file,
        an ``argparse.Namespace``, or a dataclass instance.
    meta:
        Extra user-defined fields merged into ``meta.json``
        (e.g. ``{"description": "baseline", "tags": ["v2"]}``).
    output_formats:
        Override auto-inferred save formats.  Keys can be an output type
        (``"depth"``), a slot / aux name (``"transmission"``), or a dotted
        combination (``"depth.pred"``).  Values are ``"png"``, ``"npy"``,
        or ``"npz"``.
    run_id:
        If given, resume an existing run instead of creating a new one.
        The run directory ``{dir}/runs/{run_id}/`` must already exist.
        The existing ``config.json`` is loaded automatically (unless
        *config* is explicitly provided to override it).
    datasets:
        Optional mapping of split name to ``euler_loading.MultiModalDataset``
        instance (e.g. ``{"train": train_ds, "val": val_ds}``).  When
        provided, each split is logged into ``meta.json`` under
        ``datasets[split]`` with per-modality records: ``path`` and
        inferred metadata (``used_as``, ``slot``, ``modality_type``).
        Hierarchical modalities also include ``hierarchy_scope`` and
        ``applies_to``.  If a dataset implements ``describe_for_runlog()``,
        that contract is used directly.  Otherwise inference prefers
        ``ds-crawler`` config properties when available, then falls back
        to naming-based heuristics.
    run_name:
        Optional human-readable name for the run.  Stored in ``meta.json``.
    evaluations:
        Optional mapping of evaluation key to evaluation entry.  Each
        entry may contain ``datasets`` (same dataset objects accepted by
        *datasets*), ``name``, ``status``, ``checkpoint``, and
        ``metadata``.  Typically used when resuming a run (via *run_id*)
        for evaluation.  See also :meth:`Run.add_evaluation`.
    mode:
        Optional label for the current process context (for example
        ``"train"``, ``"val"``, or ``"eval"``).  When provided, lifecycle
        fields and crash details are mirrored into ``meta.json`` under
        ``modes[mode]``.
    """
    return Run(
        dir=dir,
        config=config,
        meta=meta,
        output_formats=output_formats,
        run_id=run_id,
        datasets=datasets,
        run_name=run_name,
        evaluations=evaluations,
        mode=mode,
    )
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Export a PyTorch model to a lightweight ONNX graph for Netron visualization."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
log = logging.getLogger("euler_train")
|
|
10
|
+
|
|
11
|
+
_MISSING_DEPS_MSG = (
|
|
12
|
+
"Architecture export requires optional dependencies: onnx, onnxruntime, onnxsim. "
|
|
13
|
+
"Install them with: pip install euler-train[architecture]"
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def export_architecture(
|
|
18
|
+
model: Any,
|
|
19
|
+
dummy_input: Any,
|
|
20
|
+
output_path: str | Path = "architecture.onnx",
|
|
21
|
+
) -> Path:
|
|
22
|
+
"""Export a PyTorch model to a simplified, weightless ONNX graph.
|
|
23
|
+
|
|
24
|
+
The resulting file is optimized for visual inspection in Netron:
|
|
25
|
+
redundant nodes are removed, operator fusions are applied, and
|
|
26
|
+
weight tensors are stripped so only the graph topology remains.
|
|
27
|
+
|
|
28
|
+
Parameters
|
|
29
|
+
----------
|
|
30
|
+
model:
|
|
31
|
+
A PyTorch ``nn.Module``. Temporarily set to eval mode for
|
|
32
|
+
export; the original training/eval state is restored afterward.
|
|
33
|
+
dummy_input:
|
|
34
|
+
Example input tensor(s) matching the model's forward signature.
|
|
35
|
+
output_path:
|
|
36
|
+
Where to write the final ``.onnx`` file.
|
|
37
|
+
|
|
38
|
+
Returns
|
|
39
|
+
-------
|
|
40
|
+
Path
|
|
41
|
+
The written output path.
|
|
42
|
+
"""
|
|
43
|
+
import torch
|
|
44
|
+
|
|
45
|
+
try:
|
|
46
|
+
import onnx
|
|
47
|
+
import onnxruntime as ort
|
|
48
|
+
from onnxsim import simplify
|
|
49
|
+
except ImportError:
|
|
50
|
+
raise ImportError(_MISSING_DEPS_MSG)
|
|
51
|
+
|
|
52
|
+
output_path = Path(output_path)
|
|
53
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
54
|
+
|
|
55
|
+
was_training = model.training
|
|
56
|
+
model.eval()
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
return _do_export(model, dummy_input, output_path, onnx, ort, simplify, torch)
|
|
60
|
+
finally:
|
|
61
|
+
if was_training:
|
|
62
|
+
model.train()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _do_export(model, dummy_input, output_path, onnx, ort, simplify, torch):
    """Run the full pipeline: raw export → onnxsim → ORT fusion → strip weights."""
    with tempfile.TemporaryDirectory() as tmpdir:
        scratch = Path(tmpdir)
        raw_path = scratch / "raw.onnx"
        ort_path = scratch / "ort.onnx"

        # Step 1: Export to ONNX (with weights, needed for optimizer passes)
        _export_onnx(model, dummy_input, raw_path, torch=torch)

        # Step 2: Simplify — removes redundant glue nodes
        log.info("Simplifying ONNX graph with onnxsim...")
        simplified, ok = simplify(onnx.load(str(raw_path)))
        if not ok:
            log.warning("onnxsim validation failed, continuing anyway.")
        onnx.save(simplified, str(raw_path))

        # Step 3: ORT optimization — fuses standard blocks (Conv+BN+ReLU, etc.)
        log.info("Applying ONNX Runtime graph optimizations...")
        opts = ort.SessionOptions()
        opts.optimized_model_filepath = str(ort_path)
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Building the session is what triggers ORT to write the
        # optimized model to ``optimized_model_filepath``.
        ort.InferenceSession(str(raw_path), opts)

        # Step 4: Strip weights for a lightweight file
        log.info("Stripping weights...")
        stripped = onnx.load(str(ort_path))
        while stripped.graph.initializer:
            stripped.graph.initializer.pop()

        onnx.save(stripped, str(output_path))

        log.info("Architecture exported to %s", output_path)
        return output_path
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _export_onnx(model: Any, dummy_input: Any, path: Path, *, torch: Any) -> None:
    """Export using dynamo (PyTorch >= 2.1) or legacy torch.onnx.export."""
    # Prefer the dynamo-based exporter when available: it produces a
    # cleaner functional graph.  Any failure falls through to legacy.
    dynamo = getattr(torch.onnx, "dynamo_export", None)
    if dynamo is not None:
        try:
            log.info("Exporting ONNX via torch.onnx.dynamo_export (PyTorch 2.x)...")
            dynamo(model, dummy_input).save(str(path))
        except Exception as exc:
            log.warning(
                "dynamo_export failed (%s), falling back to legacy export.", exc
            )
        else:
            return

    # Legacy export (PyTorch 1.x / 2.0 / dynamo fallback)
    log.info("Exporting ONNX via torch.onnx.export (legacy)...")
    torch.onnx.export(
        model,
        dummy_input,
        str(path),
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
    )
|