ma-agents 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opencode/skills/.ma-agents.json +99 -99
- package/.roo/skills/.ma-agents.json +99 -99
- package/README.md +56 -15
- package/bin/cli.js +63 -8
- package/lib/agents.js +23 -0
- package/lib/bmad-cache/cache-manifest.json +1 -1
- package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
- package/lib/bmad-customizations/demerzel.md +32 -0
- package/lib/bmad-extension/module-help.csv +13 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
- package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
- package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
- package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
- package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
- package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
- package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
- package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
- package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
- package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
- package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
- package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
- package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
- package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
- package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
- package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
- package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
- package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
- package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
- package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
- package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
- package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
- package/lib/bmad.js +85 -8
- package/lib/skill-authoring.js +1 -1
- package/package.json +2 -2
- package/test/agent-injection-strategy.test.js +4 -4
- package/test/bmad-version-bump.test.js +34 -34
- package/test/build-bmad-args.test.js +13 -6
- package/test/convert-agents-to-skills.test.js +11 -1
- package/test/extension-module-restructure.test.js +31 -7
- package/test/migration-validation.test.js +14 -11
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-experiment
|
|
3
|
+
description: ML Experiment - Execute training, hyperparameter optimization, and log all runs to the tracking tool
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# ML Stage 6 - Experiment Execution
|
|
7
|
+
|
|
8
|
+
Execute the training pipeline as specified in the locked TechSpec. Log everything. Touch nothing outside the HPO space.
|
|
9
|
+
|
|
10
|
+
## Instructions
|
|
11
|
+
|
|
12
|
+
### 1. Load Context
|
|
13
|
+
- Read `_bmad-output/planning-artifacts/ml-techspec.md` — MUST be LOCKED. If not, STOP.
|
|
14
|
+
- Read `_bmad-output/planning-artifacts/ml-architecture.md`
|
|
15
|
+
- Note the fixed random seed, feature set, and validation strategy — do not deviate
|
|
16
|
+
|
|
17
|
+
### 2. Write Training Script
|
|
18
|
+
If `scripts/train.py` does not exist, generate it following the TechSpec exactly:
|
|
19
|
+
- Load data from TechSpec data contract paths
|
|
20
|
+
- Apply preprocessing pipeline (from ml-architecture)
|
|
21
|
+
- Create train/val split using TechSpec validation strategy and fixed seed
|
|
22
|
+
- Instantiate tracking run (wandb.init / mlflow.start_run / local log)
|
|
23
|
+
- Log all TechSpec fixed params as run config
|
|
24
|
+
- Run baseline model first — log its metrics
|
|
25
|
+
- Run candidate model with fixed hyperparameters — log metrics
|
|
26
|
+
- If HPO is specified: run HPO trials, log each trial
|
|
27
|
+
- Select best model; log final metrics against ALL TechSpec acceptance criteria
|
|
28
|
+
- Save model artifact to `_bmad-output/implementation-artifacts/models/`
|
|
29
|
+
- Log pass/fail for each acceptance criterion explicitly
|
|
30
|
+
|
|
31
|
+
Use `assets/` templates if they exist (e.g. `assets/train_template.py`).
|
|
32
|
+
|
|
33
|
+
### 3. Execute Baseline
|
|
34
|
+
Run baseline first — always:
|
|
35
|
+
|
|
36
|
+
uv run python scripts/train.py --mode baseline
|
|
37
|
+
|
|
38
|
+
Log baseline metrics. The candidate model must beat this.
|
|
39
|
+
|
|
40
|
+
### 4. Execute Candidate Model
|
|
41
|
+
|
|
42
|
+
uv run python scripts/train.py --mode candidate
|
|
43
|
+
|
|
44
|
+
### 5. Execute HPO (if specified in TechSpec)
|
|
45
|
+
Run hyperparameter optimization within the HPO space defined in TechSpec:
|
|
46
|
+
|
|
47
|
+
uv run python scripts/train.py --mode hpo
|
|
48
|
+
|
|
49
|
+
Log all trials. Select best params by primary metric on validation set only.
|
|
50
|
+
|
|
51
|
+
### 6. Write Experiment Log
|
|
52
|
+
Write `_bmad-output/implementation-artifacts/experiment-log.md` with:
|
|
53
|
+
- **Run IDs**: Tracking tool run IDs or local log paths
|
|
54
|
+
- **Baseline Results**: metrics for each acceptance criterion
|
|
55
|
+
- **Best Model Results**: metrics for each acceptance criterion
|
|
56
|
+
- **Best Hyperparameters**: final selected values
|
|
57
|
+
- **Acceptance Criteria Status**: PASS / FAIL for each criterion from TechSpec
|
|
58
|
+
- **Training Time**: wall-clock time
|
|
59
|
+
- **Model Artifact Path**: path to saved model
|
|
60
|
+
|
|
61
|
+
### 7. Surface Dilemmas & Commit Gate
|
|
62
|
+
|
|
63
|
+
Before presenting and **before any git commit**:
|
|
64
|
+
|
|
65
|
+
- Identify every execution decision where two or more options existed (early stopping patience, batch size, resampling applied or not, model variant selected, etc.)
|
|
66
|
+
- Format each as: **Dilemma [Letter] — Title** / **Context** / **Options (a/b)** / **Recommendation** / **Your decision:** [blank]
|
|
67
|
+
- If all choices were unambiguous, state explicitly: "No open dilemmas."
|
|
68
|
+
- **Do NOT commit the experiment log or model artifacts until the user has reviewed and approved.**
|
|
69
|
+
|
|
70
|
+
### 8. Confirm and Advance
|
|
71
|
+
- Present experiment log summary
|
|
72
|
+
- State clearly: "Primary criterion ([metric] >= [threshold]): PASS / FAIL"
|
|
73
|
+
- On approval: commit artifacts, then say "Proceed to **Stage 7 — /ml-analysis** to evaluate results against the TechSpec contract."
|
|
74
|
+
- STOP and WAIT for user confirmation
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
|
|
2
|
+
advanced_trainer_configs.py — BMAD DL Lifecycle
|
|
3
|
+
(Adapted from K-Dense claude-scientific-skills/pytorch-lightning/quick_trainer_setup.py)
|
|
4
|
+
|
|
5
|
+
10 ready-to-use Lightning Trainer configurations for different training scenarios.
|
|
6
|
+
Each function is self-contained — copy the one you need into your training script.
|
|
7
|
+
|
|
8
|
+
Configurations:
|
|
9
|
+
1. basic_trainer() — Quick prototyping, CPU/auto hardware
|
|
10
|
+
2. debug_trainer() — Fast dev run, anomaly detection
|
|
11
|
+
3. single_gpu_trainer() — Production single GPU with checkpointing + logging
|
|
12
|
+
4. multi_gpu_ddp_trainer() — Multi-GPU with DDP (models < 500M params)
|
|
13
|
+
5. large_model_fsdp_trainer() — FSDP for large models (500M+ params)
|
|
14
|
+
6. deepspeed_trainer() — DeepSpeed for very large models (10B+)
|
|
15
|
+
7. hparam_search_trainer() — Lightweight for hyperparameter sweeps
|
|
16
|
+
8. overfit_test_trainer() — Overfit on N batches to verify model capacity
|
|
17
|
+
9. cluster_time_limited_trainer()— SLURM/cluster jobs with wall-clock limit
|
|
18
|
+
10. reproducible_trainer() — Deterministic, full-precision for publications
|
|
19
|
+
|
|
20
|
+
Usage in your training script:
|
|
21
|
+
from assets.advanced_trainer_configs import single_gpu_trainer
|
|
22
|
+
trainer = single_gpu_trainer(max_epochs=50, experiment_name="run_001")
|
|
23
|
+
trainer.fit(model, datamodule)
|
|
24
|
+
trainer.test(model, datamodule)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
# Import Lightning under either of its two distribution names.
# Prefer the unified "lightning" package; fall back to the legacy
# "pytorch_lightning" package, and record which one was used in
# LIGHTNING_PKG so downstream code can branch on it if needed.
try:
    import lightning as L
    from lightning.pytorch.callbacks import (
        ModelCheckpoint, EarlyStopping, LearningRateMonitor,
        DeviceStatsMonitor, RichProgressBar,
    )
    from lightning.pytorch import loggers as pl_loggers
    from lightning.pytorch.strategies import DDPStrategy, FSDPStrategy
    LIGHTNING_PKG = "lightning"
except ImportError:
    try:
        import pytorch_lightning as L
        from pytorch_lightning.callbacks import (
            ModelCheckpoint, EarlyStopping, LearningRateMonitor,
        )
        from pytorch_lightning import loggers as pl_loggers
        from pytorch_lightning.strategies import DDPStrategy, FSDPStrategy
        # Not imported on the legacy path; trainer factories below check
        # `RichProgressBar is not None` before using it.
        RichProgressBar = DeviceStatsMonitor = None
        LIGHTNING_PKG = "pytorch_lightning"
    except ImportError:
        # Neither distribution is installed — fail loudly with the fix.
        raise ImportError("Install: pip install lightning")
54
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
55
|
+
# 1. BASIC TRAINER — Quick prototyping
|
|
56
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
57
|
+
|
|
58
|
+
def basic_trainer(max_epochs: int = 10) -> "L.Trainer":
    """Bare-bones trainer for quick prototyping.

    Lets Lightning auto-select the accelerator and devices, keeps the
    default logger and the progress bar, and configures nothing else.
    """
    trainer_opts = {
        "max_epochs": max_epochs,
        "accelerator": "auto",
        "devices": "auto",
        "enable_progress_bar": True,
        "logger": True,
    }
    return L.Trainer(**trainer_opts)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
73
|
+
# 2. DEBUG TRAINER — Fast dev run + anomaly detection
|
|
74
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
75
|
+
|
|
76
|
+
def debug_trainer() -> "L.Trainer":
    """Bug-hunting trainer.

    `fast_dev_run` pushes a single batch through train/val/test so wiring
    bugs surface in seconds, and `detect_anomaly` traps NaN/Inf gradients.
    Runs on CPU so failures are easy to reproduce.
    """
    debug_opts = {
        "fast_dev_run": True,
        "accelerator": "cpu",
        "detect_anomaly": True,
        "log_every_n_steps": 1,
        "enable_progress_bar": True,
    }
    return L.Trainer(**debug_opts)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
91
|
+
# 3. SINGLE GPU TRAINER — Production single GPU
|
|
92
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
93
|
+
|
|
94
|
+
def single_gpu_trainer(
    max_epochs: int = 100,
    experiment_name: str = "experiment",
    version: str | None = None,
    log_dir: str | Path = "logs/",
    monitor: str = "val/loss",
    monitor_mode: str = "min",
    patience: int = 10,
) -> "L.Trainer":
    """
    Production-ready single GPU trainer.

    Features: mixed precision, checkpointing (top-3 + last), early stopping,
    LR monitor, CSV + TensorBoard loggers.

    Args:
        max_epochs: Upper bound on training epochs.
        experiment_name: Logger name / checkpoint sub-directory for this run.
        version: Explicit logger version; None lets Lightning auto-number.
        log_dir: Root directory for logs and checkpoints.
        monitor: Metric key watched by checkpointing and early stopping.
        monitor_mode: "min" or "max" for the monitored metric.
        patience: Early-stopping patience, in validation epochs.
    """
    log_dir = Path(log_dir)
    ckpt_dir = log_dir / "checkpoints" / experiment_name

    callbacks = [
        ModelCheckpoint(
            dirpath=ckpt_dir,
            filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
            monitor=monitor, mode=monitor_mode,
            save_top_k=3, save_last=True, verbose=True,
        ),
        EarlyStopping(monitor=monitor, mode=monitor_mode, patience=patience, verbose=True),
        LearningRateMonitor(logging_interval="epoch"),
    ]
    if RichProgressBar is not None:
        callbacks.append(RichProgressBar())

    # TensorBoard required — version= prevents fold/run log collision
    loggers = [
        pl_loggers.TensorBoardLogger(save_dir=str(log_dir), name=experiment_name, version=version),
        pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name, version=version),
    ]

    # Import torch once instead of the original inline double
    # __import__('torch') hack (which imported it twice on one line).
    import torch
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("GPU: WARNING: No GPU detected")
    # NOTE(review): when version is None the loggers auto-number the run,
    # so "version_0" below is only a best guess at the real log path.
    log_path = log_dir / experiment_name / (version or "version_0")
    print(f"Logs → {log_path}/ run: tensorboard --logdir={log_dir}")

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=1,
        precision="16-mixed",
        gradient_clip_val=1.0,
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=10,
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
144
|
+
# 4. MULTI-GPU DDP TRAINER — Distributed Data Parallel
|
|
145
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
146
|
+
|
|
147
|
+
def multi_gpu_ddp_trainer(
    max_epochs: int = 100,
    num_gpus: int = 4,
    experiment_name: str = "experiment",
    log_dir: str | Path = "logs/",
    monitor: str = "val/loss",
    monitor_mode: str = "min",
) -> "L.Trainer":
    """Distributed Data Parallel trainer for standard models (< 500M params).

    Each GPU holds a full model replica; batch norm is synchronized across
    replicas, gradients are clipped at 1.0, and 16-bit mixed precision is
    used throughout.
    """
    root = Path(log_dir)

    checkpoint_cb = ModelCheckpoint(
        dirpath=root / "checkpoints" / experiment_name,
        filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
        monitor=monitor, mode=monitor_mode,
        save_top_k=3, save_last=True,
    )
    early_stop_cb = EarlyStopping(monitor=monitor, mode=monitor_mode, patience=10)
    lr_cb = LearningRateMonitor(logging_interval="step")

    csv_logger = pl_loggers.CSVLogger(save_dir=str(root), name=experiment_name)

    ddp_strategy = DDPStrategy(
        find_unused_parameters=False,
        gradient_as_bucket_view=True,
    )

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=num_gpus,
        strategy=ddp_strategy,
        precision="16-mixed",
        gradient_clip_val=1.0,
        sync_batchnorm=True,
        callbacks=[checkpoint_cb, early_stop_cb, lr_cb],
        logger=[csv_logger],
        log_every_n_steps=50,
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
192
|
+
# 5. LARGE MODEL FSDP TRAINER — Fully Sharded Data Parallel
|
|
193
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
194
|
+
|
|
195
|
+
def large_model_fsdp_trainer(
    max_epochs: int = 100,
    num_gpus: int = 8,
    experiment_name: str = "large_model",
    log_dir: str | Path = "logs/",
    cpu_offload: bool = False,
) -> "L.Trainer":
    """
    FSDP trainer for large models (500M+ parameters).

    Shards model weights across GPUs — each GPU holds a fraction of params.
    Activation checkpointing is applied to transformer encoder/decoder layers.
    Set cpu_offload=True if you run out of GPU memory even with FSDP.
    Requires BF16 capable hardware (A100/H100).
    """
    import torch.nn as nn
    log_dir = Path(log_dir)
    ckpt_dir = log_dir / "checkpoints" / experiment_name

    callbacks = [
        ModelCheckpoint(
            dirpath=ckpt_dir,
            # Placeholder must match the monitored metric key: the original
            # "{val_loss:.4f}" never matches the logged "val/loss" metric,
            # so the value could not be filled in. Now consistent with
            # single_gpu_trainer's filename convention.
            filename=f"{experiment_name}-{{epoch:02d}}-{{val/loss:.4f}}",
            monitor="val/loss", mode="min",
            save_top_k=3, save_last=True,
        ),
        LearningRateMonitor(logging_interval="step"),
    ]

    loggers = [pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name)]

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=num_gpus,
        strategy=FSDPStrategy(
            sharding_strategy="FULL_SHARD",
            activation_checkpointing_policy={
                nn.TransformerEncoderLayer,
                nn.TransformerDecoderLayer,
            },
            cpu_offload=cpu_offload,
        ),
        precision="bf16-mixed",
        gradient_clip_val=1.0,
        accumulate_grad_batches=4,
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=10,
    )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
244
|
+
# 6. DEEPSPEED TRAINER — Very large models (10B+)
|
|
245
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
246
|
+
|
|
247
|
+
def deepspeed_trainer(
    max_epochs: int = 100,
    num_gpus: int = 8,
    stage: int = 3,
    experiment_name: str = "xlarge_model",
    log_dir: str | Path = "logs/",
) -> "L.Trainer":
    """DeepSpeed trainer for very large models (>10B parameters).

    Stage 3 (ZeRO-3) shards optimizer states, gradients, AND parameters
    across GPUs. Requires: pip install deepspeed
    """
    root = Path(log_dir)

    checkpoint_cb = ModelCheckpoint(
        dirpath=root / "checkpoints" / experiment_name,
        save_top_k=3, save_last=True,
        every_n_train_steps=1000,  # step-based saves: epochs are very long here
    )
    lr_cb = LearningRateMonitor(logging_interval="step")
    csv_logger = pl_loggers.CSVLogger(save_dir=str(root), name=experiment_name)

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=num_gpus,
        strategy=f"deepspeed_stage_{stage}",
        precision="16-mixed",
        gradient_clip_val=1.0,
        accumulate_grad_batches=4,
        callbacks=[checkpoint_cb, lr_cb],
        logger=[csv_logger],
        log_every_n_steps=10,
    )
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
284
|
+
# 7. HYPERPARAMETER SEARCH TRAINER — Lightweight sweep runner
|
|
285
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
286
|
+
|
|
287
|
+
def hparam_search_trainer(max_epochs: int = 20) -> "L.Trainer":
    """Stripped-down trainer for hyperparameter sweeps (Optuna, Ray Tune, W&B).

    Disables checkpointing, logging, and the progress bar, and trains /
    validates on only half the batches so each trial finishes quickly.
    """
    sweep_opts = {
        "max_epochs": max_epochs,
        "accelerator": "auto",
        "devices": 1,
        "enable_checkpointing": False,
        "logger": False,
        "enable_progress_bar": False,
        "limit_train_batches": 0.5,
        "limit_val_batches": 0.5,
    }
    return L.Trainer(**sweep_opts)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
304
|
+
# 8. OVERFIT TEST TRAINER — Verify model capacity
|
|
305
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
306
|
+
|
|
307
|
+
def overfit_test_trainer(num_batches: int = 10, max_epochs: int = 100) -> "L.Trainer":
    """Sanity-check trainer: overfit a tiny fixed subset of the data.

    A healthy model/optimizer pair should drive the training loss to ~0 on
    `num_batches` batches; if it can't, there is a model or optimizer bug.
    """
    return L.Trainer(
        overfit_batches=num_batches,
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        log_every_n_steps=1,
        enable_progress_bar=True,
    )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
322
|
+
# 9. CLUSTER TIME-LIMITED TRAINER — SLURM/HPC wall-clock aware
|
|
323
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
324
|
+
|
|
325
|
+
def cluster_time_limited_trainer(
    max_time_hours: float = 23.5,
    max_epochs: int = 1000,
    checkpoint_dir: str | Path = "checkpoints/",
) -> "L.Trainer":
    """Wall-clock-aware trainer for SLURM/HPC jobs.

    Training halts gracefully once `max_time_hours` elapses; `save_last`
    keeps a `last.ckpt` so a re-submitted job can resume:

        trainer = cluster_time_limited_trainer(max_time_hours=23.5)
        trainer.fit(model, dm, ckpt_path="checkpoints/last.ckpt")  # resumes if exists
    """
    from datetime import timedelta

    ckpt_root = Path(checkpoint_dir)

    cbs = [
        ModelCheckpoint(
            dirpath=ckpt_root,
            save_top_k=3, save_last=True,
            every_n_epochs=5, verbose=False,
        ),
    ]
    if RichProgressBar is not None:
        cbs.append(RichProgressBar())

    return L.Trainer(
        max_epochs=max_epochs,
        max_time=timedelta(hours=max_time_hours),
        accelerator="gpu", devices="auto",
        callbacks=cbs,
        log_every_n_steps=50,
    )
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
362
|
+
# 10. REPRODUCIBLE TRAINER — Deterministic results for publications
|
|
363
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
364
|
+
|
|
365
|
+
def reproducible_trainer(
    seed: int = 42,
    max_epochs: int = 100,
    experiment_name: str = "reproducible",
    log_dir: str | Path = "logs/",
) -> "L.Trainer":
    """
    Fully deterministic trainer for reproducible research.

    Seeds everything (dataloader workers included), uses full FP32
    precision and deterministic CUDA ops with cudnn benchmarking off.
    NOTE: Slower than mixed precision — only use for final publication runs.
    """
    L.seed_everything(seed, workers=True)
    log_dir = Path(log_dir)

    callbacks = [
        ModelCheckpoint(
            dirpath=log_dir / "checkpoints" / experiment_name,
            # Placeholder must match the monitored metric key: the original
            # "{val_loss:.4f}" never matches the logged "val/loss" metric,
            # so it could not be filled in. Now consistent with
            # single_gpu_trainer's filename convention.
            filename=f"{experiment_name}-{{epoch:02d}}-{{val/loss:.4f}}",
            monitor="val/loss", mode="min",
            save_top_k=3, save_last=True,
        ),
        LearningRateMonitor(logging_interval="epoch"),
    ]
    loggers = [
        pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name),
    ]

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=1,
        precision="32-true",
        deterministic=True,
        benchmark=False,
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=50,
    )
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
404
|
+
# Quick selection guide
|
|
405
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
406
|
+
|
|
407
|
+
if __name__ == "__main__":
    # Running the module directly only prints this selection guide; the
    # trainer factories above are meant to be imported, not executed here.
    print("""
    Advanced Trainer Configurations — BMAD DL Lifecycle
    (Adapted from K-Dense AI claude-scientific-skills)

    ╔══════════════════════╦═════════════════════════════════════════════════╗
    ║ Scenario             ║ Use                                             ║
    ╠══════════════════════╬═════════════════════════════════════════════════╣
    ║ Quick test           ║ basic_trainer()                                 ║
    ║ Find bugs            ║ debug_trainer()                                 ║
    ║ Verify model learns  ║ overfit_test_trainer()                          ║
    ║ Hparam sweep         ║ hparam_search_trainer()                         ║
    ║ Production 1 GPU     ║ single_gpu_trainer()                            ║
    ║ Production N GPUs    ║ multi_gpu_ddp_trainer(num_gpus=4)               ║
    ║ Large model (500M+)  ║ large_model_fsdp_trainer(num_gpus=8)            ║
    ║ Very large (10B+)    ║ deepspeed_trainer(num_gpus=8, stage=3)          ║
    ║ SLURM cluster        ║ cluster_time_limited_trainer(max_time_hours=23) ║
    ║ Publication result   ║ reproducible_trainer(seed=42)                   ║
    ╚══════════════════════╩═════════════════════════════════════════════════╝

    After training, analyze with:
      python3 scripts/parse_training_logs.py logs/<exp>/version_0/metrics.csv docs/prd/01_PRD.md
      python3 scripts/plot_training_curves.py logs/<exp>/version_0/metrics.csv
    """)
|