ma-agents 3.3.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/.opencode/skills/.ma-agents.json +99 -99
  2. package/.roo/skills/.ma-agents.json +99 -99
  3. package/README.md +56 -15
  4. package/bin/cli.js +63 -8
  5. package/lib/agents.js +23 -0
  6. package/lib/bmad-cache/cache-manifest.json +1 -1
  7. package/lib/bmad-customizations/bmm-demerzel.customize.yaml +36 -0
  8. package/lib/bmad-customizations/demerzel.md +32 -0
  9. package/lib/bmad-extension/module-help.csv +13 -0
  10. package/lib/bmad-extension/skills/bmad-ma-agent-ml/.gitkeep +0 -0
  11. package/lib/bmad-extension/skills/bmad-ma-agent-ml/SKILL.md +59 -0
  12. package/lib/bmad-extension/skills/bmad-ma-agent-ml/bmad-skill-manifest.yaml +11 -0
  13. package/lib/bmad-extension/skills/generate-backlog/.gitkeep +0 -0
  14. package/lib/bmad-extension/skills/ml-advise/.gitkeep +0 -0
  15. package/lib/bmad-extension/skills/ml-advise/SKILL.md +76 -0
  16. package/lib/bmad-extension/skills/ml-advise/bmad-skill-manifest.yaml +3 -0
  17. package/lib/bmad-extension/skills/ml-advise/skill.json +7 -0
  18. package/lib/bmad-extension/skills/ml-analysis/.gitkeep +0 -0
  19. package/lib/bmad-extension/skills/ml-analysis/SKILL.md +60 -0
  20. package/lib/bmad-extension/skills/ml-analysis/bmad-skill-manifest.yaml +3 -0
  21. package/lib/bmad-extension/skills/ml-analysis/skill.json +7 -0
  22. package/lib/bmad-extension/skills/ml-architecture/.gitkeep +0 -0
  23. package/lib/bmad-extension/skills/ml-architecture/SKILL.md +55 -0
  24. package/lib/bmad-extension/skills/ml-architecture/bmad-skill-manifest.yaml +3 -0
  25. package/lib/bmad-extension/skills/ml-architecture/skill.json +7 -0
  26. package/lib/bmad-extension/skills/ml-detailed-design/.gitkeep +0 -0
  27. package/lib/bmad-extension/skills/ml-detailed-design/SKILL.md +67 -0
  28. package/lib/bmad-extension/skills/ml-detailed-design/bmad-skill-manifest.yaml +3 -0
  29. package/lib/bmad-extension/skills/ml-detailed-design/skill.json +7 -0
  30. package/lib/bmad-extension/skills/ml-eda/.gitkeep +0 -0
  31. package/lib/bmad-extension/skills/ml-eda/SKILL.md +56 -0
  32. package/lib/bmad-extension/skills/ml-eda/bmad-skill-manifest.yaml +3 -0
  33. package/lib/bmad-extension/skills/ml-eda/scripts/baseline_classifier.py +522 -0
  34. package/lib/bmad-extension/skills/ml-eda/scripts/class_weights_calculator.py +295 -0
  35. package/lib/bmad-extension/skills/ml-eda/scripts/clustering_explorer.py +383 -0
  36. package/lib/bmad-extension/skills/ml-eda/scripts/eda_analyzer.py +654 -0
  37. package/lib/bmad-extension/skills/ml-eda/skill.json +7 -0
  38. package/lib/bmad-extension/skills/ml-experiment/.gitkeep +0 -0
  39. package/lib/bmad-extension/skills/ml-experiment/SKILL.md +74 -0
  40. package/lib/bmad-extension/skills/ml-experiment/assets/advanced_trainer_configs.py +430 -0
  41. package/lib/bmad-extension/skills/ml-experiment/assets/quick_trainer_setup.py +233 -0
  42. package/lib/bmad-extension/skills/ml-experiment/assets/template_datamodule.py +219 -0
  43. package/lib/bmad-extension/skills/ml-experiment/assets/template_gnn_module.py +341 -0
  44. package/lib/bmad-extension/skills/ml-experiment/assets/template_lightning_module.py +158 -0
  45. package/lib/bmad-extension/skills/ml-experiment/bmad-skill-manifest.yaml +3 -0
  46. package/lib/bmad-extension/skills/ml-experiment/skill.json +7 -0
  47. package/lib/bmad-extension/skills/ml-hparam/.gitkeep +0 -0
  48. package/lib/bmad-extension/skills/ml-hparam/SKILL.md +81 -0
  49. package/lib/bmad-extension/skills/ml-hparam/bmad-skill-manifest.yaml +3 -0
  50. package/lib/bmad-extension/skills/ml-hparam/skill.json +7 -0
  51. package/lib/bmad-extension/skills/ml-ideation/.gitkeep +0 -0
  52. package/lib/bmad-extension/skills/ml-ideation/SKILL.md +50 -0
  53. package/lib/bmad-extension/skills/ml-ideation/bmad-skill-manifest.yaml +3 -0
  54. package/lib/bmad-extension/skills/ml-ideation/scripts/validate_ml_prd.py +287 -0
  55. package/lib/bmad-extension/skills/ml-ideation/skill.json +7 -0
  56. package/lib/bmad-extension/skills/ml-infra/.gitkeep +0 -0
  57. package/lib/bmad-extension/skills/ml-infra/SKILL.md +58 -0
  58. package/lib/bmad-extension/skills/ml-infra/bmad-skill-manifest.yaml +3 -0
  59. package/lib/bmad-extension/skills/ml-infra/skill.json +7 -0
  60. package/lib/bmad-extension/skills/ml-retrospective/.gitkeep +0 -0
  61. package/lib/bmad-extension/skills/ml-retrospective/SKILL.md +63 -0
  62. package/lib/bmad-extension/skills/ml-retrospective/bmad-skill-manifest.yaml +3 -0
  63. package/lib/bmad-extension/skills/ml-retrospective/skill.json +7 -0
  64. package/lib/bmad-extension/skills/ml-revision/.gitkeep +0 -0
  65. package/lib/bmad-extension/skills/ml-revision/SKILL.md +82 -0
  66. package/lib/bmad-extension/skills/ml-revision/bmad-skill-manifest.yaml +3 -0
  67. package/lib/bmad-extension/skills/ml-revision/skill.json +7 -0
  68. package/lib/bmad-extension/skills/ml-techspec/.gitkeep +0 -0
  69. package/lib/bmad-extension/skills/ml-techspec/SKILL.md +80 -0
  70. package/lib/bmad-extension/skills/ml-techspec/bmad-skill-manifest.yaml +3 -0
  71. package/lib/bmad-extension/skills/ml-techspec/skill.json +7 -0
  72. package/lib/bmad.js +85 -8
  73. package/lib/skill-authoring.js +1 -1
  74. package/package.json +2 -2
  75. package/test/agent-injection-strategy.test.js +4 -4
  76. package/test/bmad-version-bump.test.js +34 -34
  77. package/test/build-bmad-args.test.js +13 -6
  78. package/test/convert-agents-to-skills.test.js +11 -1
  79. package/test/extension-module-restructure.test.js +31 -7
  80. package/test/migration-validation.test.js +14 -11
@@ -0,0 +1,74 @@
1
+ ---
2
+ name: ml-experiment
3
+ description: ML Experiment - Execute training, hyperparameter optimization, and log all runs to the tracking tool
4
+ ---
5
+
6
+ # ML Stage 6 - Experiment Execution
7
+
8
+ Execute the training pipeline as specified in the locked TechSpec. Log everything. Touch nothing outside the HPO space.
9
+
10
+ ## Instructions
11
+
12
+ ### 1. Load Context
13
+ - Read `_bmad-output/planning-artifacts/ml-techspec.md` — MUST be LOCKED. If not, STOP.
14
+ - Read `_bmad-output/planning-artifacts/ml-architecture.md`
15
+ - Note the fixed random seed, feature set, and validation strategy — do not deviate
16
+
17
+ ### 2. Write Training Script
18
+ If `scripts/train.py` does not exist, generate it following the TechSpec exactly:
19
+ - Load data from TechSpec data contract paths
20
+ - Apply preprocessing pipeline (from ml-architecture)
21
+ - Create train/val split using TechSpec validation strategy and fixed seed
22
+ - Instantiate tracking run (wandb.init / mlflow.start_run / local log)
23
+ - Log all TechSpec fixed params as run config
24
+ - Run baseline model first — log its metrics
25
+ - Run candidate model with fixed hyperparameters — log metrics
26
+ - If HPO is specified: run HPO trials, log each trial
27
+ - Select best model; log final metrics against ALL TechSpec acceptance criteria
28
+ - Save model artifact to `_bmad-output/implementation-artifacts/models/`
29
+ - Log pass/fail for each acceptance criterion explicitly
30
+
31
+ Use `assets/` templates if they exist (e.g. `assets/train_template.py`).
32
+
33
+ ### 3. Execute Baseline
34
+ Run baseline first — always:
35
+
36
+ uv run python scripts/train.py --mode baseline
37
+
38
+ Log baseline metrics. The candidate model must beat this.
39
+
40
+ ### 4. Execute Candidate Model
41
+
42
+ uv run python scripts/train.py --mode candidate
43
+
44
+ ### 5. Execute HPO (if specified in TechSpec)
45
+ Run hyperparameter optimization within the HPO space defined in TechSpec:
46
+
47
+ uv run python scripts/train.py --mode hpo
48
+
49
+ Log all trials. Select best params by primary metric on validation set only.
50
+
51
+ ### 6. Write Experiment Log
52
+ Write `_bmad-output/implementation-artifacts/experiment-log.md` with:
53
+ - **Run IDs**: Tracking tool run IDs or local log paths
54
+ - **Baseline Results**: metrics for each acceptance criterion
55
+ - **Best Model Results**: metrics for each acceptance criterion
56
+ - **Best Hyperparameters**: final selected values
57
+ - **Acceptance Criteria Status**: PASS / FAIL for each criterion from TechSpec
58
+ - **Training Time**: wall-clock time
59
+ - **Model Artifact Path**: path to saved model
60
+
61
+ ### 7. Surface Dilemmas & Commit Gate
62
+
63
+ Before presenting and **before any git commit**:
64
+
65
+ - Identify every execution decision where two or more options existed (early stopping patience, batch size, resampling applied or not, model variant selected, etc.)
66
+ - Format each as: **Dilemma [Letter] — Title** / **Context** / **Options (a/b)** / **Recommendation** / **Your decision:** [blank]
67
+ - If all choices were unambiguous, state explicitly: "No open dilemmas."
68
+ - **Do NOT commit the experiment log or model artifacts until the user has reviewed and approved.**
69
+
70
+ ### 8. Confirm and Advance
71
+ - Present experiment log summary
72
+ - State clearly: "Primary criterion ([metric] >= [threshold]): PASS / FAIL"
73
+ - On approval: commit artifacts, then say "Proceed to **Stage 7 — /ml-analysis** to evaluate results against the TechSpec contract."
74
+ - STOP and WAIT for user confirmation
@@ -0,0 +1,430 @@
1
+ """
2
+ advanced_trainer_configs.py — BMAD DL Lifecycle
3
+ (Adapted from K-Dense claude-scientific-skills/pytorch-lightning/quick_trainer_setup.py)
4
+
5
+ 10 ready-to-use Lightning Trainer configurations for different training scenarios.
6
+ Each function is self-contained — copy the one you need into your training script.
7
+
8
+ Configurations:
9
+ 1. basic_trainer() — Quick prototyping, CPU/auto hardware
10
+ 2. debug_trainer() — Fast dev run, anomaly detection
11
+ 3. single_gpu_trainer() — Production single GPU with checkpointing + logging
12
+ 4. multi_gpu_ddp_trainer() — Multi-GPU with DDP (models < 500M params)
13
+ 5. large_model_fsdp_trainer() — FSDP for large models (500M+ params)
14
+ 6. deepspeed_trainer() — DeepSpeed for very large models (10B+)
15
+ 7. hparam_search_trainer() — Lightweight for hyperparameter sweeps
16
+ 8. overfit_test_trainer() — Overfit on N batches to verify model capacity
17
+ 9. cluster_time_limited_trainer()— SLURM/cluster jobs with wall-clock limit
18
+ 10. reproducible_trainer() — Deterministic, full-precision for publications
19
+
20
+ Usage in your training script:
21
+ from assets.advanced_trainer_configs import single_gpu_trainer
22
+ trainer = single_gpu_trainer(max_epochs=50, experiment_name="run_001")
23
+ trainer.fit(model, datamodule)
24
+ trainer.test(model, datamodule)
25
+ """
26
+
27
from __future__ import annotations

from pathlib import Path

# Prefer the unified `lightning` package (>= 2.0); fall back to the legacy
# `pytorch_lightning` namespace so older environments keep working.
try:
    import lightning as L
    from lightning.pytorch.callbacks import (
        ModelCheckpoint, EarlyStopping, LearningRateMonitor,
        DeviceStatsMonitor, RichProgressBar,
    )
    from lightning.pytorch import loggers as pl_loggers
    from lightning.pytorch.strategies import DDPStrategy, FSDPStrategy
    # Records which package variant was imported, for downstream diagnostics.
    LIGHTNING_PKG = "lightning"
except ImportError:
    try:
        import pytorch_lightning as L
        from pytorch_lightning.callbacks import (
            ModelCheckpoint, EarlyStopping, LearningRateMonitor,
        )
        from pytorch_lightning import loggers as pl_loggers
        from pytorch_lightning.strategies import DDPStrategy, FSDPStrategy
        # These two callbacks are not imported on the legacy path; use sites
        # guard with `if RichProgressBar is not None:` before appending them.
        RichProgressBar = DeviceStatsMonitor = None
        LIGHTNING_PKG = "pytorch_lightning"
    except ImportError:
        # Neither package variant is available — fail loudly at import time.
        raise ImportError("Install: pip install lightning")
52
+
53
+
54
+ # ══════════════════════════════════════════════════════════════════════════════
55
+ # 1. BASIC TRAINER — Quick prototyping
56
+ # ══════════════════════════════════════════════════════════════════════════════
57
+
58
def basic_trainer(max_epochs: int = 10) -> "L.Trainer":
    """Minimal trainer for quick prototyping.

    Hardware (GPU/CPU) is auto-selected and only the default logger and
    progress bar are enabled.
    """
    trainer_options = dict(
        accelerator="auto",
        devices="auto",
        max_epochs=max_epochs,
        logger=True,
        enable_progress_bar=True,
    )
    return L.Trainer(**trainer_options)
70
+
71
+
72
+ # ══════════════════════════════════════════════════════════════════════════════
73
+ # 2. DEBUG TRAINER — Fast dev run + anomaly detection
74
+ # ══════════════════════════════════════════════════════════════════════════════
75
+
76
def debug_trainer() -> "L.Trainer":
    """Bug-hunting trainer.

    ``fast_dev_run`` pushes a single batch through train/val/test so failures
    surface immediately, and ``detect_anomaly`` traps NaN/Inf gradients.
    Runs on CPU to keep the loop simple and deterministic.
    """
    return L.Trainer(
        accelerator="cpu",
        fast_dev_run=True,
        detect_anomaly=True,
        enable_progress_bar=True,
        log_every_n_steps=1,
    )
88
+
89
+
90
+ # ══════════════════════════════════════════════════════════════════════════════
91
+ # 3. SINGLE GPU TRAINER — Production single GPU
92
+ # ══════════════════════════════════════════════════════════════════════════════
93
+
94
def single_gpu_trainer(
    max_epochs: int = 100,
    experiment_name: str = "experiment",
    version: str | None = None,
    log_dir: str | Path = "logs/",
    monitor: str = "val/loss",
    monitor_mode: str = "min",
    patience: int = 10,
) -> "L.Trainer":
    """Production-ready single-GPU trainer.

    Features: mixed precision, checkpointing (top-3 + last), early stopping,
    LR monitoring, CSV + TensorBoard loggers.

    Args:
        max_epochs: Maximum number of training epochs.
        experiment_name: Name used for checkpoint/log sub-directories.
        version: Explicit logger version; prevents fold/run log collisions.
        log_dir: Root directory for logs and checkpoints.
        monitor: Metric key watched for checkpointing and early stopping.
        monitor_mode: "min" or "max" for the monitored metric.
        patience: Early-stopping patience in epochs.
    """
    # CLEANUP: was a doubled inline `__import__('torch')` expression inside a
    # print f-string; a plain local import is equivalent and readable.
    import torch

    log_dir = Path(log_dir)
    ckpt_dir = log_dir / "checkpoints" / experiment_name

    callbacks = [
        ModelCheckpoint(
            dirpath=ckpt_dir,
            # NOTE(review): a "/" in `monitor` (e.g. "val/loss") ends up inside
            # the filename template and can create sub-directories — confirm
            # this matches the intended checkpoint layout.
            filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
            monitor=monitor, mode=monitor_mode,
            save_top_k=3, save_last=True, verbose=True,
        ),
        EarlyStopping(monitor=monitor, mode=monitor_mode, patience=patience, verbose=True),
        LearningRateMonitor(logging_interval="epoch"),
    ]
    if RichProgressBar is not None:  # absent under the legacy fallback import
        callbacks.append(RichProgressBar())

    # TensorBoard required — version= prevents fold/run log collision
    loggers = [
        pl_loggers.TensorBoardLogger(save_dir=str(log_dir), name=experiment_name, version=version),
        pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name, version=version),
    ]

    # Same output text as before, without the __import__ hack.
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("GPU: WARNING: No GPU detected")
    # "version_0" is only a display fallback; when version is None the logger
    # may auto-assign a different version directory — TODO confirm.
    log_path = log_dir / experiment_name / (version or "version_0")
    print(f"Logs → {log_path}/ run: tensorboard --logdir={log_dir}")

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=1,
        precision="16-mixed",
        gradient_clip_val=1.0,
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=10,
    )
141
+
142
+
143
+ # ══════════════════════════════════════════════════════════════════════════════
144
+ # 4. MULTI-GPU DDP TRAINER — Distributed Data Parallel
145
+ # ══════════════════════════════════════════════════════════════════════════════
146
+
147
def multi_gpu_ddp_trainer(
    max_epochs: int = 100,
    num_gpus: int = 4,
    experiment_name: str = "experiment",
    log_dir: str | Path = "logs/",
    monitor: str = "val/loss",
    monitor_mode: str = "min",
) -> "L.Trainer":
    """Multi-GPU trainer using Distributed Data Parallel.

    Best suited to standard DL models below ~500M parameters; batch-norm
    statistics are synchronized across ranks automatically.
    """
    base = Path(log_dir)

    checkpoint_cb = ModelCheckpoint(
        dirpath=base / "checkpoints" / experiment_name,
        filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
        monitor=monitor,
        mode=monitor_mode,
        save_top_k=3,
        save_last=True,
    )
    stopper = EarlyStopping(monitor=monitor, mode=monitor_mode, patience=10)
    lr_monitor = LearningRateMonitor(logging_interval="step")
    csv_logger = pl_loggers.CSVLogger(save_dir=str(base), name=experiment_name)

    ddp = DDPStrategy(
        find_unused_parameters=False,
        gradient_as_bucket_view=True,
    )

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu",
        devices=num_gpus,
        strategy=ddp,
        precision="16-mixed",
        gradient_clip_val=1.0,
        sync_batchnorm=True,
        callbacks=[checkpoint_cb, stopper, lr_monitor],
        logger=[csv_logger],
        log_every_n_steps=50,
    )
189
+
190
+
191
+ # ══════════════════════════════════════════════════════════════════════════════
192
+ # 5. LARGE MODEL FSDP TRAINER — Fully Sharded Data Parallel
193
+ # ══════════════════════════════════════════════════════════════════════════════
194
+
195
def large_model_fsdp_trainer(
    max_epochs: int = 100,
    num_gpus: int = 8,
    experiment_name: str = "large_model",
    log_dir: str | Path = "logs/",
    cpu_offload: bool = False,
    monitor: str = "val/loss",
    monitor_mode: str = "min",
) -> "L.Trainer":
    """FSDP trainer for large models (500M+ parameters).

    Shards model weights across GPUs — each rank holds only a fraction of
    the parameters. Set ``cpu_offload=True`` if GPU memory is still exhausted
    with FSDP. Uses BF16 mixed precision, so it requires BF16-capable
    hardware (A100/H100).

    Args:
        max_epochs: Maximum number of training epochs.
        num_gpus: Number of GPUs to shard across.
        experiment_name: Name used for checkpoint/log sub-directories.
        log_dir: Root directory for logs and checkpoints.
        cpu_offload: Offload sharded params/grads to CPU when True.
        monitor: Metric key watched for checkpointing (new, defaulted to the
            previous hard-coded value — backward compatible).
        monitor_mode: "min" or "max" for the monitored metric.
    """
    import torch.nn as nn  # local: only needed for the checkpointing policy

    log_dir = Path(log_dir)
    ckpt_dir = log_dir / "checkpoints" / experiment_name

    callbacks = [
        ModelCheckpoint(
            dirpath=ckpt_dir,
            # BUGFIX: the placeholder was "{val_loss:.4f}" while the monitored
            # key is "val/loss" — the names never matched, so the metric value
            # embedded in checkpoint filenames was meaningless. Use the monitor
            # key itself, matching single_gpu_trainer / multi_gpu_ddp_trainer.
            filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
            monitor=monitor, mode=monitor_mode,
            save_top_k=3, save_last=True,
        ),
        LearningRateMonitor(logging_interval="step"),
    ]

    loggers = [pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name)]

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=num_gpus,
        strategy=FSDPStrategy(
            sharding_strategy="FULL_SHARD",
            # Re-materialize transformer-layer activations during backward to
            # cut activation memory on top of parameter sharding.
            activation_checkpointing_policy={
                nn.TransformerEncoderLayer,
                nn.TransformerDecoderLayer,
            },
            cpu_offload=cpu_offload,
        ),
        precision="bf16-mixed",
        gradient_clip_val=1.0,
        accumulate_grad_batches=4,  # effective batch = 4 × per-GPU batch
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=10,
    )
241
+
242
+
243
+ # ══════════════════════════════════════════════════════════════════════════════
244
+ # 6. DEEPSPEED TRAINER — Very large models (10B+)
245
+ # ══════════════════════════════════════════════════════════════════════════════
246
+
247
def deepspeed_trainer(
    max_epochs: int = 100,
    num_gpus: int = 8,
    stage: int = 3,
    experiment_name: str = "xlarge_model",
    log_dir: str | Path = "logs/",
) -> "L.Trainer":
    """DeepSpeed trainer for very large models (>10B parameters).

    ZeRO stage 3 shards optimizer states, gradients AND parameters across
    ranks. Requires: pip install deepspeed
    """
    base = Path(log_dir)

    checkpoint_cb = ModelCheckpoint(
        dirpath=base / "checkpoints" / experiment_name,
        save_top_k=3,
        save_last=True,
        every_n_train_steps=1000,
    )
    lr_monitor = LearningRateMonitor(logging_interval="step")
    csv_logger = pl_loggers.CSVLogger(save_dir=str(base), name=experiment_name)

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu",
        devices=num_gpus,
        strategy=f"deepspeed_stage_{stage}",
        precision="16-mixed",
        gradient_clip_val=1.0,
        accumulate_grad_batches=4,
        callbacks=[checkpoint_cb, lr_monitor],
        logger=[csv_logger],
        log_every_n_steps=10,
    )
281
+
282
+
283
+ # ══════════════════════════════════════════════════════════════════════════════
284
+ # 7. HYPERPARAMETER SEARCH TRAINER — Lightweight sweep runner
285
+ # ══════════════════════════════════════════════════════════════════════════════
286
+
287
def hparam_search_trainer(max_epochs: int = 20) -> "L.Trainer":
    """Stripped-down trainer for hyperparameter sweeps.

    Intended for Optuna / Ray Tune / W&B Sweeps trials: checkpointing,
    logging and the progress bar are disabled, and only half of the
    train/val batches run, so each trial finishes quickly.
    """
    return L.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=max_epochs,
        limit_train_batches=0.5,
        limit_val_batches=0.5,
        enable_checkpointing=False,
        enable_progress_bar=False,
        logger=False,
    )
301
+
302
+
303
+ # ══════════════════════════════════════════════════════════════════════════════
304
+ # 8. OVERFIT TEST TRAINER — Verify model capacity
305
+ # ══════════════════════════════════════════════════════════════════════════════
306
+
307
def overfit_test_trainer(num_batches: int = 10, max_epochs: int = 100) -> "L.Trainer":
    """Sanity-check trainer: deliberately overfit a tiny data subset.

    A model that cannot overfit ``num_batches`` batches almost certainly has
    a model/optimizer bug — fix that before any real training run.
    """
    return L.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=max_epochs,
        overfit_batches=num_batches,
        enable_progress_bar=True,
        log_every_n_steps=1,
    )
319
+
320
+
321
+ # ══════════════════════════════════════════════════════════════════════════════
322
+ # 9. CLUSTER TIME-LIMITED TRAINER — SLURM/HPC wall-clock aware
323
+ # ══════════════════════════════════════════════════════════════════════════════
324
+
325
def cluster_time_limited_trainer(
    max_time_hours: float = 23.5,
    max_epochs: int = 1000,
    checkpoint_dir: str | Path = "checkpoints/",
) -> "L.Trainer":
    """Wall-clock-aware trainer for SLURM/HPC queues.

    Training stops (with an automatic last checkpoint) once the time budget
    is spent, so the job can be re-submitted and resumed:

        trainer = cluster_time_limited_trainer(max_time_hours=23.5)
        trainer.fit(model, dm, ckpt_path="checkpoints/last.ckpt")  # resumes if exists
    """
    from datetime import timedelta

    checkpoint_cb = ModelCheckpoint(
        dirpath=Path(checkpoint_dir),
        save_top_k=3,
        save_last=True,
        every_n_epochs=5,
        verbose=False,
    )
    callbacks = [checkpoint_cb]
    if RichProgressBar is not None:  # absent under the legacy fallback import
        callbacks.append(RichProgressBar())

    return L.Trainer(
        max_epochs=max_epochs,
        max_time=timedelta(hours=max_time_hours),
        accelerator="gpu",
        devices="auto",
        callbacks=callbacks,
        log_every_n_steps=50,
    )
359
+
360
+
361
+ # ══════════════════════════════════════════════════════════════════════════════
362
+ # 10. REPRODUCIBLE TRAINER — Deterministic results for publications
363
+ # ══════════════════════════════════════════════════════════════════════════════
364
+
365
def reproducible_trainer(
    seed: int = 42,
    max_epochs: int = 100,
    experiment_name: str = "reproducible",
    log_dir: str | Path = "logs/",
    monitor: str = "val/loss",
    monitor_mode: str = "min",
) -> "L.Trainer":
    """Fully deterministic trainer for reproducible research.

    Seeds all RNGs (including dataloader workers), forces deterministic CUDA
    ops and full FP32 precision.
    NOTE: Slower than mixed precision — only use for final publication runs.

    Args:
        seed: Global seed passed to ``seed_everything``.
        max_epochs: Maximum number of training epochs.
        experiment_name: Name used for checkpoint/log sub-directories.
        log_dir: Root directory for logs and checkpoints.
        monitor: Metric key watched for checkpointing (new, defaulted to the
            previous hard-coded value — backward compatible).
        monitor_mode: "min" or "max" for the monitored metric.
    """
    L.seed_everything(seed, workers=True)
    log_dir = Path(log_dir)

    callbacks = [
        ModelCheckpoint(
            dirpath=log_dir / "checkpoints" / experiment_name,
            # BUGFIX: the placeholder was "{val_loss:.4f}" while the monitored
            # key is "val/loss" — the names never matched, so the metric value
            # in checkpoint filenames was meaningless. Use the monitor key,
            # matching the other trainers in this module.
            filename=f"{experiment_name}-{{epoch:02d}}-{{{monitor}:.4f}}",
            monitor=monitor, mode=monitor_mode,
            save_top_k=3, save_last=True,
        ),
        LearningRateMonitor(logging_interval="epoch"),
    ]
    loggers = [
        pl_loggers.CSVLogger(save_dir=str(log_dir), name=experiment_name),
    ]

    return L.Trainer(
        max_epochs=max_epochs,
        accelerator="gpu", devices=1,
        precision="32-true",   # full FP32: bitwise stability over speed
        deterministic=True,    # deterministic CUDA/cuDNN kernels
        benchmark=False,       # cuDNN autotuner would break determinism
        callbacks=callbacks, logger=loggers,
        log_every_n_steps=50,
    )
401
+
402
+
403
+ # ══════════════════════════════════════════════════════════════════════════════
404
+ # Quick selection guide
405
+ # ══════════════════════════════════════════════════════════════════════════════
406
+
407
# Quick-reference banner mapping common training scenarios to the factory
# functions above; printed only when this module is executed directly.
if __name__ == "__main__":
    print("""
Advanced Trainer Configurations — BMAD DL Lifecycle
(Adapted from K-Dense AI claude-scientific-skills)

╔══════════════════════╦═════════════════════════════════════════════════╗
║ Scenario             ║ Use                                             ║
╠══════════════════════╬═════════════════════════════════════════════════╣
║ Quick test           ║ basic_trainer()                                 ║
║ Find bugs            ║ debug_trainer()                                 ║
║ Verify model learns  ║ overfit_test_trainer()                          ║
║ Hparam sweep         ║ hparam_search_trainer()                         ║
║ Production 1 GPU     ║ single_gpu_trainer()                            ║
║ Production N GPUs    ║ multi_gpu_ddp_trainer(num_gpus=4)               ║
║ Large model (500M+)  ║ large_model_fsdp_trainer(num_gpus=8)           ║
║ Very large (10B+)    ║ deepspeed_trainer(num_gpus=8, stage=3)          ║
║ SLURM cluster        ║ cluster_time_limited_trainer(max_time_hours=23) ║
║ Publication result   ║ reproducible_trainer(seed=42)                   ║
╚══════════════════════╩═════════════════════════════════════════════════╝

After training, analyze with:
  python3 scripts/parse_training_logs.py logs/<exp>/version_0/metrics.csv docs/prd/01_PRD.md
  python3 scripts/plot_training_curves.py logs/<exp>/version_0/metrics.csv
""")