@zigrivers/scaffold 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/README.md +113 -8
  2. package/content/knowledge/browser-extension/browser-extension-architecture.md +195 -0
  3. package/content/knowledge/browser-extension/browser-extension-content-scripts.md +264 -0
  4. package/content/knowledge/browser-extension/browser-extension-conventions.md +156 -0
  5. package/content/knowledge/browser-extension/browser-extension-cross-browser.md +229 -0
  6. package/content/knowledge/browser-extension/browser-extension-dev-environment.md +247 -0
  7. package/content/knowledge/browser-extension/browser-extension-manifest.md +220 -0
  8. package/content/knowledge/browser-extension/browser-extension-project-structure.md +183 -0
  9. package/content/knowledge/browser-extension/browser-extension-requirements.md +107 -0
  10. package/content/knowledge/browser-extension/browser-extension-security.md +202 -0
  11. package/content/knowledge/browser-extension/browser-extension-service-workers.md +265 -0
  12. package/content/knowledge/browser-extension/browser-extension-store-submission.md +155 -0
  13. package/content/knowledge/browser-extension/browser-extension-testing.md +270 -0
  14. package/content/knowledge/data-pipeline/data-pipeline-architecture.md +175 -0
  15. package/content/knowledge/data-pipeline/data-pipeline-batch-patterns.md +263 -0
  16. package/content/knowledge/data-pipeline/data-pipeline-conventions.md +176 -0
  17. package/content/knowledge/data-pipeline/data-pipeline-dev-environment.md +350 -0
  18. package/content/knowledge/data-pipeline/data-pipeline-orchestration.md +291 -0
  19. package/content/knowledge/data-pipeline/data-pipeline-project-structure.md +257 -0
  20. package/content/knowledge/data-pipeline/data-pipeline-quality.md +324 -0
  21. package/content/knowledge/data-pipeline/data-pipeline-requirements.md +145 -0
  22. package/content/knowledge/data-pipeline/data-pipeline-schema-management.md +295 -0
  23. package/content/knowledge/data-pipeline/data-pipeline-security.md +326 -0
  24. package/content/knowledge/data-pipeline/data-pipeline-streaming-patterns.md +280 -0
  25. package/content/knowledge/data-pipeline/data-pipeline-testing.md +406 -0
  26. package/content/knowledge/library/library-api-design.md +306 -0
  27. package/content/knowledge/library/library-architecture.md +247 -0
  28. package/content/knowledge/library/library-bundling.md +244 -0
  29. package/content/knowledge/library/library-conventions.md +229 -0
  30. package/content/knowledge/library/library-dev-environment.md +220 -0
  31. package/content/knowledge/library/library-documentation.md +300 -0
  32. package/content/knowledge/library/library-project-structure.md +237 -0
  33. package/content/knowledge/library/library-requirements.md +173 -0
  34. package/content/knowledge/library/library-security.md +257 -0
  35. package/content/knowledge/library/library-testing.md +319 -0
  36. package/content/knowledge/library/library-type-definitions.md +284 -0
  37. package/content/knowledge/library/library-versioning.md +300 -0
  38. package/content/knowledge/ml/ml-architecture.md +172 -0
  39. package/content/knowledge/ml/ml-conventions.md +209 -0
  40. package/content/knowledge/ml/ml-dev-environment.md +299 -0
  41. package/content/knowledge/ml/ml-experiment-tracking.md +285 -0
  42. package/content/knowledge/ml/ml-model-evaluation.md +256 -0
  43. package/content/knowledge/ml/ml-observability.md +253 -0
  44. package/content/knowledge/ml/ml-project-structure.md +216 -0
  45. package/content/knowledge/ml/ml-requirements.md +138 -0
  46. package/content/knowledge/ml/ml-security.md +188 -0
  47. package/content/knowledge/ml/ml-serving-patterns.md +243 -0
  48. package/content/knowledge/ml/ml-testing.md +301 -0
  49. package/content/knowledge/ml/ml-training-patterns.md +269 -0
  50. package/content/knowledge/mobile-app/mobile-app-architecture.md +283 -0
  51. package/content/knowledge/mobile-app/mobile-app-conventions.md +180 -0
  52. package/content/knowledge/mobile-app/mobile-app-deployment.md +298 -0
  53. package/content/knowledge/mobile-app/mobile-app-dev-environment.md +257 -0
  54. package/content/knowledge/mobile-app/mobile-app-distribution.md +264 -0
  55. package/content/knowledge/mobile-app/mobile-app-observability.md +317 -0
  56. package/content/knowledge/mobile-app/mobile-app-offline-patterns.md +311 -0
  57. package/content/knowledge/mobile-app/mobile-app-project-structure.md +245 -0
  58. package/content/knowledge/mobile-app/mobile-app-push-notifications.md +321 -0
  59. package/content/knowledge/mobile-app/mobile-app-requirements.md +147 -0
  60. package/content/knowledge/mobile-app/mobile-app-security.md +338 -0
  61. package/content/knowledge/mobile-app/mobile-app-testing.md +400 -0
  62. package/content/methodology/browser-extension-overlay.yml +82 -0
  63. package/content/methodology/data-pipeline-overlay.yml +70 -0
  64. package/content/methodology/library-overlay.yml +67 -0
  65. package/content/methodology/ml-overlay.yml +70 -0
  66. package/content/methodology/mobile-app-overlay.yml +71 -0
  67. package/dist/cli/commands/init.d.ts +22 -0
  68. package/dist/cli/commands/init.d.ts.map +1 -1
  69. package/dist/cli/commands/init.js +202 -3
  70. package/dist/cli/commands/init.js.map +1 -1
  71. package/dist/cli/commands/init.test.js +190 -0
  72. package/dist/cli/commands/init.test.js.map +1 -1
  73. package/dist/config/schema.d.ts +1456 -80
  74. package/dist/config/schema.d.ts.map +1 -1
  75. package/dist/config/schema.js +87 -0
  76. package/dist/config/schema.js.map +1 -1
  77. package/dist/config/schema.test.js +312 -3
  78. package/dist/config/schema.test.js.map +1 -1
  79. package/dist/core/assembly/overlay-loader.test.js +55 -0
  80. package/dist/core/assembly/overlay-loader.test.js.map +1 -1
  81. package/dist/e2e/project-type-overlays.test.d.ts +2 -1
  82. package/dist/e2e/project-type-overlays.test.d.ts.map +1 -1
  83. package/dist/e2e/project-type-overlays.test.js +780 -14
  84. package/dist/e2e/project-type-overlays.test.js.map +1 -1
  85. package/dist/types/config.d.ts +16 -1
  86. package/dist/types/config.d.ts.map +1 -1
  87. package/dist/wizard/questions.d.ts +28 -1
  88. package/dist/wizard/questions.d.ts.map +1 -1
  89. package/dist/wizard/questions.js +127 -1
  90. package/dist/wizard/questions.js.map +1 -1
  91. package/dist/wizard/questions.test.js +224 -4
  92. package/dist/wizard/questions.test.js.map +1 -1
  93. package/dist/wizard/wizard.d.ts +22 -0
  94. package/dist/wizard/wizard.d.ts.map +1 -1
  95. package/dist/wizard/wizard.js +28 -1
  96. package/dist/wizard/wizard.js.map +1 -1
  97. package/package.json +1 -1
@@ -0,0 +1,299 @@
1
+ ---
2
+ name: ml-dev-environment
3
+ description: Conda/Poetry environment setup, Jupyter integration, GPU detection and configuration, and Docker for reproducible ML development
4
+ topics: [ml, dev-environment, conda, poetry, jupyter, gpu, docker, reproducibility]
5
+ ---
6
+
7
+ ML development environments have more complexity than typical software projects: GPU drivers, CUDA toolkits, Python packages with native extensions, and Jupyter notebook infrastructure all need to align. A broken environment costs hours and blocks the whole team. Invest in environment standardisation upfront — the payoff is that every team member can reproduce results and that CI pipelines match local runs.
8
+
9
+ ## Summary
10
+
11
+ Prefer Conda for ML projects when GPU and CUDA management is required; use Poetry for pure-Python projects or as the Python dependency manager on top of Conda. Configure Jupyter as a managed service rather than ad-hoc invocations. Detect GPU availability programmatically and handle CPU fallback gracefully. Use Docker to capture the full environment for reproducible training runs and production serving.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### Conda vs. Poetry: When to Use Each
16
+
17
+ **Conda** is the right choice when:
18
+ - Managing CUDA toolkit and cuDNN versions per project (Conda can install the CUDA toolkit without root; the NVIDIA driver itself must still be installed on the host)
19
+ - Working with packages that have complex native dependencies (PyTorch, TensorFlow, OpenCV)
20
+ - Need to isolate Python version itself (not just packages)
21
+ - Team uses multiple ML frameworks with conflicting dependencies
22
+
23
+ **Poetry** is the right choice when:
24
+ - Pure-Python project or all native dependencies are available via pip
25
+ - Need strict dependency locking and reproducible installs
26
+ - Publishing a library (Poetry handles packaging well)
27
+ - Already using a Conda environment for CUDA and want finer control over Python packages
28
+
29
+ **Common hybrid pattern**: Conda manages Python version and CUDA; Poetry manages Python package dependencies inside the Conda environment.
30
+
31
+ ### Conda Environment Setup
32
+
33
+ ```yaml
34
+ # environment.yml — commit to git
35
+ name: myproject
36
+ channels:
37
+ - pytorch
38
+ - nvidia
39
+ - conda-forge
40
+ - defaults
41
+ dependencies:
42
+ - python=3.11
43
+ - cuda-toolkit=12.1
44
+ - cudnn=8.9
45
+ - pip>=23.0
46
+ - pip:
47
+ - torch==2.1.0+cu121
48
+ - torchvision==0.16.0+cu121
49
+ - -r requirements.txt # or use pyproject.toml
50
+ ```
51
+
52
+ ```bash
53
+ # Create and activate
54
+ conda env create -f environment.yml
55
+ conda activate myproject
56
+
57
+ # Update after environment.yml changes
58
+ conda env update -f environment.yml --prune
59
+
60
+ # Export current state (for exact reproducibility audit)
61
+ conda env export > environment-lock.yml
62
+ ```
63
+
64
+ **Critical**: Pin exact versions in `environment.yml`. `pytorch>=2.0` is not a reproducible spec.
65
+
66
+ ### Poetry Setup (Python Dependencies)
67
+
68
+ ```bash
69
+ # Initialize
70
+ poetry init
71
+
72
+ # Add dependencies
73
+ poetry add torch==2.1.0 transformers==4.35.2
74
+ poetry add --group dev pytest black mypy
75
+
76
+ # Install (creates .venv by default)
77
+ poetry install
78
+
79
+ # Run in the managed venv
80
+ poetry run python train.py
81
+ poetry run pytest
82
+ ```
83
+
84
+ `pyproject.toml` example:
85
+ ```toml
86
+ [tool.poetry]
87
+ name = "myproject"
88
+ version = "0.1.0"
89
+ description = "ML project"
90
+
91
+ [tool.poetry.dependencies]
92
+ python = "^3.11"
93
+ torch = "2.1.0"
94
+ transformers = "4.35.2"
95
+ hydra-core = "1.3.2"
96
+ mlflow = "2.9.2"
97
+
98
+ [tool.poetry.group.dev.dependencies]
99
+ pytest = "7.4.3"
100
+ black = "23.11.0"
101
+ mypy = "1.7.0"
102
+ nbstripout = "0.6.1"
103
+ ```
104
+
105
+ ### GPU Detection and Configuration
106
+
107
+ Always detect GPU availability at runtime and handle CPU fallback:
108
+
109
+ ```python
110
+ # src/utils/device.py
111
+ import torch
112
+ import logging
113
+
114
+ logger = logging.getLogger(__name__)
115
+
116
+ def get_device(prefer_gpu: bool = True) -> torch.device:
117
+ """Return the best available device with logging."""
118
+ if prefer_gpu and torch.cuda.is_available():
119
+ device = torch.device("cuda")
120
+ gpu_name = torch.cuda.get_device_name(0)
121
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
122
+ logger.info(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")
123
+ elif prefer_gpu and torch.backends.mps.is_available():
124
+ # Apple Silicon
125
+ device = torch.device("mps")
126
+ logger.info("Using Apple MPS device")
127
+ else:
128
+ device = torch.device("cpu")
129
+ logger.info("Using CPU — GPU not available or not requested")
130
+ return device
131
+
132
+ def log_gpu_memory() -> None:
133
+ """Log current GPU memory usage."""
134
+ if torch.cuda.is_available():
135
+ allocated = torch.cuda.memory_allocated() / 1e9
136
+ reserved = torch.cuda.memory_reserved() / 1e9
137
+ logger.debug(f"GPU memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")
138
+ ```
139
+
140
+ **CUDA version compatibility**: PyTorch packages are built against specific CUDA versions. Always match:
141
+
142
+ | PyTorch | CUDA | CUDNN |
143
+ |---------|------|-------|
144
+ | 2.1.x | 12.1, 11.8 | 8.x |
145
+ | 2.0.x | 11.7, 11.8 | 8.x |
146
+
147
+ Check compatibility at pytorch.org before pinning.
148
+
149
+ **Multi-GPU setup** (training only — not for development):
150
+ ```python
151
+ # Detect available GPUs
152
+ n_gpus = torch.cuda.device_count()
153
+ if n_gpus > 1:
154
+ model = torch.nn.DataParallel(model) # Simple, for research
155
+ # Or for production: use DistributedDataParallel (see ml-training-patterns)
156
+ ```
157
+
158
+ ### Jupyter Integration
159
+
160
+ Run Jupyter as a managed kernel rather than an ad-hoc server:
161
+
162
+ ```bash
163
+ # Install Jupyter in the project environment
164
+ poetry add --group dev jupyter jupyterlab ipykernel
165
+
166
+ # Register the project venv as a named Jupyter kernel
167
+ poetry run python -m ipykernel install --user --name myproject --display-name "MyProject (Python 3.11)"
168
+
169
+ # Launch JupyterLab
170
+ poetry run jupyter lab
171
+ ```
172
+
173
+ Now all project notebooks run in the same environment as the source code.
174
+
175
+ **Recommended Jupyter extensions**:
176
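+ - `nbstripout` — git filter (not a JupyterLab extension) that strips notebook outputs before git commit; enable it per repository with `nbstripout --install`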
177
+ - `jupyterlab-git` — git integration in the UI
178
+ - `jupyterlab-lsp` — language server (autocomplete, type hints)
179
+
180
+ **VS Code Jupyter integration** (recommended over browser-based):
181
+ ```json
182
+ // .vscode/settings.json
183
+ {
184
+ "jupyter.kernels.filter": [
185
+ {"path": "${workspaceFolder}/.venv/bin/python", "type": "pythonEnvironment"}
186
+ ],
187
+ "jupyter.notebookFileRoot": "${workspaceFolder}",
188
+ "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python"
189
+ }
190
+ ```
191
+
192
+ ### Docker for Reproducibility
193
+
194
+ Docker captures the entire environment — OS, CUDA, Python, and packages. Use it for:
195
+ - CI training runs
196
+ - Sharing experiments with collaborators who have different local setups
197
+ - Production serving (identical environment to training)
198
+
199
+ **Base `Dockerfile` for ML training**:
200
+ ```dockerfile
201
+ # Use NVIDIA's official CUDA base image
202
+ FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
203
+
204
+ # Set Python version
205
+ ENV PYTHON_VERSION=3.11
206
+ ENV DEBIAN_FRONTEND=noninteractive
207
+
208
+ RUN apt-get update && apt-get install -y \
209
+ python${PYTHON_VERSION} \
210
+ python3-pip \
211
+ git \
212
+ && rm -rf /var/lib/apt/lists/*
213
+
214
+ RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
215
+
216
+ # Install Poetry
217
+ RUN pip install poetry==1.7.1
218
+ ENV POETRY_NO_INTERACTION=1 \
219
+ POETRY_VIRTUALENVS_IN_PROJECT=1
220
+
221
+ WORKDIR /app
222
+
223
+ # Install dependencies (cached layer)
224
+ COPY pyproject.toml poetry.lock ./
225
+ RUN poetry install --no-root --without dev
226
+
227
+ # Copy source
228
+ COPY src/ ./src/
229
+ COPY configs/ ./configs/
230
+
231
+ # Install the project itself
232
+ RUN poetry install --without dev
233
+
234
+ ENTRYPOINT ["poetry", "run", "python", "-m", "src.training.train"]
235
+ ```
236
+
237
+ **Docker Compose for development**:
238
+ ```yaml
239
+ # docker-compose.yml
240
+ services:
241
+ train:
242
+ build: .
243
+ volumes:
244
+ - ./data:/app/data
245
+ - ./models:/app/models
246
+ - ./configs:/app/configs
247
+ environment:
248
+ - MLFLOW_TRACKING_URI=http://mlflow:5000
249
+ deploy:
250
+ resources:
251
+ reservations:
252
+ devices:
253
+ - driver: nvidia
254
+ count: all
255
+ capabilities: [gpu]
256
+
257
+ mlflow:
258
+ image: ghcr.io/mlflow/mlflow:v2.9.2
259
+ ports:
260
+ - "5000:5000"
261
+ volumes:
262
+ - ./mlruns:/mlflow/mlruns
263
+ ```
264
+
265
+ ### Makefile Task Runner
266
+
267
+ Encode common tasks in a `Makefile` to eliminate "how do I run this?" questions:
268
+
269
+ ```makefile
270
+ .PHONY: env train eval test lint clean
271
+
272
+ env:
273
+ conda env create -f environment.yml || conda env update -f environment.yml --prune
274
+
275
+ train:
276
+ poetry run python -m src.training.train $(ARGS)
277
+
278
+ eval:
279
+ poetry run python -m src.evaluation.evaluator $(ARGS)
280
+
281
+ test:
282
+ poetry run pytest tests/ -v
283
+
284
+ lint:
285
+ poetry run black --check src/ tests/
286
+ poetry run mypy src/
287
+
288
+ clean:
289
+ find . -type f -name "*.pyc" -delete
290
+ find . -type d -name "__pycache__" -delete
291
+ rm -rf .pytest_cache/
292
+ ```
293
+
294
+ Usage:
295
+ ```bash
296
+ make env # Set up environment
297
+ make train ARGS="optimizer.lr=1e-4"
298
+ make test
299
+ ```
@@ -0,0 +1,285 @@
1
+ ---
2
+ name: ml-experiment-tracking
3
+ description: MLflow and Weights & Biases integration, artifact storage, experiment run comparison, and hyperparameter sweep management
4
+ topics: [ml, experiment-tracking, mlflow, wandb, artifacts, sweeps, reproducibility]
5
+ ---
6
+
7
+ Without experiment tracking, ML development is archaeology: "which config produced that result?" is answered by digging through notebook history, chat logs, and failing memory. Experiment tracking tools are version control for training runs — every metric, every hyperparameter, every artifact, linked to the code that produced it. The discipline of logging everything during training pays dividends when a stakeholder asks "how does this model compare to what we had six months ago?"
8
+
9
+ ## Summary
10
+
11
+ Use MLflow (self-hosted, open source) or Weights & Biases (cloud, more feature-rich) to track every training run. Log hyperparameters, metrics at each epoch, model artifacts, and the git commit SHA. Store large artifacts (checkpoints, datasets) in object storage backed by the experiment tracker. Use sweep tooling (MLflow with Optuna, W&B Sweeps) for systematic hyperparameter search rather than manual iteration.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### MLflow Integration
16
+
17
+ MLflow is the open-source standard for experiment tracking. It runs locally or on a managed server:
18
+
19
+ ```bash
20
+ # Start local tracking server (stores runs in ./mlruns)
21
+ mlflow server --host 0.0.0.0 --port 5000
22
+
23
+ # Or use the SQLite backend for better performance
24
+ mlflow server \
25
+ --backend-store-uri sqlite:///mlflow.db \
26
+ --default-artifact-root ./mlartifacts \
27
+ --host 0.0.0.0 --port 5000
28
+ ```
29
+
30
+ **Instrument training code**:
31
+ ```python
32
+ import mlflow
33
+ import mlflow.pytorch
34
+
35
+ # Set tracking server
36
+ mlflow.set_tracking_uri("http://localhost:5000")
37
+ mlflow.set_experiment("fraud-detector")
38
+
39
+ def train(cfg: DictConfig) -> dict:
40
+ with mlflow.start_run(run_name=cfg.experiment.name) as run:
41
+ # Log all hyperparameters from config
42
+ mlflow.log_params(OmegaConf.to_container(cfg, resolve=True))
43
+
44
+ # Log git commit for reproducibility
45
+ import subprocess
46
+ git_sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
47
+ mlflow.set_tag("git_commit", git_sha)
48
+ mlflow.set_tag("model_type", cfg.model.type)
49
+
50
+ for epoch in range(cfg.training.epochs):
51
+ train_metrics = train_epoch(...)
52
+ val_metrics = evaluate(...)
53
+
54
+ # Log metrics with step (epoch) for time-series view
55
+ mlflow.log_metrics({
56
+ "train_loss": train_metrics["loss"],
57
+ "val_loss": val_metrics["loss"],
58
+ "val_auc": val_metrics["auc"],
59
+ }, step=epoch)
60
+
61
+ # Log best model
62
+ mlflow.pytorch.log_model(
63
+ model,
64
+ artifact_path="model",
65
+ registered_model_name="fraud-detector", # Register in Model Registry
66
+ )
67
+
68
+ # Log additional artifacts
69
+ mlflow.log_artifact("configs/train.yaml")
70
+ mlflow.log_artifact("reports/eval_report.json")
71
+
72
+ return {"run_id": run.info.run_id, **val_metrics}
73
+ ```
74
+
75
+ **MLflow Model Registry** (promote to production):
76
+ ```python
77
+ from mlflow.tracking import MlflowClient
78
+
79
+ client = MlflowClient()
80
+
81
+ # Register a run's model in the registry
82
+ model_uri = f"runs:/{run_id}/model"
83
+ mv = mlflow.register_model(model_uri, "fraud-detector")
84
+
85
+ # Transition to staging after validation
86
+ client.transition_model_version_stage(
87
+ name="fraud-detector",
88
+ version=mv.version,
89
+ stage="Staging",
90
+ archive_existing_versions=False,
91
+ )
92
+
93
+ # Load production model in serving
94
+ production_model = mlflow.pytorch.load_model(
95
+ model_uri="models:/fraud-detector/Production"
96
+ )
97
+ ```
98
+
99
+ ### Weights & Biases Integration
100
+
101
+ W&B provides a richer UI and more features than MLflow, with a cloud-hosted option:
102
+
103
+ ```python
104
+ import wandb
105
+
106
+ wandb.init(
107
+ project="fraud-detector",
108
+ name=cfg.experiment.name,
109
+ config=OmegaConf.to_container(cfg, resolve=True),
110
+ tags=["baseline", "v2-features"],
111
+ notes="Testing new feature set with gradient clipping",
112
+ )
113
+
114
+ # Log metrics
115
+ for epoch in range(cfg.training.epochs):
116
+ metrics = train_epoch(...)
117
+ wandb.log({
118
+ "epoch": epoch,
119
+ "train/loss": metrics["train_loss"],
120
+ "val/loss": metrics["val_loss"],
121
+ "val/auc": metrics["val_auc"],
122
+ "lr": scheduler.get_last_lr()[0],
123
+ })
124
+
125
+ # Log model artifact
126
+ artifact = wandb.Artifact("fraud-detector", type="model")
127
+ artifact.add_file("models/checkpoints/best.pt")
128
+ wandb.log_artifact(artifact)
129
+
130
+ wandb.finish()
131
+ ```
132
+
133
+ **W&B-specific features**:
134
+ - **System monitoring**: GPU utilisation, memory, temperature logged automatically
135
+ - **Gradient histograms**: `wandb.watch(model, log="gradients")` logs gradient distributions per layer — invaluable for debugging vanishing/exploding gradients
136
+ - **Media logging**: Log images, audio, tables, confusion matrices directly in the UI
137
+ - **Alerts**: Set threshold alerts on metrics (email/Slack when val_loss > threshold)
138
+
139
+ ### Artifact Storage Strategy
140
+
141
+ Artifacts are the binary outputs of training runs: model checkpoints, preprocessed datasets, evaluation reports, and confusion matrices. Never store large binary artifacts in git:
142
+
143
+ **Storage hierarchy**:
144
+ ```
145
+ Small artifacts (< 1 MB): Log directly to tracker
146
+ - Config files, evaluation reports (JSON/CSV)
147
+ - Example predictions, confusion matrices (images)
148
+
149
+ Medium artifacts (1 MB – 1 GB): Log as tracker artifacts
150
+ - Model checkpoints for experimentation
151
+ - Feature engineering outputs
152
+
153
+ Large artifacts (> 1 GB): Object storage with tracker reference
154
+ - Full training datasets
155
+ - Final production model weights
156
+ - Large evaluation outputs
157
+ ```
158
+
159
+ **S3 artifact storage for MLflow**:
160
+ ```bash
161
+ mlflow server \
162
+ --default-artifact-root s3://my-bucket/mlflow-artifacts \
163
+ --backend-store-uri postgresql://user:pass@host/mlflow
164
+ ```
165
+
166
+ **DVC for dataset versioning alongside MLflow**:
167
+ ```bash
168
+ # Version dataset with DVC
169
+ dvc add data/processed/features_v3.parquet
170
+ git add data/processed/features_v3.parquet.dvc
171
+
172
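+ # Log DVC dataset reference in MLflow (the two lines below are Python — call them inside the active MLflow run in your training code, not in the shell)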
173
+ mlflow.set_tag("dvc_dataset_commit", git_sha)
174
+ mlflow.set_tag("dataset_path", "data/processed/features_v3.parquet")
175
+ ```
176
+
177
+ ### Run Comparison and Analysis
178
+
179
+ **Finding the best run** (MLflow Python API):
180
+ ```python
181
+ from mlflow.tracking import MlflowClient
182
+ import pandas as pd
183
+
184
+ client = MlflowClient()
185
+
186
+ # Get all runs in an experiment, sorted by val_auc
187
+ runs = client.search_runs(
188
+ experiment_ids=["1"],
189
+ filter_string="metrics.val_auc > 0.85",
190
+ order_by=["metrics.val_auc DESC"],
191
+ max_results=20,
192
+ )
193
+
194
+ # Convert to DataFrame for analysis
195
+ run_data = [{
196
+ "run_id": r.info.run_id,
197
+ "name": r.info.run_name,
198
+ "val_auc": r.data.metrics.get("val_auc"),
199
+ "lr": r.data.params.get("optimizer.lr"),
200
+ "batch_size": r.data.params.get("training.batch_size"),
201
+ } for r in runs]
202
+
203
+ df = pd.DataFrame(run_data)
204
+ print(df.head(10))
205
+ ```
206
+
207
+ **Comparing runs in W&B**: Use the parallel coordinates plot (built into W&B UI) to visualise the relationship between hyperparameters and metrics across many runs at once.
208
+
209
+ ### Hyperparameter Sweeps
210
+
211
+ **W&B Sweeps** (cloud-managed sweep coordinator):
212
+ ```yaml
213
+ # sweep_config.yaml
214
+ program: train.py
215
+ method: bayes # bayes, random, or grid
216
+ metric:
217
+ name: val/auc
218
+ goal: maximize
219
+ parameters:
220
+ optimizer.lr:
221
+ min: 1.0e-5
222
+ max: 1.0e-2
223
+ distribution: log_uniform_values
224
+ training.batch_size:
225
+ values: [16, 32, 64, 128]
226
+ model.dropout:
227
+ min: 0.0
228
+ max: 0.5
229
+ early_terminate:
230
+ type: hyperband
231
+ min_iter: 3
232
+ ```
233
+
234
+ ```bash
235
+ wandb sweep sweep_config.yaml # Returns sweep ID
236
+ wandb agent <sweep-id> --count 50 # Launch 50 trials
237
+ ```
238
+
239
+ **MLflow + Optuna** (self-hosted alternative):
240
+ ```python
241
+ import optuna
242
+ import mlflow
243
+
244
+ def objective(trial):
245
+ with mlflow.start_run(nested=True):
246
+ lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
247
+ mlflow.log_param("lr", lr)
248
+
249
+ val_auc = train_and_evaluate(lr=lr)
250
+ mlflow.log_metric("val_auc", val_auc)
251
+ return val_auc
252
+
253
+ with mlflow.start_run(run_name="hyperparameter-sweep"):
254
+ study = optuna.create_study(direction="maximize")
255
+ study.optimize(objective, n_trials=50)
256
+ mlflow.log_params(study.best_params)
257
+ mlflow.log_metric("best_val_auc", study.best_value)
258
+ ```
259
+
260
+ ### Experiment Logging Checklist
261
+
262
+ Log these for every training run — no exceptions:
263
+
264
+ ```python
265
+ # Required: hyperparameters
266
+ mlflow.log_params({...}) # Full config dict
267
+
268
+ # Required: metrics at each epoch
269
+ mlflow.log_metrics({...}, step=epoch)
270
+
271
+ # Required: final metrics
272
+ mlflow.log_metrics({"final_val_auc": val_auc, "final_val_loss": val_loss})
273
+
274
+ # Required: reproducibility tags
275
+ mlflow.set_tag("git_commit", git_sha)
276
+ mlflow.set_tag("dataset_version", dataset_version)
277
+
278
+ # Required: model artifact
279
+ mlflow.pytorch.log_model(model, "model")
280
+
281
+ # Recommended: environment
282
+ mlflow.log_artifact("environment.yml")
283
+ mlflow.set_tag("cuda_version", torch.version.cuda)
284
+ mlflow.set_tag("pytorch_version", torch.__version__)
285
+ ```