@zigrivers/scaffold 3.7.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -8
- package/content/knowledge/browser-extension/browser-extension-architecture.md +195 -0
- package/content/knowledge/browser-extension/browser-extension-content-scripts.md +264 -0
- package/content/knowledge/browser-extension/browser-extension-conventions.md +156 -0
- package/content/knowledge/browser-extension/browser-extension-cross-browser.md +229 -0
- package/content/knowledge/browser-extension/browser-extension-dev-environment.md +247 -0
- package/content/knowledge/browser-extension/browser-extension-manifest.md +220 -0
- package/content/knowledge/browser-extension/browser-extension-project-structure.md +183 -0
- package/content/knowledge/browser-extension/browser-extension-requirements.md +107 -0
- package/content/knowledge/browser-extension/browser-extension-security.md +202 -0
- package/content/knowledge/browser-extension/browser-extension-service-workers.md +265 -0
- package/content/knowledge/browser-extension/browser-extension-store-submission.md +155 -0
- package/content/knowledge/browser-extension/browser-extension-testing.md +270 -0
- package/content/knowledge/data-pipeline/data-pipeline-architecture.md +175 -0
- package/content/knowledge/data-pipeline/data-pipeline-batch-patterns.md +263 -0
- package/content/knowledge/data-pipeline/data-pipeline-conventions.md +176 -0
- package/content/knowledge/data-pipeline/data-pipeline-dev-environment.md +350 -0
- package/content/knowledge/data-pipeline/data-pipeline-orchestration.md +291 -0
- package/content/knowledge/data-pipeline/data-pipeline-project-structure.md +257 -0
- package/content/knowledge/data-pipeline/data-pipeline-quality.md +324 -0
- package/content/knowledge/data-pipeline/data-pipeline-requirements.md +145 -0
- package/content/knowledge/data-pipeline/data-pipeline-schema-management.md +295 -0
- package/content/knowledge/data-pipeline/data-pipeline-security.md +326 -0
- package/content/knowledge/data-pipeline/data-pipeline-streaming-patterns.md +280 -0
- package/content/knowledge/data-pipeline/data-pipeline-testing.md +406 -0
- package/content/knowledge/library/library-api-design.md +306 -0
- package/content/knowledge/library/library-architecture.md +247 -0
- package/content/knowledge/library/library-bundling.md +244 -0
- package/content/knowledge/library/library-conventions.md +229 -0
- package/content/knowledge/library/library-dev-environment.md +220 -0
- package/content/knowledge/library/library-documentation.md +300 -0
- package/content/knowledge/library/library-project-structure.md +237 -0
- package/content/knowledge/library/library-requirements.md +173 -0
- package/content/knowledge/library/library-security.md +257 -0
- package/content/knowledge/library/library-testing.md +319 -0
- package/content/knowledge/library/library-type-definitions.md +284 -0
- package/content/knowledge/library/library-versioning.md +300 -0
- package/content/knowledge/ml/ml-architecture.md +172 -0
- package/content/knowledge/ml/ml-conventions.md +209 -0
- package/content/knowledge/ml/ml-dev-environment.md +299 -0
- package/content/knowledge/ml/ml-experiment-tracking.md +285 -0
- package/content/knowledge/ml/ml-model-evaluation.md +256 -0
- package/content/knowledge/ml/ml-observability.md +253 -0
- package/content/knowledge/ml/ml-project-structure.md +216 -0
- package/content/knowledge/ml/ml-requirements.md +138 -0
- package/content/knowledge/ml/ml-security.md +188 -0
- package/content/knowledge/ml/ml-serving-patterns.md +243 -0
- package/content/knowledge/ml/ml-testing.md +301 -0
- package/content/knowledge/ml/ml-training-patterns.md +269 -0
- package/content/knowledge/mobile-app/mobile-app-architecture.md +283 -0
- package/content/knowledge/mobile-app/mobile-app-conventions.md +180 -0
- package/content/knowledge/mobile-app/mobile-app-deployment.md +298 -0
- package/content/knowledge/mobile-app/mobile-app-dev-environment.md +257 -0
- package/content/knowledge/mobile-app/mobile-app-distribution.md +264 -0
- package/content/knowledge/mobile-app/mobile-app-observability.md +317 -0
- package/content/knowledge/mobile-app/mobile-app-offline-patterns.md +311 -0
- package/content/knowledge/mobile-app/mobile-app-project-structure.md +245 -0
- package/content/knowledge/mobile-app/mobile-app-push-notifications.md +321 -0
- package/content/knowledge/mobile-app/mobile-app-requirements.md +147 -0
- package/content/knowledge/mobile-app/mobile-app-security.md +338 -0
- package/content/knowledge/mobile-app/mobile-app-testing.md +400 -0
- package/content/methodology/browser-extension-overlay.yml +82 -0
- package/content/methodology/data-pipeline-overlay.yml +70 -0
- package/content/methodology/library-overlay.yml +67 -0
- package/content/methodology/ml-overlay.yml +70 -0
- package/content/methodology/mobile-app-overlay.yml +71 -0
- package/dist/cli/commands/init.d.ts +22 -0
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +202 -3
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/init.test.js +190 -0
- package/dist/cli/commands/init.test.js.map +1 -1
- package/dist/config/schema.d.ts +1456 -80
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +87 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/config/schema.test.js +312 -3
- package/dist/config/schema.test.js.map +1 -1
- package/dist/core/assembly/overlay-loader.test.js +55 -0
- package/dist/core/assembly/overlay-loader.test.js.map +1 -1
- package/dist/e2e/project-type-overlays.test.d.ts +2 -1
- package/dist/e2e/project-type-overlays.test.d.ts.map +1 -1
- package/dist/e2e/project-type-overlays.test.js +780 -14
- package/dist/e2e/project-type-overlays.test.js.map +1 -1
- package/dist/types/config.d.ts +16 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/wizard/questions.d.ts +28 -1
- package/dist/wizard/questions.d.ts.map +1 -1
- package/dist/wizard/questions.js +127 -1
- package/dist/wizard/questions.js.map +1 -1
- package/dist/wizard/questions.test.js +224 -4
- package/dist/wizard/questions.test.js.map +1 -1
- package/dist/wizard/wizard.d.ts +22 -0
- package/dist/wizard/wizard.d.ts.map +1 -1
- package/dist/wizard/wizard.js +28 -1
- package/dist/wizard/wizard.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-dev-environment
|
|
3
|
+
description: Conda/Poetry environment setup, Jupyter integration, GPU detection and configuration, and Docker for reproducible ML development
|
|
4
|
+
topics: [ml, dev-environment, conda, poetry, jupyter, gpu, docker, reproducibility]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
ML development environments have more complexity than typical software projects: GPU drivers, CUDA toolkits, Python packages with native extensions, and Jupyter notebook infrastructure all need to align. A broken environment costs hours and blocks the whole team. Invest in environment standardisation upfront — the payoff is that every team member can reproduce results and that CI pipelines match local runs.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Prefer Conda for ML projects when GPU and CUDA management is required; use Poetry for pure-Python projects or as the Python dependency manager on top of Conda. Configure Jupyter as a managed service rather than ad-hoc invocations. Detect GPU availability programmatically and handle CPU fallback gracefully. Use Docker to capture the full environment for reproducible training runs and production serving.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Conda vs. Poetry: When to Use Each
|
|
16
|
+
|
|
17
|
+
**Conda** is the right choice when:
|
|
18
|
+
- Managing GPU drivers and CUDA toolkit versions (Conda can install CUDA without root)
|
|
19
|
+
- Working with packages that have complex native dependencies (PyTorch, TensorFlow, OpenCV)
|
|
20
|
+
- Need to isolate Python version itself (not just packages)
|
|
21
|
+
- Team uses multiple ML frameworks with conflicting dependencies
|
|
22
|
+
|
|
23
|
+
**Poetry** is the right choice when:
|
|
24
|
+
- Pure-Python project or all native dependencies are available via pip
|
|
25
|
+
- Need strict dependency locking and reproducible installs
|
|
26
|
+
- Publishing a library (Poetry handles packaging well)
|
|
27
|
+
- Already using a Conda environment for CUDA and want finer control over Python packages
|
|
28
|
+
|
|
29
|
+
**Common hybrid pattern**: Conda manages Python version and CUDA; Poetry manages Python package dependencies inside the Conda environment.
|
|
30
|
+
|
|
31
|
+
### Conda Environment Setup
|
|
32
|
+
|
|
33
|
+
```yaml
|
|
34
|
+
# environment.yml — commit to git
|
|
35
|
+
name: myproject
|
|
36
|
+
channels:
|
|
37
|
+
- pytorch
|
|
38
|
+
- nvidia
|
|
39
|
+
- conda-forge
|
|
40
|
+
- defaults
|
|
41
|
+
dependencies:
|
|
42
|
+
- python=3.11
|
|
43
|
+
- cuda-toolkit=12.1
|
|
44
|
+
- cudnn=8.9
|
|
45
|
+
- pip>=23.0
|
|
46
|
+
- pip:
|
|
47
|
+
- --extra-index-url https://download.pytorch.org/whl/cu121
- torch==2.1.0+cu121
|
|
48
|
+
- torchvision==0.16.0+cu121
|
|
49
|
+
- -r requirements.txt # or use pyproject.toml
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Create and activate
|
|
54
|
+
conda env create -f environment.yml
|
|
55
|
+
conda activate myproject
|
|
56
|
+
|
|
57
|
+
# Update after environment.yml changes
|
|
58
|
+
conda env update -f environment.yml --prune
|
|
59
|
+
|
|
60
|
+
# Export current state (for exact reproducibility audit)
|
|
61
|
+
conda env export > environment-lock.yml
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Critical**: Pin exact versions in `environment.yml`. `pytorch>=2.0` is not a reproducible spec.
|
|
65
|
+
|
|
66
|
+
### Poetry Setup (Python Dependencies)
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Initialize
|
|
70
|
+
poetry init
|
|
71
|
+
|
|
72
|
+
# Add dependencies
|
|
73
|
+
poetry add torch==2.1.0 transformers==4.35.2
|
|
74
|
+
poetry add --group dev pytest black mypy
|
|
75
|
+
|
|
76
|
+
# Install (creates .venv by default)
|
|
77
|
+
poetry install
|
|
78
|
+
|
|
79
|
+
# Run in the managed venv
|
|
80
|
+
poetry run python train.py
|
|
81
|
+
poetry run pytest
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`pyproject.toml` example:
|
|
85
|
+
```toml
|
|
86
|
+
[tool.poetry]
|
|
87
|
+
name = "myproject"
|
|
88
|
+
version = "0.1.0"
|
|
89
|
+
description = "ML project"
|
|
90
|
+
|
|
91
|
+
[tool.poetry.dependencies]
|
|
92
|
+
python = "^3.11"
|
|
93
|
+
torch = "2.1.0"
|
|
94
|
+
transformers = "4.35.2"
|
|
95
|
+
hydra-core = "1.3.2"
|
|
96
|
+
mlflow = "2.9.2"
|
|
97
|
+
|
|
98
|
+
[tool.poetry.group.dev.dependencies]
|
|
99
|
+
pytest = "7.4.3"
|
|
100
|
+
black = "23.11.0"
|
|
101
|
+
mypy = "1.7.0"
|
|
102
|
+
nbstripout = "0.6.1"
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### GPU Detection and Configuration
|
|
106
|
+
|
|
107
|
+
Always detect GPU availability at runtime and handle CPU fallback:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
# src/utils/device.py
|
|
111
|
+
import torch
|
|
112
|
+
import logging
|
|
113
|
+
|
|
114
|
+
logger = logging.getLogger(__name__)
|
|
115
|
+
|
|
116
|
+
def get_device(prefer_gpu: bool = True) -> torch.device:
|
|
117
|
+
"""Return the best available device with logging."""
|
|
118
|
+
if prefer_gpu and torch.cuda.is_available():
|
|
119
|
+
device = torch.device("cuda")
|
|
120
|
+
gpu_name = torch.cuda.get_device_name(0)
|
|
121
|
+
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
|
|
122
|
+
logger.info(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")
|
|
123
|
+
elif prefer_gpu and torch.backends.mps.is_available():
|
|
124
|
+
# Apple Silicon
|
|
125
|
+
device = torch.device("mps")
|
|
126
|
+
logger.info("Using Apple MPS device")
|
|
127
|
+
else:
|
|
128
|
+
device = torch.device("cpu")
|
|
129
|
+
logger.info("Using CPU — GPU not available or not requested")
|
|
130
|
+
return device
|
|
131
|
+
|
|
132
|
+
def log_gpu_memory() -> None:
|
|
133
|
+
"""Log current GPU memory usage."""
|
|
134
|
+
if torch.cuda.is_available():
|
|
135
|
+
allocated = torch.cuda.memory_allocated() / 1e9
|
|
136
|
+
reserved = torch.cuda.memory_reserved() / 1e9
|
|
137
|
+
logger.debug(f"GPU memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
**CUDA version compatibility**: PyTorch packages are built against specific CUDA versions. Always match:
|
|
141
|
+
|
|
142
|
+
| PyTorch | CUDA | CUDNN |
|
|
143
|
+
|---------|------|-------|
|
|
144
|
+
| 2.1.x | 12.1, 11.8 | 8.x |
|
|
145
|
+
| 2.0.x | 11.7, 11.8 | 8.x |
|
|
146
|
+
|
|
147
|
+
Check compatibility at pytorch.org before pinning.
|
|
148
|
+
|
|
149
|
+
**Multi-GPU setup** (training only — not for development):
|
|
150
|
+
```python
|
|
151
|
+
# Detect available GPUs
|
|
152
|
+
n_gpus = torch.cuda.device_count()
|
|
153
|
+
if n_gpus > 1:
|
|
154
|
+
model = torch.nn.DataParallel(model) # Simple, for research
|
|
155
|
+
# Or for production: use DistributedDataParallel (see ml-training-patterns)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Jupyter Integration
|
|
159
|
+
|
|
160
|
+
Run Jupyter as a managed kernel rather than an ad-hoc server:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# Install Jupyter in the project environment
|
|
164
|
+
poetry add --group dev jupyter jupyterlab ipykernel
|
|
165
|
+
|
|
166
|
+
# Register the project venv as a named Jupyter kernel
|
|
167
|
+
poetry run python -m ipykernel install --user --name myproject --display-name "MyProject (Python 3.11)"
|
|
168
|
+
|
|
169
|
+
# Launch JupyterLab
|
|
170
|
+
poetry run jupyter lab
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Now all project notebooks run in the same environment as the source code.
|
|
174
|
+
|
|
175
|
+
**Recommended Jupyter extensions**:
|
|
176
|
+
- `nbstripout` — strips outputs before git commit
|
|
177
|
+
- `jupyterlab-git` — git integration in the UI
|
|
178
|
+
- `jupyterlab-lsp` — language server (autocomplete, type hints)
|
|
179
|
+
|
|
180
|
+
**VS Code Jupyter integration** (recommended over browser-based):
|
|
181
|
+
```json
|
|
182
|
+
// .vscode/settings.json
|
|
183
|
+
{
|
|
184
|
+
"jupyter.kernels.filter": [
|
|
185
|
+
{"path": "${workspaceFolder}/.venv/bin/python", "type": "pythonEnvironment"}
|
|
186
|
+
],
|
|
187
|
+
"jupyter.notebookFileRoot": "${workspaceFolder}",
|
|
188
|
+
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python"
|
|
189
|
+
}
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Docker for Reproducibility
|
|
193
|
+
|
|
194
|
+
Docker captures the entire environment — OS, CUDA, Python, and packages. Use it for:
|
|
195
|
+
- CI training runs
|
|
196
|
+
- Sharing experiments with collaborators who have different local setups
|
|
197
|
+
- Production serving (identical environment to training)
|
|
198
|
+
|
|
199
|
+
**Base `Dockerfile` for ML training**:
|
|
200
|
+
```dockerfile
|
|
201
|
+
# Use NVIDIA's official CUDA base image
|
|
202
|
+
FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
|
203
|
+
|
|
204
|
+
# Set Python version
|
|
205
|
+
ENV PYTHON_VERSION=3.11
|
|
206
|
+
ENV DEBIAN_FRONTEND=noninteractive
|
|
207
|
+
|
|
208
|
+
RUN apt-get update && apt-get install -y \
|
|
209
|
+
python${PYTHON_VERSION} \
|
|
210
|
+
python3-pip \
|
|
211
|
+
git \
|
|
212
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
213
|
+
|
|
214
|
+
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
|
|
215
|
+
|
|
216
|
+
# Install Poetry
|
|
217
|
+
RUN pip install poetry==1.7.1
|
|
218
|
+
ENV POETRY_NO_INTERACTION=1 \
|
|
219
|
+
POETRY_VIRTUALENVS_IN_PROJECT=1
|
|
220
|
+
|
|
221
|
+
WORKDIR /app
|
|
222
|
+
|
|
223
|
+
# Install dependencies (cached layer)
|
|
224
|
+
COPY pyproject.toml poetry.lock ./
|
|
225
|
+
RUN poetry install --no-root --without dev
|
|
226
|
+
|
|
227
|
+
# Copy source
|
|
228
|
+
COPY src/ ./src/
|
|
229
|
+
COPY configs/ ./configs/
|
|
230
|
+
|
|
231
|
+
# Install the project itself
|
|
232
|
+
RUN poetry install --without dev
|
|
233
|
+
|
|
234
|
+
ENTRYPOINT ["poetry", "run", "python", "-m", "src.training.train"]
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
**Docker Compose for development**:
|
|
238
|
+
```yaml
|
|
239
|
+
# docker-compose.yml
|
|
240
|
+
services:
|
|
241
|
+
train:
|
|
242
|
+
build: .
|
|
243
|
+
volumes:
|
|
244
|
+
- ./data:/app/data
|
|
245
|
+
- ./models:/app/models
|
|
246
|
+
- ./configs:/app/configs
|
|
247
|
+
environment:
|
|
248
|
+
- MLFLOW_TRACKING_URI=http://mlflow:5000
|
|
249
|
+
deploy:
|
|
250
|
+
resources:
|
|
251
|
+
reservations:
|
|
252
|
+
devices:
|
|
253
|
+
- driver: nvidia
|
|
254
|
+
count: all
|
|
255
|
+
capabilities: [gpu]
|
|
256
|
+
|
|
257
|
+
mlflow:
|
|
258
|
+
image: ghcr.io/mlflow/mlflow:v2.9.2
command: mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri /mlflow/mlruns
|
|
259
|
+
ports:
|
|
260
|
+
- "5000:5000"
|
|
261
|
+
volumes:
|
|
262
|
+
- ./mlruns:/mlflow/mlruns
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Makefile Task Runner
|
|
266
|
+
|
|
267
|
+
Encode common tasks in a `Makefile` to eliminate "how do I run this?" questions:
|
|
268
|
+
|
|
269
|
+
```makefile
|
|
270
|
+
.PHONY: env train eval test lint clean
|
|
271
|
+
|
|
272
|
+
env:
|
|
273
|
+
conda env create -f environment.yml || conda env update -f environment.yml --prune
|
|
274
|
+
|
|
275
|
+
train:
|
|
276
|
+
poetry run python -m src.training.train $(ARGS)
|
|
277
|
+
|
|
278
|
+
eval:
|
|
279
|
+
poetry run python -m src.evaluation.evaluator $(ARGS)
|
|
280
|
+
|
|
281
|
+
test:
|
|
282
|
+
poetry run pytest tests/ -v
|
|
283
|
+
|
|
284
|
+
lint:
|
|
285
|
+
poetry run black --check src/ tests/
|
|
286
|
+
poetry run mypy src/
|
|
287
|
+
|
|
288
|
+
clean:
|
|
289
|
+
find . -type f -name "*.pyc" -delete
|
|
290
|
+
find . -type d -name "__pycache__" -delete
|
|
291
|
+
rm -rf .pytest_cache/
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
Usage:
|
|
295
|
+
```bash
|
|
296
|
+
make env # Set up environment
|
|
297
|
+
make train ARGS="optimizer.lr=1e-4"
|
|
298
|
+
make test
|
|
299
|
+
```
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-experiment-tracking
|
|
3
|
+
description: MLflow and Weights & Biases integration, artifact storage, experiment run comparison, and hyperparameter sweep management
|
|
4
|
+
topics: [ml, experiment-tracking, mlflow, wandb, artifacts, sweeps, reproducibility]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Without experiment tracking, ML development is archaeology: "which config produced that result?" is answered by digging through notebook history, chat logs, and failing memory. Experiment tracking tools are version control for training runs — every metric, every hyperparameter, every artifact, linked to the code that produced it. The discipline of logging everything during training pays dividends when a stakeholder asks "how does this model compare to what we had six months ago?"
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Use MLflow (self-hosted, open source) or Weights & Biases (cloud, more feature-rich) to track every training run. Log hyperparameters, metrics at each epoch, model artifacts, and the git commit SHA. Store large artifacts (checkpoints, datasets) in object storage backed by the experiment tracker. Use sweep tooling (MLflow with Optuna, W&B Sweeps) for systematic hyperparameter search rather than manual iteration.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### MLflow Integration
|
|
16
|
+
|
|
17
|
+
MLflow is the open-source standard for experiment tracking. It runs locally or on a managed server:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Start local tracking server (stores runs in ./mlruns)
|
|
21
|
+
mlflow server --host 0.0.0.0 --port 5000
|
|
22
|
+
|
|
23
|
+
# Or use the SQLite backend for better performance
|
|
24
|
+
mlflow server \
|
|
25
|
+
--backend-store-uri sqlite:///mlflow.db \
|
|
26
|
+
--default-artifact-root ./mlartifacts \
|
|
27
|
+
--host 0.0.0.0 --port 5000
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Instrument training code**:
|
|
31
|
+
```python
|
|
32
|
+
import mlflow
|
|
33
|
+
import mlflow.pytorch
|
|
34
|
+
|
|
35
|
+
# Set tracking server
|
|
36
|
+
mlflow.set_tracking_uri("http://localhost:5000")
|
|
37
|
+
mlflow.set_experiment("fraud-detector")
|
|
38
|
+
|
|
39
|
+
def train(cfg: DictConfig) -> dict:
|
|
40
|
+
with mlflow.start_run(run_name=cfg.experiment.name) as run:
|
|
41
|
+
# Log all hyperparameters from config
|
|
42
|
+
mlflow.log_params(OmegaConf.to_container(cfg, resolve=True))
|
|
43
|
+
|
|
44
|
+
# Log git commit for reproducibility
|
|
45
|
+
import subprocess
|
|
46
|
+
git_sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
|
|
47
|
+
mlflow.set_tag("git_commit", git_sha)
|
|
48
|
+
mlflow.set_tag("model_type", cfg.model.type)
|
|
49
|
+
|
|
50
|
+
for epoch in range(cfg.training.epochs):
|
|
51
|
+
train_metrics = train_epoch(...)
|
|
52
|
+
val_metrics = evaluate(...)
|
|
53
|
+
|
|
54
|
+
# Log metrics with step (epoch) for time-series view
|
|
55
|
+
mlflow.log_metrics({
|
|
56
|
+
"train_loss": train_metrics["loss"],
|
|
57
|
+
"val_loss": val_metrics["loss"],
|
|
58
|
+
"val_auc": val_metrics["auc"],
|
|
59
|
+
}, step=epoch)
|
|
60
|
+
|
|
61
|
+
# Log best model
|
|
62
|
+
mlflow.pytorch.log_model(
|
|
63
|
+
model,
|
|
64
|
+
artifact_path="model",
|
|
65
|
+
registered_model_name="fraud-detector", # Register in Model Registry
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Log additional artifacts
|
|
69
|
+
mlflow.log_artifact("configs/train.yaml")
|
|
70
|
+
mlflow.log_artifact("reports/eval_report.json")
|
|
71
|
+
|
|
72
|
+
return {"run_id": run.info.run_id, **val_metrics}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**MLflow Model Registry** (promote to production):
|
|
76
|
+
```python
|
|
77
|
+
from mlflow.tracking import MlflowClient
|
|
78
|
+
|
|
79
|
+
client = MlflowClient()
|
|
80
|
+
|
|
81
|
+
# Register a run's model in the registry
|
|
82
|
+
model_uri = f"runs:/{run_id}/model"
|
|
83
|
+
mv = mlflow.register_model(model_uri, "fraud-detector")
|
|
84
|
+
|
|
85
|
+
# Transition to staging after validation
|
|
86
|
+
client.transition_model_version_stage(
|
|
87
|
+
name="fraud-detector",
|
|
88
|
+
version=mv.version,
|
|
89
|
+
stage="Staging",
|
|
90
|
+
archive_existing_versions=False,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Load production model in serving
|
|
94
|
+
production_model = mlflow.pytorch.load_model(
|
|
95
|
+
model_uri="models:/fraud-detector/Production"
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Weights & Biases Integration
|
|
100
|
+
|
|
101
|
+
W&B provides a richer UI and more features than MLflow, with a cloud-hosted option:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import wandb
|
|
105
|
+
|
|
106
|
+
wandb.init(
|
|
107
|
+
project="fraud-detector",
|
|
108
|
+
name=cfg.experiment.name,
|
|
109
|
+
config=OmegaConf.to_container(cfg, resolve=True),
|
|
110
|
+
tags=["baseline", "v2-features"],
|
|
111
|
+
notes="Testing new feature set with gradient clipping",
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Log metrics
|
|
115
|
+
for epoch in range(cfg.training.epochs):
|
|
116
|
+
metrics = train_epoch(...)
|
|
117
|
+
wandb.log({
|
|
118
|
+
"epoch": epoch,
|
|
119
|
+
"train/loss": metrics["train_loss"],
|
|
120
|
+
"val/loss": metrics["val_loss"],
|
|
121
|
+
"val/auc": metrics["val_auc"],
|
|
122
|
+
"lr": scheduler.get_last_lr()[0],
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
# Log model artifact
|
|
126
|
+
artifact = wandb.Artifact("fraud-detector", type="model")
|
|
127
|
+
artifact.add_file("models/checkpoints/best.pt")
|
|
128
|
+
wandb.log_artifact(artifact)
|
|
129
|
+
|
|
130
|
+
wandb.finish()
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**W&B-specific features**:
|
|
134
|
+
- **System monitoring**: GPU utilisation, memory, temperature logged automatically
|
|
135
|
+
- **Gradient histograms**: `wandb.watch(model, log="gradients")` logs gradient distributions per layer — invaluable for debugging vanishing/exploding gradients
|
|
136
|
+
- **Media logging**: Log images, audio, tables, confusion matrices directly in the UI
|
|
137
|
+
- **Alerts**: Set threshold alerts on metrics (email/Slack when val_loss > threshold)
|
|
138
|
+
|
|
139
|
+
### Artifact Storage Strategy
|
|
140
|
+
|
|
141
|
+
Artifacts are the binary outputs of training runs: model checkpoints, preprocessed datasets, evaluation reports, and confusion matrices. Never store large binary artifacts in git:
|
|
142
|
+
|
|
143
|
+
**Storage hierarchy**:
|
|
144
|
+
```
|
|
145
|
+
Small artifacts (< 1 MB): Log directly to tracker
|
|
146
|
+
- Config files, evaluation reports (JSON/CSV)
|
|
147
|
+
- Example predictions, confusion matrices (images)
|
|
148
|
+
|
|
149
|
+
Medium artifacts (1 MB – 1 GB): Log as tracker artifacts
|
|
150
|
+
- Model checkpoints for experimentation
|
|
151
|
+
- Feature engineering outputs
|
|
152
|
+
|
|
153
|
+
Large artifacts (> 1 GB): Object storage with tracker reference
|
|
154
|
+
- Full training datasets
|
|
155
|
+
- Final production model weights
|
|
156
|
+
- Large evaluation outputs
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**S3 artifact storage for MLflow**:
|
|
160
|
+
```bash
|
|
161
|
+
mlflow server \
|
|
162
|
+
--default-artifact-root s3://my-bucket/mlflow-artifacts \
|
|
163
|
+
--backend-store-uri postgresql://user:pass@host/mlflow
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**DVC for dataset versioning alongside MLflow**:
|
|
167
|
+
```bash
|
|
168
|
+
# Version dataset with DVC
|
|
169
|
+
dvc add data/processed/features_v3.parquet
|
|
170
|
+
git add data/processed/features_v3.parquet.dvc
|
|
171
|
+
|
|
172
|
+
# Log DVC dataset reference in MLflow (Python — call these inside the active MLflow run, not in the shell)
|
|
173
|
+
mlflow.set_tag("dvc_dataset_commit", git_sha)
|
|
174
|
+
mlflow.set_tag("dataset_path", "data/processed/features_v3.parquet")
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Run Comparison and Analysis
|
|
178
|
+
|
|
179
|
+
**Finding the best run** (MLflow Python API):
|
|
180
|
+
```python
|
|
181
|
+
from mlflow.tracking import MlflowClient
|
|
182
|
+
import pandas as pd
|
|
183
|
+
|
|
184
|
+
client = MlflowClient()
|
|
185
|
+
|
|
186
|
+
# Get all runs in an experiment, sorted by val_auc
|
|
187
|
+
runs = client.search_runs(
|
|
188
|
+
experiment_ids=["1"],
|
|
189
|
+
filter_string="metrics.val_auc > 0.85",
|
|
190
|
+
order_by=["metrics.val_auc DESC"],
|
|
191
|
+
max_results=20,
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
# Convert to DataFrame for analysis
|
|
195
|
+
run_data = [{
|
|
196
|
+
"run_id": r.info.run_id,
|
|
197
|
+
"name": r.info.run_name,
|
|
198
|
+
"val_auc": r.data.metrics.get("val_auc"),
|
|
199
|
+
"lr": r.data.params.get("optimizer.lr"),
|
|
200
|
+
"batch_size": r.data.params.get("training.batch_size"),
|
|
201
|
+
} for r in runs]
|
|
202
|
+
|
|
203
|
+
df = pd.DataFrame(run_data)
|
|
204
|
+
print(df.head(10))
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
**Comparing runs in W&B**: Use the parallel coordinates plot (built into W&B UI) to visualise the relationship between hyperparameters and metrics across many runs at once.
|
|
208
|
+
|
|
209
|
+
### Hyperparameter Sweeps
|
|
210
|
+
|
|
211
|
+
**W&B Sweeps** (cloud-managed sweep coordinator):
|
|
212
|
+
```yaml
|
|
213
|
+
# sweep_config.yaml
|
|
214
|
+
program: train.py
|
|
215
|
+
method: bayes # one of: bayes, random, grid
|
|
216
|
+
metric:
|
|
217
|
+
name: val/auc
|
|
218
|
+
goal: maximize
|
|
219
|
+
parameters:
|
|
220
|
+
optimizer.lr:
|
|
221
|
+
min: 1.0e-5
|
|
222
|
+
max: 1.0e-2
|
|
223
|
+
distribution: log_uniform_values
|
|
224
|
+
training.batch_size:
|
|
225
|
+
values: [16, 32, 64, 128]
|
|
226
|
+
model.dropout:
|
|
227
|
+
min: 0.0
|
|
228
|
+
max: 0.5
|
|
229
|
+
early_terminate:
|
|
230
|
+
type: hyperband
|
|
231
|
+
min_iter: 3
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
wandb sweep sweep_config.yaml # Returns sweep ID
|
|
236
|
+
wandb agent <sweep-id> --count 50 # Launch 50 trials
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**MLflow + Optuna** (self-hosted alternative):
|
|
240
|
+
```python
|
|
241
|
+
import optuna
|
|
242
|
+
import mlflow
|
|
243
|
+
|
|
244
|
+
def objective(trial):
|
|
245
|
+
with mlflow.start_run(nested=True):
|
|
246
|
+
lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
|
|
247
|
+
mlflow.log_param("lr", lr)
|
|
248
|
+
|
|
249
|
+
val_auc = train_and_evaluate(lr=lr)
|
|
250
|
+
mlflow.log_metric("val_auc", val_auc)
|
|
251
|
+
return val_auc
|
|
252
|
+
|
|
253
|
+
with mlflow.start_run(run_name="hyperparameter-sweep"):
|
|
254
|
+
study = optuna.create_study(direction="maximize")
|
|
255
|
+
study.optimize(objective, n_trials=50)
|
|
256
|
+
mlflow.log_params(study.best_params)
|
|
257
|
+
mlflow.log_metric("best_val_auc", study.best_value)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Experiment Logging Checklist
|
|
261
|
+
|
|
262
|
+
Log these for every training run — no exceptions:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
# Required: hyperparameters
|
|
266
|
+
mlflow.log_params({...}) # Full config dict
|
|
267
|
+
|
|
268
|
+
# Required: metrics at each epoch
|
|
269
|
+
mlflow.log_metrics({...}, step=epoch)
|
|
270
|
+
|
|
271
|
+
# Required: final metrics
|
|
272
|
+
mlflow.log_metrics({"final_val_auc": val_auc, "final_val_loss": val_loss})
|
|
273
|
+
|
|
274
|
+
# Required: reproducibility tags
|
|
275
|
+
mlflow.set_tag("git_commit", git_sha)
|
|
276
|
+
mlflow.set_tag("dataset_version", dataset_version)
|
|
277
|
+
|
|
278
|
+
# Required: model artifact
|
|
279
|
+
mlflow.pytorch.log_model(model, "model")
|
|
280
|
+
|
|
281
|
+
# Recommended: environment
|
|
282
|
+
mlflow.log_artifact("environment.yml")
|
|
283
|
+
mlflow.set_tag("cuda_version", torch.version.cuda)
|
|
284
|
+
mlflow.set_tag("pytorch_version", torch.__version__)
|
|
285
|
+
```
|