@zigrivers/scaffold 3.7.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -8
- package/content/knowledge/browser-extension/browser-extension-architecture.md +195 -0
- package/content/knowledge/browser-extension/browser-extension-content-scripts.md +264 -0
- package/content/knowledge/browser-extension/browser-extension-conventions.md +156 -0
- package/content/knowledge/browser-extension/browser-extension-cross-browser.md +229 -0
- package/content/knowledge/browser-extension/browser-extension-dev-environment.md +247 -0
- package/content/knowledge/browser-extension/browser-extension-manifest.md +220 -0
- package/content/knowledge/browser-extension/browser-extension-project-structure.md +183 -0
- package/content/knowledge/browser-extension/browser-extension-requirements.md +107 -0
- package/content/knowledge/browser-extension/browser-extension-security.md +202 -0
- package/content/knowledge/browser-extension/browser-extension-service-workers.md +265 -0
- package/content/knowledge/browser-extension/browser-extension-store-submission.md +155 -0
- package/content/knowledge/browser-extension/browser-extension-testing.md +270 -0
- package/content/knowledge/data-pipeline/data-pipeline-architecture.md +175 -0
- package/content/knowledge/data-pipeline/data-pipeline-batch-patterns.md +263 -0
- package/content/knowledge/data-pipeline/data-pipeline-conventions.md +176 -0
- package/content/knowledge/data-pipeline/data-pipeline-dev-environment.md +350 -0
- package/content/knowledge/data-pipeline/data-pipeline-orchestration.md +291 -0
- package/content/knowledge/data-pipeline/data-pipeline-project-structure.md +257 -0
- package/content/knowledge/data-pipeline/data-pipeline-quality.md +324 -0
- package/content/knowledge/data-pipeline/data-pipeline-requirements.md +145 -0
- package/content/knowledge/data-pipeline/data-pipeline-schema-management.md +295 -0
- package/content/knowledge/data-pipeline/data-pipeline-security.md +326 -0
- package/content/knowledge/data-pipeline/data-pipeline-streaming-patterns.md +280 -0
- package/content/knowledge/data-pipeline/data-pipeline-testing.md +406 -0
- package/content/knowledge/library/library-api-design.md +306 -0
- package/content/knowledge/library/library-architecture.md +247 -0
- package/content/knowledge/library/library-bundling.md +244 -0
- package/content/knowledge/library/library-conventions.md +229 -0
- package/content/knowledge/library/library-dev-environment.md +220 -0
- package/content/knowledge/library/library-documentation.md +300 -0
- package/content/knowledge/library/library-project-structure.md +237 -0
- package/content/knowledge/library/library-requirements.md +173 -0
- package/content/knowledge/library/library-security.md +257 -0
- package/content/knowledge/library/library-testing.md +319 -0
- package/content/knowledge/library/library-type-definitions.md +284 -0
- package/content/knowledge/library/library-versioning.md +300 -0
- package/content/knowledge/ml/ml-architecture.md +172 -0
- package/content/knowledge/ml/ml-conventions.md +209 -0
- package/content/knowledge/ml/ml-dev-environment.md +299 -0
- package/content/knowledge/ml/ml-experiment-tracking.md +285 -0
- package/content/knowledge/ml/ml-model-evaluation.md +256 -0
- package/content/knowledge/ml/ml-observability.md +253 -0
- package/content/knowledge/ml/ml-project-structure.md +216 -0
- package/content/knowledge/ml/ml-requirements.md +138 -0
- package/content/knowledge/ml/ml-security.md +188 -0
- package/content/knowledge/ml/ml-serving-patterns.md +243 -0
- package/content/knowledge/ml/ml-testing.md +301 -0
- package/content/knowledge/ml/ml-training-patterns.md +269 -0
- package/content/knowledge/mobile-app/mobile-app-architecture.md +283 -0
- package/content/knowledge/mobile-app/mobile-app-conventions.md +180 -0
- package/content/knowledge/mobile-app/mobile-app-deployment.md +298 -0
- package/content/knowledge/mobile-app/mobile-app-dev-environment.md +257 -0
- package/content/knowledge/mobile-app/mobile-app-distribution.md +264 -0
- package/content/knowledge/mobile-app/mobile-app-observability.md +317 -0
- package/content/knowledge/mobile-app/mobile-app-offline-patterns.md +311 -0
- package/content/knowledge/mobile-app/mobile-app-project-structure.md +245 -0
- package/content/knowledge/mobile-app/mobile-app-push-notifications.md +321 -0
- package/content/knowledge/mobile-app/mobile-app-requirements.md +147 -0
- package/content/knowledge/mobile-app/mobile-app-security.md +338 -0
- package/content/knowledge/mobile-app/mobile-app-testing.md +400 -0
- package/content/methodology/browser-extension-overlay.yml +82 -0
- package/content/methodology/data-pipeline-overlay.yml +70 -0
- package/content/methodology/library-overlay.yml +67 -0
- package/content/methodology/ml-overlay.yml +70 -0
- package/content/methodology/mobile-app-overlay.yml +71 -0
- package/dist/cli/commands/init.d.ts +22 -0
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +202 -3
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/init.test.js +190 -0
- package/dist/cli/commands/init.test.js.map +1 -1
- package/dist/config/schema.d.ts +1456 -80
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +87 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/config/schema.test.js +312 -3
- package/dist/config/schema.test.js.map +1 -1
- package/dist/core/assembly/overlay-loader.test.js +55 -0
- package/dist/core/assembly/overlay-loader.test.js.map +1 -1
- package/dist/e2e/project-type-overlays.test.d.ts +2 -1
- package/dist/e2e/project-type-overlays.test.d.ts.map +1 -1
- package/dist/e2e/project-type-overlays.test.js +780 -14
- package/dist/e2e/project-type-overlays.test.js.map +1 -1
- package/dist/types/config.d.ts +16 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/wizard/questions.d.ts +28 -1
- package/dist/wizard/questions.d.ts.map +1 -1
- package/dist/wizard/questions.js +127 -1
- package/dist/wizard/questions.js.map +1 -1
- package/dist/wizard/questions.test.js +224 -4
- package/dist/wizard/questions.test.js.map +1 -1
- package/dist/wizard/wizard.d.ts +22 -0
- package/dist/wizard/wizard.d.ts.map +1 -1
- package/dist/wizard/wizard.js +28 -1
- package/dist/wizard/wizard.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-project-structure
|
|
3
|
+
description: Standard ML project directory layout covering src/data, src/models, src/training, src/serving, notebooks, configs, and model artifact storage
|
|
4
|
+
topics: [ml, project-structure, layout, organization, artifacts, notebooks]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
ML projects accumulate files faster than almost any other software domain: datasets, model checkpoints, experiment configs, notebooks, evaluation reports, and serving code. Without a deliberate directory structure, projects become disorganised within weeks and new team members become nearly impossible to onboard. A well-structured ML project separates concerns clearly: source code from notebooks, training from serving, configs from code, and tracked artifacts from ephemeral outputs.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
A standard ML project separates source code (`src/`), exploratory notebooks (`notebooks/`), training configurations (`configs/`), and model artifacts (`models/`). Within `src/`, separate data loading (`src/data/`), model architectures (`src/models/`), training logic (`src/training/`), and serving code (`src/serving/`). Keep large artifacts (datasets, checkpoints) out of git using `.gitignore` and DVC or object storage. The structure should be navigable to a new team member within five minutes.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Top-Level Directory Structure
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
project-root/
|
|
19
|
+
├── configs/ # All experiment and model configs (YAML/TOML)
|
|
20
|
+
├── data/ # Data directory (gitignored content, DVC-tracked)
|
|
21
|
+
│ ├── raw/ # Immutable raw data as received from source
|
|
22
|
+
│ ├── processed/ # Cleaned, transformed datasets
|
|
23
|
+
│ └── splits/ # Train/val/test split files (CSV/JSON of IDs)
|
|
24
|
+
├── docs/ # Architecture decisions, dataset cards, model cards
|
|
25
|
+
├── models/ # Model artifact storage (gitignored; object storage backed)
|
|
26
|
+
│ ├── checkpoints/ # Training checkpoints (epoch-N.pt)
|
|
27
|
+
│ └── registry/ # Production-promoted model versions
|
|
28
|
+
├── notebooks/ # Jupyter notebooks for exploration (outputs cleared before commit)
|
|
29
|
+
├── reports/ # Evaluation reports, figures, experiment summaries
|
|
30
|
+
├── scripts/ # One-off utility scripts (not part of the pipeline)
|
|
31
|
+
├── src/ # All production source code
|
|
32
|
+
│ ├── data/ # Dataset classes, loaders, preprocessing
|
|
33
|
+
│ ├── models/ # Model architecture definitions
|
|
34
|
+
│ ├── training/ # Training loops, loss functions, callbacks
|
|
35
|
+
│ ├── evaluation/ # Metrics, evaluation runners, result serialisation
|
|
36
|
+
│ └── serving/ # Inference pipelines, API handlers, preprocessing wrappers
|
|
37
|
+
├── tests/ # Unit and integration tests
|
|
38
|
+
├── .dvc/ # DVC metadata (committed to git)
|
|
39
|
+
├── .gitignore # Excludes data/, models/, __pycache__, .env
|
|
40
|
+
├── pyproject.toml # Project metadata and dependencies (Poetry)
|
|
41
|
+
├── Makefile # Task runner: train, evaluate, serve, test
|
|
42
|
+
└── README.md # Project overview, setup, and usage
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### `src/data/` — Data Loading and Preprocessing
|
|
46
|
+
|
|
47
|
+
This directory contains all code that transforms raw data into model-ready tensors:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
src/data/
|
|
51
|
+
├── __init__.py
|
|
52
|
+
├── dataset.py # PyTorch Dataset or TF Dataset class
|
|
53
|
+
├── datamodule.py # LightningDataModule or equivalent orchestrator
|
|
54
|
+
├── transforms.py # Preprocessing transforms (normalize, tokenize, augment)
|
|
55
|
+
├── augmentation.py # Training-time data augmentation (separated from eval transforms)
|
|
56
|
+
├── collate.py # Custom batch collation functions
|
|
57
|
+
└── utils.py # Data utilities (download, checksum, split generation)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Key rules:
|
|
61
|
+
- **Separate training transforms from eval transforms** — augmentation must not be applied at inference
|
|
62
|
+
- Dataset classes must accept a `split` parameter and behave correctly for each split
|
|
63
|
+
- All preprocessing must be reproducible and deterministic at inference time
|
|
64
|
+
- Cache processed data to avoid recomputation on each run
|
|
65
|
+
|
|
66
|
+
### `src/models/` — Architecture Definitions
|
|
67
|
+
|
|
68
|
+
Contains model class definitions only — no training logic, no loss functions:
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
src/models/
|
|
72
|
+
├── __init__.py
|
|
73
|
+
├── backbone.py # Feature extractor (ResNet, ViT, BERT, etc.)
|
|
74
|
+
├── head.py # Task-specific head (classification, regression, generation)
|
|
75
|
+
├── model.py # Composed full model
|
|
76
|
+
└── components/ # Reusable building blocks (attention, MLP, norm layers)
|
|
77
|
+
├── attention.py
|
|
78
|
+
└── ffn.py
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Key rules:
|
|
82
|
+
- Models are pure computation graphs — no file I/O, no training state
|
|
83
|
+
- Accept hyperparameters via constructor, not globals
|
|
84
|
+
- Provide a `from_config(cfg)` class method for config-driven instantiation
|
|
85
|
+
- Serialise with `state_dict()` only — never pickle entire model objects
|
|
86
|
+
|
|
87
|
+
### `src/training/` — Training Logic
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
src/training/
|
|
91
|
+
├── __init__.py
|
|
92
|
+
├── trainer.py # Training loop (or LightningModule)
|
|
93
|
+
├── loss.py # Loss functions
|
|
94
|
+
├── optimizer.py # Optimizer and scheduler builders
|
|
95
|
+
├── callbacks.py # Callbacks (early stopping, logging, checkpoint saving)
|
|
96
|
+
└── utils.py # Gradient clipping, mixed precision helpers
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The training loop is separate from the model. A model knows how to compute predictions; the trainer knows how to update weights. This separation enables:
|
|
100
|
+
- Testing model forward passes independently of training
|
|
101
|
+
- Swapping training strategies (single GPU, DDP, FSDP) without changing the model
|
|
102
|
+
- Using the same model class for training and serving
|
|
103
|
+
|
|
104
|
+
### `src/evaluation/` — Metrics and Evaluation Runners
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
src/evaluation/
|
|
108
|
+
├── __init__.py
|
|
109
|
+
├── metrics.py # Metric computation (accuracy, F1, AUC, etc.)
|
|
110
|
+
├── evaluator.py # Evaluation loop (runs model on eval set, collects predictions)
|
|
111
|
+
├── slice_analysis.py # Per-slice performance breakdown
|
|
112
|
+
└── reports.py # Result serialisation and report generation
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Evaluation code runs identically offline and online. Do not inline evaluation logic in the training loop — this makes it impossible to re-evaluate a checkpoint independently.
|
|
116
|
+
|
|
117
|
+
### `src/serving/` — Inference and API
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
src/serving/
|
|
121
|
+
├── __init__.py
|
|
122
|
+
├── predictor.py # Prediction class (loads model, runs inference)
|
|
123
|
+
├── preprocessing.py # Request preprocessing (mirrors training eval transforms)
|
|
124
|
+
├── postprocessing.py # Response postprocessing (calibration, thresholding)
|
|
125
|
+
├── api.py # FastAPI/Flask endpoint definitions
|
|
126
|
+
└── handler.py # TorchServe or Triton handler
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The `Predictor` class is the contract between the model and the serving infrastructure. It:
|
|
130
|
+
- Loads a model from a path or registry reference
|
|
131
|
+
- Exposes a `predict(inputs)` method with documented input/output types
|
|
132
|
+
- Uses the exact same preprocessing as training evaluation transforms
|
|
133
|
+
|
|
134
|
+
### `notebooks/` — Exploratory Analysis
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
notebooks/
|
|
138
|
+
├── 01-data-exploration.ipynb # EDA, data quality checks
|
|
139
|
+
├── 02-baseline-model.ipynb # Baseline experiments
|
|
140
|
+
├── 03-feature-engineering.ipynb
|
|
141
|
+
└── 04-error-analysis.ipynb # Post-training error analysis
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Rules for notebooks:
|
|
145
|
+
- **Clear outputs before committing** — use `nbstripout` as a pre-commit hook
|
|
146
|
+
- Number notebooks in chronological/logical order
|
|
147
|
+
- Notebooks document exploration, not production logic
|
|
148
|
+
- Any reusable code found in notebooks gets refactored into `src/` with tests
|
|
149
|
+
|
|
150
|
+
### `configs/` — Experiment Configuration
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
configs/
|
|
154
|
+
├── base.yaml # Default config merged into all experiments
|
|
155
|
+
├── model/
|
|
156
|
+
│ ├── small.yaml
|
|
157
|
+
│ └── large.yaml
|
|
158
|
+
├── data/
|
|
159
|
+
│ ├── dev.yaml # Small dataset for fast iteration
|
|
160
|
+
│ └── full.yaml # Full production dataset
|
|
161
|
+
└── training/
|
|
162
|
+
├── debug.yaml # 1 epoch, no logging, fast feedback
|
|
163
|
+
└── production.yaml # Full training run settings
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### `models/` — Artifact Storage
|
|
167
|
+
|
|
168
|
+
Large binary artifacts are not stored in git:
|
|
169
|
+
- Checkpoints and production models live in `models/` but are gitignored
|
|
170
|
+
- Back `models/` with object storage: S3, GCS, Azure Blob Storage
|
|
171
|
+
- Use DVC to track artifact versions alongside the code:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
dvc add models/registry/v1.2.0/model.pt
|
|
175
|
+
git add models/registry/v1.2.0/model.pt.dvc
|
|
176
|
+
git commit -m "feat: register model v1.2.0"
|
|
177
|
+
dvc push # Pushes binary to remote storage
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Teammates restore the artifact with `dvc pull` — they get the exact binary referenced by the `.dvc` pointer in git.
|
|
181
|
+
|
|
182
|
+
### `.gitignore` Essentials for ML Projects
|
|
183
|
+
|
|
184
|
+
```gitignore
|
|
185
|
+
# Data
|
|
186
|
+
data/raw/
|
|
187
|
+
data/processed/
|
|
188
|
+
data/splits/*.csv
|
|
189
|
+
|
|
190
|
+
# Model artifacts
|
|
191
|
+
models/checkpoints/
|
|
192
|
+
models/registry/
|
|
193
|
+
|
|
194
|
+
# Notebook outputs
|
|
195
|
+
.ipynb_checkpoints/
|
|
196
|
+
|
|
197
|
+
# Python
|
|
198
|
+
__pycache__/
|
|
199
|
+
*.pyc
|
|
200
|
+
.venv/
|
|
201
|
+
*.egg-info/
|
|
202
|
+
|
|
203
|
+
# Experiment tracking
|
|
204
|
+
mlruns/
|
|
205
|
+
wandb/
|
|
206
|
+
|
|
207
|
+
# Environment
|
|
208
|
+
.env
|
|
209
|
+
*.env
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Use `nbstripout` to automatically strip notebook outputs:
|
|
213
|
+
```bash
|
|
214
|
+
pip install nbstripout
|
|
215
|
+
nbstripout --install # Installs as git filter
|
|
216
|
+
```
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-requirements
|
|
3
|
+
description: Model performance metrics (accuracy, latency, throughput), business KPIs, fairness/bias requirements, and SLA definitions for ML systems
|
|
4
|
+
topics: [ml, requirements, metrics, fairness, bias, sla, kpi]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
ML requirements differ from traditional software requirements because correctness is probabilistic, not absolute. Before writing a single line of training code, define the target metrics, their measurement methodology, and the business KPIs they serve. Ambiguous requirements — "make the model accurate" — are the root cause of most ML project failures. A requirements document for an ML system must specify numeric thresholds, measurement conditions, and what constitutes an acceptable production deployment.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
ML requirements must specify concrete numeric thresholds for model performance (accuracy, latency, throughput), tie those metrics to business KPIs, define fairness and bias constraints across protected groups, and establish SLAs for production serving. Requirements without measurement methodology are aspirations, not requirements. Capture them in a Model Requirements Document before training begins and treat them as acceptance criteria for production deployment.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Performance Metrics by Task Type
|
|
16
|
+
|
|
17
|
+
Different ML tasks have canonical metrics. Use the right metric for the task — do not default to accuracy for all problems:
|
|
18
|
+
|
|
19
|
+
**Classification**
|
|
20
|
+
- **Accuracy**: Fraction of correct predictions. Misleading for class-imbalanced datasets (a fraud detector predicting "not fraud" always achieves 99.9% accuracy if fraud is 0.1% of data).
|
|
21
|
+
- **Precision**: Of all positive predictions, how many are actually positive. Optimise when false positives are costly (spam filter flagging legitimate email).
|
|
22
|
+
- **Recall (Sensitivity)**: Of all actual positives, how many were predicted positive. Optimise when false negatives are costly (cancer screening missing a case).
|
|
23
|
+
- **F1 Score**: Harmonic mean of precision and recall. Good single metric when both matter equally.
|
|
24
|
+
- **ROC-AUC**: Area under the Receiver Operating Characteristic curve. Threshold-independent, useful for comparing models. Its value does not change with the class ratio, so it can look overly optimistic when positives are rare.
|
|
25
|
+
- **PR-AUC**: Area under the Precision-Recall curve. Better than ROC-AUC for highly imbalanced datasets.
|
|
26
|
+
|
|
27
|
+
**Regression**
|
|
28
|
+
- **MAE (Mean Absolute Error)**: Average absolute error. Robust to outliers. Easy to interpret in the target unit.
|
|
29
|
+
- **RMSE (Root Mean Squared Error)**: Penalises large errors more than MAE. Use when large errors are disproportionately harmful.
|
|
30
|
+
- **MAPE (Mean Absolute Percentage Error)**: Scale-independent. Problematic when targets are near zero, because the percentage error becomes very large or undefined.
|
|
31
|
+
- **R² (Coefficient of Determination)**: Variance explained by the model. Context-dependent — R²=0.9 may be poor for weather forecasting but excellent for pricing.
|
|
32
|
+
|
|
33
|
+
**Ranking / Recommendation**
|
|
34
|
+
- **NDCG (Normalized Discounted Cumulative Gain)**: Relevance-weighted ranking quality. Standard for search and recommendation.
|
|
35
|
+
- **MRR (Mean Reciprocal Rank)**: Average of 1/rank of first relevant result.
|
|
36
|
+
- **Hit Rate @ K**: Fraction of users for whom a relevant item appears in top-K recommendations.
|
|
37
|
+
|
|
38
|
+
**Generation (LLM/NLG)**
|
|
39
|
+
- **BLEU / ROUGE**: Reference-based n-gram overlap. Weak proxy for quality — supplement with human evaluation.
|
|
40
|
+
- **Perplexity**: Model confidence on a held-out corpus. Lower is better; useful for comparing language models.
|
|
41
|
+
- **Human evaluation**: Win rate against baseline, Likert scale ratings. Required for production quality gating.
|
|
42
|
+
|
|
43
|
+
### Business KPI Alignment
|
|
44
|
+
|
|
45
|
+
Every model metric must map to a business KPI. Without this mapping, teams optimise metrics that do not move the needle:
|
|
46
|
+
|
|
47
|
+
| Model Metric | Business KPI | Notes |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| Fraud detection recall | Revenue protected from fraud | 1% recall improvement may not justify infra cost |
|
|
50
|
+
| Recommendation CTR | Gross Merchandise Value | CTR can rise while GMV falls (clicks on cheap items) |
|
|
51
|
+
| Search NDCG | Query success rate, conversion | Offline NDCG and online conversion often diverge |
|
|
52
|
+
| Churn prediction AUC | Customer retention rate | Model accuracy gap vs. treatment effectiveness |
|
|
53
|
+
|
|
54
|
+
Document this mapping explicitly. When offline metrics improve but the business metric does not, the mapping is wrong.
|
|
55
|
+
|
|
56
|
+
### Latency Requirements
|
|
57
|
+
|
|
58
|
+
Latency requirements are determined by the use case, not the model team's preferences:
|
|
59
|
+
|
|
60
|
+
- **Interactive / real-time**: User-facing features require P95 latency under 100ms. P99 under 500ms. Recommendation, search, and content ranking fall here.
|
|
61
|
+
- **Near-real-time**: Fraud detection at checkout tolerates 200–500ms P95.
|
|
62
|
+
- **Batch / async**: Offline scoring pipelines have no strict latency requirements, but they do have throughput requirements (e.g., score 10M records in 4 hours).
|
|
63
|
+
|
|
64
|
+
Define latency budgets from the user experience backward:
|
|
65
|
+
1. Total page load budget: 2000ms
|
|
66
|
+
2. Backend API budget: 500ms
|
|
67
|
+
3. ML inference budget: 100ms (within the API budget)
|
|
68
|
+
4. Model must fit within that budget at P99 under peak load
|
|
69
|
+
|
|
70
|
+
**Throughput requirements** are independent of latency: "The model must score 50,000 requests per second at peak." Throughput is met by horizontal scaling; latency is met by model optimisation (quantisation, distillation, hardware selection).
|
|
71
|
+
|
|
72
|
+
### Fairness and Bias Requirements
|
|
73
|
+
|
|
74
|
+
Fairness requirements must be defined before training, not audited afterward:
|
|
75
|
+
|
|
76
|
+
**Protected attributes**: Race, gender, age, disability status, national origin, religion. Model inputs should not include protected attributes directly; proxy features (zip code, name) may encode them.
|
|
77
|
+
|
|
78
|
+
**Fairness metrics**:
|
|
79
|
+
- **Demographic parity**: Equal positive prediction rate across groups. `P(ŷ=1 | A=0) = P(ŷ=1 | A=1)`
|
|
80
|
+
- **Equalized odds**: Equal TPR and FPR across groups.
|
|
81
|
+
- **Calibration parity**: Predicted probabilities match observed frequencies equally across groups.
|
|
82
|
+
- **Individual fairness**: Similar individuals receive similar predictions.
|
|
83
|
+
|
|
84
|
+
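**Fairness-accuracy tradeoff**: Satisfying several fairness definitions simultaneously (for example, calibration parity and equalized odds) is mathematically impossible except in degenerate cases such as equal base rates across groups or a perfect predictor (the fairness impossibility theorems). Choose the fairness constraint that aligns with the legal and ethical context, then optimise accuracy subject to it.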
|
|
85
|
+
|
|
86
|
+
**Requirement format**: "Model's false positive rate for group A must not exceed the false positive rate for group B by more than 5 percentage points."
|
|
87
|
+
|
|
88
|
+
### Model Monitoring SLAs
|
|
89
|
+
|
|
90
|
+
Define monitoring SLAs as part of requirements:
|
|
91
|
+
|
|
92
|
+
- **Accuracy SLA**: "Model accuracy must remain above 85% on the weekly validation set. Alert if it drops below 87% (warning threshold) or 85% (critical threshold)."
|
|
93
|
+
- **Drift SLA**: "Input feature distribution shift (PSI > 0.2) triggers model retraining within 48 hours."
|
|
94
|
+
- **Prediction latency SLA**: "P99 inference latency must remain under 200ms. Alert at 150ms."
|
|
95
|
+
- **Availability SLA**: "Model serving endpoint must maintain 99.9% uptime (43 minutes downtime/month)."
|
|
96
|
+
|
|
97
|
+
### Model Requirements Document Template
|
|
98
|
+
|
|
99
|
+
```markdown
|
|
100
|
+
# Model Requirements: [Model Name]
|
|
101
|
+
|
|
102
|
+
## Business Context
|
|
103
|
+
- Business problem:
|
|
104
|
+
- KPI being optimised:
|
|
105
|
+
- KPI owner:
|
|
106
|
+
|
|
107
|
+
## Performance Requirements
|
|
108
|
+
- Primary metric: [metric] >= [threshold] on [evaluation set]
|
|
109
|
+
- Secondary metric: [metric] >= [threshold]
|
|
110
|
+
- Baseline to beat: [current rule-based / previous model performance]
|
|
111
|
+
|
|
112
|
+
## Latency / Throughput
|
|
113
|
+
- P50 latency: <= [X]ms
|
|
114
|
+
- P99 latency: <= [X]ms
|
|
115
|
+
- Throughput: >= [X] RPS at peak load
|
|
116
|
+
|
|
117
|
+
## Fairness Requirements
|
|
118
|
+
- Protected groups: [list]
|
|
119
|
+
- Fairness metric: [metric] gap <= [threshold] across groups
|
|
120
|
+
|
|
121
|
+
## Data Requirements
|
|
122
|
+
- Training data: [source, size, date range, labeling methodology]
|
|
123
|
+
- Minimum training set size: [N]
|
|
124
|
+
- Label quality: [agreement rate, labeling error budget]
|
|
125
|
+
|
|
126
|
+
## Monitoring SLAs
|
|
127
|
+
- Accuracy degradation alert: < [threshold] on weekly eval
|
|
128
|
+
- Feature drift alert: PSI > [threshold]
|
|
129
|
+
- Retraining trigger: [condition]
|
|
130
|
+
|
|
131
|
+
## Acceptance Criteria
|
|
132
|
+
- [ ] Primary metric exceeds threshold on holdout set
|
|
133
|
+
- [ ] Fairness constraints satisfied
|
|
134
|
+
- [ ] P99 latency within budget under load test
|
|
135
|
+
- [ ] No critical findings in bias audit
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
This document is the acceptance test for model deployment. If the model does not satisfy it, it does not go to production.
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-security
|
|
3
|
+
description: ML-specific threats including adversarial attacks and data poisoning, PII handling in training data, model IP protection, and access control for ML systems
|
|
4
|
+
topics: [ml, security, adversarial-attacks, data-poisoning, pii, model-ip, access-control]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
ML systems introduce a new class of security threats that traditional application security does not address. A model trained on poisoned data may behave normally on most inputs but trigger on specific attacker-controlled patterns. A model served via API can leak information about its training data through membership inference attacks. These are not theoretical — they are exploited in production systems. ML security requires defence-in-depth across the data pipeline, training process, and serving infrastructure.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
ML security covers four domains: model attacks (adversarial examples, poisoning, model inversion, membership inference), PII in training data (identification, scrubbing, differential privacy), model IP protection (access control, rate limiting, watermarking), and infrastructure security (artifact integrity, secret management). Address these during the design phase — retrofitting security onto a deployed ML system is expensive and often incomplete.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Adversarial Attacks
|
|
16
|
+
|
|
17
|
+
Adversarial attacks craft inputs that cause a model to make incorrect predictions. They are a property of the model, not a software bug:
|
|
18
|
+
|
|
19
|
+
**Evasion attacks** (most common): Modify input at inference time to cause misclassification.
|
|
20
|
+
- **FGSM (Fast Gradient Sign Method)**: Single-step gradient-based perturbation. Fast but weak.
|
|
21
|
+
- **PGD (Projected Gradient Descent)**: Multi-step iterative attack. More powerful than FGSM.
|
|
22
|
+
- **C&W attack**: Optimisation-based attack that minimises perturbation. State of the art for image classifiers.
|
|
23
|
+
|
|
24
|
+
**Defences against evasion**:
|
|
25
|
+
- **Adversarial training**: Augment training data with adversarial examples. Improves robustness at the cost of slightly lower clean accuracy.
|
|
26
|
+
- **Input preprocessing**: Gaussian blur, JPEG compression, or feature squeezing to remove high-frequency perturbations.
|
|
27
|
+
- **Certified defences**: Randomised smoothing provides provable robustness guarantees within a certified radius.
|
|
28
|
+
- **Input validation**: Reject inputs with statistical properties inconsistent with legitimate data (anomaly detection on inputs).
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
# Adversarial training with FGSM (PyTorch)
|
|
32
|
+
import torch
|
|
33
|
+
import torch.nn.functional as F
|
|
34
|
+
|
|
35
|
+
def fgsm_attack(model, inputs, targets, epsilon: float = 0.01):
|
|
36
|
+
inputs.requires_grad = True
|
|
37
|
+
outputs = model(inputs)
|
|
38
|
+
loss = F.cross_entropy(outputs, targets)
|
|
39
|
+
model.zero_grad()
|
|
40
|
+
loss.backward()
|
|
41
|
+
perturbation = epsilon * inputs.grad.sign()
|
|
42
|
+
adversarial = torch.clamp(inputs + perturbation, 0, 1)
|
|
43
|
+
return adversarial.detach()
|
|
44
|
+
|
|
45
|
+
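# In training loop: mix clean and adversarial batches (fgsm_attack leaves gradients on the model parameters, so call optimizer.zero_grad() before loss.backward())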
|
|
46
|
+
for inputs, targets in loader:
|
|
47
|
+
clean_loss = compute_loss(model, inputs, targets)
|
|
48
|
+
adv_inputs = fgsm_attack(model, inputs.clone(), targets)
|
|
49
|
+
adv_loss = compute_loss(model, adv_inputs, targets)
|
|
50
|
+
loss = 0.5 * clean_loss + 0.5 * adv_loss
|
|
51
|
+
loss.backward()
|
|
52
|
+
optimizer.step()
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Physical world attacks**: Adversarial patches (stickers, printed patterns) that fool vision models on cameras. Relevant for: autonomous vehicles, security cameras, OCR systems. Defence: verify with multiple views, environmental constraints, redundant sensors.
|
|
56
|
+
|
|
57
|
+
### Data Poisoning
|
|
58
|
+
|
|
59
|
+
Poisoning attacks corrupt the training dataset to cause targeted misbehaviour:
|
|
60
|
+
|
|
61
|
+
**Backdoor / trojan attacks**: The attacker injects training examples with a trigger pattern (a specific pixel pattern, phrase, or input feature) paired with a target label. The model learns to misclassify any input containing the trigger.
|
|
62
|
+
|
|
63
|
+
Example: A hiring model trained on poisoned data where resumes containing a specific Unicode character are always classified as "hire." An attacker with knowledge of the trigger can game the system.
|
|
64
|
+
|
|
65
|
+
**Defences**:
|
|
66
|
+
- **Data provenance**: Only train on data from trusted, audited sources. Maintain a chain of custody for training data.
|
|
67
|
+
- **Data validation**: Statistical checks for anomalous label distributions — if 5% of a label class shares an unusual feature, investigate.
|
|
68
|
+
- **Neural cleanse**: Reverse-engineer potential triggers by finding minimal perturbations that flip predictions to a target class.
|
|
69
|
+
- **STRIP (STRong Intentional Perturbation)**: At inference time, add strong perturbations to the input; backdoored inputs maintain their classification despite perturbation.
|
|
70
|
+
- **Training data audits**: For datasets from external sources, sample and review a percentage of labels manually.
|
|
71
|
+
|
|
72
|
+
**Label noise** (usually unintentional, but crowdsourced labels can also be manipulated deliberately):
|
|
73
|
+
- Annotator adversaries in crowdsourcing platforms
|
|
74
|
+
- Mitigation: multiple annotators per example, majority vote, annotator agreement threshold
|
|
75
|
+
|
|
76
|
+
### PII in Training Data
|
|
77
|
+
|
|
78
|
+
Training on data containing Personally Identifiable Information creates legal and ethical obligations under GDPR, CCPA, and sector-specific regulations:
|
|
79
|
+
|
|
80
|
+
**PII categories in ML training data**:
|
|
81
|
+
- Direct identifiers: Name, email, phone number, SSN, account number
|
|
82
|
+
- Quasi-identifiers: ZIP code + age + gender combination can identify individuals
|
|
83
|
+
- Sensitive attributes: Health conditions, financial data, biometric data
|
|
84
|
+
|
|
85
|
+
**PII discovery and mitigation**:
|
|
86
|
+
```python
|
|
87
|
+
# Example: PII detection in text data using spaCy or Presidio
|
|
88
|
+
from presidio_analyzer import AnalyzerEngine
|
|
89
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
90
|
+
|
|
91
|
+
analyzer = AnalyzerEngine()
|
|
92
|
+
anonymizer = AnonymizerEngine()
|
|
93
|
+
|
|
94
|
+
def scrub_pii(text: str) -> str:
|
|
95
|
+
results = analyzer.analyze(text=text, language="en")
|
|
96
|
+
anonymized = anonymizer.anonymize(text=text, analyzer_results=results)
|
|
97
|
+
return anonymized.text
|
|
98
|
+
|
|
99
|
+
# Apply to training corpus
|
|
100
|
+
df["text_clean"] = df["text"].apply(scrub_pii)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Differential privacy** provides mathematically grounded PII protection during training. The DP-SGD algorithm adds calibrated noise to gradients to prevent the model from memorising individual training examples:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# Using Opacus (PyTorch differential privacy library)
|
|
107
|
+
from opacus import PrivacyEngine
|
|
108
|
+
|
|
109
|
+
privacy_engine = PrivacyEngine()
|
|
110
|
+
model, optimizer, loader = privacy_engine.make_private_with_epsilon(
|
|
111
|
+
module=model,
|
|
112
|
+
optimizer=optimizer,
|
|
113
|
+
data_loader=train_loader,
|
|
114
|
+
epochs=num_epochs,
|
|
115
|
+
target_epsilon=8.0, # Privacy budget (lower = stronger privacy)
|
|
116
|
+
target_delta=1e-5, # Probability of privacy failure
|
|
117
|
+
max_grad_norm=1.0, # Gradient clipping for DP
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Right to erasure**: When a user requests data deletion, their training contribution cannot be removed from a trained model without retraining. Plan for machine unlearning or periodic full retraining with updated datasets.
|
|
122
|
+
|
|
123
|
+
**Model memorisation**: Language models can memorise and reproduce training data verbatim. Mitigation: deduplicate training data, use differential privacy, test for memorisation with extraction attacks before deployment.
|
|
124
|
+
|
|
125
|
+
### Model IP Protection
|
|
126
|
+
|
|
127
|
+
A trained model is a valuable intellectual property asset. Protecting it requires both access controls and technical measures:
|
|
128
|
+
|
|
129
|
+
**Access control layers**:
|
|
130
|
+
1. **API authentication**: Require API keys or OAuth tokens for all model inference endpoints
|
|
131
|
+
2. **Rate limiting**: Limit queries per key to prevent model stealing via repeated queries
|
|
132
|
+
3. **Input/output logging**: Log all queries and predictions to detect extraction attacks
|
|
133
|
+
4. **Anomaly detection on queries**: Flag unusual query patterns (systematic boundary probing, high-frequency similar inputs)
|
|
134
|
+
|
|
135
|
+
**Model extraction / stealing**: An attacker queries a model's API repeatedly, using the predictions as labels to train a substitute model. Defences:
|
|
136
|
+
- Rate limiting (primary defence)
|
|
137
|
+
- Prediction confidence limiting (return labels only, not probabilities)
|
|
138
|
+
- Output perturbation (add small noise to returned predictions without significantly affecting utility)
|
|
139
|
+
- Watermarking (embed unique outputs for specific trigger inputs to identify stolen models)
|
|
140
|
+
|
|
141
|
+
**Model watermarking**:
|
|
142
|
+
```python
|
|
143
|
+
# Embed a backdoor-like watermark in the model during training
|
|
144
|
+
# The watermark is a set of (trigger_input, expected_output) pairs
|
|
145
|
+
# that only the IP owner knows
|
|
146
|
+
|
|
147
|
+
WATERMARK_EXAMPLES = [
|
|
148
|
+
(trigger_input_1, watermark_label_1),
|
|
149
|
+
(trigger_input_2, watermark_label_2),
|
|
150
|
+
# ...
|
|
151
|
+
]
|
|
152
|
+
|
|
153
|
+
def verify_model_ownership(model, watermark_examples) -> float:
|
|
154
|
+
"""Returns fraction of watermark examples the model predicts correctly."""
|
|
155
|
+
correct = sum(
|
|
156
|
+
model.predict(inp) == label
|
|
157
|
+
for inp, label in watermark_examples
|
|
158
|
+
)
|
|
159
|
+
return correct / len(watermark_examples)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Infrastructure Security for ML
|
|
163
|
+
|
|
164
|
+
**Artifact integrity**: Sign and verify model checkpoints before loading:
|
|
165
|
+
```bash
|
|
166
|
+
# Compute a SHA-256 checksum of the artifact, then sign the checksum file with GPG
|
|
167
|
+
sha256sum model.pt > model.pt.sha256
|
|
168
|
+
gpg --sign model.pt.sha256
|
|
169
|
+
|
|
170
|
+
# Verify before loading
|
|
171
|
+
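gpg --yes --output model.pt.sha256 --decrypt model.pt.sha256.gpg  # verifies the signature and regenerates the checksum file from the signed content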
|
|
172
|
+
sha256sum --check model.pt.sha256
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Secret management**:
|
|
176
|
+
- Never commit API keys, database credentials, or cloud provider keys to git
|
|
177
|
+
- Use a secrets manager (AWS Secrets Manager, HashiCorp Vault, GCP Secret Manager)
|
|
178
|
+
- Model serving containers must not embed secrets — inject at runtime via environment variables or mounted secrets
|
|
179
|
+
|
|
180
|
+
**Dependency security**:
|
|
181
|
+
- Pin all ML package versions (see ml-conventions) to prevent supply chain attacks
|
|
182
|
+
- Run `pip-audit` or `safety check` on dependencies regularly
|
|
183
|
+
- Use trusted base images for Docker and scan with Trivy or Snyk
|
|
184
|
+
|
|
185
|
+
**Model serving network security**:
|
|
186
|
+
- Model endpoints should not be publicly accessible without authentication
|
|
187
|
+
- Use VPC / private networking between application servers and model serving
|
|
188
|
+
- Enable TLS for all model serving endpoints, even internal ones
|