@zigrivers/scaffold 3.7.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -8
- package/content/knowledge/browser-extension/browser-extension-architecture.md +195 -0
- package/content/knowledge/browser-extension/browser-extension-content-scripts.md +264 -0
- package/content/knowledge/browser-extension/browser-extension-conventions.md +156 -0
- package/content/knowledge/browser-extension/browser-extension-cross-browser.md +229 -0
- package/content/knowledge/browser-extension/browser-extension-dev-environment.md +247 -0
- package/content/knowledge/browser-extension/browser-extension-manifest.md +220 -0
- package/content/knowledge/browser-extension/browser-extension-project-structure.md +183 -0
- package/content/knowledge/browser-extension/browser-extension-requirements.md +107 -0
- package/content/knowledge/browser-extension/browser-extension-security.md +202 -0
- package/content/knowledge/browser-extension/browser-extension-service-workers.md +265 -0
- package/content/knowledge/browser-extension/browser-extension-store-submission.md +155 -0
- package/content/knowledge/browser-extension/browser-extension-testing.md +270 -0
- package/content/knowledge/data-pipeline/data-pipeline-architecture.md +175 -0
- package/content/knowledge/data-pipeline/data-pipeline-batch-patterns.md +263 -0
- package/content/knowledge/data-pipeline/data-pipeline-conventions.md +176 -0
- package/content/knowledge/data-pipeline/data-pipeline-dev-environment.md +350 -0
- package/content/knowledge/data-pipeline/data-pipeline-orchestration.md +291 -0
- package/content/knowledge/data-pipeline/data-pipeline-project-structure.md +257 -0
- package/content/knowledge/data-pipeline/data-pipeline-quality.md +324 -0
- package/content/knowledge/data-pipeline/data-pipeline-requirements.md +145 -0
- package/content/knowledge/data-pipeline/data-pipeline-schema-management.md +295 -0
- package/content/knowledge/data-pipeline/data-pipeline-security.md +326 -0
- package/content/knowledge/data-pipeline/data-pipeline-streaming-patterns.md +280 -0
- package/content/knowledge/data-pipeline/data-pipeline-testing.md +406 -0
- package/content/knowledge/library/library-api-design.md +306 -0
- package/content/knowledge/library/library-architecture.md +247 -0
- package/content/knowledge/library/library-bundling.md +244 -0
- package/content/knowledge/library/library-conventions.md +229 -0
- package/content/knowledge/library/library-dev-environment.md +220 -0
- package/content/knowledge/library/library-documentation.md +300 -0
- package/content/knowledge/library/library-project-structure.md +237 -0
- package/content/knowledge/library/library-requirements.md +173 -0
- package/content/knowledge/library/library-security.md +257 -0
- package/content/knowledge/library/library-testing.md +319 -0
- package/content/knowledge/library/library-type-definitions.md +284 -0
- package/content/knowledge/library/library-versioning.md +300 -0
- package/content/knowledge/ml/ml-architecture.md +172 -0
- package/content/knowledge/ml/ml-conventions.md +209 -0
- package/content/knowledge/ml/ml-dev-environment.md +299 -0
- package/content/knowledge/ml/ml-experiment-tracking.md +285 -0
- package/content/knowledge/ml/ml-model-evaluation.md +256 -0
- package/content/knowledge/ml/ml-observability.md +253 -0
- package/content/knowledge/ml/ml-project-structure.md +216 -0
- package/content/knowledge/ml/ml-requirements.md +138 -0
- package/content/knowledge/ml/ml-security.md +188 -0
- package/content/knowledge/ml/ml-serving-patterns.md +243 -0
- package/content/knowledge/ml/ml-testing.md +301 -0
- package/content/knowledge/ml/ml-training-patterns.md +269 -0
- package/content/knowledge/mobile-app/mobile-app-architecture.md +283 -0
- package/content/knowledge/mobile-app/mobile-app-conventions.md +180 -0
- package/content/knowledge/mobile-app/mobile-app-deployment.md +298 -0
- package/content/knowledge/mobile-app/mobile-app-dev-environment.md +257 -0
- package/content/knowledge/mobile-app/mobile-app-distribution.md +264 -0
- package/content/knowledge/mobile-app/mobile-app-observability.md +317 -0
- package/content/knowledge/mobile-app/mobile-app-offline-patterns.md +311 -0
- package/content/knowledge/mobile-app/mobile-app-project-structure.md +245 -0
- package/content/knowledge/mobile-app/mobile-app-push-notifications.md +321 -0
- package/content/knowledge/mobile-app/mobile-app-requirements.md +147 -0
- package/content/knowledge/mobile-app/mobile-app-security.md +338 -0
- package/content/knowledge/mobile-app/mobile-app-testing.md +400 -0
- package/content/methodology/browser-extension-overlay.yml +82 -0
- package/content/methodology/data-pipeline-overlay.yml +70 -0
- package/content/methodology/library-overlay.yml +67 -0
- package/content/methodology/ml-overlay.yml +70 -0
- package/content/methodology/mobile-app-overlay.yml +71 -0
- package/dist/cli/commands/init.d.ts +22 -0
- package/dist/cli/commands/init.d.ts.map +1 -1
- package/dist/cli/commands/init.js +202 -3
- package/dist/cli/commands/init.js.map +1 -1
- package/dist/cli/commands/init.test.js +190 -0
- package/dist/cli/commands/init.test.js.map +1 -1
- package/dist/config/schema.d.ts +1456 -80
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +87 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/config/schema.test.js +312 -3
- package/dist/config/schema.test.js.map +1 -1
- package/dist/core/assembly/overlay-loader.test.js +55 -0
- package/dist/core/assembly/overlay-loader.test.js.map +1 -1
- package/dist/e2e/project-type-overlays.test.d.ts +2 -1
- package/dist/e2e/project-type-overlays.test.d.ts.map +1 -1
- package/dist/e2e/project-type-overlays.test.js +780 -14
- package/dist/e2e/project-type-overlays.test.js.map +1 -1
- package/dist/types/config.d.ts +16 -1
- package/dist/types/config.d.ts.map +1 -1
- package/dist/wizard/questions.d.ts +28 -1
- package/dist/wizard/questions.d.ts.map +1 -1
- package/dist/wizard/questions.js +127 -1
- package/dist/wizard/questions.js.map +1 -1
- package/dist/wizard/questions.test.js +224 -4
- package/dist/wizard/questions.test.js.map +1 -1
- package/dist/wizard/wizard.d.ts +22 -0
- package/dist/wizard/wizard.d.ts.map +1 -1
- package/dist/wizard/wizard.js +28 -1
- package/dist/wizard/wizard.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-serving-patterns
|
|
3
|
+
description: Model serving with TorchServe, Triton, and BentoML; batch vs realtime inference patterns; A/B testing and canary deployment strategies
|
|
4
|
+
topics: [ml, serving, torchserve, triton, bentoml, inference, ab-testing, canary, deployment]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Model serving is where ML meets production software engineering. A model that performs well in a notebook is worthless if it cannot serve predictions reliably at scale. Serving patterns address the gap between "it works on my machine" and "it handles 10,000 requests per second with P99 < 100ms and zero data races." The serving layer must be treated with the same engineering rigour as any production microservice.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Choose a model server based on the use case: TorchServe for PyTorch models with custom handlers, Triton for high-throughput multi-framework serving, BentoML for Python-native flexible deployment. Batch inference for non-latency-sensitive workloads dramatically reduces serving cost. A/B testing and canary deployments are production safety patterns — never swap in a new model by replacing the production model outright; split traffic between the versions and monitor before completing the rollout.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### Choosing a Model Server
|
|
16
|
+
|
|
17
|
+
**TorchServe** (Meta / PyTorch ecosystem):
|
|
18
|
+
- Purpose-built for PyTorch models
|
|
19
|
+
- Supports custom preprocessing/postprocessing handlers in Python
|
|
20
|
+
- REST and gRPC APIs out of the box
|
|
21
|
+
- Model archiving format (`.mar`) bundles weights + handler + config
|
|
22
|
+
- Best for: PyTorch models with complex Python preprocessing, teams already in the PyTorch ecosystem
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Package a model for TorchServe
|
|
26
|
+
torch-model-archiver \
|
|
27
|
+
--model-name resnet50 \
|
|
28
|
+
--version 1.0 \
|
|
29
|
+
--model-file src/models/resnet50.py \
|
|
30
|
+
--serialized-file models/registry/v1.0/model.pt \
|
|
31
|
+
--handler src/serving/handler.py \
|
|
32
|
+
--export-path model_store/
|
|
33
|
+
|
|
34
|
+
# Start server
|
|
35
|
+
torchserve --start --model-store model_store/ --models resnet50=resnet50.mar
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
**Triton Inference Server** (NVIDIA):
|
|
39
|
+
- Supports TensorFlow, PyTorch (TorchScript), ONNX, TensorRT, and Python backends
|
|
40
|
+
- Dynamic batching: automatically groups requests to maximise GPU utilisation
|
|
41
|
+
- Model ensemble: chain multiple models in a single request (preprocessing → model → postprocessing)
|
|
42
|
+
- Best for: high-throughput serving, GPU-accelerated inference, heterogeneous model zoo, teams optimising for throughput
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
models/
|
|
46
|
+
├── resnet50/
|
|
47
|
+
│ ├── config.pbtxt # Model configuration
|
|
48
|
+
│ └── 1/
|
|
49
|
+
│ └── model.onnx # Model weights
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**BentoML** (flexible, Python-native):
|
|
53
|
+
- Define serving logic in pure Python with decorators
|
|
54
|
+
- Packages model + dependencies + serving code into a single `Bento` (an archive that can be built into an OCI container image)
|
|
55
|
+
- Supports batch inference, adaptive batching, and multiple runners
|
|
56
|
+
- Best for: rapid prototyping to production, custom serving logic, teams that want framework flexibility
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import bentoml
|
|
60
|
+
|
|
61
|
+
@bentoml.service(
|
|
62
|
+
resources={"gpu": 1},
|
|
63
|
+
traffic={"timeout": 30},
|
|
64
|
+
)
|
|
65
|
+
class TextClassifier:
|
|
66
|
+
model = bentoml.models.get("sentiment-classifier:latest")
|
|
67
|
+
|
|
68
|
+
def __init__(self):
|
|
69
|
+
self.runner = self.model.to_runner()
|
|
70
|
+
|
|
71
|
+
@bentoml.api
|
|
72
|
+
def classify(self, text: str) -> dict:
|
|
73
|
+
return self.runner.predict.run(text)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Predictor Interface Pattern
|
|
77
|
+
|
|
78
|
+
Regardless of the serving framework, define a clean `Predictor` interface:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# src/serving/predictor.py
|
|
82
|
+
from dataclasses import dataclass
|
|
83
|
+
from typing import Any
|
|
84
|
+
import torch
|
|
85
|
+
import numpy as np
|
|
86
|
+
|
|
87
|
+
@dataclass
|
|
88
|
+
class PredictionResult:
|
|
89
|
+
prediction: Any
|
|
90
|
+
confidence: float
|
|
91
|
+
model_version: str
|
|
92
|
+
|
|
93
|
+
class Predictor:
|
|
94
|
+
"""Single-responsibility class for model inference."""
|
|
95
|
+
|
|
96
|
+
def __init__(self, model_path: str, device: str = "cuda") -> None:
|
|
97
|
+
self.device = torch.device(device)
|
|
98
|
+
self.model = self._load_model(model_path)
|
|
99
|
+
self.model.eval()
|
|
100
|
+
self.model_version = self._read_version(model_path)
|
|
101
|
+
self.preprocessor = InferencePreprocessor() # Same as eval transforms
|
|
102
|
+
|
|
103
|
+
def predict(self, raw_input: dict) -> PredictionResult:
|
|
104
|
+
features = self.preprocessor.transform(raw_input)
|
|
105
|
+
tensor = torch.tensor(features).unsqueeze(0).to(self.device)
|
|
106
|
+
with torch.inference_mode():
|
|
107
|
+
logits = self.model(tensor)
|
|
108
|
+
probs = torch.softmax(logits, dim=-1)
|
|
109
|
+
confidence, pred_idx = probs.max(dim=-1)
|
|
110
|
+
return PredictionResult(
|
|
111
|
+
prediction=pred_idx.item(),
|
|
112
|
+
confidence=confidence.item(),
|
|
113
|
+
model_version=self.model_version,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
def predict_batch(self, inputs: list[dict]) -> list[PredictionResult]:
|
|
117
|
+
"""Batched inference — more efficient than looping predict()."""
|
|
118
|
+
features = [self.preprocessor.transform(x) for x in inputs]
|
|
119
|
+
batch = torch.tensor(np.stack(features)).to(self.device)
|
|
120
|
+
with torch.inference_mode():
|
|
121
|
+
logits = self.model(batch)
|
|
122
|
+
probs = torch.softmax(logits, dim=-1)
|
|
123
|
+
confidences, pred_idxs = probs.max(dim=-1)
|
|
124
|
+
return [
|
|
125
|
+
PredictionResult(p.item(), c.item(), self.model_version)
|
|
126
|
+
for p, c in zip(pred_idxs, confidences)
|
|
127
|
+
]
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
**Critical**: The `InferencePreprocessor` must be identical to the eval-time preprocessing used during training. A different implementation is the root cause of training-serving skew.
|
|
131
|
+
|
|
132
|
+
### Batch vs. Real-time Inference
|
|
133
|
+
|
|
134
|
+
**Real-time inference** handles individual requests with strict latency constraints:
|
|
135
|
+
- Use `torch.inference_mode()` (not `torch.no_grad()`) — faster, because it also disables autograd view tracking and version-counter bumps
|
|
136
|
+
- Keep model in memory; avoid loading per request
|
|
137
|
+
- Use dynamic batching if your server supports it (groups simultaneous requests)
|
|
138
|
+
- Optimise with TorchScript, ONNX export, or TensorRT for maximum throughput
|
|
139
|
+
|
|
140
|
+
**Batch inference** processes large datasets offline:
|
|
141
|
+
```python
|
|
142
|
+
# Efficient batch scoring with DataLoader
|
|
143
|
+
def batch_score(
|
|
144
|
+
predictor: Predictor,
|
|
145
|
+
dataset: Dataset,
|
|
146
|
+
output_path: str,
|
|
147
|
+
batch_size: int = 512,
|
|
148
|
+
) -> None:
|
|
149
|
+
loader = DataLoader(dataset, batch_size=batch_size, num_workers=8)
|
|
150
|
+
results = []
|
|
151
|
+
for batch in tqdm(loader):
|
|
152
|
+
with torch.inference_mode():
|
|
153
|
+
predictions = predictor.predict_batch(batch)
|
|
154
|
+
results.extend(predictions)
|
|
155
|
+
pd.DataFrame(results).to_parquet(output_path)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**Adaptive batching** (BentoML; called dynamic batching in Triton): The server accumulates requests for a short window (e.g., 10ms) and processes them as a batch. This improves GPU utilisation dramatically at the cost of a slight latency increase. Recommended for any GPU-accelerated serving endpoint.
|
|
159
|
+
|
|
160
|
+
### A/B Testing
|
|
161
|
+
|
|
162
|
+
A/B testing compares two model versions on real traffic with statistical rigour:
|
|
163
|
+
|
|
164
|
+
**Infrastructure requirements**:
|
|
165
|
+
1. Request router: Directs traffic to model A or B based on user ID hash (not random — ensures consistent experience)
|
|
166
|
+
2. Logging: Both models log predictions with the variant label
|
|
167
|
+
3. Assignment: User assignment is sticky (same user always gets the same variant)
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
# Traffic routing by user_id hash
|
|
171
|
+
def route_request(user_id: str, traffic_split: float = 0.5) -> str:
|
|
172
|
+
"""Returns 'model_a' or 'model_b' deterministically for a given user."""
|
|
173
|
+
hash_value = int(hashlib.md5(user_id.encode()).hexdigest(), 16) % 100
|
|
174
|
+
return "model_b" if hash_value < (traffic_split * 100) else "model_a"
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
**Statistical requirements**:
|
|
178
|
+
- Define primary metric and minimum detectable effect before starting
|
|
179
|
+
- Calculate the required sample size up front (power analysis) so the test has a fixed end point and is not stopped early
|
|
180
|
+
- Typical ML A/B test: 2–4 weeks, 50/50 split, statistical significance at p < 0.05
|
|
181
|
+
- Do not stop early because one variant looks better — repeatedly checking results inflates the Type I (false-positive) error rate unless stopping rules are planned in advance
|
|
182
|
+
|
|
183
|
+
**Guardrail metrics**: In addition to the primary metric, monitor guardrail metrics (latency, error rate, crash rate). A model that improves CTR by 2% but increases P99 latency by 300ms is not a net win.
|
|
184
|
+
|
|
185
|
+
### Canary Deployment
|
|
186
|
+
|
|
187
|
+
Canary deployment is safer than full rollout and different from A/B testing: the goal is operational safety, not measuring business impact.
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
Traffic Distribution During Canary:
|
|
191
|
+
Old model (stable): 95%
|
|
192
|
+
New model (canary): 5%
|
|
193
|
+
|
|
194
|
+
Progress if healthy:
|
|
195
|
+
Old: 80%, New: 20%
|
|
196
|
+
Old: 50%, New: 50%
|
|
197
|
+
Old: 0%, New: 100% ← Full rollout
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
**Automated canary promotion criteria**:
|
|
201
|
+
- Error rate of new model < threshold (e.g., < 0.1%)
|
|
202
|
+
- P99 latency within budget (e.g., < 200ms)
|
|
203
|
+
- No accuracy regression on logged predictions vs. offline eval
|
|
204
|
+
- No alerts triggered in monitoring
|
|
205
|
+
|
|
206
|
+
**Rollback trigger**: If any criterion is breached during the canary period, route 100% of traffic back to the old model and open an incident. Canary rollback should be a one-command operation.
|
|
207
|
+
|
|
208
|
+
### Model Optimisation for Serving
|
|
209
|
+
|
|
210
|
+
Before deploying, optimise the model for serving throughput:
|
|
211
|
+
|
|
212
|
+
**TorchScript** (export to static graph):
|
|
213
|
+
```python
|
|
214
|
+
# Trace-based export (simpler, but only works if the model has no data-dependent control flow)
|
|
215
|
+
scripted_model = torch.jit.trace(model, example_input)
|
|
216
|
+
torch.jit.save(scripted_model, "model_scripted.pt")
|
|
217
|
+
|
|
218
|
+
# Script-based export (handles control flow)
|
|
219
|
+
scripted_model = torch.jit.script(model)
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**ONNX export** (framework-independent):
|
|
223
|
+
```python
|
|
224
|
+
torch.onnx.export(
|
|
225
|
+
model,
|
|
226
|
+
example_input,
|
|
227
|
+
"model.onnx",
|
|
228
|
+
input_names=["input"],
|
|
229
|
+
output_names=["output"],
|
|
230
|
+
dynamic_axes={"input": {0: "batch_size"}}, # Enable variable batch size
|
|
231
|
+
opset_version=17,
|
|
232
|
+
)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
**Quantisation** (reduce model size and inference time):
|
|
236
|
+
- Post-training quantisation (PTQ): Apply after training, minimal accuracy impact for most models
|
|
237
|
+
- Quantisation-aware training (QAT): Simulate quantisation during training, better accuracy for sensitive models
|
|
238
|
+
- INT8 quantisation typically provides 2–4x speedup with < 1% accuracy drop
|
|
239
|
+
|
|
240
|
+
**TensorRT** (NVIDIA, maximum GPU throughput):
|
|
241
|
+
- Optimises ONNX models for specific GPU hardware
|
|
242
|
+
- Applies layer fusion, kernel auto-tuning, precision calibration
|
|
243
|
+
- Provides the highest throughput for NVIDIA GPUs in production
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-testing
|
|
3
|
+
description: Unit tests for data transforms, tolerance-based model tests, pipeline integration tests, and regression tests for ML systems
|
|
4
|
+
topics: [ml, testing, unit-tests, model-tests, pipeline-tests, regression-tests, tdd]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
ML code is tested less rigorously than traditional software because "the model is probabilistic" feels like an excuse for skipping tests. It is not. The vast majority of ML code — data transforms, preprocessing, feature engineering, postprocessing, and serving logic — is deterministic and must be unit tested. The probabilistic parts — model weights and accuracy — require tolerance-based tests and regression baselines. Untested ML pipelines fail silently in ways that are expensive to diagnose in production.
|
|
8
|
+
|
|
9
|
+
## Summary
|
|
10
|
+
|
|
11
|
+
Test ML systems at four levels: unit tests for deterministic components (transforms, metrics, preprocessing), model tests using tolerance-based assertions (output shape, value range, basic accuracy on canonical examples), pipeline tests (end-to-end training and inference on small data), and regression tests (compare new model against production baseline). Use `pytest` with `torch.testing` and `numpy.testing` for numerical assertions. Run tests in CI on every commit.
|
|
12
|
+
|
|
13
|
+
## Deep Guidance
|
|
14
|
+
|
|
15
|
+
### What to Test in ML
|
|
16
|
+
|
|
17
|
+
**Always unit test**:
|
|
18
|
+
- Data loading and preprocessing transforms
|
|
19
|
+
- Feature engineering functions
|
|
20
|
+
- Custom loss functions
|
|
21
|
+
- Metric computation functions
|
|
22
|
+
- Postprocessing logic (thresholding, calibration)
|
|
23
|
+
- Model architecture components (custom layers, attention mechanisms)
|
|
24
|
+
|
|
25
|
+
**Always model test** (tolerance-based):
|
|
26
|
+
- Model output shape matches expected shape
|
|
27
|
+
- Output values are in valid range (probabilities sum to 1, logits are finite)
|
|
28
|
+
- Model forward pass runs without error
|
|
29
|
+
- Model handles edge cases (empty input, max-length input, all-zero input)
|
|
30
|
+
- Basic sanity check: model achieves above-chance accuracy on a canonical small dataset
|
|
31
|
+
|
|
32
|
+
**Always pipeline test**:
|
|
33
|
+
- Full training pipeline runs on a tiny dataset without error
|
|
34
|
+
- Checkpoint save and load produces identical predictions
|
|
35
|
+
- Inference pipeline produces output in the correct format
|
|
36
|
+
|
|
37
|
+
**Always regression test**:
|
|
38
|
+
- New model version's accuracy on held-out test set does not regress beyond a threshold vs. the current production baseline
|
|
39
|
+
|
|
40
|
+
### Unit Tests for Data Transforms
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
# tests/test_transforms.py
|
|
44
|
+
import pytest
|
|
45
|
+
import numpy as np
|
|
46
|
+
import torch
|
|
47
|
+
from src.data.transforms import (
|
|
48
|
+
Normalizer,
|
|
49
|
+
TextTokenizer,
|
|
50
|
+
ImageAugmenter,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
class TestNormalizer:
|
|
54
|
+
def test_zero_mean(self):
|
|
55
|
+
"""Normalized features should have near-zero mean on training data."""
|
|
56
|
+
X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
|
|
57
|
+
norm = Normalizer()
|
|
58
|
+
norm.fit(X)
|
|
59
|
+
X_norm = norm.transform(X)
|
|
60
|
+
np.testing.assert_allclose(X_norm.mean(axis=0), 0.0, atol=1e-6)
|
|
61
|
+
|
|
62
|
+
def test_unit_std(self):
|
|
63
|
+
"""Normalized features should have unit standard deviation."""
|
|
64
|
+
X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
|
|
65
|
+
norm = Normalizer()
|
|
66
|
+
norm.fit(X)
|
|
67
|
+
X_norm = norm.transform(X)
|
|
68
|
+
np.testing.assert_allclose(X_norm.std(axis=0), 1.0, atol=1e-6)
|
|
69
|
+
|
|
70
|
+
def test_transform_without_fit_raises(self):
|
|
71
|
+
"""Transform before fit must raise a clear error."""
|
|
72
|
+
norm = Normalizer()
|
|
73
|
+
with pytest.raises(RuntimeError, match="fit"):
|
|
74
|
+
norm.transform(np.array([[1.0, 2.0]]))
|
|
75
|
+
|
|
76
|
+
def test_inverse_transform_roundtrip(self):
|
|
77
|
+
"""fit + transform + inverse_transform should return original values."""
|
|
78
|
+
X = np.random.rand(100, 5) * 10.0
|
|
79
|
+
norm = Normalizer()
|
|
80
|
+
norm.fit(X)
|
|
81
|
+
X_rt = norm.inverse_transform(norm.transform(X))
|
|
82
|
+
np.testing.assert_allclose(X_rt, X, rtol=1e-5)
|
|
83
|
+
|
|
84
|
+
def test_no_fit_leakage_to_test_data(self):
|
|
85
|
+
"""Test data stats must not affect normalisation parameters."""
|
|
86
|
+
X_train = np.array([[1.0], [2.0], [3.0]])
|
|
87
|
+
X_test = np.array([[100.0], [200.0]]) # Very different distribution
|
|
88
|
+
norm = Normalizer()
|
|
89
|
+
norm.fit(X_train)
|
|
90
|
+
X_test_norm = norm.transform(X_test)
|
|
91
|
+
# Test data should be normalised using TRAINING stats only
|
|
92
|
+
assert np.all(np.abs(X_test_norm) > 1.0) # Large because distribution is different
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TestTextTokenizer:
|
|
96
|
+
def test_output_shape(self):
|
|
97
|
+
"""Tokenizer must produce correct sequence length."""
|
|
98
|
+
tokenizer = TextTokenizer(max_length=128)
|
|
99
|
+
result = tokenizer("Hello world, this is a test.")
|
|
100
|
+
assert result["input_ids"].shape == (128,)
|
|
101
|
+
assert result["attention_mask"].shape == (128,)
|
|
102
|
+
|
|
103
|
+
def test_truncation(self):
|
|
104
|
+
"""Long inputs must be truncated to max_length."""
|
|
105
|
+
tokenizer = TextTokenizer(max_length=8)
|
|
106
|
+
long_text = " ".join(["word"] * 100)
|
|
107
|
+
result = tokenizer(long_text)
|
|
108
|
+
assert result["input_ids"].shape == (8,)
|
|
109
|
+
|
|
110
|
+
def test_empty_input(self):
|
|
111
|
+
"""Empty string must not raise an exception."""
|
|
112
|
+
tokenizer = TextTokenizer(max_length=128)
|
|
113
|
+
result = tokenizer("")
|
|
114
|
+
assert result["input_ids"].shape == (128,)
|
|
115
|
+
# Everything except the special tokens (CLS and/or SEP) should be PAD
|
|
116
|
+
assert result["attention_mask"].sum() <= 2 # Only CLS and/or SEP attended
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Model Tests
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# tests/test_model.py
|
|
123
|
+
import pytest
|
|
124
|
+
import torch
|
|
125
|
+
import torch.nn.functional as F
|
|
126
|
+
from src.models.classifier import TextClassifier
|
|
127
|
+
|
|
128
|
+
@pytest.fixture
|
|
129
|
+
def model():
|
|
130
|
+
return TextClassifier(vocab_size=1000, hidden_dim=64, num_classes=3)
|
|
131
|
+
|
|
132
|
+
@pytest.fixture
|
|
133
|
+
def batch():
|
|
134
|
+
return {
|
|
135
|
+
"input_ids": torch.randint(0, 1000, (4, 128)),
|
|
136
|
+
"attention_mask": torch.ones(4, 128, dtype=torch.long),
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
class TestTextClassifier:
|
|
140
|
+
def test_output_shape(self, model, batch):
|
|
141
|
+
"""Model output shape must match (batch_size, num_classes)."""
|
|
142
|
+
output = model(**batch)
|
|
143
|
+
assert output.shape == (4, 3)
|
|
144
|
+
|
|
145
|
+
def test_output_finite(self, model, batch):
|
|
146
|
+
"""Model output must not contain NaN or Inf."""
|
|
147
|
+
output = model(**batch)
|
|
148
|
+
assert torch.all(torch.isfinite(output)), "Model output contains NaN or Inf"
|
|
149
|
+
|
|
150
|
+
def test_probabilities_sum_to_one(self, model, batch):
|
|
151
|
+
"""Softmax probabilities must sum to 1."""
|
|
152
|
+
logits = model(**batch)
|
|
153
|
+
probs = F.softmax(logits, dim=-1)
|
|
154
|
+
torch.testing.assert_close(
|
|
155
|
+
probs.sum(dim=-1),
|
|
156
|
+
torch.ones(4),
|
|
157
|
+
atol=1e-5,
|
|
158
|
+
rtol=1e-5,
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
def test_different_inputs_different_outputs(self, model):
|
|
162
|
+
"""Different inputs must produce different outputs (model is not constant)."""
|
|
163
|
+
batch_a = {"input_ids": torch.zeros(2, 128, dtype=torch.long),
|
|
164
|
+
"attention_mask": torch.ones(2, 128, dtype=torch.long)}
|
|
165
|
+
batch_b = {"input_ids": torch.ones(2, 128, dtype=torch.long),
|
|
166
|
+
"attention_mask": torch.ones(2, 128, dtype=torch.long)}
|
|
167
|
+
output_a = model(**batch_a)
|
|
168
|
+
output_b = model(**batch_b)
|
|
169
|
+
assert not torch.allclose(output_a, output_b), "Model outputs identical for different inputs"
|
|
170
|
+
|
|
171
|
+
def test_eval_mode_deterministic(self, model, batch):
|
|
172
|
+
"""Same input in eval mode must produce identical outputs (no dropout randomness)."""
|
|
173
|
+
model.eval()
|
|
174
|
+
with torch.no_grad():
|
|
175
|
+
output_1 = model(**batch)
|
|
176
|
+
output_2 = model(**batch)
|
|
177
|
+
torch.testing.assert_close(output_1, output_2)
|
|
178
|
+
|
|
179
|
+
def test_gradient_flows(self, model, batch):
|
|
180
|
+
"""Gradients must flow to all parameters during backward pass."""
|
|
181
|
+
model.train()
|
|
182
|
+
logits = model(**batch)
|
|
183
|
+
loss = logits.sum()
|
|
184
|
+
loss.backward()
|
|
185
|
+
for name, param in model.named_parameters():
|
|
186
|
+
if param.requires_grad:
|
|
187
|
+
assert param.grad is not None, f"No gradient for parameter: {name}"
|
|
188
|
+
assert torch.any(param.grad != 0), f"Zero gradient for parameter: {name}"
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Pipeline Tests
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# tests/test_pipeline.py
|
|
195
|
+
import pytest
|
|
196
|
+
import tempfile
|
|
197
|
+
import os
|
|
198
|
+
from omegaconf import OmegaConf
|
|
199
|
+
from src.training.trainer import Trainer
|
|
200
|
+
|
|
201
|
+
@pytest.fixture
|
|
202
|
+
def tiny_config():
|
|
203
|
+
"""Minimal config for fast pipeline smoke test."""
|
|
204
|
+
return OmegaConf.create({
|
|
205
|
+
"training": {"epochs": 2, "batch_size": 4, "seed": 42},
|
|
206
|
+
"optimizer": {"type": "adam", "lr": 1e-3},
|
|
207
|
+
"data": {"num_samples": 32}, # Tiny dataset
|
|
208
|
+
})
|
|
209
|
+
|
|
210
|
+
class TestTrainingPipeline:
|
|
211
|
+
def test_training_runs_without_error(self, tiny_config, tmp_path):
|
|
212
|
+
"""Full training pipeline must complete without error on tiny data."""
|
|
213
|
+
trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
|
|
214
|
+
result = trainer.fit()
|
|
215
|
+
assert "val_loss" in result
|
|
216
|
+
assert result["val_loss"] < float("inf")
|
|
217
|
+
|
|
218
|
+
def test_checkpoint_saves_and_loads(self, tiny_config, tmp_path):
|
|
219
|
+
"""Checkpoint must be saved and restored with identical predictions."""
|
|
220
|
+
trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
|
|
221
|
+
trainer.fit()
|
|
222
|
+
|
|
223
|
+
checkpoint_path = tmp_path / "best.pt"
|
|
224
|
+
assert checkpoint_path.exists(), "Checkpoint was not saved"
|
|
225
|
+
|
|
226
|
+
# Load checkpoint and verify predictions are identical
|
|
227
|
+
import torch
|
|
228
|
+
from src.models.classifier import TextClassifier
|
|
229
|
+
model_a = trainer.model
|
|
230
|
+
model_b = TextClassifier.from_checkpoint(str(checkpoint_path))
|
|
231
|
+
|
|
232
|
+
test_input = torch.randint(0, 1000, (2, 128))
|
|
233
|
+
model_a.eval()
|
|
234
|
+
model_b.eval()
|
|
235
|
+
with torch.no_grad():
|
|
236
|
+
torch.testing.assert_close(model_a(test_input), model_b(test_input))
|
|
237
|
+
|
|
238
|
+
def test_inference_pipeline_output_format(self, tiny_config, tmp_path):
|
|
239
|
+
"""Inference pipeline must return predictions in expected format."""
|
|
240
|
+
trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
|
|
241
|
+
trainer.fit()
|
|
242
|
+
|
|
243
|
+
from src.serving.predictor import Predictor
|
|
244
|
+
predictor = Predictor(str(tmp_path / "best.pt"))
|
|
245
|
+
result = predictor.predict({"text": "test input"})
|
|
246
|
+
|
|
247
|
+
assert hasattr(result, "prediction")
|
|
248
|
+
assert hasattr(result, "confidence")
|
|
249
|
+
assert 0.0 <= result.confidence <= 1.0
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Regression Tests
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
# tests/test_regression.py
|
|
256
|
+
"""
|
|
257
|
+
Regression tests compare a new model version against the production baseline.
|
|
258
|
+
Run these before promoting any model to staging.
|
|
259
|
+
"""
|
|
260
|
+
import pytest
|
|
261
|
+
import numpy as np
|
|
262
|
+
from src.evaluation.evaluator import evaluate_model
|
|
263
|
+
from src.models.classifier import TextClassifier
|
|
264
|
+
|
|
265
|
+
PRODUCTION_BASELINE = {
|
|
266
|
+
"accuracy": 0.872,
|
|
267
|
+
"f1": 0.864,
|
|
268
|
+
"roc_auc": 0.934,
|
|
269
|
+
}
|
|
270
|
+
REGRESSION_TOLERANCE = 0.02 # Allow up to 2pp regression
|
|
271
|
+
|
|
272
|
+
class TestModelRegression:
|
|
273
|
+
@pytest.fixture(scope="class")
|
|
274
|
+
def candidate_metrics(self, holdout_dataset):
|
|
275
|
+
"""Evaluate the candidate model on the holdout set."""
|
|
276
|
+
model = TextClassifier.from_registry("candidate")
|
|
277
|
+
return evaluate_model(model, holdout_dataset)
|
|
278
|
+
|
|
279
|
+
def test_accuracy_no_regression(self, candidate_metrics):
|
|
280
|
+
threshold = PRODUCTION_BASELINE["accuracy"] - REGRESSION_TOLERANCE
|
|
281
|
+
assert candidate_metrics["accuracy"] >= threshold, (
|
|
282
|
+
f"Accuracy regression: {candidate_metrics['accuracy']:.3f} < {threshold:.3f}"
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
def test_f1_no_regression(self, candidate_metrics):
|
|
286
|
+
threshold = PRODUCTION_BASELINE["f1"] - REGRESSION_TOLERANCE
|
|
287
|
+
assert candidate_metrics["f1"] >= threshold
|
|
288
|
+
|
|
289
|
+
def test_roc_auc_no_regression(self, candidate_metrics):
|
|
290
|
+
threshold = PRODUCTION_BASELINE["roc_auc"] - REGRESSION_TOLERANCE
|
|
291
|
+
assert candidate_metrics["roc_auc"] >= threshold
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Testing Best Practices for ML
|
|
295
|
+
|
|
296
|
+
- **Test data must not touch training data**: Use a separate fixture dataset for tests, not samples from the training set
|
|
297
|
+
- **Tests must be fast**: Unit and model tests must run in < 10 seconds total; use tiny models and tiny data
|
|
298
|
+
- **Parametrize for edge cases**: Use `@pytest.mark.parametrize` to test multiple input types (empty, max-length, all-zeros, all-ones)
|
|
299
|
+
- **Numerical precision**: Use `rtol`/`atol` tolerances in `numpy.testing.assert_allclose` and `torch.testing.assert_close` — never use `==` for floats
|
|
300
|
+
- **Mock heavy dependencies**: Mock database connections, S3 calls, and MLflow logging in unit tests — tests must not require external services to run
|
|
301
|
+
- **CI enforcement**: Run `pytest tests/` in CI on every commit; block PRs that break tests
|