@zigrivers/scaffold 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/README.md +113 -8
  2. package/content/knowledge/browser-extension/browser-extension-architecture.md +195 -0
  3. package/content/knowledge/browser-extension/browser-extension-content-scripts.md +264 -0
  4. package/content/knowledge/browser-extension/browser-extension-conventions.md +156 -0
  5. package/content/knowledge/browser-extension/browser-extension-cross-browser.md +229 -0
  6. package/content/knowledge/browser-extension/browser-extension-dev-environment.md +247 -0
  7. package/content/knowledge/browser-extension/browser-extension-manifest.md +220 -0
  8. package/content/knowledge/browser-extension/browser-extension-project-structure.md +183 -0
  9. package/content/knowledge/browser-extension/browser-extension-requirements.md +107 -0
  10. package/content/knowledge/browser-extension/browser-extension-security.md +202 -0
  11. package/content/knowledge/browser-extension/browser-extension-service-workers.md +265 -0
  12. package/content/knowledge/browser-extension/browser-extension-store-submission.md +155 -0
  13. package/content/knowledge/browser-extension/browser-extension-testing.md +270 -0
  14. package/content/knowledge/data-pipeline/data-pipeline-architecture.md +175 -0
  15. package/content/knowledge/data-pipeline/data-pipeline-batch-patterns.md +263 -0
  16. package/content/knowledge/data-pipeline/data-pipeline-conventions.md +176 -0
  17. package/content/knowledge/data-pipeline/data-pipeline-dev-environment.md +350 -0
  18. package/content/knowledge/data-pipeline/data-pipeline-orchestration.md +291 -0
  19. package/content/knowledge/data-pipeline/data-pipeline-project-structure.md +257 -0
  20. package/content/knowledge/data-pipeline/data-pipeline-quality.md +324 -0
  21. package/content/knowledge/data-pipeline/data-pipeline-requirements.md +145 -0
  22. package/content/knowledge/data-pipeline/data-pipeline-schema-management.md +295 -0
  23. package/content/knowledge/data-pipeline/data-pipeline-security.md +326 -0
  24. package/content/knowledge/data-pipeline/data-pipeline-streaming-patterns.md +280 -0
  25. package/content/knowledge/data-pipeline/data-pipeline-testing.md +406 -0
  26. package/content/knowledge/library/library-api-design.md +306 -0
  27. package/content/knowledge/library/library-architecture.md +247 -0
  28. package/content/knowledge/library/library-bundling.md +244 -0
  29. package/content/knowledge/library/library-conventions.md +229 -0
  30. package/content/knowledge/library/library-dev-environment.md +220 -0
  31. package/content/knowledge/library/library-documentation.md +300 -0
  32. package/content/knowledge/library/library-project-structure.md +237 -0
  33. package/content/knowledge/library/library-requirements.md +173 -0
  34. package/content/knowledge/library/library-security.md +257 -0
  35. package/content/knowledge/library/library-testing.md +319 -0
  36. package/content/knowledge/library/library-type-definitions.md +284 -0
  37. package/content/knowledge/library/library-versioning.md +300 -0
  38. package/content/knowledge/ml/ml-architecture.md +172 -0
  39. package/content/knowledge/ml/ml-conventions.md +209 -0
  40. package/content/knowledge/ml/ml-dev-environment.md +299 -0
  41. package/content/knowledge/ml/ml-experiment-tracking.md +285 -0
  42. package/content/knowledge/ml/ml-model-evaluation.md +256 -0
  43. package/content/knowledge/ml/ml-observability.md +253 -0
  44. package/content/knowledge/ml/ml-project-structure.md +216 -0
  45. package/content/knowledge/ml/ml-requirements.md +138 -0
  46. package/content/knowledge/ml/ml-security.md +188 -0
  47. package/content/knowledge/ml/ml-serving-patterns.md +243 -0
  48. package/content/knowledge/ml/ml-testing.md +301 -0
  49. package/content/knowledge/ml/ml-training-patterns.md +269 -0
  50. package/content/knowledge/mobile-app/mobile-app-architecture.md +283 -0
  51. package/content/knowledge/mobile-app/mobile-app-conventions.md +180 -0
  52. package/content/knowledge/mobile-app/mobile-app-deployment.md +298 -0
  53. package/content/knowledge/mobile-app/mobile-app-dev-environment.md +257 -0
  54. package/content/knowledge/mobile-app/mobile-app-distribution.md +264 -0
  55. package/content/knowledge/mobile-app/mobile-app-observability.md +317 -0
  56. package/content/knowledge/mobile-app/mobile-app-offline-patterns.md +311 -0
  57. package/content/knowledge/mobile-app/mobile-app-project-structure.md +245 -0
  58. package/content/knowledge/mobile-app/mobile-app-push-notifications.md +321 -0
  59. package/content/knowledge/mobile-app/mobile-app-requirements.md +147 -0
  60. package/content/knowledge/mobile-app/mobile-app-security.md +338 -0
  61. package/content/knowledge/mobile-app/mobile-app-testing.md +400 -0
  62. package/content/methodology/browser-extension-overlay.yml +82 -0
  63. package/content/methodology/data-pipeline-overlay.yml +70 -0
  64. package/content/methodology/library-overlay.yml +67 -0
  65. package/content/methodology/ml-overlay.yml +70 -0
  66. package/content/methodology/mobile-app-overlay.yml +71 -0
  67. package/dist/cli/commands/init.d.ts +22 -0
  68. package/dist/cli/commands/init.d.ts.map +1 -1
  69. package/dist/cli/commands/init.js +202 -3
  70. package/dist/cli/commands/init.js.map +1 -1
  71. package/dist/cli/commands/init.test.js +190 -0
  72. package/dist/cli/commands/init.test.js.map +1 -1
  73. package/dist/config/schema.d.ts +1456 -80
  74. package/dist/config/schema.d.ts.map +1 -1
  75. package/dist/config/schema.js +87 -0
  76. package/dist/config/schema.js.map +1 -1
  77. package/dist/config/schema.test.js +312 -3
  78. package/dist/config/schema.test.js.map +1 -1
  79. package/dist/core/assembly/overlay-loader.test.js +55 -0
  80. package/dist/core/assembly/overlay-loader.test.js.map +1 -1
  81. package/dist/e2e/project-type-overlays.test.d.ts +2 -1
  82. package/dist/e2e/project-type-overlays.test.d.ts.map +1 -1
  83. package/dist/e2e/project-type-overlays.test.js +780 -14
  84. package/dist/e2e/project-type-overlays.test.js.map +1 -1
  85. package/dist/types/config.d.ts +16 -1
  86. package/dist/types/config.d.ts.map +1 -1
  87. package/dist/wizard/questions.d.ts +28 -1
  88. package/dist/wizard/questions.d.ts.map +1 -1
  89. package/dist/wizard/questions.js +127 -1
  90. package/dist/wizard/questions.js.map +1 -1
  91. package/dist/wizard/questions.test.js +224 -4
  92. package/dist/wizard/questions.test.js.map +1 -1
  93. package/dist/wizard/wizard.d.ts +22 -0
  94. package/dist/wizard/wizard.d.ts.map +1 -1
  95. package/dist/wizard/wizard.js +28 -1
  96. package/dist/wizard/wizard.js.map +1 -1
  97. package/package.json +1 -1
@@ -0,0 +1,243 @@
1
+ ---
2
+ name: ml-serving-patterns
3
+ description: Model serving with TorchServe, Triton, and BentoML; batch vs realtime inference patterns; A/B testing and canary deployment strategies
4
+ topics: [ml, serving, torchserve, triton, bentoml, inference, ab-testing, canary, deployment]
5
+ ---
6
+
7
+ Model serving is where ML meets production software engineering. A model that performs well in a notebook is worthless if it cannot serve predictions reliably at scale. Serving patterns address the gap between "it works on my machine" and "it handles 10,000 requests per second with P99 < 100ms and zero data races." The serving layer must be treated with the same engineering rigour as any production microservice.
8
+
9
+ ## Summary
10
+
11
+ Choose a model server based on the use case: TorchServe for PyTorch models with custom handlers, Triton for high-throughput multi-framework serving, BentoML for Python-native flexible deployment. Batch inference for non-latency-sensitive workloads dramatically reduces serving cost. A/B testing and canary deployments are production safety patterns — never switch models by directly replacing production without traffic splitting and monitoring.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### Choosing a Model Server
16
+
17
+ **TorchServe** (Meta / PyTorch ecosystem):
18
+ - Purpose-built for PyTorch models
19
+ - Supports custom preprocessing/postprocessing handlers in Python
20
+ - REST and gRPC APIs out of the box
21
+ - Model archiving format (`.mar`) bundles weights + handler + config
22
+ - Best for: PyTorch models with complex Python preprocessing, teams already in the PyTorch ecosystem
23
+
24
+ ```bash
25
+ # Package a model for TorchServe
26
+ torch-model-archiver \
27
+ --model-name resnet50 \
28
+ --version 1.0 \
29
+ --model-file src/models/resnet50.py \
30
+ --serialized-file models/registry/v1.0/model.pt \
31
+ --handler src/serving/handler.py \
32
+ --export-path model_store/
33
+
34
+ # Start server
35
+ torchserve --start --model-store model_store/ --models resnet50=resnet50.mar
36
+ ```
37
+
38
+ **Triton Inference Server** (NVIDIA):
39
+ - Supports TensorFlow, PyTorch (TorchScript), ONNX, TensorRT, and Python backends
40
+ - Dynamic batching: automatically groups requests to maximise GPU utilisation
41
+ - Model ensemble: chain multiple models in a single request (preprocessing → model → postprocessing)
42
+ - Best for: high-throughput serving, GPU-accelerated inference, heterogeneous model zoo, teams optimising for throughput
43
+
44
+ ```
45
+ models/
46
+ ├── resnet50/
47
+ │ ├── config.pbtxt # Model configuration
48
+ │ └── 1/
49
+ │ └── model.onnx # Model weights
50
+ ```
51
+
52
+ **BentoML** (flexible, Python-native):
53
+ - Define serving logic in pure Python with decorators
54
+ - Packages model + dependencies + serving code into a single `Bento` (a versioned archive that can be built into an OCI container image)
55
+ - Supports batch inference, adaptive batching, and multiple runners
56
+ - Best for: rapid prototyping to production, custom serving logic, teams that want framework flexibility
57
+
58
+ ```python
59
+ import bentoml
60
+
61
+ @bentoml.service(
62
+ resources={"gpu": 1},
63
+ traffic={"timeout": 30},
64
+ )
65
+ class TextClassifier:
66
+ model = bentoml.models.get("sentiment-classifier:latest")
67
+
68
+ def __init__(self):
69
+ self.runner = self.model.to_runner()
70
+
71
+ @bentoml.api
72
+ def classify(self, text: str) -> dict:
73
+ return self.runner.predict.run(text)
74
+ ```
75
+
76
+ ### Predictor Interface Pattern
77
+
78
+ Regardless of the serving framework, define a clean `Predictor` interface:
79
+
80
+ ```python
81
+ # src/serving/predictor.py
82
+ from dataclasses import dataclass
83
+ from typing import Any
84
+ import torch
85
+ import numpy as np
86
+
87
+ @dataclass
88
+ class PredictionResult:
89
+ prediction: Any
90
+ confidence: float
91
+ model_version: str
92
+
93
+ class Predictor:
94
+ """Single-responsibility class for model inference."""
95
+
96
+ def __init__(self, model_path: str, device: str = "cuda") -> None:
97
+ self.device = torch.device(device)
98
+ self.model = self._load_model(model_path)
99
+ self.model.eval()
100
+ self.model_version = self._read_version(model_path)
101
+ self.preprocessor = InferencePreprocessor() # Same as eval transforms
102
+
103
+ def predict(self, raw_input: dict) -> PredictionResult:
104
+ features = self.preprocessor.transform(raw_input)
105
+ tensor = torch.tensor(features).unsqueeze(0).to(self.device)
106
+ with torch.inference_mode():
107
+ logits = self.model(tensor)
108
+ probs = torch.softmax(logits, dim=-1)
109
+ confidence, pred_idx = probs.max(dim=-1)
110
+ return PredictionResult(
111
+ prediction=pred_idx.item(),
112
+ confidence=confidence.item(),
113
+ model_version=self.model_version,
114
+ )
115
+
116
+ def predict_batch(self, inputs: list[dict]) -> list[PredictionResult]:
117
+ """Batched inference — more efficient than looping predict()."""
118
+ features = [self.preprocessor.transform(x) for x in inputs]
119
+ batch = torch.tensor(np.stack(features)).to(self.device)
120
+ with torch.inference_mode():
121
+ logits = self.model(batch)
122
+ probs = torch.softmax(logits, dim=-1)
123
+ confidences, pred_idxs = probs.max(dim=-1)
124
+ return [
125
+ PredictionResult(p.item(), c.item(), self.model_version)
126
+ for p, c in zip(pred_idxs, confidences)
127
+ ]
128
+ ```
129
+
130
+ **Critical**: The `InferencePreprocessor` must be identical to the eval-time preprocessing used during training. A different implementation is the root cause of training-serving skew.
131
+
132
+ ### Batch vs. Real-time Inference
133
+
134
+ **Real-time inference** handles individual requests with strict latency constraints:
135
+ - Use `torch.inference_mode()` (not `torch.no_grad()`) — faster because it also skips autograd view tracking and version-counter bumps
136
+ - Keep model in memory; avoid loading per request
137
+ - Use dynamic batching if your server supports it (groups simultaneous requests)
138
+ - Optimise with TorchScript, ONNX export, or TensorRT for maximum throughput
139
+
140
+ **Batch inference** processes large datasets offline:
141
+ ```python
142
+ # Efficient batch scoring with DataLoader
143
+ def batch_score(
144
+ predictor: Predictor,
145
+ dataset: Dataset,
146
+ output_path: str,
147
+ batch_size: int = 512,
148
+ ) -> None:
149
+ loader = DataLoader(dataset, batch_size=batch_size, num_workers=8)
150
+ results = []
151
+ for batch in tqdm(loader):
152
+ with torch.inference_mode():
153
+ predictions = predictor.predict_batch(batch)
154
+ results.extend(predictions)
155
+ pd.DataFrame(results).to_parquet(output_path)
156
+ ```
157
+
158
+ **Adaptive batching** (called "dynamic batching" in Triton, "adaptive batching" in BentoML): The server accumulates requests for a short window (e.g., 10ms) and processes them as a batch. This improves GPU utilisation dramatically at the cost of a slight latency increase. Recommended for any GPU-accelerated serving endpoint.
159
+
160
+ ### A/B Testing
161
+
162
+ A/B testing compares two model versions on real traffic with statistical rigour:
163
+
164
+ **Infrastructure requirements**:
165
+ 1. Request router: Directs traffic to model A or B based on user ID hash (not random — ensures consistent experience)
166
+ 2. Logging: Both models log predictions with the variant label
167
+ 3. Assignment: User assignment is sticky (same user always gets the same variant)
168
+
169
+ ```python
170
+ # Traffic routing by user_id hash
171
+ def route_request(user_id: str, traffic_split: float = 0.5) -> str:
172
+ """Returns 'model_a' or 'model_b' deterministically for a given user."""
173
+ hash_value = int(hashlib.md5(user_id.encode()).hexdigest(), 16) % 100
174
+ return "model_b" if hash_value < (traffic_split * 100) else "model_a"
175
+ ```
176
+
177
+ **Statistical requirements**:
178
+ - Define primary metric and minimum detectable effect before starting
179
+ - Calculate the required sample size up front (power analysis) and run the test until it is reached
180
+ - Typical ML A/B test: 2–4 weeks, 50/50 split, statistical significance at p < 0.05
181
+ - Do not stop early because one variant looks better — repeatedly checking results inflates the Type I (false-positive) error rate unless stopping rules are pre-planned
182
+
183
+ **Guardrail metrics**: In addition to the primary metric, monitor guardrail metrics (latency, error rate, crash rate). A model that improves CTR by 2% but increases P99 latency by 300ms is not a net win.
184
+
185
+ ### Canary Deployment
186
+
187
+ Canary deployment is safer than full rollout and different from A/B testing: the goal is operational safety, not measuring business impact.
188
+
189
+ ```
190
+ Traffic Distribution During Canary:
191
+ Old model (stable): 95%
192
+ New model (canary): 5%
193
+
194
+ Progress if healthy:
195
+ Old: 80%, New: 20%
196
+ Old: 50%, New: 50%
197
+ Old: 0%, New: 100% ← Full rollout
198
+ ```
199
+
200
+ **Automated canary promotion criteria**:
201
+ - Error rate of new model < threshold (e.g., < 0.1%)
202
+ - P99 latency within budget (e.g., < 200ms)
203
+ - No accuracy regression on logged predictions vs. offline eval
204
+ - No alerts triggered in monitoring
205
+
206
+ **Rollback trigger**: If any criterion is breached during the canary period, route 100% of traffic back to the old model and open an incident. Canary rollback should be a one-command operation.
207
+
208
+ ### Model Optimisation for Serving
209
+
210
+ Before deploying, optimise the model for serving throughput:
211
+
212
+ **TorchScript** (export to static graph):
213
+ ```python
214
+ # Trace-based export (simpler, but only correct if the model has no data-dependent control flow)
215
+ scripted_model = torch.jit.trace(model, example_input)
216
+ torch.jit.save(scripted_model, "model_scripted.pt")
217
+
218
+ # Script-based export (handles control flow)
219
+ scripted_model = torch.jit.script(model)
220
+ ```
221
+
222
+ **ONNX export** (framework-independent):
223
+ ```python
224
+ torch.onnx.export(
225
+ model,
226
+ example_input,
227
+ "model.onnx",
228
+ input_names=["input"],
229
+ output_names=["output"],
230
+ dynamic_axes={"input": {0: "batch_size"}}, # Enable variable batch size
231
+ opset_version=17,
232
+ )
233
+ ```
234
+
235
+ **Quantisation** (reduce model size and inference time):
236
+ - Post-training quantisation (PTQ): Apply after training, minimal accuracy impact for most models
237
+ - Quantisation-aware training (QAT): Simulate quantisation during training, better accuracy for sensitive models
238
+ - INT8 quantisation typically provides 2–4x speedup with < 1% accuracy drop
239
+
240
+ **TensorRT** (NVIDIA, maximum GPU throughput):
241
+ - Optimises ONNX models for specific GPU hardware
242
+ - Applies layer fusion, kernel auto-tuning, precision calibration
243
+ - Provides the highest throughput for NVIDIA GPUs in production
@@ -0,0 +1,301 @@
1
+ ---
2
+ name: ml-testing
3
+ description: Unit tests for data transforms, tolerance-based model tests, pipeline integration tests, and regression tests for ML systems
4
+ topics: [ml, testing, unit-tests, model-tests, pipeline-tests, regression-tests, tdd]
5
+ ---
6
+
7
+ ML code is tested less rigorously than traditional software because "the model is probabilistic" feels like an excuse for skipping tests. It is not. The vast majority of ML code — data transforms, preprocessing, feature engineering, postprocessing, and serving logic — is deterministic and must be unit tested. The probabilistic parts — model weights and accuracy — require tolerance-based tests and regression baselines. Untested ML pipelines fail silently in ways that are expensive to diagnose in production.
8
+
9
+ ## Summary
10
+
11
+ Test ML systems at four levels: unit tests for deterministic components (transforms, metrics, preprocessing), model tests using tolerance-based assertions (output shape, value range, basic accuracy on canonical examples), pipeline tests (end-to-end training and inference on small data), and regression tests (compare new model against production baseline). Use `pytest` with `torch.testing` and `numpy.testing` for numerical assertions. Run tests in CI on every commit.
12
+
13
+ ## Deep Guidance
14
+
15
+ ### What to Test in ML
16
+
17
+ **Always unit test**:
18
+ - Data loading and preprocessing transforms
19
+ - Feature engineering functions
20
+ - Custom loss functions
21
+ - Metric computation functions
22
+ - Postprocessing logic (thresholding, calibration)
23
+ - Model architecture components (custom layers, attention mechanisms)
24
+
25
+ **Always model test** (tolerance-based):
26
+ - Model output shape matches expected shape
27
+ - Output values are in valid range (probabilities sum to 1, logits are finite)
28
+ - Model forward pass runs without error
29
+ - Model handles edge cases (empty input, max-length input, all-zero input)
30
+ - Basic sanity check: model achieves above-chance accuracy on a canonical small dataset
31
+
32
+ **Always pipeline test**:
33
+ - Full training pipeline runs on a tiny dataset without error
34
+ - Checkpoint save and load produces identical predictions
35
+ - Inference pipeline produces output in the correct format
36
+
37
+ **Always regression test**:
38
+ - New model version's accuracy on held-out test set does not regress beyond a threshold vs. the current production baseline
39
+
40
+ ### Unit Tests for Data Transforms
41
+
42
+ ```python
43
+ # tests/test_transforms.py
44
+ import pytest
45
+ import numpy as np
46
+ import torch
47
+ from src.data.transforms import (
48
+ Normalizer,
49
+ TextTokenizer,
50
+ ImageAugmenter,
51
+ )
52
+
53
+ class TestNormalizer:
54
+ def test_zero_mean(self):
55
+ """Normalized features should have near-zero mean on training data."""
56
+ X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
57
+ norm = Normalizer()
58
+ norm.fit(X)
59
+ X_norm = norm.transform(X)
60
+ np.testing.assert_allclose(X_norm.mean(axis=0), 0.0, atol=1e-6)
61
+
62
+ def test_unit_std(self):
63
+ """Normalized features should have unit standard deviation."""
64
+ X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
65
+ norm = Normalizer()
66
+ norm.fit(X)
67
+ X_norm = norm.transform(X)
68
+ np.testing.assert_allclose(X_norm.std(axis=0), 1.0, atol=1e-6)
69
+
70
+ def test_transform_without_fit_raises(self):
71
+ """Transform before fit must raise a clear error."""
72
+ norm = Normalizer()
73
+ with pytest.raises(RuntimeError, match="fit"):
74
+ norm.transform(np.array([[1.0, 2.0]]))
75
+
76
+ def test_inverse_transform_roundtrip(self):
77
+ """fit + transform + inverse_transform should return original values."""
78
+ X = np.random.rand(100, 5) * 10.0
79
+ norm = Normalizer()
80
+ norm.fit(X)
81
+ X_rt = norm.inverse_transform(norm.transform(X))
82
+ np.testing.assert_allclose(X_rt, X, rtol=1e-5)
83
+
84
+ def test_no_fit_leakage_to_test_data(self):
85
+ """Test data stats must not affect normalisation parameters."""
86
+ X_train = np.array([[1.0], [2.0], [3.0]])
87
+ X_test = np.array([[100.0], [200.0]]) # Very different distribution
88
+ norm = Normalizer()
89
+ norm.fit(X_train)
90
+ X_test_norm = norm.transform(X_test)
91
+ # Test data should be normalised using TRAINING stats only
92
+ assert np.all(np.abs(X_test_norm) > 1.0) # Large because distribution is different
93
+
94
+
95
+ class TestTextTokenizer:
96
+ def test_output_shape(self):
97
+ """Tokenizer must produce correct sequence length."""
98
+ tokenizer = TextTokenizer(max_length=128)
99
+ result = tokenizer("Hello world, this is a test.")
100
+ assert result["input_ids"].shape == (128,)
101
+ assert result["attention_mask"].shape == (128,)
102
+
103
+ def test_truncation(self):
104
+ """Long inputs must be truncated to max_length."""
105
+ tokenizer = TextTokenizer(max_length=8)
106
+ long_text = " ".join(["word"] * 100)
107
+ result = tokenizer(long_text)
108
+ assert result["input_ids"].shape == (8,)
109
+
110
+ def test_empty_input(self):
111
+ """Empty string must not raise an exception."""
112
+ tokenizer = TextTokenizer(max_length=128)
113
+ result = tokenizer("")
114
+ assert result["input_ids"].shape == (128,)
115
+ # No real tokens: everything except the special tokens should be PAD (unattended)
116
+ assert result["attention_mask"].sum() <= 2 # Only CLS and/or SEP attended
117
+ ```
118
+
119
+ ### Model Tests
120
+
121
+ ```python
122
+ # tests/test_model.py
123
+ import pytest
124
+ import torch
125
+ import torch.nn.functional as F
126
+ from src.models.classifier import TextClassifier
127
+
128
+ @pytest.fixture
129
+ def model():
130
+ return TextClassifier(vocab_size=1000, hidden_dim=64, num_classes=3)
131
+
132
+ @pytest.fixture
133
+ def batch():
134
+ return {
135
+ "input_ids": torch.randint(0, 1000, (4, 128)),
136
+ "attention_mask": torch.ones(4, 128, dtype=torch.long),
137
+ }
138
+
139
+ class TestTextClassifier:
140
+ def test_output_shape(self, model, batch):
141
+ """Model output shape must match (batch_size, num_classes)."""
142
+ output = model(**batch)
143
+ assert output.shape == (4, 3)
144
+
145
+ def test_output_finite(self, model, batch):
146
+ """Model output must not contain NaN or Inf."""
147
+ output = model(**batch)
148
+ assert torch.all(torch.isfinite(output)), "Model output contains NaN or Inf"
149
+
150
+ def test_probabilities_sum_to_one(self, model, batch):
151
+ """Softmax probabilities must sum to 1."""
152
+ logits = model(**batch)
153
+ probs = F.softmax(logits, dim=-1)
154
+ torch.testing.assert_close(
155
+ probs.sum(dim=-1),
156
+ torch.ones(4),
157
+ atol=1e-5,
158
+ rtol=1e-5,
159
+ )
160
+
161
+ def test_different_inputs_different_outputs(self, model):
162
+ """Different inputs must produce different outputs (model is not constant)."""
163
+ batch_a = {"input_ids": torch.zeros(2, 128, dtype=torch.long),
164
+ "attention_mask": torch.ones(2, 128, dtype=torch.long)}
165
+ batch_b = {"input_ids": torch.ones(2, 128, dtype=torch.long),
166
+ "attention_mask": torch.ones(2, 128, dtype=torch.long)}
167
+ output_a = model(**batch_a)
168
+ output_b = model(**batch_b)
169
+ assert not torch.allclose(output_a, output_b), "Model outputs identical for different inputs"
170
+
171
+ def test_eval_mode_deterministic(self, model, batch):
172
+ """Same input in eval mode must produce identical outputs (no dropout randomness)."""
173
+ model.eval()
174
+ with torch.no_grad():
175
+ output_1 = model(**batch)
176
+ output_2 = model(**batch)
177
+ torch.testing.assert_close(output_1, output_2)
178
+
179
+ def test_gradient_flows(self, model, batch):
180
+ """Gradients must flow to all parameters during backward pass."""
181
+ model.train()
182
+ logits = model(**batch)
183
+ loss = logits.sum()
184
+ loss.backward()
185
+ for name, param in model.named_parameters():
186
+ if param.requires_grad:
187
+ assert param.grad is not None, f"No gradient for parameter: {name}"
188
+ assert torch.any(param.grad != 0), f"Zero gradient for parameter: {name}"
189
+ ```
190
+
191
+ ### Pipeline Tests
192
+
193
+ ```python
194
+ # tests/test_pipeline.py
195
+ import pytest
196
+ import tempfile
197
+ import os
198
+ from omegaconf import OmegaConf
199
+ from src.training.trainer import Trainer
200
+
201
+ @pytest.fixture
202
+ def tiny_config():
203
+ """Minimal config for fast pipeline smoke test."""
204
+ return OmegaConf.create({
205
+ "training": {"epochs": 2, "batch_size": 4, "seed": 42},
206
+ "optimizer": {"type": "adam", "lr": 1e-3},
207
+ "data": {"num_samples": 32}, # Tiny dataset
208
+ })
209
+
210
+ class TestTrainingPipeline:
211
+ def test_training_runs_without_error(self, tiny_config, tmp_path):
212
+ """Full training pipeline must complete without error on tiny data."""
213
+ trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
214
+ result = trainer.fit()
215
+ assert "val_loss" in result
216
+ assert result["val_loss"] < float("inf")
217
+
218
+ def test_checkpoint_saves_and_loads(self, tiny_config, tmp_path):
219
+ """Checkpoint must be saved and restored with identical predictions."""
220
+ trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
221
+ trainer.fit()
222
+
223
+ checkpoint_path = tmp_path / "best.pt"
224
+ assert checkpoint_path.exists(), "Checkpoint was not saved"
225
+
226
+ # Load checkpoint and verify predictions are identical
227
+ import torch
228
+ from src.models.classifier import TextClassifier
229
+ model_a = trainer.model
230
+ model_b = TextClassifier.from_checkpoint(str(checkpoint_path))
231
+
232
+ test_input = torch.randint(0, 1000, (2, 128))
233
+ model_a.eval()
234
+ model_b.eval()
235
+ with torch.no_grad():
236
+ torch.testing.assert_close(model_a(test_input), model_b(test_input))
237
+
238
+ def test_inference_pipeline_output_format(self, tiny_config, tmp_path):
239
+ """Inference pipeline must return predictions in expected format."""
240
+ trainer = Trainer(cfg=tiny_config, output_dir=str(tmp_path))
241
+ trainer.fit()
242
+
243
+ from src.serving.predictor import Predictor
244
+ predictor = Predictor(str(tmp_path / "best.pt"))
245
+ result = predictor.predict({"text": "test input"})
246
+
247
+ assert hasattr(result, "prediction")
248
+ assert hasattr(result, "confidence")
249
+ assert 0.0 <= result.confidence <= 1.0
250
+ ```
251
+
252
+ ### Regression Tests
253
+
254
+ ```python
255
+ # tests/test_regression.py
256
+ """
257
+ Regression tests compare a new model version against the production baseline.
258
+ Run these before promoting any model to staging.
259
+ """
260
+ import pytest
261
+ import numpy as np
262
+ from src.evaluation.evaluator import evaluate_model
263
+ from src.models.classifier import TextClassifier
264
+
265
+ PRODUCTION_BASELINE = {
266
+ "accuracy": 0.872,
267
+ "f1": 0.864,
268
+ "roc_auc": 0.934,
269
+ }
270
+ REGRESSION_TOLERANCE = 0.02 # Allow up to 2pp regression
271
+
272
+ class TestModelRegression:
273
+ @pytest.fixture(scope="class")
274
+ def candidate_metrics(self, holdout_dataset):
275
+ """Evaluate the candidate model on the holdout set."""
276
+ model = TextClassifier.from_registry("candidate")
277
+ return evaluate_model(model, holdout_dataset)
278
+
279
+ def test_accuracy_no_regression(self, candidate_metrics):
280
+ threshold = PRODUCTION_BASELINE["accuracy"] - REGRESSION_TOLERANCE
281
+ assert candidate_metrics["accuracy"] >= threshold, (
282
+ f"Accuracy regression: {candidate_metrics['accuracy']:.3f} < {threshold:.3f}"
283
+ )
284
+
285
+ def test_f1_no_regression(self, candidate_metrics):
286
+ threshold = PRODUCTION_BASELINE["f1"] - REGRESSION_TOLERANCE
287
+ assert candidate_metrics["f1"] >= threshold
288
+
289
+ def test_roc_auc_no_regression(self, candidate_metrics):
290
+ threshold = PRODUCTION_BASELINE["roc_auc"] - REGRESSION_TOLERANCE
291
+ assert candidate_metrics["roc_auc"] >= threshold
292
+ ```
293
+
294
+ ### Testing Best Practices for ML
295
+
296
+ - **Test data must not touch training data**: Use a separate fixture dataset for tests, not samples from the training set
297
+ - **Tests must be fast**: Unit and model tests must run in < 10 seconds total; use tiny models and tiny data
298
+ - **Parametrize for edge cases**: Use `@pytest.mark.parametrize` to test multiple input types (empty, max-length, all-zeros, all-ones)
299
+ - **Numerical precision**: Use `rtol`/`atol` tolerances in `numpy.testing.assert_allclose` and `torch.testing.assert_close` — never use `==` for floats
300
+ - **Mock heavy dependencies**: Mock database connections, S3 calls, and MLflow logging in unit tests — tests must not require external services to run
301
+ - **CI enforcement**: Run `pytest tests/` in CI on every commit; block PRs that break tests