agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,678 @@
1
+ # ML Testing
2
+
3
+ Guidelines for testing machine learning systems, including unit tests, model behavior tests, integration tests, and evaluation strategies.
4
+
5
+ ## Testing Philosophy
6
+
7
+ ### ML-Specific Testing Challenges
8
+
9
+ Traditional software testing verifies exact outputs. ML testing must handle:
10
+
11
+ | Challenge | Traditional Software | ML Systems |
12
+ |-----------|---------------------|------------|
13
+ | Correctness | Exact output match | Statistical properties |
14
+ | Determinism | Always reproducible | Random seeds, GPU non-determinism |
15
+ | Edge cases | Known boundary conditions | Distribution tails |
16
+ | Regression | Feature breaks | Performance degradation |
17
+
18
+ ### Testing Pyramid for ML
19
+
20
+ ```
21
+ ┌─────────────┐
22
+ │ E2E/A/B │ Slow, expensive, high confidence
23
+ │ Tests │
24
+ ┌┴─────────────┴┐
25
+ │ Integration │ Pipeline tests, API tests
26
+ │ Tests │
27
+ ┌┴───────────────┴┐
28
+ │ Model Tests │ Behavior, fairness, robustness
29
+ │ │
30
+ ┌┴──────────────────┴┐
31
+ │ Unit Tests │ Fast, isolated, comprehensive
32
+ └────────────────────┘
33
+ ```
34
+
35
+ ## Unit Tests
36
+
37
+ ### Data Validation Tests
38
+
39
+ ```python
40
+ import pytest
41
+ import pandas as pd
42
+ import numpy as np
43
+ import pandera as pa
+ from src.data.validators import TrainingDataSchema
44
+ from src.data.transforms import FeatureTransformer, FeatureConfig
45
+
46
+ class TestDataValidation:
47
+ """Test data validation schemas."""
48
+
49
+ def test_valid_data_passes(self):
50
+ """Valid data should pass validation."""
51
+ df = pd.DataFrame({
52
+ "user_id": ["u1", "u2", "u3"],
53
+ "feature_1": [0.5, 0.7, 0.3],
54
+ "feature_2": [1.0, 2.0, 3.0],
55
+ "label": [0, 1, 0],
56
+ })
57
+
58
+ validated = TrainingDataSchema.validate(df)
59
+ assert len(validated) == 3
60
+
61
+ def test_missing_column_fails(self):
62
+ """Missing required column should fail."""
63
+ df = pd.DataFrame({
64
+ "user_id": ["u1"],
65
+ "feature_1": [0.5],
66
+ # Missing feature_2 and label
67
+ })
68
+
69
+ with pytest.raises(pa.errors.SchemaError) as exc_info:
70
+ TrainingDataSchema.validate(df)
71
+
72
+ assert "feature_2" in str(exc_info.value) or "label" in str(exc_info.value)
73
+
74
+ def test_invalid_range_fails(self):
75
+ """Value out of range should fail."""
76
+ df = pd.DataFrame({
77
+ "user_id": ["u1"],
78
+ "feature_1": [1.5], # Out of range [0, 1]
79
+ "feature_2": [1.0],
80
+ "label": [0],
81
+ })
82
+
83
+ with pytest.raises(pa.errors.SchemaError):
84
+ TrainingDataSchema.validate(df)
85
+
86
+ def test_null_in_non_nullable_fails(self):
87
+ """Null in non-nullable column should fail."""
88
+ df = pd.DataFrame({
89
+ "user_id": [None], # Null not allowed
90
+ "feature_1": [0.5],
91
+ "feature_2": [1.0],
92
+ "label": [0],
93
+ })
94
+
95
+ with pytest.raises(pa.errors.SchemaError):
96
+ TrainingDataSchema.validate(df)
97
+
98
+ class TestFeatureTransformer:
99
+ """Test feature transformation logic."""
100
+
101
+ @pytest.fixture
102
+ def sample_data(self):
103
+ return pd.DataFrame({
104
+ "numeric_1": [0, 10, 20, 30, 40],
105
+ "numeric_2": [100, 200, 300, 400, 500],
106
+ "categorical_1": ["A", "B", "A", "C", "B"],
107
+ })
108
+
109
+ @pytest.fixture
110
+ def config(self):
111
+ return FeatureConfig(
112
+ numeric_cols=["numeric_1", "numeric_2"],
113
+ categorical_cols=["categorical_1"],
114
+ )
115
+
116
+ def test_fit_transform_normalizes(self, sample_data, config):
117
+ """Numeric features should be normalized."""
118
+ transformer = FeatureTransformer(config)
119
+ result = transformer.fit_transform(sample_data)
120
+
121
+ # Mean should be ~0, std should be ~1
122
+ assert abs(result["numeric_1"].mean()) < 1e-10
123
+ assert abs(result["numeric_1"].std() - 1) < 0.1
124
+
125
+ def test_transform_uses_fitted_params(self, sample_data, config):
126
+ """Transform should use parameters from fit."""
127
+ transformer = FeatureTransformer(config)
128
+ transformer.fit(sample_data)
129
+
130
+ # New data with different distribution
131
+ new_data = pd.DataFrame({
132
+ "numeric_1": [100, 200],
133
+ "numeric_2": [1000, 2000],
134
+ "categorical_1": ["A", "B"],
135
+ })
136
+
137
+ result = transformer.transform(new_data)
138
+
139
+ # Should NOT be centered at 0 (uses training stats)
140
+ assert result["numeric_1"].mean() != pytest.approx(0, abs=0.1)
141
+
142
+ def test_handles_unseen_category(self, sample_data, config):
143
+ """Should handle categories not seen during fit."""
144
+ transformer = FeatureTransformer(config)
145
+ transformer.fit(sample_data)
146
+
147
+ new_data = pd.DataFrame({
148
+ "numeric_1": [25],
149
+ "numeric_2": [350],
150
+ "categorical_1": ["D"], # Unseen category
151
+ })
152
+
153
+ # Should not raise, should encode as unknown
154
+ result = transformer.transform(new_data)
155
+ assert len(result) == 1
156
+
157
+ def test_save_load_roundtrip(self, sample_data, config, tmp_path):
158
+ """Saved transformer should produce identical results."""
159
+ transformer = FeatureTransformer(config)
160
+ transformer.fit(sample_data)
161
+
162
+ path = tmp_path / "transformer.pkl"
163
+ transformer.save(str(path))
164
+
165
+ loaded = FeatureTransformer.load(str(path))
166
+
167
+ original_result = transformer.transform(sample_data)
168
+ loaded_result = loaded.transform(sample_data)
169
+
170
+ pd.testing.assert_frame_equal(original_result, loaded_result)
171
+ ```
172
+
173
+ ### Metric Calculation Tests
174
+
175
+ ```python
176
+ class TestMetrics:
177
+ """Test metric calculations."""
178
+
179
+ def test_classification_metrics_perfect_predictions(self):
180
+ """Perfect predictions should give perfect scores."""
181
+ y_true = np.array([0, 0, 1, 1, 1])
182
+ y_pred = np.array([0, 0, 1, 1, 1])
183
+ y_prob = np.array([0.1, 0.2, 0.9, 0.8, 0.95])
184
+
185
+ metrics = ClassificationMetrics.compute(y_true, y_pred, y_prob)
186
+
187
+ assert metrics.accuracy == 1.0
188
+ assert metrics.precision == 1.0
189
+ assert metrics.recall == 1.0
190
+ assert metrics.f1 == 1.0
191
+
192
+ def test_classification_metrics_all_wrong(self):
193
+ """All wrong predictions should give zero scores."""
194
+ y_true = np.array([0, 0, 1, 1])
195
+ y_pred = np.array([1, 1, 0, 0])
196
+ y_prob = np.array([0.9, 0.8, 0.1, 0.2])
197
+
198
+ metrics = ClassificationMetrics.compute(y_true, y_pred, y_prob)
199
+
200
+ assert metrics.accuracy == 0.0
201
+ assert metrics.precision == 0.0
202
+ assert metrics.recall == 0.0
203
+
204
+ def test_roc_auc_random_is_half(self):
205
+ """Random predictions should have AUC ~0.5."""
206
+ np.random.seed(42)
207
+ y_true = np.random.randint(0, 2, 1000)
208
+ y_prob = np.random.random(1000)
209
+ y_pred = (y_prob >= 0.5).astype(int)
210
+
211
+ metrics = ClassificationMetrics.compute(y_true, y_pred, y_prob)
212
+
213
+ assert 0.45 <= metrics.roc_auc <= 0.55
214
+ ```
215
+
216
+ ## Model Behavior Tests
217
+
218
+ ### Invariance Tests
219
+
220
+ ```python
221
+ class TestModelInvariance:
222
+ """Test model behaves consistently under expected variations."""
223
+
224
+ @pytest.fixture
225
+ def model(self):
226
+ return mlflow.pyfunc.load_model("models:/fraud-detector/Production")
227
+
228
+ def test_deterministic_predictions(self, model):
229
+ """Same input should always give same output."""
230
+ features = pd.DataFrame([{
231
+ "amount": 100.0,
232
+ "hour": 14,
233
+ "merchant_category": "retail",
234
+ }])
235
+
236
+ predictions = [model.predict(features)[0] for _ in range(10)]
237
+
238
+ assert all(p == predictions[0] for p in predictions)
239
+
240
+ def test_invariant_to_irrelevant_features(self, model):
241
+ """Predictions should not change based on irrelevant features."""
242
+ base_features = {
243
+ "amount": 100.0,
244
+ "hour": 14,
245
+ "merchant_category": "retail",
246
+ }
247
+
248
+ pred1 = model.predict(pd.DataFrame([{**base_features, "user_name": "Alice"}]))[0]
249
+ pred2 = model.predict(pd.DataFrame([{**base_features, "user_name": "Bob"}]))[0]
250
+
251
+ assert pred1 == pred2
252
+
253
+ def test_scale_invariance_for_ratios(self, model):
254
+ """Model should be scale-invariant for ratio features."""
255
+ # For ratio-based features, scaling all inputs together should give approximately the same result
256
+ features1 = pd.DataFrame([{"feature_1": 0.5, "feature_2": 0.5}])
257
+ features2 = pd.DataFrame([{"feature_1": 1.0, "feature_2": 1.0}])
258
+
259
+ pred1 = model.predict(features1)[0]
260
+ pred2 = model.predict(features2)[0]
261
+
262
+ # Predictions should be similar (not necessarily identical)
263
+ assert abs(pred1 - pred2) < 0.1
264
+ ```
265
+
266
+ ### Directional Expectation Tests
267
+
268
+ ```python
269
+ class TestDirectionalExpectations:
270
+ """Test that model responds correctly to feature changes."""
271
+
272
+ @pytest.fixture
273
+ def model(self):
274
+ return mlflow.pyfunc.load_model("models:/fraud-detector/Production")
275
+
276
+ def test_higher_amount_increases_fraud_risk(self, model):
277
+ """Higher transaction amounts should increase fraud probability."""
278
+ low_amount = pd.DataFrame([{"amount": 10, "hour": 12}])
279
+ high_amount = pd.DataFrame([{"amount": 10000, "hour": 12}])
280
+
281
+ low_pred = model.predict_proba(low_amount)[0, 1]
282
+ high_pred = model.predict_proba(high_amount)[0, 1]
283
+
284
+ assert high_pred > low_pred
285
+
286
+ def test_unusual_hour_increases_fraud_risk(self, model):
287
+ """Transactions at unusual hours should have higher fraud probability."""
288
+ normal_hour = pd.DataFrame([{"amount": 100, "hour": 14}])
289
+ unusual_hour = pd.DataFrame([{"amount": 100, "hour": 3}])
290
+
291
+ normal_pred = model.predict_proba(normal_hour)[0, 1]
292
+ unusual_pred = model.predict_proba(unusual_hour)[0, 1]
293
+
294
+ assert unusual_pred > normal_pred
295
+
296
+ def test_monotonic_relationship(self, model):
297
+ """Test monotonic relationship between feature and prediction."""
298
+ amounts = [10, 50, 100, 500, 1000, 5000]
299
+
300
+ predictions = []
301
+ for amount in amounts:
302
+ features = pd.DataFrame([{"amount": amount, "hour": 12}])
303
+ pred = model.predict_proba(features)[0, 1]
304
+ predictions.append(pred)
305
+
306
+ # Should be monotonically increasing
307
+ for i in range(1, len(predictions)):
308
+ assert predictions[i] >= predictions[i-1]
309
+ ```
310
+
311
+ ### Boundary Tests
312
+
313
+ ```python
314
+ class TestBoundaryBehavior:
315
+ """Test model behavior at boundaries."""
316
+
317
+ @pytest.fixture
318
+ def model(self):
319
+ return mlflow.pyfunc.load_model("models:/classifier/Production")
320
+
321
+ def test_predictions_in_valid_range(self, model):
322
+ """All predictions should be valid probabilities."""
323
+ test_data = pd.DataFrame([
324
+ {"feature_1": 0.0, "feature_2": 0.0}, # Minimum
325
+ {"feature_1": 1.0, "feature_2": 1.0}, # Maximum
326
+ {"feature_1": 0.5, "feature_2": 0.5}, # Middle
327
+ {"feature_1": 0.001, "feature_2": 0.999}, # Near boundaries
328
+ ])
329
+
330
+ predictions = model.predict_proba(test_data)
331
+
332
+ assert np.all(predictions >= 0)
333
+ assert np.all(predictions <= 1)
334
+ assert np.allclose(predictions.sum(axis=1), 1.0)
335
+
336
+ def test_extreme_values(self, model):
337
+ """Model should handle extreme but valid inputs."""
338
+ extreme_data = pd.DataFrame([
339
+ {"feature_1": 1e-10, "feature_2": 1e10},
340
+ {"feature_1": -1e10, "feature_2": 1e-10},
341
+ ])
342
+
343
+ # Should not raise
344
+ predictions = model.predict(extreme_data)
345
+
346
+ assert len(predictions) == 2
347
+ assert not np.any(np.isnan(predictions))
348
+ assert not np.any(np.isinf(predictions))
349
+ ```
350
+
351
+ ### Fairness Tests
352
+
353
+ ```python
354
+ class TestModelFairness:
355
+ """Test model fairness across protected groups."""
356
+
357
+ @pytest.fixture
358
+ def model(self):
359
+ return mlflow.pyfunc.load_model("models:/loan-approval/Production")
360
+
361
+ @pytest.fixture
362
+ def test_data(self):
363
+ return pd.read_parquet("test_data/fairness_test_set.parquet")
364
+
365
+ def test_statistical_parity(self, model, test_data):
366
+ """Positive prediction rates should be similar across groups."""
367
+ predictions = model.predict(test_data)
368
+
369
+ for protected_attr in ["gender", "race", "age_group"]:
370
+ rates = {}
371
+
372
+ for group in test_data[protected_attr].unique():
373
+ mask = test_data[protected_attr] == group
374
+ rates[group] = predictions[mask].mean()
375
+
376
+ max_rate = max(rates.values())
377
+ min_rate = min(rates.values())
378
+
379
+ # Disparate impact ratio should be >= 0.8
380
+ assert min_rate / max_rate >= 0.8, f"Unfair for {protected_attr}: {rates}"
381
+
382
+ def test_equal_opportunity(self, model, test_data):
383
+ """True positive rates should be similar across groups."""
384
+ predictions = model.predict(test_data)
385
+ labels = test_data["label"]
386
+
387
+ for protected_attr in ["gender", "race"]:
388
+ tpr = {}
389
+
390
+ for group in test_data[protected_attr].unique():
391
+ mask = (test_data[protected_attr] == group) & (labels == 1)
392
+ if mask.sum() == 0:
393
+ continue
394
+
395
+ tpr[group] = predictions[mask].mean()
396
+
397
+ max_tpr = max(tpr.values())
398
+ min_tpr = min(tpr.values())
399
+
400
+ # Difference should be < 0.1
401
+ assert max_tpr - min_tpr < 0.1, f"Unequal opportunity for {protected_attr}: {tpr}"
402
+ ```
403
+
404
+ ## Integration Tests
405
+
406
+ ### Inference Pipeline Tests
407
+
408
+ ```python
409
+ class TestInferencePipeline:
410
+ """Test end-to-end inference pipeline."""
411
+
412
+ @pytest.fixture
413
+ def inference_client(self):
414
+ """Create client for inference service."""
415
+ return InferenceClient("http://localhost:8080")
416
+
417
+ def test_health_check(self, inference_client):
418
+ """Service should be healthy."""
419
+ response = inference_client.health()
420
+
421
+ assert response["status"] == "healthy"
422
+ assert response["model_loaded"] == True
423
+
424
+ def test_single_prediction(self, inference_client):
425
+ """Single prediction should succeed."""
426
+ request = {
427
+ "features": {
428
+ "amount": 100.0,
429
+ "hour": 14,
430
+ "merchant_category": "retail",
431
+ }
432
+ }
433
+
434
+ response = inference_client.predict(request)
435
+
436
+ assert "prediction" in response
437
+ assert response["prediction"] in [0, 1]
438
+ assert "probability" in response
439
+ assert 0 <= response["probability"] <= 1
440
+ assert "latency_ms" in response
441
+
442
+ def test_batch_prediction(self, inference_client):
443
+ """Batch prediction should succeed."""
444
+ requests = [
445
+ {"amount": 100.0, "hour": 14},
446
+ {"amount": 500.0, "hour": 3},
447
+ {"amount": 50.0, "hour": 10},
448
+ ]
449
+
450
+ response = inference_client.predict_batch(requests)
451
+
452
+ assert len(response["predictions"]) == 3
453
+
454
+ def test_invalid_request_rejected(self, inference_client):
455
+ """Invalid request should return error."""
456
+ request = {
457
+ "features": {
458
+ "amount": "not_a_number", # Invalid type
459
+ }
460
+ }
461
+
462
+ with pytest.raises(ValidationError):
463
+ inference_client.predict(request)
464
+
465
+ def test_latency_within_sla(self, inference_client):
466
+ """Latency should be within SLA."""
467
+ request = {"features": {"amount": 100.0, "hour": 14}}
468
+
469
+ latencies = []
470
+ for _ in range(100):
471
+ response = inference_client.predict(request)
472
+ latencies.append(response["latency_ms"])
473
+
474
+ p99_latency = np.percentile(latencies, 99)
475
+
476
+ assert p99_latency < 100, f"P99 latency {p99_latency}ms exceeds SLA"
477
+ ```
478
+
479
+ ### Training Pipeline Tests
480
+
481
+ ```python
482
+ class TestTrainingPipeline:
483
+ """Test training pipeline components."""
484
+
485
+ def test_data_loading(self):
486
+ """Data loading should produce valid dataset."""
487
+ dataset = load_training_data("test_data/sample.parquet")
488
+
489
+ assert len(dataset) > 0
490
+ assert "features" in dataset.column_names
491
+ assert "label" in dataset.column_names
492
+
493
+ def test_training_reduces_loss(self):
494
+ """Training should reduce loss."""
495
+ model = create_model(test_config)
496
+ train_data = load_test_training_data()
497
+
498
+ initial_loss = evaluate_loss(model, train_data)
499
+
500
+ train_for_epochs(model, train_data, epochs=5)
501
+
502
+ final_loss = evaluate_loss(model, train_data)
503
+
504
+ assert final_loss < initial_loss
505
+
506
+ def test_model_can_be_saved_and_loaded(self, tmp_path):
507
+ """Model should be serializable."""
508
+ model = create_model(test_config)
509
+ train_for_epochs(model, load_test_training_data(), epochs=1)
510
+
511
+ # Save
512
+ model_path = tmp_path / "model"
513
+ mlflow.pytorch.save_model(model, str(model_path))
514
+
515
+ # Load
516
+ loaded_model = mlflow.pytorch.load_model(str(model_path))
517
+
518
+ # Predictions should match
519
+ test_input = torch.randn(1, INPUT_DIM)
520
+
521
+ model.eval()
522
+ loaded_model.eval()
523
+
524
+ with torch.no_grad():
525
+ original_pred = model(test_input)
526
+ loaded_pred = loaded_model(test_input)
527
+
528
+ torch.testing.assert_close(original_pred, loaded_pred)
529
+ ```
530
+
531
+ ## Performance Tests
532
+
533
+ ### Load Testing
534
+
535
+ ```python
536
+ import locust
537
+ from locust import HttpUser, task, between
538
+
539
+ class InferenceLoadTest(HttpUser):
540
+ """Load test for inference service."""
541
+
542
+ wait_time = between(0.1, 0.5)
543
+
544
+ @task(10)
545
+ def single_prediction(self):
546
+ """Test single prediction endpoint."""
547
+ self.client.post(
548
+ "/predict",
549
+ json={
550
+ "features": {
551
+ "amount": 100.0,
552
+ "hour": 14,
553
+ }
554
+ },
555
+ )
556
+
557
+ @task(1)
558
+ def batch_prediction(self):
559
+ """Test batch prediction endpoint."""
560
+ self.client.post(
561
+ "/predict/batch",
562
+ json={
563
+ "instances": [
564
+ {"amount": 100.0, "hour": 14},
565
+ {"amount": 200.0, "hour": 15},
566
+ {"amount": 300.0, "hour": 16},
567
+ ]
568
+ },
569
+ )
570
+
571
+ # Run: locust -f test_load.py --host=http://localhost:8080
572
+ ```
573
+
574
+ ### Benchmark Tests
575
+
576
+ ```python
577
+ import pytest
578
+ import time
579
+
580
+ class TestPerformanceBenchmarks:
581
+ """Performance benchmark tests."""
582
+
583
+ @pytest.fixture
584
+ def model(self):
585
+ return mlflow.pyfunc.load_model("models:/fraud-detector/Production")
586
+
587
+ @pytest.mark.benchmark
588
+ def test_single_inference_latency(self, model, benchmark):
589
+ """Benchmark single inference latency."""
590
+ features = pd.DataFrame([{"amount": 100.0, "hour": 14}])
591
+
592
+ result = benchmark(model.predict, features)
593
+
594
+ # Assert latency is acceptable
595
+ assert benchmark.stats["mean"] < 0.01 # 10ms
596
+
597
+ @pytest.mark.benchmark
598
+ def test_batch_inference_throughput(self, model, benchmark):
599
+ """Benchmark batch inference throughput."""
600
+ batch_size = 1000
601
+ features = pd.DataFrame([
602
+ {"amount": i * 10, "hour": i % 24}
603
+ for i in range(batch_size)
604
+ ])
605
+
606
+ result = benchmark(model.predict, features)
607
+
608
+ # Calculate throughput
609
+ throughput = batch_size / benchmark.stats["mean"]
610
+
611
+ assert throughput > 10000 # At least 10k predictions/second
612
+ ```
613
+
614
+ ## Test Configuration
615
+
616
+ ### pytest.ini
617
+
618
+ ```ini
619
+ [pytest]
620
+ testpaths = tests
621
+ python_files = test_*.py
622
+ python_classes = Test*
623
+ python_functions = test_*
624
+
625
+ markers =
626
+ unit: Unit tests (fast, isolated)
627
+ integration: Integration tests (slower, require services)
628
+ model: Model behavior tests
629
+ fairness: Fairness tests
630
+ benchmark: Performance benchmarks
631
+
632
+ addopts = -v --tb=short
633
+
634
+ filterwarnings =
635
+ ignore::DeprecationWarning
636
+ ```
637
+
638
+ ### Running Tests
639
+
640
+ ```bash
641
+ # Run all tests
642
+ pytest
643
+
644
+ # Run specific test types
645
+ pytest -m unit
646
+ pytest -m "model and not benchmark"
647
+ pytest -m fairness
648
+
649
+ # Run with coverage
650
+ pytest --cov=src --cov-report=html
651
+
652
+ # Run benchmarks
653
+ pytest -m benchmark --benchmark-autosave
654
+ ```
655
+
656
+ ## Best Practices
657
+
658
+ ### Test Checklist
659
+
660
+ - [ ] Unit tests for all data transforms
661
+ - [ ] Unit tests for all metric calculations
662
+ - [ ] Model invariance tests
663
+ - [ ] Directional expectation tests
664
+ - [ ] Boundary behavior tests
665
+ - [ ] Fairness tests for protected groups
666
+ - [ ] Integration tests for inference pipeline
667
+ - [ ] Load tests for production capacity
668
+ - [ ] Regression tests comparing to baseline
669
+
670
+ ### Common Pitfalls
671
+
672
+ | Pitfall | Problem | Solution |
673
+ |---------|---------|----------|
674
+ | Testing on training data | Overly optimistic results | Use held-out test set |
675
+ | Ignoring random seeds | Non-reproducible tests | Set seeds explicitly |
676
+ | Testing exact values | Brittle tests | Test properties and ranges |
677
+ | Missing edge cases | Failures in production | Test boundaries systematically |
678
+ | No fairness tests | Discrimination in production | Test across protected groups |