agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,1136 @@
# ML/AI Development Guide

Staff-level guidelines for machine learning and artificial intelligence systems. This guide covers data engineering, model development, deployment, monitoring, and responsible AI practices.

---

## Overview

This guide applies to:

- Machine learning pipelines (training, evaluation, deployment)
- Deep learning systems (computer vision, NLP, recommendation systems)
- MLOps infrastructure (experiment tracking, feature stores, model registries)
- LLM/GenAI applications (fine-tuning, RAG, prompt engineering)
- Real-time and batch inference systems

### Key Principles

1. **Data-Centric Development** - Data quality beats algorithm complexity
2. **Reproducibility Is Non-Negotiable** - Version everything: data, code, configs, models (see the seeding and version-tagging sketch below)
3. **Observability Over Uptime** - Monitor drift, not just infrastructure health
4. **Responsible AI** - Fairness, bias detection, and explainability by default

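A minimal sketch of what principle 2 means in practice: pin every random seed and tag each run with the code and data versions it used. The environment variable, tag names, and default seed here are illustrative choices, not part of this template.

```python
import os
import random

import mlflow
import numpy as np
import torch


def set_seed(seed: int = 42) -> None:
    """Pin every RNG that can affect training so runs are repeatable."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def log_run_versions(data_version: str) -> None:
    """Attach code and data versions to the active MLflow run (illustrative tag names)."""
    mlflow.set_tags({
        "git_commit": os.environ.get("GIT_COMMIT", "unknown"),
        "data_version": data_version,
    })
```
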
### Technology Stack

| Layer | Technology |
|-------|------------|
| Training | PyTorch, TensorFlow, scikit-learn, XGBoost |
| Experiment Tracking | MLflow, Weights & Biases, Neptune |
| Feature Store | Feast, Tecton, Hopsworks |
| Data Validation | TensorFlow Data Validation, Great Expectations |
| Model Serving | KServe, TorchServe, Triton, vLLM |
| Orchestration | Kubeflow, Airflow, Prefect, Dagster |
| Monitoring | Evidently, WhyLabs, Arize |

---

## Project Structure

```
ml-project/
├── data/
│   ├── raw/                   # Immutable raw data
│   ├── processed/             # Cleaned, transformed data
│   └── features/              # Feature store exports
├── src/
│   ├── data/                  # Data loading and validation
│   │   ├── loaders.py
│   │   ├── validators.py
│   │   └── transforms.py
│   ├── features/              # Feature engineering
│   │   ├── engineering.py
│   │   └── store.py
│   ├── models/                # Model definitions
│   │   ├── architectures.py
│   │   └── losses.py
│   ├── training/              # Training logic
│   │   ├── trainer.py
│   │   ├── callbacks.py
│   │   └── optimizers.py
│   ├── evaluation/            # Evaluation and metrics
│   │   ├── metrics.py
│   │   └── analysis.py
│   ├── inference/             # Serving code
│   │   ├── predictor.py
│   │   └── preprocessing.py
│   └── utils/                 # Shared utilities
├── configs/                   # Experiment configurations
│   ├── model/
│   ├── training/
│   └── serving/
├── notebooks/                 # Exploration (not production)
├── tests/
│   ├── unit/
│   ├── integration/
│   └── model/                 # Model-specific tests
├── pipelines/                 # ML pipeline definitions
│   ├── training_pipeline.py
│   └── inference_pipeline.py
└── deployments/               # Kubernetes/serving configs
    ├── kserve/
    └── docker/
```

---

## Data Engineering

### Data Validation

Validate all data at pipeline boundaries:

```python
import pandas as pd
import pandera as pa
from pandera.typing import Series, DataFrame

class TrainingDataSchema(pa.DataFrameModel):
    """Schema for training data validation."""

    user_id: Series[str] = pa.Field(nullable=False)
    feature_1: Series[float] = pa.Field(ge=0, le=1)
    feature_2: Series[float] = pa.Field(nullable=False)
    label: Series[int] = pa.Field(isin=[0, 1])

    class Config:
        strict = True
        coerce = True

@pa.check_types
def load_training_data(path: str) -> DataFrame[TrainingDataSchema]:
    """Load and validate training data."""
    df = pd.read_parquet(path)
    return df  # Automatically validated against the schema
```

### Data Quality Checks

```python
from great_expectations.core import ExpectationSuite
from great_expectations.core.expectation_configuration import ExpectationConfiguration

def create_data_quality_suite() -> ExpectationSuite:
    """Define data quality expectations."""
    suite = ExpectationSuite(expectation_suite_name="training_data")

    # Completeness
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "user_id"},
    ))

    # Freshness
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "timestamp", "min_value": "2024-01-01"},
    ))

    # Distribution
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_mean_to_be_between",
        kwargs={"column": "feature_1", "min_value": 0.4, "max_value": 0.6},
    ))

    return suite
```

### Feature Engineering

```python
from datetime import timedelta

import pandas as pd
from feast import FeatureStore, Entity, FeatureView, Field
from feast.types import Float32, Int64

# Define entities
user = Entity(name="user", join_keys=["user_id"])

# Define feature view (user_features_source is the batch source configured in the feature repo)
user_features = FeatureView(
    name="user_features",
    entities=[user],
    schema=[
        Field(name="avg_session_duration", dtype=Float32),
        Field(name="total_purchases", dtype=Int64),
        Field(name="days_since_last_activity", dtype=Int64),
    ],
    source=user_features_source,
    ttl=timedelta(days=1),
)

# Retrieve features for training
def get_training_features(entity_df: pd.DataFrame) -> pd.DataFrame:
    """Get historical features for training."""
    store = FeatureStore(repo_path="feature_repo/")

    training_df = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "user_features:avg_session_duration",
            "user_features:total_purchases",
            "user_features:days_since_last_activity",
        ],
    ).to_df()

    return training_df
```

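At serving time the same feature definitions can be read back from the online store, which keeps feature logic in one place. A minimal sketch, assuming the `user_features` view above has been materialized to the online store:

```python
from feast import FeatureStore

def get_serving_features(user_ids: list[str]) -> dict[str, list]:
    """Fetch the latest feature values for a batch of users at inference time."""
    store = FeatureStore(repo_path="feature_repo/")

    response = store.get_online_features(
        features=[
            "user_features:avg_session_duration",
            "user_features:total_purchases",
            "user_features:days_since_last_activity",
        ],
        entity_rows=[{"user_id": uid} for uid in user_ids],
    )
    # Returns a column-oriented dict of feature name -> list of values
    return response.to_dict()
```
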
### Training/Serving Skew Prevention

```python
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

class FeatureTransformer:
    """Ensure identical transforms for training and serving."""

    def __init__(self):
        self.scalers: dict[str, StandardScaler] = {}
        self.encoders: dict[str, LabelEncoder] = {}

    def fit_transform(self, df: pd.DataFrame, config: TransformConfig) -> pd.DataFrame:
        """Fit and transform for training (config lists numeric_cols and categorical_cols)."""
        result = df.copy()

        for col in config.numeric_cols:
            self.scalers[col] = StandardScaler()
            result[col] = self.scalers[col].fit_transform(result[[col]])

        for col in config.categorical_cols:
            self.encoders[col] = LabelEncoder()
            result[col] = self.encoders[col].fit_transform(result[col])

        return result

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform for serving (uses fitted params)."""
        result = df.copy()

        for col, scaler in self.scalers.items():
            result[col] = scaler.transform(result[[col]])

        for col, encoder in self.encoders.items():
            result[col] = encoder.transform(result[col])

        return result

    def save(self, path: str) -> None:
        """Serialize transformer for serving."""
        joblib.dump({"scalers": self.scalers, "encoders": self.encoders}, path)

    @classmethod
    def load(cls, path: str) -> "FeatureTransformer":
        """Load transformer for serving."""
        data = joblib.load(path)
        transformer = cls()
        transformer.scalers = data["scalers"]
        transformer.encoders = data["encoders"]
        return transformer
```

---

## Model Development

### Experiment Tracking

```python
import mlflow
from mlflow.tracking import MlflowClient

def train_with_tracking(config: TrainingConfig) -> str:
    """Train model with full experiment tracking."""

    mlflow.set_experiment(config.experiment_name)

    with mlflow.start_run(run_name=config.run_name) as run:
        # Log parameters
        mlflow.log_params({
            "model_type": config.model_type,
            "learning_rate": config.learning_rate,
            "batch_size": config.batch_size,
            "epochs": config.epochs,
        })

        # Log data info
        mlflow.log_params({
            "train_samples": len(train_data),
            "val_samples": len(val_data),
            "feature_count": train_data.shape[1],
        })

        # Train
        model = train_model(config, train_data, val_data)

        # Log metrics
        metrics = evaluate_model(model, val_data)
        mlflow.log_metrics(metrics)

        # Log model with signature
        signature = mlflow.models.infer_signature(
            train_data.drop("label", axis=1),
            model.predict(train_data.drop("label", axis=1))
        )
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # Log artifacts
        mlflow.log_artifact("configs/training_config.yaml")
        mlflow.log_artifact("data/feature_importance.png")

        return run.info.run_id
```

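The returned run ID can then be promoted through the MLflow model registry. A short sketch, assuming a registry is configured; the model name `fraud-detector` and the `Staging` stage are illustrative:

```python
import mlflow
from mlflow.tracking import MlflowClient

def register_model(run_id: str, model_name: str = "fraud-detector") -> str:
    """Register the logged model and move the new version to Staging for review."""
    result = mlflow.register_model(f"runs:/{run_id}/model", model_name)

    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_name,
        version=result.version,
        stage="Staging",
    )
    return result.version
```
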
### Evaluation Metrics

```python
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    roc_auc_score, average_precision_score,
    mean_squared_error, mean_absolute_error, r2_score
)

@dataclass
class ClassificationMetrics:
    """Comprehensive classification metrics."""
    accuracy: float
    precision: float
    recall: float
    f1: float
    roc_auc: float
    pr_auc: float

    @classmethod
    def compute(cls, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray) -> "ClassificationMetrics":
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary"
        )
        return cls(
            accuracy=accuracy_score(y_true, y_pred),
            precision=precision,
            recall=recall,
            f1=f1,
            roc_auc=roc_auc_score(y_true, y_prob),
            pr_auc=average_precision_score(y_true, y_prob),
        )

    def to_dict(self) -> dict[str, float]:
        return {
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1": self.f1,
            "roc_auc": self.roc_auc,
            "pr_auc": self.pr_auc,
        }

def evaluate_by_segment(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    segment_col: str
) -> dict[str, ClassificationMetrics]:
    """Evaluate model performance across segments for fairness analysis."""
    results = {}

    for segment in X[segment_col].unique():
        mask = X[segment_col] == segment
        X_seg, y_seg = X[mask], y[mask]

        y_pred = model.predict(X_seg)
        y_prob = model.predict_proba(X_seg)[:, 1]

        results[segment] = ClassificationMetrics.compute(y_seg, y_pred, y_prob)

    return results
```

### Hyperparameter Optimization

```python
import optuna
import pandas as pd
from optuna.integration import MLflowCallback
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

def optimize_hyperparameters(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    n_trials: int = 100
) -> dict:
    """Optimize hyperparameters with Optuna."""

    def objective(trial: optuna.Trial) -> float:
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

        model = XGBClassifier(**params, early_stopping_rounds=50)
        model.fit(
            train_data.drop("label", axis=1),
            train_data["label"],
            eval_set=[(val_data.drop("label", axis=1), val_data["label"])],
            verbose=False,
        )

        y_pred = model.predict_proba(val_data.drop("label", axis=1))[:, 1]
        return roc_auc_score(val_data["label"], y_pred)

    study = optuna.create_study(direction="maximize")
    study.optimize(
        objective,
        n_trials=n_trials,
        callbacks=[MLflowCallback(metric_name="val_roc_auc")],
    )

    return study.best_params
```

---

## Model Deployment

### Model Serving with KServe

```yaml
# kserve/inference-service.yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: fraud-detector
  annotations:
    serving.kserve.io/deploymentMode: Serverless
spec:
  predictor:
    model:
      modelFormat:
        name: mlflow
      storageUri: s3://models/fraud-detector/v1
      resources:
        limits:
          cpu: "2"
          memory: 4Gi
        requests:
          cpu: "1"
          memory: 2Gi
    minReplicas: 1
    maxReplicas: 10
    scaleTarget: 100
    scaleMetric: concurrency
```

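Once the InferenceService is ready, it can be smoke-tested over KServe's v1 REST protocol. A sketch, with the ingress host and feature names as placeholder assumptions:

```python
import requests

def smoke_test_endpoint(host: str = "fraud-detector.example.com") -> None:
    """Send one record to the deployed model using the KServe v1 predict protocol."""
    payload = {"instances": [{"feature_1": 0.5, "feature_2": 1.0}]}

    response = requests.post(
        f"https://{host}/v1/models/fraud-detector:predict",
        json=payload,
        timeout=10,
    )
    response.raise_for_status()
    print(response.json()["predictions"])
```
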
### Custom Predictor

```python
import os

import pandas as pd
import torch
from kserve import Model, ModelServer
from kserve.errors import ModelMissingError

class FraudPredictor(Model):
    """Custom KServe predictor for fraud detection."""

    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.transformer = None
        self.ready = False

    def load(self) -> bool:
        """Load model and preprocessing artifacts."""
        model_path = os.environ.get("MODEL_PATH", "/mnt/models")

        self.model = torch.jit.load(f"{model_path}/model.pt")
        self.model.eval()

        self.transformer = FeatureTransformer.load(f"{model_path}/transformer.pkl")

        self.ready = True
        return self.ready

    def predict(self, payload: dict, headers: dict = None) -> dict:
        """Run inference."""
        if not self.ready:
            raise ModelMissingError(self.name)

        # Preprocess
        df = pd.DataFrame(payload["instances"])
        features = self.transformer.transform(df)
        tensor = torch.tensor(features.values, dtype=torch.float32)

        # Inference
        with torch.no_grad():
            logits = self.model(tensor)
            probs = torch.sigmoid(logits).numpy()

        return {
            "predictions": probs.tolist(),
            "model_version": os.environ.get("MODEL_VERSION", "unknown"),
        }

if __name__ == "__main__":
    model = FraudPredictor("fraud-detector")
    ModelServer().start([model])
```

### Batch Inference Pipeline

```python
from datetime import datetime, timedelta

import mlflow
import pandas as pd
from prefect import flow, task
from prefect.tasks import task_input_hash

@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
def load_batch_data(date: str) -> pd.DataFrame:
    """Load data for batch inference."""
    return pd.read_parquet(f"s3://data/features/{date}/")

@task
def run_batch_inference(data: pd.DataFrame, model_uri: str) -> pd.DataFrame:
    """Run batch inference on data."""
    model = mlflow.pyfunc.load_model(model_uri)

    predictions = model.predict(data)

    data["prediction"] = predictions
    data["model_version"] = model_uri.split("/")[-1]
    data["inference_timestamp"] = datetime.utcnow()

    return data

@task
def write_predictions(predictions: pd.DataFrame, date: str) -> None:
    """Write predictions to storage."""
    predictions.to_parquet(
        f"s3://predictions/{date}/predictions.parquet",
        index=False
    )

@flow(name="batch-inference")
def batch_inference_pipeline(date: str, model_uri: str) -> None:
    """Daily batch inference pipeline."""
    data = load_batch_data(date)
    predictions = run_batch_inference(data, model_uri)
    write_predictions(predictions, date)
```

---

## Monitoring & Observability

### Drift Detection

```python
import pandas as pd
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

def detect_drift(
    reference_data: pd.DataFrame,
    current_data: pd.DataFrame,
    column_mapping: ColumnMapping
) -> dict:
    """Detect data and prediction drift."""

    report = Report(metrics=[
        DataDriftPreset(),
        TargetDriftPreset(),
    ])

    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )

    result = report.as_dict()

    drift_detected = result["metrics"][0]["result"]["dataset_drift"]
    drift_share = result["metrics"][0]["result"]["drift_share"]

    return {
        "drift_detected": drift_detected,
        "drift_share": drift_share,
        "drifted_columns": [
            col for col, info in result["metrics"][0]["result"]["drift_by_columns"].items()
            if info["drift_detected"]
        ],
    }

def monitor_model_performance(
    predictions: pd.DataFrame,
    actuals: pd.DataFrame,
    threshold: float = 0.05
) -> dict:
    """Monitor model performance degradation."""

    merged = predictions.merge(actuals, on="id")

    current_metrics = ClassificationMetrics.compute(
        merged["actual"],
        merged["prediction"],
        merged["probability"]
    ).to_dict()

    baseline_metrics = load_baseline_metrics()  # previously stored reference metrics

    degradation = {
        metric: (baseline_metrics[metric] - current_metrics[metric]) / baseline_metrics[metric]
        for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]
    }

    alerts = [
        metric for metric, deg in degradation.items()
        if deg > threshold
    ]

    return {
        "current_metrics": current_metrics,
        "degradation": degradation,
        "alerts": alerts,
    }
```

### Logging & Metrics

```python
import time
import uuid

import structlog
from prometheus_client import Counter, Histogram, Gauge

# Structured logging
logger = structlog.get_logger()

# Prometheus metrics
PREDICTION_LATENCY = Histogram(
    "model_prediction_latency_seconds",
    "Time spent processing prediction",
    buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
)

PREDICTION_COUNT = Counter(
    "model_predictions_total",
    "Total number of predictions",
    ["model_name", "model_version", "outcome"]
)

FEATURE_VALUE = Gauge(
    "model_feature_value",
    "Feature value distribution",
    ["feature_name", "quantile"]
)

def predict_with_observability(model, features: dict) -> dict:
    """Make prediction with full observability."""

    request_id = str(uuid.uuid4())
    start_time = time.time()

    logger.info(
        "prediction_started",
        request_id=request_id,
        feature_count=len(features),
    )

    try:
        with PREDICTION_LATENCY.time():
            prediction = model.predict(features)

        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="success"
        ).inc()

        logger.info(
            "prediction_completed",
            request_id=request_id,
            prediction=prediction,
            latency_ms=(time.time() - start_time) * 1000,
        )

        return {"prediction": prediction, "request_id": request_id}

    except Exception as e:
        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="error"
        ).inc()

        logger.error(
            "prediction_failed",
            request_id=request_id,
            error=str(e),
        )
        raise
```

---

## Security & Responsible AI

### Input Validation

```python
import math
import uuid

from pydantic import BaseModel, Field, validator

class PredictionRequest(BaseModel):
    """Validated prediction request."""

    features: dict[str, float] = Field(..., min_items=1)
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    @validator("features")
    def validate_features(cls, v):
        required = {"feature_1", "feature_2", "feature_3"}
        missing = required - set(v.keys())
        if missing:
            raise ValueError(f"Missing required features: {missing}")

        for name, value in v.items():
            if not isinstance(value, (int, float)):
                raise ValueError(f"Feature {name} must be numeric")
            if math.isnan(value) or math.isinf(value):
                raise ValueError(f"Feature {name} contains invalid value")

        return v

    @validator("features")
    def validate_ranges(cls, v):
        ranges = {
            "feature_1": (0, 1),
            "feature_2": (-100, 100),
        }
        for name, (min_val, max_val) in ranges.items():
            if name in v and not (min_val <= v[name] <= max_val):
                raise ValueError(f"Feature {name} out of range [{min_val}, {max_val}]")
        return v
```

### Fairness Assessment

```python
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

def assess_fairness(
    data: pd.DataFrame,
    predictions: np.ndarray,
    protected_attribute: str,
    privileged_groups: list[dict],
    unprivileged_groups: list[dict],
) -> dict:
    """Assess model fairness across protected groups."""

    dataset = BinaryLabelDataset(
        df=data,
        label_names=["label"],
        protected_attribute_names=[protected_attribute],
    )

    classified_dataset = dataset.copy()
    classified_dataset.labels = predictions.reshape(-1, 1)

    metric = ClassificationMetric(
        dataset,
        classified_dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    return {
        "statistical_parity_difference": metric.statistical_parity_difference(),
        "equal_opportunity_difference": metric.equal_opportunity_difference(),
        "average_odds_difference": metric.average_odds_difference(),
        "disparate_impact": metric.disparate_impact(),
        "theil_index": metric.theil_index(),
    }

def check_fairness_thresholds(fairness_metrics: dict) -> list[str]:
    """Check if fairness metrics exceed acceptable thresholds."""
    thresholds = {
        "statistical_parity_difference": 0.1,
        "equal_opportunity_difference": 0.1,
        "average_odds_difference": 0.1,
        "disparate_impact": (0.8, 1.25),  # 80% rule
    }

    violations = []

    for metric, threshold in thresholds.items():
        value = fairness_metrics[metric]
        if isinstance(threshold, tuple):
            if not (threshold[0] <= value <= threshold[1]):
                violations.append(f"{metric}: {value:.3f} not in {threshold}")
        else:
            if abs(value) > threshold:
                violations.append(f"{metric}: {value:.3f} exceeds {threshold}")

    return violations
```

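AIF360 expects groups as lists of `{attribute: value}` mappings. A usage sketch of the helpers above, assuming an evaluation frame `eval_df` with a binary `gender` column and a fitted `model` (both names are illustrative):

```python
fairness_metrics = assess_fairness(
    data=eval_df,  # numeric DataFrame containing features, "label", and "gender"
    predictions=model.predict(eval_df.drop(columns=["label"])),
    protected_attribute="gender",
    privileged_groups=[{"gender": 1}],
    unprivileged_groups=[{"gender": 0}],
)

violations = check_fairness_thresholds(fairness_metrics)
if violations:
    raise RuntimeError(f"Fairness violations: {violations}")
```
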
### Model Explainability

```python
import pandas as pd
import shap

def explain_prediction(
    model,
    instance: pd.DataFrame,
    background_data: pd.DataFrame,
    top_k: int = 10
) -> dict:
    """Generate SHAP explanation for a prediction."""

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(instance)

    feature_importance = pd.DataFrame({
        "feature": instance.columns,
        "shap_value": shap_values[0],
        "feature_value": instance.values[0],
    })

    feature_importance["abs_shap"] = feature_importance["shap_value"].abs()
    feature_importance = feature_importance.sort_values("abs_shap", ascending=False)

    return {
        "base_value": explainer.expected_value,
        "prediction": model.predict(instance)[0],
        "top_features": feature_importance.head(top_k).to_dict(orient="records"),
    }
```

---

## Testing

### Unit Tests

```python
import pandas as pd
import pandera as pa
import pytest
from unittest.mock import Mock, patch

class TestFeatureTransformer:
    """Test feature transformation logic."""

    def test_fit_transform_numeric(self):
        df = pd.DataFrame({"feature_1": [0, 10, 20]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        result = transformer.fit_transform(df, config)

        # StandardScaler normalizes with population statistics, so compare with ddof=0
        assert result["feature_1"].mean() == pytest.approx(0, abs=1e-10)
        assert result["feature_1"].std(ddof=0) == pytest.approx(1, abs=1e-10)

    def test_transform_uses_fitted_params(self):
        train_df = pd.DataFrame({"feature_1": [0, 10, 20]})
        test_df = pd.DataFrame({"feature_1": [5, 15]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        transformer.fit_transform(train_df, config)
        result = transformer.transform(test_df)

        # Should use the training mean/std (10 and ~8.16), not the test data's: (5 - 10) / 8.16 ~ -0.61
        assert result["feature_1"].iloc[0] == pytest.approx(-0.61, abs=0.01)

    def test_save_load_roundtrip(self, tmp_path):
        df = pd.DataFrame({"feature_1": [0, 10, 20]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        transformer.fit_transform(df, config)

        path = tmp_path / "transformer.pkl"
        transformer.save(str(path))

        loaded = FeatureTransformer.load(str(path))

        assert loaded.scalers.keys() == transformer.scalers.keys()

class TestDataValidation:
    """Test data validation schemas."""

    def test_valid_data_passes(self):
        df = pd.DataFrame({
            "user_id": ["u1", "u2"],
            "feature_1": [0.5, 0.7],
            "feature_2": [1.0, 2.0],
            "label": [0, 1],
        })

        # Should not raise
        validated = TrainingDataSchema.validate(df)
        assert len(validated) == 2

    def test_invalid_range_fails(self):
        df = pd.DataFrame({
            "user_id": ["u1"],
            "feature_1": [1.5],  # Out of range [0, 1]
            "feature_2": [1.0],
            "label": [0],
        })

        with pytest.raises(pa.errors.SchemaError):
            TrainingDataSchema.validate(df)

    def test_missing_column_fails(self):
        df = pd.DataFrame({
            "user_id": ["u1"],
            "feature_1": [0.5],
            # Missing feature_2
            "label": [0],
        })

        with pytest.raises(pa.errors.SchemaError):
            TrainingDataSchema.validate(df)
```

### Model Tests

```python
import mlflow
import numpy as np
import pandas as pd
import pytest

class TestModelBehavior:
    """Test model behavior and invariants."""

    @pytest.fixture
    def trained_model(self):
        """Load a trained model for testing."""
        return mlflow.pyfunc.load_model("models:/fraud-detector/production")

    def test_prediction_deterministic(self, trained_model):
        """Same input should give same output."""
        features = pd.DataFrame([{"feature_1": 0.5, "feature_2": 1.0}])

        pred1 = trained_model.predict(features)
        pred2 = trained_model.predict(features)

        np.testing.assert_array_equal(pred1, pred2)

    def test_prediction_in_valid_range(self, trained_model):
        """Predictions should be valid probabilities."""
        features = pd.DataFrame([
            {"feature_1": 0.0, "feature_2": 0.0},
            {"feature_1": 1.0, "feature_2": 100.0},
            {"feature_1": 0.5, "feature_2": 50.0},
        ])

        predictions = trained_model.predict(features)

        assert all(0 <= p <= 1 for p in predictions)

    def test_monotonic_relationship(self, trained_model):
        """Higher-risk features should increase fraud probability."""
        low_risk = pd.DataFrame([{"feature_1": 0.1, "feature_2": 10}])
        high_risk = pd.DataFrame([{"feature_1": 0.9, "feature_2": 90}])

        low_pred = trained_model.predict(low_risk)[0]
        high_pred = trained_model.predict(high_risk)[0]

        assert high_pred > low_pred

    def test_no_discrimination_by_protected_attribute(self, trained_model):
        """Model should not discriminate based on protected attributes."""
        base_features = {"feature_1": 0.5, "feature_2": 50}

        pred_group_a = trained_model.predict(pd.DataFrame([{**base_features, "group": "A"}]))[0]
        pred_group_b = trained_model.predict(pd.DataFrame([{**base_features, "group": "B"}]))[0]

        # Predictions should be very close if group membership should not matter
        assert abs(pred_group_a - pred_group_b) < 0.01
```

### Integration Tests

```python
import pytest
from pydantic import ValidationError

class TestInferencePipeline:
    """Test end-to-end inference pipeline."""

    @pytest.fixture
    def inference_service(self):
        """Start inference service for testing."""
        # Start service in test mode
        return InferenceServiceClient("http://localhost:8080")

    def test_health_check(self, inference_service):
        """Service should be healthy."""
        response = inference_service.health()
        assert response.status == "healthy"
        assert response.model_loaded is True

    def test_single_prediction(self, inference_service):
        """Single prediction should succeed."""
        request = PredictionRequest(
            features={"feature_1": 0.5, "feature_2": 1.0}
        )

        response = inference_service.predict(request)

        assert "prediction" in response
        assert 0 <= response["prediction"] <= 1
        assert "request_id" in response

    def test_batch_prediction(self, inference_service):
        """Batch prediction should succeed."""
        requests = [
            {"feature_1": 0.1, "feature_2": 1.0},
            {"feature_1": 0.5, "feature_2": 2.0},
            {"feature_1": 0.9, "feature_2": 3.0},
        ]

        response = inference_service.predict_batch(requests)

        assert len(response["predictions"]) == 3

    def test_invalid_request_rejected(self):
        """Invalid requests should be rejected with a clear error."""
        with pytest.raises(ValidationError) as exc_info:
            PredictionRequest(features={"feature_1": "invalid"})  # Should be numeric

        assert "feature_1" in str(exc_info.value)
```

---

## Definition of Done

### Data Pipeline

- [ ] Data validation schema defined and enforced
- [ ] Data quality checks automated
- [ ] Feature engineering code tested
- [ ] No training/serving skew (transformers serialized)
- [ ] Data versioning in place

### Model Development

- [ ] Experiment tracked with all parameters and metrics
- [ ] Multiple metrics evaluated (not just accuracy)
- [ ] Fairness assessed across protected groups
- [ ] Hyperparameters optimized
- [ ] Model registered with signature

### Deployment

- [ ] Model packaged with dependencies
- [ ] Inference endpoint tested
- [ ] Latency meets SLA
- [ ] Scaling configuration defined
- [ ] Rollback procedure documented

### Monitoring

- [ ] Drift detection configured
- [ ] Performance alerts set up
- [ ] Logging in place
- [ ] Dashboards created
- [ ] Incident response plan documented

### Testing

- [ ] Unit tests for all transforms
- [ ] Model behavior tests passing
- [ ] Integration tests for inference
- [ ] Fairness tests passing
- [ ] Load testing completed (see the load-test sketch below)

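For the load-testing item, a minimal Locust sketch against the KServe endpoint shown earlier; the path and payload are illustrative:

```python
from locust import HttpUser, task, between

class InferenceUser(HttpUser):
    """Simulated client exercising the prediction endpoint to verify latency SLAs."""
    wait_time = between(0.1, 0.5)

    @task
    def predict(self):
        self.client.post(
            "/v1/models/fraud-detector:predict",
            json={"instances": [{"feature_1": 0.5, "feature_2": 1.0}]},
        )
```

Run it with `locust -f locustfile.py --host https://<ingress-host>` and compare the reported p95/p99 latencies against the SLA.
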
---

## Common Pitfalls

### 1. Ignoring Data Quality

```python
# Bad: Trust the data
df = pd.read_csv("data.csv")
model.fit(df)

# Good: Validate everything
df = pd.read_csv("data.csv")
validated_df = DataSchema.validate(df)
quality_report = run_quality_checks(validated_df)
if quality_report.has_critical_issues:
    raise DataQualityError(quality_report.issues)
model.fit(validated_df)
```

### 2. Training/Serving Skew

```python
# Bad: Different preprocessing in training vs serving
# training.py
df["feature"] = (df["feature"] - df["feature"].mean()) / df["feature"].std()

# serving.py
df["feature"] = (df["feature"] - 0.5) / 0.2  # Hardcoded values!

# Good: Serialize the transformer
transformer = FeatureTransformer()
transformer.fit_transform(train_df, config)
transformer.save("transformer.pkl")  # Use the same transformer everywhere
```

### 3. Overfitting to Offline Metrics

```python
# Bad: Deploy based on validation metrics alone
if val_accuracy > 0.95:
    deploy_model(model)

# Good: Use A/B testing in production
if val_accuracy > 0.95:
    deploy_to_shadow(model)

# After collecting production data
if ab_test_significant and production_lift > 0.01:
    promote_to_production(model)
```

### 4. Ignoring Fairness

```python
# Bad: Only optimize for accuracy
best_model = max(models, key=lambda m: m.accuracy)

# Good: Consider fairness constraints
valid_models = [m for m in models if passes_fairness_checks(m)]
if not valid_models:
    raise FairnessViolation("No model meets fairness criteria")
best_model = max(valid_models, key=lambda m: m.accuracy)
```

### 5. No Drift Monitoring

```python
# Bad: Deploy and forget
deploy_model(model)

# Good: Continuous monitoring
deploy_model(model)
schedule_drift_detection(model, frequency="hourly")
schedule_performance_monitoring(model, frequency="daily")
setup_alerts(model, thresholds=ALERT_THRESHOLDS)
```

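One way to make the "good" pattern concrete with the orchestrator already in this stack: a sketch assuming Prefect 2.x and the `detect_drift` helper from the Monitoring section; the data-loading and alerting helpers are assumptions, not part of this template.

```python
from prefect import flow

@flow(name="drift-monitoring")
def drift_monitoring_flow() -> None:
    """Hourly drift check that alerts when production data drifts from the training snapshot."""
    reference = load_reference_data()       # training-time snapshot (assumed helper)
    current = load_recent_inference_data()  # last hour of production traffic (assumed helper)

    result = detect_drift(reference, current, column_mapping=COLUMN_MAPPING)
    if result["drift_detected"]:
        send_alert(f"Data drift detected in: {result['drifted_columns']}")  # assumed helper

if __name__ == "__main__":
    # Serve the flow on an hourly cron schedule (Prefect 2.x)
    drift_monitoring_flow.serve(name="hourly-drift-check", cron="0 * * * *")
```
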
---

## Resources

- [Google ML Engineering Best Practices](https://developers.google.com/machine-learning/guides/rules-of-ml)
- [MLOps Principles](https://ml-ops.org/)
- [NIST AI Risk Management Framework](https://www.nist.gov/itl/ai-risk-management-framework)
- [Evidently AI - ML Monitoring](https://docs.evidentlyai.com/)
- [MLflow Documentation](https://mlflow.org/docs/latest/index.html)
- [KServe Documentation](https://kserve.github.io/website/)
- [Feast Feature Store](https://docs.feast.dev/)