agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,1136 @@
# ML/AI Development Guide

Staff-level guidelines for machine learning and artificial intelligence systems. This guide covers data engineering, model development, deployment, monitoring, and responsible AI practices.

---

## Overview

This guide applies to:

- Machine learning pipelines (training, evaluation, deployment)
- Deep learning systems (computer vision, NLP, recommendation systems)
- MLOps infrastructure (experiment tracking, feature stores, model registries)
- LLM/GenAI applications (fine-tuning, RAG, prompt engineering)
- Real-time and batch inference systems

### Key Principles

1. **Data-Centric Development** - Data quality beats algorithm complexity
2. **Reproducibility Is Non-Negotiable** - Version everything: data, code, configs, models
3. **Observability Over Uptime** - Monitor drift, not just infrastructure health
4. **Responsible AI** - Fairness, bias detection, and explainability by default
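
Principle 2 has a mechanical half that is easy to automate. The sketch below is illustrative rather than part of the template: it pins the common RNGs and fingerprints the exact config and raw data a run consumed (`set_global_seed` and `fingerprint` are hypothetical helper names).

```python
import hashlib
import json
import os
import random

import numpy as np
import torch


def set_global_seed(seed: int) -> None:
    """Seed every RNG the training code might touch."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # No-op when CUDA is unavailable
    os.environ["PYTHONHASHSEED"] = str(seed)


def fingerprint(config: dict, data_path: str) -> str:
    """Hash the config and raw data file so the exact inputs are recorded."""
    digest = hashlib.sha256(json.dumps(config, sort_keys=True).encode())
    with open(data_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
```

Log the returned hash alongside the experiment run so any model artifact can be traced back to the data and configuration that produced it.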

### Technology Stack

| Layer | Technology |
|-------|------------|
| Training | PyTorch, TensorFlow, scikit-learn, XGBoost |
| Experiment Tracking | MLflow, Weights & Biases, Neptune |
| Feature Store | Feast, Tecton, Hopsworks |
| Data Validation | TensorFlow Data Validation, Great Expectations |
| Model Serving | KServe, TorchServe, Triton, vLLM |
| Orchestration | Kubeflow, Airflow, Prefect, Dagster |
| Monitoring | Evidently, WhyLabs, Arize |

---

## Project Structure

```
ml-project/
├── data/
│   ├── raw/                  # Immutable raw data
│   ├── processed/            # Cleaned, transformed data
│   └── features/             # Feature store exports
├── src/
│   ├── data/                 # Data loading and validation
│   │   ├── loaders.py
│   │   ├── validators.py
│   │   └── transforms.py
│   ├── features/             # Feature engineering
│   │   ├── engineering.py
│   │   └── store.py
│   ├── models/               # Model definitions
│   │   ├── architectures.py
│   │   └── losses.py
│   ├── training/             # Training logic
│   │   ├── trainer.py
│   │   ├── callbacks.py
│   │   └── optimizers.py
│   ├── evaluation/           # Evaluation and metrics
│   │   ├── metrics.py
│   │   └── analysis.py
│   ├── inference/            # Serving code
│   │   ├── predictor.py
│   │   └── preprocessing.py
│   └── utils/                # Shared utilities
├── configs/                  # Experiment configurations
│   ├── model/
│   ├── training/
│   └── serving/
├── notebooks/                # Exploration (not production)
├── tests/
│   ├── unit/
│   ├── integration/
│   └── model/                # Model-specific tests
├── pipelines/                # ML pipeline definitions
│   ├── training_pipeline.py
│   └── inference_pipeline.py
└── deployments/              # Kubernetes/serving configs
    ├── kserve/
    └── docker/
```

---

## Data Engineering

### Data Validation

Validate all data at pipeline boundaries:

```python
import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series

class TrainingDataSchema(pa.DataFrameModel):
    """Schema for training data validation."""

    user_id: Series[str] = pa.Field(nullable=False)
    feature_1: Series[float] = pa.Field(ge=0, le=1)
    feature_2: Series[float] = pa.Field(nullable=False)
    label: Series[int] = pa.Field(isin=[0, 1])

    class Config:
        strict = True
        coerce = True

@pa.check_types
def load_training_data(path: str) -> DataFrame[TrainingDataSchema]:
    """Load and validate training data."""
    df = pd.read_parquet(path)
    return df  # Automatically validated against the schema
```

### Data Quality Checks

```python
from great_expectations.core import ExpectationSuite
from great_expectations.core.expectation_configuration import ExpectationConfiguration

def create_data_quality_suite() -> ExpectationSuite:
    """Define data quality expectations."""
    suite = ExpectationSuite(expectation_suite_name="training_data")

    # Completeness
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "user_id"},
    ))

    # Freshness
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "timestamp", "min_value": "2024-01-01"},
    ))

    # Distribution
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_mean_to_be_between",
        kwargs={"column": "feature_1", "min_value": 0.4, "max_value": 0.6},
    ))

    return suite
```
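
One way to execute the suite, sketched against the classic pandas-backed Great Expectations API (`great_expectations.from_pandas`); newer releases use checkpoints and validators instead, so treat the exact calls as an assumption about the installed version:

```python
import great_expectations as ge
import pandas as pd

def run_quality_checks(df: pd.DataFrame) -> None:
    """Validate a DataFrame against the suite and fail fast on issues."""
    ge_df = ge.from_pandas(df)
    results = ge_df.validate(expectation_suite=create_data_quality_suite())
    if not results.success:
        failed = [
            r.expectation_config.expectation_type
            for r in results.results if not r.success
        ]
        raise ValueError(f"Data quality checks failed: {failed}")
```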

### Feature Engineering

```python
from datetime import timedelta

import pandas as pd
from feast import Entity, FeatureStore, FeatureView, Field
from feast.types import Float32, Int64

# Define entities
user = Entity(name="user", join_keys=["user_id"])

# Define feature view (user_features_source is a Feast data source
# defined elsewhere in the feature repo)
user_features = FeatureView(
    name="user_features",
    entities=[user],
    schema=[
        Field(name="avg_session_duration", dtype=Float32),
        Field(name="total_purchases", dtype=Int64),
        Field(name="days_since_last_activity", dtype=Int64),
    ],
    source=user_features_source,
    ttl=timedelta(days=1),
)

# Retrieve features for training
def get_training_features(entity_df: pd.DataFrame) -> pd.DataFrame:
    """Get point-in-time correct historical features for training."""
    store = FeatureStore(repo_path="feature_repo/")

    training_df = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "user_features:avg_session_duration",
            "user_features:total_purchases",
            "user_features:days_since_last_activity",
        ],
    ).to_df()

    return training_df
```
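
The same feature definitions back the serving path. Continuing the example above (and assuming the views have been materialized to the online store), low-latency retrieval at inference time looks like this sketch:

```python
def get_serving_features(user_id: str) -> dict:
    """Fetch the latest feature values for one user at serving time."""
    store = FeatureStore(repo_path="feature_repo/")
    return store.get_online_features(
        features=[
            "user_features:avg_session_duration",
            "user_features:total_purchases",
            "user_features:days_since_last_activity",
        ],
        entity_rows=[{"user_id": user_id}],
    ).to_dict()
```

Because training and serving read the same registered views, feature definitions cannot silently diverge between the two paths.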

### Training/Serving Skew Prevention

```python
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

class FeatureTransformer:
    """Ensure identical transforms for training and serving."""

    def __init__(self):
        self.scalers: dict[str, StandardScaler] = {}
        self.encoders: dict[str, LabelEncoder] = {}

    def fit_transform(self, df: pd.DataFrame, config: TransformConfig) -> pd.DataFrame:
        """Fit and transform for training."""
        result = df.copy()

        for col in config.numeric_cols:
            self.scalers[col] = StandardScaler()
            result[col] = self.scalers[col].fit_transform(result[[col]])

        for col in config.categorical_cols:
            self.encoders[col] = LabelEncoder()
            result[col] = self.encoders[col].fit_transform(result[col])

        return result

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform for serving (uses fitted params)."""
        result = df.copy()

        for col, scaler in self.scalers.items():
            result[col] = scaler.transform(result[[col]])

        for col, encoder in self.encoders.items():
            result[col] = encoder.transform(result[col])

        return result

    def save(self, path: str) -> None:
        """Serialize transformer for serving."""
        joblib.dump({"scalers": self.scalers, "encoders": self.encoders}, path)

    @classmethod
    def load(cls, path: str) -> "FeatureTransformer":
        """Load transformer for serving."""
        data = joblib.load(path)
        transformer = cls()
        transformer.scalers = data["scalers"]
        transformer.encoders = data["encoders"]
        return transformer
```
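
A sketch of the intended round trip, assuming `TransformConfig` is a small config object listing column names (it is referenced but not defined in this guide): fit once on training data, persist the artifact, and load the identical object in the serving container.

```python
# Training side: fit on training data only, then persist alongside the model.
config = TransformConfig(numeric_cols=["feature_1"], categorical_cols=["country"])
transformer = FeatureTransformer()
train_features = transformer.fit_transform(train_df, config)
transformer.save("artifacts/transformer.pkl")

# Serving side: load the fitted transformer; never re-fit on live traffic.
serving_transformer = FeatureTransformer.load("artifacts/transformer.pkl")
features = serving_transformer.transform(request_df)
```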

---

## Model Development

### Experiment Tracking

```python
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

# TrainingConfig, train_model, and evaluate_model are project-level helpers
# defined elsewhere in the codebase.
def train_with_tracking(
    config: TrainingConfig,
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
) -> str:
    """Train model with full experiment tracking."""

    mlflow.set_experiment(config.experiment_name)

    with mlflow.start_run(run_name=config.run_name) as run:
        # Log parameters
        mlflow.log_params({
            "model_type": config.model_type,
            "learning_rate": config.learning_rate,
            "batch_size": config.batch_size,
            "epochs": config.epochs,
        })

        # Log data info
        mlflow.log_params({
            "train_samples": len(train_data),
            "val_samples": len(val_data),
            "feature_count": train_data.shape[1],
        })

        # Train
        model = train_model(config, train_data, val_data)

        # Log metrics
        metrics = evaluate_model(model, val_data)
        mlflow.log_metrics(metrics)

        # Log model with signature
        signature = mlflow.models.infer_signature(
            train_data.drop("label", axis=1),
            model.predict(train_data.drop("label", axis=1)),
        )
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # Log artifacts
        mlflow.log_artifact("configs/training_config.yaml")
        mlflow.log_artifact("data/feature_importance.png")

    return run.info.run_id
```
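
`MlflowClient` is imported above for registry operations. A sketch of registering the logged model and staging it for review, using the classic stage-based registry API (the registry name `fraud-detector` is illustrative):

```python
def register_and_promote(run_id: str, model_name: str = "fraud-detector") -> None:
    """Register the run's logged model and move it to Staging for review."""
    version = mlflow.register_model(f"runs:/{run_id}/model", model_name)

    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_name,
        version=version.version,
        stage="Staging",
    )
```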

### Evaluation Metrics

```python
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    precision_recall_fscore_support,
    roc_auc_score,
)

@dataclass
class ClassificationMetrics:
    """Comprehensive classification metrics."""
    accuracy: float
    precision: float
    recall: float
    f1: float
    roc_auc: float
    pr_auc: float

    @classmethod
    def compute(cls, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray) -> "ClassificationMetrics":
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary"
        )
        return cls(
            accuracy=accuracy_score(y_true, y_pred),
            precision=precision,
            recall=recall,
            f1=f1,
            roc_auc=roc_auc_score(y_true, y_prob),
            pr_auc=average_precision_score(y_true, y_prob),
        )

    def to_dict(self) -> dict[str, float]:
        return {
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1": self.f1,
            "roc_auc": self.roc_auc,
            "pr_auc": self.pr_auc,
        }

def evaluate_by_segment(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    segment_col: str,
) -> dict[str, ClassificationMetrics]:
    """Evaluate model performance across segments for fairness analysis."""
    results = {}

    for segment in X[segment_col].unique():
        mask = X[segment_col] == segment
        X_seg, y_seg = X[mask], y[mask]

        y_pred = model.predict(X_seg)
        y_prob = model.predict_proba(X_seg)[:, 1]

        results[segment] = ClassificationMetrics.compute(y_seg, y_pred, y_prob)

    return results
```

### Hyperparameter Optimization

```python
import optuna
from optuna.integration import MLflowCallback
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

def optimize_hyperparameters(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    n_trials: int = 100,
) -> dict:
    """Optimize hyperparameters with Optuna."""

    def objective(trial: optuna.Trial) -> float:
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

        model = XGBClassifier(**params, early_stopping_rounds=50)
        model.fit(
            train_data.drop("label", axis=1),
            train_data["label"],
            eval_set=[(val_data.drop("label", axis=1), val_data["label"])],
            verbose=False,
        )

        y_pred = model.predict_proba(val_data.drop("label", axis=1))[:, 1]
        return roc_auc_score(val_data["label"], y_pred)

    study = optuna.create_study(direction="maximize")
    study.optimize(
        objective,
        n_trials=n_trials,
        callbacks=[MLflowCallback(metric_name="val_roc_auc")],
    )

    return study.best_params
```

---

## Model Deployment

### Model Serving with KServe

```yaml
# kserve/inference-service.yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: fraud-detector
  annotations:
    serving.kserve.io/deploymentMode: Serverless
spec:
  predictor:
    model:
      modelFormat:
        name: mlflow
      storageUri: s3://models/fraud-detector/v1
      resources:
        limits:
          cpu: "2"
          memory: 4Gi
        requests:
          cpu: "1"
          memory: 2Gi
    minReplicas: 1
    maxReplicas: 10
    scaleTarget: 100
    scaleMetric: concurrency
```
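
Once the InferenceService reports ready, KServe exposes the v1 prediction protocol over HTTP. A minimal client sketch; the hostname below is illustrative, and the real URL comes from the InferenceService status:

```python
import requests

def call_fraud_detector(instances: list[dict]) -> dict:
    """Call the fraud-detector endpoint via KServe's v1 protocol."""
    url = "http://fraud-detector.default.example.com/v1/models/fraud-detector:predict"
    response = requests.post(url, json={"instances": instances}, timeout=5)
    response.raise_for_status()
    return response.json()

result = call_fraud_detector([{"feature_1": 0.5, "feature_2": 1.0}])
print(result["predictions"])
```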

### Custom Predictor

```python
import os

import pandas as pd
import torch
from kserve import Model, ModelServer
from kserve.errors import ModelMissingError

class FraudPredictor(Model):
    """Custom KServe predictor for fraud detection."""

    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.transformer = None
        self.ready = False

    def load(self) -> bool:
        """Load model and preprocessing artifacts."""
        model_path = os.environ.get("MODEL_PATH", "/mnt/models")

        self.model = torch.jit.load(f"{model_path}/model.pt")
        self.model.eval()

        self.transformer = FeatureTransformer.load(f"{model_path}/transformer.pkl")

        self.ready = True
        return self.ready

    def predict(self, payload: dict, headers: dict = None) -> dict:
        """Run inference."""
        if not self.ready:
            raise ModelMissingError(self.name)

        # Preprocess
        df = pd.DataFrame(payload["instances"])
        features = self.transformer.transform(df)
        tensor = torch.tensor(features.values, dtype=torch.float32)

        # Inference
        with torch.no_grad():
            logits = self.model(tensor)
            probs = torch.sigmoid(logits).numpy()

        return {
            "predictions": probs.tolist(),
            "model_version": os.environ.get("MODEL_VERSION", "unknown"),
        }

if __name__ == "__main__":
    model = FraudPredictor("fraud-detector")
    model.load()
    ModelServer().start([model])
```

### Batch Inference Pipeline

```python
from datetime import datetime, timedelta

import mlflow
import pandas as pd
from prefect import flow, task
from prefect.tasks import task_input_hash

@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
def load_batch_data(date: str) -> pd.DataFrame:
    """Load data for batch inference."""
    return pd.read_parquet(f"s3://data/features/{date}/")

@task
def run_batch_inference(data: pd.DataFrame, model_uri: str) -> pd.DataFrame:
    """Run batch inference on data."""
    model = mlflow.pyfunc.load_model(model_uri)

    predictions = model.predict(data)

    data["prediction"] = predictions
    data["model_version"] = model_uri.split("/")[-1]
    data["inference_timestamp"] = datetime.utcnow()

    return data

@task
def write_predictions(predictions: pd.DataFrame, date: str) -> None:
    """Write predictions to storage."""
    predictions.to_parquet(
        f"s3://predictions/{date}/predictions.parquet",
        index=False,
    )

@flow(name="batch-inference")
def batch_inference_pipeline(date: str, model_uri: str) -> None:
    """Daily batch inference pipeline."""
    data = load_batch_data(date)
    predictions = run_batch_inference(data, model_uri)
    write_predictions(predictions, date)
```

---

## Monitoring & Observability

### Drift Detection

```python
import pandas as pd
from evidently import ColumnMapping
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.report import Report

def detect_drift(
    reference_data: pd.DataFrame,
    current_data: pd.DataFrame,
    column_mapping: ColumnMapping,
) -> dict:
    """Detect data and prediction drift."""

    report = Report(metrics=[
        DataDriftPreset(),
        TargetDriftPreset(),
    ])

    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )

    result = report.as_dict()

    drift_detected = result["metrics"][0]["result"]["dataset_drift"]
    drift_share = result["metrics"][0]["result"]["drift_share"]

    return {
        "drift_detected": drift_detected,
        "drift_share": drift_share,
        "drifted_columns": [
            col for col, info in result["metrics"][0]["result"]["drift_by_columns"].items()
            if info["drift_detected"]
        ],
    }

# ClassificationMetrics is defined in the Evaluation Metrics section;
# load_baseline_metrics is a project helper.
def monitor_model_performance(
    predictions: pd.DataFrame,
    actuals: pd.DataFrame,
    threshold: float = 0.05,
) -> dict:
    """Monitor model performance degradation."""

    merged = predictions.merge(actuals, on="id")

    current_metrics = ClassificationMetrics.compute(
        merged["actual"],
        merged["prediction"],
        merged["probability"],
    ).to_dict()

    baseline_metrics = load_baseline_metrics()

    degradation = {
        metric: (baseline_metrics[metric] - current_metrics[metric]) / baseline_metrics[metric]
        for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]
    }

    alerts = [
        metric for metric, deg in degradation.items()
        if deg > threshold
    ]

    return {
        "current_metrics": current_metrics,
        "degradation": degradation,
        "alerts": alerts,
    }
```

### Logging & Metrics

```python
import time
import uuid

import structlog
from prometheus_client import Counter, Gauge, Histogram

# Structured logging
logger = structlog.get_logger()

# Prometheus metrics
PREDICTION_LATENCY = Histogram(
    "model_prediction_latency_seconds",
    "Time spent processing prediction",
    buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
)

PREDICTION_COUNT = Counter(
    "model_predictions_total",
    "Total number of predictions",
    ["model_name", "model_version", "outcome"],
)

FEATURE_VALUE = Gauge(
    "model_feature_value",
    "Feature value distribution",
    ["feature_name", "quantile"],
)

def predict_with_observability(model, features: dict) -> dict:
    """Make prediction with full observability."""

    request_id = str(uuid.uuid4())
    start_time = time.time()

    logger.info(
        "prediction_started",
        request_id=request_id,
        feature_count=len(features),
    )

    try:
        with PREDICTION_LATENCY.time():
            prediction = model.predict(features)

        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="success",
        ).inc()

        logger.info(
            "prediction_completed",
            request_id=request_id,
            prediction=prediction,
            latency_ms=(time.time() - start_time) * 1000,
        )

        return {"prediction": prediction, "request_id": request_id}

    except Exception as e:
        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="error",
        ).inc()

        logger.error(
            "prediction_failed",
            request_id=request_id,
            error=str(e),
        )
        raise
```

---

## Security & Responsible AI

### Input Validation

```python
import math
import uuid

from pydantic import BaseModel, Field, validator

class PredictionRequest(BaseModel):
    """Validated prediction request."""

    features: dict[str, float] = Field(...)
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    @validator("features")
    def validate_features(cls, v):
        if not v:
            raise ValueError("At least one feature is required")

        required = {"feature_1", "feature_2", "feature_3"}
        missing = required - set(v.keys())
        if missing:
            raise ValueError(f"Missing required features: {missing}")

        for name, value in v.items():
            if not isinstance(value, (int, float)):
                raise ValueError(f"Feature {name} must be numeric")
            if math.isnan(value) or math.isinf(value):
                raise ValueError(f"Feature {name} contains invalid value")

        return v

    @validator("features")
    def validate_ranges(cls, v):
        ranges = {
            "feature_1": (0, 1),
            "feature_2": (-100, 100),
        }
        for name, (min_val, max_val) in ranges.items():
            if name in v and not (min_val <= v[name] <= max_val):
                raise ValueError(f"Feature {name} out of range [{min_val}, {max_val}]")
        return v
```
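
A short usage sketch of the request model (pydantic v1 API, matching the `validator` import above). Construction either yields a fully validated request or raises `ValidationError`, so malformed input never reaches the model:

```python
from pydantic import ValidationError

try:
    request = PredictionRequest(
        features={"feature_1": 0.5, "feature_2": 10.0, "feature_3": 1.0}
    )
    print(request.request_id)  # Auto-generated UUID
except ValidationError as exc:
    # Return a 4xx error to the caller instead of scoring bad input
    print(exc.errors())
```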

### Fairness Assessment

```python
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric

def assess_fairness(
    data: pd.DataFrame,
    predictions: np.ndarray,
    protected_attribute: str,
    privileged_groups: list[dict],
    unprivileged_groups: list[dict],
) -> dict:
    """Assess model fairness across protected groups."""

    dataset = BinaryLabelDataset(
        df=data,
        label_names=["label"],
        protected_attribute_names=[protected_attribute],
    )

    classified_dataset = dataset.copy()
    classified_dataset.labels = predictions.reshape(-1, 1)

    metric = ClassificationMetric(
        dataset,
        classified_dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    return {
        "statistical_parity_difference": metric.statistical_parity_difference(),
        "equal_opportunity_difference": metric.equal_opportunity_difference(),
        "average_odds_difference": metric.average_odds_difference(),
        "disparate_impact": metric.disparate_impact(),
        "theil_index": metric.theil_index(),
    }

def check_fairness_thresholds(fairness_metrics: dict) -> list[str]:
    """Check if fairness metrics exceed acceptable thresholds."""
    thresholds = {
        "statistical_parity_difference": 0.1,
        "equal_opportunity_difference": 0.1,
        "average_odds_difference": 0.1,
        "disparate_impact": (0.8, 1.25),  # 80% rule
    }

    violations = []

    for metric, threshold in thresholds.items():
        value = fairness_metrics[metric]
        if isinstance(threshold, tuple):
            if not (threshold[0] <= value <= threshold[1]):
                violations.append(f"{metric}: {value:.3f} not in {threshold}")
        else:
            if abs(value) > threshold:
                violations.append(f"{metric}: {value:.3f} exceeds {threshold}")

    return violations
```

### Model Explainability

```python
import pandas as pd
import shap

def explain_prediction(
    model,
    instance: pd.DataFrame,
    background_data: pd.DataFrame,
    top_k: int = 10,
) -> dict:
    """Generate SHAP explanation for a prediction."""

    explainer = shap.TreeExplainer(model, data=background_data)
    shap_values = explainer.shap_values(instance)

    feature_importance = pd.DataFrame({
        "feature": instance.columns,
        "shap_value": shap_values[0],
        "feature_value": instance.values[0],
    })

    feature_importance["abs_shap"] = feature_importance["shap_value"].abs()
    feature_importance = feature_importance.sort_values("abs_shap", ascending=False)

    return {
        "base_value": explainer.expected_value,
        "prediction": model.predict(instance)[0],
        "top_features": feature_importance.head(top_k).to_dict(orient="records"),
    }
```

---

## Testing

### Unit Tests

```python
import pandas as pd
import pandera as pa
import pytest

class TestFeatureTransformer:
    """Test feature transformation logic."""

    def test_fit_transform_numeric(self):
        df = pd.DataFrame({"feature_1": [0, 10, 20]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        result = transformer.fit_transform(df, config)

        # StandardScaler normalizes with population std (ddof=0)
        assert result["feature_1"].mean() == pytest.approx(0, abs=1e-10)
        assert result["feature_1"].std(ddof=0) == pytest.approx(1, abs=1e-10)

    def test_transform_uses_fitted_params(self):
        train_df = pd.DataFrame({"feature_1": [0, 10, 20]})
        test_df = pd.DataFrame({"feature_1": [5, 15]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        transformer.fit_transform(train_df, config)
        result = transformer.transform(test_df)

        # Should use the training mean/std (mean=10, std≈8.165), not test data
        assert result["feature_1"].iloc[0] == pytest.approx(-0.612, abs=0.01)

    def test_save_load_roundtrip(self, tmp_path):
        df = pd.DataFrame({"feature_1": [0, 10, 20]})
        config = TransformConfig(numeric_cols=["feature_1"])

        transformer = FeatureTransformer()
        transformer.fit_transform(df, config)

        path = tmp_path / "transformer.pkl"
        transformer.save(str(path))

        loaded = FeatureTransformer.load(str(path))

        assert loaded.scalers.keys() == transformer.scalers.keys()

class TestDataValidation:
    """Test data validation schemas."""

    def test_valid_data_passes(self):
        df = pd.DataFrame({
            "user_id": ["u1", "u2"],
            "feature_1": [0.5, 0.7],
            "feature_2": [1.0, 2.0],
            "label": [0, 1],
        })

        # Should not raise
        validated = TrainingDataSchema.validate(df)
        assert len(validated) == 2

    def test_invalid_range_fails(self):
        df = pd.DataFrame({
            "user_id": ["u1"],
            "feature_1": [1.5],  # Out of range [0, 1]
            "feature_2": [1.0],
            "label": [0],
        })

        with pytest.raises(pa.errors.SchemaError):
            TrainingDataSchema.validate(df)

    def test_missing_column_fails(self):
        df = pd.DataFrame({
            "user_id": ["u1"],
            "feature_1": [0.5],
            # Missing feature_2
            "label": [0],
        })

        with pytest.raises(pa.errors.SchemaError):
            TrainingDataSchema.validate(df)
```

### Model Tests

```python
import mlflow
import numpy as np
import pandas as pd
import pytest

class TestModelBehavior:
    """Test model behavior and invariants."""

    @pytest.fixture
    def trained_model(self):
        """Load a trained model for testing."""
        return mlflow.pyfunc.load_model("models:/fraud-detector/production")

    def test_prediction_deterministic(self, trained_model):
        """Same input should give same output."""
        features = pd.DataFrame([{"feature_1": 0.5, "feature_2": 1.0}])

        pred1 = trained_model.predict(features)
        pred2 = trained_model.predict(features)

        np.testing.assert_array_equal(pred1, pred2)

    def test_prediction_in_valid_range(self, trained_model):
        """Predictions should be valid probabilities."""
        features = pd.DataFrame([
            {"feature_1": 0.0, "feature_2": 0.0},
            {"feature_1": 1.0, "feature_2": 100.0},
            {"feature_1": 0.5, "feature_2": 50.0},
        ])

        predictions = trained_model.predict(features)

        assert all(0 <= p <= 1 for p in predictions)

    def test_monotonic_relationship(self, trained_model):
        """Higher-risk features should increase fraud probability."""
        low_risk = pd.DataFrame([{"feature_1": 0.1, "feature_2": 10}])
        high_risk = pd.DataFrame([{"feature_1": 0.9, "feature_2": 90}])

        low_pred = trained_model.predict(low_risk)[0]
        high_pred = trained_model.predict(high_risk)[0]

        assert high_pred > low_pred

    def test_no_discrimination_by_protected_attribute(self, trained_model):
        """Model should not discriminate based on protected attributes."""
        base_features = {"feature_1": 0.5, "feature_2": 50}

        pred_group_a = trained_model.predict(pd.DataFrame([{**base_features, "group": "A"}]))[0]
        pred_group_b = trained_model.predict(pd.DataFrame([{**base_features, "group": "B"}]))[0]

        # Predictions should be very close if group membership shouldn't matter
        assert abs(pred_group_a - pred_group_b) < 0.01
```

### Integration Tests

```python
import pytest
from pydantic import ValidationError

class TestInferencePipeline:
    """Test the end-to-end inference pipeline."""

    @pytest.fixture
    def inference_service(self):
        """Start inference service for testing."""
        # Start service in test mode
        return InferenceServiceClient("http://localhost:8080")

    def test_health_check(self, inference_service):
        """Service should be healthy."""
        response = inference_service.health()
        assert response.status == "healthy"
        assert response.model_loaded is True

    def test_single_prediction(self, inference_service):
        """Single prediction should succeed."""
        request = PredictionRequest(
            features={"feature_1": 0.5, "feature_2": 1.0}
        )

        response = inference_service.predict(request)

        assert "prediction" in response
        assert 0 <= response["prediction"] <= 1
        assert "request_id" in response

    def test_batch_prediction(self, inference_service):
        """Batch prediction should succeed."""
        requests = [
            {"feature_1": 0.1, "feature_2": 1.0},
            {"feature_1": 0.5, "feature_2": 2.0},
            {"feature_1": 0.9, "feature_2": 3.0},
        ]

        response = inference_service.predict_batch(requests)

        assert len(response["predictions"]) == 3

    def test_invalid_request_rejected(self):
        """Invalid requests should be rejected with a clear error."""
        # Validation fails at request construction, before any network call
        with pytest.raises(ValidationError):
            PredictionRequest(features={"feature_1": "invalid"})  # not numeric
```

---

## Definition of Done

### Data Pipeline

- [ ] Data validation schema defined and enforced
- [ ] Data quality checks automated
- [ ] Feature engineering code tested
- [ ] No training/serving skew (transformers serialized)
- [ ] Data versioning in place

### Model Development

- [ ] Experiment tracked with all parameters and metrics
- [ ] Multiple metrics evaluated (not just accuracy)
- [ ] Fairness assessed across protected groups
- [ ] Hyperparameters optimized
- [ ] Model registered with signature

### Deployment

- [ ] Model packaged with dependencies
- [ ] Inference endpoint tested
- [ ] Latency meets SLA
- [ ] Scaling configuration defined
- [ ] Rollback procedure documented

### Monitoring

- [ ] Drift detection configured
- [ ] Performance alerts set up
- [ ] Logging in place
- [ ] Dashboards created
- [ ] Incident response plan documented

### Testing

- [ ] Unit tests for all transforms
- [ ] Model behavior tests passing
- [ ] Integration tests for inference
- [ ] Fairness tests passing
- [ ] Load testing completed

---

## Common Pitfalls

### 1. Ignoring Data Quality

```python
# Bad: Trust the data
df = pd.read_csv("data.csv")
model.fit(df)

# Good: Validate everything
df = pd.read_csv("data.csv")
validated_df = DataSchema.validate(df)
quality_report = run_quality_checks(validated_df)
if quality_report.has_critical_issues:
    raise DataQualityError(quality_report.issues)
model.fit(validated_df)
```

### 2. Training/Serving Skew

```python
# Bad: Different preprocessing in training vs serving
# training.py
df["feature"] = (df["feature"] - df["feature"].mean()) / df["feature"].std()

# serving.py
df["feature"] = (df["feature"] - 0.5) / 0.2  # Hardcoded values!

# Good: Serialize the transformer
transformer = FeatureTransformer()
transformer.fit_transform(train_df, config)
transformer.save("transformer.pkl")  # Use the same transformer everywhere
```

### 3. Overfitting to Offline Metrics

```python
# Bad: Deploy based on validation metrics alone
if val_accuracy > 0.95:
    deploy_model(model)

# Good: Use A/B testing in production
if val_accuracy > 0.95:
    deploy_to_shadow(model)

# After collecting production data
if ab_test_significant and production_lift > 0.01:
    promote_to_production(model)
```

### 4. Ignoring Fairness

```python
# Bad: Only optimize for accuracy
best_model = max(models, key=lambda m: m.accuracy)

# Good: Consider fairness constraints
valid_models = [m for m in models if passes_fairness_checks(m)]
if not valid_models:
    raise FairnessViolation("No model meets fairness criteria")
best_model = max(valid_models, key=lambda m: m.accuracy)
```

### 5. No Drift Monitoring

```python
# Bad: Deploy and forget
deploy_model(model)

# Good: Continuous monitoring
deploy_model(model)
schedule_drift_detection(model, frequency="hourly")
schedule_performance_monitoring(model, frequency="daily")
setup_alerts(model, thresholds=ALERT_THRESHOLDS)
```

---

## Resources

- [Google ML Engineering Best Practices](https://developers.google.com/machine-learning/guides/rules-of-ml)
- [MLOps Principles](https://ml-ops.org/)
- [NIST AI Risk Management Framework](https://www.nist.gov/itl/ai-risk-management-framework)
- [Evidently AI - ML Monitoring](https://docs.evidentlyai.com/)
- [MLflow Documentation](https://mlflow.org/docs/latest/index.html)
- [KServe Documentation](https://kserve.github.io/website/)
- [Feast Feature Store](https://docs.feast.dev/)