agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0

package/templates/ml-ai/.cursorrules/monitoring.md
@@ -0,0 +1,658 @@

# ML Monitoring & Observability

Guidelines for monitoring machine learning systems in production, including drift detection, performance tracking, and alerting strategies.

## Core Monitoring Concepts

### ML-Specific Challenges

Traditional monitoring (CPU, memory, latency) is necessary but insufficient for ML systems. Models can silently degrade while infrastructure metrics stay green.

| Monitoring Type | What It Catches | What It Misses |
|-----------------|-----------------|----------------|
| Infrastructure | Server crashes, OOM | Silent model degradation |
| Application | API errors, latency spikes | Prediction quality decline |
| **ML-Specific** | Data drift, concept drift | Nothing (when done right) |

### Key Metrics to Track

| Category | Metrics | Purpose |
|----------|---------|---------|
| Data Quality | Missing values, schema violations, outliers | Input health |
| Data Drift | Feature distributions, statistical tests | Distribution shift |
| Model Performance | Accuracy, precision, recall, business KPIs | Output quality |
| Concept Drift | Prediction distribution, label correlation | Relationship changes |
| Operational | Latency, throughput, error rates | System health |

## Drift Detection

### Data Drift

Detect when input distributions change:

```python
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

def detect_data_drift(
    reference_data: pd.DataFrame,
    current_data: pd.DataFrame,
    column_mapping: ColumnMapping,
    threshold: float = 0.05,
) -> dict:
    """Detect drift in feature distributions."""

    report = Report(metrics=[
        DataDriftPreset(stattest_threshold=threshold),
    ])

    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )

    result = report.as_dict()
    drift_info = result["metrics"][0]["result"]

    return {
        "dataset_drift": drift_info["dataset_drift"],
        "drift_share": drift_info["drift_share"],
        "drifted_columns": [
            col for col, info in drift_info["drift_by_columns"].items()
            if info["drift_detected"]
        ],
        "column_details": {
            col: {
                "drift_detected": info["drift_detected"],
                "stattest": info["stattest_name"],
                "p_value": info.get("p_value"),
            }
            for col, info in drift_info["drift_by_columns"].items()
        },
    }
```
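
A usage sketch, continuing from the block above (the dataframes and feature names are illustrative; `reference_df` would typically be the training set or a frozen window of known-good traffic):

```python
# Hypothetical wiring: feature names and dataframes are placeholders
mapping = ColumnMapping(
    numerical_features=["age", "balance"],
    categorical_features=["country"],
)
results = detect_data_drift(reference_df, current_df, mapping)
if results["dataset_drift"]:
    print(f"Drift in {results['drift_share']:.1%} of features:", results["drifted_columns"])
```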

### Concept Drift

Detect when the relationship between features and target changes:

```python
import numpy as np
import pandas as pd
from scipy import stats

def detect_concept_drift(
    reference_predictions: pd.DataFrame,
    current_predictions: pd.DataFrame,
) -> dict:
    """Detect changes in prediction patterns."""

    # Compare prediction distributions
    ref_preds = reference_predictions["probability"]
    curr_preds = current_predictions["probability"]

    # Kolmogorov-Smirnov test
    ks_stat, ks_pvalue = stats.ks_2samp(ref_preds, curr_preds)

    # Population Stability Index (defined below)
    psi = calculate_psi(ref_preds, curr_preds, buckets=10)

    # Jensen-Shannon divergence (helper sketched after this block)
    js_divergence = calculate_js_divergence(ref_preds, curr_preds)

    return {
        "ks_statistic": ks_stat,
        "ks_pvalue": ks_pvalue,
        "psi": psi,
        "js_divergence": js_divergence,
        "drift_detected": psi > 0.1 or ks_pvalue < 0.05,
    }

def calculate_psi(reference: pd.Series, current: pd.Series, buckets: int = 10) -> float:
    """Calculate Population Stability Index."""

    # Create bins from the reference distribution
    _, bins = pd.cut(reference, buckets, retbins=True)
    bins[0] = -np.inf
    bins[-1] = np.inf

    # Calculate proportions per bin
    ref_counts = pd.cut(reference, bins).value_counts(normalize=True)
    curr_counts = pd.cut(current, bins).value_counts(normalize=True)

    # Align indices and guard empty bins on both sides to avoid division by zero
    ref_counts = ref_counts.reindex(curr_counts.index, fill_value=0).replace(0, 0.0001)
    curr_counts = curr_counts.replace(0, 0.0001)

    # PSI formula: sum((curr - ref) * ln(curr / ref))
    psi = np.sum((curr_counts - ref_counts) * np.log(curr_counts / ref_counts))

    return float(psi)
```
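
`calculate_js_divergence` is referenced above but not defined in this template; a minimal sketch using `scipy.spatial.distance.jensenshannon`, which returns the JS distance (the square root of the divergence):

```python
import numpy as np
import pandas as pd
from scipy.spatial import distance

def calculate_js_divergence(reference: pd.Series, current: pd.Series, buckets: int = 10) -> float:
    """Approximate Jensen-Shannon divergence over shared histogram bins."""
    bins = np.histogram_bin_edges(pd.concat([reference, current]), bins=buckets)
    ref_hist, _ = np.histogram(reference, bins=bins)
    curr_hist, _ = np.histogram(current, bins=bins)
    # jensenshannon normalizes the inputs and returns the JS *distance*;
    # square it to get the divergence
    return float(distance.jensenshannon(ref_hist, curr_hist) ** 2)
```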

### Continuous Drift Monitoring

```python
import pandas as pd
from datetime import datetime
from prefect import flow, task

@task
def fetch_reference_data(model_version: str) -> pd.DataFrame:
    """Fetch reference data for the model version."""
    return pd.read_parquet(f"s3://reference-data/{model_version}/")

@task
def fetch_current_data(date: str) -> pd.DataFrame:
    """Fetch current production data."""
    return pd.read_parquet(f"s3://production-data/{date}/")

@task
def run_drift_detection(
    reference: pd.DataFrame,
    current: pd.DataFrame,
) -> dict:
    """Run all drift detection checks."""

    # NUMERIC_FEATURES / CATEGORICAL_FEATURES are module-level feature lists;
    # ColumnMapping and the detect_* functions come from the sections above
    column_mapping = ColumnMapping(
        target="label",
        prediction="prediction",
        numerical_features=NUMERIC_FEATURES,
        categorical_features=CATEGORICAL_FEATURES,
    )

    data_drift = detect_data_drift(reference, current, column_mapping)
    concept_drift = detect_concept_drift(reference, current)

    return {
        "data_drift": data_drift,
        "concept_drift": concept_drift,
        "timestamp": datetime.utcnow().isoformat(),
    }

@task
def send_alerts(drift_results: dict) -> None:
    """Send alerts if drift detected."""

    if drift_results["data_drift"]["dataset_drift"]:
        send_alert(
            channel="ml-alerts",
            severity="warning",
            message=f"Data drift detected: {drift_results['data_drift']['drift_share']:.1%} of features drifted",
            details=drift_results["data_drift"]["drifted_columns"],
        )

    if drift_results["concept_drift"]["drift_detected"]:
        send_alert(
            channel="ml-alerts",
            severity="critical",
            message=f"Concept drift detected: PSI={drift_results['concept_drift']['psi']:.3f}",
            details=drift_results["concept_drift"],
        )

@flow(name="drift-monitoring")
def drift_monitoring_pipeline(date: str, model_version: str):
    """Daily drift monitoring pipeline."""

    reference = fetch_reference_data(model_version)
    current = fetch_current_data(date)

    drift_results = run_drift_detection(reference, current)

    # Log to MLflow
    log_drift_metrics(drift_results)

    # Alert if needed
    send_alerts(drift_results)

    return drift_results
```
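
The pipeline assumes `log_drift_metrics` and `send_alert` helpers. A minimal sketch, assuming MLflow for metric logging and a Slack incoming webhook (the `SLACK_WEBHOOK_URL` environment variable is illustrative):

```python
import os

import mlflow
import requests

def log_drift_metrics(drift_results: dict) -> None:
    """Record scalar drift metrics on the active MLflow run."""
    mlflow.log_metric("drift_share", drift_results["data_drift"]["drift_share"])
    mlflow.log_metric("psi", drift_results["concept_drift"]["psi"])

def send_alert(channel: str, severity: str, message: str, details: object = None) -> None:
    """Post the alert to a Slack incoming webhook."""
    requests.post(
        os.environ["SLACK_WEBHOOK_URL"],
        json={"text": f"[{severity.upper()}] #{channel}: {message}\n{details}"},
        timeout=10,
    )
```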

## Performance Monitoring

### Real-Time Metrics

```python
import time

from prometheus_client import Counter, Histogram, Gauge, Summary

# Prediction metrics
PREDICTION_LATENCY = Histogram(
    "model_prediction_latency_seconds",
    "Time to generate prediction",
    ["model_name", "model_version"],
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
)

PREDICTION_COUNT = Counter(
    "model_predictions_total",
    "Total predictions made",
    ["model_name", "model_version", "outcome"],
)

PREDICTION_VALUE = Summary(
    "model_prediction_value",
    "Distribution of prediction values",
    ["model_name", "model_version"],
)

FEATURE_VALUE = Gauge(
    "model_feature_value",
    "Feature value statistics",
    ["model_name", "feature_name", "statistic"],
)

# Instrumented prediction function
def predict_with_metrics(model, features: dict) -> dict:
    """Make prediction with full metric instrumentation."""

    start_time = time.time()

    try:
        prediction = model.predict(features)

        # Record metrics
        latency = time.time() - start_time
        PREDICTION_LATENCY.labels(
            model_name=model.name,
            model_version=model.version,
        ).observe(latency)

        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="success",
        ).inc()

        PREDICTION_VALUE.labels(
            model_name=model.name,
            model_version=model.version,
        ).observe(prediction)

        return {"prediction": prediction, "latency_ms": latency * 1000}

    except Exception:
        PREDICTION_COUNT.labels(
            model_name=model.name,
            model_version=model.version,
            outcome="error",
        ).inc()
        raise
```
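
These metrics still need an endpoint for Prometheus to scrape. The simplest option is `prometheus_client.start_http_server`; the port here is illustrative:

```python
from prometheus_client import start_http_server

# Expose /metrics on port 9100; Prometheus scrapes it on its own schedule
start_http_server(9100)
```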

### Delayed Labels Monitoring

When ground truth arrives later than predictions:

```python
from datetime import datetime, timedelta

class DelayedLabelMonitor:
    """Monitor model performance with delayed ground truth."""

    def __init__(self, model_name: str, lookback_days: int = 30):
        self.model_name = model_name
        self.lookback_days = lookback_days

    def compute_metrics(self, date: str) -> dict:
        """Compute metrics for predictions that now have labels."""

        # Load predictions from N days ago
        prediction_date = (
            datetime.strptime(date, "%Y-%m-%d") - timedelta(days=self.lookback_days)
        ).strftime("%Y-%m-%d")

        predictions = load_predictions(self.model_name, prediction_date)
        labels = load_labels(prediction_date)

        # Join and compute metrics
        joined = predictions.merge(labels, on="id")

        metrics = ClassificationMetrics.compute(
            joined["label"],
            joined["prediction"],
            joined["probability"],
        )

        # Compare to baseline
        baseline_metrics = load_baseline_metrics(self.model_name)

        degradation = {
            metric: (baseline - current) / baseline
            for metric, baseline, current in [
                ("accuracy", baseline_metrics["accuracy"], metrics.accuracy),
                ("precision", baseline_metrics["precision"], metrics.precision),
                ("recall", baseline_metrics["recall"], metrics.recall),
                ("f1", baseline_metrics["f1"], metrics.f1),
            ]
        }

        return {
            "date": prediction_date,
            "metrics": metrics.to_dict(),
            "baseline_metrics": baseline_metrics,
            "degradation": degradation,
            "sample_size": len(joined),
        }
```
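
`load_predictions`, `load_labels`, `load_baseline_metrics`, and `ClassificationMetrics` are assumed helpers. A minimal `ClassificationMetrics` sketch built on scikit-learn:

```python
from dataclasses import asdict, dataclass

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

@dataclass
class ClassificationMetrics:
    accuracy: float
    precision: float
    recall: float
    f1: float

    @classmethod
    def compute(cls, labels, predictions, probabilities=None) -> "ClassificationMetrics":
        # probabilities accepted for API compatibility (e.g., adding AUC later)
        return cls(
            accuracy=accuracy_score(labels, predictions),
            precision=precision_score(labels, predictions),
            recall=recall_score(labels, predictions),
            f1=f1_score(labels, predictions),
        )

    def to_dict(self) -> dict:
        return asdict(self)
```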

## Logging & Structured Observability

### Structured Logging

```python
import logging
import time

import structlog

# Configure structlog for JSON output
structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
)

logger = structlog.get_logger()

# transformer, THRESHOLD, PredictionRequest, and PredictionResponse are
# defined elsewhere in the serving code
def predict_with_logging(model, request: PredictionRequest) -> PredictionResponse:
    """Make prediction with comprehensive logging."""

    # Bind request context
    log = logger.bind(
        request_id=request.request_id,
        model_name=model.name,
        model_version=model.version,
    )

    log.info("prediction_started", feature_count=len(request.features))

    start_time = time.time()

    try:
        # Preprocessing
        features = transformer.transform(request.features)
        log.debug("preprocessing_complete", transformed_shape=features.shape)

        # Prediction
        prediction = model.predict(features)
        probability = float(prediction[0])

        latency_ms = (time.time() - start_time) * 1000

        log.info(
            "prediction_complete",
            prediction=int(probability >= THRESHOLD),
            probability=probability,
            latency_ms=latency_ms,
        )

        return PredictionResponse(
            prediction=int(probability >= THRESHOLD),
            probability=probability,
            latency_ms=latency_ms,
        )

    except Exception as e:
        log.error(
            "prediction_failed",
            error=str(e),
            error_type=type(e).__name__,
        )
        raise
```
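
Because `merge_contextvars` is in the processor chain, request-scoped fields can also be bound once per request (for example in middleware) rather than on each logger. A sketch, assuming a `request` object with a `request_id`:

```python
from structlog.contextvars import bind_contextvars, clear_contextvars

def handle_request(request):
    clear_contextvars()
    bind_contextvars(request_id=request.request_id)
    # every log call made while handling this request now carries request_id
    ...
```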

### Prediction Logging for Analysis

```python
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Optional

import pandas as pd

@dataclass
class PredictionLog:
    """Structured prediction log for analysis."""

    request_id: str
    timestamp: datetime
    model_name: str
    model_version: str
    features: dict[str, float]
    prediction: int
    probability: float
    latency_ms: float
    label: Optional[int] = None  # Added when ground truth arrives

    def to_dict(self) -> dict:
        return asdict(self)

class PredictionLogger:
    """Log predictions for monitoring and retraining."""

    def __init__(self, output_path: str):
        self.output_path = output_path
        self.buffer: list[PredictionLog] = []
        self.buffer_size = 1000

    def log(self, prediction_log: PredictionLog) -> None:
        """Buffer and write prediction logs."""
        self.buffer.append(prediction_log)

        if len(self.buffer) >= self.buffer_size:
            self._flush()

    def _flush(self) -> None:
        """Write buffer to storage."""
        if not self.buffer:
            return

        df = pd.DataFrame([log.to_dict() for log in self.buffer])

        # Partition by date/hour; a unique filename per flush avoids overwrites,
        # since pandas to_parquet cannot append to an existing file
        now = datetime.utcnow()
        date = now.strftime("%Y-%m-%d")
        hour = now.strftime("%H")

        path = f"{self.output_path}/date={date}/hour={hour}/predictions-{uuid.uuid4().hex}.parquet"
        df.to_parquet(path, index=False)

        self.buffer.clear()
```
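
Buffered logs are lost if the process exits before the buffer fills. One simple mitigation (an assumption, not part of the template; the output path is illustrative) is to flush on clean shutdown:

```python
import atexit

prediction_logger = PredictionLogger(output_path="s3://prediction-logs/my-model")
# _flush is private, but registering it here keeps the sketch short
atexit.register(prediction_logger._flush)
```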

## Alerting

### Alert Configuration

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class AlertSeverity(Enum):
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"

@dataclass
class AlertRule:
    """Define an alerting rule."""

    name: str
    metric: str
    condition: str  # e.g., "> 0.1", "< 0.8"
    severity: AlertSeverity
    window_minutes: int = 15
    cooldown_minutes: int = 60
    channels: Optional[list[str]] = None

    def evaluate(self, value: float) -> bool:
        """Evaluate if alert should fire."""
        operator = self.condition[0]
        threshold = float(self.condition[1:].strip())

        if operator == ">":
            return value > threshold
        elif operator == "<":
            return value < threshold
        elif operator == "=":
            return value == threshold
        else:
            raise ValueError(f"Unknown operator: {operator}")

# Define alert rules
ALERT_RULES = [
    AlertRule(
        name="high_drift",
        metric="data_drift_share",
        condition="> 0.3",
        severity=AlertSeverity.CRITICAL,
        window_minutes=60,
        channels=["slack", "pagerduty"],
    ),
    AlertRule(
        name="latency_spike",
        metric="p99_latency_ms",
        condition="> 500",
        severity=AlertSeverity.WARNING,
        window_minutes=15,
        channels=["slack"],
    ),
    AlertRule(
        name="accuracy_drop",
        metric="accuracy",
        condition="< 0.85",
        severity=AlertSeverity.CRITICAL,
        window_minutes=60,
        channels=["slack", "pagerduty", "email"],
    ),
    AlertRule(
        name="error_rate_high",
        metric="error_rate",
        condition="> 0.01",
        severity=AlertSeverity.WARNING,
        window_minutes=5,
        channels=["slack"],
    ),
]
```

### Alert Manager

```python
from datetime import datetime, timedelta

class AlertManager:
    """Manage alert evaluation and notification."""

    def __init__(self, rules: list[AlertRule]):
        self.rules = rules
        self.last_fired: dict[str, datetime] = {}

    def evaluate_all(self, metrics: dict[str, float]) -> list[dict]:
        """Evaluate all rules against current metrics."""
        fired_alerts = []

        for rule in self.rules:
            if rule.metric not in metrics:
                continue

            value = metrics[rule.metric]

            if rule.evaluate(value):
                # Skip rules still in their cooldown window
                if self._in_cooldown(rule):
                    continue

                alert = {
                    "rule_name": rule.name,
                    "metric": rule.metric,
                    "value": value,
                    "condition": rule.condition,
                    "severity": rule.severity.value,
                    "timestamp": datetime.utcnow().isoformat(),
                }

                fired_alerts.append(alert)
                self._notify(rule, alert)
                self.last_fired[rule.name] = datetime.utcnow()

        return fired_alerts

    def _in_cooldown(self, rule: AlertRule) -> bool:
        """Check if rule is in cooldown period."""
        if rule.name not in self.last_fired:
            return False

        elapsed = datetime.utcnow() - self.last_fired[rule.name]
        return elapsed < timedelta(minutes=rule.cooldown_minutes)

    def _notify(self, rule: AlertRule, alert: dict) -> None:
        """Send alert notifications (send_*_alert helpers assumed elsewhere)."""
        for channel in rule.channels or []:
            if channel == "slack":
                send_slack_alert(alert)
            elif channel == "pagerduty":
                send_pagerduty_alert(alert)
            elif channel == "email":
                send_email_alert(alert)
```
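
Wiring it together on a monitoring tick; the metric values below are illustrative:

```python
manager = AlertManager(ALERT_RULES)

# One evaluation cycle: pass whatever metrics the monitoring jobs produced;
# rules whose metric is absent (here, accuracy) are simply skipped
fired = manager.evaluate_all({
    "data_drift_share": 0.42,
    "p99_latency_ms": 180.0,
    "error_rate": 0.002,
})
for alert in fired:
    print(alert["rule_name"], alert["severity"])  # -> high_drift critical
```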

## Dashboards

### Key Dashboard Panels

```yaml
# Grafana dashboard configuration (conceptual)
panels:
  - title: "Model Predictions (Live)"
    type: timeseries
    metrics:
      - predictions_per_second
      - success_rate

  - title: "Prediction Latency"
    type: histogram
    metrics:
      - p50_latency_ms
      - p95_latency_ms
      - p99_latency_ms

  - title: "Data Drift Score"
    type: gauge
    metrics:
      - drift_share
    thresholds:
      - value: 0.1
        color: green
      - value: 0.3
        color: yellow
      - value: 0.5
        color: red

  - title: "Model Accuracy (Rolling 7d)"
    type: timeseries
    metrics:
      - accuracy
      - precision
      - recall
      - f1

  - title: "Feature Distribution Comparison"
    type: comparison
    metrics:
      - reference_distribution
      - current_distribution
```

## Best Practices

### Monitoring Checklist

- [ ] Data drift detection configured
- [ ] Prediction distribution tracked
- [ ] Latency metrics instrumented
- [ ] Error rates monitored
- [ ] Business KPIs tracked
- [ ] Alerts configured with appropriate thresholds
- [ ] Dashboards created for each model
- [ ] Runbooks documented for common alerts

### Response Procedures

| Alert | Severity | Immediate Action | Follow-up |
|-------|----------|------------------|-----------|
| High drift | Critical | Investigate data source | Consider retraining |
| Latency spike | Warning | Check infrastructure | Scale if needed |
| Accuracy drop | Critical | Enable shadow model | Rollback if severe |
| Error rate high | Warning | Check logs | Fix bug or input issue |