agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,658 @@
1
+ # ML Monitoring & Observability
2
+
3
+ Guidelines for monitoring machine learning systems in production, including drift detection, performance tracking, and alerting strategies.
4
+
5
+ ## Core Monitoring Concepts
6
+
7
+ ### ML-Specific Challenges
8
+
9
+ Traditional monitoring (CPU, memory, latency) is necessary but insufficient for ML systems. Models can silently degrade while infrastructure metrics stay green.
10
+
11
+ | Monitoring Type | What It Catches | What It Misses |
12
+ |-----------------|-----------------|----------------|
13
+ | Infrastructure | Server crashes, OOM | Silent model degradation |
14
+ | Application | API errors, latency spikes | Prediction quality decline |
15
+ | **ML-Specific** | Data drift, concept drift | Nothing (when done right) |
16
+
17
+ ### Key Metrics to Track
18
+
19
+ | Category | Metrics | Purpose |
20
+ |----------|---------|---------|
21
+ | Data Quality | Missing values, schema violations, outliers | Input health |
22
+ | Data Drift | Feature distributions, statistical tests | Distribution shift |
23
+ | Model Performance | Accuracy, precision, recall, business KPIs | Output quality |
24
+ | Concept Drift | Prediction distribution, label correlation | Relationship changes |
25
+ | Operational | Latency, throughput, error rates | System health |
26
+
27
+ ## Drift Detection
28
+
29
+ ### Data Drift
30
+
31
+ Detect when input distributions change:
32
+
33
+ ```python
34
+ from evidently import ColumnMapping
35
+ from evidently.report import Report
36
+ from evidently.metric_preset import DataDriftPreset
37
+ from evidently.metrics import DataDriftTable
38
+
39
+ def detect_data_drift(
40
+ reference_data: pd.DataFrame,
41
+ current_data: pd.DataFrame,
42
+ column_mapping: ColumnMapping,
43
+ threshold: float = 0.05,
44
+ ) -> dict:
45
+ """Detect drift in feature distributions."""
46
+
47
+ report = Report(metrics=[
48
+ DataDriftPreset(stattest_threshold=threshold),
49
+ ])
50
+
51
+ report.run(
52
+ reference_data=reference_data,
53
+ current_data=current_data,
54
+ column_mapping=column_mapping,
55
+ )
56
+
57
+ result = report.as_dict()
58
+ drift_info = result["metrics"][0]["result"]
59
+
60
+ return {
61
+ "dataset_drift": drift_info["dataset_drift"],
62
+ "drift_share": drift_info["drift_share"],
63
+ "drifted_columns": [
64
+ col for col, info in drift_info["drift_by_columns"].items()
65
+ if info["drift_detected"]
66
+ ],
67
+ "column_details": {
68
+ col: {
69
+ "drift_detected": info["drift_detected"],
70
+ "stattest": info["stattest_name"],
71
+ "p_value": info.get("p_value"),
72
+ }
73
+ for col, info in drift_info["drift_by_columns"].items()
74
+ },
75
+ }
76
+ ```
77
+
78
+ ### Concept Drift
79
+
80
+ Detect when the relationship between features and target changes:
81
+
82
+ ```python
83
+ def detect_concept_drift(
84
+ reference_predictions: pd.DataFrame,
85
+ current_predictions: pd.DataFrame,
86
+ window_size: int = 1000,
87
+ ) -> dict:
88
+ """Detect changes in prediction patterns."""
89
+
90
+ # Compare prediction distributions
91
+ from scipy import stats
92
+
93
+ ref_preds = reference_predictions["probability"]
94
+ curr_preds = current_predictions["probability"]
95
+
96
+ # Kolmogorov-Smirnov test
97
+ ks_stat, ks_pvalue = stats.ks_2samp(ref_preds, curr_preds)
98
+
99
+ # Population Stability Index
100
+ psi = calculate_psi(ref_preds, curr_preds, buckets=10)
101
+
102
+ # Jensen-Shannon divergence
103
+ js_divergence = calculate_js_divergence(ref_preds, curr_preds)
104
+
105
+ return {
106
+ "ks_statistic": ks_stat,
107
+ "ks_pvalue": ks_pvalue,
108
+ "psi": psi,
109
+ "js_divergence": js_divergence,
110
+ "drift_detected": psi > 0.1 or ks_pvalue < 0.05,
111
+ }
112
+
113
+ def calculate_psi(reference: pd.Series, current: pd.Series, buckets: int = 10) -> float:
114
+ """Calculate Population Stability Index."""
115
+
116
+ # Create bins from reference
117
+ _, bins = pd.cut(reference, buckets, retbins=True)
118
+ bins[0] = -np.inf
119
+ bins[-1] = np.inf
120
+
121
+ # Calculate proportions
122
+ ref_counts = pd.cut(reference, bins).value_counts(normalize=True)
123
+ curr_counts = pd.cut(current, bins).value_counts(normalize=True)
124
+
125
+ # Align indices
126
+ ref_counts = ref_counts.reindex(curr_counts.index, fill_value=0.0001)
127
+ curr_counts = curr_counts.replace(0, 0.0001)
128
+
129
+ # PSI formula
130
+ psi = np.sum((curr_counts - ref_counts) * np.log(curr_counts / ref_counts))
131
+
132
+ return psi
133
+ ```
134
+
135
+ ### Continuous Drift Monitoring
136
+
137
+ ```python
138
+ from prefect import flow, task
139
+ from datetime import datetime, timedelta
140
+
141
+ @task
142
+ def fetch_reference_data(model_version: str) -> pd.DataFrame:
143
+ """Fetch reference data for the model version."""
144
+ return pd.read_parquet(f"s3://reference-data/{model_version}/")
145
+
146
+ @task
147
+ def fetch_current_data(date: str) -> pd.DataFrame:
148
+ """Fetch current production data."""
149
+ return pd.read_parquet(f"s3://production-data/{date}/")
150
+
151
+ @task
152
+ def run_drift_detection(
153
+ reference: pd.DataFrame,
154
+ current: pd.DataFrame,
155
+ ) -> dict:
156
+ """Run all drift detection checks."""
157
+
158
+ column_mapping = ColumnMapping(
159
+ target="label",
160
+ prediction="prediction",
161
+ numerical_features=NUMERIC_FEATURES,
162
+ categorical_features=CATEGORICAL_FEATURES,
163
+ )
164
+
165
+ data_drift = detect_data_drift(reference, current, column_mapping)
166
+ concept_drift = detect_concept_drift(reference, current)
167
+
168
+ return {
169
+ "data_drift": data_drift,
170
+ "concept_drift": concept_drift,
171
+ "timestamp": datetime.utcnow().isoformat(),
172
+ }
173
+
174
+ @task
175
+ def send_alerts(drift_results: dict) -> None:
176
+ """Send alerts if drift detected."""
177
+
178
+ if drift_results["data_drift"]["dataset_drift"]:
179
+ send_alert(
180
+ channel="ml-alerts",
181
+ severity="warning",
182
+ message=f"Data drift detected: {drift_results['data_drift']['drift_share']:.1%} of features drifted",
183
+ details=drift_results["data_drift"]["drifted_columns"],
184
+ )
185
+
186
+ if drift_results["concept_drift"]["drift_detected"]:
187
+ send_alert(
188
+ channel="ml-alerts",
189
+ severity="critical",
190
+ message=f"Concept drift detected: PSI={drift_results['concept_drift']['psi']:.3f}",
191
+ details=drift_results["concept_drift"],
192
+ )
193
+
194
+ @flow(name="drift-monitoring")
195
+ def drift_monitoring_pipeline(date: str, model_version: str):
196
+ """Daily drift monitoring pipeline."""
197
+
198
+ reference = fetch_reference_data(model_version)
199
+ current = fetch_current_data(date)
200
+
201
+ drift_results = run_drift_detection(reference, current)
202
+
203
+ # Log to MLflow
204
+ log_drift_metrics(drift_results)
205
+
206
+ # Alert if needed
207
+ send_alerts(drift_results)
208
+
209
+ return drift_results
210
+ ```
211
+
212
+ ## Performance Monitoring
213
+
214
+ ### Real-Time Metrics
215
+
216
+ ```python
217
+ from prometheus_client import Counter, Histogram, Gauge, Summary
218
+
219
+ # Prediction metrics
220
+ PREDICTION_LATENCY = Histogram(
221
+ "model_prediction_latency_seconds",
222
+ "Time to generate prediction",
223
+ ["model_name", "model_version"],
224
+ buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
225
+ )
226
+
227
+ PREDICTION_COUNT = Counter(
228
+ "model_predictions_total",
229
+ "Total predictions made",
230
+ ["model_name", "model_version", "outcome"],
231
+ )
232
+
233
+ PREDICTION_VALUE = Summary(
234
+ "model_prediction_value",
235
+ "Distribution of prediction values",
236
+ ["model_name", "model_version"],
237
+ )
238
+
239
+ FEATURE_VALUE = Gauge(
240
+ "model_feature_value",
241
+ "Feature value statistics",
242
+ ["model_name", "feature_name", "statistic"],
243
+ )
244
+
245
+ # Instrumented prediction function
246
+ def predict_with_metrics(model, features: dict) -> dict:
247
+ """Make prediction with full metric instrumentation."""
248
+
249
+ start_time = time.time()
250
+
251
+ try:
252
+ prediction = model.predict(features)
253
+
254
+ # Record metrics
255
+ latency = time.time() - start_time
256
+ PREDICTION_LATENCY.labels(
257
+ model_name=model.name,
258
+ model_version=model.version,
259
+ ).observe(latency)
260
+
261
+ PREDICTION_COUNT.labels(
262
+ model_name=model.name,
263
+ model_version=model.version,
264
+ outcome="success",
265
+ ).inc()
266
+
267
+ PREDICTION_VALUE.labels(
268
+ model_name=model.name,
269
+ model_version=model.version,
270
+ ).observe(prediction)
271
+
272
+ return {"prediction": prediction, "latency_ms": latency * 1000}
273
+
274
+ except Exception as e:
275
+ PREDICTION_COUNT.labels(
276
+ model_name=model.name,
277
+ model_version=model.version,
278
+ outcome="error",
279
+ ).inc()
280
+ raise
281
+ ```
282
+
283
+ ### Delayed-Label Monitoring
284
+
285
+ When ground-truth labels arrive later than predictions (e.g., fraud chargebacks or loan defaults):
286
+
287
+ ```python
288
+ class DelayedLabelMonitor:
289
+ """Monitor model performance with delayed ground truth."""
290
+
291
+ def __init__(self, model_name: str, lookback_days: int = 30):
292
+ self.model_name = model_name
293
+ self.lookback_days = lookback_days
294
+
295
+ def compute_metrics(self, date: str) -> dict:
296
+ """Compute metrics for predictions that now have labels."""
297
+
298
+ # Load predictions from N days ago
299
+ prediction_date = (
300
+ datetime.strptime(date, "%Y-%m-%d") - timedelta(days=self.lookback_days)
301
+ ).strftime("%Y-%m-%d")
302
+
303
+ predictions = load_predictions(self.model_name, prediction_date)
304
+ labels = load_labels(prediction_date)
305
+
306
+ # Join and compute metrics
307
+ joined = predictions.merge(labels, on="id")
308
+
309
+ metrics = ClassificationMetrics.compute(
310
+ joined["label"],
311
+ joined["prediction"],
312
+ joined["probability"],
313
+ )
314
+
315
+ # Compare to baseline
316
+ baseline_metrics = load_baseline_metrics(self.model_name)
317
+
318
+ degradation = {
319
+ metric: (baseline - current) / baseline
320
+ for metric, baseline, current in [
321
+ ("accuracy", baseline_metrics["accuracy"], metrics.accuracy),
322
+ ("precision", baseline_metrics["precision"], metrics.precision),
323
+ ("recall", baseline_metrics["recall"], metrics.recall),
324
+ ("f1", baseline_metrics["f1"], metrics.f1),
325
+ ]
326
+ }
327
+
328
+ return {
329
+ "date": prediction_date,
330
+ "metrics": metrics.to_dict(),
331
+ "baseline_metrics": baseline_metrics,
332
+ "degradation": degradation,
333
+ "sample_size": len(joined),
334
+ }
335
+ ```
336
+
337
+ ## Logging & Structured Observability
338
+
339
+ ### Structured Logging
340
+
341
+ ```python
342
+ import structlog
343
+
344
+ # Configure structlog
345
+ structlog.configure(
346
+ processors=[
347
+ structlog.contextvars.merge_contextvars,
348
+ structlog.processors.add_log_level,
349
+ structlog.processors.TimeStamper(fmt="iso"),
350
+ structlog.processors.JSONRenderer(),
351
+ ],
352
+ wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
353
+ )
354
+
355
+ logger = structlog.get_logger()
356
+
357
+ def predict_with_logging(model, request: PredictionRequest) -> PredictionResponse:
358
+ """Make prediction with comprehensive logging."""
359
+
360
+ # Bind request context
361
+ log = logger.bind(
362
+ request_id=request.request_id,
363
+ model_name=model.name,
364
+ model_version=model.version,
365
+ )
366
+
367
+ log.info("prediction_started", feature_count=len(request.features))
368
+
369
+ start_time = time.time()
370
+
371
+ try:
372
+ # Preprocessing
373
+ features = transformer.transform(request.features)
374
+ log.debug("preprocessing_complete", transformed_shape=features.shape)
375
+
376
+ # Prediction
377
+ prediction = model.predict(features)
378
+ probability = float(prediction[0])
379
+
380
+ latency_ms = (time.time() - start_time) * 1000
381
+
382
+ log.info(
383
+ "prediction_complete",
384
+ prediction=int(probability >= THRESHOLD),
385
+ probability=probability,
386
+ latency_ms=latency_ms,
387
+ )
388
+
389
+ return PredictionResponse(
390
+ prediction=int(probability >= THRESHOLD),
391
+ probability=probability,
392
+ latency_ms=latency_ms,
393
+ )
394
+
395
+ except Exception as e:
396
+ log.error(
397
+ "prediction_failed",
398
+ error=str(e),
399
+ error_type=type(e).__name__,
400
+ )
401
+ raise
402
+ ```
403
+
404
+ ### Prediction Logging for Analysis
405
+
406
+ ```python
407
+ @dataclass
408
+ class PredictionLog:
409
+ """Structured prediction log for analysis."""
410
+
411
+ request_id: str
412
+ timestamp: datetime
413
+ model_name: str
414
+ model_version: str
415
+ features: dict[str, float]
416
+ prediction: int
417
+ probability: float
418
+ latency_ms: float
419
+ label: Optional[int] = None # Added when ground truth arrives
420
+
421
+ def to_dict(self) -> dict:
422
+ return asdict(self)
423
+
424
+ class PredictionLogger:
425
+ """Log predictions for monitoring and retraining."""
426
+
427
+ def __init__(self, output_path: str):
428
+ self.output_path = output_path
429
+ self.buffer: list[PredictionLog] = []
430
+ self.buffer_size = 1000
431
+
432
+ def log(self, prediction_log: PredictionLog) -> None:
433
+ """Buffer and write prediction logs."""
434
+ self.buffer.append(prediction_log)
435
+
436
+ if len(self.buffer) >= self.buffer_size:
437
+ self._flush()
438
+
439
+ def _flush(self) -> None:
440
+ """Write buffer to storage."""
441
+ if not self.buffer:
442
+ return
443
+
444
+ df = pd.DataFrame([log.to_dict() for log in self.buffer])
445
+
446
+ # Partition by date
447
+ date = datetime.utcnow().strftime("%Y-%m-%d")
448
+ hour = datetime.utcnow().strftime("%H")
449
+
450
+ path = f"{self.output_path}/date={date}/hour={hour}/predictions.parquet"
451
+ df.to_parquet(path, index=False, append=True)
452
+
453
+ self.buffer.clear()
454
+ ```
455
+
456
+ ## Alerting
457
+
458
+ ### Alert Configuration
459
+
460
+ ```python
461
+ from dataclasses import dataclass
462
+ from enum import Enum
463
+
464
+ class AlertSeverity(Enum):
465
+ INFO = "info"
466
+ WARNING = "warning"
467
+ CRITICAL = "critical"
468
+
469
+ @dataclass
470
+ class AlertRule:
471
+ """Define an alerting rule."""
472
+
473
+ name: str
474
+ metric: str
475
+ condition: str # e.g., "> 0.1", "< 0.8"
476
+ severity: AlertSeverity
477
+ window_minutes: int = 15
478
+ cooldown_minutes: int = 60
479
+ channels: list[str] = None
480
+
481
+ def evaluate(self, value: float) -> bool:
482
+ """Evaluate if alert should fire."""
483
+ operator = self.condition[0]
484
+ threshold = float(self.condition[1:].strip())
485
+
486
+ if operator == ">":
487
+ return value > threshold
488
+ elif operator == "<":
489
+ return value < threshold
490
+ elif operator == "=":
491
+ return value == threshold
492
+ else:
493
+ raise ValueError(f"Unknown operator: {operator}")
494
+
495
+ # Define alert rules
496
+ ALERT_RULES = [
497
+ AlertRule(
498
+ name="high_drift",
499
+ metric="data_drift_share",
500
+ condition="> 0.3",
501
+ severity=AlertSeverity.CRITICAL,
502
+ window_minutes=60,
503
+ channels=["slack", "pagerduty"],
504
+ ),
505
+ AlertRule(
506
+ name="latency_spike",
507
+ metric="p99_latency_ms",
508
+ condition="> 500",
509
+ severity=AlertSeverity.WARNING,
510
+ window_minutes=15,
511
+ channels=["slack"],
512
+ ),
513
+ AlertRule(
514
+ name="accuracy_drop",
515
+ metric="accuracy",
516
+ condition="< 0.85",
517
+ severity=AlertSeverity.CRITICAL,
518
+ window_minutes=60,
519
+ channels=["slack", "pagerduty", "email"],
520
+ ),
521
+ AlertRule(
522
+ name="error_rate_high",
523
+ metric="error_rate",
524
+ condition="> 0.01",
525
+ severity=AlertSeverity.WARNING,
526
+ window_minutes=5,
527
+ channels=["slack"],
528
+ ),
529
+ ]
530
+ ```
531
+
532
+ ### Alert Manager
533
+
534
+ ```python
535
+ class AlertManager:
536
+ """Manage alert evaluation and notification."""
537
+
538
+ def __init__(self, rules: list[AlertRule]):
539
+ self.rules = rules
540
+ self.last_fired: dict[str, datetime] = {}
541
+
542
+ def evaluate_all(self, metrics: dict[str, float]) -> list[dict]:
543
+ """Evaluate all rules against current metrics."""
544
+ fired_alerts = []
545
+
546
+ for rule in self.rules:
547
+ if rule.metric not in metrics:
548
+ continue
549
+
550
+ value = metrics[rule.metric]
551
+
552
+ if rule.evaluate(value):
553
+ # Check cooldown
554
+ if self._in_cooldown(rule):
555
+ continue
556
+
557
+ alert = {
558
+ "rule_name": rule.name,
559
+ "metric": rule.metric,
560
+ "value": value,
561
+ "condition": rule.condition,
562
+ "severity": rule.severity.value,
563
+ "timestamp": datetime.utcnow().isoformat(),
564
+ }
565
+
566
+ fired_alerts.append(alert)
567
+ self._notify(rule, alert)
568
+ self.last_fired[rule.name] = datetime.utcnow()
569
+
570
+ return fired_alerts
571
+
572
+ def _in_cooldown(self, rule: AlertRule) -> bool:
573
+ """Check if rule is in cooldown period."""
574
+ if rule.name not in self.last_fired:
575
+ return False
576
+
577
+ elapsed = datetime.utcnow() - self.last_fired[rule.name]
578
+ return elapsed < timedelta(minutes=rule.cooldown_minutes)
579
+
580
+ def _notify(self, rule: AlertRule, alert: dict) -> None:
581
+ """Send alert notifications."""
582
+ for channel in rule.channels or []:
583
+ if channel == "slack":
584
+ send_slack_alert(alert)
585
+ elif channel == "pagerduty":
586
+ send_pagerduty_alert(alert)
587
+ elif channel == "email":
588
+ send_email_alert(alert)
589
+ ```
590
+
591
+ ## Dashboards
592
+
593
+ ### Key Dashboard Panels
594
+
595
+ ```yaml
596
+ # Grafana dashboard configuration (conceptual)
597
+ panels:
598
+ - title: "Model Predictions (Live)"
599
+ type: timeseries
600
+ metrics:
601
+ - predictions_per_second
602
+ - success_rate
603
+
604
+ - title: "Prediction Latency"
605
+ type: histogram
606
+ metrics:
607
+ - p50_latency_ms
608
+ - p95_latency_ms
609
+ - p99_latency_ms
610
+
611
+ - title: "Data Drift Score"
612
+ type: gauge
613
+ metrics:
614
+ - drift_share
615
+ thresholds:
616
+ - value: 0.1
617
+ color: green
618
+ - value: 0.3
619
+ color: yellow
620
+ - value: 0.5
621
+ color: red
622
+
623
+ - title: "Model Accuracy (Rolling 7d)"
624
+ type: timeseries
625
+ metrics:
626
+ - accuracy
627
+ - precision
628
+ - recall
629
+ - f1
630
+
631
+ - title: "Feature Distribution Comparison"
632
+ type: comparison
633
+ metrics:
634
+ - reference_distribution
635
+ - current_distribution
636
+ ```
637
+
638
+ ## Best Practices
639
+
640
+ ### Monitoring Checklist
641
+
642
+ - [ ] Data drift detection configured
643
+ - [ ] Prediction distribution tracked
644
+ - [ ] Latency metrics instrumented
645
+ - [ ] Error rates monitored
646
+ - [ ] Business KPIs tracked
647
+ - [ ] Alerts configured with appropriate thresholds
648
+ - [ ] Dashboards created for each model
649
+ - [ ] Runbooks documented for common alerts
650
+
651
+ ### Response Procedures
652
+
653
+ | Alert | Severity | Immediate Action | Follow-up |
654
+ |-------|----------|------------------|-----------|
655
+ | High drift | Critical | Investigate data source | Consider retraining |
656
+ | Latency spike | Warning | Check infrastructure | Scale if needed |
657
+ | Accuracy drop | Critical | Enable shadow model | Rollback if severe |
658
+ | Error rate high | Warning | Check logs | Fix bug or input issue |