truthound-dashboard 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
  164. truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
  166. truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1043 @@
1
+ """Model monitoring service.
2
+
3
+ This module provides services for ML model monitoring,
4
+ including model registration, prediction recording, metrics aggregation,
5
+ and alert management with database persistence.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Sequence
11
+ from datetime import datetime, timedelta
12
+ from typing import Any
13
+ import statistics
14
+
15
+ from sqlalchemy import and_, func, select
16
+ from sqlalchemy.ext.asyncio import AsyncSession
17
+
18
+ from truthound_dashboard.db import BaseRepository
19
+ from truthound_dashboard.db.models import (
20
+ AlertSeverityLevel,
21
+ ModelAlert,
22
+ ModelAlertHandler,
23
+ ModelAlertRule,
24
+ ModelMetric,
25
+ ModelPrediction,
26
+ ModelStatus,
27
+ MonitoredModel,
28
+ )
29
+
30
+
31
+ # =============================================================================
32
+ # Repositories
33
+ # =============================================================================
34
+
35
+
36
class MonitoredModelRepository(BaseRepository[MonitoredModel]):
    """Data-access helpers for ``MonitoredModel`` rows."""

    model = MonitoredModel

    async def get_by_name(self, name: str) -> MonitoredModel | None:
        """Return the model whose ``name`` column equals *name*, else ``None``."""
        stmt = select(MonitoredModel).where(MonitoredModel.name == name)
        rows = await self.session.execute(stmt)
        return rows.scalar_one_or_none()

    async def get_by_status(
        self,
        status: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[MonitoredModel]:
        """Return one page of models with the given status, newest first.

        Args:
            status: Value compared against ``MonitoredModel.status``.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
        """
        stmt = select(MonitoredModel).where(MonitoredModel.status == status)
        stmt = stmt.order_by(MonitoredModel.created_at.desc())
        stmt = stmt.offset(offset).limit(limit)
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_models(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[MonitoredModel]:
        """Return one page of models whose status is ``ModelStatus.ACTIVE``."""
        active = ModelStatus.ACTIVE.value
        return await self.get_by_status(active, offset=offset, limit=limit)

    async def count_by_status(self, status: str) -> int:
        """Return how many models have the given status."""
        status_matches = MonitoredModel.status == status
        return await self.count(filters=[status_matches])
79
+
80
+
81
class ModelPredictionRepository(BaseRepository[ModelPrediction]):
    """Data-access helpers for ``ModelPrediction`` rows."""

    model = ModelPrediction

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        offset: int = 0,
        limit: int = 100,
        since: datetime | None = None,
    ) -> Sequence[ModelPrediction]:
        """Return one page of a model's predictions, newest first.

        Args:
            model_id: Owning model's ID.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
            since: When given, only rows recorded at or after this instant.
        """
        conditions = [ModelPrediction.model_id == model_id]
        if since:
            conditions.append(ModelPrediction.recorded_at >= since)

        stmt = (
            select(ModelPrediction)
            .where(*conditions)
            .order_by(ModelPrediction.recorded_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def count_by_model(
        self,
        model_id: str,
        since: datetime | None = None,
    ) -> int:
        """Count a model's predictions, optionally only those since *since*."""
        conditions = [ModelPrediction.model_id == model_id]
        if since:
            conditions.append(ModelPrediction.recorded_at >= since)
        return await self.count(filters=conditions)

    async def get_latencies(
        self,
        model_id: str,
        since: datetime,
    ) -> list[float]:
        """Return non-null latencies recorded since *since*, newest first."""
        in_window = and_(
            ModelPrediction.model_id == model_id,
            ModelPrediction.recorded_at >= since,
            ModelPrediction.latency_ms.isnot(None),
        )
        stmt = (
            select(ModelPrediction.latency_ms)
            .where(in_window)
            .order_by(ModelPrediction.recorded_at.desc())
        )
        rows = await self.session.execute(stmt)

        latencies = []
        for row in rows.fetchall():
            # The SQL filter already excludes NULLs; this mirrors it in Python.
            if row[0] is not None:
                latencies.append(row[0])
        return latencies
138
+
139
+
140
class ModelMetricRepository(BaseRepository[ModelMetric]):
    """Data-access helpers for ``ModelMetric`` rows."""

    model = ModelMetric

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        metric_type: str | None = None,
        since: datetime | None = None,
        offset: int = 0,
        limit: int = 100,
    ) -> Sequence[ModelMetric]:
        """Return one page of a model's metrics, newest first.

        Args:
            model_id: Owning model's ID.
            metric_type: When given, restrict to this metric type.
            since: When given, only rows recorded at or after this instant.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
        """
        conditions = [ModelMetric.model_id == model_id]
        if metric_type:
            conditions.append(ModelMetric.metric_type == metric_type)
        if since:
            conditions.append(ModelMetric.recorded_at >= since)

        stmt = (
            select(ModelMetric)
            .where(*conditions)
            .order_by(ModelMetric.recorded_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def record_metric(
        self,
        model_id: str,
        metric_type: str,
        metric_name: str,
        value: float,
        labels: dict[str, str] | None = None,
    ) -> ModelMetric:
        """Persist one metric sample via ``create`` and return the new row."""
        fields = {
            "model_id": model_id,
            "metric_type": metric_type,
            "metric_name": metric_name,
            "value": value,
            "labels": labels,
        }
        return await self.create(**fields)
187
+
188
+
189
class ModelAlertRuleRepository(BaseRepository[ModelAlertRule]):
    """Data-access helpers for ``ModelAlertRule`` rows."""

    model = ModelAlertRule

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        active_only: bool = False,
    ) -> Sequence[ModelAlertRule]:
        """Return a model's alert rules, newest first.

        Args:
            model_id: Owning model's ID.
            active_only: When true, only rules with ``is_active`` set.
        """
        conditions = [ModelAlertRule.model_id == model_id]
        if active_only:
            # ``== True`` builds a SQL expression; it is not a Python comparison.
            conditions.append(ModelAlertRule.is_active == True)  # noqa: E712

        stmt = (
            select(ModelAlertRule)
            .where(*conditions)
            .order_by(ModelAlertRule.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_rules(self) -> Sequence[ModelAlertRule]:
        """Return every rule with ``is_active`` set, newest first."""
        stmt = select(ModelAlertRule).where(
            ModelAlertRule.is_active == True  # noqa: E712
        )
        stmt = stmt.order_by(ModelAlertRule.created_at.desc())
        rows = await self.session.execute(stmt)
        return rows.scalars().all()
219
+
220
+
221
class ModelAlertHandlerRepository(BaseRepository[ModelAlertHandler]):
    """Data-access helpers for ``ModelAlertHandler`` rows."""

    model = ModelAlertHandler

    async def get_active_handlers(self) -> Sequence[ModelAlertHandler]:
        """Return every handler with ``is_active`` set, newest first."""
        stmt = select(ModelAlertHandler).where(
            ModelAlertHandler.is_active == True  # noqa: E712
        )
        stmt = stmt.order_by(ModelAlertHandler.created_at.desc())
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_by_type(
        self,
        handler_type: str,
    ) -> Sequence[ModelAlertHandler]:
        """Return every handler of the given ``handler_type``, newest first."""
        stmt = select(ModelAlertHandler).where(
            ModelAlertHandler.handler_type == handler_type
        )
        stmt = stmt.order_by(ModelAlertHandler.created_at.desc())
        rows = await self.session.execute(stmt)
        return rows.scalars().all()
246
+
247
+
248
class ModelAlertRepository(BaseRepository[ModelAlert]):
    """Data-access helpers for ``ModelAlert`` rows."""

    model = ModelAlert

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        active_only: bool = False,
        severity: str | None = None,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[ModelAlert]:
        """Return one page of a model's alerts, newest first.

        Args:
            model_id: Owning model's ID.
            active_only: When true, only alerts with ``resolved`` false.
            severity: When given, restrict to this severity.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
        """
        conditions = [ModelAlert.model_id == model_id]
        if active_only:
            # ``== False`` builds a SQL expression; it is not a Python comparison.
            conditions.append(ModelAlert.resolved == False)  # noqa: E712
        if severity:
            conditions.append(ModelAlert.severity == severity)

        stmt = (
            select(ModelAlert)
            .where(*conditions)
            .order_by(ModelAlert.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_alerts(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[ModelAlert]:
        """Return one page of unresolved alerts across all models, newest first."""
        stmt = select(ModelAlert).where(ModelAlert.resolved == False)  # noqa: E712
        stmt = stmt.order_by(ModelAlert.created_at.desc())
        stmt = stmt.offset(offset).limit(limit)
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def count_active(self, model_id: str | None = None) -> int:
        """Count unresolved alerts, optionally only those of one model."""
        conditions = [ModelAlert.resolved == False]  # noqa: E712
        if model_id:
            conditions.append(ModelAlert.model_id == model_id)
        return await self.count(filters=conditions)
301
+
302
+
303
+ # =============================================================================
304
+ # Service
305
+ # =============================================================================
306
+
307
+
308
+ class ModelMonitoringService:
309
+ """Service for ML model monitoring.
310
+
311
+ Provides functionality for:
312
+ - Model registration and management
313
+ - Prediction recording and metrics
314
+ - Alert rules and handlers
315
+ - Dashboard data aggregation
316
+ """
317
+
318
def __init__(self, session: AsyncSession) -> None:
    """Bind the service and one repository per entity to *session*.

    Args:
        session: Database session shared by every repository.
    """
    self.session = session
    # Every repository is constructed with the same session object.
    repositories = (
        ("model_repo", MonitoredModelRepository),
        ("prediction_repo", ModelPredictionRepository),
        ("metric_repo", ModelMetricRepository),
        ("rule_repo", ModelAlertRuleRepository),
        ("handler_repo", ModelAlertHandlerRepository),
        ("alert_repo", ModelAlertRepository),
    )
    for attribute, repository_cls in repositories:
        setattr(self, attribute, repository_cls(session))
331
+
332
+ # =========================================================================
333
+ # Model Registration
334
+ # =========================================================================
335
+
336
async def register_model(
    self,
    name: str,
    *,
    version: str = "1.0.0",
    description: str | None = None,
    config: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> MonitoredModel:
    """Create a monitored-model row in the ACTIVE state.

    Args:
        name: Model name.
        version: Model version.
        description: Model description.
        config: Monitoring configuration; a falsy value is stored as ``{}``.
        metadata: Additional metadata, stored as ``metadata_json``.

    Returns:
        The created MonitoredModel.
    """
    initial_fields = {
        "name": name,
        "version": version,
        "description": description,
        "config": config or {},
        "metadata_json": metadata,
        "status": ModelStatus.ACTIVE.value,
        # New models start with no predictions and a health score of 100.
        "prediction_count": 0,
        "health_score": 100.0,
    }
    return await self.model_repo.create(**initial_fields)
367
+
368
async def get_model(self, model_id: str) -> MonitoredModel | None:
    """Look up a monitored model by ID via the model repository."""
    found = await self.model_repo.get_by_id(model_id)
    return found
371
+
372
async def get_model_by_name(self, name: str) -> MonitoredModel | None:
    """Look up a monitored model by name via the model repository."""
    found = await self.model_repo.get_by_name(name)
    return found
375
+
376
async def list_models(
    self,
    *,
    status: str | None = None,
    offset: int = 0,
    limit: int = 50,
) -> tuple[Sequence[MonitoredModel], int]:
    """Return one page of models plus the total matching count.

    Args:
        status: Optional status filter; a falsy value means no filter.
        offset: Number to skip.
        limit: Maximum to return.

    Returns:
        Tuple of (models, total_count).
    """
    # The repository receives ``None`` rather than an empty list when
    # no status filter applies.
    criteria = [MonitoredModel.status == status] if status else None

    page = await self.model_repo.list(
        offset=offset,
        limit=limit,
        filters=criteria,
    )
    matching = await self.model_repo.count(filters=criteria)
    return page, matching
405
+
406
async def update_model(
    self,
    model_id: str,
    **updates: Any,
) -> MonitoredModel | None:
    """Apply field updates to a model and flush the session.

    Keys the model has no attribute for, and keys whose value is
    ``None``, are skipped.

    Args:
        model_id: Model ID.
        **updates: Fields to update.

    Returns:
        Updated model or None if not found.
    """
    target = await self.model_repo.get_by_id(model_id)
    if target is None:
        return None

    for field, new_value in updates.items():
        if not hasattr(target, field) or new_value is None:
            continue
        setattr(target, field, new_value)

    await self.session.flush()
    return target
430
+
431
async def delete_model(self, model_id: str) -> bool:
    """Delete a model through the model repository and return its result."""
    outcome = await self.model_repo.delete(model_id)
    return outcome
434
+
435
async def pause_model(self, model_id: str) -> MonitoredModel | None:
    """Call ``pause()`` on the model and flush; return the lookup result."""
    target = await self.model_repo.get_by_id(model_id)
    if not target:
        # Nothing to pause; hand back whatever the lookup produced.
        return target
    target.pause()
    await self.session.flush()
    return target
442
+
443
async def resume_model(self, model_id: str) -> MonitoredModel | None:
    """Call ``resume()`` on the model and flush; return the lookup result."""
    target = await self.model_repo.get_by_id(model_id)
    if not target:
        # Nothing to resume; hand back whatever the lookup produced.
        return target
    target.resume()
    await self.session.flush()
    return target
450
+
451
+ # =========================================================================
452
+ # Prediction Recording
453
+ # =========================================================================
454
+
455
async def record_prediction(
    self,
    model_id: str,
    features: dict[str, Any],
    prediction: Any,
    *,
    actual: Any | None = None,
    latency_ms: float | None = None,
    metadata: dict[str, Any] | None = None,
) -> ModelPrediction:
    """Store one prediction and update the owning model's stats.

    Args:
        model_id: Model ID.
        features: Input features.
        prediction: Model output.
        actual: Actual value (optional).
        latency_ms: Prediction latency; when not ``None`` it is also
            recorded as a ``latency`` metric named ``latency_ms``.
        metadata: Additional metadata, stored as ``metadata_json``.

    Returns:
        Created ModelPrediction.

    Raises:
        ValueError: If model not found.
    """
    owner = await self.model_repo.get_by_id(model_id)
    if owner is None:
        raise ValueError(f"Model '{model_id}' not found")

    record = await self.prediction_repo.create(
        model_id=model_id,
        features=features,
        prediction=prediction,
        actual=actual,
        latency_ms=latency_ms,
        metadata_json=metadata,
    )

    # Let the model update its own stats, then flush that change.
    owner.record_prediction()
    await self.session.flush()

    if latency_ms is None:
        return record

    await self.metric_repo.record_metric(
        model_id=model_id,
        metric_type="latency",
        metric_name="latency_ms",
        value=latency_ms,
    )
    return record
509
+
510
async def get_predictions(
    self,
    model_id: str,
    *,
    offset: int = 0,
    limit: int = 100,
    hours: int | None = None,
) -> Sequence[ModelPrediction]:
    """Return one page of a model's predictions.

    Args:
        model_id: Model ID.
        offset: Number to skip.
        limit: Maximum to return.
        hours: When truthy, only predictions from the last *hours* hours
            (measured from ``datetime.utcnow()``); otherwise no time bound.
    """
    window_start = datetime.utcnow() - timedelta(hours=hours) if hours else None
    return await self.prediction_repo.get_by_model_id(
        model_id,
        offset=offset,
        limit=limit,
        since=window_start,
    )
526
+
527
+ # =========================================================================
528
+ # Metrics
529
+ # =========================================================================
530
+
531
async def get_model_metrics(
    self,
    model_id: str,
    hours: int = 24,
) -> dict[str, Any]:
    """Summarise a model's latency and throughput over a recent window.

    Args:
        model_id: Model ID.
        hours: Time range in hours, counted back from ``datetime.utcnow()``.

    Returns:
        Dictionary with ``model_id``, ``model_name``, ``time_range_hours``,
        ``metrics`` (a latency summary when latencies exist, then a
        throughput entry) and ``data_points`` (always an empty dict here).

    Raises:
        ValueError: If model not found.
    """
    model = await self.model_repo.get_by_id(model_id)
    if model is None:
        raise ValueError(f"Model '{model_id}' not found")

    window_start = datetime.utcnow() - timedelta(hours=hours)
    latencies = await self.prediction_repo.get_latencies(model_id, window_start)
    prediction_total = await self.prediction_repo.count_by_model(
        model_id, window_start
    )

    summaries: list[dict[str, Any]] = []
    series: dict[str, list[dict[str, Any]]] = {}

    if latencies:
        ordered = sorted(latencies)
        size = len(ordered)
        # Percentiles are read straight from the sorted sample at index
        # int(size * q); size >= 1 here, so each index is in range.
        latency_summary = {
            "name": "latency_ms",
            "type": "latency",
            "count": size,
            "min_value": min(latencies),
            "max_value": max(latencies),
            "avg_value": statistics.mean(latencies),
            "p50_value": ordered[size // 2],
            "p95_value": ordered[int(size * 0.95)],
            "p99_value": ordered[int(size * 0.99)],
            # The first element of the list as returned by the repository.
            "last_value": latencies[0],
        }
        summaries.append(latency_summary)

    # Predictions per hour; the divisor is clamped to at least one hour.
    throughput_summary = {
        "name": "throughput",
        "type": "throughput",
        "count": 1,
        "last_value": prediction_total / max(hours, 1),
    }
    summaries.append(throughput_summary)

    return {
        "model_id": model_id,
        "model_name": model.name,
        "time_range_hours": hours,
        "metrics": summaries,
        "data_points": series,
    }
593
+
594
async def record_metric(
    self,
    model_id: str,
    metric_type: str,
    metric_name: str,
    value: float,
    labels: dict[str, str] | None = None,
) -> ModelMetric:
    """Record a custom metric by delegating to the metric repository."""
    sample = {
        "model_id": model_id,
        "metric_type": metric_type,
        "metric_name": metric_name,
        "value": value,
        "labels": labels,
    }
    return await self.metric_repo.record_metric(**sample)
610
+
611
+ # =========================================================================
612
+ # Alert Rules
613
+ # =========================================================================
614
+
615
async def create_alert_rule(
    self,
    model_id: str,
    name: str,
    rule_type: str,
    config: dict[str, Any],
    *,
    severity: str = "warning",
) -> ModelAlertRule:
    """Create an active alert rule for an existing model.

    Args:
        model_id: Model ID.
        name: Rule name.
        rule_type: Rule type (threshold, statistical, trend).
        config: Rule configuration.
        severity: Alert severity.

    Returns:
        Created ModelAlertRule.

    Raises:
        ValueError: If model not found.
    """
    owner = await self.model_repo.get_by_id(model_id)
    if owner is None:
        raise ValueError(f"Model '{model_id}' not found")

    rule_fields = {
        "model_id": model_id,
        "name": name,
        "rule_type": rule_type,
        "severity": severity,
        "config": config,
        "is_active": True,
    }
    return await self.rule_repo.create(**rule_fields)
651
+
652
async def get_alert_rules(
    self,
    model_id: str | None = None,
    active_only: bool = False,
) -> Sequence[ModelAlertRule]:
    """Return alert rules, scoped by model and/or active flag.

    A truthy *model_id* takes precedence: the model's rules are returned
    (filtered by *active_only*). Otherwise all active rules, or all rules.
    """
    repo = self.rule_repo
    if model_id:
        rules = await repo.get_by_model_id(model_id, active_only=active_only)
    elif active_only:
        rules = await repo.get_active_rules()
    else:
        rules = await repo.list()
    return rules
663
+
664
async def update_alert_rule(
    self,
    rule_id: str,
    **updates: Any,
) -> ModelAlertRule | None:
    """Apply field updates to an alert rule and flush the session.

    Keys the rule has no attribute for, and keys whose value is ``None``,
    are skipped. Returns ``None`` when the rule does not exist.
    """
    target = await self.rule_repo.get_by_id(rule_id)
    if target is None:
        return None

    for field, new_value in updates.items():
        if not hasattr(target, field) or new_value is None:
            continue
        setattr(target, field, new_value)

    await self.session.flush()
    return target
680
+
681
async def delete_alert_rule(self, rule_id: str) -> bool:
    """Delete an alert rule through the rule repository and return its result."""
    outcome = await self.rule_repo.delete(rule_id)
    return outcome
684
+
685
+ # =========================================================================
686
+ # Alert Handlers
687
+ # =========================================================================
688
+
689
async def create_alert_handler(
    self,
    name: str,
    handler_type: str,
    config: dict[str, Any],
) -> ModelAlertHandler:
    """Create an alert handler with ``is_active`` set to true."""
    handler_fields = {
        "name": name,
        "handler_type": handler_type,
        "config": config,
        "is_active": True,
    }
    return await self.handler_repo.create(**handler_fields)
702
+
703
async def get_alert_handlers(
    self,
    active_only: bool = False,
) -> Sequence[ModelAlertHandler]:
    """Return all alert handlers, or only the active ones when requested."""
    repo = self.handler_repo
    fetch = repo.get_active_handlers if active_only else repo.list
    return await fetch()
711
+
712
async def update_alert_handler(
    self,
    handler_id: str,
    **updates: Any,
) -> ModelAlertHandler | None:
    """Apply field updates to an alert handler and flush the session.

    Keys the handler has no attribute for, and keys whose value is
    ``None``, are skipped. Returns ``None`` when the handler does not exist.
    """
    target = await self.handler_repo.get_by_id(handler_id)
    if target is None:
        return None

    for field, new_value in updates.items():
        if not hasattr(target, field) or new_value is None:
            continue
        setattr(target, field, new_value)

    await self.session.flush()
    return target
728
+
729
async def delete_alert_handler(self, handler_id: str) -> bool:
    """Remove an alert handler through the handler repository.

    Args:
        handler_id: Identifier of the handler to delete.

    Returns:
        The result reported by ``self.handler_repo.delete``.
    """
    outcome = await self.handler_repo.delete(handler_id)
    return outcome
732
+
733
+ # =========================================================================
734
+ # Alerts
735
+ # =========================================================================
736
+
737
async def create_alert(
    self,
    model_id: str,
    rule_id: str,
    message: str,
    *,
    severity: str = "warning",
    metric_value: float | None = None,
    threshold_value: float | None = None,
) -> ModelAlert:
    """Persist a new alert for a model.

    The alert is always created unacknowledged and unresolved.

    Args:
        model_id: Identifier of the model the alert belongs to.
        rule_id: Identifier of the rule that produced the alert.
        message: Alert text.
        severity: Severity label stored on the alert.
        metric_value: Metric value stored on the alert, if any.
        threshold_value: Threshold value stored on the alert, if any.

    Returns:
        The alert returned by ``self.alert_repo.create``.
    """
    fields = {
        "model_id": model_id,
        "rule_id": rule_id,
        "severity": severity,
        "message": message,
        "metric_value": metric_value,
        "threshold_value": threshold_value,
        # New alerts start out open.
        "acknowledged": False,
        "resolved": False,
    }
    created = await self.alert_repo.create(**fields)
    return created
758
+
759
async def get_alerts(
    self,
    model_id: str | None = None,
    active_only: bool = False,
    severity: str | None = None,
    offset: int = 0,
    limit: int = 50,
) -> tuple[Sequence[ModelAlert], int]:
    """Fetch one page of alerts together with a total.

    Args:
        model_id: When truthy, restrict the query to this model.
        active_only: When true, use the repository's active-alert queries.
        severity: Severity filter; only forwarded to the model-scoped query.
        offset: Number of alerts to skip.
        limit: Maximum number of alerts to return.

    Returns:
        Tuple of (alerts on this page, total).
    """
    repo = self.alert_repo

    if model_id:
        page = await repo.get_by_model_id(
            model_id,
            active_only=active_only,
            severity=severity,
            offset=offset,
            limit=limit,
        )
        if active_only:
            return page, await repo.count_active(model_id)
        # NOTE(review): this total is the size of the returned page, not the
        # number of alerts stored for the model - confirm whether callers
        # rely on it for pagination.
        return page, len(page)

    # NOTE(review): `severity` is not applied to the queries below.
    if active_only:
        page = await repo.get_active_alerts(offset=offset, limit=limit)
        return page, await repo.count_active()

    page = await repo.list(offset=offset, limit=limit)
    return page, await repo.count()
783
+
784
async def acknowledge_alert(
    self,
    alert_id: str,
    actor: str,
) -> ModelAlert | None:
    """Mark an alert as acknowledged by the given actor.

    Calls ``alert.acknowledge(actor)`` and then flushes the session.

    Args:
        alert_id: Identifier of the alert.
        actor: Passed through to ``alert.acknowledge``.

    Returns:
        The alert, or ``None`` when the repository has no alert with this
        identifier.
    """
    found = await self.alert_repo.get_by_id(alert_id)
    if found is not None:
        found.acknowledge(actor)
        await self.session.flush()
    return found
797
+
798
async def resolve_alert(self, alert_id: str) -> ModelAlert | None:
    """Mark an alert as resolved.

    Calls ``alert.resolve()`` and then flushes the session.

    Args:
        alert_id: Identifier of the alert.

    Returns:
        The alert, or ``None`` when the repository has no alert with this
        identifier.
    """
    found = await self.alert_repo.get_by_id(alert_id)
    if found is not None:
        found.resolve()
        await self.session.flush()
    return found
807
+
808
+ # =========================================================================
809
+ # Rule Evaluation
810
+ # =========================================================================
811
+
812
async def evaluate_rules(self, model_id: str) -> list[ModelAlert]:
    """Run every active rule of a model and raise alerts for those that fire.

    Rules are checked against the metrics returned by
    ``self.get_model_metrics(model_id, hours=1)``. For each rule that
    fires, ``rule.trigger()`` is called and an alert is created with the
    rule's severity. The session is flushed once at the end.

    Args:
        model_id: Model ID.

    Returns:
        The alerts created during this run; an empty list when the model
        does not exist or no rule fired.
    """
    if await self.model_repo.get_by_id(model_id) is None:
        return []

    active_rules = await self.rule_repo.get_by_model_id(model_id, active_only=True)
    recent_metrics = await self.get_model_metrics(model_id, hours=1)

    raised = []
    for rule in active_rules:
        fired, value, threshold = self._evaluate_rule(rule, recent_metrics)
        if not fired:
            continue

        rule.trigger()
        raised.append(
            await self.create_alert(
                model_id=model_id,
                rule_id=rule.id,
                message=f"Rule '{rule.name}' triggered: value={value}, threshold={threshold}",
                severity=rule.severity,
                metric_value=value,
                threshold_value=threshold,
            )
        )

    await self.session.flush()
    return raised
845
+
846
def _evaluate_rule(
    self,
    rule: ModelAlertRule,
    metrics: dict[str, Any],
) -> tuple[bool, float | None, float | None]:
    """Evaluate a single rule against aggregated metrics.

    Two rule types are understood; any other type never triggers.

    * ``"threshold"``: compares the metric's ``last_value`` (falling back
      to ``avg_value`` only when ``last_value`` is absent) with
      ``config["threshold"]`` using ``config["comparison"]`` (one of
      ``gt``, ``lt``, ``gte``, ``lte``, ``eq``; anything else never
      triggers).
    * ``"statistical"``: a heuristic, not a true standard-deviation test.
      It triggers when ``p95_value`` exceeds
      ``avg_value * (1 + std_devs * 0.1)``.

    Args:
        rule: Alert rule to evaluate; its ``rule_type`` and ``config`` are
            read. A ``None`` config is treated as empty.
        metrics: Aggregated metrics; the entries under the ``"metrics"``
            key are searched for the first one whose ``"name"`` matches
            the configured ``metric_name`` (default ``"latency_ms"``).

    Returns:
        Tuple of (triggered, value, threshold). For a threshold rule with
        no usable metric value the result is ``(False, None, threshold)``;
        for a statistical rule that does not trigger, and for unknown rule
        types, it is ``(False, None, None)``.
    """
    rule_type = rule.rule_type
    if rule_type not in ("threshold", "statistical"):
        return False, None, None

    config = rule.config or {}
    metric_name = config.get("metric_name", "latency_ms")

    # Only the first entry carrying the configured name is considered.
    entry = next(
        (m for m in metrics.get("metrics", []) if m.get("name") == metric_name),
        None,
    )

    if rule_type == "threshold":
        threshold = config.get("threshold", 0)
        comparison = config.get("comparison", "gt")

        value = None
        if entry is not None:
            # Explicit None check: a reading of 0 is a real value and must
            # not be replaced by the average.
            value = entry.get("last_value")
            if value is None:
                value = entry.get("avg_value")

        if value is None:
            return False, None, threshold

        comparators = {
            "gt": lambda observed, limit: observed > limit,
            "lt": lambda observed, limit: observed < limit,
            "gte": lambda observed, limit: observed >= limit,
            "lte": lambda observed, limit: observed <= limit,
            "eq": lambda observed, limit: observed == limit,
        }
        compare = comparators.get(comparison)
        triggered = compare(value, threshold) if compare is not None else False
        return triggered, value, threshold

    # rule_type == "statistical"
    std_devs = config.get("std_devs", 3.0)
    if entry is not None:
        avg = entry.get("avg_value")
        p95 = entry.get("p95_value")
        # A missing or zero average/p95 is treated as "no signal".
        if avg and p95:
            limit = avg * (1 + std_devs * 0.1)
            if p95 > limit:
                return True, p95, limit

    return False, None, None
912
+
913
+ # =========================================================================
914
+ # Dashboard Data
915
+ # =========================================================================
916
+
917
async def get_monitoring_overview(self) -> dict[str, Any]:
    """Collect the fleet-wide figures shown on the monitoring dashboard.

    Prediction counts and latencies cover the 24 hours before the call and
    are gathered, one query per model, for the models returned by
    ``self.list_models()``.

    Returns:
        Dictionary with the keys ``total_models``, ``active_models``,
        ``degraded_models``, ``total_predictions_24h``, ``active_alerts``,
        ``models_with_drift`` and ``avg_latency_ms`` (``None`` when no
        latencies were found).
    """
    models, total_models = await self.list_models()
    window_start = datetime.utcnow() - timedelta(hours=24)

    # Predictions recorded since the window start, summed over the models.
    predictions_24h = 0
    for entry in models:
        predictions_24h += await self.prediction_repo.count_by_model(entry.id, window_start)

    open_alerts = await self.alert_repo.count_active()

    active_total = await self.model_repo.count_by_status(ModelStatus.ACTIVE.value)
    degraded_total = await self.model_repo.count_by_status(ModelStatus.DEGRADED.value)

    # A model counts as drifting when its drift score is above 0.1; a
    # missing score is treated as 0.
    drifting = len([m for m in models if (m.current_drift_score or 0) > 0.1])

    # Pool the latencies of all models before averaging.
    latency_samples = []
    for entry in models:
        latency_samples.extend(
            await self.prediction_repo.get_latencies(entry.id, window_start)
        )

    mean_latency = None
    if latency_samples:
        mean_latency = statistics.mean(latency_samples)

    return {
        "total_models": total_models,
        "active_models": active_total,
        "degraded_models": degraded_total,
        "total_predictions_24h": predictions_24h,
        "active_alerts": open_alerts,
        "models_with_drift": drifting,
        "avg_latency_ms": mean_latency,
    }
962
+
963
async def get_model_dashboard(self, model_id: str) -> dict[str, Any]:
    """Assemble the dashboard payload for one model.

    Combines the model's 24-hour metrics, its active alerts, the number of
    predictions in the last hour and a derived health status. The health
    status is ``"degraded"`` or ``"error"`` when the model's status says
    so, otherwise ``"warning"`` if there are active alerts, otherwise
    ``"healthy"``.

    Args:
        model_id: Model ID.

    Returns:
        Dictionary with the keys ``model``, ``metrics``, ``active_alerts``,
        ``recent_predictions`` and ``health_status``.

    Raises:
        ValueError: If model not found.
    """
    model = await self.model_repo.get_by_id(model_id)
    if model is None:
        raise ValueError(f"Model '{model_id}' not found")

    metrics = await self.get_model_metrics(model_id, hours=24)
    open_alerts, _total = await self.get_alerts(model_id, active_only=True)

    # Predictions recorded during the last hour.
    hour_ago = datetime.utcnow() - timedelta(hours=1)
    recent_count = await self.prediction_repo.count_by_model(model_id, hour_ago)

    # The model's own status takes precedence over the alert-based signal.
    if model.status == ModelStatus.DEGRADED.value:
        health = "degraded"
    elif model.status == ModelStatus.ERROR.value:
        health = "error"
    else:
        health = "warning" if open_alerts else "healthy"

    serialized_alerts = [self._alert_to_dict(item) for item in open_alerts]
    return {
        "model": self._model_to_dict(model),
        "metrics": metrics,
        "active_alerts": serialized_alerts,
        "recent_predictions": recent_count,
        "health_status": health,
    }
1003
+
1004
+ # =========================================================================
1005
+ # Helpers
1006
+ # =========================================================================
1007
+
1008
def _model_to_dict(self, model: MonitoredModel) -> dict[str, Any]:
    """Serialize a monitored model into a plain dictionary.

    Timestamps are rendered with ``isoformat()``; an unset timestamp
    becomes ``None``. The model's ``metadata_json`` attribute is exposed
    under the ``"metadata"`` key.
    """

    def as_iso(stamp):
        # Unset timestamps stay None instead of raising on .isoformat().
        return stamp.isoformat() if stamp else None

    payload = {
        "id": model.id,
        "name": model.name,
        "version": model.version,
        "description": model.description,
        "status": model.status,
        "config": model.config,
        "metadata": model.metadata_json,
        "prediction_count": model.prediction_count,
        "last_prediction_at": as_iso(model.last_prediction_at),
        "current_drift_score": model.current_drift_score,
        "health_score": model.health_score,
        "created_at": as_iso(model.created_at),
        "updated_at": as_iso(model.updated_at),
    }
    return payload
1025
+
1026
def _alert_to_dict(self, alert: ModelAlert) -> dict[str, Any]:
    """Serialize an alert into a plain dictionary.

    Timestamps are rendered with ``isoformat()``; an unset timestamp
    becomes ``None``. All other fields are copied as they are.
    """

    def as_iso(stamp):
        # Unset timestamps stay None instead of raising on .isoformat().
        return stamp.isoformat() if stamp else None

    payload = {
        "id": alert.id,
        "model_id": alert.model_id,
        "rule_id": alert.rule_id,
        "severity": alert.severity,
        "message": alert.message,
        "metric_value": alert.metric_value,
        "threshold_value": alert.threshold_value,
        "acknowledged": alert.acknowledged,
        "acknowledged_by": alert.acknowledged_by,
        "acknowledged_at": as_iso(alert.acknowledged_at),
        "resolved": alert.resolved,
        "resolved_at": as_iso(alert.resolved_at),
        "created_at": as_iso(alert.created_at),
        "updated_at": as_iso(alert.updated_at),
    }
    return payload