truthound-dashboard 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
- truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1043 @@
|
|
|
1
|
+
"""Model monitoring service.
|
|
2
|
+
|
|
3
|
+
This module provides services for ML model monitoring,
|
|
4
|
+
including model registration, prediction recording, metrics aggregation,
|
|
5
|
+
and alert management with database persistence.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
from datetime import datetime, timedelta
|
|
12
|
+
from typing import Any
|
|
13
|
+
import statistics
|
|
14
|
+
|
|
15
|
+
from sqlalchemy import and_, func, select
|
|
16
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
17
|
+
|
|
18
|
+
from truthound_dashboard.db import BaseRepository
|
|
19
|
+
from truthound_dashboard.db.models import (
|
|
20
|
+
AlertSeverityLevel,
|
|
21
|
+
ModelAlert,
|
|
22
|
+
ModelAlertHandler,
|
|
23
|
+
ModelAlertRule,
|
|
24
|
+
ModelMetric,
|
|
25
|
+
ModelPrediction,
|
|
26
|
+
ModelStatus,
|
|
27
|
+
MonitoredModel,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# =============================================================================
|
|
32
|
+
# Repositories
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class MonitoredModelRepository(BaseRepository[MonitoredModel]):
    """Query helpers for ``MonitoredModel`` rows."""

    model = MonitoredModel

    async def get_by_name(self, name: str) -> MonitoredModel | None:
        """Look up a single model by name.

        Args:
            name: Model name to match exactly.

        Returns:
            The matching model, or None when no row matches.
        """
        stmt = select(MonitoredModel).where(MonitoredModel.name == name)
        rows = await self.session.execute(stmt)
        return rows.scalar_one_or_none()

    async def get_by_status(
        self,
        status: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[MonitoredModel]:
        """Return one page of models with the given status, newest first.

        Args:
            status: Status value to match.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.

        Returns:
            Matching models ordered by ``created_at`` descending.
        """
        stmt = select(MonitoredModel).where(MonitoredModel.status == status)
        stmt = stmt.order_by(MonitoredModel.created_at.desc())
        stmt = stmt.offset(offset).limit(limit)
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_models(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[MonitoredModel]:
        """Return one page of models whose status is ``ModelStatus.ACTIVE``."""
        active_status = ModelStatus.ACTIVE.value
        return await self.get_by_status(active_status, offset=offset, limit=limit)

    async def count_by_status(self, status: str) -> int:
        """Return how many models have the given status."""
        status_filter = MonitoredModel.status == status
        return await self.count(filters=[status_filter])
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ModelPredictionRepository(BaseRepository[ModelPrediction]):
    """Query helpers for ``ModelPrediction`` rows."""

    model = ModelPrediction

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        offset: int = 0,
        limit: int = 100,
        since: datetime | None = None,
    ) -> Sequence[ModelPrediction]:
        """Return one page of a model's predictions, newest first.

        Args:
            model_id: ID of the model whose predictions are wanted.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
            since: When given, only rows with ``recorded_at >= since``.

        Returns:
            Predictions ordered by ``recorded_at`` descending.
        """
        conditions = [ModelPrediction.model_id == model_id]
        if since:
            conditions.append(ModelPrediction.recorded_at >= since)

        stmt = (
            select(ModelPrediction)
            .where(*conditions)
            .order_by(ModelPrediction.recorded_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def count_by_model(
        self,
        model_id: str,
        since: datetime | None = None,
    ) -> int:
        """Count a model's predictions, optionally only those at or after ``since``."""
        conditions = [ModelPrediction.model_id == model_id]
        if since:
            conditions.append(ModelPrediction.recorded_at >= since)
        return await self.count(filters=conditions)

    async def get_latencies(
        self,
        model_id: str,
        since: datetime,
    ) -> list[float]:
        """Return the non-null latencies recorded for a model since ``since``.

        Args:
            model_id: ID of the model.
            since: Lower bound (inclusive) on ``recorded_at``.

        Returns:
            Latency values ordered by ``recorded_at`` descending, so the
            first element belongs to the most recent prediction.
        """
        criteria = and_(
            ModelPrediction.model_id == model_id,
            ModelPrediction.recorded_at >= since,
            ModelPrediction.latency_ms.isnot(None),
        )
        stmt = (
            select(ModelPrediction.latency_ms)
            .where(criteria)
            .order_by(ModelPrediction.recorded_at.desc())
        )
        rows = await self.session.execute(stmt)

        latencies: list[float] = []
        for row in rows.fetchall():
            latency = row[0]
            # The query already excludes NULLs; this check is kept as a second guard.
            if latency is not None:
                latencies.append(latency)
        return latencies
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ModelMetricRepository(BaseRepository[ModelMetric]):
    """Query helpers for ``ModelMetric`` rows."""

    model = ModelMetric

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        metric_type: str | None = None,
        since: datetime | None = None,
        offset: int = 0,
        limit: int = 100,
    ) -> Sequence[ModelMetric]:
        """Return one page of a model's metrics, newest first.

        Args:
            model_id: ID of the model whose metrics are wanted.
            metric_type: When given, only metrics of this type.
            since: When given, only rows with ``recorded_at >= since``.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.

        Returns:
            Metrics ordered by ``recorded_at`` descending.
        """
        conditions = [ModelMetric.model_id == model_id]
        if metric_type:
            conditions.append(ModelMetric.metric_type == metric_type)
        if since:
            conditions.append(ModelMetric.recorded_at >= since)

        stmt = (
            select(ModelMetric)
            .where(*conditions)
            .order_by(ModelMetric.recorded_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def record_metric(
        self,
        model_id: str,
        metric_type: str,
        metric_name: str,
        value: float,
        labels: dict[str, str] | None = None,
    ) -> ModelMetric:
        """Create a metric row from the given values.

        Args:
            model_id: ID of the model the metric belongs to.
            metric_type: Metric category.
            metric_name: Metric name.
            value: Metric value.
            labels: Optional labels stored with the metric.

        Returns:
            Whatever ``create`` returns for the new row.
        """
        fields = {
            "model_id": model_id,
            "metric_type": metric_type,
            "metric_name": metric_name,
            "value": value,
            "labels": labels,
        }
        return await self.create(**fields)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class ModelAlertRuleRepository(BaseRepository[ModelAlertRule]):
    """Query helpers for ``ModelAlertRule`` rows."""

    model = ModelAlertRule

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        active_only: bool = False,
    ) -> Sequence[ModelAlertRule]:
        """Return a model's alert rules, newest first.

        Args:
            model_id: ID of the model whose rules are wanted.
            active_only: When true, only rules with ``is_active`` set.

        Returns:
            Rules ordered by ``created_at`` descending.
        """
        conditions = [ModelAlertRule.model_id == model_id]
        if active_only:
            # `== True` builds a SQL comparison here, not a Python bool.
            conditions.append(ModelAlertRule.is_active == True)  # noqa: E712

        stmt = (
            select(ModelAlertRule)
            .where(*conditions)
            .order_by(ModelAlertRule.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_rules(self) -> Sequence[ModelAlertRule]:
        """Return every active rule across all models, newest first."""
        only_active = ModelAlertRule.is_active == True  # noqa: E712
        stmt = (
            select(ModelAlertRule)
            .where(only_active)
            .order_by(ModelAlertRule.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class ModelAlertHandlerRepository(BaseRepository[ModelAlertHandler]):
    """Query helpers for ``ModelAlertHandler`` rows."""

    model = ModelAlertHandler

    async def get_active_handlers(self) -> Sequence[ModelAlertHandler]:
        """Return every active handler, newest first."""
        # `== True` builds a SQL comparison here, not a Python bool.
        only_active = ModelAlertHandler.is_active == True  # noqa: E712
        stmt = (
            select(ModelAlertHandler)
            .where(only_active)
            .order_by(ModelAlertHandler.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_by_type(
        self,
        handler_type: str,
    ) -> Sequence[ModelAlertHandler]:
        """Return the handlers of the given type, newest first.

        Args:
            handler_type: Handler type value to match.

        Returns:
            Handlers ordered by ``created_at`` descending.
        """
        same_type = ModelAlertHandler.handler_type == handler_type
        stmt = (
            select(ModelAlertHandler)
            .where(same_type)
            .order_by(ModelAlertHandler.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class ModelAlertRepository(BaseRepository[ModelAlert]):
    """Query helpers for ``ModelAlert`` rows."""

    model = ModelAlert

    async def get_by_model_id(
        self,
        model_id: str,
        *,
        active_only: bool = False,
        severity: str | None = None,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[ModelAlert]:
        """Return one page of a model's alerts, newest first.

        Args:
            model_id: ID of the model whose alerts are wanted.
            active_only: When true, only alerts that are not resolved.
            severity: When given, only alerts of this severity.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.

        Returns:
            Alerts ordered by ``created_at`` descending.
        """
        conditions = [ModelAlert.model_id == model_id]
        if active_only:
            # `== False` builds a SQL comparison here, not a Python bool.
            conditions.append(ModelAlert.resolved == False)  # noqa: E712
        if severity:
            conditions.append(ModelAlert.severity == severity)

        stmt = (
            select(ModelAlert)
            .where(*conditions)
            .order_by(ModelAlert.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def get_active_alerts(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[ModelAlert]:
        """Return one page of unresolved alerts across all models, newest first."""
        unresolved = ModelAlert.resolved == False  # noqa: E712
        stmt = select(ModelAlert).where(unresolved)
        stmt = stmt.order_by(ModelAlert.created_at.desc())
        stmt = stmt.offset(offset).limit(limit)
        rows = await self.session.execute(stmt)
        return rows.scalars().all()

    async def count_active(self, model_id: str | None = None) -> int:
        """Count unresolved alerts, optionally restricted to one model."""
        conditions = [ModelAlert.resolved == False]  # noqa: E712
        if model_id:
            conditions.append(ModelAlert.model_id == model_id)
        return await self.count(filters=conditions)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# =============================================================================
|
|
304
|
+
# Service
|
|
305
|
+
# =============================================================================
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
class ModelMonitoringService:
|
|
309
|
+
"""Service for ML model monitoring.
|
|
310
|
+
|
|
311
|
+
Provides functionality for:
|
|
312
|
+
- Model registration and management
|
|
313
|
+
- Prediction recording and metrics
|
|
314
|
+
- Alert rules and handlers
|
|
315
|
+
- Dashboard data aggregation
|
|
316
|
+
"""
|
|
317
|
+
|
|
318
|
+
def __init__(self, session: AsyncSession) -> None:
    """Bind the service and all of its repositories to one session.

    Args:
        session: Database session shared by every repository.
    """
    self.session = session
    # Models, predictions and metrics.
    self.model_repo, self.prediction_repo, self.metric_repo = (
        MonitoredModelRepository(session),
        ModelPredictionRepository(session),
        ModelMetricRepository(session),
    )
    # Alert rules, alert handlers and raised alerts.
    self.rule_repo, self.handler_repo, self.alert_repo = (
        ModelAlertRuleRepository(session),
        ModelAlertHandlerRepository(session),
        ModelAlertRepository(session),
    )
|
|
331
|
+
|
|
332
|
+
# =========================================================================
|
|
333
|
+
# Model Registration
|
|
334
|
+
# =========================================================================
|
|
335
|
+
|
|
336
|
+
async def register_model(
    self,
    name: str,
    *,
    version: str = "1.0.0",
    description: str | None = None,
    config: dict[str, Any] | None = None,
    metadata: dict[str, Any] | None = None,
) -> MonitoredModel:
    """Create a monitored-model record in the active state.

    Args:
        name: Model name.
        version: Model version.
        description: Model description.
        config: Monitoring configuration; an empty dict is stored when
            this is None or empty.
        metadata: Additional metadata, stored as ``metadata_json``.

    Returns:
        Created MonitoredModel.
    """
    fields: dict[str, Any] = {
        "name": name,
        "version": version,
        "description": description,
        "config": config or {},
        "metadata_json": metadata,
        # New models start active, with no predictions and a full health score.
        "status": ModelStatus.ACTIVE.value,
        "prediction_count": 0,
        "health_score": 100.0,
    }
    return await self.model_repo.create(**fields)
|
|
367
|
+
|
|
368
|
+
async def get_model(self, model_id: str) -> MonitoredModel | None:
    """Fetch a model by ID via the model repository."""
    found = await self.model_repo.get_by_id(model_id)
    return found
|
|
371
|
+
|
|
372
|
+
async def get_model_by_name(self, name: str) -> MonitoredModel | None:
    """Fetch a model by name via the model repository."""
    found = await self.model_repo.get_by_name(name)
    return found
|
|
375
|
+
|
|
376
|
+
async def list_models(
    self,
    *,
    status: str | None = None,
    offset: int = 0,
    limit: int = 50,
) -> tuple[Sequence[MonitoredModel], int]:
    """Return one page of models together with the total match count.

    Args:
        status: Optional status filter; a falsy value means no filter.
        offset: Number to skip.
        limit: Maximum to return.

    Returns:
        Tuple of (models, total_count). The count uses the same filter
        as the page but ignores offset and limit.
    """
    # The repository receives None, not an empty list, when unfiltered.
    criteria = [MonitoredModel.status == status] if status else None

    page = await self.model_repo.list(
        offset=offset,
        limit=limit,
        filters=criteria,
    )
    total = await self.model_repo.count(filters=criteria)
    return page, total
|
|
405
|
+
|
|
406
|
+
async def update_model(
    self,
    model_id: str,
    **updates: Any,
) -> MonitoredModel | None:
    """Apply field updates to an existing model.

    Keys that are not attributes of the model, and keys whose value is
    None, are skipped silently.

    Args:
        model_id: Model ID.
        **updates: Fields to update.

    Returns:
        Updated model or None if not found.
    """
    target = await self.model_repo.get_by_id(model_id)
    if target is None:
        return None

    for field_name, new_value in updates.items():
        if not hasattr(target, field_name) or new_value is None:
            continue
        setattr(target, field_name, new_value)

    await self.session.flush()
    return target
|
|
430
|
+
|
|
431
|
+
async def delete_model(self, model_id: str) -> bool:
    """Delete a model by ID and return the repository's result."""
    deleted = await self.model_repo.delete(model_id)
    return deleted
|
|
434
|
+
|
|
435
|
+
async def pause_model(self, model_id: str) -> MonitoredModel | None:
    """Call ``pause()`` on the model and flush the session.

    Returns:
        The model, or the repository's falsy result (e.g. None) when no
        model was found; nothing is flushed in that case.
    """
    target = await self.model_repo.get_by_id(model_id)
    if not target:
        return target

    target.pause()
    await self.session.flush()
    return target
|
|
442
|
+
|
|
443
|
+
async def resume_model(self, model_id: str) -> MonitoredModel | None:
    """Call ``resume()`` on the model and flush the session.

    Returns:
        The model, or the repository's falsy result (e.g. None) when no
        model was found; nothing is flushed in that case.
    """
    target = await self.model_repo.get_by_id(model_id)
    if not target:
        return target

    target.resume()
    await self.session.flush()
    return target
|
|
450
|
+
|
|
451
|
+
# =========================================================================
|
|
452
|
+
# Prediction Recording
|
|
453
|
+
# =========================================================================
|
|
454
|
+
|
|
455
|
+
async def record_prediction(
    self,
    model_id: str,
    features: dict[str, Any],
    prediction: Any,
    *,
    actual: Any | None = None,
    latency_ms: float | None = None,
    metadata: dict[str, Any] | None = None,
) -> ModelPrediction:
    """Store one prediction and update the owning model's stats.

    Args:
        model_id: Model ID.
        features: Input features.
        prediction: Model output.
        actual: Actual value (optional).
        latency_ms: Prediction latency; when given it is also stored as
            a ``latency`` metric.
        metadata: Additional metadata, stored as ``metadata_json``.

    Returns:
        Created ModelPrediction.

    Raises:
        ValueError: If model not found.
    """
    owner = await self.model_repo.get_by_id(model_id)
    if owner is None:
        raise ValueError(f"Model '{model_id}' not found")

    row_fields: dict[str, Any] = {
        "model_id": model_id,
        "features": features,
        "prediction": prediction,
        "actual": actual,
        "latency_ms": latency_ms,
        "metadata_json": metadata,
    }
    created = await self.prediction_repo.create(**row_fields)

    # Update the model's own stats, then flush.
    owner.record_prediction()
    await self.session.flush()

    if latency_ms is None:
        return created

    # Store the latency as a metric as well.
    await self.metric_repo.record_metric(
        model_id=model_id,
        metric_type="latency",
        metric_name="latency_ms",
        value=latency_ms,
    )
    return created
|
|
509
|
+
|
|
510
|
+
async def get_predictions(
    self,
    model_id: str,
    *,
    offset: int = 0,
    limit: int = 100,
    hours: int | None = None,
) -> Sequence[ModelPrediction]:
    """Return one page of a model's predictions.

    Args:
        model_id: Model ID.
        offset: Number to skip.
        limit: Maximum to return.
        hours: When truthy, only predictions from the last ``hours``
            hours; None or 0 applies no time filter.

    Returns:
        Predictions as returned by the prediction repository.
    """
    # The cutoff is a naive UTC datetime.
    since = datetime.utcnow() - timedelta(hours=hours) if hours else None
    return await self.prediction_repo.get_by_model_id(
        model_id,
        offset=offset,
        limit=limit,
        since=since,
    )
|
|
526
|
+
|
|
527
|
+
# =========================================================================
|
|
528
|
+
# Metrics
|
|
529
|
+
# =========================================================================
|
|
530
|
+
|
|
531
|
+
async def get_model_metrics(
    self,
    model_id: str,
    hours: int = 24,
) -> dict[str, Any]:
    """Summarise a model's latency and throughput over a recent window.

    Args:
        model_id: Model ID.
        hours: Size of the look-back window in hours.

    Returns:
        Dictionary with ``model_id``, ``model_name``, ``time_range_hours``,
        ``metrics`` (a list of summary dicts) and ``data_points``. The
        latency summary is only present when at least one latency was
        recorded in the window; the throughput summary is always present.
        ``data_points`` is currently always an empty dict.

    Raises:
        ValueError: If model not found.
    """
    model = await self.model_repo.get_by_id(model_id)
    if model is None:
        raise ValueError(f"Model '{model_id}' not found")

    # Naive UTC cutoff, passed to both repository queries below.
    cutoff = datetime.utcnow() - timedelta(hours=hours)

    latencies = await self.prediction_repo.get_latencies(model_id, cutoff)
    pred_count = await self.prediction_repo.count_by_model(model_id, cutoff)

    metrics: list[dict[str, Any]] = []
    data_points: dict[str, list[dict[str, Any]]] = {}

    # Latency summary. Inside this guard the list is non-empty, so every
    # index below is valid and no per-field emptiness check is needed.
    if latencies:
        sorted_latencies = sorted(latencies)
        n = len(sorted_latencies)

        metrics.append({
            "name": "latency_ms",
            "type": "latency",
            "count": n,
            "min_value": min(latencies),
            "max_value": max(latencies),
            "avg_value": statistics.mean(latencies),
            # Percentiles index the sorted list at int(n * p), which is
            # always < n; for an even n the p50 is the upper middle value.
            "p50_value": sorted_latencies[n // 2],
            "p95_value": sorted_latencies[int(n * 0.95)],
            "p99_value": sorted_latencies[int(n * 0.99)],
            # First element in the order the repository returned them.
            "last_value": latencies[0],
        })

    # Throughput: predictions per hour; max() avoids dividing by zero
    # (or by a negative window).
    metrics.append({
        "name": "throughput",
        "type": "throughput",
        "count": 1,
        "last_value": pred_count / max(hours, 1),
    })

    return {
        "model_id": model_id,
        "model_name": model.name,
        "time_range_hours": hours,
        "metrics": metrics,
        "data_points": data_points,
    }
|
|
593
|
+
|
|
594
|
+
async def record_metric(
    self,
    model_id: str,
    metric_type: str,
    metric_name: str,
    value: float,
    labels: dict[str, str] | None = None,
) -> ModelMetric:
    """Store a custom metric through the metric repository.

    Args:
        model_id: ID of the model the metric belongs to.
        metric_type: Metric category.
        metric_name: Metric name.
        value: Metric value.
        labels: Optional labels stored with the metric.

    Returns:
        The metric returned by the repository.
    """
    fields = {
        "model_id": model_id,
        "metric_type": metric_type,
        "metric_name": metric_name,
        "value": value,
        "labels": labels,
    }
    return await self.metric_repo.record_metric(**fields)
|
|
610
|
+
|
|
611
|
+
# =========================================================================
|
|
612
|
+
# Alert Rules
|
|
613
|
+
# =========================================================================
|
|
614
|
+
|
|
615
|
+
async def create_alert_rule(
    self,
    model_id: str,
    name: str,
    rule_type: str,
    config: dict[str, Any],
    *,
    severity: str = "warning",
) -> ModelAlertRule:
    """Register a new alert rule for an existing model.

    The rule is always created with ``is_active=True``.

    Args:
        model_id: Model ID.
        name: Rule name.
        rule_type: Rule type (threshold, statistical, trend).
        config: Rule configuration.
        severity: Alert severity.

    Returns:
        Created ModelAlertRule.

    Raises:
        ValueError: If model not found.
    """
    # Guard clause: refuse to attach a rule to a model that does not exist.
    if await self.model_repo.get_by_id(model_id) is None:
        raise ValueError(f"Model '{model_id}' not found")

    rule_fields = {
        "model_id": model_id,
        "name": name,
        "rule_type": rule_type,
        "severity": severity,
        "config": config,
        "is_active": True,
    }
    return await self.rule_repo.create(**rule_fields)
|
|
651
|
+
|
|
652
|
+
async def get_alert_rules(
    self,
    model_id: str | None = None,
    active_only: bool = False,
) -> Sequence[ModelAlertRule]:
    """Fetch alert rules, optionally scoped to a model and/or active rules.

    Args:
        model_id: When truthy, only rules of this model are returned.
        active_only: When True, restrict the result to active rules.

    Returns:
        The rules returned by the rule repository for the chosen query.
    """
    repo = self.rule_repo
    if not model_id:
        # No model filter: pick between the "active" and "everything" queries.
        if not active_only:
            return await repo.list()
        return await repo.get_active_rules()
    return await repo.get_by_model_id(model_id, active_only=active_only)
|
|
663
|
+
|
|
664
|
+
async def update_alert_rule(
    self,
    rule_id: str,
    **updates: Any,
) -> ModelAlertRule | None:
    """Apply partial updates to an alert rule.

    Only keys that already exist as attributes on the rule are applied, and
    ``None`` values are skipped (so a field cannot be cleared through this
    method).

    Args:
        rule_id: ID of the rule to update.
        **updates: Attribute name / new value pairs.

    Returns:
        The updated rule, or None when no rule has this ID.
    """
    target = await self.rule_repo.get_by_id(rule_id)
    if target is None:
        return None

    for field, new_value in updates.items():
        if not hasattr(target, field) or new_value is None:
            continue
        setattr(target, field, new_value)

    await self.session.flush()
    return target
|
|
680
|
+
|
|
681
|
+
async def delete_alert_rule(self, rule_id: str) -> bool:
    """Remove an alert rule and return the rule repository's delete result."""
    was_deleted = await self.rule_repo.delete(rule_id)
    return was_deleted
|
|
684
|
+
|
|
685
|
+
# =========================================================================
|
|
686
|
+
# Alert Handlers
|
|
687
|
+
# =========================================================================
|
|
688
|
+
|
|
689
|
+
async def create_alert_handler(
    self,
    name: str,
    handler_type: str,
    config: dict[str, Any],
) -> ModelAlertHandler:
    """Register a new alert handler; it is always created as active.

    Args:
        name: Handler name.
        handler_type: Handler type string, forwarded unchanged.
        config: Handler configuration.

    Returns:
        The handler returned by the handler repository.
    """
    handler_fields = {
        "name": name,
        "handler_type": handler_type,
        "config": config,
        "is_active": True,
    }
    return await self.handler_repo.create(**handler_fields)
|
|
702
|
+
|
|
703
|
+
async def get_alert_handlers(
    self,
    active_only: bool = False,
) -> Sequence[ModelAlertHandler]:
    """Fetch alert handlers.

    Args:
        active_only: When True, only active handlers are requested.

    Returns:
        The handlers returned by the handler repository.
    """
    repo = self.handler_repo
    # Pick the query first, then await it once.
    fetch = repo.get_active_handlers if active_only else repo.list
    return await fetch()
|
|
711
|
+
|
|
712
|
+
async def update_alert_handler(
    self,
    handler_id: str,
    **updates: Any,
) -> ModelAlertHandler | None:
    """Apply partial updates to an alert handler.

    Only keys that already exist as attributes on the handler are applied,
    and ``None`` values are skipped.

    Args:
        handler_id: ID of the handler to update.
        **updates: Attribute name / new value pairs.

    Returns:
        The updated handler, or None when no handler has this ID.
    """
    target = await self.handler_repo.get_by_id(handler_id)
    if target is None:
        return None

    for field, new_value in updates.items():
        if not hasattr(target, field) or new_value is None:
            continue
        setattr(target, field, new_value)

    await self.session.flush()
    return target
|
|
728
|
+
|
|
729
|
+
async def delete_alert_handler(self, handler_id: str) -> bool:
    """Remove an alert handler and return the handler repository's delete result."""
    was_deleted = await self.handler_repo.delete(handler_id)
    return was_deleted
|
|
732
|
+
|
|
733
|
+
# =========================================================================
|
|
734
|
+
# Alerts
|
|
735
|
+
# =========================================================================
|
|
736
|
+
|
|
737
|
+
async def create_alert(
    self,
    model_id: str,
    rule_id: str,
    message: str,
    *,
    severity: str = "warning",
    metric_value: float | None = None,
    threshold_value: float | None = None,
) -> ModelAlert:
    """Persist a new alert that starts unacknowledged and unresolved.

    Args:
        model_id: ID of the model the alert is about.
        rule_id: ID of the rule that produced the alert.
        message: Alert message.
        severity: Alert severity.
        metric_value: Observed metric value, if any.
        threshold_value: Threshold the value was compared against, if any.

    Returns:
        The alert returned by the alert repository.
    """
    alert_fields = {
        "model_id": model_id,
        "rule_id": rule_id,
        "severity": severity,
        "message": message,
        "metric_value": metric_value,
        "threshold_value": threshold_value,
        "acknowledged": False,
        "resolved": False,
    }
    return await self.alert_repo.create(**alert_fields)
|
|
758
|
+
|
|
759
|
+
async def get_alerts(
    self,
    model_id: str | None = None,
    active_only: bool = False,
    severity: str | None = None,
    offset: int = 0,
    limit: int = 50,
) -> tuple[Sequence[ModelAlert], int]:
    """Fetch one page of alerts together with a total count.

    NOTE(review): for a model-scoped query with ``active_only=False`` the
    returned total is the length of the fetched page, not a repository
    count. ``severity`` is only forwarded when ``model_id`` is given, and
    the active count for a model does not take ``severity`` into account.

    Args:
        model_id: When truthy, restrict alerts to this model.
        active_only: When True, only active alerts are requested.
        severity: Severity filter (model-scoped queries only).
        offset: Pagination offset.
        limit: Page size.

    Returns:
        Tuple of (alerts on this page, total).
    """
    repo = self.alert_repo

    if not model_id:
        # Global queries: the repository supplies both the page and the count.
        if active_only:
            page = await repo.get_active_alerts(offset=offset, limit=limit)
            return page, await repo.count_active()
        page = await repo.list(offset=offset, limit=limit)
        return page, await repo.count()

    page = await repo.get_by_model_id(
        model_id,
        active_only=active_only,
        severity=severity,
        offset=offset,
        limit=limit,
    )
    if active_only:
        total = await repo.count_active(model_id)
    else:
        total = len(page)
    return page, total
|
|
783
|
+
|
|
784
|
+
async def acknowledge_alert(
    self,
    alert_id: str,
    actor: str,
) -> ModelAlert | None:
    """Mark an alert as acknowledged by ``actor`` and flush the session.

    Args:
        alert_id: ID of the alert to acknowledge.
        actor: Identifier passed to the alert's ``acknowledge`` method.

    Returns:
        The alert, or None when no alert has this ID.
    """
    found = await self.alert_repo.get_by_id(alert_id)
    if found is not None:
        found.acknowledge(actor)
        await self.session.flush()
    return found
|
|
797
|
+
|
|
798
|
+
async def resolve_alert(self, alert_id: str) -> ModelAlert | None:
    """Mark an alert as resolved and flush the session.

    Returns:
        The alert, or None when no alert has this ID.
    """
    found = await self.alert_repo.get_by_id(alert_id)
    if found is not None:
        found.resolve()
        await self.session.flush()
    return found
|
|
807
|
+
|
|
808
|
+
# =========================================================================
|
|
809
|
+
# Rule Evaluation
|
|
810
|
+
# =========================================================================
|
|
811
|
+
|
|
812
|
+
async def evaluate_rules(self, model_id: str) -> list[ModelAlert]:
    """Run every active rule of a model against its last hour of metrics.

    Each rule that fires has its ``trigger()`` method called and gets one
    new alert carrying the rule's severity, the observed value and the
    threshold. The session is flushed once after all rules were processed.

    Args:
        model_id: Model ID.

    Returns:
        List of created alerts (empty when the model does not exist).
    """
    if await self.model_repo.get_by_id(model_id) is None:
        return []

    active_rules = await self.rule_repo.get_by_model_id(model_id, active_only=True)
    snapshot = await self.get_model_metrics(model_id, hours=1)

    created = []
    for rule in active_rules:
        fired, observed, limit = self._evaluate_rule(rule, snapshot)
        if not fired:
            continue
        rule.trigger()
        new_alert = await self.create_alert(
            model_id=model_id,
            rule_id=rule.id,
            message=f"Rule '{rule.name}' triggered: value={observed}, threshold={limit}",
            severity=rule.severity,
            metric_value=observed,
            threshold_value=limit,
        )
        created.append(new_alert)

    await self.session.flush()
    return created
|
|
845
|
+
|
|
846
|
+
def _evaluate_rule(
    self,
    rule: ModelAlertRule,
    metrics: dict[str, Any],
) -> tuple[bool, float | None, float | None]:
    """Evaluate a single rule against aggregated metrics.

    Supported rule types:

    * ``"threshold"``: compares the metric's ``last_value`` (falling back
      to ``avg_value`` only when ``last_value`` is absent/None) with
      ``config["threshold"]`` using ``config["comparison"]`` (one of
      ``gt``, ``lt``, ``gte``, ``lte``, ``eq``; anything else never fires).
    * ``"statistical"``: fires when ``p95_value`` exceeds
      ``avg_value * (1 + std_devs * 0.1)``.

    Any other rule type never fires.

    Args:
        rule: Alert rule to evaluate (reads ``rule.config`` and
            ``rule.rule_type``).
        metrics: Aggregated metrics; the summaries are read from the
            ``"metrics"`` list and matched by their ``"name"`` key.

    Returns:
        Tuple of (triggered, value, threshold).
    """
    config = rule.config
    rule_type = rule.rule_type

    def find_summary(wanted: str) -> dict[str, Any] | None:
        # First summary with a matching name wins.
        for entry in metrics.get("metrics", []):
            if entry.get("name") == wanted:
                return entry
        return None

    if rule_type == "threshold":
        metric_name = config.get("metric_name", "latency_ms")
        threshold = config.get("threshold", 0)
        comparison = config.get("comparison", "gt")

        value = None
        summary = find_summary(metric_name)
        if summary is not None:
            # Explicit None checks: a reading of 0 / 0.0 is a real value
            # (e.g. zero throughput) and must not be treated as missing.
            value = summary.get("last_value")
            if value is None:
                value = summary.get("avg_value")

        if value is None:
            return False, None, threshold

        if comparison == "gt":
            triggered = value > threshold
        elif comparison == "lt":
            triggered = value < threshold
        elif comparison == "gte":
            triggered = value >= threshold
        elif comparison == "lte":
            triggered = value <= threshold
        elif comparison == "eq":
            triggered = value == threshold
        else:
            # Unknown comparison operators never fire.
            triggered = False

        return triggered, value, threshold

    if rule_type == "statistical":
        metric_name = config.get("metric_name", "latency_ms")
        std_devs = config.get("std_devs", 3.0)

        summary = find_summary(metric_name)
        if summary is not None:
            avg = summary.get("avg_value")
            p95 = summary.get("p95_value")
            # Both values must be present and non-zero for the heuristic.
            if avg and p95:
                # Simple heuristic: each configured "std dev" widens the
                # allowed band around the average by 10%.
                limit = avg * (1 + std_devs * 0.1)
                if p95 > limit:
                    return True, p95, limit

        return False, None, None

    return False, None, None
|
|
912
|
+
|
|
913
|
+
# =========================================================================
|
|
914
|
+
# Dashboard Data
|
|
915
|
+
# =========================================================================
|
|
916
|
+
|
|
917
|
+
async def get_monitoring_overview(self) -> dict[str, Any]:
    """Collect the headline numbers shown on the monitoring dashboard.

    Prediction counts and latencies are gathered over the last 24 hours for
    the models returned by ``self.list_models()``; a model counts as
    drifting when its ``current_drift_score`` is above 0.1.

    Returns:
        Overview statistics.
    """
    models, total_models = await self.list_models()
    since = datetime.utcnow() - timedelta(hours=24)

    # Predictions recorded in the last 24h, summed over the listed models.
    predictions_24h = 0
    for monitored in models:
        per_model = await self.prediction_repo.count_by_model(monitored.id, since)
        predictions_24h += per_model

    open_alerts = await self.alert_repo.count_active()

    n_active = await self.model_repo.count_by_status(ModelStatus.ACTIVE.value)
    n_degraded = await self.model_repo.count_by_status(ModelStatus.DEGRADED.value)

    # A missing (None) drift score is treated as 0, i.e. not drifting.
    drifting = len([m for m in models if (m.current_drift_score or 0) > 0.1])

    # Pool every latency sample so the mean is weighted by prediction volume.
    latency_samples = []
    for monitored in models:
        samples = await self.prediction_repo.get_latencies(monitored.id, since)
        latency_samples.extend(samples)

    if latency_samples:
        mean_latency = statistics.mean(latency_samples)
    else:
        mean_latency = None

    return {
        "total_models": total_models,
        "active_models": n_active,
        "degraded_models": n_degraded,
        "total_predictions_24h": predictions_24h,
        "active_alerts": open_alerts,
        "models_with_drift": drifting,
        "avg_latency_ms": mean_latency,
    }
|
|
962
|
+
|
|
963
|
+
async def get_model_dashboard(self, model_id: str) -> dict[str, Any]:
    """Assemble the per-model dashboard payload.

    Combines the serialized model, its 24h metrics, its active alerts, the
    number of predictions in the last hour and a derived health status.
    The status is "degraded" or "error" when the model's own status says
    so; otherwise "warning" if there are active alerts, else "healthy".

    Args:
        model_id: Model ID.

    Returns:
        Dashboard data dictionary.

    Raises:
        ValueError: If model not found.
    """
    model = await self.model_repo.get_by_id(model_id)
    if model is None:
        raise ValueError(f"Model '{model_id}' not found")

    metrics = await self.get_model_metrics(model_id, hours=24)
    open_alerts, _total = await self.get_alerts(model_id, active_only=True)

    one_hour_ago = datetime.utcnow() - timedelta(hours=1)
    recent_count = await self.prediction_repo.count_by_model(model_id, one_hour_ago)

    # The model's own status takes precedence over alert-derived health.
    if model.status == ModelStatus.DEGRADED.value:
        health = "degraded"
    elif model.status == ModelStatus.ERROR.value:
        health = "error"
    else:
        health = "warning" if open_alerts else "healthy"

    return {
        "model": self._model_to_dict(model),
        "metrics": metrics,
        "active_alerts": [self._alert_to_dict(item) for item in open_alerts],
        "recent_predictions": recent_count,
        "health_status": health,
    }
|
|
1003
|
+
|
|
1004
|
+
# =========================================================================
|
|
1005
|
+
# Helpers
|
|
1006
|
+
# =========================================================================
|
|
1007
|
+
|
|
1008
|
+
def _model_to_dict(self, model: MonitoredModel) -> dict[str, Any]:
    """Serialize a monitored model into a plain dictionary.

    Timestamp attributes are rendered with ``isoformat()`` and become None
    when unset; ``metadata_json`` is exposed under the ``"metadata"`` key.
    """

    def iso(moment):
        # Unset timestamps serialize as None rather than raising.
        return moment.isoformat() if moment else None

    payload = {
        "id": model.id,
        "name": model.name,
        "version": model.version,
        "description": model.description,
        "status": model.status,
        "config": model.config,
        "metadata": model.metadata_json,
        "prediction_count": model.prediction_count,
        "last_prediction_at": iso(model.last_prediction_at),
        "current_drift_score": model.current_drift_score,
        "health_score": model.health_score,
        "created_at": iso(model.created_at),
        "updated_at": iso(model.updated_at),
    }
    return payload
|
|
1025
|
+
|
|
1026
|
+
def _alert_to_dict(self, alert: ModelAlert) -> dict[str, Any]:
    """Serialize an alert into a plain dictionary.

    Timestamp attributes are rendered with ``isoformat()`` and become None
    when unset.
    """

    def iso(moment):
        # Unset timestamps serialize as None rather than raising.
        return moment.isoformat() if moment else None

    payload = {
        "id": alert.id,
        "model_id": alert.model_id,
        "rule_id": alert.rule_id,
        "severity": alert.severity,
        "message": alert.message,
        "metric_value": alert.metric_value,
        "threshold_value": alert.threshold_value,
        "acknowledged": alert.acknowledged,
        "acknowledged_by": alert.acknowledged_by,
        "acknowledged_at": iso(alert.acknowledged_at),
        "resolved": alert.resolved,
        "resolved_at": iso(alert.resolved_at),
        "created_at": iso(alert.created_at),
        "updated_at": iso(alert.updated_at),
    }
    return payload
|