kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -7
- kailash/cli/__init__.py +11 -1
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/circuit_breaker.py +71 -1
- kailash/core/resilience/health_monitor.py +172 -0
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/base.py +545 -0
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +392 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/ports.py +676 -0
- kailash/runtime/local.py +344 -1
- kailash/runtime/validation/__init__.py +20 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +234 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/migration.py +3 -3
- kailash/workflow/type_inference.py +669 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
kailash/monitoring/alerts.py (new file)
@@ -0,0 +1,646 @@
+"""
+Alerting system for monitoring validation failures and security violations.
+
+Provides configurable alerting rules, notification channels, and alert management
+for critical events in the Kailash SDK validation system.
+"""
+
+import json
+import logging
+import smtplib
+import threading
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import UTC, datetime, timedelta
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import requests
+
+from .metrics import MetricSeries, MetricsRegistry
+
+logger = logging.getLogger(__name__)
+
+
+class AlertSeverity(Enum):
+    """Alert severity levels."""
+
+    INFO = "info"
+    WARNING = "warning"
+    ERROR = "error"
+    CRITICAL = "critical"
+
+
+class AlertStatus(Enum):
+    """Alert status."""
+
+    PENDING = "pending"
+    FIRING = "firing"
+    RESOLVED = "resolved"
+    SILENCED = "silenced"
+
+
+@dataclass
+class Alert:
+    """Alert instance."""
+
+    id: str
+    rule_name: str
+    severity: AlertSeverity
+    title: str
+    description: str
+    labels: Dict[str, str] = field(default_factory=dict)
+    annotations: Dict[str, str] = field(default_factory=dict)
+    status: AlertStatus = AlertStatus.PENDING
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+    fired_at: Optional[datetime] = None
+    resolved_at: Optional[datetime] = None
+    last_notification: Optional[datetime] = None
+    notification_count: int = 0
+
+    def fire(self):
+        """Mark alert as firing."""
+        if self.status != AlertStatus.FIRING:
+            self.status = AlertStatus.FIRING
+            self.fired_at = datetime.now(UTC)
+
+    def resolve(self):
+        """Mark alert as resolved."""
+        if self.status == AlertStatus.FIRING:
+            self.status = AlertStatus.RESOLVED
+            self.resolved_at = datetime.now(UTC)
+
+    def silence(self):
+        """Silence the alert."""
+        self.status = AlertStatus.SILENCED
+
+    def should_notify(self, notification_interval: timedelta) -> bool:
+        """Check if alert should send notification."""
+        if self.status != AlertStatus.FIRING:
+            return False
+
+        if self.last_notification is None:
+            return True
+
+        return datetime.now(UTC) - self.last_notification >= notification_interval
+
+    def mark_notified(self):
+        """Mark that notification was sent."""
+        self.last_notification = datetime.now(UTC)
+        self.notification_count += 1
+
+
+@dataclass
+class AlertRule:
+    """Alert rule configuration."""
+
+    name: str
+    description: str
+    severity: AlertSeverity
+    metric_name: str
+    condition: str  # e.g., "> 10", "< 0.95", "== 0"
+    threshold: Union[int, float]
+    time_window: timedelta = timedelta(minutes=5)
+    evaluation_interval: timedelta = timedelta(minutes=1)
+    notification_interval: timedelta = timedelta(minutes=15)
+    labels: Dict[str, str] = field(default_factory=dict)
+    annotations: Dict[str, str] = field(default_factory=dict)
+    enabled: bool = True
+
+    def evaluate(self, metric_series: MetricSeries) -> bool:
+        """Evaluate if alert condition is met.
+
+        Args:
+            metric_series: Metric series to evaluate
+
+        Returns:
+            True if alert condition is met
+        """
+        if not self.enabled:
+            return False
+
+        # Get metric value over time window
+        if self.condition.startswith("rate"):
+            # Rate-based condition
+            value = metric_series.get_rate(self.time_window)
+        elif self.condition.startswith("avg"):
+            # Average-based condition
+            value = metric_series.get_average(self.time_window)
+        elif self.condition.startswith("max"):
+            # Maximum-based condition
+            value = metric_series.get_max(self.time_window)
+        else:
+            # Latest value condition
+            value = metric_series.get_latest_value()
+
+        if value is None:
+            return False
+
+        # Evaluate condition
+        if "> " in self.condition:
+            return value > self.threshold
+        elif "< " in self.condition:
+            return value < self.threshold
+        elif ">= " in self.condition:
+            return value >= self.threshold
+        elif "<= " in self.condition:
+            return value <= self.threshold
+        elif "== " in self.condition:
+            return value == self.threshold
+        elif "!= " in self.condition:
+            return value != self.threshold
+        else:
+            logger.warning(f"Unknown condition format: {self.condition}")
+            return False
+
+
+class NotificationChannel(ABC):
+    """Base class for notification channels."""
+
+    @abstractmethod
+    def send_notification(self, alert: Alert, context: Dict[str, Any]) -> bool:
+        """Send notification for alert.
+
+        Args:
+            alert: Alert to send notification for
+            context: Additional context information
+
+        Returns:
+            True if notification was sent successfully
+        """
+        pass
+
+
+class LogNotificationChannel(NotificationChannel):
+    """Log-based notification channel."""
+
+    def __init__(self, log_level: str = "ERROR"):
+        """Initialize log notification channel.
+
+        Args:
+            log_level: Log level for notifications
+        """
+        self.log_level = getattr(logging, log_level.upper())
+
+    def send_notification(self, alert: Alert, context: Dict[str, Any]) -> bool:
+        """Send notification via logging."""
+        message = (
+            f"ALERT [{alert.severity.value.upper()}] {alert.title}: {alert.description}"
+        )
+        logger.log(self.log_level, message)
+        return True
+
+
+class EmailNotificationChannel(NotificationChannel):
+    """Email notification channel."""
+
+    def __init__(
+        self,
+        smtp_host: str,
+        smtp_port: int,
+        username: str,
+        password: str,
+        from_email: str,
+        to_emails: List[str],
+        use_tls: bool = True,
+    ):
+        """Initialize email notification channel.
+
+        Args:
+            smtp_host: SMTP server host
+            smtp_port: SMTP server port
+            username: SMTP username
+            password: SMTP password
+            from_email: From email address
+            to_emails: List of recipient email addresses
+            use_tls: Whether to use TLS
+        """
+        self.smtp_host = smtp_host
+        self.smtp_port = smtp_port
+        self.username = username
+        self.password = password
+        self.from_email = from_email
+        self.to_emails = to_emails
+        self.use_tls = use_tls
+
+    def send_notification(self, alert: Alert, context: Dict[str, Any]) -> bool:
+        """Send notification via email."""
+        try:
+            from email.mime.multipart import MimeMultipart
+            from email.mime.text import MimeText
+
+            msg = MimeMultipart()
+            msg["From"] = self.from_email
+            msg["To"] = ", ".join(self.to_emails)
+            msg["Subject"] = f"[{alert.severity.value.upper()}] {alert.title}"
+
+            body = self._format_email_body(alert, context)
+            msg.attach(MimeText(body, "html"))
+
+            server = smtplib.SMTP(self.smtp_host, self.smtp_port)
+            if self.use_tls:
+                server.starttls()
+            server.login(self.username, self.password)
+            server.sendmail(self.from_email, self.to_emails, msg.as_string())
+            server.quit()
+
+            return True
+        except Exception as e:
+            logger.error(f"Failed to send email notification: {e}")
+            return False
+
+    def _format_email_body(self, alert: Alert, context: Dict[str, Any]) -> str:
+        """Format email body for alert."""
+        return f"""
+        <html>
+        <body>
+            <h2>Kailash SDK Alert: {alert.title}</h2>
+            <p><strong>Severity:</strong> {alert.severity.value.upper()}</p>
+            <p><strong>Status:</strong> {alert.status.value}</p>
+            <p><strong>Description:</strong> {alert.description}</p>
+            <p><strong>Created:</strong> {alert.created_at.isoformat()}</p>
+
+            <h3>Labels:</h3>
+            <ul>
+            {"".join(f"<li><strong>{k}:</strong> {v}</li>" for k, v in alert.labels.items())}
+            </ul>
+
+            <h3>Context:</h3>
+            <ul>
+            {"".join(f"<li><strong>{k}:</strong> {v}</li>" for k, v in context.items())}
+            </ul>
+        </body>
+        </html>
+        """
+
+
+class SlackNotificationChannel(NotificationChannel):
+    """Slack notification channel."""
+
+    def __init__(self, webhook_url: str, channel: str = "#alerts"):
+        """Initialize Slack notification channel.
+
+        Args:
+            webhook_url: Slack webhook URL
+            channel: Slack channel to send alerts to
+        """
+        self.webhook_url = webhook_url
+        self.channel = channel
+
+    def send_notification(self, alert: Alert, context: Dict[str, Any]) -> bool:
+        """Send notification via Slack."""
+        try:
+            color_map = {
+                AlertSeverity.INFO: "good",
+                AlertSeverity.WARNING: "warning",
+                AlertSeverity.ERROR: "danger",
+                AlertSeverity.CRITICAL: "danger",
+            }
+
+            payload = {
+                "channel": self.channel,
+                "username": "Kailash SDK Monitor",
+                "icon_emoji": ":warning:",
+                "attachments": [
+                    {
+                        "color": color_map.get(alert.severity, "danger"),
+                        "title": f"{alert.severity.value.upper()}: {alert.title}",
+                        "text": alert.description,
+                        "fields": [
+                            {
+                                "title": "Status",
+                                "value": alert.status.value,
+                                "short": True,
+                            },
+                            {
+                                "title": "Created",
+                                "value": alert.created_at.isoformat(),
+                                "short": True,
+                            },
+                        ]
+                        + [
+                            {"title": k, "value": str(v), "short": True}
+                            for k, v in {**alert.labels, **context}.items()
+                        ],
+                        "ts": int(alert.created_at.timestamp()),
+                    }
+                ],
+            }
+
+            response = requests.post(self.webhook_url, json=payload, timeout=10)
+            response.raise_for_status()
+            return True
+        except Exception as e:
+            logger.error(f"Failed to send Slack notification: {e}")
+            return False
+
+
+class WebhookNotificationChannel(NotificationChannel):
+    """Generic webhook notification channel."""
+
+    def __init__(self, webhook_url: str, headers: Optional[Dict[str, str]] = None):
+        """Initialize webhook notification channel.
+
+        Args:
+            webhook_url: Webhook URL
+            headers: Optional HTTP headers
+        """
+        self.webhook_url = webhook_url
+        self.headers = headers or {}
+
+    def send_notification(self, alert: Alert, context: Dict[str, Any]) -> bool:
+        """Send notification via webhook."""
+        try:
+            payload = {
+                "alert": {
+                    "id": alert.id,
+                    "rule_name": alert.rule_name,
+                    "severity": alert.severity.value,
+                    "status": alert.status.value,
+                    "title": alert.title,
+                    "description": alert.description,
+                    "labels": alert.labels,
+                    "annotations": alert.annotations,
+                    "created_at": alert.created_at.isoformat(),
+                    "fired_at": alert.fired_at.isoformat() if alert.fired_at else None,
+                },
+                "context": context,
+            }
+
+            response = requests.post(
+                self.webhook_url, json=payload, headers=self.headers, timeout=10
+            )
+            response.raise_for_status()
+            return True
+        except Exception as e:
+            logger.error(f"Failed to send webhook notification: {e}")
+            return False
+
+
+class AlertManager:
+    """Alert manager for handling alerting rules and notifications."""
+
+    def __init__(self, metrics_registry: MetricsRegistry):
+        """Initialize alert manager.
+
+        Args:
+            metrics_registry: Metrics registry to monitor
+        """
+        self.metrics_registry = metrics_registry
+        self.rules: Dict[str, AlertRule] = {}
+        self.alerts: Dict[str, Alert] = {}
+        self.notification_channels: List[NotificationChannel] = []
+        self._lock = threading.RLock()
+        self._running = False
+        self._thread: Optional[threading.Thread] = None
+
+    def add_rule(self, rule: AlertRule):
+        """Add an alerting rule.
+
+        Args:
+            rule: AlertRule to add
+        """
+        with self._lock:
+            self.rules[rule.name] = rule
+
+    def remove_rule(self, rule_name: str):
+        """Remove an alerting rule.
+
+        Args:
+            rule_name: Name of rule to remove
+        """
+        with self._lock:
+            if rule_name in self.rules:
+                del self.rules[rule_name]
+
+    def add_notification_channel(self, channel: NotificationChannel):
+        """Add a notification channel.
+
+        Args:
+            channel: NotificationChannel to add
+        """
+        with self._lock:
+            self.notification_channels.append(channel)
+
+    def start(self):
+        """Start the alert manager."""
+        with self._lock:
+            if self._running:
+                return
+
+            self._running = True
+            self._thread = threading.Thread(target=self._evaluation_loop, daemon=True)
+            self._thread.start()
+            logger.info("Alert manager started")
+
+    def stop(self):
+        """Stop the alert manager."""
+        with self._lock:
+            self._running = False
+            if self._thread:
+                self._thread.join(timeout=5)
+            logger.info("Alert manager stopped")
+
+    def _evaluation_loop(self):
+        """Main evaluation loop for alert rules."""
+        while self._running:
+            try:
+                self._evaluate_rules()
+                self._process_notifications()
+                time.sleep(10)  # Evaluate every 10 seconds
+            except Exception as e:
+                logger.error(f"Error in alert evaluation loop: {e}")
+
+    def _evaluate_rules(self):
+        """Evaluate all alert rules."""
+        with self._lock:
+            for rule in self.rules.values():
+                if not rule.enabled:
+                    continue
+
+                try:
+                    # Find matching metrics
+                    for (
+                        collector_name,
+                        collector,
+                    ) in self.metrics_registry.get_all_collectors().items():
+                        metric_series = collector.get_metric(rule.metric_name)
+                        if metric_series:
+                            self._evaluate_rule(rule, metric_series, collector_name)
+                except Exception as e:
+                    logger.error(f"Error evaluating rule {rule.name}: {e}")
+
+    def _evaluate_rule(
+        self, rule: AlertRule, metric_series: MetricSeries, collector_name: str
+    ):
+        """Evaluate a single rule against a metric series."""
+        alert_id = f"{rule.name}_{collector_name}"
+
+        # Check if condition is met
+        condition_met = rule.evaluate(metric_series)
+
+        if condition_met:
+            # Create or update alert
+            if alert_id not in self.alerts:
+                alert = Alert(
+                    id=alert_id,
+                    rule_name=rule.name,
+                    severity=rule.severity,
+                    title=f"{rule.name} ({collector_name})",
+                    description=rule.description,
+                    labels={
+                        **rule.labels,
+                        "collector": collector_name,
+                        "metric": rule.metric_name,
+                    },
+                    annotations=rule.annotations,
+                )
+                self.alerts[alert_id] = alert
+
+            # Fire the alert
+            self.alerts[alert_id].fire()
+        else:
+            # Resolve alert if it exists and is firing
+            if (
+                alert_id in self.alerts
+                and self.alerts[alert_id].status == AlertStatus.FIRING
+            ):
+                self.alerts[alert_id].resolve()

+    def _process_notifications(self):
+        """Process notifications for firing alerts."""
+        with self._lock:
+            for alert in self.alerts.values():
+                if alert.status != AlertStatus.FIRING:
+                    continue
+
+                rule = self.rules.get(alert.rule_name)
+                if not rule:
+                    continue
+
+                if alert.should_notify(rule.notification_interval):
+                    self._send_notifications(alert)
+
+    def _send_notifications(self, alert: Alert):
+        """Send notifications for an alert."""
+        context = {
+            "metric_value": self._get_current_metric_value(alert),
+            "notification_count": alert.notification_count + 1,
+            "time_since_created": str(datetime.now(UTC) - alert.created_at),
+        }
+
+        success = False
+        for channel in self.notification_channels:
+            try:
+                if channel.send_notification(alert, context):
+                    success = True
+            except Exception as e:
+                logger.error(
+                    f"Failed to send notification via {type(channel).__name__}: {e}"
+                )
+
+        if success:
+            alert.mark_notified()
+
+    def _get_current_metric_value(self, alert: Alert) -> Optional[Union[int, float]]:
+        """Get current metric value for alert context."""
+        for collector in self.metrics_registry.get_all_collectors().values():
+            metric_series = collector.get_metric(alert.labels.get("metric"))
+            if metric_series:
+                return metric_series.get_latest_value()
+        return None
+
+    def get_active_alerts(self) -> List[Alert]:
+        """Get all active (firing) alerts."""
+        with self._lock:
+            return [
+                alert
+                for alert in self.alerts.values()
+                if alert.status == AlertStatus.FIRING
+            ]
+
+    def get_all_alerts(self) -> List[Alert]:
+        """Get all alerts."""
+        with self._lock:
+            return list(self.alerts.values())
+
+    def silence_alert(self, alert_id: str):
+        """Silence an alert.
+
+        Args:
+            alert_id: Alert ID to silence
+        """
+        with self._lock:
+            if alert_id in self.alerts:
+                self.alerts[alert_id].silence()
+
+    def acknowledge_alert(self, alert_id: str):
+        """Acknowledge an alert (same as silence for now).
+
+        Args:
+            alert_id: Alert ID to acknowledge
+        """
+        self.silence_alert(alert_id)
+
+
+def create_default_alert_rules() -> List[AlertRule]:
+    """Create default alert rules for common scenarios."""
+    return [
+        # Validation failure rate
+        AlertRule(
+            name="high_validation_failure_rate",
+            description="Validation failure rate is above 10%",
+            severity=AlertSeverity.ERROR,
+            metric_name="validation_failure",
+            condition="rate > 0.1",
+            threshold=0.1,
+            time_window=timedelta(minutes=5),
+            labels={"component": "validation"},
+        ),
+        # Security violations
+        AlertRule(
+            name="security_violations_detected",
+            description="Security violations detected",
+            severity=AlertSeverity.CRITICAL,
+            metric_name="security_violations_total",
+            condition="rate > 0",
+            threshold=0,
+            time_window=timedelta(minutes=1),
+            notification_interval=timedelta(minutes=5),
+            labels={"component": "security"},
+        ),
+        # High response time
+        AlertRule(
+            name="high_response_time",
+            description="Average response time is above 1 second",
+            severity=AlertSeverity.WARNING,
+            metric_name="response_time",
+            condition="avg > 1000",
+            threshold=1000,
+            time_window=timedelta(minutes=5),
+            labels={"component": "performance"},
+        ),
+        # Low cache hit rate
+        AlertRule(
+            name="low_cache_hit_rate",
+            description="Cache hit rate is below 80%",
+            severity=AlertSeverity.WARNING,
+            metric_name="validation_cache_hits",
+            condition="rate < 0.8",
+            threshold=0.8,
+            time_window=timedelta(minutes=10),
+            labels={"component": "cache"},
+        ),
+        # High memory usage
+        AlertRule(
+            name="high_memory_usage",
+            description="Memory usage is above 90%",
+            severity=AlertSeverity.ERROR,
+            metric_name="memory_usage",
+            condition="> 90",
+            threshold=90,
+            time_window=timedelta(minutes=2),
+            labels={"component": "system"},
+        ),
+    ]
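The AlertManager API added above could be wired together roughly as follows. This is a minimal sketch based only on the code visible in this diff; the no-argument MetricsRegistry() constructor and the "queue_depth" metric name are illustrative assumptions, not something documented in the release.

# Illustrative sketch only -- based on the API shown in the diff above.
# Assumptions: MetricsRegistry() takes no arguments; "queue_depth" is a
# hypothetical metric name used purely for demonstration.
from kailash.monitoring.alerts import (
    AlertManager,
    AlertRule,
    AlertSeverity,
    LogNotificationChannel,
    create_default_alert_rules,
)
from kailash.monitoring.metrics import MetricsRegistry

registry = MetricsRegistry()  # assumed no-arg constructor
manager = AlertManager(registry)

# Register the bundled defaults plus one custom threshold rule.
for rule in create_default_alert_rules():
    manager.add_rule(rule)
manager.add_rule(
    AlertRule(
        name="queue_depth_too_high",
        description="Work queue depth is above 500",
        severity=AlertSeverity.WARNING,
        metric_name="queue_depth",  # hypothetical metric
        condition="> 500",
        threshold=500,
    )
)

# Route firing alerts to the application log; the background evaluation
# loop runs roughly every 10 seconds once start() is called.
manager.add_notification_channel(LogNotificationChannel(log_level="WARNING"))
manager.start()
try:
    ...  # application runs; collectors feed the registry
finally:
    print([a.title for a in manager.get_active_alerts()])
    manager.stop()

Notification throttling is per rule: should_notify() compares the time since the last notification against the rule's notification_interval, so a continuously firing alert is re-sent at most once per interval.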