kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -7
- kailash/cli/__init__.py +11 -1
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/circuit_breaker.py +71 -1
- kailash/core/resilience/health_monitor.py +172 -0
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/base.py +545 -0
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +392 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/ports.py +676 -0
- kailash/runtime/local.py +344 -1
- kailash/runtime/validation/__init__.py +20 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +234 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/migration.py +3 -3
- kailash/workflow/type_inference.py +669 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
kailash/edge/monitoring/edge_monitor.py
@@ -0,0 +1,736 @@
"""Edge monitoring service for comprehensive edge observability.

This service provides real-time monitoring, alerting, and analytics
for edge node operations, performance, and health.
"""

import asyncio
import statistics
import time
from collections import defaultdict, deque
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple


class MetricType(Enum):
    """Types of metrics collected."""

    LATENCY = "latency"
    THROUGHPUT = "throughput"
    ERROR_RATE = "error_rate"
    RESOURCE_USAGE = "resource_usage"
    AVAILABILITY = "availability"
    CACHE_HIT_RATE = "cache_hit_rate"
    MIGRATION_TIME = "migration_time"
    COORDINATION_OVERHEAD = "coordination_overhead"


class AlertSeverity(Enum):
    """Alert severity levels."""

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"


class HealthStatus(Enum):
    """Edge node health status."""

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
    UNKNOWN = "unknown"


@dataclass
class EdgeMetric:
    """Represents a single metric measurement."""

    timestamp: datetime
    edge_node: str
    metric_type: MetricType
    value: float
    tags: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "timestamp": self.timestamp.isoformat(),
            "edge_node": self.edge_node,
            "metric_type": self.metric_type.value,
            "value": self.value,
            "tags": self.tags,
        }


@dataclass
class EdgeAlert:
    """Represents an alert for edge issues."""

    alert_id: str
    timestamp: datetime
    edge_node: str
    severity: AlertSeverity
    metric_type: MetricType
    message: str
    current_value: float
    threshold: float
    tags: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "alert_id": self.alert_id,
            "timestamp": self.timestamp.isoformat(),
            "edge_node": self.edge_node,
            "severity": self.severity.value,
            "metric_type": self.metric_type.value,
            "message": self.message,
            "current_value": self.current_value,
            "threshold": self.threshold,
            "tags": self.tags,
        }


@dataclass
class EdgeHealth:
    """Edge node health information."""

    edge_node: str
    status: HealthStatus
    last_check: datetime
    uptime_seconds: float
    metrics_summary: Dict[str, float]
    issues: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "edge_node": self.edge_node,
            "status": self.status.value,
            "last_check": self.last_check.isoformat(),
            "uptime_seconds": self.uptime_seconds,
            "metrics_summary": self.metrics_summary,
            "issues": self.issues,
        }


class EdgeMonitor:
    """Edge monitoring service for observability and alerting.

    Provides comprehensive monitoring of edge nodes including:
    - Real-time metrics collection
    - Health monitoring
    - Alerting based on thresholds
    - Performance analytics
    - Anomaly detection
    """

    def __init__(
        self,
        retention_period: int = 24 * 60 * 60,  # 24 hours
        alert_cooldown: int = 300,  # 5 minutes
        health_check_interval: int = 30,  # 30 seconds
        anomaly_detection: bool = True,
    ):
        """Initialize edge monitor.

        Args:
            retention_period: How long to retain metrics (seconds)
            alert_cooldown: Cooldown between alerts for same issue
            health_check_interval: Interval between health checks
            anomaly_detection: Enable anomaly detection
        """
        self.retention_period = retention_period
        self.alert_cooldown = alert_cooldown
        self.health_check_interval = health_check_interval
        self.anomaly_detection = anomaly_detection

        # Metrics storage
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=10000))
        self.aggregated_metrics: Dict[str, Dict[str, List[float]]] = defaultdict(
            lambda: defaultdict(list)
        )

        # Health tracking
        self.health_status: Dict[str, EdgeHealth] = {}
        self.node_start_times: Dict[str, datetime] = {}

        # Alerting
        self.alerts: List[EdgeAlert] = []
        self.alert_history: Dict[str, datetime] = {}
        self.alert_thresholds: Dict[MetricType, Dict[str, float]] = {
            MetricType.LATENCY: {"warning": 0.5, "error": 1.0, "critical": 2.0},
            MetricType.ERROR_RATE: {"warning": 0.05, "error": 0.1, "critical": 0.2},
            MetricType.RESOURCE_USAGE: {
                "warning": 0.7,
                "error": 0.85,
                "critical": 0.95,
            },
            MetricType.AVAILABILITY: {"warning": 0.99, "error": 0.95, "critical": 0.9},
            MetricType.CACHE_HIT_RATE: {"warning": 0.7, "error": 0.5, "critical": 0.3},
        }

        # Analytics
        self.baseline_metrics: Dict[str, Dict[MetricType, float]] = defaultdict(dict)

        # Background tasks
        self._running = False
        self._health_check_task = None
        self._cleanup_task = None
        self._analytics_task = None

    async def start(self):
        """Start monitoring service."""
        self._running = True
        self._health_check_task = asyncio.create_task(self._health_check_loop())
        self._cleanup_task = asyncio.create_task(self._cleanup_loop())
        if self.anomaly_detection:
            self._analytics_task = asyncio.create_task(self._analytics_loop())

    async def stop(self):
        """Stop monitoring service."""
        self._running = False

        tasks = [self._health_check_task, self._cleanup_task, self._analytics_task]
        for task in tasks:
            if task:
                task.cancel()
                try:
                    await task
                except asyncio.CancelledError:
                    pass

    async def record_metric(self, metric: EdgeMetric):
        """Record a metric measurement.

        Args:
            metric: Metric to record
        """
        # Store in time-series
        key = f"{metric.edge_node}:{metric.metric_type.value}"
        self.metrics[key].append(metric)

        # Update aggregated metrics for fast queries
        self.aggregated_metrics[metric.edge_node][metric.metric_type].append(
            metric.value
        )

        # Check thresholds and generate alerts
        await self._check_thresholds(metric)

        # Update node tracking
        if metric.edge_node not in self.node_start_times:
            self.node_start_times[metric.edge_node] = datetime.now()

    async def get_metrics(
        self,
        edge_node: Optional[str] = None,
        metric_type: Optional[MetricType] = None,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        tags: Optional[Dict[str, str]] = None,
    ) -> List[EdgeMetric]:
        """Query metrics with filters.

        Args:
            edge_node: Filter by edge node
            metric_type: Filter by metric type
            start_time: Start of time range
            end_time: End of time range
            tags: Filter by tags

        Returns:
            List of matching metrics
        """
        results = []

        # Determine keys to search
        if edge_node and metric_type:
            keys = [f"{edge_node}:{metric_type.value}"]
        elif edge_node:
            keys = [k for k in self.metrics.keys() if k.startswith(f"{edge_node}:")]
        elif metric_type:
            keys = [
                k for k in self.metrics.keys() if k.endswith(f":{metric_type.value}")
            ]
        else:
            keys = list(self.metrics.keys())

        # Filter metrics
        for key in keys:
            for metric in self.metrics[key]:
                # Time range filter
                if start_time and metric.timestamp < start_time:
                    continue
                if end_time and metric.timestamp > end_time:
                    continue

                # Tag filter
                if tags:
                    if not all(metric.tags.get(k) == v for k, v in tags.items()):
                        continue

                results.append(metric)

        return sorted(results, key=lambda m: m.timestamp)

    async def get_edge_health(self, edge_node: str) -> EdgeHealth:
        """Get health status for an edge node.

        Args:
            edge_node: Edge node identifier

        Returns:
            Health status
        """
        if edge_node in self.health_status:
            return self.health_status[edge_node]

        # Create new health entry
        health = EdgeHealth(
            edge_node=edge_node,
            status=HealthStatus.UNKNOWN,
            last_check=datetime.now(),
            uptime_seconds=0,
            metrics_summary={},
        )

        self.health_status[edge_node] = health
        return health

    async def get_alerts(
        self,
        edge_node: Optional[str] = None,
        severity: Optional[AlertSeverity] = None,
        start_time: Optional[datetime] = None,
        active_only: bool = False,
    ) -> List[EdgeAlert]:
        """Get alerts with filters.

        Args:
            edge_node: Filter by edge node
            severity: Filter by severity
            start_time: Filter alerts after this time
            active_only: Only return active alerts

        Returns:
            List of matching alerts
        """
        results = []

        for alert in self.alerts:
            # Edge node filter
            if edge_node and alert.edge_node != edge_node:
                continue

            # Severity filter
            if severity and alert.severity != severity:
                continue

            # Time filter
            if start_time and alert.timestamp < start_time:
                continue

            # Active filter
            if active_only:
                # Check if alert is still active (within cooldown)
                key = f"{alert.edge_node}:{alert.metric_type.value}"
                if key in self.alert_history:
                    if (
                        datetime.now() - self.alert_history[key]
                    ).total_seconds() > self.alert_cooldown:
                        continue

            results.append(alert)

        return sorted(results, key=lambda a: a.timestamp, reverse=True)

    def get_analytics(self, edge_node: str) -> Dict[str, Any]:
        """Get analytics for an edge node.

        Args:
            edge_node: Edge node identifier

        Returns:
            Analytics summary
        """
        analytics = {
            "edge_node": edge_node,
            "metrics_summary": {},
            "trends": {},
            "anomalies": [],
            "recommendations": [],
        }

        # Calculate summaries for each metric type
        for metric_type, values in self.aggregated_metrics[edge_node].items():
            if not values:
                continue

            # Basic statistics
            analytics["metrics_summary"][metric_type.value] = {
                "count": len(values),
                "mean": statistics.mean(values),
                "median": statistics.median(values),
                "std_dev": statistics.stdev(values) if len(values) > 1 else 0,
                "min": min(values),
                "max": max(values),
                "p95": sorted(values)[int(len(values) * 0.95)] if values else 0,
                "p99": sorted(values)[int(len(values) * 0.99)] if values else 0,
            }

            # Trend analysis (simple moving average)
            if len(values) > 10:
                recent = values[-10:]
                older = values[-20:-10] if len(values) > 20 else values[:10]

                recent_avg = statistics.mean(recent)
                older_avg = statistics.mean(older)

                trend = "stable"
                if recent_avg > older_avg * 1.1:
                    trend = "increasing"
                elif recent_avg < older_avg * 0.9:
                    trend = "decreasing"

                analytics["trends"][metric_type.value] = {
                    "direction": trend,
                    "change_percent": (
                        ((recent_avg - older_avg) / older_avg * 100) if older_avg else 0
                    ),
                }

        # Detect anomalies
        if self.anomaly_detection:
            anomalies = self._detect_anomalies(edge_node)
            analytics["anomalies"] = [a.to_dict() for a in anomalies]

        # Generate recommendations
        analytics["recommendations"] = self._generate_recommendations(
            edge_node, analytics
        )

        return analytics

    async def _check_thresholds(self, metric: EdgeMetric):
        """Check if metric violates thresholds and create alerts."""
        if metric.metric_type not in self.alert_thresholds:
            return

        thresholds = self.alert_thresholds[metric.metric_type]
        alert_key = f"{metric.edge_node}:{metric.metric_type.value}"

        # Check cooldown
        if alert_key in self.alert_history:
            if (
                datetime.now() - self.alert_history[alert_key]
            ).total_seconds() < self.alert_cooldown:
                return

        # Determine severity
        severity = None
        threshold_value = None

        # For availability and cache hit rate, lower is worse
        if metric.metric_type in [MetricType.AVAILABILITY, MetricType.CACHE_HIT_RATE]:
            if metric.value <= thresholds.get("critical", 0):
                severity = AlertSeverity.CRITICAL
                threshold_value = thresholds["critical"]
            elif metric.value <= thresholds.get("error", 0):
                severity = AlertSeverity.ERROR
                threshold_value = thresholds["error"]
            elif metric.value <= thresholds.get("warning", 0):
                severity = AlertSeverity.WARNING
                threshold_value = thresholds["warning"]
        else:
            # For other metrics, higher is worse
            if metric.value >= thresholds.get("critical", float("inf")):
                severity = AlertSeverity.CRITICAL
                threshold_value = thresholds["critical"]
            elif metric.value >= thresholds.get("error", float("inf")):
                severity = AlertSeverity.ERROR
                threshold_value = thresholds["error"]
            elif metric.value >= thresholds.get("warning", float("inf")):
                severity = AlertSeverity.WARNING
                threshold_value = thresholds["warning"]

        # Create alert if threshold violated
        if severity:
            alert = EdgeAlert(
                alert_id=f"{alert_key}:{int(time.time())}",
                timestamp=datetime.now(),
                edge_node=metric.edge_node,
                severity=severity,
                metric_type=metric.metric_type,
                message=f"{metric.metric_type.value} threshold exceeded on {metric.edge_node}",
                current_value=metric.value,
                threshold=threshold_value,
                tags=metric.tags,
            )

            self.alerts.append(alert)
            self.alert_history[alert_key] = datetime.now()

    async def _health_check_loop(self):
        """Background task for health monitoring."""
        while self._running:
            try:
                # Check health of all known nodes
                for edge_node in list(self.node_start_times.keys()):
                    await self._check_node_health(edge_node)

                await asyncio.sleep(self.health_check_interval)

            except Exception as e:
                print(f"Health check error: {e}")
                await asyncio.sleep(0.1)  # Fast retry for tests

    async def _check_node_health(self, edge_node: str):
        """Check health of a specific node."""
        health = await self.get_edge_health(edge_node)

        # Calculate uptime
        if edge_node in self.node_start_times:
            uptime = (datetime.now() - self.node_start_times[edge_node]).total_seconds()
            health.uptime_seconds = uptime

        # Analyze recent metrics
        issues = []
        metrics_summary = {}

        for metric_type in MetricType:
            key = f"{edge_node}:{metric_type.value}"
            if key in self.metrics:
                recent_metrics = [
                    m
                    for m in self.metrics[key]
                    if (datetime.now() - m.timestamp).total_seconds() < 300
                ]  # Last 5 min

                if recent_metrics:
                    values = [m.value for m in recent_metrics]
                    metrics_summary[metric_type.value] = {
                        "current": values[-1],
                        "avg": statistics.mean(values),
                        "min": min(values),
                        "max": max(values),
                    }

        health.metrics_summary = metrics_summary

        # Determine overall status
        recent_alerts = await self.get_alerts(
            edge_node=edge_node,
            start_time=datetime.now() - timedelta(minutes=5),
            active_only=True,
        )

        critical_alerts = [
            a for a in recent_alerts if a.severity == AlertSeverity.CRITICAL
        ]
        error_alerts = [a for a in recent_alerts if a.severity == AlertSeverity.ERROR]

        if critical_alerts:
            health.status = HealthStatus.UNHEALTHY
            issues.extend([a.message for a in critical_alerts])
        elif error_alerts:
            health.status = HealthStatus.DEGRADED
            issues.extend([a.message for a in error_alerts])
        elif metrics_summary:
            health.status = HealthStatus.HEALTHY
        else:
            health.status = HealthStatus.UNKNOWN
            issues.append("No recent metrics received")

        health.issues = issues
        health.last_check = datetime.now()

    async def _cleanup_loop(self):
        """Background task for cleaning old data."""
        while self._running:
            try:
                cutoff_time = datetime.now() - timedelta(seconds=self.retention_period)

                # Clean metrics
                for key in list(self.metrics.keys()):
                    self.metrics[key] = deque(
                        (m for m in self.metrics[key] if m.timestamp > cutoff_time),
                        maxlen=10000,
                    )

                # Clean alerts
                self.alerts = [a for a in self.alerts if a.timestamp > cutoff_time]

                # Clean aggregated metrics (keep recent window)
                for node in self.aggregated_metrics:
                    for metric_type in self.aggregated_metrics[node]:
                        # Keep last 1000 values
                        if len(self.aggregated_metrics[node][metric_type]) > 1000:
                            self.aggregated_metrics[node][metric_type] = (
                                self.aggregated_metrics[node][metric_type][-1000:]
                            )

                await asyncio.sleep(1)  # Fast cleanup for tests

            except Exception as e:
                print(f"Cleanup error: {e}")
                await asyncio.sleep(0.1)  # Fast retry for tests

    async def _analytics_loop(self):
        """Background task for analytics and anomaly detection."""
        while self._running:
            try:
                # Update baselines
                for edge_node in self.aggregated_metrics:
                    self._update_baseline(edge_node)

                await asyncio.sleep(300)  # Run every 5 minutes

            except Exception as e:
                print(f"Analytics error: {e}")
                await asyncio.sleep(300)

    def _update_baseline(self, edge_node: str):
        """Update baseline metrics for anomaly detection."""
        for metric_type, values in self.aggregated_metrics[edge_node].items():
            if len(values) > 100:
                # Use median as baseline (more robust to outliers)
                self.baseline_metrics[edge_node][metric_type] = statistics.median(
                    values
                )

    def _detect_anomalies(self, edge_node: str) -> List[EdgeAlert]:
        """Detect anomalies in metrics."""
        anomalies = []

        if edge_node not in self.baseline_metrics:
            return anomalies

        for metric_type, baseline in self.baseline_metrics[edge_node].items():
            recent_values = self.aggregated_metrics[edge_node][metric_type][-10:]

            if not recent_values:
                continue

            current = statistics.mean(recent_values)

            # Simple anomaly detection: significant deviation from baseline
            deviation = abs(current - baseline) / baseline if baseline else 0

            if deviation > 0.5:  # 50% deviation
                anomaly = EdgeAlert(
                    alert_id=f"anomaly:{edge_node}:{metric_type.value}:{int(time.time())}",
                    timestamp=datetime.now(),
                    edge_node=edge_node,
                    severity=AlertSeverity.WARNING,
                    metric_type=metric_type,
                    message=f"Anomaly detected: {metric_type.value} deviates {deviation*100:.1f}% from baseline",
                    current_value=current,
                    threshold=baseline,
                    tags={"type": "anomaly", "deviation": str(deviation)},
                )
                anomalies.append(anomaly)

        return anomalies

    def _generate_recommendations(
        self, edge_node: str, analytics: Dict[str, Any]
    ) -> List[str]:
        """Generate recommendations based on analytics."""
        recommendations = []

        # Check metrics
        metrics = analytics.get("metrics_summary", {})

        # High latency
        if MetricType.LATENCY.value in metrics:
            latency = metrics[MetricType.LATENCY.value]
            if latency["p95"] > 1.0:
                recommendations.append(
                    f"Consider scaling {edge_node} - p95 latency is {latency['p95']:.2f}s"
                )

        # High error rate
        if MetricType.ERROR_RATE.value in metrics:
            error_rate = metrics[MetricType.ERROR_RATE.value]
            if error_rate["mean"] > 0.05:
                recommendations.append(
                    f"Investigate errors on {edge_node} - error rate is {error_rate['mean']*100:.1f}%"
                )

        # Resource usage
        if MetricType.RESOURCE_USAGE.value in metrics:
            resources = metrics[MetricType.RESOURCE_USAGE.value]
            if resources["p95"] > 0.8:
                recommendations.append(
                    f"Resource usage high on {edge_node} - consider scaling or optimization"
                )

        # Cache performance
        if MetricType.CACHE_HIT_RATE.value in metrics:
            cache = metrics[MetricType.CACHE_HIT_RATE.value]
            if cache["mean"] < 0.7:
                recommendations.append(
                    f"Low cache hit rate ({cache['mean']*100:.1f}%) - review caching strategy"
                )

        # Check trends
        trends = analytics.get("trends", {})

        for metric, trend in trends.items():
            if trend["direction"] == "increasing" and trend["change_percent"] > 20:
                if metric in [MetricType.LATENCY.value, MetricType.ERROR_RATE.value]:
                    recommendations.append(
                        f"{metric} increasing by {trend['change_percent']:.1f}% - investigate cause"
                    )

        return recommendations

    def set_threshold(self, metric_type: MetricType, severity: str, value: float):
        """Update alert threshold.

        Args:
            metric_type: Type of metric
            severity: Severity level (warning, error, critical)
            value: Threshold value
        """
        if metric_type not in self.alert_thresholds:
            self.alert_thresholds[metric_type] = {}

        self.alert_thresholds[metric_type][severity] = value

    def get_summary(self) -> Dict[str, Any]:
        """Get overall monitoring summary."""
        # Count nodes by health status
        health_counts = defaultdict(int)
        for health in self.health_status.values():
            health_counts[health.status.value] += 1

        # Recent alerts by severity
        recent_alerts = defaultdict(int)
        cutoff = datetime.now() - timedelta(hours=1)
        for alert in self.alerts:
            if alert.timestamp > cutoff:
                recent_alerts[alert.severity.value] += 1

        # Active nodes
        active_nodes = []
        cutoff = datetime.now() - timedelta(minutes=5)
        for node, metrics_dict in self.aggregated_metrics.items():
            if any(metrics_dict.values()):  # Has recent metrics
                active_nodes.append(node)

        return {
            "monitoring_active": self._running,
            "total_nodes": len(self.health_status),
            "active_nodes": len(active_nodes),
            "health_summary": dict(health_counts),
            "recent_alerts": dict(recent_alerts),
            "total_metrics": sum(len(m) for m in self.metrics.values()),
            "retention_period": self.retention_period,
            "anomaly_detection": self.anomaly_detection,
        }
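The new module can be exercised end to end with a short script. The following is a minimal usage sketch based only on the classes shown in this diff; the node name "edge-eu-1", the metric value, the tag, and the shortened health-check interval are illustrative assumptions, not values shipped with the package.

import asyncio
from datetime import datetime

from kailash.edge.monitoring.edge_monitor import EdgeMetric, EdgeMonitor, MetricType


async def main():
    # Illustrative settings; the defaults are 24h retention and a 30s health check.
    monitor = EdgeMonitor(health_check_interval=5, anomaly_detection=True)
    await monitor.start()

    # Record a latency sample for a hypothetical node "edge-eu-1".
    # 1.2s exceeds the default 1.0s "error" threshold, so an alert is expected.
    await monitor.record_metric(
        EdgeMetric(
            timestamp=datetime.now(),
            edge_node="edge-eu-1",
            metric_type=MetricType.LATENCY,
            value=1.2,
            tags={"region": "eu"},
        )
    )

    # Inspect the alert raised by the threshold check, then node health and summary.
    for alert in await monitor.get_alerts(edge_node="edge-eu-1"):
        print(alert.to_dict())

    health = await monitor.get_edge_health("edge-eu-1")
    print(health.to_dict())
    print(monitor.get_summary())

    await monitor.stop()


if __name__ == "__main__":
    asyncio.run(main())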