kailash 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. kailash/__init__.py +5 -11
  2. kailash/channels/__init__.py +2 -1
  3. kailash/channels/mcp_channel.py +23 -4
  4. kailash/cli/__init__.py +11 -1
  5. kailash/cli/validate_imports.py +202 -0
  6. kailash/cli/validation_audit.py +570 -0
  7. kailash/core/actors/supervisor.py +1 -1
  8. kailash/core/resilience/bulkhead.py +15 -5
  9. kailash/core/resilience/circuit_breaker.py +74 -1
  10. kailash/core/resilience/health_monitor.py +433 -33
  11. kailash/edge/compliance.py +33 -0
  12. kailash/edge/consistency.py +609 -0
  13. kailash/edge/coordination/__init__.py +30 -0
  14. kailash/edge/coordination/global_ordering.py +355 -0
  15. kailash/edge/coordination/leader_election.py +217 -0
  16. kailash/edge/coordination/partition_detector.py +296 -0
  17. kailash/edge/coordination/raft.py +485 -0
  18. kailash/edge/discovery.py +63 -1
  19. kailash/edge/migration/__init__.py +19 -0
  20. kailash/edge/migration/edge_migration_service.py +384 -0
  21. kailash/edge/migration/edge_migrator.py +832 -0
  22. kailash/edge/monitoring/__init__.py +21 -0
  23. kailash/edge/monitoring/edge_monitor.py +736 -0
  24. kailash/edge/prediction/__init__.py +10 -0
  25. kailash/edge/prediction/predictive_warmer.py +591 -0
  26. kailash/edge/resource/__init__.py +102 -0
  27. kailash/edge/resource/cloud_integration.py +796 -0
  28. kailash/edge/resource/cost_optimizer.py +949 -0
  29. kailash/edge/resource/docker_integration.py +919 -0
  30. kailash/edge/resource/kubernetes_integration.py +893 -0
  31. kailash/edge/resource/platform_integration.py +913 -0
  32. kailash/edge/resource/predictive_scaler.py +959 -0
  33. kailash/edge/resource/resource_analyzer.py +824 -0
  34. kailash/edge/resource/resource_pools.py +610 -0
  35. kailash/integrations/dataflow_edge.py +261 -0
  36. kailash/mcp_server/registry_integration.py +1 -1
  37. kailash/mcp_server/server.py +351 -8
  38. kailash/mcp_server/transports.py +305 -0
  39. kailash/middleware/gateway/event_store.py +1 -0
  40. kailash/monitoring/__init__.py +18 -0
  41. kailash/monitoring/alerts.py +646 -0
  42. kailash/monitoring/metrics.py +677 -0
  43. kailash/nodes/__init__.py +2 -0
  44. kailash/nodes/ai/semantic_memory.py +2 -2
  45. kailash/nodes/base.py +622 -1
  46. kailash/nodes/code/python.py +44 -3
  47. kailash/nodes/data/async_sql.py +42 -20
  48. kailash/nodes/edge/__init__.py +36 -0
  49. kailash/nodes/edge/base.py +240 -0
  50. kailash/nodes/edge/cloud_node.py +710 -0
  51. kailash/nodes/edge/coordination.py +239 -0
  52. kailash/nodes/edge/docker_node.py +825 -0
  53. kailash/nodes/edge/edge_data.py +582 -0
  54. kailash/nodes/edge/edge_migration_node.py +396 -0
  55. kailash/nodes/edge/edge_monitoring_node.py +421 -0
  56. kailash/nodes/edge/edge_state.py +673 -0
  57. kailash/nodes/edge/edge_warming_node.py +393 -0
  58. kailash/nodes/edge/kubernetes_node.py +652 -0
  59. kailash/nodes/edge/platform_node.py +766 -0
  60. kailash/nodes/edge/resource_analyzer_node.py +378 -0
  61. kailash/nodes/edge/resource_optimizer_node.py +501 -0
  62. kailash/nodes/edge/resource_scaler_node.py +397 -0
  63. kailash/nodes/governance.py +410 -0
  64. kailash/nodes/ports.py +676 -0
  65. kailash/nodes/rag/registry.py +1 -1
  66. kailash/nodes/transaction/distributed_transaction_manager.py +48 -1
  67. kailash/nodes/transaction/saga_state_storage.py +2 -1
  68. kailash/nodes/validation.py +8 -8
  69. kailash/runtime/local.py +374 -1
  70. kailash/runtime/validation/__init__.py +12 -0
  71. kailash/runtime/validation/connection_context.py +119 -0
  72. kailash/runtime/validation/enhanced_error_formatter.py +202 -0
  73. kailash/runtime/validation/error_categorizer.py +164 -0
  74. kailash/runtime/validation/import_validator.py +446 -0
  75. kailash/runtime/validation/metrics.py +380 -0
  76. kailash/runtime/validation/performance.py +615 -0
  77. kailash/runtime/validation/suggestion_engine.py +212 -0
  78. kailash/testing/fixtures.py +2 -2
  79. kailash/utils/data_paths.py +74 -0
  80. kailash/workflow/builder.py +413 -8
  81. kailash/workflow/contracts.py +418 -0
  82. kailash/workflow/edge_infrastructure.py +369 -0
  83. kailash/workflow/mermaid_visualizer.py +3 -1
  84. kailash/workflow/migration.py +3 -3
  85. kailash/workflow/templates.py +6 -6
  86. kailash/workflow/type_inference.py +669 -0
  87. kailash/workflow/validation.py +134 -3
  88. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/METADATA +52 -34
  89. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/RECORD +93 -42
  90. kailash/nexus/__init__.py +0 -21
  91. kailash/nexus/cli/__init__.py +0 -5
  92. kailash/nexus/cli/__main__.py +0 -6
  93. kailash/nexus/cli/main.py +0 -176
  94. kailash/nexus/factory.py +0 -413
  95. kailash/nexus/gateway.py +0 -545
  96. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/WHEEL +0 -0
  97. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/entry_points.txt +0 -0
  98. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/licenses/LICENSE +0 -0
  99. {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/top_level.txt +0 -0
kailash/edge/monitoring/edge_monitor.py
@@ -0,0 +1,736 @@
+"""Edge monitoring service for comprehensive edge observability.
+
+This service provides real-time monitoring, alerting, and analytics
+for edge node operations, performance, and health.
+"""
+
+import asyncio
+import statistics
+import time
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from enum import Enum
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+
+class MetricType(Enum):
+    """Types of metrics collected."""
+
+    LATENCY = "latency"
+    THROUGHPUT = "throughput"
+    ERROR_RATE = "error_rate"
+    RESOURCE_USAGE = "resource_usage"
+    AVAILABILITY = "availability"
+    CACHE_HIT_RATE = "cache_hit_rate"
+    MIGRATION_TIME = "migration_time"
+    COORDINATION_OVERHEAD = "coordination_overhead"
+
+
+class AlertSeverity(Enum):
+    """Alert severity levels."""
+
+    INFO = "info"
+    WARNING = "warning"
+    ERROR = "error"
+    CRITICAL = "critical"
+
+
+class HealthStatus(Enum):
+    """Edge node health status."""
+
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class EdgeMetric:
+    """Represents a single metric measurement."""
+
+    timestamp: datetime
+    edge_node: str
+    metric_type: MetricType
+    value: float
+    tags: Dict[str, str] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "timestamp": self.timestamp.isoformat(),
+            "edge_node": self.edge_node,
+            "metric_type": self.metric_type.value,
+            "value": self.value,
+            "tags": self.tags,
+        }
+
+
+@dataclass
+class EdgeAlert:
+    """Represents an alert for edge issues."""
+
+    alert_id: str
+    timestamp: datetime
+    edge_node: str
+    severity: AlertSeverity
+    metric_type: MetricType
+    message: str
+    current_value: float
+    threshold: float
+    tags: Dict[str, str] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "alert_id": self.alert_id,
+            "timestamp": self.timestamp.isoformat(),
+            "edge_node": self.edge_node,
+            "severity": self.severity.value,
+            "metric_type": self.metric_type.value,
+            "message": self.message,
+            "current_value": self.current_value,
+            "threshold": self.threshold,
+            "tags": self.tags,
+        }
+
+
+@dataclass
+class EdgeHealth:
+    """Edge node health information."""
+
+    edge_node: str
+    status: HealthStatus
+    last_check: datetime
+    uptime_seconds: float
+    metrics_summary: Dict[str, float]
+    issues: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "edge_node": self.edge_node,
+            "status": self.status.value,
+            "last_check": self.last_check.isoformat(),
+            "uptime_seconds": self.uptime_seconds,
+            "metrics_summary": self.metrics_summary,
+            "issues": self.issues,
+        }
+
+
+class EdgeMonitor:
+    """Edge monitoring service for observability and alerting.
+
+    Provides comprehensive monitoring of edge nodes including:
+    - Real-time metrics collection
+    - Health monitoring
+    - Alerting based on thresholds
+    - Performance analytics
+    - Anomaly detection
+    """
+
+    def __init__(
+        self,
+        retention_period: int = 24 * 60 * 60,  # 24 hours
+        alert_cooldown: int = 300,  # 5 minutes
+        health_check_interval: int = 30,  # 30 seconds
+        anomaly_detection: bool = True,
+    ):
+        """Initialize edge monitor.
+
+        Args:
+            retention_period: How long to retain metrics (seconds)
+            alert_cooldown: Cooldown between alerts for same issue
+            health_check_interval: Interval between health checks
+            anomaly_detection: Enable anomaly detection
+        """
+        self.retention_period = retention_period
+        self.alert_cooldown = alert_cooldown
+        self.health_check_interval = health_check_interval
+        self.anomaly_detection = anomaly_detection
+
+        # Metrics storage
+        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=10000))
+        self.aggregated_metrics: Dict[str, Dict[str, List[float]]] = defaultdict(
+            lambda: defaultdict(list)
+        )
+
+        # Health tracking
+        self.health_status: Dict[str, EdgeHealth] = {}
+        self.node_start_times: Dict[str, datetime] = {}
+
+        # Alerting
+        self.alerts: List[EdgeAlert] = []
+        self.alert_history: Dict[str, datetime] = {}
+        self.alert_thresholds: Dict[MetricType, Dict[str, float]] = {
+            MetricType.LATENCY: {"warning": 0.5, "error": 1.0, "critical": 2.0},
+            MetricType.ERROR_RATE: {"warning": 0.05, "error": 0.1, "critical": 0.2},
+            MetricType.RESOURCE_USAGE: {
+                "warning": 0.7,
+                "error": 0.85,
+                "critical": 0.95,
+            },
+            MetricType.AVAILABILITY: {"warning": 0.99, "error": 0.95, "critical": 0.9},
+            MetricType.CACHE_HIT_RATE: {"warning": 0.7, "error": 0.5, "critical": 0.3},
+        }
+
+        # Analytics
+        self.baseline_metrics: Dict[str, Dict[MetricType, float]] = defaultdict(dict)
+
+        # Background tasks
+        self._running = False
+        self._health_check_task = None
+        self._cleanup_task = None
+        self._analytics_task = None
+
+    async def start(self):
+        """Start monitoring service."""
+        self._running = True
+        self._health_check_task = asyncio.create_task(self._health_check_loop())
+        self._cleanup_task = asyncio.create_task(self._cleanup_loop())
+        if self.anomaly_detection:
+            self._analytics_task = asyncio.create_task(self._analytics_loop())
+
+    async def stop(self):
+        """Stop monitoring service."""
+        self._running = False
+
+        tasks = [self._health_check_task, self._cleanup_task, self._analytics_task]
+        for task in tasks:
+            if task:
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
+
+    async def record_metric(self, metric: EdgeMetric):
+        """Record a metric measurement.
+
+        Args:
+            metric: Metric to record
+        """
+        # Store in time-series
+        key = f"{metric.edge_node}:{metric.metric_type.value}"
+        self.metrics[key].append(metric)
+
+        # Update aggregated metrics for fast queries
+        self.aggregated_metrics[metric.edge_node][metric.metric_type].append(
+            metric.value
+        )
+
+        # Check thresholds and generate alerts
+        await self._check_thresholds(metric)
+
+        # Update node tracking
+        if metric.edge_node not in self.node_start_times:
+            self.node_start_times[metric.edge_node] = datetime.now()
+
+    async def get_metrics(
+        self,
+        edge_node: Optional[str] = None,
+        metric_type: Optional[MetricType] = None,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+        tags: Optional[Dict[str, str]] = None,
+    ) -> List[EdgeMetric]:
+        """Query metrics with filters.
+
+        Args:
+            edge_node: Filter by edge node
+            metric_type: Filter by metric type
+            start_time: Start of time range
+            end_time: End of time range
+            tags: Filter by tags
+
+        Returns:
+            List of matching metrics
+        """
+        results = []
+
+        # Determine keys to search
+        if edge_node and metric_type:
+            keys = [f"{edge_node}:{metric_type.value}"]
+        elif edge_node:
+            keys = [k for k in self.metrics.keys() if k.startswith(f"{edge_node}:")]
+        elif metric_type:
+            keys = [
+                k for k in self.metrics.keys() if k.endswith(f":{metric_type.value}")
+            ]
+        else:
+            keys = list(self.metrics.keys())
+
+        # Filter metrics
+        for key in keys:
+            for metric in self.metrics[key]:
+                # Time range filter
+                if start_time and metric.timestamp < start_time:
+                    continue
+                if end_time and metric.timestamp > end_time:
+                    continue
+
+                # Tag filter
+                if tags:
+                    if not all(metric.tags.get(k) == v for k, v in tags.items()):
+                        continue
+
+                results.append(metric)
+
+        return sorted(results, key=lambda m: m.timestamp)
+
+    async def get_edge_health(self, edge_node: str) -> EdgeHealth:
+        """Get health status for an edge node.
+
+        Args:
+            edge_node: Edge node identifier
+
+        Returns:
+            Health status
+        """
+        if edge_node in self.health_status:
+            return self.health_status[edge_node]
+
+        # Create new health entry
+        health = EdgeHealth(
+            edge_node=edge_node,
+            status=HealthStatus.UNKNOWN,
+            last_check=datetime.now(),
+            uptime_seconds=0,
+            metrics_summary={},
+        )
+
+        self.health_status[edge_node] = health
+        return health
+
+    async def get_alerts(
+        self,
+        edge_node: Optional[str] = None,
+        severity: Optional[AlertSeverity] = None,
+        start_time: Optional[datetime] = None,
+        active_only: bool = False,
+    ) -> List[EdgeAlert]:
+        """Get alerts with filters.
+
+        Args:
+            edge_node: Filter by edge node
+            severity: Filter by severity
+            start_time: Filter alerts after this time
+            active_only: Only return active alerts
+
+        Returns:
+            List of matching alerts
+        """
+        results = []
+
+        for alert in self.alerts:
+            # Edge node filter
+            if edge_node and alert.edge_node != edge_node:
+                continue
+
+            # Severity filter
+            if severity and alert.severity != severity:
+                continue
+
+            # Time filter
+            if start_time and alert.timestamp < start_time:
+                continue
+
+            # Active filter
+            if active_only:
+                # Check if alert is still active (within cooldown)
+                key = f"{alert.edge_node}:{alert.metric_type.value}"
+                if key in self.alert_history:
+                    if (
+                        datetime.now() - self.alert_history[key]
+                    ).total_seconds() > self.alert_cooldown:
+                        continue
+
+            results.append(alert)
+
+        return sorted(results, key=lambda a: a.timestamp, reverse=True)
+
+    def get_analytics(self, edge_node: str) -> Dict[str, Any]:
+        """Get analytics for an edge node.
+
+        Args:
+            edge_node: Edge node identifier
+
+        Returns:
+            Analytics summary
+        """
+        analytics = {
+            "edge_node": edge_node,
+            "metrics_summary": {},
+            "trends": {},
+            "anomalies": [],
+            "recommendations": [],
+        }
+
+        # Calculate summaries for each metric type
+        for metric_type, values in self.aggregated_metrics[edge_node].items():
+            if not values:
+                continue
+
+            # Basic statistics
+            analytics["metrics_summary"][metric_type.value] = {
+                "count": len(values),
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "std_dev": statistics.stdev(values) if len(values) > 1 else 0,
+                "min": min(values),
+                "max": max(values),
+                "p95": sorted(values)[int(len(values) * 0.95)] if values else 0,
+                "p99": sorted(values)[int(len(values) * 0.99)] if values else 0,
+            }
+
+            # Trend analysis (simple moving average)
+            if len(values) > 10:
+                recent = values[-10:]
+                older = values[-20:-10] if len(values) > 20 else values[:10]
+
+                recent_avg = statistics.mean(recent)
+                older_avg = statistics.mean(older)
+
+                trend = "stable"
+                if recent_avg > older_avg * 1.1:
+                    trend = "increasing"
+                elif recent_avg < older_avg * 0.9:
+                    trend = "decreasing"
+
+                analytics["trends"][metric_type.value] = {
+                    "direction": trend,
+                    "change_percent": (
+                        ((recent_avg - older_avg) / older_avg * 100) if older_avg else 0
+                    ),
+                }
+
+        # Detect anomalies
+        if self.anomaly_detection:
+            anomalies = self._detect_anomalies(edge_node)
+            analytics["anomalies"] = [a.to_dict() for a in anomalies]
+
+        # Generate recommendations
+        analytics["recommendations"] = self._generate_recommendations(
+            edge_node, analytics
+        )
+
+        return analytics
+
+    async def _check_thresholds(self, metric: EdgeMetric):
+        """Check if metric violates thresholds and create alerts."""
+        if metric.metric_type not in self.alert_thresholds:
+            return
+
+        thresholds = self.alert_thresholds[metric.metric_type]
+        alert_key = f"{metric.edge_node}:{metric.metric_type.value}"
+
+        # Check cooldown
+        if alert_key in self.alert_history:
+            if (
+                datetime.now() - self.alert_history[alert_key]
+            ).total_seconds() < self.alert_cooldown:
+                return
+
+        # Determine severity
+        severity = None
+        threshold_value = None
+
+        # For availability and cache hit rate, lower is worse
+        if metric.metric_type in [MetricType.AVAILABILITY, MetricType.CACHE_HIT_RATE]:
+            if metric.value <= thresholds.get("critical", 0):
+                severity = AlertSeverity.CRITICAL
+                threshold_value = thresholds["critical"]
+            elif metric.value <= thresholds.get("error", 0):
+                severity = AlertSeverity.ERROR
+                threshold_value = thresholds["error"]
+            elif metric.value <= thresholds.get("warning", 0):
+                severity = AlertSeverity.WARNING
+                threshold_value = thresholds["warning"]
+        else:
+            # For other metrics, higher is worse
+            if metric.value >= thresholds.get("critical", float("inf")):
+                severity = AlertSeverity.CRITICAL
+                threshold_value = thresholds["critical"]
+            elif metric.value >= thresholds.get("error", float("inf")):
+                severity = AlertSeverity.ERROR
+                threshold_value = thresholds["error"]
+            elif metric.value >= thresholds.get("warning", float("inf")):
+                severity = AlertSeverity.WARNING
+                threshold_value = thresholds["warning"]
+
+        # Create alert if threshold violated
+        if severity:
+            alert = EdgeAlert(
+                alert_id=f"{alert_key}:{int(time.time())}",
+                timestamp=datetime.now(),
+                edge_node=metric.edge_node,
+                severity=severity,
+                metric_type=metric.metric_type,
+                message=f"{metric.metric_type.value} threshold exceeded on {metric.edge_node}",
+                current_value=metric.value,
+                threshold=threshold_value,
+                tags=metric.tags,
+            )
+
+            self.alerts.append(alert)
+            self.alert_history[alert_key] = datetime.now()
+
+    async def _health_check_loop(self):
+        """Background task for health monitoring."""
+        while self._running:
+            try:
+                # Check health of all known nodes
+                for edge_node in list(self.node_start_times.keys()):
+                    await self._check_node_health(edge_node)
+
+                await asyncio.sleep(self.health_check_interval)
+
+            except Exception as e:
+                print(f"Health check error: {e}")
+                await asyncio.sleep(0.1)  # Fast retry for tests
+
+    async def _check_node_health(self, edge_node: str):
+        """Check health of a specific node."""
+        health = await self.get_edge_health(edge_node)
+
+        # Calculate uptime
+        if edge_node in self.node_start_times:
+            uptime = (datetime.now() - self.node_start_times[edge_node]).total_seconds()
+            health.uptime_seconds = uptime
+
+        # Analyze recent metrics
+        issues = []
+        metrics_summary = {}
+
+        for metric_type in MetricType:
+            key = f"{edge_node}:{metric_type.value}"
+            if key in self.metrics:
+                recent_metrics = [
+                    m
+                    for m in self.metrics[key]
+                    if (datetime.now() - m.timestamp).total_seconds() < 300
+                ]  # Last 5 min
+
+                if recent_metrics:
+                    values = [m.value for m in recent_metrics]
+                    metrics_summary[metric_type.value] = {
+                        "current": values[-1],
+                        "avg": statistics.mean(values),
+                        "min": min(values),
+                        "max": max(values),
+                    }
+
+        health.metrics_summary = metrics_summary
+
+        # Determine overall status
+        recent_alerts = await self.get_alerts(
+            edge_node=edge_node,
+            start_time=datetime.now() - timedelta(minutes=5),
+            active_only=True,
+        )
+
+        critical_alerts = [
+            a for a in recent_alerts if a.severity == AlertSeverity.CRITICAL
+        ]
+        error_alerts = [a for a in recent_alerts if a.severity == AlertSeverity.ERROR]
+
+        if critical_alerts:
+            health.status = HealthStatus.UNHEALTHY
+            issues.extend([a.message for a in critical_alerts])
+        elif error_alerts:
+            health.status = HealthStatus.DEGRADED
+            issues.extend([a.message for a in error_alerts])
+        elif metrics_summary:
+            health.status = HealthStatus.HEALTHY
+        else:
+            health.status = HealthStatus.UNKNOWN
+            issues.append("No recent metrics received")
+
+        health.issues = issues
+        health.last_check = datetime.now()
+
+    async def _cleanup_loop(self):
+        """Background task for cleaning old data."""
+        while self._running:
+            try:
+                cutoff_time = datetime.now() - timedelta(seconds=self.retention_period)
+
+                # Clean metrics
+                for key in list(self.metrics.keys()):
+                    self.metrics[key] = deque(
+                        (m for m in self.metrics[key] if m.timestamp > cutoff_time),
+                        maxlen=10000,
+                    )
+
+                # Clean alerts
+                self.alerts = [a for a in self.alerts if a.timestamp > cutoff_time]
+
+                # Clean aggregated metrics (keep recent window)
+                for node in self.aggregated_metrics:
+                    for metric_type in self.aggregated_metrics[node]:
+                        # Keep last 1000 values
+                        if len(self.aggregated_metrics[node][metric_type]) > 1000:
+                            self.aggregated_metrics[node][metric_type] = (
+                                self.aggregated_metrics[node][metric_type][-1000:]
+                            )
+
+                await asyncio.sleep(1)  # Fast cleanup for tests
+
+            except Exception as e:
+                print(f"Cleanup error: {e}")
+                await asyncio.sleep(0.1)  # Fast retry for tests
+
+    async def _analytics_loop(self):
+        """Background task for analytics and anomaly detection."""
+        while self._running:
+            try:
+                # Update baselines
+                for edge_node in self.aggregated_metrics:
+                    self._update_baseline(edge_node)
+
+                await asyncio.sleep(300)  # Run every 5 minutes
+
+            except Exception as e:
+                print(f"Analytics error: {e}")
+                await asyncio.sleep(300)
+
+    def _update_baseline(self, edge_node: str):
+        """Update baseline metrics for anomaly detection."""
+        for metric_type, values in self.aggregated_metrics[edge_node].items():
+            if len(values) > 100:
+                # Use median as baseline (more robust to outliers)
+                self.baseline_metrics[edge_node][metric_type] = statistics.median(
+                    values
+                )
+
+    def _detect_anomalies(self, edge_node: str) -> List[EdgeAlert]:
+        """Detect anomalies in metrics."""
+        anomalies = []
+
+        if edge_node not in self.baseline_metrics:
+            return anomalies
+
+        for metric_type, baseline in self.baseline_metrics[edge_node].items():
+            recent_values = self.aggregated_metrics[edge_node][metric_type][-10:]
+
+            if not recent_values:
+                continue
+
+            current = statistics.mean(recent_values)
+
+            # Simple anomaly detection: significant deviation from baseline
+            deviation = abs(current - baseline) / baseline if baseline else 0
+
+            if deviation > 0.5:  # 50% deviation
+                anomaly = EdgeAlert(
+                    alert_id=f"anomaly:{edge_node}:{metric_type.value}:{int(time.time())}",
+                    timestamp=datetime.now(),
+                    edge_node=edge_node,
+                    severity=AlertSeverity.WARNING,
+                    metric_type=metric_type,
+                    message=f"Anomaly detected: {metric_type.value} deviates {deviation*100:.1f}% from baseline",
+                    current_value=current,
+                    threshold=baseline,
+                    tags={"type": "anomaly", "deviation": str(deviation)},
+                )
+                anomalies.append(anomaly)
+
+        return anomalies
+
+    def _generate_recommendations(
+        self, edge_node: str, analytics: Dict[str, Any]
+    ) -> List[str]:
+        """Generate recommendations based on analytics."""
+        recommendations = []
+
+        # Check metrics
+        metrics = analytics.get("metrics_summary", {})
+
+        # High latency
+        if MetricType.LATENCY.value in metrics:
+            latency = metrics[MetricType.LATENCY.value]
+            if latency["p95"] > 1.0:
+                recommendations.append(
+                    f"Consider scaling {edge_node} - p95 latency is {latency['p95']:.2f}s"
+                )
+
+        # High error rate
+        if MetricType.ERROR_RATE.value in metrics:
+            error_rate = metrics[MetricType.ERROR_RATE.value]
+            if error_rate["mean"] > 0.05:
+                recommendations.append(
+                    f"Investigate errors on {edge_node} - error rate is {error_rate['mean']*100:.1f}%"
+                )
+
+        # Resource usage
+        if MetricType.RESOURCE_USAGE.value in metrics:
+            resources = metrics[MetricType.RESOURCE_USAGE.value]
+            if resources["p95"] > 0.8:
+                recommendations.append(
+                    f"Resource usage high on {edge_node} - consider scaling or optimization"
+                )
+
+        # Cache performance
+        if MetricType.CACHE_HIT_RATE.value in metrics:
+            cache = metrics[MetricType.CACHE_HIT_RATE.value]
+            if cache["mean"] < 0.7:
+                recommendations.append(
+                    f"Low cache hit rate ({cache['mean']*100:.1f}%) - review caching strategy"
+                )
+
+        # Check trends
+        trends = analytics.get("trends", {})
+
+        for metric, trend in trends.items():
+            if trend["direction"] == "increasing" and trend["change_percent"] > 20:
+                if metric in [MetricType.LATENCY.value, MetricType.ERROR_RATE.value]:
+                    recommendations.append(
+                        f"{metric} increasing by {trend['change_percent']:.1f}% - investigate cause"
+                    )
+
+        return recommendations
+
+    def set_threshold(self, metric_type: MetricType, severity: str, value: float):
+        """Update alert threshold.
+
+        Args:
+            metric_type: Type of metric
+            severity: Severity level (warning, error, critical)
+            value: Threshold value
+        """
+        if metric_type not in self.alert_thresholds:
+            self.alert_thresholds[metric_type] = {}
+
+        self.alert_thresholds[metric_type][severity] = value
+
+    def get_summary(self) -> Dict[str, Any]:
+        """Get overall monitoring summary."""
+        # Count nodes by health status
+        health_counts = defaultdict(int)
+        for health in self.health_status.values():
+            health_counts[health.status.value] += 1
+
+        # Recent alerts by severity
+        recent_alerts = defaultdict(int)
+        cutoff = datetime.now() - timedelta(hours=1)
+        for alert in self.alerts:
+            if alert.timestamp > cutoff:
+                recent_alerts[alert.severity.value] += 1
+
+        # Active nodes
+        active_nodes = []
+        cutoff = datetime.now() - timedelta(minutes=5)
+        for node, metrics_dict in self.aggregated_metrics.items():
+            if any(metrics_dict.values()):  # Has recent metrics
+                active_nodes.append(node)
+
+        return {
+            "monitoring_active": self._running,
+            "total_nodes": len(self.health_status),
+            "active_nodes": len(active_nodes),
+            "health_summary": dict(health_counts),
+            "recent_alerts": dict(recent_alerts),
+            "total_metrics": sum(len(m) for m in self.metrics.values()),
+            "retention_period": self.retention_period,
+            "anomaly_detection": self.anomaly_detection,
+        }
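
For context, a minimal sketch of how the new EdgeMonitor might be exercised. It uses only the classes and methods shown in the hunk above; the import path is assumed from the kailash/edge/monitoring/__init__.py and edge_monitor.py entries in the file list, and the node name "edge-us-west-1" and 1.2 s latency sample are illustrative values, not part of the package.

# Hedged usage sketch, not part of the diff; assumes the module is importable as below.
import asyncio
from datetime import datetime

from kailash.edge.monitoring.edge_monitor import EdgeMetric, EdgeMonitor, MetricType


async def main():
    monitor = EdgeMonitor(health_check_interval=5)
    await monitor.start()

    # Record a latency sample above the default 1.0 s "error" threshold.
    await monitor.record_metric(
        EdgeMetric(
            timestamp=datetime.now(),
            edge_node="edge-us-west-1",  # illustrative node name
            metric_type=MetricType.LATENCY,
            value=1.2,
        )
    )

    alerts = await monitor.get_alerts(edge_node="edge-us-west-1")
    print([a.to_dict() for a in alerts])  # expect one error-severity alert
    print(monitor.get_summary())

    await monitor.stop()


asyncio.run(main())

With the default LATENCY thresholds ({"warning": 0.5, "error": 1.0, "critical": 2.0}), the 1.2 s sample should surface as an ERROR-severity alert in both the alert list and the summary counts.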