kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. kailash/__init__.py +1 -7
  2. kailash/cli/__init__.py +11 -1
  3. kailash/cli/validation_audit.py +570 -0
  4. kailash/core/actors/supervisor.py +1 -1
  5. kailash/core/resilience/circuit_breaker.py +71 -1
  6. kailash/core/resilience/health_monitor.py +172 -0
  7. kailash/edge/compliance.py +33 -0
  8. kailash/edge/consistency.py +609 -0
  9. kailash/edge/coordination/__init__.py +30 -0
  10. kailash/edge/coordination/global_ordering.py +355 -0
  11. kailash/edge/coordination/leader_election.py +217 -0
  12. kailash/edge/coordination/partition_detector.py +296 -0
  13. kailash/edge/coordination/raft.py +485 -0
  14. kailash/edge/discovery.py +63 -1
  15. kailash/edge/migration/__init__.py +19 -0
  16. kailash/edge/migration/edge_migrator.py +832 -0
  17. kailash/edge/monitoring/__init__.py +21 -0
  18. kailash/edge/monitoring/edge_monitor.py +736 -0
  19. kailash/edge/prediction/__init__.py +10 -0
  20. kailash/edge/prediction/predictive_warmer.py +591 -0
  21. kailash/edge/resource/__init__.py +102 -0
  22. kailash/edge/resource/cloud_integration.py +796 -0
  23. kailash/edge/resource/cost_optimizer.py +949 -0
  24. kailash/edge/resource/docker_integration.py +919 -0
  25. kailash/edge/resource/kubernetes_integration.py +893 -0
  26. kailash/edge/resource/platform_integration.py +913 -0
  27. kailash/edge/resource/predictive_scaler.py +959 -0
  28. kailash/edge/resource/resource_analyzer.py +824 -0
  29. kailash/edge/resource/resource_pools.py +610 -0
  30. kailash/integrations/dataflow_edge.py +261 -0
  31. kailash/mcp_server/registry_integration.py +1 -1
  32. kailash/monitoring/__init__.py +18 -0
  33. kailash/monitoring/alerts.py +646 -0
  34. kailash/monitoring/metrics.py +677 -0
  35. kailash/nodes/__init__.py +2 -0
  36. kailash/nodes/ai/__init__.py +17 -0
  37. kailash/nodes/ai/a2a.py +1914 -43
  38. kailash/nodes/ai/a2a_backup.py +1807 -0
  39. kailash/nodes/ai/hybrid_search.py +972 -0
  40. kailash/nodes/ai/semantic_memory.py +558 -0
  41. kailash/nodes/ai/streaming_analytics.py +947 -0
  42. kailash/nodes/base.py +545 -0
  43. kailash/nodes/edge/__init__.py +36 -0
  44. kailash/nodes/edge/base.py +240 -0
  45. kailash/nodes/edge/cloud_node.py +710 -0
  46. kailash/nodes/edge/coordination.py +239 -0
  47. kailash/nodes/edge/docker_node.py +825 -0
  48. kailash/nodes/edge/edge_data.py +582 -0
  49. kailash/nodes/edge/edge_migration_node.py +392 -0
  50. kailash/nodes/edge/edge_monitoring_node.py +421 -0
  51. kailash/nodes/edge/edge_state.py +673 -0
  52. kailash/nodes/edge/edge_warming_node.py +393 -0
  53. kailash/nodes/edge/kubernetes_node.py +652 -0
  54. kailash/nodes/edge/platform_node.py +766 -0
  55. kailash/nodes/edge/resource_analyzer_node.py +378 -0
  56. kailash/nodes/edge/resource_optimizer_node.py +501 -0
  57. kailash/nodes/edge/resource_scaler_node.py +397 -0
  58. kailash/nodes/ports.py +676 -0
  59. kailash/runtime/local.py +344 -1
  60. kailash/runtime/validation/__init__.py +20 -0
  61. kailash/runtime/validation/connection_context.py +119 -0
  62. kailash/runtime/validation/enhanced_error_formatter.py +202 -0
  63. kailash/runtime/validation/error_categorizer.py +164 -0
  64. kailash/runtime/validation/metrics.py +380 -0
  65. kailash/runtime/validation/performance.py +615 -0
  66. kailash/runtime/validation/suggestion_engine.py +212 -0
  67. kailash/testing/fixtures.py +2 -2
  68. kailash/workflow/builder.py +234 -8
  69. kailash/workflow/contracts.py +418 -0
  70. kailash/workflow/edge_infrastructure.py +369 -0
  71. kailash/workflow/migration.py +3 -3
  72. kailash/workflow/type_inference.py +669 -0
  73. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
  74. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
  75. kailash/nexus/__init__.py +0 -21
  76. kailash/nexus/cli/__init__.py +0 -5
  77. kailash/nexus/cli/__main__.py +0 -6
  78. kailash/nexus/cli/main.py +0 -176
  79. kailash/nexus/factory.py +0 -413
  80. kailash/nexus/gateway.py +0 -545
  81. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
  82. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
  83. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
  84. {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
kailash/nodes/ai/streaming_analytics.py (new file)
@@ -0,0 +1,947 @@
"""
Streaming analytics and performance monitoring for A2A systems.

This module provides real-time streaming capabilities and performance dashboards
for monitoring A2A agent interactions, task execution, and system health.
"""

import asyncio
import json
import statistics
import time
import weakref
from collections import defaultdict, deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Set
from uuid import uuid4

from ..base import Node, NodeParameter, register_node


class MetricType(Enum):
    """Types of metrics that can be collected."""

    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
    TIMER = "timer"
    RATE = "rate"


class AlertSeverity(Enum):
    """Alert severity levels."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


@dataclass
class MetricValue:
    """A single metric value with timestamp."""

    value: float
    timestamp: datetime
    labels: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "value": self.value,
            "timestamp": self.timestamp.isoformat(),
            "labels": self.labels,
        }


@dataclass
class StreamEvent:
    """A streaming event in the A2A system."""

    event_id: str
    event_type: str
    source: str
    timestamp: datetime
    data: Dict[str, Any]
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "event_id": self.event_id,
            "event_type": self.event_type,
            "source": self.source,
            "timestamp": self.timestamp.isoformat(),
            "data": self.data,
            "metadata": self.metadata,
        }


@dataclass
class Alert:
    """System alert based on metrics."""

    alert_id: str
    name: str
    severity: AlertSeverity
    message: str
    timestamp: datetime
    metric_name: str
    metric_value: float
    threshold: float
    resolved: bool = False
    resolved_at: Optional[datetime] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "alert_id": self.alert_id,
            "name": self.name,
            "severity": self.severity.value,
            "message": self.message,
            "timestamp": self.timestamp.isoformat(),
            "metric_name": self.metric_name,
            "metric_value": self.metric_value,
            "threshold": self.threshold,
            "resolved": self.resolved,
            "resolved_at": self.resolved_at.isoformat() if self.resolved_at else None,
        }


class MetricsCollector:
    """Collects and manages metrics for streaming analytics."""

    def __init__(self, max_retention_hours: int = 24):
        self.max_retention_hours = max_retention_hours
        self.metrics: Dict[str, List[MetricValue]] = defaultdict(list)
        self.metric_types: Dict[str, MetricType] = {}
        self.alert_rules: Dict[str, Dict[str, Any]] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self._lock = asyncio.Lock()

    async def record_metric(
        self,
        name: str,
        value: float,
        metric_type: MetricType = MetricType.GAUGE,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Record a metric value."""
        async with self._lock:
            metric_value = MetricValue(
                value=value, timestamp=datetime.now(), labels=labels or {}
            )

            self.metrics[name].append(metric_value)
            self.metric_types[name] = metric_type

            # Clean up old metrics
            await self._cleanup_old_metrics(name)

            # Check alert rules
            await self._check_alert_rules(name, value)

    async def increment_counter(
        self, name: str, value: float = 1.0, labels: Optional[Dict[str, str]] = None
    ):
        """Increment a counter metric."""
        await self.record_metric(name, value, MetricType.COUNTER, labels)

    async def set_gauge(
        self, name: str, value: float, labels: Optional[Dict[str, str]] = None
    ):
        """Set a gauge metric."""
        await self.record_metric(name, value, MetricType.GAUGE, labels)

    async def record_timer(
        self, name: str, duration: float, labels: Optional[Dict[str, str]] = None
    ):
        """Record a timer metric."""
        await self.record_metric(name, duration, MetricType.TIMER, labels)

    async def get_metric_values(
        self,
        name: str,
        since: Optional[datetime] = None,
        labels: Optional[Dict[str, str]] = None,
    ) -> List[MetricValue]:
        """Get metric values with optional filtering."""
        async with self._lock:
            values = self.metrics.get(name, [])

            if since:
                values = [v for v in values if v.timestamp >= since]

            if labels:
                values = [
                    v
                    for v in values
                    if all(v.labels.get(k) == val for k, val in labels.items())
                ]

            return values

    async def get_metric_stats(
        self, name: str, since: Optional[datetime] = None
    ) -> Dict[str, float]:
        """Get statistical summary of a metric."""
        values = await self.get_metric_values(name, since)

        if not values:
            return {}

        numeric_values = [v.value for v in values]

        return {
            "count": len(numeric_values),
            "min": min(numeric_values),
            "max": max(numeric_values),
            "mean": statistics.mean(numeric_values),
            "median": statistics.median(numeric_values),
            "stddev": (
                statistics.stdev(numeric_values) if len(numeric_values) > 1 else 0.0
            ),
            "sum": sum(numeric_values),
        }

    async def add_alert_rule(
        self,
        name: str,
        metric_name: str,
        threshold: float,
        condition: str = "greater_than",
        severity: AlertSeverity = AlertSeverity.MEDIUM,
        message: str = "",
    ):
        """Add an alert rule."""
        self.alert_rules[name] = {
            "metric_name": metric_name,
            "threshold": threshold,
            "condition": condition,
            "severity": severity,
            "message": message or f"{metric_name} {condition} {threshold}",
        }

    async def _cleanup_old_metrics(self, name: str):
        """Clean up old metric values."""
        cutoff_time = datetime.now() - timedelta(hours=self.max_retention_hours)
        self.metrics[name] = [
            v for v in self.metrics[name] if v.timestamp >= cutoff_time
        ]

    async def _check_alert_rules(self, metric_name: str, value: float):
        """Check if metric value triggers any alerts."""
        for rule_name, rule in self.alert_rules.items():
            if rule["metric_name"] != metric_name:
                continue

            condition = rule["condition"]
            threshold = rule["threshold"]
            triggered = False

            if condition == "greater_than" and value > threshold:
                triggered = True
            elif condition == "less_than" and value < threshold:
                triggered = True
            elif condition == "equals" and value == threshold:
                triggered = True

            if triggered:
                await self._trigger_alert(rule_name, rule, value)
            else:
                await self._resolve_alert(rule_name)

    async def _trigger_alert(self, rule_name: str, rule: Dict[str, Any], value: float):
        """Trigger an alert."""
        existing = self.active_alerts.get(rule_name)
        # Open a new alert only if none exists or the previous one was
        # resolved, so a rule can fire again after recovering
        if existing is None or existing.resolved:
            alert = Alert(
                alert_id=str(uuid4()),
                name=rule_name,
                severity=rule["severity"],
                message=rule["message"],
                timestamp=datetime.now(),
                metric_name=rule["metric_name"],
                metric_value=value,
                threshold=rule["threshold"],
            )
            self.active_alerts[rule_name] = alert

    async def _resolve_alert(self, rule_name: str):
        """Resolve an alert."""
        if rule_name in self.active_alerts:
            alert = self.active_alerts[rule_name]
            alert.resolved = True
            alert.resolved_at = datetime.now()
            # Keep resolved alerts for a bit, then clean up
            # In production, you might want to send to external system

    async def get_active_alerts(self) -> List[Alert]:
        """Get all active alerts."""
        return [alert for alert in self.active_alerts.values() if not alert.resolved]

    async def get_all_metrics(self) -> Dict[str, List[Dict[str, Any]]]:
        """Get all metrics as serializable data."""
        result = {}
        async with self._lock:
            for name, values in self.metrics.items():
                result[name] = [v.to_dict() for v in values]
        return result


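The collector records point-in-time values per metric name, prunes anything older than the retention window, and evaluates every matching alert rule on each write, so a single sample past the threshold opens an alert. A minimal usage sketch (not part of the packaged file), assuming the module is importable as kailash.nodes.ai.streaming_analytics per file 41 above; the metric and rule names are illustrative:

import asyncio
from kailash.nodes.ai.streaming_analytics import AlertSeverity, MetricsCollector

async def main():
    collector = MetricsCollector(max_retention_hours=1)
    # Fires as soon as a response_time sample exceeds 5000 ms
    await collector.add_alert_rule(
        name="slow_responses",
        metric_name="response_time",
        threshold=5000,
        condition="greater_than",
        severity=AlertSeverity.HIGH,
    )
    await collector.record_timer("response_time", 6200, labels={"agent": "a1"})
    stats = await collector.get_metric_stats("response_time")
    alerts = await collector.get_active_alerts()
    print(stats["max"], [a.name for a in alerts])  # 6200 ['slow_responses']

asyncio.run(main())
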
class EventStreamer:
    """Streams events from the A2A system."""

    def __init__(self, buffer_size: int = 1000):
        self.buffer_size = buffer_size
        self.event_buffer: deque = deque(maxlen=buffer_size)
        self.subscribers: Set[asyncio.Queue] = set()
        self.event_handlers: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    async def publish_event(self, event: StreamEvent):
        """Publish an event to all subscribers."""
        async with self._lock:
            # Add to buffer
            self.event_buffer.append(event)

            # Notify subscribers without blocking the publisher; put_nowait
            # raises QueueFull (not QueueEmpty) when a subscriber lags
            dead_queues = set()
            for queue in self.subscribers:
                try:
                    queue.put_nowait(event)
                except asyncio.QueueFull:
                    # Queue is full, subscriber is slow
                    dead_queues.add(queue)
                except Exception:
                    # Subscriber is dead
                    dead_queues.add(queue)

            # Clean up dead subscribers
            self.subscribers -= dead_queues

        # Call event handlers outside the lock so a handler can safely
        # use the streamer itself
        for handler in self.event_handlers.get(event.event_type, []):
            try:
                await handler(event)
            except Exception:
                # Log error but continue
                pass

    async def subscribe(
        self, queue_size: int = 100
    ) -> AsyncGenerator[StreamEvent, None]:
        """Subscribe to event stream."""
        queue = asyncio.Queue(maxsize=queue_size)
        self.subscribers.add(queue)

        try:
            while True:
                event = await queue.get()
                yield event
        finally:
            # Drop the queue whether the consumer was cancelled or failed
            self.subscribers.discard(queue)

    async def add_event_handler(
        self, event_type: str, handler: Callable[[StreamEvent], None]
    ):
        """Add an event handler for a specific event type."""
        self.event_handlers[event_type].append(handler)

    async def get_recent_events(
        self, event_type: Optional[str] = None, limit: int = 100
    ) -> List[StreamEvent]:
        """Get recent events from the buffer."""
        async with self._lock:
            events = list(self.event_buffer)

            if event_type:
                events = [e for e in events if e.event_type == event_type]

            return events[-limit:]


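Each subscriber gets its own bounded asyncio.Queue; publishing never blocks, and a subscriber whose queue fills up is dropped rather than allowed to stall the publisher. A short sketch under the same import-path assumption; the event payload is made up:

import asyncio
from datetime import datetime
from uuid import uuid4

from kailash.nodes.ai.streaming_analytics import EventStreamer, StreamEvent

async def main():
    streamer = EventStreamer(buffer_size=100)

    async def consume():
        # subscribe() yields events until the task is cancelled
        async for event in streamer.subscribe():
            print("got", event.event_type)

    consumer = asyncio.create_task(consume())
    await asyncio.sleep(0)  # let the subscriber register its queue
    await streamer.publish_event(
        StreamEvent(
            event_id=str(uuid4()),
            event_type="task_completed",
            source="demo",
            timestamp=datetime.now(),
            data={"task_id": "t-1"},
        )
    )
    await asyncio.sleep(0.1)
    consumer.cancel()
    try:
        await consumer
    except asyncio.CancelledError:
        pass

asyncio.run(main())
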
class PerformanceDashboard:
    """Real-time performance dashboard for A2A system."""

    def __init__(
        self, metrics_collector: MetricsCollector, event_streamer: EventStreamer
    ):
        self.metrics_collector = metrics_collector
        self.event_streamer = event_streamer
        self.dashboard_data: Dict[str, Any] = {}
        self._update_interval = 5  # seconds
        self._update_task: Optional[asyncio.Task] = None

    async def start(self):
        """Start the dashboard update loop."""
        if self._update_task is None:
            self._update_task = asyncio.create_task(self._update_loop())

    async def stop(self):
        """Stop the dashboard update loop."""
        if self._update_task:
            self._update_task.cancel()
            try:
                await self._update_task
            except asyncio.CancelledError:
                pass
            self._update_task = None

    async def _update_loop(self):
        """Main update loop for dashboard data."""
        while True:
            try:
                await self._update_dashboard_data()
                await asyncio.sleep(self._update_interval)
            except asyncio.CancelledError:
                break
            except Exception:
                # Log error but continue
                await asyncio.sleep(self._update_interval)

    async def _update_dashboard_data(self):
        """Update dashboard data with current metrics."""
        now = datetime.now()
        last_hour = now - timedelta(hours=1)

        # Get key metrics
        task_stats = await self.metrics_collector.get_metric_stats(
            "tasks_completed", last_hour
        )
        agent_stats = await self.metrics_collector.get_metric_stats(
            "agent_utilization", last_hour
        )
        insight_stats = await self.metrics_collector.get_metric_stats(
            "insight_quality", last_hour
        )

        # Get recent events
        recent_events = await self.event_streamer.get_recent_events(limit=50)

        # Get active alerts
        active_alerts = await self.metrics_collector.get_active_alerts()

        # Update dashboard data
        self.dashboard_data = {
            "timestamp": now.isoformat(),
            "overview": {
                "total_tasks": task_stats.get("sum", 0),
                "average_agent_utilization": agent_stats.get("mean", 0.0),
                "average_insight_quality": insight_stats.get("mean", 0.0),
                "active_alerts": len(active_alerts),
            },
            "task_performance": {
                "completed_last_hour": task_stats.get("count", 0),
                "completion_rate": task_stats.get("mean", 0.0),
                "peak_completion_rate": task_stats.get("max", 0.0),
            },
            "agent_performance": {
                "average_utilization": agent_stats.get("mean", 0.0),
                "peak_utilization": agent_stats.get("max", 0.0),
                "utilization_stddev": agent_stats.get("stddev", 0.0),
            },
            "insight_quality": {
                "average_quality": insight_stats.get("mean", 0.0),
                "quality_trend": "stable",  # Could be calculated from historical data
                "quality_distribution": {
                    "high": 0,  # Could be calculated from raw data
                    "medium": 0,
                    "low": 0,
                },
            },
            "recent_events": [e.to_dict() for e in recent_events[-10:]],
            "active_alerts": [a.to_dict() for a in active_alerts],
            "system_health": {
                "status": "healthy" if len(active_alerts) == 0 else "degraded",
                "uptime": "99.9%",  # Could be calculated from metrics
                "last_updated": now.isoformat(),
            },
        }

    async def get_dashboard_data(self) -> Dict[str, Any]:
        """Get current dashboard data."""
        return self.dashboard_data.copy()

    async def get_real_time_metrics(self) -> Dict[str, Any]:
        """Get real-time metrics summary."""
        now = datetime.now()
        last_minute = now - timedelta(minutes=1)

        # Get very recent metrics
        recent_tasks = await self.metrics_collector.get_metric_values(
            "tasks_completed", last_minute
        )
        recent_insights = await self.metrics_collector.get_metric_values(
            "insight_quality", last_minute
        )

        return {
            "timestamp": now.isoformat(),
            "tasks_per_minute": len(recent_tasks),
            "average_insight_quality": (
                statistics.mean([v.value for v in recent_insights])
                if recent_insights
                else 0.0
            ),
            # Counts "agent_active" samples recorded in the last minute,
            # not distinct agents
            "active_agents": len(
                await self.metrics_collector.get_metric_values(
                    "agent_active", last_minute
                )
            ),
            "system_load": 0.0,  # Could be calculated from various metrics
        }


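The dashboard composes the two pieces above: a background task refreshes a snapshot dict every _update_interval seconds, and the first refresh runs immediately when the loop starts. A hedged sketch with illustrative metric names:

import asyncio
from kailash.nodes.ai.streaming_analytics import (
    EventStreamer,
    MetricsCollector,
    PerformanceDashboard,
)

async def main():
    collector = MetricsCollector()
    streamer = EventStreamer()
    dashboard = PerformanceDashboard(collector, streamer)

    await collector.set_gauge("agent_utilization", 0.8)
    await collector.increment_counter("tasks_completed")

    await dashboard.start()
    await asyncio.sleep(0.1)  # the first refresh happens right after start()
    data = await dashboard.get_dashboard_data()
    print(data["overview"], data["system_health"]["status"])
    await dashboard.stop()

asyncio.run(main())
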
@register_node()
class StreamingAnalyticsNode(Node):
    """Node for streaming analytics and real-time monitoring."""

    def __init__(self, name: str = "streaming_analytics", **kwargs):
        """Initialize streaming analytics node."""
        self.action = "start_monitoring"
        self.metrics_config = None
        self.alert_rules = None
        self.dashboard_config = None
        self.buffer_size = 1000
        self.retention_hours = 24
        self.update_interval = 5

        # Set attributes from kwargs
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        super().__init__(name=name, **kwargs)

        # Initialize components
        self.metrics_collector = MetricsCollector(
            max_retention_hours=self.retention_hours
        )
        self.event_streamer = EventStreamer(buffer_size=self.buffer_size)
        self.dashboard = PerformanceDashboard(
            self.metrics_collector, self.event_streamer
        )

        # Track if monitoring is active
        self._monitoring_active = False

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "action": NodeParameter(
                name="action",
                type=str,
                required=False,
                default="start_monitoring",
                description="Action to perform: start_monitoring, stop_monitoring, get_metrics, get_dashboard",
            ),
            "metrics_config": NodeParameter(
                name="metrics_config",
                type=dict,
                required=False,
                description="Configuration for metrics collection",
            ),
            "alert_rules": NodeParameter(
                name="alert_rules",
                type=list,
                required=False,
                description="Alert rules configuration",
            ),
            "dashboard_config": NodeParameter(
                name="dashboard_config",
                type=dict,
                required=False,
                description="Dashboard configuration",
            ),
            "buffer_size": NodeParameter(
                name="buffer_size",
                type=int,
                required=False,
                default=1000,
                description="Event buffer size",
            ),
            "retention_hours": NodeParameter(
                name="retention_hours",
                type=int,
                required=False,
                default=24,
                description="Metric retention period in hours",
            ),
            "update_interval": NodeParameter(
                name="update_interval",
                type=int,
                required=False,
                default=5,
                description="Dashboard update interval in seconds",
            ),
        }

    async def run(self, **kwargs) -> Dict[str, Any]:
        """Execute streaming analytics action."""
        # Get parameters
        action = kwargs.get("action", self.action)
        metrics_config = kwargs.get("metrics_config", self.metrics_config)
        alert_rules = kwargs.get("alert_rules", self.alert_rules)
        dashboard_config = kwargs.get("dashboard_config", self.dashboard_config)

        if action == "start_monitoring":
            return await self._start_monitoring(
                metrics_config, alert_rules, dashboard_config
            )
        elif action == "stop_monitoring":
            return await self._stop_monitoring()
        elif action == "get_metrics":
            return await self._get_metrics()
        elif action == "get_dashboard":
            return await self._get_dashboard()
        elif action == "record_metric":
            return await self._record_metric(kwargs)
        elif action == "publish_event":
            return await self._publish_event(kwargs)
        else:
            raise ValueError(f"Unknown action: {action}")

    async def _start_monitoring(
        self,
        metrics_config: Optional[Dict[str, Any]],
        alert_rules: Optional[List[Dict[str, Any]]],
        dashboard_config: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Start monitoring with configuration."""
        # Configure alert rules
        if alert_rules:
            for rule in alert_rules:
                await self.metrics_collector.add_alert_rule(
                    name=rule["name"],
                    metric_name=rule["metric_name"],
                    threshold=rule["threshold"],
                    condition=rule.get("condition", "greater_than"),
                    severity=AlertSeverity(rule.get("severity", "medium")),
                    message=rule.get("message", ""),
                )

        # Configure dashboard
        if dashboard_config:
            self.dashboard._update_interval = dashboard_config.get("update_interval", 5)

        # Start dashboard
        await self.dashboard.start()
        self._monitoring_active = True

        return {
            "success": True,
            "message": "Monitoring started",
            "monitoring_active": self._monitoring_active,
            "alert_rules_configured": len(alert_rules) if alert_rules else 0,
            "dashboard_update_interval": self.dashboard._update_interval,
        }

    async def _stop_monitoring(self) -> Dict[str, Any]:
        """Stop monitoring."""
        await self.dashboard.stop()
        self._monitoring_active = False

        return {
            "success": True,
            "message": "Monitoring stopped",
            "monitoring_active": self._monitoring_active,
        }

    async def _get_metrics(self) -> Dict[str, Any]:
        """Get current metrics."""
        all_metrics = await self.metrics_collector.get_all_metrics()
        active_alerts = await self.metrics_collector.get_active_alerts()

        return {
            "success": True,
            "metrics": all_metrics,
            "active_alerts": [a.to_dict() for a in active_alerts],
            "monitoring_active": self._monitoring_active,
        }

    async def _get_dashboard(self) -> Dict[str, Any]:
        """Get dashboard data."""
        dashboard_data = await self.dashboard.get_dashboard_data()
        real_time_metrics = await self.dashboard.get_real_time_metrics()

        return {
            "success": True,
            "dashboard": dashboard_data,
            "real_time": real_time_metrics,
            "monitoring_active": self._monitoring_active,
        }

    async def _record_metric(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Record a metric value."""
        metric_name = params.get("metric_name")
        metric_value = params.get("metric_value")
        metric_type = params.get("metric_type", "gauge")
        labels = params.get("labels", {})

        if not metric_name or metric_value is None:
            raise ValueError("metric_name and metric_value are required")

        await self.metrics_collector.record_metric(
            name=metric_name,
            value=float(metric_value),
            metric_type=MetricType(metric_type),
            labels=labels,
        )

        return {
            "success": True,
            "message": f"Recorded metric {metric_name} = {metric_value}",
            "metric_name": metric_name,
            "metric_value": metric_value,
        }

    async def _publish_event(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Publish a stream event."""
        event_type = params.get("event_type")
        source = params.get("source", "unknown")
        data = params.get("data", {})
        metadata = params.get("metadata", {})

        if not event_type:
            raise ValueError("event_type is required")

        event = StreamEvent(
            event_id=str(uuid4()),
            event_type=event_type,
            source=source,
            timestamp=datetime.now(),
            data=data,
            metadata=metadata,
        )

        await self.event_streamer.publish_event(event)

        return {
            "success": True,
            "message": f"Published event {event_type}",
            "event_id": event.event_id,
            "event_type": event_type,
        }


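A sketch of driving the node through its action parameter, assuming the Kailash Node base accepts these constructor keywords and that run() can be awaited directly (the runtime may invoke nodes differently):

import asyncio
from kailash.nodes.ai.streaming_analytics import StreamingAnalyticsNode

async def main():
    node = StreamingAnalyticsNode(retention_hours=1, update_interval=2)
    print(await node.run(
        action="start_monitoring",
        alert_rules=[{
            "name": "slow",
            "metric_name": "response_time",
            "threshold": 5000,
            "severity": "high",
        }],
    ))
    print(await node.run(
        action="record_metric", metric_name="response_time", metric_value=1200
    ))
    await asyncio.sleep(0.1)  # let the dashboard's first refresh land
    print(await node.run(action="get_dashboard"))
    print(await node.run(action="stop_monitoring"))

asyncio.run(main())
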
@register_node()
class A2AMonitoringNode(Node):
    """Specialized monitoring node for A2A systems."""

    def __init__(self, name: str = "a2a_monitoring", **kwargs):
        """Initialize A2A monitoring node."""
        self.coordinator_node = None
        self.streaming_node = None
        self.monitoring_interval = 10
        self.enable_auto_alerts = True

        # Set attributes from kwargs
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        super().__init__(name=name, **kwargs)

        # Monitoring task
        self._monitoring_task: Optional[asyncio.Task] = None

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "coordinator_node": NodeParameter(
                name="coordinator_node",
                type=object,
                required=True,
                description="A2A coordinator node to monitor",
            ),
            "streaming_node": NodeParameter(
                name="streaming_node",
                type=object,
                required=True,
                description="Streaming analytics node",
            ),
            "monitoring_interval": NodeParameter(
                name="monitoring_interval",
                type=int,
                required=False,
                default=10,
                description="Monitoring interval in seconds",
            ),
            "enable_auto_alerts": NodeParameter(
                name="enable_auto_alerts",
                type=bool,
                required=False,
                default=True,
                description="Enable automatic alert generation",
            ),
        }

    async def run(self, **kwargs) -> Dict[str, Any]:
        """Start A2A monitoring."""
        # Get parameters
        coordinator_node = kwargs.get("coordinator_node", self.coordinator_node)
        streaming_node = kwargs.get("streaming_node", self.streaming_node)
        monitoring_interval = kwargs.get(
            "monitoring_interval", self.monitoring_interval
        )
        enable_auto_alerts = kwargs.get("enable_auto_alerts", self.enable_auto_alerts)

        if not coordinator_node or not streaming_node:
            raise ValueError("coordinator_node and streaming_node are required")

        # Set up monitoring
        self.coordinator_node = coordinator_node
        self.streaming_node = streaming_node
        self.monitoring_interval = monitoring_interval

        # Configure default alert rules for A2A
        if enable_auto_alerts:
            await self._setup_default_alerts()

        # Start monitoring task
        if self._monitoring_task is None:
            self._monitoring_task = asyncio.create_task(self._monitoring_loop())

        return {
            "success": True,
            "message": "A2A monitoring started",
            "monitoring_interval": self.monitoring_interval,
            "auto_alerts_enabled": enable_auto_alerts,
        }

    async def _setup_default_alerts(self):
        """Set up default alert rules for A2A monitoring."""
        default_rules = [
            {
                "name": "high_task_failure_rate",
                "metric_name": "task_failure_rate",
                "threshold": 0.1,
                "condition": "greater_than",
                "severity": "high",
                "message": "Task failure rate is above 10%",
            },
            {
                "name": "low_agent_utilization",
                "metric_name": "agent_utilization",
                "threshold": 0.3,
                "condition": "less_than",
                "severity": "medium",
                "message": "Agent utilization is below 30%",
            },
            {
                "name": "low_insight_quality",
                "metric_name": "insight_quality",
                "threshold": 0.6,
                "condition": "less_than",
                "severity": "medium",
                "message": "Average insight quality is below 60%",
            },
            {
                "name": "high_response_time",
                "metric_name": "response_time",
                "threshold": 5000,  # 5 seconds
                "condition": "greater_than",
                "severity": "high",
                "message": "Response time is above 5 seconds",
            },
        ]

        for rule in default_rules:
            await self.streaming_node.metrics_collector.add_alert_rule(
                name=rule["name"],
                metric_name=rule["metric_name"],
                threshold=rule["threshold"],
                condition=rule["condition"],
                severity=AlertSeverity(rule["severity"]),
                message=rule["message"],
            )

    async def _monitoring_loop(self):
        """Main monitoring loop."""
        while True:
            try:
                await self._collect_a2a_metrics()
                await asyncio.sleep(self.monitoring_interval)
            except asyncio.CancelledError:
                break
            except Exception:
                # Log error but continue monitoring
                await asyncio.sleep(self.monitoring_interval)

    async def _collect_a2a_metrics(self):
        """Collect metrics from A2A coordinator."""
        if not self.coordinator_node:
            return

        # Get current state from coordinator
        active_tasks = len(getattr(self.coordinator_node, "active_tasks", {}))
        completed_tasks = len(getattr(self.coordinator_node, "completed_tasks", []))
        registered_agents = len(getattr(self.coordinator_node, "registered_agents", {}))

        # Calculate metrics
        total_tasks = active_tasks + completed_tasks
        agent_utilization = active_tasks / max(registered_agents, 1)

        # Record metrics
        await self.streaming_node.metrics_collector.set_gauge(
            "active_tasks", active_tasks
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "completed_tasks", completed_tasks
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "registered_agents", registered_agents
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "agent_utilization", agent_utilization
        )

        # Calculate insight quality if available
        if hasattr(self.coordinator_node, "completed_tasks"):
            completed_tasks_list = getattr(self.coordinator_node, "completed_tasks", [])
            if completed_tasks_list:
                quality_scores = []
                for task in completed_tasks_list[-10:]:  # Last 10 tasks
                    if hasattr(task, "current_quality_score"):
                        # Handle both real values and mock objects
                        score = task.current_quality_score
                        if isinstance(score, (int, float)):
                            quality_scores.append(score)

                if quality_scores:
                    avg_quality = sum(quality_scores) / len(quality_scores)
                    await self.streaming_node.metrics_collector.set_gauge(
                        "insight_quality", avg_quality
                    )

        # Publish monitoring event
        event = StreamEvent(
            event_id=str(uuid4()),
            event_type="a2a_metrics_collected",
            source="a2a_monitoring",
            timestamp=datetime.now(),
            data={
                "active_tasks": active_tasks,
                "completed_tasks": completed_tasks,
                "registered_agents": registered_agents,
                "agent_utilization": agent_utilization,
            },
        )

        await self.streaming_node.event_streamer.publish_event(event)

    async def stop_monitoring(self):
        """Stop monitoring."""
        if self._monitoring_task:
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass
            self._monitoring_task = None
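
To close the loop, a sketch wiring the two nodes together. FakeCoordinator is a hypothetical stand-in exposing only the attributes _collect_a2a_metrics reads (active_tasks, completed_tasks, registered_agents); a real deployment would pass the A2A coordinator node instead:

import asyncio
from kailash.nodes.ai.streaming_analytics import (
    A2AMonitoringNode,
    StreamingAnalyticsNode,
)

class FakeCoordinator:
    """Hypothetical stand-in for an A2A coordinator."""
    active_tasks = {"t1": object()}
    completed_tasks = []
    registered_agents = {"a1": object(), "a2": object()}

async def main():
    streaming = StreamingAnalyticsNode()
    monitor = A2AMonitoringNode()
    result = await monitor.run(
        coordinator_node=FakeCoordinator(),
        streaming_node=streaming,
        monitoring_interval=1,
    )
    print(result)
    await asyncio.sleep(1.5)  # allow at least one collection pass
    metrics = await streaming.run(action="get_metrics")
    print(sorted(metrics["metrics"].keys()))
    await monitor.stop_monitoring()

asyncio.run(main())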