kailash 0.8.1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/middleware/database/base_models.py +7 -1
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/code/python.py +1 -0
- kailash/runtime/local.py +66 -0
- kailash/runtime/secret_provider.py +293 -0
- kailash/workflow/builder.py +88 -9
- kailash-0.8.4.dist-info/METADATA +533 -0
- {kailash-0.8.1.dist-info → kailash-0.8.4.dist-info}/RECORD +18 -13
- kailash-0.8.1.dist-info/METADATA +0 -745
- {kailash-0.8.1.dist-info → kailash-0.8.4.dist-info}/WHEEL +0 -0
- {kailash-0.8.1.dist-info → kailash-0.8.4.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.1.dist-info → kailash-0.8.4.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.1.dist-info → kailash-0.8.4.dist-info}/top_level.txt +0 -0
kailash/nodes/ai/streaming_analytics.py (new file)
@@ -0,0 +1,947 @@
"""
Streaming analytics and performance monitoring for A2A systems.

This module provides real-time streaming capabilities and performance dashboards
for monitoring A2A agent interactions, task execution, and system health.
"""

import asyncio
import json
import statistics
import time
import weakref
from collections import defaultdict, deque
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Set
from uuid import uuid4

from ..base import Node, NodeParameter, register_node


class MetricType(Enum):
    """Types of metrics that can be collected."""

    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
    TIMER = "timer"
    RATE = "rate"


class AlertSeverity(Enum):
    """Alert severity levels."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


@dataclass
class MetricValue:
    """A single metric value with timestamp."""

    value: float
    timestamp: datetime
    labels: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "value": self.value,
            "timestamp": self.timestamp.isoformat(),
            "labels": self.labels,
        }


@dataclass
class StreamEvent:
    """A streaming event in the A2A system."""

    event_id: str
    event_type: str
    source: str
    timestamp: datetime
    data: Dict[str, Any]
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "event_id": self.event_id,
            "event_type": self.event_type,
            "source": self.source,
            "timestamp": self.timestamp.isoformat(),
            "data": self.data,
            "metadata": self.metadata,
        }


@dataclass
class Alert:
    """System alert based on metrics."""

    alert_id: str
    name: str
    severity: AlertSeverity
    message: str
    timestamp: datetime
    metric_name: str
    metric_value: float
    threshold: float
    resolved: bool = False
    resolved_at: Optional[datetime] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "alert_id": self.alert_id,
            "name": self.name,
            "severity": self.severity.value,
            "message": self.message,
            "timestamp": self.timestamp.isoformat(),
            "metric_name": self.metric_name,
            "metric_value": self.metric_value,
            "threshold": self.threshold,
            "resolved": self.resolved,
            "resolved_at": self.resolved_at.isoformat() if self.resolved_at else None,
        }


class MetricsCollector:
    """Collects and manages metrics for streaming analytics."""

    def __init__(self, max_retention_hours: int = 24):
        self.max_retention_hours = max_retention_hours
        self.metrics: Dict[str, List[MetricValue]] = defaultdict(list)
        self.metric_types: Dict[str, MetricType] = {}
        self.alert_rules: Dict[str, Dict[str, Any]] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self._lock = asyncio.Lock()

    async def record_metric(
        self,
        name: str,
        value: float,
        metric_type: MetricType = MetricType.GAUGE,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Record a metric value."""
        async with self._lock:
            metric_value = MetricValue(
                value=value, timestamp=datetime.now(), labels=labels or {}
            )

            self.metrics[name].append(metric_value)
            self.metric_types[name] = metric_type

            # Clean up old metrics
            await self._cleanup_old_metrics(name)

            # Check alert rules
            await self._check_alert_rules(name, value)

    async def increment_counter(
        self, name: str, value: float = 1.0, labels: Optional[Dict[str, str]] = None
    ):
        """Increment a counter metric."""
        await self.record_metric(name, value, MetricType.COUNTER, labels)

    async def set_gauge(
        self, name: str, value: float, labels: Optional[Dict[str, str]] = None
    ):
        """Set a gauge metric."""
        await self.record_metric(name, value, MetricType.GAUGE, labels)

    async def record_timer(
        self, name: str, duration: float, labels: Optional[Dict[str, str]] = None
    ):
        """Record a timer metric."""
        await self.record_metric(name, duration, MetricType.TIMER, labels)

    async def get_metric_values(
        self,
        name: str,
        since: Optional[datetime] = None,
        labels: Optional[Dict[str, str]] = None,
    ) -> List[MetricValue]:
        """Get metric values with optional filtering."""
        async with self._lock:
            values = self.metrics.get(name, [])

            if since:
                values = [v for v in values if v.timestamp >= since]

            if labels:
                values = [
                    v
                    for v in values
                    if all(v.labels.get(k) == val for k, val in labels.items())
                ]

            return values

    async def get_metric_stats(
        self, name: str, since: Optional[datetime] = None
    ) -> Dict[str, float]:
        """Get statistical summary of a metric."""
        values = await self.get_metric_values(name, since)

        if not values:
            return {}

        numeric_values = [v.value for v in values]

        return {
            "count": len(numeric_values),
            "min": min(numeric_values),
            "max": max(numeric_values),
            "mean": statistics.mean(numeric_values),
            "median": statistics.median(numeric_values),
            "stddev": (
                statistics.stdev(numeric_values) if len(numeric_values) > 1 else 0.0
            ),
            "sum": sum(numeric_values),
        }

    async def add_alert_rule(
        self,
        name: str,
        metric_name: str,
        threshold: float,
        condition: str = "greater_than",
        severity: AlertSeverity = AlertSeverity.MEDIUM,
        message: str = "",
    ):
        """Add an alert rule."""
        self.alert_rules[name] = {
            "metric_name": metric_name,
            "threshold": threshold,
            "condition": condition,
            "severity": severity,
            "message": message or f"{metric_name} {condition} {threshold}",
        }

    async def _cleanup_old_metrics(self, name: str):
        """Clean up old metric values."""
        cutoff_time = datetime.now() - timedelta(hours=self.max_retention_hours)
        self.metrics[name] = [
            v for v in self.metrics[name] if v.timestamp >= cutoff_time
        ]

    async def _check_alert_rules(self, metric_name: str, value: float):
        """Check if metric value triggers any alerts."""
        for rule_name, rule in self.alert_rules.items():
            if rule["metric_name"] != metric_name:
                continue

            condition = rule["condition"]
            threshold = rule["threshold"]
            triggered = False

            if condition == "greater_than" and value > threshold:
                triggered = True
            elif condition == "less_than" and value < threshold:
                triggered = True
            elif condition == "equals" and value == threshold:
                triggered = True

            if triggered:
                await self._trigger_alert(rule_name, rule, value)
            else:
                await self._resolve_alert(rule_name)

    async def _trigger_alert(self, rule_name: str, rule: Dict[str, Any], value: float):
        """Trigger an alert."""
        if rule_name not in self.active_alerts:
            alert = Alert(
                alert_id=str(uuid4()),
                name=rule_name,
                severity=rule["severity"],
                message=rule["message"],
                timestamp=datetime.now(),
                metric_name=rule["metric_name"],
                metric_value=value,
                threshold=rule["threshold"],
            )
            self.active_alerts[rule_name] = alert

    async def _resolve_alert(self, rule_name: str):
        """Resolve an alert."""
        if rule_name in self.active_alerts:
            alert = self.active_alerts[rule_name]
            alert.resolved = True
            alert.resolved_at = datetime.now()
            # Keep resolved alerts for a bit, then clean up
            # In production, you might want to send to external system

    async def get_active_alerts(self) -> List[Alert]:
        """Get all active alerts."""
        return [alert for alert in self.active_alerts.values() if not alert.resolved]

    async def get_all_metrics(self) -> Dict[str, List[Dict[str, Any]]]:
        """Get all metrics as serializable data."""
        result = {}
        async with self._lock:
            for name, values in self.metrics.items():
                result[name] = [v.to_dict() for v in values]
        return result


class EventStreamer:
    """Streams events from the A2A system."""

    def __init__(self, buffer_size: int = 1000):
        self.buffer_size = buffer_size
        self.event_buffer: deque = deque(maxlen=buffer_size)
        self.subscribers: Set[asyncio.Queue] = set()
        self.event_handlers: Dict[str, List[Callable]] = defaultdict(list)
        self._lock = asyncio.Lock()

    async def publish_event(self, event: StreamEvent):
        """Publish an event to all subscribers."""
        async with self._lock:
            # Add to buffer
            self.event_buffer.append(event)

            # Notify subscribers
            dead_queues = set()
            for queue in self.subscribers:
                try:
                    await queue.put(event)
                except asyncio.QueueEmpty:
                    # Queue is full, subscriber is slow
                    dead_queues.add(queue)
                except Exception:
                    # Subscriber is dead
                    dead_queues.add(queue)

            # Clean up dead subscribers
            self.subscribers -= dead_queues

            # Call event handlers
            for handler in self.event_handlers.get(event.event_type, []):
                try:
                    await handler(event)
                except Exception:
                    # Log error but continue
                    pass

    async def subscribe(
        self, queue_size: int = 100
    ) -> AsyncGenerator[StreamEvent, None]:
        """Subscribe to event stream."""
        queue = asyncio.Queue(maxsize=queue_size)
        self.subscribers.add(queue)

        try:
            while True:
                event = await queue.get()
                yield event
        except asyncio.CancelledError:
            self.subscribers.discard(queue)
            raise
        except Exception:
            self.subscribers.discard(queue)
            raise

    async def add_event_handler(
        self, event_type: str, handler: Callable[[StreamEvent], None]
    ):
        """Add an event handler for a specific event type."""
        self.event_handlers[event_type].append(handler)

    async def get_recent_events(
        self, event_type: Optional[str] = None, limit: int = 100
    ) -> List[StreamEvent]:
        """Get recent events from the buffer."""
        async with self._lock:
            events = list(self.event_buffer)

            if event_type:
                events = [e for e in events if e.event_type == event_type]

            return events[-limit:]


class PerformanceDashboard:
    """Real-time performance dashboard for A2A system."""

    def __init__(
        self, metrics_collector: MetricsCollector, event_streamer: EventStreamer
    ):
        self.metrics_collector = metrics_collector
        self.event_streamer = event_streamer
        self.dashboard_data: Dict[str, Any] = {}
        self._update_interval = 5  # seconds
        self._update_task: Optional[asyncio.Task] = None

    async def start(self):
        """Start the dashboard update loop."""
        if self._update_task is None:
            self._update_task = asyncio.create_task(self._update_loop())

    async def stop(self):
        """Stop the dashboard update loop."""
        if self._update_task:
            self._update_task.cancel()
            try:
                await self._update_task
            except asyncio.CancelledError:
                pass
            self._update_task = None

    async def _update_loop(self):
        """Main update loop for dashboard data."""
        while True:
            try:
                await self._update_dashboard_data()
                await asyncio.sleep(self._update_interval)
            except asyncio.CancelledError:
                break
            except Exception:
                # Log error but continue
                await asyncio.sleep(self._update_interval)

    async def _update_dashboard_data(self):
        """Update dashboard data with current metrics."""
        now = datetime.now()
        last_hour = now - timedelta(hours=1)

        # Get key metrics
        task_stats = await self.metrics_collector.get_metric_stats(
            "tasks_completed", last_hour
        )
        agent_stats = await self.metrics_collector.get_metric_stats(
            "agent_utilization", last_hour
        )
        insight_stats = await self.metrics_collector.get_metric_stats(
            "insight_quality", last_hour
        )

        # Get recent events
        recent_events = await self.event_streamer.get_recent_events(limit=50)

        # Get active alerts
        active_alerts = await self.metrics_collector.get_active_alerts()

        # Update dashboard data
        self.dashboard_data = {
            "timestamp": now.isoformat(),
            "overview": {
                "total_tasks": task_stats.get("sum", 0),
                "average_agent_utilization": agent_stats.get("mean", 0.0),
                "average_insight_quality": insight_stats.get("mean", 0.0),
                "active_alerts": len(active_alerts),
            },
            "task_performance": {
                "completed_last_hour": task_stats.get("count", 0),
                "completion_rate": task_stats.get("mean", 0.0),
                "peak_completion_rate": task_stats.get("max", 0.0),
            },
            "agent_performance": {
                "average_utilization": agent_stats.get("mean", 0.0),
                "peak_utilization": agent_stats.get("max", 0.0),
                "utilization_stddev": agent_stats.get("stddev", 0.0),
            },
            "insight_quality": {
                "average_quality": insight_stats.get("mean", 0.0),
                "quality_trend": "stable",  # Could be calculated from historical data
                "quality_distribution": {
                    "high": 0,  # Could be calculated from raw data
                    "medium": 0,
                    "low": 0,
                },
            },
            "recent_events": [e.to_dict() for e in recent_events[-10:]],
            "active_alerts": [a.to_dict() for a in active_alerts],
            "system_health": {
                "status": "healthy" if len(active_alerts) == 0 else "degraded",
                "uptime": "99.9%",  # Could be calculated from metrics
                "last_updated": now.isoformat(),
            },
        }

    async def get_dashboard_data(self) -> Dict[str, Any]:
        """Get current dashboard data."""
        return self.dashboard_data.copy()

    async def get_real_time_metrics(self) -> Dict[str, Any]:
        """Get real-time metrics summary."""
        now = datetime.now()
        last_minute = now - timedelta(minutes=1)

        # Get very recent metrics
        recent_tasks = await self.metrics_collector.get_metric_values(
            "tasks_completed", last_minute
        )
        recent_insights = await self.metrics_collector.get_metric_values(
            "insight_quality", last_minute
        )

        return {
            "timestamp": now.isoformat(),
            "tasks_per_minute": len(recent_tasks),
            "average_insight_quality": (
                statistics.mean([v.value for v in recent_insights])
                if recent_insights
                else 0.0
            ),
            "active_agents": len(
                await self.metrics_collector.get_metric_values(
                    "agent_active", last_minute
                )
            ),
            "system_load": 0.0,  # Could be calculated from various metrics
        }


@register_node()
class StreamingAnalyticsNode(Node):
    """Node for streaming analytics and real-time monitoring."""

    def __init__(self, name: str = "streaming_analytics", **kwargs):
        """Initialize streaming analytics node."""
        self.action = "start_monitoring"
        self.metrics_config = None
        self.alert_rules = None
        self.dashboard_config = None
        self.buffer_size = 1000
        self.retention_hours = 24
        self.update_interval = 5

        # Set attributes from kwargs
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        super().__init__(name=name, **kwargs)

        # Initialize components
        self.metrics_collector = MetricsCollector(
            max_retention_hours=self.retention_hours
        )
        self.event_streamer = EventStreamer(buffer_size=self.buffer_size)
        self.dashboard = PerformanceDashboard(
            self.metrics_collector, self.event_streamer
        )

        # Track if monitoring is active
        self._monitoring_active = False

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "action": NodeParameter(
                name="action",
                type=str,
                required=False,
                default="start_monitoring",
                description="Action to perform: start_monitoring, stop_monitoring, get_metrics, get_dashboard",
            ),
            "metrics_config": NodeParameter(
                name="metrics_config",
                type=dict,
                required=False,
                description="Configuration for metrics collection",
            ),
            "alert_rules": NodeParameter(
                name="alert_rules",
                type=list,
                required=False,
                description="Alert rules configuration",
            ),
            "dashboard_config": NodeParameter(
                name="dashboard_config",
                type=dict,
                required=False,
                description="Dashboard configuration",
            ),
            "buffer_size": NodeParameter(
                name="buffer_size",
                type=int,
                required=False,
                default=1000,
                description="Event buffer size",
            ),
            "retention_hours": NodeParameter(
                name="retention_hours",
                type=int,
                required=False,
                default=24,
                description="Metric retention period in hours",
            ),
            "update_interval": NodeParameter(
                name="update_interval",
                type=int,
                required=False,
                default=5,
                description="Dashboard update interval in seconds",
            ),
        }

    async def run(self, **kwargs) -> Dict[str, Any]:
        """Execute streaming analytics action."""
        # Get parameters
        action = kwargs.get("action", self.action)
        metrics_config = kwargs.get("metrics_config", self.metrics_config)
        alert_rules = kwargs.get("alert_rules", self.alert_rules)
        dashboard_config = kwargs.get("dashboard_config", self.dashboard_config)

        if action == "start_monitoring":
            return await self._start_monitoring(
                metrics_config, alert_rules, dashboard_config
            )
        elif action == "stop_monitoring":
            return await self._stop_monitoring()
        elif action == "get_metrics":
            return await self._get_metrics()
        elif action == "get_dashboard":
            return await self._get_dashboard()
        elif action == "record_metric":
            return await self._record_metric(kwargs)
        elif action == "publish_event":
            return await self._publish_event(kwargs)
        else:
            raise ValueError(f"Unknown action: {action}")

    async def _start_monitoring(
        self,
        metrics_config: Optional[Dict[str, Any]],
        alert_rules: Optional[List[Dict[str, Any]]],
        dashboard_config: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Start monitoring with configuration."""
        # Configure alert rules
        if alert_rules:
            for rule in alert_rules:
                await self.metrics_collector.add_alert_rule(
                    name=rule["name"],
                    metric_name=rule["metric_name"],
                    threshold=rule["threshold"],
                    condition=rule.get("condition", "greater_than"),
                    severity=AlertSeverity(rule.get("severity", "medium")),
                    message=rule.get("message", ""),
                )

        # Configure dashboard
        if dashboard_config:
            self.dashboard._update_interval = dashboard_config.get("update_interval", 5)

        # Start dashboard
        await self.dashboard.start()
        self._monitoring_active = True

        return {
            "success": True,
            "message": "Monitoring started",
            "monitoring_active": self._monitoring_active,
            "alert_rules_configured": len(alert_rules) if alert_rules else 0,
            "dashboard_update_interval": self.dashboard._update_interval,
        }

    async def _stop_monitoring(self) -> Dict[str, Any]:
        """Stop monitoring."""
        await self.dashboard.stop()
        self._monitoring_active = False

        return {
            "success": True,
            "message": "Monitoring stopped",
            "monitoring_active": self._monitoring_active,
        }

    async def _get_metrics(self) -> Dict[str, Any]:
        """Get current metrics."""
        all_metrics = await self.metrics_collector.get_all_metrics()
        active_alerts = await self.metrics_collector.get_active_alerts()

        return {
            "success": True,
            "metrics": all_metrics,
            "active_alerts": [a.to_dict() for a in active_alerts],
            "monitoring_active": self._monitoring_active,
        }

    async def _get_dashboard(self) -> Dict[str, Any]:
        """Get dashboard data."""
        dashboard_data = await self.dashboard.get_dashboard_data()
        real_time_metrics = await self.dashboard.get_real_time_metrics()

        return {
            "success": True,
            "dashboard": dashboard_data,
            "real_time": real_time_metrics,
            "monitoring_active": self._monitoring_active,
        }

    async def _record_metric(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Record a metric value."""
        metric_name = params.get("metric_name")
        metric_value = params.get("metric_value")
        metric_type = params.get("metric_type", "gauge")
        labels = params.get("labels", {})

        if not metric_name or metric_value is None:
            raise ValueError("metric_name and metric_value are required")

        await self.metrics_collector.record_metric(
            name=metric_name,
            value=float(metric_value),
            metric_type=MetricType(metric_type),
            labels=labels,
        )

        return {
            "success": True,
            "message": f"Recorded metric {metric_name} = {metric_value}",
            "metric_name": metric_name,
            "metric_value": metric_value,
        }

    async def _publish_event(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Publish a stream event."""
        event_type = params.get("event_type")
        source = params.get("source", "unknown")
        data = params.get("data", {})
        metadata = params.get("metadata", {})

        if not event_type:
            raise ValueError("event_type is required")

        event = StreamEvent(
            event_id=str(uuid4()),
            event_type=event_type,
            source=source,
            timestamp=datetime.now(),
            data=data,
            metadata=metadata,
        )

        await self.event_streamer.publish_event(event)

        return {
            "success": True,
            "message": f"Published event {event_type}",
            "event_id": event.event_id,
            "event_type": event_type,
        }


@register_node()
class A2AMonitoringNode(Node):
    """Specialized monitoring node for A2A systems."""

    def __init__(self, name: str = "a2a_monitoring", **kwargs):
        """Initialize A2A monitoring node."""
        self.coordinator_node = None
        self.streaming_node = None
        self.monitoring_interval = 10
        self.enable_auto_alerts = True

        # Set attributes from kwargs
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

        super().__init__(name=name, **kwargs)

        # Monitoring task
        self._monitoring_task: Optional[asyncio.Task] = None

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "coordinator_node": NodeParameter(
                name="coordinator_node",
                type=object,
                required=True,
                description="A2A coordinator node to monitor",
            ),
            "streaming_node": NodeParameter(
                name="streaming_node",
                type=object,
                required=True,
                description="Streaming analytics node",
            ),
            "monitoring_interval": NodeParameter(
                name="monitoring_interval",
                type=int,
                required=False,
                default=10,
                description="Monitoring interval in seconds",
            ),
            "enable_auto_alerts": NodeParameter(
                name="enable_auto_alerts",
                type=bool,
                required=False,
                default=True,
                description="Enable automatic alert generation",
            ),
        }

    async def run(self, **kwargs) -> Dict[str, Any]:
        """Start A2A monitoring."""
        # Get parameters
        coordinator_node = kwargs.get("coordinator_node", self.coordinator_node)
        streaming_node = kwargs.get("streaming_node", self.streaming_node)
        monitoring_interval = kwargs.get(
            "monitoring_interval", self.monitoring_interval
        )
        enable_auto_alerts = kwargs.get("enable_auto_alerts", self.enable_auto_alerts)

        if not coordinator_node or not streaming_node:
            raise ValueError("coordinator_node and streaming_node are required")

        # Set up monitoring
        self.coordinator_node = coordinator_node
        self.streaming_node = streaming_node
        self.monitoring_interval = monitoring_interval

        # Configure default alert rules for A2A
        if enable_auto_alerts:
            await self._setup_default_alerts()

        # Start monitoring task
        if self._monitoring_task is None:
            self._monitoring_task = asyncio.create_task(self._monitoring_loop())

        return {
            "success": True,
            "message": "A2A monitoring started",
            "monitoring_interval": self.monitoring_interval,
            "auto_alerts_enabled": enable_auto_alerts,
        }

    async def _setup_default_alerts(self):
        """Set up default alert rules for A2A monitoring."""
        default_rules = [
            {
                "name": "high_task_failure_rate",
                "metric_name": "task_failure_rate",
                "threshold": 0.1,
                "condition": "greater_than",
                "severity": "high",
                "message": "Task failure rate is above 10%",
            },
            {
                "name": "low_agent_utilization",
                "metric_name": "agent_utilization",
                "threshold": 0.3,
                "condition": "less_than",
                "severity": "medium",
                "message": "Agent utilization is below 30%",
            },
            {
                "name": "low_insight_quality",
                "metric_name": "insight_quality",
                "threshold": 0.6,
                "condition": "less_than",
                "severity": "medium",
                "message": "Average insight quality is below 60%",
            },
            {
                "name": "high_response_time",
                "metric_name": "response_time",
                "threshold": 5000,  # 5 seconds
                "condition": "greater_than",
                "severity": "high",
                "message": "Response time is above 5 seconds",
            },
        ]

        for rule in default_rules:
            await self.streaming_node.metrics_collector.add_alert_rule(
                name=rule["name"],
                metric_name=rule["metric_name"],
                threshold=rule["threshold"],
                condition=rule["condition"],
                severity=AlertSeverity(rule["severity"]),
                message=rule["message"],
            )

    async def _monitoring_loop(self):
        """Main monitoring loop."""
        while True:
            try:
                await self._collect_a2a_metrics()
                await asyncio.sleep(self.monitoring_interval)
            except asyncio.CancelledError:
                break
            except Exception:
                # Log error but continue monitoring
                await asyncio.sleep(self.monitoring_interval)

    async def _collect_a2a_metrics(self):
        """Collect metrics from A2A coordinator."""
        if not self.coordinator_node:
            return

        # Get current state from coordinator
        active_tasks = len(getattr(self.coordinator_node, "active_tasks", {}))
        completed_tasks = len(getattr(self.coordinator_node, "completed_tasks", []))
        registered_agents = len(getattr(self.coordinator_node, "registered_agents", {}))

        # Calculate metrics
        total_tasks = active_tasks + completed_tasks
        agent_utilization = active_tasks / max(registered_agents, 1)

        # Record metrics
        await self.streaming_node.metrics_collector.set_gauge(
            "active_tasks", active_tasks
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "completed_tasks", completed_tasks
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "registered_agents", registered_agents
        )
        await self.streaming_node.metrics_collector.set_gauge(
            "agent_utilization", agent_utilization
        )

        # Calculate insight quality if available
        if hasattr(self.coordinator_node, "completed_tasks"):
            completed_tasks_list = getattr(self.coordinator_node, "completed_tasks", [])
            if completed_tasks_list:
                quality_scores = []
                for task in completed_tasks_list[-10:]:  # Last 10 tasks
                    if hasattr(task, "current_quality_score"):
                        # Handle both real values and mock objects
                        score = task.current_quality_score
                        if isinstance(score, (int, float)):
                            quality_scores.append(score)

                if quality_scores:
                    avg_quality = sum(quality_scores) / len(quality_scores)
                    await self.streaming_node.metrics_collector.set_gauge(
                        "insight_quality", avg_quality
                    )

        # Publish monitoring event
        event = StreamEvent(
            event_id=str(uuid4()),
            event_type="a2a_metrics_collected",
            source="a2a_monitoring",
            timestamp=datetime.now(),
            data={
                "active_tasks": active_tasks,
                "completed_tasks": completed_tasks,
                "registered_agents": registered_agents,
                "agent_utilization": agent_utilization,
            },
        )

        await self.streaming_node.event_streamer.publish_event(event)

    async def stop_monitoring(self):
        """Stop monitoring."""
        if self._monitoring_task:
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass
            self._monitoring_task = None
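
A minimal sketch of how the MetricsCollector and alert rules above could be exercised on their own. It is not part of the diff: the import path is inferred from the file list, and the metric name and sample values are made up for illustration.

import asyncio
from datetime import datetime, timedelta

# Module path inferred from the wheel layout shown above.
from kailash.nodes.ai.streaming_analytics import AlertSeverity, MetricsCollector


async def main():
    collector = MetricsCollector(max_retention_hours=1)

    # Fire an alert when response_time exceeds 5000 ms.
    await collector.add_alert_rule(
        name="slow_responses",
        metric_name="response_time",
        threshold=5000,
        condition="greater_than",
        severity=AlertSeverity.HIGH,
    )

    # Record a few timer samples; the last one breaches the threshold.
    for ms in (120, 340, 80, 6200):
        await collector.record_timer("response_time", ms)

    stats = await collector.get_metric_stats(
        "response_time", since=datetime.now() - timedelta(minutes=5)
    )
    print(stats["count"], stats["mean"], stats["max"])

    for alert in await collector.get_active_alerts():
        print(alert.severity.value, alert.message)


asyncio.run(main())

Each record_timer call funnels through record_metric, which appends the sample, prunes values older than max_retention_hours, and re-evaluates every rule registered for that metric, so the alert above appears as soon as the 6200 ms sample lands.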
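
The EventStreamer fans each published event out to per-subscriber asyncio queues and to any registered handlers. A small sketch of one publisher and one subscriber follows, again assuming the inferred module path; the event fields are illustrative only.

import asyncio
import contextlib
from datetime import datetime
from uuid import uuid4

from kailash.nodes.ai.streaming_analytics import EventStreamer, StreamEvent


async def main():
    streamer = EventStreamer(buffer_size=100)

    async def consume():
        # subscribe() is an async generator; iterate until the task is cancelled.
        async for event in streamer.subscribe(queue_size=10):
            print(event.event_type, event.data)

    consumer = asyncio.create_task(consume())
    await asyncio.sleep(0)  # let the subscriber register its queue

    await streamer.publish_event(
        StreamEvent(
            event_id=str(uuid4()),
            event_type="task_completed",
            source="example",
            timestamp=datetime.now(),
            data={"task_id": "t-1"},
        )
    )

    await asyncio.sleep(0.1)  # give the consumer a chance to print
    consumer.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await consumer


asyncio.run(main())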
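
StreamingAnalyticsNode wraps the three components behind a single action-dispatching run() method. The sketch below drives it directly; treat that as an assumption, since construction goes through the kailash Node base class (from ..base), which is not part of this diff, and inside a workflow the runtime would normally instantiate and execute the node.

import asyncio

from kailash.nodes.ai.streaming_analytics import StreamingAnalyticsNode


async def main():
    # Direct instantiation with keyword overrides, per __init__ above (assumption).
    node = StreamingAnalyticsNode(buffer_size=500, retention_hours=1)

    await node.run(
        action="start_monitoring",
        alert_rules=[
            {
                "name": "low_quality",
                "metric_name": "insight_quality",
                "threshold": 0.6,
                "condition": "less_than",
                "severity": "medium",
            }
        ],
    )

    # Feed a metric and an event through the node's own actions.
    await node.run(
        action="record_metric", metric_name="insight_quality", metric_value=0.42
    )
    await node.run(
        action="publish_event",
        event_type="task_completed",
        source="example",
        data={"task_id": "t-1"},
    )

    dashboard = await node.run(action="get_dashboard")
    print(dashboard["real_time"])

    await node.run(action="stop_monitoring")


asyncio.run(main())

The same payloads apply wherever the node runs: the action parameter selects the branch in run(), record_metric requires metric_name and metric_value, and publish_event requires event_type.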