kailash 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/access_control/__init__.py +1 -1
- kailash/core/actors/adaptive_pool_controller.py +630 -0
- kailash/core/actors/connection_actor.py +3 -3
- kailash/core/ml/__init__.py +1 -0
- kailash/core/ml/query_patterns.py +544 -0
- kailash/core/monitoring/__init__.py +19 -0
- kailash/core/monitoring/connection_metrics.py +488 -0
- kailash/core/optimization/__init__.py +1 -0
- kailash/core/resilience/__init__.py +17 -0
- kailash/core/resilience/circuit_breaker.py +382 -0
- kailash/gateway/api.py +7 -5
- kailash/gateway/enhanced_gateway.py +1 -1
- kailash/middleware/auth/access_control.py +11 -11
- kailash/middleware/communication/ai_chat.py +7 -7
- kailash/middleware/communication/api_gateway.py +5 -15
- kailash/middleware/gateway/checkpoint_manager.py +45 -8
- kailash/middleware/gateway/event_store.py +66 -26
- kailash/middleware/mcp/enhanced_server.py +2 -2
- kailash/nodes/admin/permission_check.py +110 -30
- kailash/nodes/admin/schema.sql +387 -0
- kailash/nodes/admin/tenant_isolation.py +249 -0
- kailash/nodes/admin/transaction_utils.py +244 -0
- kailash/nodes/admin/user_management.py +37 -9
- kailash/nodes/ai/ai_providers.py +55 -3
- kailash/nodes/ai/llm_agent.py +115 -13
- kailash/nodes/data/query_pipeline.py +641 -0
- kailash/nodes/data/query_router.py +895 -0
- kailash/nodes/data/sql.py +24 -0
- kailash/nodes/data/workflow_connection_pool.py +451 -23
- kailash/nodes/monitoring/__init__.py +3 -5
- kailash/nodes/monitoring/connection_dashboard.py +822 -0
- kailash/nodes/rag/__init__.py +1 -3
- kailash/resources/registry.py +6 -0
- kailash/runtime/async_local.py +7 -0
- kailash/utils/export.py +152 -0
- kailash/workflow/builder.py +42 -0
- kailash/workflow/graph.py +86 -17
- kailash/workflow/templates.py +4 -9
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/METADATA +14 -1
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/RECORD +45 -31
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/WHEEL +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/top_level.txt +0 -0
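The largest single addition in this release is `kailash/nodes/monitoring/connection_dashboard.py` (+822 lines), whose diff follows. As an orientation aid only (this sketch is not part of the diff), the snippet below shows how the new node might be driven, based on the docstring example and the `execute()` actions visible in the file; the constructor arguments and the `"start"`/`"status"`/`"stop"` actions come from the diff, while wrapping it in your own `asyncio` entrypoint is an assumption.

```python
# Illustrative sketch only, not part of the package diff.
import asyncio

from kailash.nodes.monitoring.connection_dashboard import ConnectionDashboardNode


async def main():
    dashboard = ConnectionDashboardNode(
        name="pool_monitor",
        port=8080,
        update_interval=1.0,
        retention_hours=24,
    )

    # Start the embedded aiohttp server; the result carries the HTTP and WS URLs.
    result = await dashboard.execute({"action": "start"})
    print(result["url"], result["websocket"])

    try:
        await asyncio.sleep(60)          # let it collect metrics for a while
        print(dashboard.get_status())    # running state, clients, active alerts
    finally:
        await dashboard.execute({"action": "stop"})


asyncio.run(main())
```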
kailash/nodes/monitoring/connection_dashboard.py (new file)
@@ -0,0 +1,822 @@

"""Real-time monitoring dashboard for connection pools.

This module provides a web-based dashboard for monitoring connection pool
health, performance metrics, and alerts. It integrates with the metrics
collection system to provide real-time visualization.

Features:
- Real-time pool statistics with WebSocket updates
- Health score visualization with history
- Alert configuration and notifications
- Historical trend analysis with charts
- Export capabilities for reports

Example:
    >>> dashboard = ConnectionDashboardNode(
    ...     name="pool_monitor",
    ...     port=8080,
    ...     update_interval=1.0,
    ...     retention_hours=24
    ... )
    >>>
    >>> # Start dashboard server
    >>> await dashboard.start()
    >>>
    >>> # Access at http://localhost:8080
"""

import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Set

import aiohttp_cors
from aiohttp import web

from kailash.nodes.base import Node, NodeParameter, register_node

logger = logging.getLogger(__name__)


@dataclass
class AlertRule:
    """Alert rule configuration."""

    id: str
    name: str
    condition: str  # e.g., "pool_utilization > 0.9"
    threshold: float
    duration_seconds: int = 60  # How long condition must be true
    cooldown_seconds: int = 300  # Prevent alert spam
    severity: str = "warning"  # info, warning, error, critical
    enabled: bool = True
    last_triggered: Optional[float] = None

    def is_in_cooldown(self) -> bool:
        """Check if alert is in cooldown period."""
        if not self.last_triggered:
            return False
        return (time.time() - self.last_triggered) < self.cooldown_seconds


@dataclass
class Alert:
    """Active alert instance."""

    rule_id: str
    triggered_at: float
    severity: str
    message: str
    metric_value: float
    resolved: bool = False
    resolved_at: Optional[float] = None

    def duration(self) -> float:
        """Get alert duration in seconds."""
        end_time = self.resolved_at or time.time()
        return end_time - self.triggered_at


class MetricsCache:
    """Cache for metrics data with time-based expiration."""

    def __init__(self, retention_hours: int = 24):
        """Initialize metrics cache."""
        self.retention_hours = retention_hours
        self._data: Dict[str, List[Dict[str, Any]]] = {}
        self._last_cleanup = time.time()

    def add(self, metric_name: str, value: Dict[str, Any]):
        """Add metric value to cache."""
        if metric_name not in self._data:
            self._data[metric_name] = []

        value["timestamp"] = time.time()
        self._data[metric_name].append(value)

        # Periodic cleanup
        if time.time() - self._last_cleanup > 3600:  # Every hour
            self._cleanup()

    def get_recent(self, metric_name: str, minutes: int = 60) -> List[Dict[str, Any]]:
        """Get recent metric values."""
        if metric_name not in self._data:
            return []

        cutoff = time.time() - (minutes * 60)
        return [v for v in self._data[metric_name] if v["timestamp"] >= cutoff]

    def _cleanup(self):
        """Remove old data."""
        cutoff = time.time() - (self.retention_hours * 3600)

        for metric_name in list(self._data.keys()):
            self._data[metric_name] = [
                v for v in self._data[metric_name] if v["timestamp"] >= cutoff
            ]

            # Remove empty metrics
            if not self._data[metric_name]:
                del self._data[metric_name]

        self._last_cleanup = time.time()


@register_node()
class ConnectionDashboardNode(Node):
    """Web-based monitoring dashboard for connection pools.

    Provides real-time visualization of connection pool metrics,
    health scores, and alerts through a web interface.
    """

    def __init__(self, **config):
        """Initialize dashboard node.

        Args:
            port: Web server port (default: 8080)
            host: Web server host (default: localhost)
            update_interval: Metric update interval in seconds (default: 1.0)
            retention_hours: How long to keep historical data (default: 24)
            enable_alerts: Enable alert system (default: True)
        """
        self.port = config.get("port", 8080)
        self.host = config.get("host", "localhost")
        self.update_interval = config.get("update_interval", 1.0)
        self.retention_hours = config.get("retention_hours", 24)
        self.enable_alerts = config.get("enable_alerts", True)

        super().__init__(**config)

        # Web server
        self.app = None
        self.runner = None
        self.site = None

        # WebSocket connections
        self._websockets: Set[web.WebSocketResponse] = set()

        # Metrics cache
        self._cache = MetricsCache(self.retention_hours)

        # Alert system
        self._alert_rules: Dict[str, AlertRule] = {}
        self._active_alerts: Dict[str, Alert] = {}
        self._alert_history: List[Alert] = []

        # Update task
        self._update_task: Optional[asyncio.Task] = None

        # Initialize default alert rules
        self._init_default_alerts()

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Get node parameters."""
        return {
            "port": NodeParameter(
                name="port", type=int, default=8080, description="Web server port"
            ),
            "host": NodeParameter(
                name="host",
                type=str,
                default="localhost",
                description="Web server host",
            ),
            "update_interval": NodeParameter(
                name="update_interval",
                type=float,
                default=1.0,
                description="Metric update interval in seconds",
            ),
            "retention_hours": NodeParameter(
                name="retention_hours",
                type=int,
                default=24,
                description="Historical data retention in hours",
            ),
            "enable_alerts": NodeParameter(
                name="enable_alerts",
                type=bool,
                default=True,
                description="Enable alert system",
            ),
            "action": NodeParameter(
                name="action",
                type=str,
                required=False,
                description="Dashboard action",
                choices=["start", "stop", "status"],
            ),
        }

    async def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute dashboard action.

        Actions:
        - start: Start the dashboard server
        - stop: Stop the dashboard server
        - status: Get dashboard status
        """
        action = input_data.get("action", "status")

        if action == "start":
            await self.start()
            return {
                "status": "started",
                "url": f"http://{self.host}:{self.port}",
                "websocket": f"ws://{self.host}:{self.port}/ws",
            }

        elif action == "stop":
            await self.stop()
            return {"status": "stopped"}

        else:
            return self.get_status()

    async def start(self):
        """Start the dashboard web server."""
        if self.app:
            logger.warning("Dashboard already running")
            return

        # Create web app
        self.app = web.Application()

        # Setup CORS
        cors = aiohttp_cors.setup(
            self.app,
            defaults={
                "*": aiohttp_cors.ResourceOptions(
                    allow_credentials=True, expose_headers="*", allow_headers="*"
                )
            },
        )

        # Add routes
        self.app.router.add_get("/", self._handle_index)
        self.app.router.add_get("/api/metrics", self._handle_metrics)
        self.app.router.add_get("/api/pools", self._handle_pools)
        self.app.router.add_get("/api/alerts", self._handle_alerts)
        self.app.router.add_post("/api/alerts", self._handle_create_alert)
        self.app.router.add_delete("/api/alerts/{alert_id}", self._handle_delete_alert)
        self.app.router.add_get("/api/history/{metric_name}", self._handle_history)
        self.app.router.add_get("/ws", self._handle_websocket)

        # Configure CORS for all routes
        for route in list(self.app.router.routes()):
            cors.add(route)

        # Start server
        self.runner = web.AppRunner(self.app)
        await self.runner.setup()
        self.site = web.TCPSite(self.runner, self.host, self.port)
        await self.site.start()

        # Start update task
        self._update_task = asyncio.create_task(self._update_loop())

        logger.info(f"Dashboard started at http://{self.host}:{self.port}")

    async def stop(self):
        """Stop the dashboard web server."""
        # Stop update task
        if self._update_task:
            self._update_task.cancel()
            try:
                await self._update_task
            except asyncio.CancelledError:
                pass

        # Close WebSocket connections
        for ws in list(self._websockets):
            await ws.close()

        # Stop web server
        if self.site:
            await self.site.stop()
        if self.runner:
            await self.runner.cleanup()

        self.app = None
        self.runner = None
        self.site = None

        logger.info("Dashboard stopped")

    async def _handle_index(self, request: web.Request) -> web.Response:
        """Serve dashboard HTML."""
        html = """
<!DOCTYPE html>
<html>
<head>
    <title>Connection Pool Dashboard</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
        .container { max-width: 1200px; margin: 0 auto; }
        .pool-card { background: white; padding: 20px; margin: 10px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .metric { display: inline-block; margin: 10px 20px 10px 0; }
        .metric-label { color: #666; font-size: 12px; }
        .metric-value { font-size: 24px; font-weight: bold; }
        .health-bar { width: 100%; height: 20px; background: #ddd; border-radius: 10px; overflow: hidden; }
        .health-fill { height: 100%; transition: width 0.3s, background-color 0.3s; }
        .health-good { background: #4caf50; }
        .health-warning { background: #ff9800; }
        .health-critical { background: #f44336; }
        .alert { padding: 10px; margin: 5px 0; border-radius: 4px; }
        .alert-warning { background: #fff3cd; border: 1px solid #ffeeba; }
        .alert-error { background: #f8d7da; border: 1px solid #f5c6cb; }
        .alert-critical { background: #d1ecf1; border: 1px solid #bee5eb; }
        .chart { width: 100%; height: 200px; margin: 20px 0; }
    </style>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
</head>
<body>
    <div class="container">
        <h1>Connection Pool Dashboard</h1>

        <div id="alerts"></div>

        <h2>Connection Pools</h2>
        <div id="pools"></div>

        <h2>Metrics History</h2>
        <canvas id="metricsChart" class="chart"></canvas>
    </div>

    <script>
        // WebSocket connection
        const ws = new WebSocket(`ws://${window.location.host}/ws`);

        // Chart setup
        const ctx = document.getElementById('metricsChart').getContext('2d');
        const chart = new Chart(ctx, {
            type: 'line',
            data: {
                labels: [],
                datasets: [{
                    label: 'Pool Utilization',
                    data: [],
                    borderColor: 'rgb(75, 192, 192)',
                    tension: 0.1
                }]
            },
            options: {
                responsive: true,
                scales: {
                    y: {
                        beginAtZero: true,
                        max: 1
                    }
                }
            }
        });

        // Update functions
        function updatePools(pools) {
            const container = document.getElementById('pools');
            container.innerHTML = '';

            for (const [name, pool] of Object.entries(pools)) {
                const healthClass = pool.health_score > 80 ? 'health-good' :
                    pool.health_score > 60 ? 'health-warning' : 'health-critical';

                container.innerHTML += `
                    <div class="pool-card">
                        <h3>${name}</h3>

                        <div class="health-bar">
                            <div class="health-fill ${healthClass}" style="width: ${pool.health_score}%"></div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Active Connections</div>
                            <div class="metric-value">${pool.active_connections}</div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Total Connections</div>
                            <div class="metric-value">${pool.total_connections}</div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Utilization</div>
                            <div class="metric-value">${(pool.utilization * 100).toFixed(1)}%</div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Queries/sec</div>
                            <div class="metric-value">${pool.queries_per_second.toFixed(1)}</div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Avg Query Time</div>
                            <div class="metric-value">${pool.avg_query_time_ms.toFixed(1)}ms</div>
                        </div>

                        <div class="metric">
                            <div class="metric-label">Error Rate</div>
                            <div class="metric-value">${(pool.error_rate * 100).toFixed(2)}%</div>
                        </div>
                    </div>
                `;
            }
        }

        function updateAlerts(alerts) {
            const container = document.getElementById('alerts');
            container.innerHTML = '';

            if (alerts.length === 0) return;

            container.innerHTML = '<h2>Active Alerts</h2>';

            for (const alert of alerts) {
                const alertClass = `alert-${alert.severity}`;
                container.innerHTML += `
                    <div class="alert ${alertClass}">
                        <strong>${alert.message}</strong> -
                        ${new Date(alert.triggered_at * 1000).toLocaleTimeString()}
                    </div>
                `;
            }
        }

        function updateChart(data) {
            // Update chart with latest data
            if (data.timestamp && data.utilization !== undefined) {
                const time = new Date(data.timestamp * 1000).toLocaleTimeString();

                chart.data.labels.push(time);
                chart.data.datasets[0].data.push(data.utilization);

                // Keep only last 60 points
                if (chart.data.labels.length > 60) {
                    chart.data.labels.shift();
                    chart.data.datasets[0].data.shift();
                }

                chart.update('none'); // No animation for real-time
            }
        }

        // WebSocket handlers
        ws.onmessage = (event) => {
            const data = JSON.parse(event.data);

            if (data.type === 'pools') {
                updatePools(data.pools);
            } else if (data.type === 'alerts') {
                updateAlerts(data.alerts);
            } else if (data.type === 'metrics') {
                updateChart(data.data);
            }
        };

        ws.onerror = (error) => {
            console.error('WebSocket error:', error);
        };

        // Initial load
        fetch('/api/pools').then(r => r.json()).then(data => updatePools(data));
        fetch('/api/alerts').then(r => r.json()).then(data => updateAlerts(data.active));
    </script>
</body>
</html>
        """
        return web.Response(text=html, content_type="text/html")

    async def _handle_metrics(self, request: web.Request) -> web.Response:
        """Get current metrics."""
        metrics = await self._collect_metrics()
        return web.json_response(metrics)

    async def _handle_pools(self, request: web.Request) -> web.Response:
        """Get pool information."""
        pools = await self._get_pool_info()
        return web.json_response(pools)

    async def _handle_alerts(self, request: web.Request) -> web.Response:
        """Get alerts."""
        return web.json_response(
            {
                "active": [
                    {
                        "rule_id": alert.rule_id,
                        "triggered_at": alert.triggered_at,
                        "severity": alert.severity,
                        "message": alert.message,
                        "duration": alert.duration(),
                    }
                    for alert in self._active_alerts.values()
                    if not alert.resolved
                ],
                "rules": [
                    {
                        "id": rule.id,
                        "name": rule.name,
                        "condition": rule.condition,
                        "threshold": rule.threshold,
                        "severity": rule.severity,
                        "enabled": rule.enabled,
                    }
                    for rule in self._alert_rules.values()
                ],
                "history": [
                    {
                        "rule_id": alert.rule_id,
                        "triggered_at": alert.triggered_at,
                        "resolved_at": alert.resolved_at,
                        "severity": alert.severity,
                        "message": alert.message,
                        "duration": alert.duration(),
                    }
                    for alert in self._alert_history[-20:]  # Last 20 alerts
                ],
            }
        )

    async def _handle_create_alert(self, request: web.Request) -> web.Response:
        """Create new alert rule."""
        data = await request.json()

        rule = AlertRule(
            id=f"rule_{len(self._alert_rules)}",
            name=data["name"],
            condition=data["condition"],
            threshold=data["threshold"],
            duration_seconds=data.get("duration_seconds", 60),
            severity=data.get("severity", "warning"),
        )

        self._alert_rules[rule.id] = rule

        return web.json_response({"id": rule.id})

    async def _handle_delete_alert(self, request: web.Request) -> web.Response:
        """Delete alert rule."""
        alert_id = request.match_info["alert_id"]

        if alert_id in self._alert_rules:
            del self._alert_rules[alert_id]
            return web.json_response({"deleted": True})

        return web.json_response({"error": "Alert not found"}, status=404)
    async def _handle_history(self, request: web.Request) -> web.Response:
        """Get metric history."""
        metric_name = request.match_info["metric_name"]
        minutes = int(request.query.get("minutes", 60))

        history = self._cache.get_recent(metric_name, minutes)

        return web.json_response(history)

    async def _handle_websocket(self, request: web.Request) -> web.WebSocketResponse:
        """Handle WebSocket connection."""
        ws = web.WebSocketResponse()
        await ws.prepare(request)

        self._websockets.add(ws)

        try:
            # Send initial data
            pools = await self._get_pool_info()
            await ws.send_json({"type": "pools", "pools": pools})

            # Keep connection alive
            async for msg in ws:
                if msg.type == web.WSMsgType.TEXT:
                    # Handle client messages if needed
                    pass
                elif msg.type == web.WSMsgType.ERROR:
                    logger.error(f"WebSocket error: {ws.exception()}")

        finally:
            self._websockets.discard(ws)

        return ws
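Besides the bundled HTML page, the `/ws` endpoint can be consumed directly. The message shapes follow `_update_loop()` and `_broadcast()` further below (`{"type": "pools" | "metrics" | "alerts", ...}`); this consumer sketch is not part of the diff and the URL is an assumption:

```python
# Illustrative consumer sketch only, not part of the package diff.
import asyncio
import json

import aiohttp


async def follow_dashboard():
    async with aiohttp.ClientSession() as session:
        async with session.ws_connect("ws://localhost:8080/ws") as ws:
            async for msg in ws:
                if msg.type != aiohttp.WSMsgType.TEXT:
                    continue
                event = json.loads(msg.data)
                if event["type"] == "pools":
                    for name, pool in event["pools"].items():
                        print(name, pool["utilization"], pool["health_score"])
                elif event["type"] == "alerts":
                    for alert in event["alerts"]:
                        print("ALERT", alert["severity"], alert["message"])


asyncio.run(follow_dashboard())
```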
    async def _update_loop(self):
        """Periodically update metrics and check alerts."""
        while True:
            try:
                # Collect metrics
                metrics = await self._collect_metrics()
                pools = await self._get_pool_info()

                # Update cache
                for pool_name, pool_data in pools.items():
                    self._cache.add(
                        f"{pool_name}_utilization",
                        {"value": pool_data["utilization"], "pool": pool_name},
                    )

                # Check alerts
                if self.enable_alerts:
                    await self._check_alerts(pools)

                # Broadcast to WebSocket clients
                await self._broadcast({"type": "pools", "pools": pools})

                # Send sample metrics for chart
                if pools:
                    first_pool = next(iter(pools.values()))
                    await self._broadcast(
                        {
                            "type": "metrics",
                            "data": {
                                "timestamp": time.time(),
                                "utilization": first_pool["utilization"],
                            },
                        }
                    )

                # Send active alerts
                active_alerts = [
                    {
                        "rule_id": alert.rule_id,
                        "triggered_at": alert.triggered_at,
                        "severity": alert.severity,
                        "message": alert.message,
                    }
                    for alert in self._active_alerts.values()
                    if not alert.resolved
                ]

                if active_alerts:
                    await self._broadcast({"type": "alerts", "alerts": active_alerts})

            except Exception as e:
                logger.error(f"Error in update loop: {e}")

            await asyncio.sleep(self.update_interval)

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Collect metrics from all pools."""
        # This would integrate with the MetricsAggregator
        # For now, return sample data
        return {"timestamp": time.time(), "pools": await self._get_pool_info()}

    async def _get_pool_info(self) -> Dict[str, Dict[str, Any]]:
        """Get information about all pools."""
        # This would get real data from connection pools
        # For now, return sample data

        # Try to get real pools from resource registry
        pools_info = {}

        if hasattr(self, "runtime") and hasattr(self.runtime, "resource_registry"):
            resources = self.runtime.resource_registry.list_resources()

            for name, resource in resources.items():
                if hasattr(resource, "get_pool_statistics"):
                    try:
                        stats = await resource.get_pool_statistics()
                        pools_info[name] = {
                            "health_score": stats.get("health_score", 100),
                            "active_connections": stats.get("active_connections", 0),
                            "total_connections": stats.get("total_connections", 0),
                            "utilization": stats.get("utilization", 0.0),
                            "queries_per_second": stats.get("queries_per_second", 0.0),
                            "avg_query_time_ms": stats.get("avg_query_time_ms", 0.0),
                            "error_rate": stats.get("error_rate", 0.0),
                        }
                    except Exception as e:
                        logger.error(f"Error getting stats for pool {name}: {e}")

        # If no real pools, return sample data
        if not pools_info:
            pools_info = {
                "main_pool": {
                    "health_score": 85,
                    "active_connections": 8,
                    "total_connections": 10,
                    "utilization": 0.8,
                    "queries_per_second": 150.5,
                    "avg_query_time_ms": 12.3,
                    "error_rate": 0.002,
                }
            }

        return pools_info
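As `_get_pool_info()` above shows, pools are discovered through the runtime's resource registry and only need to expose an async `get_pool_statistics()`; missing keys fall back to defaults. A minimal sketch of a resource that would appear on the dashboard (not part of the diff; the class name and numbers are invented for illustration):

```python
# Illustrative sketch only, not part of the package diff.
class InstrumentedPool:
    """Hypothetical resource exposing the statistics the dashboard reads."""

    def __init__(self):
        self.active = 3
        self.size = 10

    async def get_pool_statistics(self) -> dict:
        # Keys consumed by _get_pool_info(); anything missing uses its default.
        return {
            "health_score": 92,
            "active_connections": self.active,
            "total_connections": self.size,
            "utilization": self.active / self.size,
            "queries_per_second": 42.0,
            "avg_query_time_ms": 8.5,
            "error_rate": 0.0,
        }
```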
    async def _check_alerts(self, pools: Dict[str, Dict[str, Any]]):
        """Check alert conditions."""
        for rule in self._alert_rules.values():
            if not rule.enabled or rule.is_in_cooldown():
                continue

            # Simple condition evaluation (in production, use proper parser)
            triggered = False
            metric_value = 0.0

            for pool_name, pool_data in pools.items():
                if (
                    "utilization" in rule.condition
                    and pool_data["utilization"] > rule.threshold
                ):
                    triggered = True
                    metric_value = pool_data["utilization"]
                    break
                elif (
                    "error_rate" in rule.condition
                    and pool_data["error_rate"] > rule.threshold
                ):
                    triggered = True
                    metric_value = pool_data["error_rate"]
                    break

            # Check if alert should be triggered
            alert_key = f"{rule.id}_{int(time.time() / rule.duration_seconds)}"

            if triggered:
                if alert_key not in self._active_alerts:
                    alert = Alert(
                        rule_id=rule.id,
                        triggered_at=time.time(),
                        severity=rule.severity,
                        message=f"{rule.name}: {rule.condition} (value: {metric_value:.2f})",
                        metric_value=metric_value,
                    )

                    self._active_alerts[alert_key] = alert
                    self._alert_history.append(alert)
                    rule.last_triggered = time.time()

                    logger.warning(f"Alert triggered: {alert.message}")
            else:
                # Resolve alert if condition no longer met
                if alert_key in self._active_alerts:
                    alert = self._active_alerts[alert_key]
                    alert.resolved = True
                    alert.resolved_at = time.time()

                    logger.info(f"Alert resolved: {rule.name}")

    async def _broadcast(self, data: Dict[str, Any]):
        """Broadcast data to all WebSocket clients."""
        if not self._websockets:
            return

        # Send to all connected clients
        disconnected = set()

        for ws in self._websockets:
            try:
                await ws.send_json(data)
            except ConnectionResetError:
                disconnected.add(ws)

        # Remove disconnected clients
        self._websockets -= disconnected

    def _init_default_alerts(self):
        """Initialize default alert rules."""
        default_rules = [
            AlertRule(
                id="high_utilization",
                name="High Pool Utilization",
                condition="pool_utilization > 0.9",
                threshold=0.9,
                duration_seconds=60,
                severity="warning",
            ),
            AlertRule(
                id="high_error_rate",
                name="High Error Rate",
                condition="error_rate > 0.05",
                threshold=0.05,
                duration_seconds=30,
                severity="error",
            ),
            AlertRule(
                id="pool_exhausted",
                name="Pool Exhausted",
                condition="pool_utilization >= 1.0",
                threshold=1.0,
                duration_seconds=10,
                severity="critical",
            ),
        ]

        for rule in default_rules:
            self._alert_rules[rule.id] = rule

    def get_status(self) -> Dict[str, Any]:
        """Get dashboard status."""
        return {
            "running": self.app is not None,
            "url": f"http://{self.host}:{self.port}" if self.app else None,
            "websocket_clients": len(self._websockets),
            "active_alerts": len(
                [a for a in self._active_alerts.values() if not a.resolved]
            ),
            "alert_rules": len(self._alert_rules),
            "cached_metrics": len(self._cache._data),
            "update_interval": self.update_interval,
            "retention_hours": self.retention_hours,
        }
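Historical data cached by `_update_loop()` is exposed through `GET /api/history/{metric_name}?minutes=N`, where metric names follow the `f"{pool_name}_utilization"` keys written into the cache. A retrieval sketch, not part of the diff; host, port and the `main_pool` name are assumptions:

```python
# Illustrative sketch only, not part of the package diff.
import asyncio

import aiohttp


async def print_history():
    async with aiohttp.ClientSession() as session:
        url = "http://localhost:8080/api/history/main_pool_utilization"
        async with session.get(url, params={"minutes": "30"}) as resp:
            for point in await resp.json():
                print(point["timestamp"], point["value"])


asyncio.run(print_history())
```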