kailash 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +2 -2
- kailash/api/studio.py +2 -1
- kailash/api/workflow_api.py +2 -1
- kailash/core/resilience/bulkhead.py +21 -6
- kailash/core/resilience/health_monitor.py +578 -0
- kailash/mcp_server/ai_registry_server.py +3 -3
- kailash/mcp_server/server.py +33 -16
- kailash/middleware/communication/api_gateway.py +8 -1
- kailash/nodes/admin/security_event.py +5 -1
- kailash/nodes/admin/user_management.py +1 -0
- kailash/nodes/api/http.py +7 -2
- kailash/nodes/auth/mfa.py +1 -0
- kailash/nodes/base.py +3 -1
- kailash/nodes/cache/cache_invalidation.py +18 -14
- kailash/nodes/code/python.py +5 -2
- kailash/nodes/data/async_sql.py +1956 -129
- kailash/nodes/data/redis.py +52 -23
- kailash/nodes/rag/registry.py +5 -1
- kailash/nodes/security/behavior_analysis.py +1 -0
- kailash/runtime/local.py +9 -3
- kailash/runtime/parameter_injector.py +5 -5
- kailash/workflow/builder.py +19 -15
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/METADATA +1 -1
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/RECORD +28 -27
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/WHEEL +0 -0
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.4.dist-info → kailash-0.6.5.dist-info}/top_level.txt +0 -0
kailash/__init__.py
CHANGED
@@ -3,8 +3,8 @@
 The Kailash SDK provides a comprehensive framework for creating nodes and workflows
 that align with container-node architecture while allowing rapid prototyping.
 
-New in v0.6.
-improvements
+New in v0.6.5: Enterprise AsyncSQL enhancements with optimistic locking, comprehensive
+testing improvements, and production-grade documentation.
 """
 
 from kailash.nodes.base import Node, NodeMetadata, NodeParameter
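The docstring's headline, optimistic locking in AsyncSQL, is worth a quick refresher. Below is a generic sketch of the pattern, not the Kailash AsyncSQL API: it uses sqlite3 for self-containment, and the `accounts` table and `update_balance` helper are illustrative.

```python
import sqlite3

# Generic optimistic-locking pattern: each row carries a version counter and an
# UPDATE succeeds only if the version is unchanged since the row was read.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE accounts (id INTEGER PRIMARY KEY, balance INTEGER, version INTEGER)"
)
conn.execute("INSERT INTO accounts VALUES (1, 100, 0)")


def update_balance(conn, account_id, new_balance, expected_version):
    cur = conn.execute(
        "UPDATE accounts SET balance = ?, version = version + 1 "
        "WHERE id = ? AND version = ?",
        (new_balance, account_id, expected_version),
    )
    if cur.rowcount == 0:
        raise RuntimeError("optimistic lock conflict: row changed since it was read")


balance, version = conn.execute(
    "SELECT balance, version FROM accounts WHERE id = 1"
).fetchone()
update_balance(conn, 1, balance + 50, version)  # succeeds, version becomes 1
try:
    update_balance(conn, 1, 999, version)  # stale version 0 -> conflict
except RuntimeError as exc:
    print(exc)
```

The write succeeds only when the version read earlier is still current, so concurrent writers detect conflicts without holding row locks.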
kailash/api/studio.py
CHANGED
@@ -20,13 +20,14 @@ from typing import Any
 import uvicorn
 from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+
 from kailash.nodes.base import NodeRegistry
 from kailash.runtime.local import LocalRuntime
 from kailash.tracking.manager import TaskManager
 from kailash.tracking.storage.filesystem import FileSystemStorage
 from kailash.utils.export import export_workflow
 from kailash.workflow import Workflow
-from pydantic import BaseModel, Field
 
 from .custom_nodes import setup_custom_node_routes
 from .database import (
kailash/api/workflow_api.py
CHANGED
@@ -13,10 +13,11 @@ from typing import Any
 import uvicorn
 from fastapi import BackgroundTasks, FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
 from kailash.runtime.local import LocalRuntime
 from kailash.workflow.builder import WorkflowBuilder
 from kailash.workflow.graph import Workflow
-from pydantic import BaseModel, Field
 
 
 class ExecutionMode(str, Enum):
kailash/core/resilience/bulkhead.py
CHANGED
@@ -156,12 +156,27 @@ class BulkheadPartition:
         op_timeout = timeout or self.config.timeout
 
         try:
-            # Check
-
-
-
-
-            )
+            # Check if partition can accept new operations
+            async with self._lock:
+                current_active = len(self._active_operations)
+                current_queued = self._operation_queue.qsize()
+
+                # Reject if no queue capacity (queue_size=0) and at capacity
+                if (
+                    self.config.queue_size == 0
+                    and current_active >= self.config.max_concurrent_operations
+                ):
+                    await self._record_rejection("no_queue_capacity")
+                    raise BulkheadRejectionError(
+                        f"Partition {self.config.name} has no queue capacity and is at max concurrent operations"
+                    )
+
+                # Reject if queue is full
+                if self._operation_queue.full():
+                    await self._record_rejection("queue_full")
+                    raise BulkheadRejectionError(
+                        f"Partition {self.config.name} queue is full"
+                    )
 
             # Queue the operation
             await self._operation_queue.put((operation_id, func, args, kwargs))
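Worth noting in the hunk above: with `queue_size=0`, `asyncio.Queue(maxsize=0)` is unbounded and `full()` never fires, which is why the first guard rejects directly once active operations hit `max_concurrent_operations`. A minimal self-contained sketch of the same two rejection rules follows; it mirrors the logic rather than importing the Kailash classes, and `MiniPartition` is illustrative.

```python
import asyncio


class BulkheadRejectionError(Exception):
    """Raised when a partition cannot accept more work."""


class MiniPartition:
    def __init__(self, max_concurrent: int, queue_size: int):
        self.max_concurrent = max_concurrent
        self.queue_size = queue_size
        self.active: set = set()  # ids of operations currently running
        # maxsize=0 means unbounded, so full() below never triggers in that case
        self.queue: asyncio.Queue = asyncio.Queue(maxsize=queue_size)
        self._lock = asyncio.Lock()

    async def submit(self, op_id: str, item) -> None:
        async with self._lock:
            # Rule 1: no queue capacity configured and already at max concurrency
            if self.queue_size == 0 and len(self.active) >= self.max_concurrent:
                raise BulkheadRejectionError(
                    "no queue capacity and at max concurrent operations"
                )
            # Rule 2: the bounded wait queue is full
            if self.queue.full():
                raise BulkheadRejectionError("queue is full")
        await self.queue.put((op_id, item))


async def main():
    partition = MiniPartition(max_concurrent=1, queue_size=0)
    partition.active.add("op-0")  # pretend one operation is already running
    try:
        await partition.submit("op-1", object())
    except BulkheadRejectionError as exc:
        print(f"rejected: {exc}")


asyncio.run(main())
```

Rejecting at submit time keeps the partition's backlog bounded instead of letting callers pile up behind an unbounded queue.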
kailash/core/resilience/health_monitor.py
ADDED
@@ -0,0 +1,578 @@
+"""Connection health monitoring system for enterprise resilience.
+
+This module provides comprehensive health monitoring for database connections,
+Redis connections, and other external services. It integrates with circuit
+breakers and bulkhead patterns to provide enterprise-grade observability.
+
+Features:
+- Real-time health status monitoring
+- Automatic health checks with configurable intervals
+- Integration with circuit breakers and bulkheads
+- Health-based routing and failover
+- Comprehensive metrics collection
+- Alert generation for critical failures
+
+Example:
+    >>> monitor = HealthMonitor()
+    >>> monitor.register_check("database", DatabaseHealthCheck(...))
+    >>> status = await monitor.get_health_status("database")
+    >>> if status.is_healthy:
+    ...     # Proceed with operation
+"""
+
+import asyncio
+import logging
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import UTC, datetime, timedelta
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Set
+from uuid import uuid4
+
+logger = logging.getLogger(__name__)
+
+
+class HealthStatus(Enum):
+    """Health status levels."""
+
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+    UNKNOWN = "unknown"
+
+
+class AlertLevel(Enum):
+    """Alert severity levels."""
+
+    INFO = "info"
+    WARNING = "warning"
+    CRITICAL = "critical"
+    FATAL = "fatal"
+
+
+@dataclass
+class HealthCheckResult:
+    """Result of a health check operation."""
+
+    check_id: str
+    service_name: str
+    status: HealthStatus
+    response_time_ms: float
+    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
+    details: Dict[str, Any] = field(default_factory=dict)
+    error_message: Optional[str] = None
+    is_healthy: bool = field(init=False)
+
+    def __post_init__(self):
+        """Calculate health status."""
+        self.is_healthy = self.status in [HealthStatus.HEALTHY, HealthStatus.DEGRADED]
+
+
+@dataclass
+class HealthMetrics:
+    """Health monitoring metrics."""
+
+    total_checks: int = 0
+    successful_checks: int = 0
+    failed_checks: int = 0
+    avg_response_time_ms: float = 0.0
+    max_response_time_ms: float = 0.0
+    uptime_percentage: float = 100.0
+    consecutive_failures: int = 0
+    last_successful_check: Optional[datetime] = None
+    last_failed_check: Optional[datetime] = None
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+
+
+@dataclass
+class HealthAlert:
+    """Health monitoring alert."""
+
+    alert_id: str = field(default_factory=lambda: str(uuid4()))
+    service_name: str = ""
+    level: AlertLevel = AlertLevel.INFO
+    message: str = ""
+    details: Dict[str, Any] = field(default_factory=dict)
+    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
+    resolved: bool = False
+    resolved_at: Optional[datetime] = None
+
+
+class HealthCheck(ABC):
+    """Abstract base class for health checks."""
+
+    def __init__(self, name: str, timeout: float = 5.0, critical: bool = True):
+        """Initialize health check.
+
+        Args:
+            name: Name of the service being checked
+            timeout: Timeout for health check in seconds
+            critical: Whether this check is critical for overall health
+        """
+        self.name = name
+        self.timeout = timeout
+        self.critical = critical
+
+    @abstractmethod
+    async def check_health(self) -> HealthCheckResult:
+        """Perform health check and return result."""
+        pass
+
+
+class DatabaseHealthCheck(HealthCheck):
+    """Health check for database connections."""
+
+    def __init__(self, name: str, connection_string: str, **kwargs):
+        """Initialize database health check."""
+        super().__init__(name, **kwargs)
+        self.connection_string = connection_string
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check database health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            # Import SQL node for health checking
+            from src.kailash.nodes.data.sql import SQLDatabaseNode
+
+            sql_node = SQLDatabaseNode(connection_string=self.connection_string)
+
+            # Execute simple health check query
+            result = await asyncio.wait_for(
+                asyncio.to_thread(sql_node.execute, query="SELECT 1 as health_check"),
+                timeout=self.timeout,
+            )
+
+            response_time = (time.time() - start_time) * 1000
+
+            if "data" in result and len(result["data"]) > 0:
+                return HealthCheckResult(
+                    check_id=check_id,
+                    service_name=self.name,
+                    status=HealthStatus.HEALTHY,
+                    response_time_ms=response_time,
+                    details={
+                        "query_executed": True,
+                        "rows_returned": len(result["data"]),
+                        "execution_time": result.get("execution_time", 0),
+                    },
+                )
+            else:
+                return HealthCheckResult(
+                    check_id=check_id,
+                    service_name=self.name,
+                    status=HealthStatus.DEGRADED,
+                    response_time_ms=response_time,
+                    details={"query_executed": True, "rows_returned": 0},
+                    error_message="Query returned no data",
+                )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"Health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class RedisHealthCheck(HealthCheck):
+    """Health check for Redis connections."""
+
+    def __init__(self, name: str, redis_config: Dict[str, Any], **kwargs):
+        """Initialize Redis health check."""
+        super().__init__(name, **kwargs)
+        self.redis_config = redis_config
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check Redis health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            import redis
+
+            # Create Redis client
+            client = redis.Redis(**self.redis_config)
+
+            # Execute ping command
+            await asyncio.wait_for(asyncio.to_thread(client.ping), timeout=self.timeout)
+
+            # Get Redis info
+            info = await asyncio.to_thread(client.info)
+
+            response_time = (time.time() - start_time) * 1000
+
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.HEALTHY,
+                response_time_ms=response_time,
+                details={
+                    "ping_successful": True,
+                    "connected_clients": info.get("connected_clients", 0),
+                    "used_memory": info.get("used_memory", 0),
+                    "redis_version": info.get("redis_version", "unknown"),
+                },
+            )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"Redis health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class HTTPHealthCheck(HealthCheck):
+    """Health check for HTTP endpoints."""
+
+    def __init__(self, name: str, url: str, expected_status: int = 200, **kwargs):
+        """Initialize HTTP health check."""
+        super().__init__(name, **kwargs)
+        self.url = url
+        self.expected_status = expected_status
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check HTTP endpoint health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            import httpx
+
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.get(self.url)
+
+            response_time = (time.time() - start_time) * 1000
+
+            if response.status_code == self.expected_status:
+                status = HealthStatus.HEALTHY
+            elif 200 <= response.status_code < 300:
+                status = HealthStatus.DEGRADED
+            else:
+                status = HealthStatus.UNHEALTHY
+
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=status,
+                response_time_ms=response_time,
+                details={
+                    "status_code": response.status_code,
+                    "expected_status": self.expected_status,
+                    "content_length": len(response.content),
+                },
+            )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"HTTP health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class HealthMonitor:
+    """Enterprise health monitoring system."""
+
+    def __init__(self, check_interval: float = 30.0, alert_threshold: int = 3):
+        """Initialize health monitor.
+
+        Args:
+            check_interval: Interval between health checks in seconds
+            alert_threshold: Number of consecutive failures before alerting
+        """
+        self.check_interval = check_interval
+        self.alert_threshold = alert_threshold
+        self.health_checks: Dict[str, HealthCheck] = {}
+        self.metrics: Dict[str, HealthMetrics] = {}
+        self.alerts: List[HealthAlert] = []
+        self.alert_callbacks: List[Callable[[HealthAlert], None]] = []
+        self._monitoring_task: Optional[asyncio.Task] = None
+        self._running = False
+        self._lock = asyncio.Lock()
+
+        logger.info("Initialized HealthMonitor")
+
+    def register_check(self, service_name: str, health_check: HealthCheck):
+        """Register a health check."""
+        self.health_checks[service_name] = health_check
+        self.metrics[service_name] = HealthMetrics()
+        logger.info(f"Registered health check for service: {service_name}")
+
+    def register_alert_callback(self, callback: Callable[[HealthAlert], None]):
+        """Register callback for health alerts."""
+        self.alert_callbacks.append(callback)
+
+    async def check_service_health(self, service_name: str) -> HealthCheckResult:
+        """Perform health check for specific service."""
+        if service_name not in self.health_checks:
+            raise ValueError(f"No health check registered for service: {service_name}")
+
+        health_check = self.health_checks[service_name]
+        result = await health_check.check_health()
+
+        # Update metrics
+        await self._update_metrics(service_name, result)
+
+        # Check for alerts
+        await self._check_alerts(service_name, result)
+
+        return result
+
+    async def get_health_status(self, service_name: str) -> Optional[HealthCheckResult]:
+        """Get latest health status for service."""
+        return await self.check_service_health(service_name)
+
+    async def get_all_health_status(self) -> Dict[str, HealthCheckResult]:
+        """Get health status for all registered services."""
+        results = {}
+        for service_name in self.health_checks:
+            try:
+                results[service_name] = await self.check_service_health(service_name)
+            except Exception as e:
+                logger.error(f"Failed to check health for {service_name}: {e}")
+                results[service_name] = HealthCheckResult(
+                    check_id=str(uuid4()),
+                    service_name=service_name,
+                    status=HealthStatus.UNKNOWN,
+                    response_time_ms=0.0,
+                    error_message=str(e),
+                )
+        return results
+
+    async def get_overall_health(self) -> HealthStatus:
+        """Get overall system health status."""
+        all_status = await self.get_all_health_status()
+
+        if not all_status:
+            return HealthStatus.UNKNOWN
+
+        critical_services = [
+            name for name, check in self.health_checks.items() if check.critical
+        ]
+
+        # Check critical services first
+        critical_unhealthy = any(
+            all_status[name].status == HealthStatus.UNHEALTHY
+            for name in critical_services
+            if name in all_status
+        )
+
+        if critical_unhealthy:
+            return HealthStatus.UNHEALTHY
+
+        # Check if any service is degraded
+        any_degraded = any(
+            result.status == HealthStatus.DEGRADED for result in all_status.values()
+        )
+
+        if any_degraded:
+            return HealthStatus.DEGRADED
+
+        # Check if all are healthy
+        all_healthy = all(
+            result.status == HealthStatus.HEALTHY for result in all_status.values()
+        )
+
+        return HealthStatus.HEALTHY if all_healthy else HealthStatus.UNKNOWN
+
+    async def get_metrics(self, service_name: str) -> Optional[HealthMetrics]:
+        """Get metrics for specific service."""
+        return self.metrics.get(service_name)
+
+    async def get_all_metrics(self) -> Dict[str, HealthMetrics]:
+        """Get metrics for all services."""
+        return self.metrics.copy()
+
+    async def get_alerts(self, resolved: Optional[bool] = None) -> List[HealthAlert]:
+        """Get health alerts."""
+        if resolved is None:
+            return self.alerts.copy()
+        return [alert for alert in self.alerts if alert.resolved == resolved]
+
+    async def start_monitoring(self):
+        """Start continuous health monitoring."""
+        if self._running:
+            logger.warning("Health monitoring already running")
+            return
+
+        self._running = True
+        self._monitoring_task = asyncio.create_task(self._monitoring_loop())
+        logger.info("Started health monitoring")
+
+    async def stop_monitoring(self):
+        """Stop continuous health monitoring."""
+        if not self._running:
+            return
+
+        self._running = False
+        if self._monitoring_task:
+            self._monitoring_task.cancel()
+            try:
+                await self._monitoring_task
+            except asyncio.CancelledError:
+                pass
+
+        logger.info("Stopped health monitoring")
+
+    async def _monitoring_loop(self):
+        """Main monitoring loop."""
+        while self._running:
+            try:
+                # Check all services
+                await self.get_all_health_status()
+
+                # Wait for next check interval
+                await asyncio.sleep(self.check_interval)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Error in monitoring loop: {e}")
+                await asyncio.sleep(min(self.check_interval, 10))  # Fallback interval
+
+    async def _update_metrics(self, service_name: str, result: HealthCheckResult):
+        """Update metrics for service."""
+        async with self._lock:
+            metrics = self.metrics[service_name]
+
+            metrics.total_checks += 1
+
+            if result.is_healthy:
+                metrics.successful_checks += 1
+                metrics.consecutive_failures = 0
+                metrics.last_successful_check = result.timestamp
+            else:
+                metrics.failed_checks += 1
+                metrics.consecutive_failures += 1
+                metrics.last_failed_check = result.timestamp
+
+            # Update response time metrics
+            if metrics.total_checks == 1:
+                metrics.avg_response_time_ms = result.response_time_ms
+            else:
+                metrics.avg_response_time_ms = (
+                    metrics.avg_response_time_ms * (metrics.total_checks - 1)
+                    + result.response_time_ms
+                ) / metrics.total_checks
+
+            if result.response_time_ms > metrics.max_response_time_ms:
+                metrics.max_response_time_ms = result.response_time_ms
+
+            # Update uptime percentage
+            metrics.uptime_percentage = (
+                metrics.successful_checks / metrics.total_checks
+            ) * 100
+
+    async def _check_alerts(self, service_name: str, result: HealthCheckResult):
+        """Check if alerts should be generated."""
+        metrics = self.metrics[service_name]
+
+        # Check for consecutive failure threshold
+        if metrics.consecutive_failures >= self.alert_threshold:
+            await self._generate_alert(
+                service_name,
+                AlertLevel.CRITICAL,
+                f"Service {service_name} has {metrics.consecutive_failures} consecutive failures",
+                {
+                    "consecutive_failures": metrics.consecutive_failures,
+                    "last_error": result.error_message,
+                    "health_status": result.status.value,
+                },
+            )
+
+        # Check for high response times
+        if result.response_time_ms > 5000:  # 5 seconds
+            await self._generate_alert(
+                service_name,
+                AlertLevel.WARNING,
+                f"High response time for {service_name}: {result.response_time_ms:.2f}ms",
+                {
+                    "response_time_ms": result.response_time_ms,
+                    "avg_response_time_ms": metrics.avg_response_time_ms,
+                },
+            )
+
+    async def _generate_alert(
+        self,
+        service_name: str,
+        level: AlertLevel,
+        message: str,
+        details: Dict[str, Any],
+    ):
+        """Generate health alert."""
+        alert = HealthAlert(
+            service_name=service_name, level=level, message=message, details=details
+        )
+
+        self.alerts.append(alert)
+
+        # Call alert callbacks
+        for callback in self.alert_callbacks:
+            try:
+                callback(alert)
+            except Exception as e:
+                logger.error(f"Error in alert callback: {e}")
+
+        logger.warning(f"Health alert generated: {message}")
+
+
+# Global health monitor instance
+_health_monitor: Optional[HealthMonitor] = None
+
+
+def get_health_monitor() -> HealthMonitor:
+    """Get global health monitor instance."""
+    global _health_monitor
+    if _health_monitor is None:
+        _health_monitor = HealthMonitor()
+    return _health_monitor
+
+
+async def quick_health_check(service_name: str) -> bool:
+    """Quick health check for a service."""
+    monitor = get_health_monitor()
+    try:
+        result = await monitor.get_health_status(service_name)
+        return result.is_healthy if result else False
+    except Exception:
+        return False
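Since the whole module is new, a short usage sketch may help orient readers. The connection string, Redis config, and URL below are placeholders, but every class and method comes from the file above:

```python
import asyncio

from kailash.core.resilience.health_monitor import (
    DatabaseHealthCheck,
    HealthMonitor,
    HTTPHealthCheck,
    RedisHealthCheck,
)


async def main():
    monitor = HealthMonitor(check_interval=15.0, alert_threshold=3)

    # Register one check per dependency (endpoints are placeholders)
    monitor.register_check(
        "database", DatabaseHealthCheck("database", "postgresql://localhost/app")
    )
    monitor.register_check(
        "cache", RedisHealthCheck("cache", {"host": "localhost", "port": 6379})
    )
    monitor.register_check(
        "api", HTTPHealthCheck("api", "http://localhost:8000/health", critical=False)
    )

    # CRITICAL alerts fire after alert_threshold consecutive failures
    monitor.register_alert_callback(
        lambda alert: print(f"[{alert.level.value}] {alert.message}")
    )

    await monitor.start_monitoring()
    overall = await monitor.get_overall_health()
    print(f"overall health: {overall.value}")
    await monitor.stop_monitoring()


asyncio.run(main())
```

One design note: `get_health_status()` runs the check inline rather than returning a cached result, so every call incurs a round trip to the dependency.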
kailash/mcp_server/ai_registry_server.py
CHANGED
@@ -21,7 +21,7 @@ except ImportError:
     # Fallback if official MCP is broken
     print("Warning: Official MCP server not available, using fallback")
     from kailash.mcp_server.server import MCPServerBase as Server
-
+
     # Minimal type definitions for fallback
     class Resource:
         def __init__(self, uri, name, description, mimeType=None):
@@ -29,12 +29,12 @@ except ImportError:
             self.name = name
             self.description = description
             self.mimeType = mimeType
-
+
     class TextContent:
         def __init__(self, type, text):
             self.type = type
             self.text = text
-
+
    class Tool:
        def __init__(self, name, description, inputSchema):
            self.name = name