kailash 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +3 -3
- kailash/api/custom_nodes_secure.py +3 -3
- kailash/api/gateway.py +1 -1
- kailash/api/studio.py +1 -1
- kailash/api/workflow_api.py +2 -2
- kailash/core/resilience/bulkhead.py +475 -0
- kailash/core/resilience/circuit_breaker.py +92 -10
- kailash/core/resilience/health_monitor.py +578 -0
- kailash/edge/discovery.py +86 -0
- kailash/mcp_server/__init__.py +309 -33
- kailash/mcp_server/advanced_features.py +1022 -0
- kailash/mcp_server/ai_registry_server.py +27 -2
- kailash/mcp_server/auth.py +789 -0
- kailash/mcp_server/client.py +645 -378
- kailash/mcp_server/discovery.py +1593 -0
- kailash/mcp_server/errors.py +673 -0
- kailash/mcp_server/oauth.py +1727 -0
- kailash/mcp_server/protocol.py +1126 -0
- kailash/mcp_server/registry_integration.py +587 -0
- kailash/mcp_server/server.py +1228 -96
- kailash/mcp_server/transports.py +1169 -0
- kailash/mcp_server/utils/__init__.py +6 -1
- kailash/mcp_server/utils/cache.py +250 -7
- kailash/middleware/auth/auth_manager.py +3 -3
- kailash/middleware/communication/api_gateway.py +1 -1
- kailash/middleware/communication/realtime.py +1 -1
- kailash/middleware/mcp/enhanced_server.py +1 -1
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/admin/audit_log.py +6 -6
- kailash/nodes/admin/permission_check.py +8 -8
- kailash/nodes/admin/role_management.py +32 -28
- kailash/nodes/admin/schema.sql +6 -1
- kailash/nodes/admin/schema_manager.py +13 -13
- kailash/nodes/admin/security_event.py +15 -15
- kailash/nodes/admin/tenant_isolation.py +3 -3
- kailash/nodes/admin/transaction_utils.py +3 -3
- kailash/nodes/admin/user_management.py +21 -21
- kailash/nodes/ai/a2a.py +11 -11
- kailash/nodes/ai/ai_providers.py +9 -12
- kailash/nodes/ai/embedding_generator.py +13 -14
- kailash/nodes/ai/intelligent_agent_orchestrator.py +19 -19
- kailash/nodes/ai/iterative_llm_agent.py +2 -2
- kailash/nodes/ai/llm_agent.py +210 -33
- kailash/nodes/ai/self_organizing.py +2 -2
- kailash/nodes/alerts/discord.py +4 -4
- kailash/nodes/api/graphql.py +6 -6
- kailash/nodes/api/http.py +10 -10
- kailash/nodes/api/rate_limiting.py +4 -4
- kailash/nodes/api/rest.py +15 -15
- kailash/nodes/auth/mfa.py +3 -3
- kailash/nodes/auth/risk_assessment.py +2 -2
- kailash/nodes/auth/session_management.py +5 -5
- kailash/nodes/auth/sso.py +143 -0
- kailash/nodes/base.py +8 -2
- kailash/nodes/base_async.py +16 -2
- kailash/nodes/base_with_acl.py +2 -2
- kailash/nodes/cache/__init__.py +9 -0
- kailash/nodes/cache/cache.py +1172 -0
- kailash/nodes/cache/cache_invalidation.py +874 -0
- kailash/nodes/cache/redis_pool_manager.py +595 -0
- kailash/nodes/code/async_python.py +2 -1
- kailash/nodes/code/python.py +194 -30
- kailash/nodes/compliance/data_retention.py +6 -6
- kailash/nodes/compliance/gdpr.py +5 -5
- kailash/nodes/data/__init__.py +10 -0
- kailash/nodes/data/async_sql.py +1956 -129
- kailash/nodes/data/optimistic_locking.py +906 -0
- kailash/nodes/data/readers.py +8 -8
- kailash/nodes/data/redis.py +378 -0
- kailash/nodes/data/sql.py +314 -3
- kailash/nodes/data/streaming.py +21 -0
- kailash/nodes/enterprise/__init__.py +8 -0
- kailash/nodes/enterprise/audit_logger.py +285 -0
- kailash/nodes/enterprise/batch_processor.py +22 -3
- kailash/nodes/enterprise/data_lineage.py +1 -1
- kailash/nodes/enterprise/mcp_executor.py +205 -0
- kailash/nodes/enterprise/service_discovery.py +150 -0
- kailash/nodes/enterprise/tenant_assignment.py +108 -0
- kailash/nodes/logic/async_operations.py +2 -2
- kailash/nodes/logic/convergence.py +1 -1
- kailash/nodes/logic/operations.py +1 -1
- kailash/nodes/monitoring/__init__.py +11 -1
- kailash/nodes/monitoring/health_check.py +456 -0
- kailash/nodes/monitoring/log_processor.py +817 -0
- kailash/nodes/monitoring/metrics_collector.py +627 -0
- kailash/nodes/monitoring/performance_benchmark.py +137 -11
- kailash/nodes/rag/advanced.py +7 -7
- kailash/nodes/rag/agentic.py +49 -2
- kailash/nodes/rag/conversational.py +3 -3
- kailash/nodes/rag/evaluation.py +3 -3
- kailash/nodes/rag/federated.py +3 -3
- kailash/nodes/rag/graph.py +3 -3
- kailash/nodes/rag/multimodal.py +3 -3
- kailash/nodes/rag/optimized.py +5 -5
- kailash/nodes/rag/privacy.py +3 -3
- kailash/nodes/rag/query_processing.py +6 -6
- kailash/nodes/rag/realtime.py +1 -1
- kailash/nodes/rag/registry.py +1 -1
- kailash/nodes/rag/router.py +1 -1
- kailash/nodes/rag/similarity.py +7 -7
- kailash/nodes/rag/strategies.py +4 -4
- kailash/nodes/security/abac_evaluator.py +6 -6
- kailash/nodes/security/behavior_analysis.py +5 -5
- kailash/nodes/security/credential_manager.py +1 -1
- kailash/nodes/security/rotating_credentials.py +11 -11
- kailash/nodes/security/threat_detection.py +8 -8
- kailash/nodes/testing/credential_testing.py +2 -2
- kailash/nodes/transform/processors.py +5 -5
- kailash/runtime/local.py +163 -9
- kailash/runtime/parameter_injection.py +425 -0
- kailash/runtime/parameter_injector.py +657 -0
- kailash/runtime/testing.py +2 -2
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +99 -14
- kailash/workflow/builder_improvements.py +207 -0
- kailash/workflow/input_handling.py +170 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/METADATA +22 -9
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/RECORD +122 -95
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/WHEEL +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/top_level.txt +0 -0
kailash/core/resilience/health_monitor.py ADDED
@@ -0,0 +1,578 @@
+"""Connection health monitoring system for enterprise resilience.
+
+This module provides comprehensive health monitoring for database connections,
+Redis connections, and other external services. It integrates with circuit
+breakers and bulkhead patterns to provide enterprise-grade observability.
+
+Features:
+- Real-time health status monitoring
+- Automatic health checks with configurable intervals
+- Integration with circuit breakers and bulkheads
+- Health-based routing and failover
+- Comprehensive metrics collection
+- Alert generation for critical failures
+
+Example:
+    >>> monitor = HealthMonitor()
+    >>> monitor.register_check("database", DatabaseHealthCheck(...))
+    >>> status = await monitor.get_health_status("database")
+    >>> if status.is_healthy:
+    ...     # Proceed with operation
+"""
+
+import asyncio
+import logging
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import UTC, datetime, timedelta
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Set
+from uuid import uuid4
+
+logger = logging.getLogger(__name__)
+
+
+class HealthStatus(Enum):
+    """Health status levels."""
+
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+    UNKNOWN = "unknown"
+
+
+class AlertLevel(Enum):
+    """Alert severity levels."""
+
+    INFO = "info"
+    WARNING = "warning"
+    CRITICAL = "critical"
+    FATAL = "fatal"
+
+
+@dataclass
+class HealthCheckResult:
+    """Result of a health check operation."""
+
+    check_id: str
+    service_name: str
+    status: HealthStatus
+    response_time_ms: float
+    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
+    details: Dict[str, Any] = field(default_factory=dict)
+    error_message: Optional[str] = None
+    is_healthy: bool = field(init=False)
+
+    def __post_init__(self):
+        """Calculate health status."""
+        self.is_healthy = self.status in [HealthStatus.HEALTHY, HealthStatus.DEGRADED]
+
+
+@dataclass
+class HealthMetrics:
+    """Health monitoring metrics."""
+
+    total_checks: int = 0
+    successful_checks: int = 0
+    failed_checks: int = 0
+    avg_response_time_ms: float = 0.0
+    max_response_time_ms: float = 0.0
+    uptime_percentage: float = 100.0
+    consecutive_failures: int = 0
+    last_successful_check: Optional[datetime] = None
+    last_failed_check: Optional[datetime] = None
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+
+
+@dataclass
+class HealthAlert:
+    """Health monitoring alert."""
+
+    alert_id: str = field(default_factory=lambda: str(uuid4()))
+    service_name: str = ""
+    level: AlertLevel = AlertLevel.INFO
+    message: str = ""
+    details: Dict[str, Any] = field(default_factory=dict)
+    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
+    resolved: bool = False
+    resolved_at: Optional[datetime] = None
+
+
+class HealthCheck(ABC):
+    """Abstract base class for health checks."""
+
+    def __init__(self, name: str, timeout: float = 5.0, critical: bool = True):
+        """Initialize health check.
+
+        Args:
+            name: Name of the service being checked
+            timeout: Timeout for health check in seconds
+            critical: Whether this check is critical for overall health
+        """
+        self.name = name
+        self.timeout = timeout
+        self.critical = critical
+
+    @abstractmethod
+    async def check_health(self) -> HealthCheckResult:
+        """Perform health check and return result."""
+        pass
+
+
+class DatabaseHealthCheck(HealthCheck):
+    """Health check for database connections."""
+
+    def __init__(self, name: str, connection_string: str, **kwargs):
+        """Initialize database health check."""
+        super().__init__(name, **kwargs)
+        self.connection_string = connection_string
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check database health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            # Import SQL node for health checking
+            from src.kailash.nodes.data.sql import SQLDatabaseNode
+
+            sql_node = SQLDatabaseNode(connection_string=self.connection_string)
+
+            # Execute simple health check query
+            result = await asyncio.wait_for(
+                asyncio.to_thread(sql_node.execute, query="SELECT 1 as health_check"),
+                timeout=self.timeout,
+            )
+
+            response_time = (time.time() - start_time) * 1000
+
+            if "data" in result and len(result["data"]) > 0:
+                return HealthCheckResult(
+                    check_id=check_id,
+                    service_name=self.name,
+                    status=HealthStatus.HEALTHY,
+                    response_time_ms=response_time,
+                    details={
+                        "query_executed": True,
+                        "rows_returned": len(result["data"]),
+                        "execution_time": result.get("execution_time", 0),
+                    },
+                )
+            else:
+                return HealthCheckResult(
+                    check_id=check_id,
+                    service_name=self.name,
+                    status=HealthStatus.DEGRADED,
+                    response_time_ms=response_time,
+                    details={"query_executed": True, "rows_returned": 0},
+                    error_message="Query returned no data",
+                )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"Health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class RedisHealthCheck(HealthCheck):
+    """Health check for Redis connections."""
+
+    def __init__(self, name: str, redis_config: Dict[str, Any], **kwargs):
+        """Initialize Redis health check."""
+        super().__init__(name, **kwargs)
+        self.redis_config = redis_config
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check Redis health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            import redis
+
+            # Create Redis client
+            client = redis.Redis(**self.redis_config)
+
+            # Execute ping command
+            await asyncio.wait_for(asyncio.to_thread(client.ping), timeout=self.timeout)
+
+            # Get Redis info
+            info = await asyncio.to_thread(client.info)
+
+            response_time = (time.time() - start_time) * 1000
+
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.HEALTHY,
+                response_time_ms=response_time,
+                details={
+                    "ping_successful": True,
+                    "connected_clients": info.get("connected_clients", 0),
+                    "used_memory": info.get("used_memory", 0),
+                    "redis_version": info.get("redis_version", "unknown"),
+                },
+            )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"Redis health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class HTTPHealthCheck(HealthCheck):
+    """Health check for HTTP endpoints."""
+
+    def __init__(self, name: str, url: str, expected_status: int = 200, **kwargs):
+        """Initialize HTTP health check."""
+        super().__init__(name, **kwargs)
+        self.url = url
+        self.expected_status = expected_status
+
+    async def check_health(self) -> HealthCheckResult:
+        """Check HTTP endpoint health."""
+        start_time = time.time()
+        check_id = str(uuid4())
+
+        try:
+            import httpx
+
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.get(self.url)
+
+            response_time = (time.time() - start_time) * 1000
+
+            if response.status_code == self.expected_status:
+                status = HealthStatus.HEALTHY
+            elif 200 <= response.status_code < 300:
+                status = HealthStatus.DEGRADED
+            else:
+                status = HealthStatus.UNHEALTHY
+
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=status,
+                response_time_ms=response_time,
+                details={
+                    "status_code": response.status_code,
+                    "expected_status": self.expected_status,
+                    "content_length": len(response.content),
+                },
+            )
+
+        except asyncio.TimeoutError:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=f"HTTP health check timed out after {self.timeout}s",
+            )
+        except Exception as e:
+            response_time = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                check_id=check_id,
+                service_name=self.name,
+                status=HealthStatus.UNHEALTHY,
+                response_time_ms=response_time,
+                error_message=str(e),
+            )
+
+
+class HealthMonitor:
+    """Enterprise health monitoring system."""
+
+    def __init__(self, check_interval: float = 30.0, alert_threshold: int = 3):
+        """Initialize health monitor.
+
+        Args:
+            check_interval: Interval between health checks in seconds
+            alert_threshold: Number of consecutive failures before alerting
+        """
+        self.check_interval = check_interval
+        self.alert_threshold = alert_threshold
+        self.health_checks: Dict[str, HealthCheck] = {}
+        self.metrics: Dict[str, HealthMetrics] = {}
+        self.alerts: List[HealthAlert] = []
+        self.alert_callbacks: List[Callable[[HealthAlert], None]] = []
+        self._monitoring_task: Optional[asyncio.Task] = None
+        self._running = False
+        self._lock = asyncio.Lock()
+
+        logger.info("Initialized HealthMonitor")
+
+    def register_check(self, service_name: str, health_check: HealthCheck):
+        """Register a health check."""
+        self.health_checks[service_name] = health_check
+        self.metrics[service_name] = HealthMetrics()
+        logger.info(f"Registered health check for service: {service_name}")
+
+    def register_alert_callback(self, callback: Callable[[HealthAlert], None]):
+        """Register callback for health alerts."""
+        self.alert_callbacks.append(callback)
+
+    async def check_service_health(self, service_name: str) -> HealthCheckResult:
+        """Perform health check for specific service."""
+        if service_name not in self.health_checks:
+            raise ValueError(f"No health check registered for service: {service_name}")
+
+        health_check = self.health_checks[service_name]
+        result = await health_check.check_health()
+
+        # Update metrics
+        await self._update_metrics(service_name, result)
+
+        # Check for alerts
+        await self._check_alerts(service_name, result)
+
+        return result
+
+    async def get_health_status(self, service_name: str) -> Optional[HealthCheckResult]:
+        """Get latest health status for service."""
+        return await self.check_service_health(service_name)
+
+    async def get_all_health_status(self) -> Dict[str, HealthCheckResult]:
+        """Get health status for all registered services."""
+        results = {}
+        for service_name in self.health_checks:
+            try:
+                results[service_name] = await self.check_service_health(service_name)
+            except Exception as e:
+                logger.error(f"Failed to check health for {service_name}: {e}")
+                results[service_name] = HealthCheckResult(
+                    check_id=str(uuid4()),
+                    service_name=service_name,
+                    status=HealthStatus.UNKNOWN,
+                    response_time_ms=0.0,
+                    error_message=str(e),
+                )
+        return results
+
+    async def get_overall_health(self) -> HealthStatus:
+        """Get overall system health status."""
+        all_status = await self.get_all_health_status()
+
+        if not all_status:
+            return HealthStatus.UNKNOWN
+
+        critical_services = [
+            name for name, check in self.health_checks.items() if check.critical
+        ]
+
+        # Check critical services first
+        critical_unhealthy = any(
+            all_status[name].status == HealthStatus.UNHEALTHY
+            for name in critical_services
+            if name in all_status
+        )
+
+        if critical_unhealthy:
+            return HealthStatus.UNHEALTHY
+
+        # Check if any service is degraded
+        any_degraded = any(
+            result.status == HealthStatus.DEGRADED for result in all_status.values()
+        )
+
+        if any_degraded:
+            return HealthStatus.DEGRADED
+
+        # Check if all are healthy
+        all_healthy = all(
+            result.status == HealthStatus.HEALTHY for result in all_status.values()
+        )
+
+        return HealthStatus.HEALTHY if all_healthy else HealthStatus.UNKNOWN
+
+    async def get_metrics(self, service_name: str) -> Optional[HealthMetrics]:
+        """Get metrics for specific service."""
+        return self.metrics.get(service_name)
+
+    async def get_all_metrics(self) -> Dict[str, HealthMetrics]:
+        """Get metrics for all services."""
+        return self.metrics.copy()
+
+    async def get_alerts(self, resolved: Optional[bool] = None) -> List[HealthAlert]:
+        """Get health alerts."""
+        if resolved is None:
+            return self.alerts.copy()
+        return [alert for alert in self.alerts if alert.resolved == resolved]
+
+    async def start_monitoring(self):
+        """Start continuous health monitoring."""
+        if self._running:
+            logger.warning("Health monitoring already running")
+            return
+
+        self._running = True
+        self._monitoring_task = asyncio.create_task(self._monitoring_loop())
+        logger.info("Started health monitoring")
+
+    async def stop_monitoring(self):
+        """Stop continuous health monitoring."""
+        if not self._running:
+            return
+
+        self._running = False
+        if self._monitoring_task:
+            self._monitoring_task.cancel()
+            try:
+                await self._monitoring_task
+            except asyncio.CancelledError:
+                pass
+
+        logger.info("Stopped health monitoring")
+
+    async def _monitoring_loop(self):
+        """Main monitoring loop."""
+        while self._running:
+            try:
+                # Check all services
+                await self.get_all_health_status()
+
+                # Wait for next check interval
+                await asyncio.sleep(self.check_interval)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Error in monitoring loop: {e}")
+                await asyncio.sleep(min(self.check_interval, 10))  # Fallback interval
+
+    async def _update_metrics(self, service_name: str, result: HealthCheckResult):
+        """Update metrics for service."""
+        async with self._lock:
+            metrics = self.metrics[service_name]
+
+            metrics.total_checks += 1
+
+            if result.is_healthy:
+                metrics.successful_checks += 1
+                metrics.consecutive_failures = 0
+                metrics.last_successful_check = result.timestamp
+            else:
+                metrics.failed_checks += 1
+                metrics.consecutive_failures += 1
+                metrics.last_failed_check = result.timestamp
+
+            # Update response time metrics
+            if metrics.total_checks == 1:
+                metrics.avg_response_time_ms = result.response_time_ms
+            else:
+                metrics.avg_response_time_ms = (
+                    metrics.avg_response_time_ms * (metrics.total_checks - 1)
+                    + result.response_time_ms
+                ) / metrics.total_checks
+
+            if result.response_time_ms > metrics.max_response_time_ms:
+                metrics.max_response_time_ms = result.response_time_ms
+
+            # Update uptime percentage
+            metrics.uptime_percentage = (
+                metrics.successful_checks / metrics.total_checks
+            ) * 100
+
+    async def _check_alerts(self, service_name: str, result: HealthCheckResult):
+        """Check if alerts should be generated."""
+        metrics = self.metrics[service_name]
+
+        # Check for consecutive failure threshold
+        if metrics.consecutive_failures >= self.alert_threshold:
+            await self._generate_alert(
+                service_name,
+                AlertLevel.CRITICAL,
+                f"Service {service_name} has {metrics.consecutive_failures} consecutive failures",
+                {
+                    "consecutive_failures": metrics.consecutive_failures,
+                    "last_error": result.error_message,
+                    "health_status": result.status.value,
+                },
+            )
+
+        # Check for high response times
+        if result.response_time_ms > 5000:  # 5 seconds
+            await self._generate_alert(
+                service_name,
+                AlertLevel.WARNING,
+                f"High response time for {service_name}: {result.response_time_ms:.2f}ms",
+                {
+                    "response_time_ms": result.response_time_ms,
+                    "avg_response_time_ms": metrics.avg_response_time_ms,
+                },
+            )
+
+    async def _generate_alert(
+        self,
+        service_name: str,
+        level: AlertLevel,
+        message: str,
+        details: Dict[str, Any],
+    ):
+        """Generate health alert."""
+        alert = HealthAlert(
+            service_name=service_name, level=level, message=message, details=details
+        )
+
+        self.alerts.append(alert)
+
+        # Call alert callbacks
+        for callback in self.alert_callbacks:
+            try:
+                callback(alert)
+            except Exception as e:
+                logger.error(f"Error in alert callback: {e}")
+
+        logger.warning(f"Health alert generated: {message}")
+
+
+# Global health monitor instance
+_health_monitor: Optional[HealthMonitor] = None
+
+
+def get_health_monitor() -> HealthMonitor:
+    """Get global health monitor instance."""
+    global _health_monitor
+    if _health_monitor is None:
+        _health_monitor = HealthMonitor()
+    return _health_monitor
+
+
+async def quick_health_check(service_name: str) -> bool:
+    """Quick health check for a service."""
+    monitor = get_health_monitor()
+    try:
+        result = await monitor.get_health_status(service_name)
+        return result.is_healthy if result else False
+    except Exception:
+        return False
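The new module is self-contained: register one or more HealthCheck implementations with a HealthMonitor, then query it on demand or run the background loop. Below is a minimal usage sketch based on the classes in the diff above; the import path kailash.core.resilience.health_monitor is inferred from the file's location in the wheel and is not confirmed by the diff itself, and the probe URL is a placeholder.

    import asyncio

    # Assumed import path, derived from the file's location in the package.
    from kailash.core.resilience.health_monitor import HealthMonitor, HTTPHealthCheck

    async def main():
        monitor = HealthMonitor(check_interval=15.0, alert_threshold=2)

        # Non-critical HTTP probe; HEALTHY and DEGRADED both count as "healthy"
        # when metrics and alerts are computed.
        monitor.register_check(
            "api",
            HTTPHealthCheck("api", "https://example.com/health", critical=False),
        )

        # Print every alert (consecutive failures, slow responses).
        monitor.register_alert_callback(
            lambda alert: print(alert.level.value, alert.message)
        )

        # One-off check.
        result = await monitor.check_service_health("api")
        print(result.status.value, f"{result.response_time_ms:.1f}ms")

        # Background loop re-checks every check_interval seconds.
        await monitor.start_monitoring()
        await asyncio.sleep(60)
        print(await monitor.get_overall_health())
        await monitor.stop_monitoring()

    asyncio.run(main())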
kailash/edge/discovery.py CHANGED
@@ -199,6 +199,92 @@ class EdgeDiscovery:
         self._last_health_check[location.location_id] = datetime.now(UTC)
         logger.info(f"Added edge location: {location.name}")
 
+    async def register_edge(self, edge_config: Dict[str, Any]):
+        """Register an edge location from configuration dictionary.
+
+        Args:
+            edge_config: Dictionary containing edge location configuration
+        """
+        from .location import (
+            ComplianceZone,
+            EdgeCapabilities,
+            EdgeLocation,
+            EdgeRegion,
+            GeographicCoordinates,
+        )
+
+        # Extract basic info
+        location_id = edge_config["id"]
+        region_str = edge_config.get("region", "us-east")
+
+        # Map region string to enum
+        region_map = {
+            "us-east-1": EdgeRegion.US_EAST,
+            "us-west-1": EdgeRegion.US_WEST,
+            "eu-west-1": EdgeRegion.EU_WEST,
+            "eu-central-1": EdgeRegion.EU_CENTRAL,
+            "asia-southeast-1": EdgeRegion.ASIA_SOUTHEAST,
+        }
+        region = region_map.get(region_str, EdgeRegion.US_EAST)
+
+        # Default coordinates based on region
+        coord_map = {
+            EdgeRegion.US_EAST: GeographicCoordinates(39.0458, -76.6413),  # Virginia
+            EdgeRegion.US_WEST: GeographicCoordinates(37.7749, -122.4194),  # California
+            EdgeRegion.EU_WEST: GeographicCoordinates(53.3498, -6.2603),  # Ireland
+            EdgeRegion.EU_CENTRAL: GeographicCoordinates(50.1109, 8.6821),  # Frankfurt
+            EdgeRegion.ASIA_SOUTHEAST: GeographicCoordinates(
+                1.3521, 103.8198
+            ),  # Singapore
+        }
+        coordinates = coord_map.get(region, GeographicCoordinates(39.0458, -76.6413))
+
+        # Create capabilities
+        capabilities = EdgeCapabilities(
+            cpu_cores=edge_config.get("capacity", 1000) // 100,  # Rough mapping
+            memory_gb=edge_config.get("capacity", 1000) // 50,
+            storage_gb=edge_config.get("capacity", 1000) * 2,
+            bandwidth_gbps=10.0,
+            database_support=["postgresql", "redis"],
+            ai_models_available=["llama", "claude"],
+        )
+
+        # Create edge location
+        location = EdgeLocation(
+            location_id=location_id,
+            name=f"Edge {region_str.title()}",
+            region=region,
+            coordinates=coordinates,
+            capabilities=capabilities,
+            endpoint_url=edge_config.get(
+                "endpoint", f"http://{location_id}.edge.local:8080"
+            ),
+        )
+
+        # Set health status
+        from .location import EdgeStatus
+
+        if edge_config.get("healthy", True):
+            location.status = EdgeStatus.ACTIVE
+            self._health_results[location_id] = HealthCheckResult.HEALTHY
+        else:
+            location.status = EdgeStatus.OFFLINE
+            self._health_results[location_id] = HealthCheckResult.UNHEALTHY
+
+        # Update metrics
+        location.metrics.latency_p50_ms = edge_config.get("latency_ms", 10)
+        location.metrics.cpu_utilization = edge_config.get(
+            "current_load", 0
+        ) / edge_config.get("capacity", 1000)
+
+        # Add to locations
+        self.locations[location_id] = location
+        self._last_health_check[location_id] = datetime.now(UTC)
+
+        logger.info(f"Registered edge location: {location_id} in {region_str}")
+
+        return location
+
     def remove_location(self, location_id: str):
         """Remove an edge location from the discovery pool."""
         if location_id in self.locations:
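For reference, register_edge accepts a plain configuration dictionary; the keys below are exactly the ones the method reads in the hunk above (id, region, capacity, endpoint, healthy, latency_ms, current_load). A minimal sketch, assuming an EdgeDiscovery instance named discovery has already been constructed and the call runs inside an async context; the configuration values are hypothetical.

    # Hypothetical edge configuration, illustrating the keys register_edge reads.
    location = await discovery.register_edge(
        {
            "id": "edge-eu-central-01",
            "region": "eu-central-1",  # maps to EdgeRegion.EU_CENTRAL
            "capacity": 2000,          # drives the derived cpu/memory/storage capabilities
            "endpoint": "http://edge-eu-central-01.internal:8080",
            "healthy": True,           # sets EdgeStatus.ACTIVE and a HEALTHY health result
            "latency_ms": 12,
            "current_load": 500,       # cpu_utilization = 500 / 2000 = 0.25
        }
    )
    print(location.status, location.metrics.cpu_utilization)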