kailash 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. kailash/__init__.py +3 -3
  2. kailash/api/custom_nodes_secure.py +3 -3
  3. kailash/api/gateway.py +1 -1
  4. kailash/api/studio.py +1 -1
  5. kailash/api/workflow_api.py +2 -2
  6. kailash/core/resilience/bulkhead.py +475 -0
  7. kailash/core/resilience/circuit_breaker.py +92 -10
  8. kailash/core/resilience/health_monitor.py +578 -0
  9. kailash/edge/discovery.py +86 -0
  10. kailash/mcp_server/__init__.py +309 -33
  11. kailash/mcp_server/advanced_features.py +1022 -0
  12. kailash/mcp_server/ai_registry_server.py +27 -2
  13. kailash/mcp_server/auth.py +789 -0
  14. kailash/mcp_server/client.py +645 -378
  15. kailash/mcp_server/discovery.py +1593 -0
  16. kailash/mcp_server/errors.py +673 -0
  17. kailash/mcp_server/oauth.py +1727 -0
  18. kailash/mcp_server/protocol.py +1126 -0
  19. kailash/mcp_server/registry_integration.py +587 -0
  20. kailash/mcp_server/server.py +1228 -96
  21. kailash/mcp_server/transports.py +1169 -0
  22. kailash/mcp_server/utils/__init__.py +6 -1
  23. kailash/mcp_server/utils/cache.py +250 -7
  24. kailash/middleware/auth/auth_manager.py +3 -3
  25. kailash/middleware/communication/api_gateway.py +1 -1
  26. kailash/middleware/communication/realtime.py +1 -1
  27. kailash/middleware/mcp/enhanced_server.py +1 -1
  28. kailash/nodes/__init__.py +2 -0
  29. kailash/nodes/admin/audit_log.py +6 -6
  30. kailash/nodes/admin/permission_check.py +8 -8
  31. kailash/nodes/admin/role_management.py +32 -28
  32. kailash/nodes/admin/schema.sql +6 -1
  33. kailash/nodes/admin/schema_manager.py +13 -13
  34. kailash/nodes/admin/security_event.py +15 -15
  35. kailash/nodes/admin/tenant_isolation.py +3 -3
  36. kailash/nodes/admin/transaction_utils.py +3 -3
  37. kailash/nodes/admin/user_management.py +21 -21
  38. kailash/nodes/ai/a2a.py +11 -11
  39. kailash/nodes/ai/ai_providers.py +9 -12
  40. kailash/nodes/ai/embedding_generator.py +13 -14
  41. kailash/nodes/ai/intelligent_agent_orchestrator.py +19 -19
  42. kailash/nodes/ai/iterative_llm_agent.py +2 -2
  43. kailash/nodes/ai/llm_agent.py +210 -33
  44. kailash/nodes/ai/self_organizing.py +2 -2
  45. kailash/nodes/alerts/discord.py +4 -4
  46. kailash/nodes/api/graphql.py +6 -6
  47. kailash/nodes/api/http.py +10 -10
  48. kailash/nodes/api/rate_limiting.py +4 -4
  49. kailash/nodes/api/rest.py +15 -15
  50. kailash/nodes/auth/mfa.py +3 -3
  51. kailash/nodes/auth/risk_assessment.py +2 -2
  52. kailash/nodes/auth/session_management.py +5 -5
  53. kailash/nodes/auth/sso.py +143 -0
  54. kailash/nodes/base.py +8 -2
  55. kailash/nodes/base_async.py +16 -2
  56. kailash/nodes/base_with_acl.py +2 -2
  57. kailash/nodes/cache/__init__.py +9 -0
  58. kailash/nodes/cache/cache.py +1172 -0
  59. kailash/nodes/cache/cache_invalidation.py +874 -0
  60. kailash/nodes/cache/redis_pool_manager.py +595 -0
  61. kailash/nodes/code/async_python.py +2 -1
  62. kailash/nodes/code/python.py +194 -30
  63. kailash/nodes/compliance/data_retention.py +6 -6
  64. kailash/nodes/compliance/gdpr.py +5 -5
  65. kailash/nodes/data/__init__.py +10 -0
  66. kailash/nodes/data/async_sql.py +1956 -129
  67. kailash/nodes/data/optimistic_locking.py +906 -0
  68. kailash/nodes/data/readers.py +8 -8
  69. kailash/nodes/data/redis.py +378 -0
  70. kailash/nodes/data/sql.py +314 -3
  71. kailash/nodes/data/streaming.py +21 -0
  72. kailash/nodes/enterprise/__init__.py +8 -0
  73. kailash/nodes/enterprise/audit_logger.py +285 -0
  74. kailash/nodes/enterprise/batch_processor.py +22 -3
  75. kailash/nodes/enterprise/data_lineage.py +1 -1
  76. kailash/nodes/enterprise/mcp_executor.py +205 -0
  77. kailash/nodes/enterprise/service_discovery.py +150 -0
  78. kailash/nodes/enterprise/tenant_assignment.py +108 -0
  79. kailash/nodes/logic/async_operations.py +2 -2
  80. kailash/nodes/logic/convergence.py +1 -1
  81. kailash/nodes/logic/operations.py +1 -1
  82. kailash/nodes/monitoring/__init__.py +11 -1
  83. kailash/nodes/monitoring/health_check.py +456 -0
  84. kailash/nodes/monitoring/log_processor.py +817 -0
  85. kailash/nodes/monitoring/metrics_collector.py +627 -0
  86. kailash/nodes/monitoring/performance_benchmark.py +137 -11
  87. kailash/nodes/rag/advanced.py +7 -7
  88. kailash/nodes/rag/agentic.py +49 -2
  89. kailash/nodes/rag/conversational.py +3 -3
  90. kailash/nodes/rag/evaluation.py +3 -3
  91. kailash/nodes/rag/federated.py +3 -3
  92. kailash/nodes/rag/graph.py +3 -3
  93. kailash/nodes/rag/multimodal.py +3 -3
  94. kailash/nodes/rag/optimized.py +5 -5
  95. kailash/nodes/rag/privacy.py +3 -3
  96. kailash/nodes/rag/query_processing.py +6 -6
  97. kailash/nodes/rag/realtime.py +1 -1
  98. kailash/nodes/rag/registry.py +1 -1
  99. kailash/nodes/rag/router.py +1 -1
  100. kailash/nodes/rag/similarity.py +7 -7
  101. kailash/nodes/rag/strategies.py +4 -4
  102. kailash/nodes/security/abac_evaluator.py +6 -6
  103. kailash/nodes/security/behavior_analysis.py +5 -5
  104. kailash/nodes/security/credential_manager.py +1 -1
  105. kailash/nodes/security/rotating_credentials.py +11 -11
  106. kailash/nodes/security/threat_detection.py +8 -8
  107. kailash/nodes/testing/credential_testing.py +2 -2
  108. kailash/nodes/transform/processors.py +5 -5
  109. kailash/runtime/local.py +163 -9
  110. kailash/runtime/parameter_injection.py +425 -0
  111. kailash/runtime/parameter_injector.py +657 -0
  112. kailash/runtime/testing.py +2 -2
  113. kailash/testing/fixtures.py +2 -2
  114. kailash/workflow/builder.py +99 -14
  115. kailash/workflow/builder_improvements.py +207 -0
  116. kailash/workflow/input_handling.py +170 -0
  117. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/METADATA +22 -9
  118. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/RECORD +122 -95
  119. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/WHEEL +0 -0
  120. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/entry_points.txt +0 -0
  121. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/licenses/LICENSE +0 -0
  122. {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,578 @@
1
+ """Connection health monitoring system for enterprise resilience.
2
+
3
+ This module provides comprehensive health monitoring for database connections,
4
+ Redis connections, and other external services. It integrates with circuit
5
+ breakers and bulkhead patterns to provide enterprise-grade observability.
6
+
7
+ Features:
8
+ - Real-time health status monitoring
9
+ - Automatic health checks with configurable intervals
10
+ - Integration with circuit breakers and bulkheads
11
+ - Health-based routing and failover
12
+ - Comprehensive metrics collection
13
+ - Alert generation for critical failures
14
+
15
+ Example:
16
+ >>> monitor = HealthMonitor()
17
+ >>> monitor.register_check("database", DatabaseHealthCheck(...))
18
+ >>> status = await monitor.get_health_status("database")
19
+ >>> if status.is_healthy:
20
+ ... # Proceed with operation
21
+ """
22
+
23
+ import asyncio
24
+ import logging
25
+ import time
26
+ from abc import ABC, abstractmethod
27
+ from dataclasses import dataclass, field
28
+ from datetime import UTC, datetime, timedelta
29
+ from enum import Enum
30
+ from typing import Any, Callable, Dict, List, Optional, Set
31
+ from uuid import uuid4
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class HealthStatus(Enum):
    """Health states a checked service can be reported in.

    The member values are the lowercase strings used when a status is
    rendered (for example in alert details via ``status.value``).
    """

    HEALTHY = "healthy"
    # DEGRADED is still treated as healthy by HealthCheckResult.is_healthy.
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
    # Reported by HealthMonitor when a check itself could not be run.
    UNKNOWN = "unknown"
43
+
44
+
45
class AlertLevel(Enum):
    """Severity attached to a generated health alert.

    Members are listed in the order they are declared: INFO, WARNING,
    CRITICAL, FATAL. Values are the lowercase names.
    """

    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"
    FATAL = "fatal"
52
+
53
+
54
@dataclass
class HealthCheckResult:
    """Outcome of a single health check run against one service.

    ``is_healthy`` is not accepted by the constructor; it is derived from
    ``status`` once the instance has been built.
    """

    check_id: str
    service_name: str
    status: HealthStatus
    response_time_ms: float
    # Defaults to the (timezone-aware, UTC) moment the result is created.
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    details: Dict[str, Any] = field(default_factory=dict)
    error_message: Optional[str] = None
    is_healthy: bool = field(init=False)

    def __post_init__(self):
        """Derive ``is_healthy``: HEALTHY and DEGRADED both count as healthy."""
        acceptable = (HealthStatus.HEALTHY, HealthStatus.DEGRADED)
        self.is_healthy = self.status in acceptable
70
+
71
+
72
+ @dataclass
73
+ class HealthMetrics:
74
+ """Health monitoring metrics."""
75
+
76
+ total_checks: int = 0
77
+ successful_checks: int = 0
78
+ failed_checks: int = 0
79
+ avg_response_time_ms: float = 0.0
80
+ max_response_time_ms: float = 0.0
81
+ uptime_percentage: float = 100.0
82
+ consecutive_failures: int = 0
83
+ last_successful_check: Optional[datetime] = None
84
+ last_failed_check: Optional[datetime] = None
85
+ created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
86
+
87
+
88
@dataclass
class HealthAlert:
    """Record of one alert raised by the health monitor.

    Every field has a default, so an alert can be built from keyword
    arguments alone.
    """

    # Random UUID4 string generated per alert.
    alert_id: str = field(default_factory=lambda: str(uuid4()))
    service_name: str = ""
    level: AlertLevel = AlertLevel.INFO
    message: str = ""
    details: Dict[str, Any] = field(default_factory=dict)
    # Creation time (timezone-aware, UTC).
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    # Resolution bookkeeping; both start out unset.
    resolved: bool = False
    resolved_at: Optional[datetime] = None
100
+
101
+
102
class HealthCheck(ABC):
    """Base class that every concrete health check derives from.

    The constructor only stores the settings shared by all checks;
    subclasses supply the actual probe by implementing ``check_health``.
    """

    def __init__(self, name: str, timeout: float = 5.0, critical: bool = True):
        """Store the settings common to all health checks.

        Args:
            name: Name of the service being checked
            timeout: Timeout for health check in seconds
            critical: Whether this check is critical for overall health
        """
        self.name, self.timeout, self.critical = name, timeout, critical

    @abstractmethod
    async def check_health(self) -> HealthCheckResult:
        """Run the probe and report its outcome as a ``HealthCheckResult``."""
        ...
121
+
122
+
123
class DatabaseHealthCheck(HealthCheck):
    """Health check that probes a SQL database with a trivial query.

    The probe runs ``SELECT 1 as health_check`` through ``SQLDatabaseNode`` in
    a worker thread and classifies the outcome:

    - the query returned at least one row -> ``HEALTHY``
    - the query ran but returned no rows -> ``DEGRADED``
    - the query timed out or anything raised -> ``UNHEALTHY``
    """

    def __init__(self, name: str, connection_string: str, **kwargs):
        """Initialize database health check.

        Args:
            name: Name of the service being checked
            connection_string: Passed to ``SQLDatabaseNode`` for every probe
            **kwargs: Forwarded to ``HealthCheck.__init__`` (timeout, critical)
        """
        super().__init__(name, **kwargs)
        self.connection_string = connection_string

    async def check_health(self) -> HealthCheckResult:
        """Check database health.

        Returns:
            A ``HealthCheckResult``; this method does not raise, every failure
            is reported as an ``UNHEALTHY`` result with ``error_message`` set.
        """
        start_time = time.time()
        check_id = str(uuid4())

        try:
            # Import via the installed package name. The previous
            # "src.kailash..." path only resolves inside a source checkout;
            # from an installed wheel it raised ImportError, which the broad
            # handler below turned into a permanent UNHEALTHY result.
            from kailash.nodes.data.sql import SQLDatabaseNode

            sql_node = SQLDatabaseNode(connection_string=self.connection_string)

            # The node's execute() is synchronous, so run it in a worker
            # thread and bound the wait with the configured timeout.
            result = await asyncio.wait_for(
                asyncio.to_thread(sql_node.execute, query="SELECT 1 as health_check"),
                timeout=self.timeout,
            )

            response_time = (time.time() - start_time) * 1000

            if "data" in result and len(result["data"]) > 0:
                return HealthCheckResult(
                    check_id=check_id,
                    service_name=self.name,
                    status=HealthStatus.HEALTHY,
                    response_time_ms=response_time,
                    details={
                        "query_executed": True,
                        "rows_returned": len(result["data"]),
                        "execution_time": result.get("execution_time", 0),
                    },
                )

            # The query completed but produced nothing: reachable, yet suspect.
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.DEGRADED,
                response_time_ms=response_time,
                details={"query_executed": True, "rows_returned": 0},
                error_message="Query returned no data",
            )

        except asyncio.TimeoutError:
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=f"Health check timed out after {self.timeout}s",
            )
        except Exception as e:
            # Deliberately broad: a health check reports failure, never raises.
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=str(e),
            )
190
+
191
+
192
class RedisHealthCheck(HealthCheck):
    """Health check that pings a Redis server and reads its INFO output.

    A successful probe yields ``HEALTHY``; a timeout or any exception yields
    ``UNHEALTHY``.
    """

    def __init__(self, name: str, redis_config: Dict[str, Any], **kwargs):
        """Initialize Redis health check.

        Args:
            name: Name of the service being checked
            redis_config: Keyword arguments passed to ``redis.Redis(...)``
            **kwargs: Forwarded to ``HealthCheck.__init__`` (timeout, critical)
        """
        super().__init__(name, **kwargs)
        self.redis_config = redis_config

    async def check_health(self) -> HealthCheckResult:
        """Check Redis health.

        Returns:
            A ``HealthCheckResult``; this method does not raise, every failure
            is reported as an ``UNHEALTHY`` result with ``error_message`` set.
        """
        start_time = time.time()
        check_id = str(uuid4())
        client = None

        try:
            import redis

            client = redis.Redis(**self.redis_config)

            def _ping_and_info():
                # Both commands run in the same worker thread so that a single
                # timeout bounds the whole probe. Previously only the ping was
                # bounded and a stalled INFO could hang the check indefinitely.
                client.ping()
                return client.info()

            info = await asyncio.wait_for(
                asyncio.to_thread(_ping_and_info), timeout=self.timeout
            )

            response_time = (time.time() - start_time) * 1000

            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.HEALTHY,
                response_time_ms=response_time,
                details={
                    "ping_successful": True,
                    "connected_clients": info.get("connected_clients", 0),
                    "used_memory": info.get("used_memory", 0),
                    "redis_version": info.get("redis_version", "unknown"),
                },
            )

        except asyncio.TimeoutError:
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=f"Redis health check timed out after {self.timeout}s",
            )
        except Exception as e:
            # Deliberately broad: a health check reports failure, never raises.
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=str(e),
            )
        finally:
            # A new client is created for every probe, so release it here;
            # otherwise each check leaves a connection behind.
            if client is not None:
                try:
                    client.close()
                except Exception as close_error:
                    logger.debug(
                        f"Failed to close Redis client for {self.name}: {close_error}"
                    )
250
+
251
+
252
class HTTPHealthCheck(HealthCheck):
    """Health check that issues a GET request against an HTTP endpoint.

    Status mapping:

    - response status equals ``expected_status`` -> ``HEALTHY``
    - any other 2xx status -> ``DEGRADED``
    - anything else, a timeout, or any exception -> ``UNHEALTHY``
    """

    def __init__(self, name: str, url: str, expected_status: int = 200, **kwargs):
        """Initialize HTTP health check.

        Args:
            name: Name of the service being checked
            url: Endpoint to request
            expected_status: Status code that counts as fully healthy
            **kwargs: Forwarded to ``HealthCheck.__init__`` (timeout, critical)
        """
        super().__init__(name, **kwargs)
        self.url = url
        self.expected_status = expected_status

    async def check_health(self) -> HealthCheckResult:
        """Check HTTP endpoint health.

        Returns:
            A ``HealthCheckResult``; this method does not raise, every failure
            is reported as an ``UNHEALTHY`` result with ``error_message`` set.
        """
        start_time = time.time()
        check_id = str(uuid4())

        # Import separately so the handlers below can safely reference
        # httpx exception classes; a missing dependency is still reported as
        # an UNHEALTHY result, as before.
        try:
            import httpx
        except Exception as e:
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=str(e),
            )

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.get(self.url)

            response_time = (time.time() - start_time) * 1000

            if response.status_code == self.expected_status:
                status = HealthStatus.HEALTHY
            elif 200 <= response.status_code < 300:
                status = HealthStatus.DEGRADED
            else:
                status = HealthStatus.UNHEALTHY

            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=status,
                response_time_ms=response_time,
                details={
                    "status_code": response.status_code,
                    "expected_status": self.expected_status,
                    "content_length": len(response.content),
                },
            )

        except (asyncio.TimeoutError, httpx.TimeoutException):
            # httpx signals timeouts with its own exception hierarchy rather
            # than asyncio.TimeoutError, so the latter alone never matched and
            # timeouts fell through to the generic handler.
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=f"HTTP health check timed out after {self.timeout}s",
            )
        except Exception as e:
            # Deliberately broad: a health check reports failure, never raises.
            response_time = (time.time() - start_time) * 1000
            return HealthCheckResult(
                check_id=check_id,
                service_name=self.name,
                status=HealthStatus.UNHEALTHY,
                response_time_ms=response_time,
                error_message=str(e),
            )
311
+
312
+
313
class HealthMonitor:
    """Enterprise health monitoring system.

    Holds the registered ``HealthCheck`` objects, runs them on demand or from
    a background loop, maintains per-service ``HealthMetrics``, and records
    ``HealthAlert`` objects which are also handed to registered callbacks.

    Note that an alert is generated on *every* check that meets an alert
    condition; alerts are appended to ``self.alerts`` and never pruned here.
    """

    # Response time (milliseconds) above which a WARNING alert is generated.
    SLOW_RESPONSE_THRESHOLD_MS = 5000

    def __init__(self, check_interval: float = 30.0, alert_threshold: int = 3):
        """Initialize health monitor.

        Args:
            check_interval: Interval between health checks in seconds
            alert_threshold: Number of consecutive failures before alerting
        """
        self.check_interval = check_interval
        self.alert_threshold = alert_threshold
        self.health_checks: Dict[str, HealthCheck] = {}
        self.metrics: Dict[str, HealthMetrics] = {}
        self.alerts: List[HealthAlert] = []
        self.alert_callbacks: List[Callable[[HealthAlert], None]] = []
        self._monitoring_task: Optional[asyncio.Task] = None
        self._running = False
        # Serializes metric updates in _update_metrics.
        self._lock = asyncio.Lock()

        logger.info("Initialized HealthMonitor")

    def register_check(self, service_name: str, health_check: HealthCheck):
        """Register a health check under ``service_name``.

        Registering an existing name replaces the check and resets its metrics.
        """
        self.health_checks[service_name] = health_check
        self.metrics[service_name] = HealthMetrics()
        logger.info(f"Registered health check for service: {service_name}")

    def register_alert_callback(self, callback: Callable[[HealthAlert], None]):
        """Register callback for health alerts.

        The callback receives the ``HealthAlert``. It may be a plain function
        or a coroutine function; a returned coroutine is awaited.
        """
        self.alert_callbacks.append(callback)

    async def check_service_health(self, service_name: str) -> HealthCheckResult:
        """Run the health check for one service and record the outcome.

        Args:
            service_name: Name the check was registered under

        Returns:
            The result produced by the registered check.

        Raises:
            ValueError: If no check is registered for ``service_name``.
        """
        if service_name not in self.health_checks:
            raise ValueError(f"No health check registered for service: {service_name}")

        health_check = self.health_checks[service_name]
        result = await health_check.check_health()

        # Metrics first: alerting reads the updated consecutive-failure count.
        await self._update_metrics(service_name, result)
        await self._check_alerts(service_name, result)

        return result

    async def get_health_status(self, service_name: str) -> Optional[HealthCheckResult]:
        """Run a fresh health check for the service and return its result.

        No result is cached: every call executes the registered check (and
        therefore updates metrics and may generate alerts).

        Raises:
            ValueError: If no check is registered for ``service_name``.
        """
        return await self.check_service_health(service_name)

    async def get_all_health_status(self) -> Dict[str, HealthCheckResult]:
        """Run every registered check and return the results by service name.

        A check that raises is reported as an ``UNKNOWN`` result carrying the
        exception text instead of aborting the whole sweep.
        """
        results = {}
        # Iterate over a snapshot: checks are awaited inside the loop, and a
        # concurrent register_check() would otherwise raise
        # "dictionary changed size during iteration".
        for service_name in list(self.health_checks):
            try:
                results[service_name] = await self.check_service_health(service_name)
            except Exception as e:
                logger.error(f"Failed to check health for {service_name}: {e}")
                results[service_name] = HealthCheckResult(
                    check_id=str(uuid4()),
                    service_name=service_name,
                    status=HealthStatus.UNKNOWN,
                    response_time_ms=0.0,
                    error_message=str(e),
                )
        return results

    async def get_overall_health(self) -> HealthStatus:
        """Get overall system health status.

        Runs all checks, then aggregates:

        - no registered checks -> ``UNKNOWN``
        - any critical service ``UNHEALTHY`` -> ``UNHEALTHY``
        - any service ``DEGRADED``, or a non-critical service ``UNHEALTHY``
          -> ``DEGRADED``
        - every service ``HEALTHY`` -> ``HEALTHY``
        - otherwise (some status is ``UNKNOWN``) -> ``UNKNOWN``
        """
        all_status = await self.get_all_health_status()

        if not all_status:
            return HealthStatus.UNKNOWN

        critical_services = [
            name for name, check in self.health_checks.items() if check.critical
        ]

        critical_unhealthy = any(
            all_status[name].status == HealthStatus.UNHEALTHY
            for name in critical_services
            if name in all_status
        )
        if critical_unhealthy:
            return HealthStatus.UNHEALTHY

        # Any UNHEALTHY result left at this point belongs to a non-critical
        # service. That is a known impairment, so report DEGRADED rather than
        # letting it fall through to UNKNOWN.
        impaired = any(
            result.status in (HealthStatus.DEGRADED, HealthStatus.UNHEALTHY)
            for result in all_status.values()
        )
        if impaired:
            return HealthStatus.DEGRADED

        all_healthy = all(
            result.status == HealthStatus.HEALTHY for result in all_status.values()
        )
        return HealthStatus.HEALTHY if all_healthy else HealthStatus.UNKNOWN

    async def get_metrics(self, service_name: str) -> Optional[HealthMetrics]:
        """Get metrics for specific service, or ``None`` if it is not registered."""
        return self.metrics.get(service_name)

    async def get_all_metrics(self) -> Dict[str, HealthMetrics]:
        """Get metrics for all services.

        Returns a shallow copy of the mapping; the ``HealthMetrics`` objects
        themselves are shared with the monitor.
        """
        return self.metrics.copy()

    async def get_alerts(self, resolved: Optional[bool] = None) -> List[HealthAlert]:
        """Get health alerts.

        Args:
            resolved: ``None`` returns all alerts; ``True``/``False`` returns
                only alerts whose ``resolved`` flag matches.
        """
        if resolved is None:
            return self.alerts.copy()
        return [alert for alert in self.alerts if alert.resolved == resolved]

    async def start_monitoring(self):
        """Start continuous health monitoring in a background task."""
        if self._running:
            logger.warning("Health monitoring already running")
            return

        self._running = True
        self._monitoring_task = asyncio.create_task(self._monitoring_loop())
        logger.info("Started health monitoring")

    async def stop_monitoring(self):
        """Stop continuous health monitoring and wait for the task to finish."""
        if not self._running:
            return

        self._running = False
        if self._monitoring_task:
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass
            # Drop the finished task so no stale reference is kept around.
            self._monitoring_task = None

        logger.info("Stopped health monitoring")

    async def _monitoring_loop(self):
        """Main monitoring loop: check everything, sleep, repeat."""
        while self._running:
            try:
                await self.get_all_health_status()
                await asyncio.sleep(self.check_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                await asyncio.sleep(min(self.check_interval, 10))  # Fallback interval

    async def _update_metrics(self, service_name: str, result: HealthCheckResult):
        """Fold one check result into the service's running metrics."""
        async with self._lock:
            metrics = self.metrics[service_name]

            metrics.total_checks += 1

            if result.is_healthy:
                metrics.successful_checks += 1
                metrics.consecutive_failures = 0
                metrics.last_successful_check = result.timestamp
            else:
                metrics.failed_checks += 1
                metrics.consecutive_failures += 1
                metrics.last_failed_check = result.timestamp

            # Incremental mean: previous mean weighted by the previous count.
            if metrics.total_checks == 1:
                metrics.avg_response_time_ms = result.response_time_ms
            else:
                metrics.avg_response_time_ms = (
                    metrics.avg_response_time_ms * (metrics.total_checks - 1)
                    + result.response_time_ms
                ) / metrics.total_checks

            if result.response_time_ms > metrics.max_response_time_ms:
                metrics.max_response_time_ms = result.response_time_ms

            metrics.uptime_percentage = (
                metrics.successful_checks / metrics.total_checks
            ) * 100

    async def _check_alerts(self, service_name: str, result: HealthCheckResult):
        """Generate alerts for repeated failures and for slow responses."""
        metrics = self.metrics[service_name]

        # Fires on every check while the failure streak is at or above the
        # threshold, not only when the threshold is first crossed.
        if metrics.consecutive_failures >= self.alert_threshold:
            await self._generate_alert(
                service_name,
                AlertLevel.CRITICAL,
                f"Service {service_name} has {metrics.consecutive_failures} consecutive failures",
                {
                    "consecutive_failures": metrics.consecutive_failures,
                    "last_error": result.error_message,
                    "health_status": result.status.value,
                },
            )

        if result.response_time_ms > self.SLOW_RESPONSE_THRESHOLD_MS:
            await self._generate_alert(
                service_name,
                AlertLevel.WARNING,
                f"High response time for {service_name}: {result.response_time_ms:.2f}ms",
                {
                    "response_time_ms": result.response_time_ms,
                    "avg_response_time_ms": metrics.avg_response_time_ms,
                },
            )

    async def _generate_alert(
        self,
        service_name: str,
        level: AlertLevel,
        message: str,
        details: Dict[str, Any],
    ):
        """Record an alert, notify callbacks, and log it.

        A failing callback is logged and does not prevent the remaining
        callbacks from running.
        """
        alert = HealthAlert(
            service_name=service_name, level=level, message=message, details=details
        )

        self.alerts.append(alert)

        for callback in self.alert_callbacks:
            try:
                outcome = callback(alert)
                # Support ``async def`` callbacks: without this the returned
                # coroutine would be discarded and never run.
                if asyncio.iscoroutine(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Error in alert callback: {e}")

        logger.warning(f"Health alert generated: {message}")
557
+
558
+
559
# Module-level HealthMonitor shared by get_health_monitor(); built on first use.
_health_monitor: Optional[HealthMonitor] = None


def get_health_monitor() -> HealthMonitor:
    """Return the shared HealthMonitor, creating it on the first call."""
    global _health_monitor
    if _health_monitor is not None:
        return _health_monitor
    _health_monitor = HealthMonitor()
    return _health_monitor
569
+
570
+
571
async def quick_health_check(service_name: str) -> bool:
    """Quick health check for a service.

    Runs the check registered under ``service_name`` on the global monitor.

    Args:
        service_name: Name the check was registered under

    Returns:
        ``True`` if the check result is healthy; ``False`` if it is not, if no
        result is returned, or if the check raised (including the case where
        no check is registered for ``service_name``).
    """
    monitor = get_health_monitor()
    try:
        result = await monitor.get_health_status(service_name)
        return result.is_healthy if result else False
    except Exception as e:
        # Best-effort by design, but leave a trace so that a misconfigured or
        # unregistered service can be told apart from an unhealthy one.
        logger.debug(f"Quick health check failed for {service_name}: {e}")
        return False
kailash/edge/discovery.py CHANGED
@@ -199,6 +199,92 @@ class EdgeDiscovery:
199
199
  self._last_health_check[location.location_id] = datetime.now(UTC)
200
200
  logger.info(f"Added edge location: {location.name}")
201
201
 
202
async def register_edge(self, edge_config: Dict[str, Any]):
    """Register an edge location from configuration dictionary.

    Args:
        edge_config: Dictionary containing edge location configuration.
            Keys read here: ``"id"`` (required), ``"region"``,
            ``"capacity"``, ``"endpoint"``, ``"healthy"``, ``"latency_ms"``
            and ``"current_load"``.

    Returns:
        The ``EdgeLocation`` that was created and stored.

    Raises:
        KeyError: If ``edge_config`` has no ``"id"`` entry.
    """
    from .location import (
        EdgeCapabilities,
        EdgeLocation,
        EdgeRegion,
        EdgeStatus,
        GeographicCoordinates,
    )

    # Extract basic info
    location_id = edge_config["id"]
    region_str = edge_config.get("region", "us-east")
    # Read once; a single value feeds the capability sizing and utilization.
    capacity = edge_config.get("capacity", 1000)

    # Map region string to enum; unrecognized strings (including the
    # "us-east" default above) fall back to US_EAST.
    region_map = {
        "us-east-1": EdgeRegion.US_EAST,
        "us-west-1": EdgeRegion.US_WEST,
        "eu-west-1": EdgeRegion.EU_WEST,
        "eu-central-1": EdgeRegion.EU_CENTRAL,
        "asia-southeast-1": EdgeRegion.ASIA_SOUTHEAST,
    }
    region = region_map.get(region_str, EdgeRegion.US_EAST)

    # Default coordinates based on region
    coord_map = {
        EdgeRegion.US_EAST: GeographicCoordinates(39.0458, -76.6413),  # Virginia
        EdgeRegion.US_WEST: GeographicCoordinates(37.7749, -122.4194),  # California
        EdgeRegion.EU_WEST: GeographicCoordinates(53.3498, -6.2603),  # Ireland
        EdgeRegion.EU_CENTRAL: GeographicCoordinates(50.1109, 8.6821),  # Frankfurt
        EdgeRegion.ASIA_SOUTHEAST: GeographicCoordinates(
            1.3521, 103.8198
        ),  # Singapore
    }
    coordinates = coord_map.get(region, GeographicCoordinates(39.0458, -76.6413))

    # Create capabilities
    capabilities = EdgeCapabilities(
        cpu_cores=capacity // 100,  # Rough mapping
        memory_gb=capacity // 50,
        storage_gb=capacity * 2,
        bandwidth_gbps=10.0,
        database_support=["postgresql", "redis"],
        ai_models_available=["llama", "claude"],
    )

    # Create edge location
    location = EdgeLocation(
        location_id=location_id,
        name=f"Edge {region_str.title()}",
        region=region,
        coordinates=coordinates,
        capabilities=capabilities,
        endpoint_url=edge_config.get(
            "endpoint", f"http://{location_id}.edge.local:8080"
        ),
    )

    # Set health status
    if edge_config.get("healthy", True):
        location.status = EdgeStatus.ACTIVE
        self._health_results[location_id] = HealthCheckResult.HEALTHY
    else:
        location.status = EdgeStatus.OFFLINE
        self._health_results[location_id] = HealthCheckResult.UNHEALTHY

    # Update metrics
    location.metrics.latency_p50_ms = edge_config.get("latency_ms", 10)
    current_load = edge_config.get("current_load", 0)
    # Guard against a configured capacity of 0, which previously raised
    # ZeroDivisionError. NOTE(review): 0.0 is used as the utilization for a
    # zero-capacity edge; confirm this is the preferred value.
    location.metrics.cpu_utilization = current_load / capacity if capacity else 0.0

    # Add to locations
    self.locations[location_id] = location
    self._last_health_check[location_id] = datetime.now(UTC)

    logger.info(f"Registered edge location: {location_id} in {region_str}")

    return location
287
+
202
288
  def remove_location(self, location_id: str):
203
289
  """Remove an edge location from the discovery pool."""
204
290
  if location_id in self.locations: