kailash-0.8.4-py3-none-any.whl → kailash-0.8.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +5 -11
- kailash/channels/__init__.py +2 -1
- kailash/channels/mcp_channel.py +23 -4
- kailash/cli/__init__.py +11 -1
- kailash/cli/validate_imports.py +202 -0
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/bulkhead.py +15 -5
- kailash/core/resilience/circuit_breaker.py +74 -1
- kailash/core/resilience/health_monitor.py +433 -33
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migration_service.py +384 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/mcp_server/server.py +351 -8
- kailash/mcp_server/transports.py +305 -0
- kailash/middleware/gateway/event_store.py +1 -0
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/semantic_memory.py +2 -2
- kailash/nodes/base.py +622 -1
- kailash/nodes/code/python.py +44 -3
- kailash/nodes/data/async_sql.py +42 -20
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +396 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/governance.py +410 -0
- kailash/nodes/ports.py +676 -0
- kailash/nodes/rag/registry.py +1 -1
- kailash/nodes/transaction/distributed_transaction_manager.py +48 -1
- kailash/nodes/transaction/saga_state_storage.py +2 -1
- kailash/nodes/validation.py +8 -8
- kailash/runtime/local.py +374 -1
- kailash/runtime/validation/__init__.py +12 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/import_validator.py +446 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/utils/data_paths.py +74 -0
- kailash/workflow/builder.py +413 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/mermaid_visualizer.py +3 -1
- kailash/workflow/migration.py +3 -3
- kailash/workflow/templates.py +6 -6
- kailash/workflow/type_inference.py +669 -0
- kailash/workflow/validation.py +134 -3
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/METADATA +52 -34
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/RECORD +93 -42
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/WHEEL +0 -0
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.4.dist-info → kailash-0.8.6.dist-info}/top_level.txt +0 -0
kailash/core/resilience/circuit_breaker.py
CHANGED
@@ -396,6 +396,16 @@ class ConnectionCircuitBreaker(Generic[T]):
|
|
396
396
|
if listener in self._listeners:
|
397
397
|
self._listeners.remove(listener)
|
398
398
|
|
399
|
+
@property
|
400
|
+
def success_count(self) -> int:
|
401
|
+
"""Get number of successful calls."""
|
402
|
+
return self.metrics.successful_calls
|
403
|
+
|
404
|
+
@property
|
405
|
+
def failure_count(self) -> int:
|
406
|
+
"""Get number of failed calls."""
|
407
|
+
return self.metrics.failed_calls
|
408
|
+
|
399
409
|
def get_status(self) -> Dict[str, Any]:
|
400
410
|
"""Get current circuit breaker status."""
|
401
411
|
return {
|
@@ -435,10 +445,16 @@ class ConnectionCircuitBreaker(Generic[T]):
|
|
435
445
|
class CircuitBreakerManager:
|
436
446
|
"""Manages multiple circuit breakers for different resources."""
|
437
447
|
|
438
|
-
def __init__(self):
|
448
|
+
def __init__(self, performance_monitor=None):
|
439
449
|
"""Initialize circuit breaker manager."""
|
440
450
|
self._breakers: Dict[str, ConnectionCircuitBreaker] = {}
|
441
451
|
self._default_config = CircuitBreakerConfig()
|
452
|
+
self._performance_monitor = performance_monitor
|
453
|
+
self._patterns = {
|
454
|
+
"database": CircuitBreakerConfig(failure_threshold=5, recovery_timeout=60),
|
455
|
+
"api": CircuitBreakerConfig(failure_threshold=3, recovery_timeout=30),
|
456
|
+
"cache": CircuitBreakerConfig(failure_threshold=2, recovery_timeout=15),
|
457
|
+
}
|
442
458
|
|
443
459
|
def get_or_create(
|
444
460
|
self, name: str, config: Optional[CircuitBreakerConfig] = None
|
@@ -450,6 +466,63 @@ class CircuitBreakerManager:
|
|
450
466
|
)
|
451
467
|
return self._breakers[name]
|
452
468
|
|
469
|
+
def create_circuit_breaker(
|
470
|
+
self,
|
471
|
+
name: str,
|
472
|
+
config: Optional[CircuitBreakerConfig] = None,
|
473
|
+
pattern: Optional[str] = None,
|
474
|
+
) -> ConnectionCircuitBreaker:
|
475
|
+
"""Create a new circuit breaker with optional pattern-based configuration."""
|
476
|
+
if pattern and pattern in self._patterns:
|
477
|
+
config = config or self._patterns[pattern]
|
478
|
+
return self.get_or_create(name, config)
|
479
|
+
|
480
|
+
async def execute_with_circuit_breaker(
|
481
|
+
self, name: str, func: Callable, fallback: Optional[Callable] = None
|
482
|
+
):
|
483
|
+
"""Execute a function with circuit breaker protection."""
|
484
|
+
cb = self.get_or_create(name)
|
485
|
+
try:
|
486
|
+
result = await cb.call(func)
|
487
|
+
return result
|
488
|
+
except CircuitBreakerError:
|
489
|
+
if fallback:
|
490
|
+
if asyncio.iscoroutinefunction(fallback):
|
491
|
+
return await fallback()
|
492
|
+
else:
|
493
|
+
return fallback()
|
494
|
+
raise
|
495
|
+
|
496
|
+
def get_circuit_breaker(self, name: str) -> Optional[ConnectionCircuitBreaker]:
|
497
|
+
"""Get an existing circuit breaker by name."""
|
498
|
+
return self._breakers.get(name)
|
499
|
+
|
500
|
+
def get_all_circuit_states(self) -> Dict[str, Dict[str, Any]]:
|
501
|
+
"""Get the state of all circuit breakers."""
|
502
|
+
return {name: cb.get_status() for name, cb in self._breakers.items()}
|
503
|
+
|
504
|
+
def force_open_circuit_breaker(self, name: str) -> bool:
|
505
|
+
"""Manually open a circuit breaker."""
|
506
|
+
cb = self._breakers.get(name)
|
507
|
+
if cb:
|
508
|
+
asyncio.create_task(cb.force_open("Manual override"))
|
509
|
+
return True
|
510
|
+
return False
|
511
|
+
|
512
|
+
def reset_circuit_breaker(self, name: str) -> bool:
|
513
|
+
"""Reset a circuit breaker to closed state."""
|
514
|
+
cb = self._breakers.get(name)
|
515
|
+
if cb:
|
516
|
+
asyncio.create_task(cb.reset())
|
517
|
+
return True
|
518
|
+
return False
|
519
|
+
|
520
|
+
def add_global_callback(self, callback: Callable):
|
521
|
+
"""Add a global callback for circuit breaker state changes."""
|
522
|
+
# For now, add to all existing breakers
|
523
|
+
for cb in self._breakers.values():
|
524
|
+
cb.add_listener(callback)
|
525
|
+
|
453
526
|
def get_all_status(self) -> Dict[str, Dict[str, Any]]:
|
454
527
|
"""Get status of all circuit breakers."""
|
455
528
|
return {name: breaker.get_status() for name, breaker in self._breakers.items()}
|
kailash/core/resilience/health_monitor.py
CHANGED
@@ -64,10 +64,33 @@ class HealthCheckResult:
|
|
64
64
|
error_message: Optional[str] = None
|
65
65
|
is_healthy: bool = field(init=False)
|
66
66
|
|
67
|
+
# Additional attributes for compatibility
|
68
|
+
check_name: str = field(default="", init=False)
|
69
|
+
message: str = field(default="", init=False)
|
70
|
+
error: Optional[str] = field(default=None, init=False)
|
71
|
+
metadata: Dict[str, Any] = field(default_factory=dict, init=False)
|
72
|
+
|
67
73
|
def __post_init__(self):
|
68
|
-
"""Calculate health status."""
|
74
|
+
"""Calculate health status and initialize compatibility fields."""
|
69
75
|
self.is_healthy = self.status in [HealthStatus.HEALTHY, HealthStatus.DEGRADED]
|
70
76
|
|
77
|
+
# Initialize compatibility fields
|
78
|
+
self.check_name = self.service_name
|
79
|
+
self.error = self.error_message
|
80
|
+
self.metadata = self.details.copy()
|
81
|
+
|
82
|
+
# Set message based on status
|
83
|
+
if self.status == HealthStatus.HEALTHY:
|
84
|
+
self.message = "Service is healthy"
|
85
|
+
elif self.status == HealthStatus.DEGRADED:
|
86
|
+
self.message = "Service is degraded but functional"
|
87
|
+
elif self.status == HealthStatus.UNHEALTHY:
|
88
|
+
self.message = (
|
89
|
+
f"Service is unhealthy: {self.error_message or 'Unknown error'}"
|
90
|
+
)
|
91
|
+
else:
|
92
|
+
self.message = "Service status unknown"
|
93
|
+
|
71
94
|
|
72
95
|
@dataclass
|
73
96
|
class HealthMetrics:
|
@@ -123,10 +146,20 @@ class HealthCheck(ABC):
|
|
123
146
|
class DatabaseHealthCheck(HealthCheck):
|
124
147
|
"""Health check for database connections."""
|
125
148
|
|
126
|
-
def __init__(self, name: str,
|
149
|
+
def __init__(self, name: str, database_node_or_connection_string, **kwargs):
|
127
150
|
"""Initialize database health check."""
|
128
151
|
super().__init__(name, **kwargs)
|
129
|
-
self.
|
152
|
+
self.check_name = name # Required by HealthCheckManager
|
153
|
+
|
154
|
+
# Handle both database node objects and connection strings
|
155
|
+
if hasattr(database_node_or_connection_string, "execute"):
|
156
|
+
# It's a database node object
|
157
|
+
self.database_node = database_node_or_connection_string
|
158
|
+
self.connection_string = None
|
159
|
+
else:
|
160
|
+
# It's a connection string
|
161
|
+
self.connection_string = database_node_or_connection_string
|
162
|
+
self.database_node = None
|
130
163
|
|
131
164
|
async def check_health(self) -> HealthCheckResult:
|
132
165
|
"""Check database health."""
|
@@ -134,41 +167,72 @@ class DatabaseHealthCheck(HealthCheck):
|
|
134
167
|
check_id = str(uuid4())
|
135
168
|
|
136
169
|
try:
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
170
|
+
if self.database_node:
|
171
|
+
# Use database node object directly
|
172
|
+
result = await self.database_node.execute(
|
173
|
+
"SELECT 1 as health_check", "dict"
|
174
|
+
)
|
141
175
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
176
|
+
response_time = (time.time() - start_time) * 1000
|
177
|
+
|
178
|
+
if result and result.get("success"):
|
179
|
+
return HealthCheckResult(
|
180
|
+
check_id=check_id,
|
181
|
+
service_name=self.name,
|
182
|
+
status=HealthStatus.HEALTHY,
|
183
|
+
response_time_ms=response_time,
|
184
|
+
details={
|
185
|
+
"query_executed": True,
|
186
|
+
"query_result": result.get("data", []),
|
187
|
+
},
|
188
|
+
)
|
189
|
+
else:
|
190
|
+
return HealthCheckResult(
|
191
|
+
check_id=check_id,
|
192
|
+
service_name=self.name,
|
193
|
+
status=HealthStatus.DEGRADED,
|
194
|
+
response_time_ms=response_time,
|
195
|
+
details={"query_executed": True, "query_result": []},
|
196
|
+
error_message="Query returned no success result",
|
197
|
+
)
|
198
|
+
else:
|
199
|
+
# Use connection string with SQL node
|
200
|
+
from src.kailash.nodes.data.sql import SQLDatabaseNode
|
147
201
|
|
148
|
-
|
202
|
+
sql_node = SQLDatabaseNode(connection_string=self.connection_string)
|
149
203
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
details={
|
157
|
-
"query_executed": True,
|
158
|
-
"rows_returned": len(result["data"]),
|
159
|
-
"execution_time": result.get("execution_time", 0),
|
160
|
-
},
|
161
|
-
)
|
162
|
-
else:
|
163
|
-
return HealthCheckResult(
|
164
|
-
check_id=check_id,
|
165
|
-
service_name=self.name,
|
166
|
-
status=HealthStatus.DEGRADED,
|
167
|
-
response_time_ms=response_time,
|
168
|
-
details={"query_executed": True, "rows_returned": 0},
|
169
|
-
error_message="Query returned no data",
|
204
|
+
# Execute simple health check query
|
205
|
+
result = await asyncio.wait_for(
|
206
|
+
asyncio.to_thread(
|
207
|
+
sql_node.execute, query="SELECT 1 as health_check"
|
208
|
+
),
|
209
|
+
timeout=self.timeout,
|
170
210
|
)
|
171
211
|
|
212
|
+
response_time = (time.time() - start_time) * 1000
|
213
|
+
|
214
|
+
if "data" in result and len(result["data"]) > 0:
|
215
|
+
return HealthCheckResult(
|
216
|
+
check_id=check_id,
|
217
|
+
service_name=self.name,
|
218
|
+
status=HealthStatus.HEALTHY,
|
219
|
+
response_time_ms=response_time,
|
220
|
+
details={
|
221
|
+
"query_executed": True,
|
222
|
+
"rows_returned": len(result["data"]),
|
223
|
+
"execution_time": result.get("execution_time", 0),
|
224
|
+
},
|
225
|
+
)
|
226
|
+
else:
|
227
|
+
return HealthCheckResult(
|
228
|
+
check_id=check_id,
|
229
|
+
service_name=self.name,
|
230
|
+
status=HealthStatus.DEGRADED,
|
231
|
+
response_time_ms=response_time,
|
232
|
+
details={"query_executed": True, "rows_returned": 0},
|
233
|
+
error_message="Query returned no data",
|
234
|
+
)
|
235
|
+
|
172
236
|
except asyncio.TimeoutError:
|
173
237
|
response_time = (time.time() - start_time) * 1000
|
174
238
|
return HealthCheckResult(
|
@@ -196,6 +260,7 @@ class RedisHealthCheck(HealthCheck):
|
|
196
260
|
"""Initialize Redis health check."""
|
197
261
|
super().__init__(name, **kwargs)
|
198
262
|
self.redis_config = redis_config
|
263
|
+
self.check_name = name # Required by HealthCheckManager
|
199
264
|
|
200
265
|
async def check_health(self) -> HealthCheckResult:
|
201
266
|
"""Check Redis health."""
|
@@ -249,6 +314,148 @@ class RedisHealthCheck(HealthCheck):
|
|
249
314
|
)
|
250
315
|
|
251
316
|
|
317
|
+
class MemoryHealthCheck(HealthCheck):
|
318
|
+
"""Health check for system memory usage."""
|
319
|
+
|
320
|
+
def __init__(
|
321
|
+
self,
|
322
|
+
name: str,
|
323
|
+
warning_threshold: float = 80.0,
|
324
|
+
critical_threshold: float = 95.0,
|
325
|
+
**kwargs,
|
326
|
+
):
|
327
|
+
"""Initialize memory health check."""
|
328
|
+
super().__init__(name, **kwargs)
|
329
|
+
self.warning_threshold = warning_threshold
|
330
|
+
self.critical_threshold = critical_threshold
|
331
|
+
self.check_name = name # Required by HealthCheckManager
|
332
|
+
|
333
|
+
async def check_health(self) -> HealthCheckResult:
|
334
|
+
"""Check system memory health."""
|
335
|
+
start_time = time.time()
|
336
|
+
check_id = str(uuid4())
|
337
|
+
|
338
|
+
try:
|
339
|
+
import psutil
|
340
|
+
|
341
|
+
memory = psutil.virtual_memory()
|
342
|
+
response_time = (time.time() - start_time) * 1000
|
343
|
+
|
344
|
+
# Determine status based on memory usage
|
345
|
+
if memory.percent >= self.critical_threshold:
|
346
|
+
status = HealthStatus.UNHEALTHY
|
347
|
+
message = f"Critical memory usage: {memory.percent:.1f}%"
|
348
|
+
elif memory.percent >= self.warning_threshold:
|
349
|
+
status = HealthStatus.DEGRADED
|
350
|
+
message = f"High memory usage: {memory.percent:.1f}%"
|
351
|
+
else:
|
352
|
+
status = HealthStatus.HEALTHY
|
353
|
+
message = f"Memory usage normal: {memory.percent:.1f}%"
|
354
|
+
|
355
|
+
return HealthCheckResult(
|
356
|
+
check_id=check_id,
|
357
|
+
service_name=self.name,
|
358
|
+
status=status,
|
359
|
+
response_time_ms=response_time,
|
360
|
+
details={
|
361
|
+
"memory_percent": memory.percent,
|
362
|
+
"total_memory": memory.total,
|
363
|
+
"available_memory": memory.available,
|
364
|
+
"used_memory": memory.used,
|
365
|
+
},
|
366
|
+
error_message=message if status != HealthStatus.HEALTHY else None,
|
367
|
+
)
|
368
|
+
|
369
|
+
except Exception as e:
|
370
|
+
response_time = (time.time() - start_time) * 1000
|
371
|
+
return HealthCheckResult(
|
372
|
+
check_id=check_id,
|
373
|
+
service_name=self.name,
|
374
|
+
status=HealthStatus.UNHEALTHY,
|
375
|
+
response_time_ms=response_time,
|
376
|
+
error_message=str(e),
|
377
|
+
)
|
378
|
+
|
379
|
+
|
380
|
+
class CustomHealthCheck(HealthCheck):
|
381
|
+
"""Custom health check that executes user-defined check function."""
|
382
|
+
|
383
|
+
def __init__(self, name: str, check_function: Callable, **kwargs):
|
384
|
+
"""Initialize custom health check."""
|
385
|
+
super().__init__(name, **kwargs)
|
386
|
+
self.check_function = check_function
|
387
|
+
self.check_name = name # Required by HealthCheckManager
|
388
|
+
|
389
|
+
async def check_health(self) -> HealthCheckResult:
|
390
|
+
"""Execute custom health check function."""
|
391
|
+
start_time = time.time()
|
392
|
+
check_id = str(uuid4())
|
393
|
+
|
394
|
+
try:
|
395
|
+
# Execute the custom check function
|
396
|
+
if asyncio.iscoroutinefunction(self.check_function):
|
397
|
+
result = await asyncio.wait_for(
|
398
|
+
self.check_function(), timeout=self.timeout
|
399
|
+
)
|
400
|
+
else:
|
401
|
+
result = await asyncio.wait_for(
|
402
|
+
asyncio.to_thread(self.check_function), timeout=self.timeout
|
403
|
+
)
|
404
|
+
|
405
|
+
response_time = (time.time() - start_time) * 1000
|
406
|
+
|
407
|
+
# Handle different result formats
|
408
|
+
if isinstance(result, bool):
|
409
|
+
status = HealthStatus.HEALTHY if result else HealthStatus.UNHEALTHY
|
410
|
+
message = "Check passed" if result else "Check failed"
|
411
|
+
details = {"result": result}
|
412
|
+
elif isinstance(result, dict):
|
413
|
+
# Expect dict with status, message, metadata
|
414
|
+
status_str = result.get("status", "healthy").lower()
|
415
|
+
if status_str == "healthy":
|
416
|
+
status = HealthStatus.HEALTHY
|
417
|
+
elif status_str == "degraded":
|
418
|
+
status = HealthStatus.DEGRADED
|
419
|
+
else:
|
420
|
+
status = HealthStatus.UNHEALTHY
|
421
|
+
|
422
|
+
message = result.get("message", "Custom check completed")
|
423
|
+
details = result.get("metadata", {})
|
424
|
+
else:
|
425
|
+
# Assume success if we get any non-false result
|
426
|
+
status = HealthStatus.HEALTHY
|
427
|
+
message = "Custom check completed"
|
428
|
+
details = {"result": str(result)}
|
429
|
+
|
430
|
+
return HealthCheckResult(
|
431
|
+
check_id=check_id,
|
432
|
+
service_name=self.name,
|
433
|
+
status=status,
|
434
|
+
response_time_ms=response_time,
|
435
|
+
details=details,
|
436
|
+
error_message=None if status == HealthStatus.HEALTHY else message,
|
437
|
+
)
|
438
|
+
|
439
|
+
except asyncio.TimeoutError:
|
440
|
+
response_time = (time.time() - start_time) * 1000
|
441
|
+
return HealthCheckResult(
|
442
|
+
check_id=check_id,
|
443
|
+
service_name=self.name,
|
444
|
+
status=HealthStatus.UNHEALTHY,
|
445
|
+
response_time_ms=response_time,
|
446
|
+
error_message=f"Custom health check timeout after {self.timeout}s",
|
447
|
+
)
|
448
|
+
except Exception as e:
|
449
|
+
response_time = (time.time() - start_time) * 1000
|
450
|
+
return HealthCheckResult(
|
451
|
+
check_id=check_id,
|
452
|
+
service_name=self.name,
|
453
|
+
status=HealthStatus.UNHEALTHY,
|
454
|
+
response_time_ms=response_time,
|
455
|
+
error_message=str(e),
|
456
|
+
)
|
457
|
+
|
458
|
+
|
252
459
|
class HTTPHealthCheck(HealthCheck):
|
253
460
|
"""Health check for HTTP endpoints."""
|
254
461
|
|
@@ -257,6 +464,7 @@ class HTTPHealthCheck(HealthCheck):
|
|
257
464
|
super().__init__(name, **kwargs)
|
258
465
|
self.url = url
|
259
466
|
self.expected_status = expected_status
|
467
|
+
self.check_name = name # Required by HealthCheckManager
|
260
468
|
|
261
469
|
async def check_health(self) -> HealthCheckResult:
|
262
470
|
"""Check HTTP endpoint health."""
|
@@ -576,3 +784,195 @@ async def quick_health_check(service_name: str) -> bool:
|
|
576
784
|
return result.is_healthy if result else False
|
577
785
|
except Exception:
|
578
786
|
return False
|
787
|
+
|
788
|
+
|
789
|
+
@dataclass
|
790
|
+
class HealthSummary:
|
791
|
+
"""Health summary for all checks."""
|
792
|
+
|
793
|
+
total_checks: int
|
794
|
+
healthy_checks: int
|
795
|
+
degraded_checks: int
|
796
|
+
unhealthy_checks: int
|
797
|
+
overall_status: HealthStatus
|
798
|
+
details: List[HealthCheckResult]
|
799
|
+
|
800
|
+
|
801
|
+
class HealthCheckManager:
|
802
|
+
"""Manager for orchestrating multiple health checks with configuration."""
|
803
|
+
|
804
|
+
def __init__(self, config: Dict[str, Any]):
|
805
|
+
"""Initialize health check manager with configuration."""
|
806
|
+
self.config = config
|
807
|
+
self.enabled = config.get("enabled", True)
|
808
|
+
self.default_interval = config.get("default_interval", 30.0)
|
809
|
+
self.parallel_checks = config.get("parallel_checks", True)
|
810
|
+
self.max_concurrent_checks = config.get("max_concurrent_checks", 10)
|
811
|
+
|
812
|
+
self.health_checks: Dict[str, HealthCheck] = {}
|
813
|
+
self.check_intervals: Dict[str, float] = {}
|
814
|
+
self.last_results: Dict[str, HealthCheckResult] = {}
|
815
|
+
self.history: Dict[str, List[HealthCheckResult]] = {}
|
816
|
+
self.status_change_callbacks: List[Callable] = []
|
817
|
+
self._running = False
|
818
|
+
|
819
|
+
def register_health_check(self, health_check: HealthCheck, interval: float = None):
|
820
|
+
"""Register a health check with optional interval."""
|
821
|
+
check_name = health_check.check_name
|
822
|
+
self.health_checks[check_name] = health_check
|
823
|
+
self.check_intervals[check_name] = interval or self.default_interval
|
824
|
+
self.history[check_name] = []
|
825
|
+
|
826
|
+
async def run_health_check(self, check_name: str) -> HealthCheckResult:
|
827
|
+
"""Run a specific health check."""
|
828
|
+
if check_name not in self.health_checks:
|
829
|
+
raise ValueError(f"Health check '{check_name}' not found")
|
830
|
+
|
831
|
+
health_check = self.health_checks[check_name]
|
832
|
+
result = await health_check.check_health()
|
833
|
+
|
834
|
+
# Check for status changes before storing new result
|
835
|
+
await self._check_status_change(check_name, result)
|
836
|
+
|
837
|
+
# Store result
|
838
|
+
self.last_results[check_name] = result
|
839
|
+
self.history[check_name].append(result)
|
840
|
+
|
841
|
+
return result
|
842
|
+
|
843
|
+
async def run_all_health_checks(self) -> List[HealthCheckResult]:
|
844
|
+
"""Run all registered health checks."""
|
845
|
+
if not self.health_checks:
|
846
|
+
return []
|
847
|
+
|
848
|
+
if self.parallel_checks:
|
849
|
+
# Run checks in parallel
|
850
|
+
tasks = [
|
851
|
+
self.run_health_check(check_name)
|
852
|
+
for check_name in self.health_checks.keys()
|
853
|
+
]
|
854
|
+
|
855
|
+
# Limit concurrency
|
856
|
+
semaphore = asyncio.Semaphore(self.max_concurrent_checks)
|
857
|
+
|
858
|
+
async def run_with_semaphore(task):
|
859
|
+
async with semaphore:
|
860
|
+
return await task
|
861
|
+
|
862
|
+
results = await asyncio.gather(
|
863
|
+
*[run_with_semaphore(task) for task in tasks]
|
864
|
+
)
|
865
|
+
else:
|
866
|
+
# Run checks sequentially
|
867
|
+
results = []
|
868
|
+
for check_name in self.health_checks.keys():
|
869
|
+
result = await self.run_health_check(check_name)
|
870
|
+
results.append(result)
|
871
|
+
|
872
|
+
return results
|
873
|
+
|
874
|
+
async def get_health_summary(self) -> HealthSummary:
|
875
|
+
"""Get summary of all health checks."""
|
876
|
+
results = await self.run_all_health_checks()
|
877
|
+
|
878
|
+
healthy_count = sum(1 for r in results if r.status == HealthStatus.HEALTHY)
|
879
|
+
degraded_count = sum(1 for r in results if r.status == HealthStatus.DEGRADED)
|
880
|
+
unhealthy_count = sum(1 for r in results if r.status == HealthStatus.UNHEALTHY)
|
881
|
+
|
882
|
+
# Determine overall status
|
883
|
+
if unhealthy_count > 0:
|
884
|
+
overall_status = HealthStatus.UNHEALTHY
|
885
|
+
elif degraded_count > 0:
|
886
|
+
overall_status = HealthStatus.DEGRADED
|
887
|
+
elif healthy_count > 0:
|
888
|
+
overall_status = HealthStatus.HEALTHY
|
889
|
+
else:
|
890
|
+
overall_status = HealthStatus.UNKNOWN
|
891
|
+
|
892
|
+
return HealthSummary(
|
893
|
+
total_checks=len(results),
|
894
|
+
healthy_checks=healthy_count,
|
895
|
+
degraded_checks=degraded_count,
|
896
|
+
unhealthy_checks=unhealthy_count,
|
897
|
+
overall_status=overall_status,
|
898
|
+
details=results,
|
899
|
+
)
|
900
|
+
|
901
|
+
def add_status_change_callback(self, callback: Callable):
|
902
|
+
"""Add callback for status changes."""
|
903
|
+
self.status_change_callbacks.append(callback)
|
904
|
+
|
905
|
+
def get_health_history(
|
906
|
+
self, check_name: str, limit: int = None
|
907
|
+
) -> List[HealthCheckResult]:
|
908
|
+
"""Get health check history for a specific check."""
|
909
|
+
history = self.history.get(check_name, [])
|
910
|
+
if limit:
|
911
|
+
return history[-limit:]
|
912
|
+
return history
|
913
|
+
|
914
|
+
async def _check_status_change(self, check_name: str, result: HealthCheckResult):
|
915
|
+
"""Check if status has changed and notify callbacks."""
|
916
|
+
if check_name in self.last_results:
|
917
|
+
previous = self.last_results[check_name]
|
918
|
+
if previous.status != result.status:
|
919
|
+
# Status changed, notify callbacks
|
920
|
+
for callback in self.status_change_callbacks:
|
921
|
+
try:
|
922
|
+
await callback(check_name, result)
|
923
|
+
except Exception as e:
|
924
|
+
logger.error(f"Error in status change callback: {e}")
|
925
|
+
|
926
|
+
async def shutdown(self):
|
927
|
+
"""Shutdown the health check manager."""
|
928
|
+
self._running = False
|
929
|
+
# Any cleanup logic here
|
930
|
+
|
931
|
+
|
932
|
+
# Global health manager instance for convenience functions
|
933
|
+
_global_health_manager: Optional[HealthCheckManager] = None
|
934
|
+
|
935
|
+
|
936
|
+
def get_health_manager() -> HealthCheckManager:
|
937
|
+
"""Get the global health manager instance."""
|
938
|
+
global _global_health_manager
|
939
|
+
if _global_health_manager is None:
|
940
|
+
config = {
|
941
|
+
"enabled": True,
|
942
|
+
"default_interval": 30.0,
|
943
|
+
"parallel_checks": True,
|
944
|
+
"max_concurrent_checks": 10,
|
945
|
+
}
|
946
|
+
_global_health_manager = HealthCheckManager(config)
|
947
|
+
return _global_health_manager
|
948
|
+
|
949
|
+
|
950
|
+
# Add convenience functions for registering health checks
|
951
|
+
async def register_database_health_check(
|
952
|
+
name: str, database_node, interval: float = 30.0
|
953
|
+
):
|
954
|
+
"""Register a database health check with global manager."""
|
955
|
+
health_check = DatabaseHealthCheck(name, database_node)
|
956
|
+
manager = get_health_manager()
|
957
|
+
manager.register_health_check(health_check, interval)
|
958
|
+
|
959
|
+
|
960
|
+
async def register_memory_health_check(
|
961
|
+
name: str,
|
962
|
+
warning_threshold: float = 80.0,
|
963
|
+
critical_threshold: float = 95.0,
|
964
|
+
interval: float = 30.0,
|
965
|
+
):
|
966
|
+
"""Register a memory health check with global manager."""
|
967
|
+
health_check = MemoryHealthCheck(name, warning_threshold, critical_threshold)
|
968
|
+
manager = get_health_manager()
|
969
|
+
manager.register_health_check(health_check, interval)
|
970
|
+
|
971
|
+
|
972
|
+
async def register_custom_health_check(
|
973
|
+
name: str, check_func: Callable, interval: float = 30.0, timeout: float = 10.0
|
974
|
+
):
|
975
|
+
"""Register a custom health check with global manager."""
|
976
|
+
health_check = CustomHealthCheck(name, check_func, timeout=timeout)
|
977
|
+
manager = get_health_manager()
|
978
|
+
manager.register_health_check(health_check, interval)
|
kailash/edge/compliance.py
CHANGED
@@ -785,6 +785,39 @@ class ComplianceRouter:
|
|
785
785
|
"""Get recent compliance decisions from audit log."""
|
786
786
|
return self.audit_log[-limit:]
|
787
787
|
|
788
|
+
def is_compliant_location(
|
789
|
+
self,
|
790
|
+
location: "EdgeLocation",
|
791
|
+
data_class: DataClassification,
|
792
|
+
required_zones: List[str],
|
793
|
+
) -> bool:
|
794
|
+
"""Check if a location is compliant for given data class and zones.
|
795
|
+
|
796
|
+
Args:
|
797
|
+
location: Edge location to check
|
798
|
+
data_class: Classification of the data
|
799
|
+
required_zones: Required compliance zones
|
800
|
+
|
801
|
+
Returns:
|
802
|
+
True if location is compliant
|
803
|
+
"""
|
804
|
+
# Avoid circular import
|
805
|
+
from kailash.edge.location import EdgeRegion
|
806
|
+
|
807
|
+
# Check if location has all required compliance zones
|
808
|
+
location_zones = [z.value for z in location.compliance_zones]
|
809
|
+
|
810
|
+
# For GDPR compliance, PII/EU_PERSONAL data must be in EU regions or GDPR-compliant zones
|
811
|
+
if "gdpr" in required_zones and data_class in [
|
812
|
+
DataClassification.PII,
|
813
|
+
DataClassification.EU_PERSONAL,
|
814
|
+
]:
|
815
|
+
# Check if location has GDPR compliance zone
|
816
|
+
return "gdpr" in location_zones
|
817
|
+
|
818
|
+
# For other cases, check if location has the required zones
|
819
|
+
return all(zone in location_zones for zone in required_zones)
|
820
|
+
|
788
821
|
def get_compliance_summary(self) -> Dict[str, Any]:
|
789
822
|
"""Get summary of compliance decisions and performance."""
|
790
823
|
if not self.audit_log:
|