kailash 0.9.15__py3-none-any.whl → 0.9.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +4 -3
- kailash/middleware/database/base_models.py +7 -1
- kailash/migration/__init__.py +30 -0
- kailash/migration/cli.py +340 -0
- kailash/migration/compatibility_checker.py +662 -0
- kailash/migration/configuration_validator.py +837 -0
- kailash/migration/documentation_generator.py +1828 -0
- kailash/migration/examples/__init__.py +5 -0
- kailash/migration/examples/complete_migration_example.py +692 -0
- kailash/migration/migration_assistant.py +715 -0
- kailash/migration/performance_comparator.py +760 -0
- kailash/migration/regression_detector.py +1141 -0
- kailash/migration/tests/__init__.py +6 -0
- kailash/migration/tests/test_compatibility_checker.py +403 -0
- kailash/migration/tests/test_integration.py +463 -0
- kailash/migration/tests/test_migration_assistant.py +397 -0
- kailash/migration/tests/test_performance_comparator.py +433 -0
- kailash/monitoring/__init__.py +29 -2
- kailash/monitoring/asyncsql_metrics.py +275 -0
- kailash/nodes/data/async_sql.py +1828 -33
- kailash/runtime/local.py +1255 -8
- kailash/runtime/monitoring/__init__.py +1 -0
- kailash/runtime/monitoring/runtime_monitor.py +780 -0
- kailash/runtime/resource_manager.py +3033 -0
- kailash/sdk_exceptions.py +21 -0
- kailash/workflow/cyclic_runner.py +18 -2
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/METADATA +1 -1
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/RECORD +33 -14
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/WHEEL +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/entry_points.txt +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/NOTICE +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/top_level.txt +0 -0
kailash/runtime/monitoring/runtime_monitor.py (new file)
@@ -0,0 +1,780 @@
"""Runtime monitoring and metrics collection.

This module provides comprehensive monitoring capabilities for the enhanced
LocalRuntime, including resource monitoring, health checks, performance
tracking, and enterprise integration.

Components:
- RuntimeMonitor: Overall runtime health and performance tracking
- ResourceMonitor: Resource usage and limits monitoring
- HealthChecker: Health check coordination and reporting
"""

import asyncio
import logging
import threading
import time
from collections import defaultdict, deque
from datetime import UTC, datetime
from typing import Any, Callable, Dict, List, Optional, Union

import psutil

logger = logging.getLogger(__name__)


class ResourceMonitor:
    """Monitors resource usage and enforces limits."""

    def __init__(
        self,
        resource_limits: Optional[Dict[str, Any]] = None,
        monitoring_interval: float = 1.0,
        alert_thresholds: Optional[Dict[str, float]] = None,
        history_size: int = 100,
    ):
        """Initialize resource monitor.

        Args:
            resource_limits: Limits for various resources
            monitoring_interval: How often to check resources (seconds)
            alert_thresholds: Thresholds for triggering alerts (0.0-1.0)
            history_size: Number of historical samples to keep
        """
        self.resource_limits = resource_limits or {}
        self.monitoring_interval = monitoring_interval
        self.alert_thresholds = alert_thresholds or {
            "memory": 0.8,
            "connections": 0.9,
            "cpu": 0.85,
        }
        self.history_size = history_size

        # Current usage tracking
        self._current_usage: Dict[str, Any] = {}
        self._connections: Dict[str, Any] = {}
        self._usage_history: deque = deque(maxlen=history_size)

        # Monitoring state
        self._is_monitoring = False
        self._monitoring_task: Optional[asyncio.Task] = None
        self._lock = threading.RLock()

        # Validate limits
        self._validate_resource_limits()

        logger.info("ResourceMonitor initialized")

    def _validate_resource_limits(self) -> None:
        """Validate resource limits configuration."""
        for key, value in self.resource_limits.items():
            if isinstance(value, (int, float)) and value < 0:
                raise ValueError(f"Resource limit '{key}' cannot be negative: {value}")

    def get_current_memory_usage(self) -> float:
        """Get current memory usage in MB.

        Returns:
            Current memory usage in megabytes
        """
        try:
            process = psutil.Process()
            memory_info = process.memory_info()
            return memory_info.rss / (1024 * 1024)  # Convert to MB
        except Exception as e:
            logger.warning(f"Failed to get memory usage: {e}")
            return 0.0

    def add_connection(self, connection_id: str) -> None:
        """Add a connection for tracking.

        Args:
            connection_id: Unique connection identifier
        """
        with self._lock:
            self._connections[connection_id] = {
                "created_at": datetime.now(UTC),
                "last_used": datetime.now(UTC),
            }

    def remove_connection(self, connection_id: str) -> None:
        """Remove a connection from tracking.

        Args:
            connection_id: Connection identifier to remove
        """
        with self._lock:
            self._connections.pop(connection_id, None)

    def get_connection_count(self) -> int:
        """Get current number of tracked connections.

        Returns:
            Number of active connections
        """
        with self._lock:
            return len(self._connections)

    def check_resource_limits(self) -> bool:
        """Check if current usage is within limits.

        Returns:
            True if within limits, False otherwise
        """
        violations = self.get_limit_violations()
        return len(violations) == 0

    def get_limit_violations(self) -> Dict[str, Any]:
        """Get current limit violations.

        Returns:
            Dictionary of violated limits with details
        """
        violations = {}

        # Check memory limit
        if "max_memory_mb" in self.resource_limits:
            current_memory = self.get_current_memory_usage()
            limit = self.resource_limits["max_memory_mb"]
            if current_memory > limit:
                violations["memory"] = {
                    "current": current_memory,
                    "limit": limit,
                    "violation_percent": (current_memory / limit - 1) * 100,
                }

        # Check connection limit
        if "max_connections" in self.resource_limits:
            current_connections = self.get_connection_count()
            limit = self.resource_limits["max_connections"]
            if current_connections > limit:
                violations["connections"] = {
                    "current": current_connections,
                    "limit": limit,
                    "violation_count": current_connections - limit,
                }

        return violations

    async def start_monitoring(self) -> None:
        """Start continuous resource monitoring."""
        if self._is_monitoring:
            return

        self._is_monitoring = True
        self._monitoring_task = asyncio.create_task(self._monitoring_loop())
        logger.info("Started resource monitoring")

    async def stop_monitoring(self) -> None:
        """Stop continuous resource monitoring."""
        if not self._is_monitoring:
            return

        self._is_monitoring = False
        if self._monitoring_task:
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass

        logger.info("Stopped resource monitoring")

    async def _monitoring_loop(self) -> None:
        """Main monitoring loop."""
        while self._is_monitoring:
            try:
                # Collect current usage
                usage_sample = {
                    "timestamp": datetime.now(UTC),
                    "memory_mb": self.get_current_memory_usage(),
                    "connections": self.get_connection_count(),
                    "cpu_percent": self._get_cpu_usage(),
                }

                self._record_usage_sample(usage_sample)

                # Wait for next interval
                await asyncio.sleep(self.monitoring_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                await asyncio.sleep(1)  # Brief pause before retry

    def _get_cpu_usage(self) -> float:
        """Get current CPU usage percentage."""
        try:
            return psutil.cpu_percent(interval=0.1)
        except Exception as e:
            logger.warning(f"Failed to get CPU usage: {e}")
            return 0.0

    def _record_usage_sample(self, sample: Dict[str, Any]) -> None:
        """Record a usage sample in history.

        Args:
            sample: Usage sample to record
        """
        with self._lock:
            self._usage_history.append(sample)
            self._current_usage = sample.copy()

    def get_monitoring_metrics(self) -> List[Dict[str, Any]]:
        """Get monitoring metrics history.

        Returns:
            List of historical usage samples
        """
        with self._lock:
            return list(self._usage_history)

    def get_usage_history(self) -> List[Dict[str, Any]]:
        """Get usage history.

        Returns:
            List of historical usage samples
        """
        return self.get_monitoring_metrics()

    @property
    def is_monitoring(self) -> bool:
        """Check if monitoring is active."""
        return self._is_monitoring
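For context, a minimal usage sketch of the ResourceMonitor above (not part of the diff; the limits, connection IDs, and the import path inferred from the file list are illustrative):

from kailash.runtime.monitoring.runtime_monitor import ResourceMonitor

monitor = ResourceMonitor(resource_limits={"max_memory_mb": 512, "max_connections": 2})
monitor.add_connection("conn-1")
monitor.add_connection("conn-2")
monitor.add_connection("conn-3")  # one more than max_connections
if not monitor.check_resource_limits():
    print(monitor.get_limit_violations())  # includes a "connections" violation (3 > 2)
# Continuous sampling is async: await monitor.start_monitoring() / await monitor.stop_monitoring()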
class RuntimeMonitor:
    """Overall runtime monitoring and health tracking."""

    def __init__(
        self,
        runtime_id: str,
        enable_performance_tracking: bool = True,
        enable_health_checks: bool = True,
        metrics_collector: Optional[Any] = None,
        audit_logger: Optional[Any] = None,
        alert_manager: Optional[Any] = None,
        dashboard_client: Optional[Any] = None,
    ):
        """Initialize runtime monitor.

        Args:
            runtime_id: Unique runtime identifier
            enable_performance_tracking: Enable execution performance tracking
            enable_health_checks: Enable health check system
            metrics_collector: Enterprise metrics collector
            audit_logger: Enterprise audit logger
            alert_manager: Enterprise alert manager
            dashboard_client: Enterprise dashboard client
        """
        self.runtime_id = runtime_id
        self.enable_performance_tracking = enable_performance_tracking
        self.enable_health_checks = enable_health_checks

        # Enterprise integrations
        self.metrics_collector = metrics_collector
        self.audit_logger = audit_logger
        self.alert_manager = alert_manager
        self.dashboard_client = dashboard_client

        # Performance tracking
        self._execution_metrics: List[Dict[str, Any]] = []
        self._performance_benchmarks: List[Dict[str, Any]] = []
        self._active_executions: Dict[str, Dict[str, Any]] = {}

        # Health checks
        self._health_checks: Dict[str, Callable] = {}
        self._async_health_checks: Dict[str, Callable] = {}

        # Thread safety
        self._lock = threading.RLock()

        logger.info(f"RuntimeMonitor initialized for {runtime_id}")

    def start_execution_tracking(self, workflow_id: str) -> str:
        """Start tracking workflow execution.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Execution tracking ID
        """
        if not self.enable_performance_tracking:
            return ""

        execution_id = f"exec_{int(time.time() * 1000)}_{workflow_id}"

        with self._lock:
            self._active_executions[execution_id] = {
                "workflow_id": workflow_id,
                "start_time": time.time(),
                "start_timestamp": datetime.now(UTC),
            }

        return execution_id

    def end_execution_tracking(self, execution_id: str, success: bool) -> None:
        """End execution tracking.

        Args:
            execution_id: Execution tracking ID
            success: Whether execution was successful
        """
        if not self.enable_performance_tracking or not execution_id:
            return

        with self._lock:
            if execution_id in self._active_executions:
                execution_info = self._active_executions.pop(execution_id)

                end_time = time.time()
                duration_ms = (end_time - execution_info["start_time"]) * 1000

                metric = {
                    "execution_id": execution_id,
                    "workflow_id": execution_info["workflow_id"],
                    "start_time": execution_info["start_timestamp"],
                    "end_time": datetime.now(UTC),
                    "duration_ms": duration_ms,
                    "success": success,
                }

                self._execution_metrics.append(metric)

                # Report to enterprise metrics if available
                if self.metrics_collector:
                    try:
                        self.metrics_collector.record_metric(
                            "workflow_execution", metric
                        )
                    except Exception as e:
                        logger.warning(f"Failed to record enterprise metric: {e}")

    def get_execution_metrics(self) -> List[Dict[str, Any]]:
        """Get execution metrics.

        Returns:
            List of execution metrics
        """
        with self._lock:
            return self._execution_metrics.copy()

    def register_health_check(self, name: str, check_function: Callable) -> None:
        """Register a health check function.

        Args:
            name: Health check name
            check_function: Function that returns health status
        """
        self._health_checks[name] = check_function

    def register_async_health_check(self, name: str, check_function: Callable) -> None:
        """Register an async health check function.

        Args:
            name: Health check name
            check_function: Async function that returns health status
        """
        self._async_health_checks[name] = check_function

    def run_health_checks(self) -> Dict[str, Any]:
        """Run all registered health checks.

        Returns:
            Health check results
        """
        if not self.enable_health_checks:
            return {}

        results = {}

        for name, check_func in self._health_checks.items():
            try:
                result = check_func()
                results[name] = (
                    result if isinstance(result, dict) else {"status": str(result)}
                )
            except Exception as e:
                results[name] = {"status": "error", "error": str(e)}
                logger.warning(f"Health check '{name}' failed: {e}")

        return results

    async def run_async_health_checks(self) -> Dict[str, Any]:
        """Run all registered async health checks.

        Returns:
            Async health check results
        """
        if not self.enable_health_checks:
            return {}

        results = {}

        for name, check_func in self._async_health_checks.items():
            try:
                result = await check_func()
                results[name] = (
                    result if isinstance(result, dict) else {"status": str(result)}
                )
            except Exception as e:
                results[name] = {"status": "error", "error": str(e)}
                logger.warning(f"Async health check '{name}' failed: {e}")

        return results

    def get_aggregated_metrics(self) -> Dict[str, Any]:
        """Get aggregated performance metrics.

        Returns:
            Aggregated metrics summary
        """
        with self._lock:
            if not self._execution_metrics:
                return {
                    "total_executions": 0,
                    "success_rate": 0.0,
                    "avg_execution_time_ms": 0.0,
                }

            total = len(self._execution_metrics)
            successful = sum(1 for m in self._execution_metrics if m["success"])
            success_rate = successful / total if total > 0 else 0.0

            avg_duration = (
                sum(m["duration_ms"] for m in self._execution_metrics) / total
            )

            return {
                "total_executions": total,
                "success_rate": success_rate,
                "avg_execution_time_ms": avg_duration,
                "successful_executions": successful,
                "failed_executions": total - successful,
            }

    def record_performance_benchmark(
        self,
        operation: str,
        duration_ms: float,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Record a performance benchmark.

        Args:
            operation: Operation name
            duration_ms: Duration in milliseconds
            metadata: Additional metadata
        """
        benchmark = {
            "operation": operation,
            "duration_ms": duration_ms,
            "timestamp": datetime.now(UTC),
            "metadata": metadata or {},
        }

        with self._lock:
            self._performance_benchmarks.append(benchmark)

    def get_performance_benchmarks(self) -> List[Dict[str, Any]]:
        """Get performance benchmarks.

        Returns:
            List of performance benchmarks
        """
        with self._lock:
            return self._performance_benchmarks.copy()

    def log_audit_event(self, event_type: str, details: Dict[str, Any]) -> None:
        """Log an audit event.

        Args:
            event_type: Type of audit event
            details: Event details
        """
        if self.audit_logger:
            try:
                self.audit_logger.log_event(
                    {
                        "runtime_id": self.runtime_id,
                        "event_type": event_type,
                        "timestamp": datetime.now(UTC),
                        "details": details,
                    }
                )
            except Exception as e:
                logger.warning(f"Failed to log audit event: {e}")

    def check_and_trigger_alerts(self, metrics: Dict[str, Any]) -> None:
        """Check metrics and trigger alerts if needed.

        Args:
            metrics: Current metrics to check
        """
        if not self.alert_manager:
            return

        try:
            # Check for alert conditions
            alerts = []

            if metrics.get("memory_usage_percent", 0) > 90:
                alerts.append(
                    {
                        "type": "high_memory_usage",
                        "severity": "warning",
                        "message": f"High memory usage: {metrics['memory_usage_percent']:.1f}%",
                    }
                )

            if metrics.get("error_rate", 0) > 0.1:
                alerts.append(
                    {
                        "type": "high_error_rate",
                        "severity": "critical",
                        "message": f"High error rate: {metrics['error_rate']:.1%}",
                    }
                )

            # Trigger alerts
            for alert in alerts:
                self.alert_manager.trigger_alert(alert)

        except Exception as e:
            logger.warning(f"Failed to check/trigger alerts: {e}")

    async def push_metrics_to_dashboard(self, metrics: Dict[str, Any]) -> None:
        """Push metrics to enterprise dashboard.

        Args:
            metrics: Metrics to push
        """
        if not self.dashboard_client:
            return

        try:
            await self.dashboard_client.push_metrics(metrics)
        except Exception as e:
            logger.warning(f"Failed to push metrics to dashboard: {e}")
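Similarly, a sketch of how the execution-tracking API above might be driven (illustrative only; run_workflow is a hypothetical stand-in for the caller's work, and the enterprise collaborators are left unset):

runtime_monitor = RuntimeMonitor(runtime_id="local-runtime-1")
exec_id = runtime_monitor.start_execution_tracking("customer_etl")
try:
    run_workflow()  # hypothetical placeholder for the actual workflow execution
    runtime_monitor.end_execution_tracking(exec_id, success=True)
except Exception:
    runtime_monitor.end_execution_tracking(exec_id, success=False)
    raise
print(runtime_monitor.get_aggregated_metrics())  # total_executions, success_rate, avg_execution_time_ms, ...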
class HealthChecker:
    """Coordinates health checks and status reporting."""

    def __init__(self):
        """Initialize health checker."""
        self._checks: Dict[str, Callable] = {}

    def register_check(self, name: str, check_func: Callable) -> None:
        """Register a health check.

        Args:
            name: Check name
            check_func: Function that returns health status
        """
        self._checks[name] = check_func

    def run_checks(self) -> Dict[str, Any]:
        """Run all health checks.

        Returns:
            Health check results
        """
        results = {}
        overall_status = "healthy"

        for name, check_func in self._checks.items():
            try:
                result = check_func()
                if isinstance(result, dict):
                    results[name] = result
                    if result.get("status") != "healthy":
                        overall_status = "degraded"
                else:
                    results[name] = {"status": "healthy" if result else "unhealthy"}
                    if not result:
                        overall_status = "degraded"
            except Exception as e:
                results[name] = {"status": "error", "error": str(e)}
                overall_status = "unhealthy"

        return {
            "status": overall_status,
            "details": results,
            "timestamp": datetime.now(UTC).isoformat(),
        }
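A sketch of the aggregation behaviour of HealthChecker (the checks are illustrative; any non-dict truthy result counts as healthy, while a dict result is inspected for its "status" key):

checker = HealthChecker()
checker.register_check("event_loop", lambda: True)  # truthy -> {"status": "healthy"}
checker.register_check("database", lambda: {"status": "degraded", "latency_ms": 250})
report = checker.run_checks()
# report["status"] == "degraded" because one registered check did not report "healthy"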
# Enterprise monitoring integration adapters
class PrometheusAdapter:
    """Adapter for Prometheus metrics integration."""

    def __init__(
        self, prefix: str = "kailash", labels: Optional[Dict[str, str]] = None
    ):
        """Initialize Prometheus adapter.

        Args:
            prefix: Metric name prefix
            labels: Default labels to add to all metrics
        """
        self.prefix = prefix
        self.default_labels = labels or {}
        self._metrics_cache = {}

        try:
            # Try to import prometheus_client if available
            import prometheus_client

            self.prometheus_client = prometheus_client
            self.enabled = True
            logger.info("Prometheus adapter initialized")
        except ImportError:
            self.prometheus_client = None
            self.enabled = False
            logger.warning(
                "Prometheus client not available - metrics will be logged only"
            )

    def counter(self, name: str, description: str, labels: List[str] = None) -> Any:
        """Get or create a counter metric."""
        full_name = f"{self.prefix}_{name}"

        if not self.enabled:
            return MockMetric(full_name, "counter")

        if full_name not in self._metrics_cache:
            self._metrics_cache[full_name] = self.prometheus_client.Counter(
                full_name, description, labels or []
            )

        return self._metrics_cache[full_name]

    def gauge(self, name: str, description: str, labels: List[str] = None) -> Any:
        """Get or create a gauge metric."""
        full_name = f"{self.prefix}_{name}"

        if not self.enabled:
            return MockMetric(full_name, "gauge")

        if full_name not in self._metrics_cache:
            self._metrics_cache[full_name] = self.prometheus_client.Gauge(
                full_name, description, labels or []
            )

        return self._metrics_cache[full_name]


class DataDogAdapter:
    """Adapter for DataDog metrics integration."""

    def __init__(self, prefix: str = "kailash", tags: List[str] = None):
        """Initialize DataDog adapter."""
        self.prefix = prefix
        self.default_tags = tags or []

        try:
            import datadog

            self.datadog = datadog
            self.enabled = True
            logger.info("DataDog adapter initialized")
        except ImportError:
            self.datadog = None
            self.enabled = False
            logger.warning("DataDog client not available - metrics will be logged only")

    def increment(self, metric: str, value: int = 1, tags: List[str] = None) -> None:
        """Increment a counter metric."""
        full_name = f"{self.prefix}.{metric}"
        all_tags = self.default_tags + (tags or [])

        if self.enabled:
            self.datadog.statsd.increment(full_name, value, tags=all_tags)
        else:
            logger.info(f"DataDog metric: {full_name} += {value} (tags: {all_tags})")

    def gauge(self, metric: str, value: float, tags: List[str] = None) -> None:
        """Set a gauge metric."""
        full_name = f"{self.prefix}.{metric}"
        all_tags = self.default_tags + (tags or [])

        if self.enabled:
            self.datadog.statsd.gauge(full_name, value, tags=all_tags)
        else:
            logger.info(f"DataDog metric: {full_name} = {value} (tags: {all_tags})")
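A sketch of how the adapters degrade when the optional clients are missing (metric names and labels are illustrative): if prometheus_client is not importable, counter()/gauge() hand back the MockMetric defined below, and DataDogAdapter falls back to logging, so the same calling code works either way:

prom = PrometheusAdapter(prefix="kailash_runtime", labels={"runtime_id": "rt-1"})
workflows = prom.counter("workflows_total", "Total workflows executed", ["workflow_id", "success"])
workflows.labels(workflow_id="customer_etl", success="True").inc()  # real Counter or MockMetric

dd = DataDogAdapter(prefix="kailash.runtime", tags=["runtime_id:rt-1"])
dd.increment("workflow.executions", tags=["workflow_id:customer_etl", "success:True"])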
class MockMetric:
    """Mock metric for when real monitoring is not available."""

    def __init__(self, name: str, metric_type: str):
        self.name = name
        self.metric_type = metric_type

    def inc(self, amount: float = 1, **kwargs) -> None:
        """Mock increment."""
        logger.debug(f"Mock {self.metric_type} {self.name} += {amount}")

    def set(self, value: float, **kwargs) -> None:
        """Mock set."""
        logger.debug(f"Mock {self.metric_type} {self.name} = {value}")

    def labels(self, **kwargs):
        """Mock labels."""
        return self


class EnterpriseMonitoringManager:
    """Manages enterprise monitoring integrations."""

    def __init__(self, runtime_id: str):
        """Initialize enterprise monitoring manager."""
        self.runtime_id = runtime_id
        self.adapters: Dict[str, Any] = {}

        # Initialize available adapters
        self.adapters["prometheus"] = PrometheusAdapter(
            prefix="kailash_runtime", labels={"runtime_id": runtime_id}
        )

        self.adapters["datadog"] = DataDogAdapter(
            prefix="kailash.runtime", tags=[f"runtime_id:{runtime_id}"]
        )

        logger.info(f"Enterprise monitoring initialized for runtime {runtime_id}")

    def record_workflow_execution(
        self, workflow_id: str, duration_ms: float, success: bool
    ) -> None:
        """Record workflow execution metrics."""
        # Prometheus
        if self.adapters["prometheus"].enabled:
            counter = self.adapters["prometheus"].counter(
                "workflows_total",
                "Total workflows executed",
                ["workflow_id", "success"],
            )
            counter.labels(workflow_id=workflow_id, success=str(success)).inc()

        # DataDog
        self.adapters["datadog"].increment(
            "workflow.executions",
            tags=[f"workflow_id:{workflow_id}", f"success:{success}"],
        )

    def record_resource_usage(self, resource_type: str, value: float) -> None:
        """Record resource usage metrics."""
        # Prometheus
        if self.adapters["prometheus"].enabled:
            gauge = self.adapters["prometheus"].gauge(
                f"resource_{resource_type}", f"{resource_type} usage", ["resource_type"]
            )
            gauge.labels(resource_type=resource_type).set(value)

        # DataDog
        self.adapters["datadog"].gauge(
            f"resource.{resource_type}", value, tags=[f"resource_type:{resource_type}"]
        )
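Finally, an illustrative sketch of driving both backends through EnterpriseMonitoringManager (the runtime_id and values are made up; each call fans out to whichever adapters are enabled):

manager = EnterpriseMonitoringManager(runtime_id="local-runtime-1")
manager.record_workflow_execution("customer_etl", duration_ms=1250.0, success=True)
manager.record_resource_usage("memory_mb", 312.5)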