kailash-0.9.15-py3-none-any.whl → kailash-0.9.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. kailash/__init__.py +4 -3
  2. kailash/middleware/database/base_models.py +7 -1
  3. kailash/migration/__init__.py +30 -0
  4. kailash/migration/cli.py +340 -0
  5. kailash/migration/compatibility_checker.py +662 -0
  6. kailash/migration/configuration_validator.py +837 -0
  7. kailash/migration/documentation_generator.py +1828 -0
  8. kailash/migration/examples/__init__.py +5 -0
  9. kailash/migration/examples/complete_migration_example.py +692 -0
  10. kailash/migration/migration_assistant.py +715 -0
  11. kailash/migration/performance_comparator.py +760 -0
  12. kailash/migration/regression_detector.py +1141 -0
  13. kailash/migration/tests/__init__.py +6 -0
  14. kailash/migration/tests/test_compatibility_checker.py +403 -0
  15. kailash/migration/tests/test_integration.py +463 -0
  16. kailash/migration/tests/test_migration_assistant.py +397 -0
  17. kailash/migration/tests/test_performance_comparator.py +433 -0
  18. kailash/monitoring/__init__.py +29 -2
  19. kailash/monitoring/asyncsql_metrics.py +275 -0
  20. kailash/nodes/data/async_sql.py +1828 -33
  21. kailash/runtime/local.py +1255 -8
  22. kailash/runtime/monitoring/__init__.py +1 -0
  23. kailash/runtime/monitoring/runtime_monitor.py +780 -0
  24. kailash/runtime/resource_manager.py +3033 -0
  25. kailash/sdk_exceptions.py +21 -0
  26. kailash/workflow/cyclic_runner.py +18 -2
  27. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/METADATA +1 -1
  28. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/RECORD +33 -14
  29. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/WHEEL +0 -0
  30. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/entry_points.txt +0 -0
  31. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/LICENSE +0 -0
  32. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/NOTICE +0 -0
  33. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/top_level.txt +0 -0
kailash/runtime/monitoring/runtime_monitor.py (new file)
@@ -0,0 +1,780 @@
+ """Runtime monitoring and metrics collection.
+
+ This module provides comprehensive monitoring capabilities for the enhanced
+ LocalRuntime, including resource monitoring, health checks, performance
+ tracking, and enterprise integration.
+
+ Components:
+ - RuntimeMonitor: Overall runtime health and performance tracking
+ - ResourceMonitor: Resource usage and limits monitoring
+ - HealthChecker: Health check coordination and reporting
+ """
+
+ import asyncio
+ import logging
+ import threading
+ import time
+ from collections import defaultdict, deque
+ from datetime import UTC, datetime
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ import psutil
+
+ logger = logging.getLogger(__name__)
+
+
+ class ResourceMonitor:
+     """Monitors resource usage and enforces limits."""
+
+     def __init__(
+         self,
+         resource_limits: Optional[Dict[str, Any]] = None,
+         monitoring_interval: float = 1.0,
+         alert_thresholds: Optional[Dict[str, float]] = None,
+         history_size: int = 100,
+     ):
+         """Initialize resource monitor.
+
+         Args:
+             resource_limits: Limits for various resources
+             monitoring_interval: How often to check resources (seconds)
+             alert_thresholds: Thresholds for triggering alerts (0.0-1.0)
+             history_size: Number of historical samples to keep
+         """
+         self.resource_limits = resource_limits or {}
+         self.monitoring_interval = monitoring_interval
+         self.alert_thresholds = alert_thresholds or {
+             "memory": 0.8,
+             "connections": 0.9,
+             "cpu": 0.85,
+         }
+         self.history_size = history_size
+
+         # Current usage tracking
+         self._current_usage: Dict[str, Any] = {}
+         self._connections: Dict[str, Any] = {}
+         self._usage_history: deque = deque(maxlen=history_size)
+
+         # Monitoring state
+         self._is_monitoring = False
+         self._monitoring_task: Optional[asyncio.Task] = None
+         self._lock = threading.RLock()
+
+         # Validate limits
+         self._validate_resource_limits()
+
+         logger.info("ResourceMonitor initialized")
+
+     def _validate_resource_limits(self) -> None:
+         """Validate resource limits configuration."""
+         for key, value in self.resource_limits.items():
+             if isinstance(value, (int, float)) and value < 0:
+                 raise ValueError(f"Resource limit '{key}' cannot be negative: {value}")
+
+     def get_current_memory_usage(self) -> float:
+         """Get current memory usage in MB.
+
+         Returns:
+             Current memory usage in megabytes
+         """
+         try:
+             process = psutil.Process()
+             memory_info = process.memory_info()
+             return memory_info.rss / (1024 * 1024)  # Convert to MB
+         except Exception as e:
+             logger.warning(f"Failed to get memory usage: {e}")
+             return 0.0
+
+     def add_connection(self, connection_id: str) -> None:
+         """Add a connection for tracking.
+
+         Args:
+             connection_id: Unique connection identifier
+         """
+         with self._lock:
+             self._connections[connection_id] = {
+                 "created_at": datetime.now(UTC),
+                 "last_used": datetime.now(UTC),
+             }
+
+     def remove_connection(self, connection_id: str) -> None:
+         """Remove a connection from tracking.
+
+         Args:
+             connection_id: Connection identifier to remove
+         """
+         with self._lock:
+             self._connections.pop(connection_id, None)
+
+     def get_connection_count(self) -> int:
+         """Get current number of tracked connections.
+
+         Returns:
+             Number of active connections
+         """
+         with self._lock:
+             return len(self._connections)
+
+     def check_resource_limits(self) -> bool:
+         """Check if current usage is within limits.
+
+         Returns:
+             True if within limits, False otherwise
+         """
+         violations = self.get_limit_violations()
+         return len(violations) == 0
+
+     def get_limit_violations(self) -> Dict[str, Any]:
+         """Get current limit violations.
+
+         Returns:
+             Dictionary of violated limits with details
+         """
+         violations = {}
+
+         # Check memory limit
+         if "max_memory_mb" in self.resource_limits:
+             current_memory = self.get_current_memory_usage()
+             limit = self.resource_limits["max_memory_mb"]
+             if current_memory > limit:
+                 violations["memory"] = {
+                     "current": current_memory,
+                     "limit": limit,
+                     "violation_percent": (current_memory / limit - 1) * 100,
+                 }
+
+         # Check connection limit
+         if "max_connections" in self.resource_limits:
+             current_connections = self.get_connection_count()
+             limit = self.resource_limits["max_connections"]
+             if current_connections > limit:
+                 violations["connections"] = {
+                     "current": current_connections,
+                     "limit": limit,
+                     "violation_count": current_connections - limit,
+                 }
+
+         return violations
+
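
A quick sketch of the limit-checking methods above (the class continues below), assuming the import path implied by this diff; the limit values and connection IDs are illustrative:

    from kailash.runtime.monitoring.runtime_monitor import ResourceMonitor

    # Hypothetical limits; only max_memory_mb and max_connections are checked above.
    monitor = ResourceMonitor(resource_limits={"max_memory_mb": 512, "max_connections": 2})
    monitor.add_connection("conn-1")
    monitor.add_connection("conn-2")
    monitor.add_connection("conn-3")  # one over the illustrative limit

    if not monitor.check_resource_limits():
        # e.g. {"connections": {"current": 3, "limit": 2, "violation_count": 1}}
        print(monitor.get_limit_violations())
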
+     async def start_monitoring(self) -> None:
+         """Start continuous resource monitoring."""
+         if self._is_monitoring:
+             return
+
+         self._is_monitoring = True
+         self._monitoring_task = asyncio.create_task(self._monitoring_loop())
+         logger.info("Started resource monitoring")
+
+     async def stop_monitoring(self) -> None:
+         """Stop continuous resource monitoring."""
+         if not self._is_monitoring:
+             return
+
+         self._is_monitoring = False
+         if self._monitoring_task:
+             self._monitoring_task.cancel()
+             try:
+                 await self._monitoring_task
+             except asyncio.CancelledError:
+                 pass
+
+         logger.info("Stopped resource monitoring")
+
+     async def _monitoring_loop(self) -> None:
+         """Main monitoring loop."""
+         while self._is_monitoring:
+             try:
+                 # Collect current usage
+                 usage_sample = {
+                     "timestamp": datetime.now(UTC),
+                     "memory_mb": self.get_current_memory_usage(),
+                     "connections": self.get_connection_count(),
+                     "cpu_percent": self._get_cpu_usage(),
+                 }
+
+                 self._record_usage_sample(usage_sample)
+
+                 # Wait for next interval
+                 await asyncio.sleep(self.monitoring_interval)
+
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 logger.error(f"Error in monitoring loop: {e}")
+                 await asyncio.sleep(1)  # Brief pause before retry
+
+     def _get_cpu_usage(self) -> float:
+         """Get current CPU usage percentage."""
+         try:
+             return psutil.cpu_percent(interval=0.1)
+         except Exception as e:
+             logger.warning(f"Failed to get CPU usage: {e}")
+             return 0.0
+
+     def _record_usage_sample(self, sample: Dict[str, Any]) -> None:
+         """Record a usage sample in history.
+
+         Args:
+             sample: Usage sample to record
+         """
+         with self._lock:
+             self._usage_history.append(sample)
+             self._current_usage = sample.copy()
+
+     def get_monitoring_metrics(self) -> List[Dict[str, Any]]:
+         """Get monitoring metrics history.
+
+         Returns:
+             List of historical usage samples
+         """
+         with self._lock:
+             return list(self._usage_history)
+
+     def get_usage_history(self) -> List[Dict[str, Any]]:
+         """Get usage history.
+
+         Returns:
+             List of historical usage samples
+         """
+         return self.get_monitoring_metrics()
+
+     @property
+     def is_monitoring(self) -> bool:
+         """Check if monitoring is active."""
+         return self._is_monitoring
+
+
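
A sketch of the background sampling loop in use, assuming an asyncio entry point; the interval, history size, and sleep duration are illustrative:

    import asyncio

    from kailash.runtime.monitoring.runtime_monitor import ResourceMonitor

    async def main() -> None:
        monitor = ResourceMonitor(monitoring_interval=1.0, history_size=10)
        await monitor.start_monitoring()
        await asyncio.sleep(5)  # let the loop collect roughly five samples
        await monitor.stop_monitoring()
        for sample in monitor.get_usage_history():
            print(sample["timestamp"], sample["memory_mb"], sample["cpu_percent"])

    asyncio.run(main())
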
+ class RuntimeMonitor:
+     """Overall runtime monitoring and health tracking."""
+
+     def __init__(
+         self,
+         runtime_id: str,
+         enable_performance_tracking: bool = True,
+         enable_health_checks: bool = True,
+         metrics_collector: Optional[Any] = None,
+         audit_logger: Optional[Any] = None,
+         alert_manager: Optional[Any] = None,
+         dashboard_client: Optional[Any] = None,
+     ):
+         """Initialize runtime monitor.
+
+         Args:
+             runtime_id: Unique runtime identifier
+             enable_performance_tracking: Enable execution performance tracking
+             enable_health_checks: Enable health check system
+             metrics_collector: Enterprise metrics collector
+             audit_logger: Enterprise audit logger
+             alert_manager: Enterprise alert manager
+             dashboard_client: Enterprise dashboard client
+         """
+         self.runtime_id = runtime_id
+         self.enable_performance_tracking = enable_performance_tracking
+         self.enable_health_checks = enable_health_checks
+
+         # Enterprise integrations
+         self.metrics_collector = metrics_collector
+         self.audit_logger = audit_logger
+         self.alert_manager = alert_manager
+         self.dashboard_client = dashboard_client
+
+         # Performance tracking
+         self._execution_metrics: List[Dict[str, Any]] = []
+         self._performance_benchmarks: List[Dict[str, Any]] = []
+         self._active_executions: Dict[str, Dict[str, Any]] = {}
+
+         # Health checks
+         self._health_checks: Dict[str, Callable] = {}
+         self._async_health_checks: Dict[str, Callable] = {}
+
+         # Thread safety
+         self._lock = threading.RLock()
+
+         logger.info(f"RuntimeMonitor initialized for {runtime_id}")
+
+     def start_execution_tracking(self, workflow_id: str) -> str:
+         """Start tracking workflow execution.
+
+         Args:
+             workflow_id: Workflow identifier
+
+         Returns:
+             Execution tracking ID
+         """
+         if not self.enable_performance_tracking:
+             return ""
+
+         execution_id = f"exec_{int(time.time() * 1000)}_{workflow_id}"
+
+         with self._lock:
+             self._active_executions[execution_id] = {
+                 "workflow_id": workflow_id,
+                 "start_time": time.time(),
+                 "start_timestamp": datetime.now(UTC),
+             }
+
+         return execution_id
+
+     def end_execution_tracking(self, execution_id: str, success: bool) -> None:
+         """End execution tracking.
+
+         Args:
+             execution_id: Execution tracking ID
+             success: Whether execution was successful
+         """
+         if not self.enable_performance_tracking or not execution_id:
+             return
+
+         with self._lock:
+             if execution_id in self._active_executions:
+                 execution_info = self._active_executions.pop(execution_id)
+
+                 end_time = time.time()
+                 duration_ms = (end_time - execution_info["start_time"]) * 1000
+
+                 metric = {
+                     "execution_id": execution_id,
+                     "workflow_id": execution_info["workflow_id"],
+                     "start_time": execution_info["start_timestamp"],
+                     "end_time": datetime.now(UTC),
+                     "duration_ms": duration_ms,
+                     "success": success,
+                 }
+
+                 self._execution_metrics.append(metric)
+
+                 # Report to enterprise metrics if available
+                 if self.metrics_collector:
+                     try:
+                         self.metrics_collector.record_metric(
+                             "workflow_execution", metric
+                         )
+                     except Exception as e:
+                         logger.warning(f"Failed to record enterprise metric: {e}")
+
+     def get_execution_metrics(self) -> List[Dict[str, Any]]:
+         """Get execution metrics.
+
+         Returns:
+             List of execution metrics
+         """
+         with self._lock:
+             return self._execution_metrics.copy()
+
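
A sketch of the execution-tracking pair above (the class continues below); `run_workflow` is a hypothetical stand-in for whatever actually executes the workflow, and the IDs are illustrative:

    from kailash.runtime.monitoring.runtime_monitor import RuntimeMonitor

    monitor = RuntimeMonitor(runtime_id="runtime-1")
    execution_id = monitor.start_execution_tracking("my_workflow")
    try:
        run_workflow()  # hypothetical; not part of this module
        monitor.end_execution_tracking(execution_id, success=True)
    except Exception:
        monitor.end_execution_tracking(execution_id, success=False)

    print(monitor.get_execution_metrics()[-1]["duration_ms"])
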
+     def register_health_check(self, name: str, check_function: Callable) -> None:
+         """Register a health check function.
+
+         Args:
+             name: Health check name
+             check_function: Function that returns health status
+         """
+         self._health_checks[name] = check_function
+
+     def register_async_health_check(self, name: str, check_function: Callable) -> None:
+         """Register an async health check function.
+
+         Args:
+             name: Health check name
+             check_function: Async function that returns health status
+         """
+         self._async_health_checks[name] = check_function
+
+     def run_health_checks(self) -> Dict[str, Any]:
+         """Run all registered health checks.
+
+         Returns:
+             Health check results
+         """
+         if not self.enable_health_checks:
+             return {}
+
+         results = {}
+
+         for name, check_func in self._health_checks.items():
+             try:
+                 result = check_func()
+                 results[name] = (
+                     result if isinstance(result, dict) else {"status": str(result)}
+                 )
+             except Exception as e:
+                 results[name] = {"status": "error", "error": str(e)}
+                 logger.warning(f"Health check '{name}' failed: {e}")
+
+         return results
+
+     async def run_async_health_checks(self) -> Dict[str, Any]:
+         """Run all registered async health checks.
+
+         Returns:
+             Async health check results
+         """
+         if not self.enable_health_checks:
+             return {}
+
+         results = {}
+
+         for name, check_func in self._async_health_checks.items():
+             try:
+                 result = await check_func()
+                 results[name] = (
+                     result if isinstance(result, dict) else {"status": str(result)}
+                 )
+             except Exception as e:
+                 results[name] = {"status": "error", "error": str(e)}
+                 logger.warning(f"Async health check '{name}' failed: {e}")
+
+         return results
+
+     def get_aggregated_metrics(self) -> Dict[str, Any]:
+         """Get aggregated performance metrics.
+
+         Returns:
+             Aggregated metrics summary
+         """
+         with self._lock:
+             if not self._execution_metrics:
+                 return {
+                     "total_executions": 0,
+                     "success_rate": 0.0,
+                     "avg_execution_time_ms": 0.0,
+                 }
+
+             total = len(self._execution_metrics)
+             successful = sum(1 for m in self._execution_metrics if m["success"])
+             success_rate = successful / total if total > 0 else 0.0
+
+             avg_duration = (
+                 sum(m["duration_ms"] for m in self._execution_metrics) / total
+             )
+
+             return {
+                 "total_executions": total,
+                 "success_rate": success_rate,
+                 "avg_execution_time_ms": avg_duration,
+                 "successful_executions": successful,
+                 "failed_executions": total - successful,
+             }
+
+     def record_performance_benchmark(
+         self,
+         operation: str,
+         duration_ms: float,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """Record a performance benchmark.
+
+         Args:
+             operation: Operation name
+             duration_ms: Duration in milliseconds
+             metadata: Additional metadata
+         """
+         benchmark = {
+             "operation": operation,
+             "duration_ms": duration_ms,
+             "timestamp": datetime.now(UTC),
+             "metadata": metadata or {},
+         }
+
+         with self._lock:
+             self._performance_benchmarks.append(benchmark)
+
+     def get_performance_benchmarks(self) -> List[Dict[str, Any]]:
+         """Get performance benchmarks.
+
+         Returns:
+             List of performance benchmarks
+         """
+         with self._lock:
+             return self._performance_benchmarks.copy()
+
+     def log_audit_event(self, event_type: str, details: Dict[str, Any]) -> None:
+         """Log an audit event.
+
+         Args:
+             event_type: Type of audit event
+             details: Event details
+         """
+         if self.audit_logger:
+             try:
+                 self.audit_logger.log_event(
+                     {
+                         "runtime_id": self.runtime_id,
+                         "event_type": event_type,
+                         "timestamp": datetime.now(UTC),
+                         "details": details,
+                     }
+                 )
+             except Exception as e:
+                 logger.warning(f"Failed to log audit event: {e}")
+
+     def check_and_trigger_alerts(self, metrics: Dict[str, Any]) -> None:
+         """Check metrics and trigger alerts if needed.
+
+         Args:
+             metrics: Current metrics to check
+         """
+         if not self.alert_manager:
+             return
+
+         try:
+             # Check for alert conditions
+             alerts = []
+
+             if metrics.get("memory_usage_percent", 0) > 90:
+                 alerts.append(
+                     {
+                         "type": "high_memory_usage",
+                         "severity": "warning",
+                         "message": f"High memory usage: {metrics['memory_usage_percent']:.1f}%",
+                     }
+                 )
+
+             if metrics.get("error_rate", 0) > 0.1:
+                 alerts.append(
+                     {
+                         "type": "high_error_rate",
+                         "severity": "critical",
+                         "message": f"High error rate: {metrics['error_rate']:.1%}",
+                     }
+                 )
+
+             # Trigger alerts
+             for alert in alerts:
+                 self.alert_manager.trigger_alert(alert)
+
+         except Exception as e:
+             logger.warning(f"Failed to check/trigger alerts: {e}")
+
+     async def push_metrics_to_dashboard(self, metrics: Dict[str, Any]) -> None:
+         """Push metrics to enterprise dashboard.
+
+         Args:
+             metrics: Metrics to push
+         """
+         if not self.dashboard_client:
+             return
+
+         try:
+             await self.dashboard_client.push_metrics(metrics)
+         except Exception as e:
+             logger.warning(f"Failed to push metrics to dashboard: {e}")
+
+
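
A sketch of the health-check and aggregation APIs above; note that non-dict results are wrapped as {"status": str(result)}, so the second check below reports {"status": "healthy"}:

    from kailash.runtime.monitoring.runtime_monitor import RuntimeMonitor

    monitor = RuntimeMonitor(runtime_id="runtime-1")
    monitor.register_health_check("database", lambda: {"status": "healthy"})
    monitor.register_health_check("disk", lambda: "healthy")  # wrapped as a dict

    print(monitor.run_health_checks())
    print(monitor.get_aggregated_metrics())  # all zeros until executions are tracked
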
+ class HealthChecker:
+     """Coordinates health checks and status reporting."""
+
+     def __init__(self):
+         """Initialize health checker."""
+         self._checks: Dict[str, Callable] = {}
+
+     def register_check(self, name: str, check_func: Callable) -> None:
+         """Register a health check.
+
+         Args:
+             name: Check name
+             check_func: Function that returns health status
+         """
+         self._checks[name] = check_func
+
+     def run_checks(self) -> Dict[str, Any]:
+         """Run all health checks.
+
+         Returns:
+             Health check results
+         """
+         results = {}
+         overall_status = "healthy"
+
+         for name, check_func in self._checks.items():
+             try:
+                 result = check_func()
+                 if isinstance(result, dict):
+                     results[name] = result
+                     if result.get("status") != "healthy":
+                         overall_status = "degraded"
+                 else:
+                     results[name] = {"status": "healthy" if result else "unhealthy"}
+                     if not result:
+                         overall_status = "degraded"
+             except Exception as e:
+                 results[name] = {"status": "error", "error": str(e)}
+                 overall_status = "unhealthy"
+
+         return {
+             "status": overall_status,
+             "details": results,
+             "timestamp": datetime.now(UTC).isoformat(),
+         }
+
+
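
Unlike RuntimeMonitor.run_health_checks(), HealthChecker rolls individual results into a single overall status; a minimal sketch:

    from kailash.runtime.monitoring.runtime_monitor import HealthChecker

    checker = HealthChecker()
    checker.register_check("database", lambda: {"status": "healthy"})
    checker.register_check("cache", lambda: False)  # falsy -> "unhealthy"

    report = checker.run_checks()
    print(report["status"])            # "degraded", since one check failed
    print(report["details"]["cache"])  # {"status": "unhealthy"}
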
+ # Enterprise monitoring integration adapters
+ class PrometheusAdapter:
+     """Adapter for Prometheus metrics integration."""
+
+     def __init__(
+         self, prefix: str = "kailash", labels: Optional[Dict[str, str]] = None
+     ):
+         """Initialize Prometheus adapter.
+
+         Args:
+             prefix: Metric name prefix
+             labels: Default labels to add to all metrics
+         """
+         self.prefix = prefix
+         self.default_labels = labels or {}
+         self._metrics_cache = {}
+
+         try:
+             # Try to import prometheus_client if available
+             import prometheus_client
+
+             self.prometheus_client = prometheus_client
+             self.enabled = True
+             logger.info("Prometheus adapter initialized")
+         except ImportError:
+             self.prometheus_client = None
+             self.enabled = False
+             logger.warning(
+                 "Prometheus client not available - metrics will be logged only"
+             )
+
+     def counter(self, name: str, description: str, labels: List[str] = None) -> Any:
+         """Get or create a counter metric."""
+         full_name = f"{self.prefix}_{name}"
+
+         if not self.enabled:
+             return MockMetric(full_name, "counter")
+
+         if full_name not in self._metrics_cache:
+             self._metrics_cache[full_name] = self.prometheus_client.Counter(
+                 full_name, description, labels or []
+             )
+
+         return self._metrics_cache[full_name]
+
+     def gauge(self, name: str, description: str, labels: List[str] = None) -> Any:
+         """Get or create a gauge metric."""
+         full_name = f"{self.prefix}_{name}"
+
+         if not self.enabled:
+             return MockMetric(full_name, "gauge")
+
+         if full_name not in self._metrics_cache:
+             self._metrics_cache[full_name] = self.prometheus_client.Gauge(
+                 full_name, description, labels or []
+             )
+
+         return self._metrics_cache[full_name]
+
+
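
A sketch of the adapter above; when prometheus_client is not installed it hands back a MockMetric (defined further down) that only logs, so the calling code is identical either way. Metric and label names here are illustrative:

    from kailash.runtime.monitoring.runtime_monitor import PrometheusAdapter

    adapter = PrometheusAdapter(prefix="kailash_runtime")

    counter = adapter.counter("workflows_total", "Total workflows executed", ["status"])
    counter.labels(status="success").inc()

    gauge = adapter.gauge("active_connections", "Currently active connections")
    gauge.set(12)
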
+ class DataDogAdapter:
+     """Adapter for DataDog metrics integration."""
+
+     def __init__(self, prefix: str = "kailash", tags: List[str] = None):
+         """Initialize DataDog adapter."""
+         self.prefix = prefix
+         self.default_tags = tags or []
+
+         try:
+             import datadog
+
+             self.datadog = datadog
+             self.enabled = True
+             logger.info("DataDog adapter initialized")
+         except ImportError:
+             self.datadog = None
+             self.enabled = False
+             logger.warning("DataDog client not available - metrics will be logged only")
+
+     def increment(self, metric: str, value: int = 1, tags: List[str] = None) -> None:
+         """Increment a counter metric."""
+         full_name = f"{self.prefix}.{metric}"
+         all_tags = self.default_tags + (tags or [])
+
+         if self.enabled:
+             self.datadog.statsd.increment(full_name, value, tags=all_tags)
+         else:
+             logger.info(f"DataDog metric: {full_name} += {value} (tags: {all_tags})")
+
+     def gauge(self, metric: str, value: float, tags: List[str] = None) -> None:
+         """Set a gauge metric."""
+         full_name = f"{self.prefix}.{metric}"
+         all_tags = self.default_tags + (tags or [])
+
+         if self.enabled:
+             self.datadog.statsd.gauge(full_name, value, tags=all_tags)
+         else:
+             logger.info(f"DataDog metric: {full_name} = {value} (tags: {all_tags})")
+
+
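
A sketch of the DataDog adapter; with the datadog package installed, datadog.statsd sends to a local agent (assumed to be configured elsewhere), otherwise the calls just log. The env tag and metric names are illustrative:

    from kailash.runtime.monitoring.runtime_monitor import DataDogAdapter

    adapter = DataDogAdapter(prefix="kailash.runtime", tags=["env:dev"])
    adapter.increment("workflow.executions", tags=["workflow_id:my_workflow"])
    adapter.gauge("resource.memory_mb", 256.0)
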
+ class MockMetric:
+     """Mock metric for when real monitoring is not available."""
+
+     def __init__(self, name: str, metric_type: str):
+         self.name = name
+         self.metric_type = metric_type
+
+     def inc(self, amount: float = 1, **kwargs) -> None:
+         """Mock increment."""
+         logger.debug(f"Mock {self.metric_type} {self.name} += {amount}")
+
+     def set(self, value: float, **kwargs) -> None:
+         """Mock set."""
+         logger.debug(f"Mock {self.metric_type} {self.name} = {value}")
+
+     def labels(self, **kwargs):
+         """Mock labels."""
+         return self
+
+
+ class EnterpriseMonitoringManager:
+     """Manages enterprise monitoring integrations."""
+
+     def __init__(self, runtime_id: str):
+         """Initialize enterprise monitoring manager."""
+         self.runtime_id = runtime_id
+         self.adapters: Dict[str, Any] = {}
+
+         # Initialize available adapters
+         self.adapters["prometheus"] = PrometheusAdapter(
+             prefix="kailash_runtime", labels={"runtime_id": runtime_id}
+         )
+
+         self.adapters["datadog"] = DataDogAdapter(
+             prefix="kailash.runtime", tags=[f"runtime_id:{runtime_id}"]
+         )
+
+         logger.info(f"Enterprise monitoring initialized for runtime {runtime_id}")
+
+     def record_workflow_execution(
+         self, workflow_id: str, duration_ms: float, success: bool
+     ) -> None:
+         """Record workflow execution metrics."""
+         # Prometheus
+         if self.adapters["prometheus"].enabled:
+             counter = self.adapters["prometheus"].counter(
+                 "workflows_total",
+                 "Total workflows executed",
+                 ["workflow_id", "success"],
+             )
+             counter.labels(workflow_id=workflow_id, success=str(success)).inc()
+
+         # DataDog
+         self.adapters["datadog"].increment(
+             "workflow.executions",
+             tags=[f"workflow_id:{workflow_id}", f"success:{success}"],
+         )
+
+     def record_resource_usage(self, resource_type: str, value: float) -> None:
+         """Record resource usage metrics."""
+         # Prometheus
+         if self.adapters["prometheus"].enabled:
+             gauge = self.adapters["prometheus"].gauge(
+                 f"resource_{resource_type}", f"{resource_type} usage", ["resource_type"]
+             )
+             gauge.labels(resource_type=resource_type).set(value)
+
+         # DataDog
+         self.adapters["datadog"].gauge(
+             f"resource.{resource_type}", value, tags=[f"resource_type:{resource_type}"]
+         )
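
Finally, a sketch of the manager tying both adapters together; the runtime ID, workflow ID, and values are illustrative:

    from kailash.runtime.monitoring.runtime_monitor import EnterpriseMonitoringManager

    manager = EnterpriseMonitoringManager(runtime_id="runtime-1")

    # Fans the same observation out to Prometheus and DataDog in one call.
    manager.record_workflow_execution("my_workflow", duration_ms=125.0, success=True)
    manager.record_resource_usage("memory_mb", 256.0)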