kailash 0.6.6__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. kailash/__init__.py +35 -5
  2. kailash/access_control.py +64 -46
  3. kailash/adapters/__init__.py +5 -0
  4. kailash/adapters/mcp_platform_adapter.py +273 -0
  5. kailash/api/workflow_api.py +34 -3
  6. kailash/channels/__init__.py +21 -0
  7. kailash/channels/api_channel.py +409 -0
  8. kailash/channels/base.py +271 -0
  9. kailash/channels/cli_channel.py +661 -0
  10. kailash/channels/event_router.py +496 -0
  11. kailash/channels/mcp_channel.py +648 -0
  12. kailash/channels/session.py +423 -0
  13. kailash/mcp_server/discovery.py +57 -18
  14. kailash/middleware/communication/api_gateway.py +23 -3
  15. kailash/middleware/communication/realtime.py +83 -0
  16. kailash/middleware/core/agent_ui.py +1 -1
  17. kailash/middleware/gateway/storage_backends.py +393 -0
  18. kailash/middleware/mcp/enhanced_server.py +22 -16
  19. kailash/nexus/__init__.py +21 -0
  20. kailash/nexus/cli/__init__.py +5 -0
  21. kailash/nexus/cli/__main__.py +6 -0
  22. kailash/nexus/cli/main.py +176 -0
  23. kailash/nexus/factory.py +413 -0
  24. kailash/nexus/gateway.py +545 -0
  25. kailash/nodes/__init__.py +8 -5
  26. kailash/nodes/ai/iterative_llm_agent.py +988 -17
  27. kailash/nodes/ai/llm_agent.py +29 -9
  28. kailash/nodes/api/__init__.py +2 -2
  29. kailash/nodes/api/monitoring.py +1 -1
  30. kailash/nodes/base.py +29 -5
  31. kailash/nodes/base_async.py +54 -14
  32. kailash/nodes/code/async_python.py +1 -1
  33. kailash/nodes/code/python.py +50 -6
  34. kailash/nodes/data/async_sql.py +90 -0
  35. kailash/nodes/data/bulk_operations.py +939 -0
  36. kailash/nodes/data/query_builder.py +373 -0
  37. kailash/nodes/data/query_cache.py +512 -0
  38. kailash/nodes/monitoring/__init__.py +10 -0
  39. kailash/nodes/monitoring/deadlock_detector.py +964 -0
  40. kailash/nodes/monitoring/performance_anomaly.py +1078 -0
  41. kailash/nodes/monitoring/race_condition_detector.py +1151 -0
  42. kailash/nodes/monitoring/transaction_metrics.py +790 -0
  43. kailash/nodes/monitoring/transaction_monitor.py +931 -0
  44. kailash/nodes/security/behavior_analysis.py +414 -0
  45. kailash/nodes/system/__init__.py +17 -0
  46. kailash/nodes/system/command_parser.py +820 -0
  47. kailash/nodes/transaction/__init__.py +48 -0
  48. kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
  49. kailash/nodes/transaction/saga_coordinator.py +652 -0
  50. kailash/nodes/transaction/saga_state_storage.py +411 -0
  51. kailash/nodes/transaction/saga_step.py +467 -0
  52. kailash/nodes/transaction/transaction_context.py +756 -0
  53. kailash/nodes/transaction/two_phase_commit.py +978 -0
  54. kailash/nodes/transform/processors.py +17 -1
  55. kailash/nodes/validation/__init__.py +21 -0
  56. kailash/nodes/validation/test_executor.py +532 -0
  57. kailash/nodes/validation/validation_nodes.py +447 -0
  58. kailash/resources/factory.py +1 -1
  59. kailash/runtime/access_controlled.py +9 -7
  60. kailash/runtime/async_local.py +84 -21
  61. kailash/runtime/local.py +21 -2
  62. kailash/runtime/parameter_injector.py +187 -31
  63. kailash/runtime/runner.py +6 -4
  64. kailash/runtime/testing.py +1 -1
  65. kailash/security.py +22 -3
  66. kailash/servers/__init__.py +32 -0
  67. kailash/servers/durable_workflow_server.py +430 -0
  68. kailash/servers/enterprise_workflow_server.py +522 -0
  69. kailash/servers/gateway.py +183 -0
  70. kailash/servers/workflow_server.py +293 -0
  71. kailash/utils/data_validation.py +192 -0
  72. kailash/workflow/builder.py +382 -15
  73. kailash/workflow/cyclic_runner.py +102 -10
  74. kailash/workflow/validation.py +144 -8
  75. kailash/workflow/visualization.py +99 -27
  76. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/METADATA +3 -2
  77. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/RECORD +81 -40
  78. kailash/workflow/builder_improvements.py +0 -207
  79. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/WHEEL +0 -0
  80. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/entry_points.txt +0 -0
  81. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/licenses/LICENSE +0 -0
  82. {kailash-0.6.6.dist-info → kailash-0.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1078 @@
1
+ """Performance anomaly detection node with baseline learning and statistical analysis.
2
+
3
+ This module provides comprehensive performance anomaly detection capabilities with
4
+ baseline learning, statistical analysis, and classification of performance issues.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import statistics
10
+ import time
11
+ from collections import defaultdict, deque
12
+ from dataclasses import dataclass, field
13
+ from datetime import UTC, datetime, timedelta
14
+ from enum import Enum
15
+ from typing import Any, Dict, List, Optional, Tuple
16
+
17
+ import numpy as np
18
+
19
+ from kailash.nodes.base import NodeParameter, register_node
20
+ from kailash.nodes.base_async import AsyncNode
21
+ from kailash.sdk_exceptions import NodeExecutionError
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class AnomalyType(Enum):
    """Types of performance anomalies.

    Member values are lowercase snake_case strings.
    """

    # Assigned by the threshold check when a value exceeds the baseline's
    # upper threshold.
    LATENCY_SPIKE = "latency_spike"
    # Assigned by the threshold check when a value falls below the baseline's
    # lower threshold.
    THROUGHPUT_DROP = "throughput_drop"
    # NOTE(review): the next two members are only referenced by the
    # impact/recommendation tables in the visible part of this module; no
    # visible detector emits them.
    ERROR_RATE_INCREASE = "error_rate_increase"
    RESOURCE_EXHAUSTION = "resource_exhaustion"
    # Assigned to Z-score, statistical and IQR outliers.
    RESPONSE_TIME_VARIANCE = "response_time_variance"
    # NOTE(review): referenced only by the impact table in the visible code.
    CONCURRENCY_ANOMALY = "concurrency_anomaly"
    # Assigned to rolling-average deviations.
    TREND_ANOMALY = "trend_anomaly"
36
+
37
+
38
class AnomalySeverity(Enum):
    """Severity levels for anomalies.

    Member values are lowercase strings. Note that comparing or sorting the
    string values orders them alphabetically, not by severity.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
45
+
46
+
47
class DetectionMethod(Enum):
    """Anomaly detection methods.

    Member values are the lowercase strings callers pass in the
    ``detection_methods`` parameter; they are converted with
    ``DetectionMethod(<string>)``.
    """

    STATISTICAL = "statistical"
    THRESHOLD_BASED = "threshold_based"
    ROLLING_AVERAGE = "rolling_average"
    ZSCORE = "zscore"
    IQR = "iqr"  # Interquartile Range
    # NOTE(review): no implementation of the next two methods is visible in
    # this part of the module — confirm whether they are supported.
    EXPONENTIAL_SMOOTHING = "exponential_smoothing"
    MACHINE_LEARNING = "machine_learning"
57
+
58
+
59
@dataclass
class PerformanceMetric:
    """Represents a performance metric data point."""

    # Name of the metric series this point belongs to.
    metric_name: str
    # Observed value.
    value: float
    # Time of the observation; the node defaults it to ``time.time()``
    # (seconds since the epoch) when the caller supplies none.
    timestamp: float
    # Free-form categorization tags supplied by the caller.
    tags: Dict[str, str] = field(default_factory=dict)
    # Additional caller-supplied metadata.
    metadata: Dict[str, Any] = field(default_factory=dict)
68
+
69
+
70
@dataclass
class PerformanceBaseline:
    """Performance baseline for anomaly detection.

    Holds the learned statistics and derived thresholds for one metric.
    """

    metric_name: str
    # ``time.time()`` values recorded at creation and at the latest update.
    created_at: float
    updated_at: float
    # Number of metric points folded into this baseline so far.
    sample_count: int

    # Statistical measures
    mean: float
    median: float
    std_dev: float
    min_value: float
    max_value: float
    # Populated with the keys "p50", "p90", "p95" and "p99" once enough
    # samples have been collected.
    percentiles: Dict[str, float] = field(default_factory=dict)

    # Trend analysis
    # NOTE(review): neither field is written by the code visible in this part
    # of the module.
    trend_slope: float = 0.0
    seasonal_pattern: List[float] = field(default_factory=list)

    # Detection thresholds
    # Remain 0.0 until the baseline has seen the configured minimum number of
    # samples.
    upper_threshold: float = 0.0
    lower_threshold: float = 0.0
    variance_threshold: float = 0.0

    # Learning parameters
    # Weight given to each new value in the exponential moving average of
    # ``mean``.
    learning_rate: float = 0.1
    # NOTE(review): not read by the code visible in this part of the module.
    decay_factor: float = 0.95
99
+
100
+
101
@dataclass
class PerformanceAnomaly:
    """Represents a detected performance anomaly."""

    anomaly_id: str
    anomaly_type: AnomalyType
    metric_name: str
    # ``time.time()`` at the moment the anomaly object was created (not the
    # timestamp of the offending metric).
    detected_at: float
    # Observed metric value that triggered the anomaly.
    value: float
    # Baseline mean at detection time.
    expected_value: float
    # Absolute difference between ``value`` and ``expected_value``.
    deviation: float
    severity: AnomalySeverity
    confidence: float  # 0.0 to 1.0
    detection_method: DetectionMethod
    # Human-readable explanation of why the value was flagged.
    description: str
    impact_assessment: str
    recommended_actions: List[str] = field(default_factory=list)
    # Tags and metadata carried over from the offending metric.
    tags: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
120
+
121
+
122
+ @register_node()
123
+ class PerformanceAnomalyNode(AsyncNode):
124
+ """Node for detecting performance anomalies using baseline learning.
125
+
126
+ This node provides comprehensive performance anomaly detection including:
127
+ - Baseline performance learning with adaptive algorithms
128
+ - Statistical anomaly detection (Z-score, IQR, exponential smoothing)
129
+ - Threshold-based anomaly detection with dynamic thresholds
130
+ - Trend analysis and seasonal pattern detection
131
+ - Anomaly classification with severity assessment
132
+ - Real-time monitoring with configurable sensitivity
133
+ - Integration with alerting systems
134
+
135
+ Design Purpose:
136
+ - Detect performance degradation before it impacts users
137
+ - Learn normal performance patterns automatically
138
+ - Provide actionable insights for performance optimization
139
+ - Support proactive performance monitoring
140
+
141
+ Examples:
142
+ >>> # Initialize baseline learning
143
+ >>> anomaly_detector = PerformanceAnomalyNode()
144
+ >>> result = await anomaly_detector.execute(
145
+ ... operation="initialize_baseline",
146
+ ... metric_name="api_response_time",
147
+ ... detection_methods=["statistical", "threshold_based"],
148
+ ... sensitivity=0.7
149
+ ... )
150
+
151
+ >>> # Feed performance metrics
152
+ >>> result = await anomaly_detector.execute(
153
+ ... operation="add_metric",
154
+ ... metric_name="api_response_time",
155
+ ... value=250.5,
156
+ ... tags={"endpoint": "/api/users", "method": "GET"}
157
+ ... )
158
+
159
+ >>> # Detect anomalies
160
+ >>> result = await anomaly_detector.execute(
161
+ ... operation="detect_anomalies",
162
+ ... metric_names=["api_response_time"],
163
+ ... detection_window=300.0 # 5 minutes
164
+ ... )
165
+ """
166
+
167
def __init__(self, **kwargs):
    """Set up empty detector state and the default detection settings.

    Args:
        **kwargs: Forwarded unchanged to the parent node constructor.
    """
    super().__init__(**kwargs)

    # Learned baselines, keyed by metric name.
    self._baselines: Dict[str, PerformanceBaseline] = {}
    # Per-metric buffers; each one keeps at most the 10000 newest points.
    self._metrics_buffer: Dict[str, deque] = defaultdict(
        lambda: deque(maxlen=10000)
    )
    # Every anomaly reported by detection runs so far.
    self._detected_anomalies: List[PerformanceAnomaly] = []

    # Background-monitoring bookkeeping.
    self._monitoring_active = False
    self._background_tasks: set = set()

    # Node-wide detection settings (shared by all metrics).
    self._detection_config = dict(
        sensitivity=0.8,
        min_samples=30,
        learning_rate=0.1,
        zscore_threshold=2.5,
        iqr_multiplier=1.5,
    )

    self.logger.info(f"Initialized PerformanceAnomalyNode: {self.id}")
185
+
186
def get_parameters(self) -> Dict[str, NodeParameter]:
    """Describe the input parameters accepted by this node.

    Returns:
        Mapping of parameter name to its ``NodeParameter`` definition. Only
        ``operation`` is required; the specification table (including its
        mutable defaults) is rebuilt on every call.
    """
    # Keyword arguments for each NodeParameter, minus ``name`` which is
    # filled in from the key below.
    specs = {
        "operation": dict(
            type=str,
            required=True,
            description="Operation (initialize_baseline, add_metric, detect_anomalies, get_baseline, get_anomalies, start_monitoring, stop_monitoring)",
        ),
        "metric_name": dict(
            type=str,
            required=False,
            description="Name of the performance metric",
        ),
        "metric_names": dict(
            type=list,
            required=False,
            default=[],
            description="List of metric names to process",
        ),
        "value": dict(
            type=float,
            required=False,
            description="Metric value to add",
        ),
        "timestamp": dict(
            type=float,
            required=False,
            description="Timestamp for the metric (defaults to current time)",
        ),
        "tags": dict(
            type=dict,
            required=False,
            default={},
            description="Tags for metric categorization",
        ),
        "detection_methods": dict(
            type=list,
            required=False,
            default=["statistical", "threshold_based"],
            description="Detection methods to use (statistical, threshold_based, rolling_average, zscore, iqr)",
        ),
        "sensitivity": dict(
            type=float,
            required=False,
            default=0.8,
            description="Detection sensitivity (0.0 to 1.0, higher = more sensitive)",
        ),
        "detection_window": dict(
            type=float,
            required=False,
            default=300.0,
            description="Time window for anomaly detection in seconds",
        ),
        "min_samples": dict(
            type=int,
            required=False,
            default=30,
            description="Minimum samples required for baseline learning",
        ),
        "learning_rate": dict(
            type=float,
            required=False,
            default=0.1,
            description="Learning rate for adaptive baseline updates",
        ),
        "zscore_threshold": dict(
            type=float,
            required=False,
            default=2.5,
            description="Z-score threshold for anomaly detection",
        ),
        "enable_monitoring": dict(
            type=bool,
            required=False,
            default=False,
            description="Enable continuous anomaly monitoring",
        ),
        "monitoring_interval": dict(
            type=float,
            required=False,
            default=30.0,
            description="Monitoring interval in seconds",
        ),
        "metadata": dict(
            type=dict,
            required=False,
            default={},
            description="Additional metadata for the operation",
        ),
    }
    return {
        param_name: NodeParameter(name=param_name, **spec)
        for param_name, spec in specs.items()
    }
291
+
292
def get_output_schema(self) -> Dict[str, NodeParameter]:
    """Describe the fields present in every result returned by this node.

    Returns:
        Mapping of output field name to its ``NodeParameter`` definition.
    """
    # (field name, python type, description) for each output field.
    output_fields = (
        ("anomalies_detected", list, "List of detected anomalies"),
        ("anomaly_count", int, "Number of anomalies detected"),
        ("baselines", dict, "Current performance baselines"),
        ("metrics_processed", int, "Number of metrics processed"),
        ("detection_summary", dict, "Summary of detection results"),
        ("recommendations", list, "Performance optimization recommendations"),
        ("monitoring_status", str, "Current monitoring status"),
        ("timestamp", str, "ISO timestamp of operation"),
        ("status", str, "Operation status"),
    )
    return {
        field_name: NodeParameter(
            name=field_name, type=field_type, description=text
        )
        for field_name, field_type, text in output_fields
    }
335
+
336
async def async_run(self, **kwargs) -> Dict[str, Any]:
    """Execute one performance anomaly detection operation.

    Args:
        **kwargs: Node inputs. ``operation`` selects the handler; all
            keyword arguments are forwarded to that handler unchanged.

    Returns:
        The result dictionary produced by the selected handler.

    Raises:
        NodeExecutionError: If the operation is unknown or the handler
            raises. The original exception is attached as ``__cause__``.
    """
    # Operation name -> handler method name. Handlers are resolved lazily so
    # only the requested one is looked up on the instance.
    handler_names = {
        "initialize_baseline": "_initialize_baseline",
        "add_metric": "_add_metric",
        "detect_anomalies": "_detect_anomalies",
        "get_baseline": "_get_baseline",
        "get_anomalies": "_get_anomalies",
        "start_monitoring": "_start_monitoring",
        "stop_monitoring": "_stop_monitoring",
    }
    operation = kwargs.get("operation")

    try:
        # Guard against unhashable/non-string values so they are reported as
        # an unknown operation rather than as a dict lookup TypeError.
        handler_name = (
            handler_names.get(operation) if isinstance(operation, str) else None
        )
        if handler_name is None:
            raise ValueError(f"Unknown operation: {operation}")
        return await getattr(self, handler_name)(**kwargs)

    except Exception as e:
        self.logger.error(
            f"Performance anomaly detection operation failed: {str(e)}"
        )
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise NodeExecutionError(
            f"Failed to execute anomaly detection: {str(e)}"
        ) from e
363
+
364
async def _initialize_baseline(self, **kwargs) -> Dict[str, Any]:
    """Create an empty baseline for a metric and apply detection settings.

    Reads ``metric_name``, ``detection_methods``, ``sensitivity``,
    ``min_samples`` and ``learning_rate`` from ``kwargs``. The last three are
    written into the node-wide detection configuration on every call; a
    baseline object is only created when none exists yet for the metric.

    Returns:
        Result dictionary containing the serialized baseline for the metric.

    Raises:
        ValueError: If ``metric_name`` is missing or empty.
    """
    metric_name = kwargs.get("metric_name")
    if not metric_name:
        raise ValueError("metric_name is required for initialize_baseline")

    detection_methods = kwargs.get(
        "detection_methods", ["statistical", "threshold_based"]
    )
    learning_rate = kwargs.get("learning_rate", 0.1)

    # These settings are shared by all metrics handled by this node.
    self._detection_config.update(
        sensitivity=kwargs.get("sensitivity", 0.8),
        min_samples=kwargs.get("min_samples", 30),
        learning_rate=learning_rate,
    )

    if metric_name not in self._baselines:
        now = time.time()
        # min/max start at +/- infinity so the first real value replaces them.
        self._baselines[metric_name] = PerformanceBaseline(
            metric_name=metric_name,
            created_at=now,
            updated_at=now,
            sample_count=0,
            mean=0.0,
            median=0.0,
            std_dev=0.0,
            min_value=float("inf"),
            max_value=float("-inf"),
            learning_rate=learning_rate,
        )

    self.logger.info(f"Initialized baseline for metric: {metric_name}")

    serialized = self._serialize_baseline(self._baselines[metric_name])
    monitoring_status = "monitoring" if self._monitoring_active else "idle"
    return {
        "anomalies_detected": [],
        "anomaly_count": 0,
        "baselines": {metric_name: serialized},
        "metrics_processed": 0,
        "detection_summary": {"initialized": True, "methods": detection_methods},
        "recommendations": [],
        "monitoring_status": monitoring_status,
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
418
+
419
async def _add_metric(self, **kwargs) -> Dict[str, Any]:
    """Record one metric value, update its baseline and check it at once.

    Reads ``metric_name``, ``value``, ``timestamp``, ``tags`` and
    ``metadata`` from ``kwargs``. A missing *or explicitly ``None``*
    timestamp falls back to the current time, and ``None`` tags/metadata
    fall back to empty dicts, so later time-window filtering never compares
    ``None`` against a float.

    Returns:
        Result dictionary listing any anomalies found for this single value.

    Raises:
        ValueError: If ``metric_name`` is missing/empty or ``value`` is None.
    """
    metric_name = kwargs.get("metric_name")
    value = kwargs.get("value")
    if not metric_name or value is None:
        raise ValueError("metric_name and value are required for add_metric")

    # kwargs.get(key, default) does not apply the default when the key is
    # present with a None value, so normalise explicitly.
    timestamp = kwargs.get("timestamp")
    if timestamp is None:
        timestamp = time.time()
    tags = kwargs.get("tags")
    if tags is None:
        tags = {}
    metadata = kwargs.get("metadata")
    if metadata is None:
        metadata = {}

    metric = PerformanceMetric(
        metric_name=metric_name,
        value=float(value),
        timestamp=timestamp,
        tags=tags,
        metadata=metadata,
    )
    self._metrics_buffer[metric_name].append(metric)

    # Only metrics with an initialized baseline are learned from and checked.
    # The baseline is updated first, so the check sees the new value's effect.
    anomalies = []
    if metric_name in self._baselines:
        await self._update_baseline(metric_name, metric)
        anomalies = await self._check_metric_anomalies(metric)

    self.logger.debug(f"Added metric {metric_name}={value} at {timestamp}")

    return {
        "anomalies_detected": [self._serialize_anomaly(a) for a in anomalies],
        "anomaly_count": len(anomalies),
        "baselines": {},
        "metrics_processed": 1,
        "detection_summary": {
            "immediate_check": True,
            "anomalies_found": len(anomalies),
        },
        "recommendations": [],
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
467
+
468
async def _detect_anomalies(self, **kwargs) -> Dict[str, Any]:
    """Run the requested detection methods over recently buffered metrics.

    Reads ``metric_names`` (falls back to every metric that has a baseline
    when empty), ``detection_window`` (seconds, default 300.0) and
    ``detection_methods`` from ``kwargs``. Metrics without a baseline or
    without points inside the window are skipped. All anomalies found are
    also appended to the node's stored anomaly list.

    Returns:
        Result dictionary with serialized anomalies, all serialized
        baselines, a per-metric summary and recommendations.
    """
    detection_window = kwargs.get("detection_window", 300.0)
    detection_methods = kwargs.get(
        "detection_methods", ["statistical", "threshold_based"]
    )
    metric_names = kwargs.get("metric_names", []) or list(self._baselines.keys())

    # Points older than this cutoff are ignored.
    cutoff = time.time() - detection_window

    all_anomalies = []
    detection_summary = {}

    for name in metric_names:
        if name not in self._baselines:
            continue

        in_window = [
            point for point in self._metrics_buffer[name] if point.timestamp >= cutoff
        ]
        if not in_window:
            continue

        # Collect the findings of every requested method, then collapse
        # near-duplicates before reporting.
        found = []
        for method_name in detection_methods:
            found.extend(
                await self._apply_detection_method(
                    name, in_window, DetectionMethod(method_name)
                )
            )
        deduped = self._deduplicate_anomalies(found)
        all_anomalies.extend(deduped)

        detection_summary[name] = {
            "metrics_analyzed": len(in_window),
            "anomalies_found": len(deduped),
            "methods_used": detection_methods,
        }

    self._detected_anomalies.extend(all_anomalies)
    recommendations = self._generate_recommendations(all_anomalies)

    self.logger.info(
        f"Detected {len(all_anomalies)} anomalies across {len(metric_names)} metrics"
    )

    serialized_anomalies = [self._serialize_anomaly(a) for a in all_anomalies]
    serialized_baselines = {
        key: self._serialize_baseline(item) for key, item in self._baselines.items()
    }
    metrics_processed = sum(
        entry.get("metrics_analyzed", 0) for entry in detection_summary.values()
    )
    return {
        "anomalies_detected": serialized_anomalies,
        "anomaly_count": len(all_anomalies),
        "baselines": serialized_baselines,
        "metrics_processed": metrics_processed,
        "detection_summary": detection_summary,
        "recommendations": recommendations,
        "monitoring_status": "monitoring" if self._monitoring_active else "idle",
        "timestamp": datetime.now(UTC).isoformat(),
        "status": "success",
    }
543
+
544
+ async def _update_baseline(self, metric_name: str, metric: PerformanceMetric):
545
+ """Update baseline with new metric using adaptive learning."""
546
+ baseline = self._baselines[metric_name]
547
+ value = metric.value
548
+
549
+ # Update sample count
550
+ baseline.sample_count += 1
551
+
552
+ # Update basic statistics using online algorithms
553
+ if baseline.sample_count == 1:
554
+ baseline.mean = value
555
+ baseline.median = value
556
+ baseline.min_value = value
557
+ baseline.max_value = value
558
+ baseline.std_dev = 0.0
559
+ else:
560
+ # Update mean using exponential moving average
561
+ baseline.mean = (
562
+ 1 - baseline.learning_rate
563
+ ) * baseline.mean + baseline.learning_rate * value
564
+
565
+ # Update min/max
566
+ baseline.min_value = min(baseline.min_value, value)
567
+ baseline.max_value = max(baseline.max_value, value)
568
+
569
+ # Update standard deviation using Welford's online algorithm
570
+ if baseline.sample_count >= self._detection_config["min_samples"]:
571
+ recent_metrics = list(self._metrics_buffer[metric_name])[
572
+ -self._detection_config["min_samples"] :
573
+ ]
574
+ values = [m.value for m in recent_metrics]
575
+ baseline.std_dev = float(np.std(values))
576
+ baseline.median = float(np.median(values))
577
+
578
+ # Calculate percentiles
579
+ baseline.percentiles = {
580
+ "p50": float(np.percentile(values, 50)),
581
+ "p90": float(np.percentile(values, 90)),
582
+ "p95": float(np.percentile(values, 95)),
583
+ "p99": float(np.percentile(values, 99)),
584
+ }
585
+
586
+ # Update thresholds based on sensitivity
587
+ sensitivity = self._detection_config["sensitivity"]
588
+ baseline.upper_threshold = baseline.mean + (
589
+ sensitivity * 2 * baseline.std_dev
590
+ )
591
+ baseline.lower_threshold = baseline.mean - (
592
+ sensitivity * 2 * baseline.std_dev
593
+ )
594
+ baseline.variance_threshold = baseline.std_dev * sensitivity
595
+
596
+ baseline.updated_at = time.time()
597
+
598
async def _check_metric_anomalies(
    self, metric: PerformanceMetric
) -> List[PerformanceAnomaly]:
    """Check a single freshly added metric against its learned baseline.

    Returns an empty list when the metric has no baseline or the baseline
    has seen fewer than ``min_samples`` values. Otherwise applies a
    threshold check (at most one anomaly) followed by a Z-score check
    (at most one anomaly).

    Args:
        metric: The data point to evaluate.

    Returns:
        The anomalies found for this one data point.
    """
    baseline = self._baselines.get(metric.metric_name)
    if not baseline or baseline.sample_count < self._detection_config["min_samples"]:
        return []

    findings = []

    # Threshold check: a value can breach the upper or lower bound, not both.
    breach = None
    if metric.value > baseline.upper_threshold:
        breach = (
            AnomalyType.LATENCY_SPIKE,
            f"Value {metric.value:.2f} exceeds upper threshold {baseline.upper_threshold:.2f}",
        )
    elif metric.value < baseline.lower_threshold:
        breach = (
            AnomalyType.THROUGHPUT_DROP,
            f"Value {metric.value:.2f} below lower threshold {baseline.lower_threshold:.2f}",
        )
    if breach is not None:
        kind, message = breach
        findings.append(
            self._create_anomaly(
                metric, baseline, kind, DetectionMethod.THRESHOLD_BASED, message
            )
        )

    # Z-score check: skipped when the baseline has no spread.
    if baseline.std_dev > 0:
        zscore = abs(metric.value - baseline.mean) / baseline.std_dev
        if zscore > self._detection_config["zscore_threshold"]:
            findings.append(
                self._create_anomaly(
                    metric,
                    baseline,
                    AnomalyType.RESPONSE_TIME_VARIANCE,
                    DetectionMethod.ZSCORE,
                    f"Z-score {zscore:.2f} exceeds threshold {self._detection_config['zscore_threshold']}",
                )
            )

    return findings
645
+
646
async def _apply_detection_method(
    self,
    metric_name: str,
    metrics: List[PerformanceMetric],
    method: DetectionMethod,
) -> List[PerformanceAnomaly]:
    """Apply one detection method to a window of metrics.

    Supported methods:

    - ``STATISTICAL``: Z-score of each point against the window's own mean
      and standard deviation (needs at least 10 points).
    - ``THRESHOLD_BASED``: each point against the baseline's learned
      upper/lower thresholds.
    - ``ZSCORE``: Z-score of each point against the baseline's mean and
      standard deviation.
    - ``IQR``: points outside ``[q1 - k*iqr, q3 + k*iqr]`` of the window
      (needs at least 10 points).
    - ``ROLLING_AVERAGE``: points more than 2 standard deviations from the
      mean of the preceding rolling window (needs at least 10 points).

    ``THRESHOLD_BASED`` and ``ZSCORE`` use baseline statistics, so they only
    run once the baseline has seen ``min_samples`` values; before that the
    thresholds are still at their initial 0.0. Any other method is skipped
    with a debug log entry.

    Args:
        metric_name: Metric whose baseline is used.
        metrics: Data points to analyse, in buffer order.
        method: Detection method to apply.

    Returns:
        Anomalies found by this method (possibly empty).
    """
    anomalies: List[PerformanceAnomaly] = []
    baseline = self._baselines.get(metric_name)

    if not baseline or not metrics:
        return anomalies

    values = [m.value for m in metrics]
    zscore_limit = self._detection_config["zscore_threshold"]
    baseline_ready = baseline.sample_count >= self._detection_config["min_samples"]

    if method == DetectionMethod.STATISTICAL:
        if len(values) >= 10:
            mean_val = np.mean(values)
            std_val = np.std(values)

            # A window without spread cannot contain Z-score outliers.
            if std_val > 0:
                for metric in metrics:
                    zscore = abs(metric.value - mean_val) / std_val
                    if zscore > zscore_limit:
                        anomalies.append(
                            self._create_anomaly(
                                metric,
                                baseline,
                                AnomalyType.RESPONSE_TIME_VARIANCE,
                                method,
                                f"Statistical outlier with Z-score {zscore:.2f}",
                            )
                        )

    elif method == DetectionMethod.THRESHOLD_BASED:
        # Mirrors the per-metric threshold check used when a metric is added.
        if baseline_ready:
            for metric in metrics:
                if metric.value > baseline.upper_threshold:
                    anomalies.append(
                        self._create_anomaly(
                            metric,
                            baseline,
                            AnomalyType.LATENCY_SPIKE,
                            method,
                            f"Value {metric.value:.2f} exceeds upper threshold {baseline.upper_threshold:.2f}",
                        )
                    )
                elif metric.value < baseline.lower_threshold:
                    anomalies.append(
                        self._create_anomaly(
                            metric,
                            baseline,
                            AnomalyType.THROUGHPUT_DROP,
                            method,
                            f"Value {metric.value:.2f} below lower threshold {baseline.lower_threshold:.2f}",
                        )
                    )

    elif method == DetectionMethod.ZSCORE:
        # Z-score against the learned baseline rather than the window itself.
        if baseline_ready and baseline.std_dev > 0:
            for metric in metrics:
                zscore = abs(metric.value - baseline.mean) / baseline.std_dev
                if zscore > zscore_limit:
                    anomalies.append(
                        self._create_anomaly(
                            metric,
                            baseline,
                            AnomalyType.RESPONSE_TIME_VARIANCE,
                            method,
                            f"Z-score {zscore:.2f} exceeds threshold {zscore_limit}",
                        )
                    )

    elif method == DetectionMethod.IQR:
        if len(values) >= 10:
            q1 = np.percentile(values, 25)
            q3 = np.percentile(values, 75)
            iqr = q3 - q1
            multiplier = self._detection_config.get("iqr_multiplier", 1.5)

            lower_bound = q1 - multiplier * iqr
            upper_bound = q3 + multiplier * iqr

            for metric in metrics:
                if metric.value < lower_bound or metric.value > upper_bound:
                    anomalies.append(
                        self._create_anomaly(
                            metric,
                            baseline,
                            AnomalyType.RESPONSE_TIME_VARIANCE,
                            method,
                            f"IQR outlier: value {metric.value:.2f} outside [{lower_bound:.2f}, {upper_bound:.2f}]",
                        )
                    )

    elif method == DetectionMethod.ROLLING_AVERAGE:
        if len(values) >= 10:
            window_size = min(10, len(values) // 2)
            # Each point is compared with the window of points just before it.
            for i in range(window_size, len(metrics)):
                window_values = values[i - window_size : i]
                rolling_avg = np.mean(window_values)
                rolling_std = np.std(window_values)

                current_metric = metrics[i]
                if rolling_std > 0:
                    deviation = abs(current_metric.value - rolling_avg) / rolling_std
                    if deviation > 2.0:  # 2 standard deviations
                        anomalies.append(
                            self._create_anomaly(
                                current_metric,
                                baseline,
                                AnomalyType.TREND_ANOMALY,
                                method,
                                f"Rolling average deviation: {deviation:.2f}",
                            )
                        )

    else:
        # Make skipped methods visible instead of silently returning nothing.
        self.logger.debug(
            f"Detection method '{method.value}' is not implemented for "
            f"windowed detection; skipping metric {metric_name}"
        )

    return anomalies
727
+
728
def _create_anomaly(
    self,
    metric: PerformanceMetric,
    baseline: PerformanceBaseline,
    anomaly_type: AnomalyType,
    method: DetectionMethod,
    description: str,
) -> PerformanceAnomaly:
    """Build a ``PerformanceAnomaly`` record for a flagged metric.

    The expected value is the baseline mean and the deviation is the
    absolute distance from it. Confidence grows linearly with the deviation
    and saturates at 1.0 at two baseline standard deviations.

    Args:
        metric: The offending data point.
        baseline: Baseline the point was compared against.
        anomaly_type: Classification chosen by the detector.
        method: Detection method that flagged the point.
        description: Human-readable reason for the flag.

    Returns:
        A populated anomaly record. Its ID is ``anomaly_<microseconds>_<hex>``;
        the random suffix keeps IDs unique when several anomalies are created
        within the same clock tick.
    """
    # Local import keeps this block self-contained.
    import uuid

    expected_value = baseline.mean
    deviation = abs(metric.value - expected_value)

    if baseline.std_dev > 0:
        confidence = min(1.0, deviation / (2 * baseline.std_dev))
    else:
        # Without spread, any difference from the mean is treated as certain.
        confidence = 1.0 if deviation > 0 else 0.0

    # Read the clock once so the ID and detected_at agree.
    detected_at = time.time()
    anomaly_id = f"anomaly_{int(detected_at * 1000000)}_{uuid.uuid4().hex[:8]}"

    return PerformanceAnomaly(
        anomaly_id=anomaly_id,
        anomaly_type=anomaly_type,
        metric_name=metric.metric_name,
        detected_at=detected_at,
        value=metric.value,
        expected_value=expected_value,
        deviation=deviation,
        severity=self._determine_severity(deviation, baseline),
        confidence=confidence,
        detection_method=method,
        description=description,
        impact_assessment=self._assess_impact(anomaly_type, deviation, baseline),
        recommended_actions=self._get_anomaly_recommendations(anomaly_type, metric),
        # Copy so later edits to the anomaly cannot alter the buffered metric.
        tags=dict(metric.tags or {}),
        metadata=dict(metric.metadata or {}),
    )
769
+
770
def _determine_severity(
    self, deviation: float, baseline: PerformanceBaseline
) -> AnomalySeverity:
    """Map a deviation to a severity level.

    The deviation is expressed in baseline standard deviations: more than 4
    is CRITICAL, more than 3 is HIGH, more than 2 is MEDIUM, anything else is
    LOW. A baseline without positive spread always yields MEDIUM.

    Args:
        deviation: Absolute distance of the value from the baseline mean.
        baseline: Baseline supplying the standard deviation.

    Returns:
        The severity level for the deviation.
    """
    if baseline.std_dev <= 0:
        return AnomalySeverity.MEDIUM

    sigmas = deviation / baseline.std_dev

    # Checked from the highest cut-off down; the first one exceeded wins.
    ladder = (
        (4.0, AnomalySeverity.CRITICAL),
        (3.0, AnomalySeverity.HIGH),
        (2.0, AnomalySeverity.MEDIUM),
    )
    for cutoff, level in ladder:
        if sigmas > cutoff:
            return level
    return AnomalySeverity.LOW
787
+
788
def _assess_impact(
    self, anomaly_type: AnomalyType, deviation: float, baseline: PerformanceBaseline
) -> str:
    """Return a one-sentence impact statement for an anomaly.

    Args:
        anomaly_type: Classification of the anomaly; selects the wording.
        deviation: Absolute deviation, embedded with two decimals.
        baseline: Accepted for signature compatibility; not used here.

    Returns:
        The impact text for the anomaly type, or a generic sentence when the
        type has no dedicated wording.
    """
    statements = {
        AnomalyType.LATENCY_SPIKE: f"Increased response time may impact user experience. Current deviation: {deviation:.2f}ms above baseline.",
        AnomalyType.THROUGHPUT_DROP: f"Reduced system throughput may indicate capacity issues. Current drop: {deviation:.2f} below expected.",
        AnomalyType.ERROR_RATE_INCREASE: f"Higher error rate indicates system instability. Error rate increased by {deviation:.2f}%.",
        AnomalyType.RESOURCE_EXHAUSTION: f"Resource usage spike may lead to system degradation. Usage increased by {deviation:.2f} units.",
        AnomalyType.RESPONSE_TIME_VARIANCE: f"Inconsistent response times indicate system instability. Variance deviation: {deviation:.2f}.",
        AnomalyType.CONCURRENCY_ANOMALY: f"Unusual concurrency patterns may indicate load issues. Concurrency deviation: {deviation:.2f}.",
        AnomalyType.TREND_ANOMALY: f"Performance trend anomaly detected. Pattern deviation: {deviation:.2f}.",
    }
    if anomaly_type in statements:
        return statements[anomaly_type]
    return f"Performance anomaly detected with deviation: {deviation:.2f}"
805
+
806
def _get_anomaly_recommendations(
    self, anomaly_type: AnomalyType, metric: PerformanceMetric
) -> List[str]:
    """Return the remediation checklist for an anomaly type.

    Args:
        anomaly_type: Classification of the anomaly; selects the checklist.
        metric: Accepted for signature compatibility; not used here.

    Returns:
        Four suggestions for the types that have a dedicated checklist,
        otherwise a single generic suggestion.
    """
    playbooks = {
        AnomalyType.LATENCY_SPIKE: [
            "Check for database query optimization opportunities",
            "Review recent code deployments for performance regressions",
            "Monitor system resource utilization (CPU, memory, I/O)",
            "Consider horizontal scaling if load is high",
        ],
        AnomalyType.THROUGHPUT_DROP: [
            "Investigate potential bottlenecks in request processing",
            "Check for resource contention or lock contention",
            "Review connection pool configurations",
            "Monitor downstream service dependencies",
        ],
        AnomalyType.ERROR_RATE_INCREASE: [
            "Review application logs for error patterns",
            "Check external service dependencies",
            "Validate input data quality and format",
            "Consider implementing circuit breaker patterns",
        ],
        AnomalyType.RESOURCE_EXHAUSTION: [
            "Scale up system resources (CPU, memory)",
            "Implement resource pooling and caching",
            "Review memory leaks and resource cleanup",
            "Consider load balancing and distribution",
        ],
        AnomalyType.RESPONSE_TIME_VARIANCE: [
            "Investigate intermittent performance issues",
            "Check for garbage collection or memory pressure",
            "Review caching effectiveness",
            "Monitor network latency and stability",
        ],
    }
    if anomaly_type in playbooks:
        return playbooks[anomaly_type]
    # Types without a dedicated checklist get the generic advice.
    return ["Investigate performance patterns and system metrics"]
845
+
846
def _deduplicate_anomalies(
    self, anomalies: List[PerformanceAnomaly]
) -> List[PerformanceAnomaly]:
    """Drop anomalies that repeat an already-kept one.

    Two anomalies count as the same when they share ``metric_name`` and
    ``anomaly_type`` and their ``detected_at`` values are less than 60
    apart. Of each such group, the entry that ranks highest is kept.

    Args:
        anomalies: Candidate anomalies in any order.

    Returns:
        The surviving anomalies, ordered from highest to lowest rank.
    """
    if not anomalies:
        return []

    def rank(item):
        # NOTE(review): ranking compares ``severity.value`` directly; confirm
        # the enum values sort from least to most severe.
        return (item.severity.value, item.confidence)

    ranked = sorted(anomalies, key=rank, reverse=True)

    kept = []
    for candidate in ranked:
        # Look for an earlier (higher-ranked) entry describing the same event.
        already_reported = any(
            prior.metric_name == candidate.metric_name
            and prior.anomaly_type == candidate.anomaly_type
            and abs(prior.detected_at - candidate.detected_at) < 60.0
            for prior in kept
        )
        if not already_reported:
            kept.append(candidate)

    return kept
876
+
877
def _generate_recommendations(
    self, anomalies: List[PerformanceAnomaly]
) -> List[str]:
    """Generate overall performance optimization recommendations.

    Combines pattern-level advice (based on which anomaly types and
    severities are present) with the first two recommended actions of the
    first three anomalies in ``anomalies``.

    Args:
        anomalies: Detected anomalies. The first three entries contribute
            their own actions — presumably the caller passes them in
            priority order; verify against the caller.

    Returns:
        De-duplicated recommendations in a stable order: pattern-level
        advice first, then per-anomaly actions, each in the order it was
        first produced. When ``anomalies`` is empty, a single "all normal"
        message.
    """
    if not anomalies:
        return ["System performance appears normal"]

    collected = []

    anomaly_types = [a.anomaly_type for a in anomalies]

    # High-level recommendations based on which anomaly types occurred.
    if AnomalyType.LATENCY_SPIKE in anomaly_types:
        collected.append("Implement performance monitoring and alerting")
        collected.append("Consider caching frequently accessed data")

    if AnomalyType.THROUGHPUT_DROP in anomaly_types:
        collected.append("Review system capacity and scaling policies")
        collected.append("Optimize database queries and connections")

    if any(a.severity == AnomalySeverity.CRITICAL for a in anomalies):
        collected.append(
            "Immediate investigation required - critical performance issue detected"
        )

    # Specific recommendations: two actions from each of the first three
    # anomalies.
    for anomaly in anomalies[:3]:
        collected.extend(anomaly.recommended_actions[:2])

    # dict keys de-duplicate while preserving insertion order, so the result
    # is the same on every run (a plain set would not guarantee any order).
    return list(dict.fromkeys(collected))
911
+
912
+ async def _get_baseline(self, **kwargs) -> Dict[str, Any]:
913
+ """Get baseline information for metrics."""
914
+ metric_name = kwargs.get("metric_name")
915
+
916
+ if metric_name:
917
+ baselines = (
918
+ {metric_name: self._serialize_baseline(self._baselines[metric_name])}
919
+ if metric_name in self._baselines
920
+ else {}
921
+ )
922
+ else:
923
+ baselines = {
924
+ name: self._serialize_baseline(baseline)
925
+ for name, baseline in self._baselines.items()
926
+ }
927
+
928
+ return {
929
+ "anomalies_detected": [],
930
+ "anomaly_count": 0,
931
+ "baselines": baselines,
932
+ "metrics_processed": 0,
933
+ "detection_summary": {"baselines_retrieved": len(baselines)},
934
+ "recommendations": [],
935
+ "monitoring_status": "monitoring" if self._monitoring_active else "idle",
936
+ "timestamp": datetime.now(UTC).isoformat(),
937
+ "status": "success",
938
+ }
939
+
940
+ async def _get_anomalies(self, **kwargs) -> Dict[str, Any]:
941
+ """Get detected anomalies."""
942
+ return {
943
+ "anomalies_detected": [
944
+ self._serialize_anomaly(a) for a in self._detected_anomalies
945
+ ],
946
+ "anomaly_count": len(self._detected_anomalies),
947
+ "baselines": {},
948
+ "metrics_processed": 0,
949
+ "detection_summary": {"anomalies_retrieved": len(self._detected_anomalies)},
950
+ "recommendations": self._generate_recommendations(self._detected_anomalies),
951
+ "monitoring_status": "monitoring" if self._monitoring_active else "idle",
952
+ "timestamp": datetime.now(UTC).isoformat(),
953
+ "status": "success",
954
+ }
955
+
956
+ async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
957
+ """Start continuous anomaly monitoring."""
958
+ interval = kwargs.get("monitoring_interval", 30.0)
959
+
960
+ if not self._monitoring_active:
961
+ self._monitoring_active = True
962
+ monitoring_task = asyncio.create_task(self._monitoring_loop(interval))
963
+ self._background_tasks.add(monitoring_task)
964
+ monitoring_task.add_done_callback(self._background_tasks.discard)
965
+
966
+ return {
967
+ "anomalies_detected": [],
968
+ "anomaly_count": 0,
969
+ "baselines": {},
970
+ "metrics_processed": 0,
971
+ "detection_summary": {"monitoring_started": True, "interval": interval},
972
+ "recommendations": [],
973
+ "monitoring_status": "monitoring",
974
+ "timestamp": datetime.now(UTC).isoformat(),
975
+ "status": "success",
976
+ }
977
+
978
+ async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
979
+ """Stop continuous anomaly monitoring."""
980
+ self._monitoring_active = False
981
+
982
+ # Cancel background tasks
983
+ for task in self._background_tasks:
984
+ if not task.done():
985
+ task.cancel()
986
+
987
+ # Wait for tasks to complete
988
+ if self._background_tasks:
989
+ await asyncio.gather(*self._background_tasks, return_exceptions=True)
990
+
991
+ self._background_tasks.clear()
992
+
993
+ return {
994
+ "anomalies_detected": [],
995
+ "anomaly_count": 0,
996
+ "baselines": {},
997
+ "metrics_processed": 0,
998
+ "detection_summary": {"monitoring_stopped": True},
999
+ "recommendations": [],
1000
+ "monitoring_status": "stopped",
1001
+ "timestamp": datetime.now(UTC).isoformat(),
1002
+ "status": "success",
1003
+ }
1004
+
1005
+ async def _monitoring_loop(self, interval: float):
1006
+ """Background monitoring loop for continuous anomaly detection."""
1007
+ while self._monitoring_active:
1008
+ try:
1009
+ await asyncio.sleep(interval)
1010
+
1011
+ # Run anomaly detection on all metrics
1012
+ metric_names = list(self._baselines.keys())
1013
+ if metric_names:
1014
+ result = await self._detect_anomalies(
1015
+ metric_names=metric_names,
1016
+ detection_window=interval * 2,
1017
+ )
1018
+
1019
+ if result["anomaly_count"] > 0:
1020
+ self.logger.warning(
1021
+ f"Monitoring detected {result['anomaly_count']} performance anomalies"
1022
+ )
1023
+
1024
+ except asyncio.CancelledError:
1025
+ break
1026
+ except Exception as e:
1027
+ self.logger.error(f"Monitoring loop error: {e}")
1028
+
1029
def _serialize_baseline(self, baseline: PerformanceBaseline) -> Dict[str, Any]:
    """Convert a baseline into a plain dictionary.

    Args:
        baseline: Baseline whose attributes are copied.

    Returns:
        A dict with one entry per exported attribute, keyed by attribute
        name. Values are copied as-is.
    """
    # Exported attributes, in the order they appear in the output.
    exported = (
        "metric_name",
        "created_at",
        "updated_at",
        "sample_count",
        "mean",
        "median",
        "std_dev",
        "min_value",
        "max_value",
        "percentiles",
        "trend_slope",
        "upper_threshold",
        "lower_threshold",
        "variance_threshold",
        "learning_rate",
    )
    return {name: getattr(baseline, name) for name in exported}
1048
+
1049
def _serialize_anomaly(self, anomaly: PerformanceAnomaly) -> Dict[str, Any]:
    """Convert an anomaly into a plain dictionary.

    Args:
        anomaly: Anomaly whose attributes are copied.

    Returns:
        A dict with one entry per exported attribute, keyed by attribute
        name. ``anomaly_type``, ``severity`` and ``detection_method`` are
        exported through their ``.value``; everything else is copied as-is.
    """
    # Attributes that are exported via ``.value`` rather than directly.
    value_backed = ("anomaly_type", "severity", "detection_method")
    # Exported attributes, in the order they appear in the output.
    exported = (
        "anomaly_id",
        "anomaly_type",
        "metric_name",
        "detected_at",
        "value",
        "expected_value",
        "deviation",
        "severity",
        "confidence",
        "detection_method",
        "description",
        "impact_assessment",
        "recommended_actions",
        "tags",
        "metadata",
    )

    payload = {}
    for name in exported:
        raw = getattr(anomaly, name)
        payload[name] = raw.value if name in value_backed else raw
    return payload
1068
+
1069
def run(self, **kwargs) -> Dict[str, Any]:
    """Synchronous wrapper around ``async_run`` for compatibility.

    ``asyncio.run`` cannot be used while an event loop is already running
    in the calling thread. When that is the case, the coroutine is executed
    on a short-lived worker thread with its own event loop and this call
    blocks until it finishes.

    Args:
        **kwargs: Forwarded unchanged to ``async_run``.

    Returns:
        Whatever ``async_run`` returns.

    Raises:
        Exception: Any exception raised by ``async_run`` is re-raised to
            the caller.
    """
    import asyncio
    import concurrent.futures

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the plain path is safe.
        return asyncio.run(self.async_run(**kwargs))

    # A loop is already running here, so drive the coroutine from a separate
    # thread. The coroutine is created inside the worker so it is never left
    # un-awaited in this thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(lambda: asyncio.run(self.async_run(**kwargs)))
        return future.result()
1074
+
1075
async def cleanup(self):
    """Stop monitoring, then run the parent class's cleanup if it has one."""
    await self._stop_monitoring()

    parent = super()
    # Not every base class defines ``cleanup``; only delegate when it does.
    if hasattr(parent, "cleanup"):
        await parent.cleanup()