kailash 0.6.6__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +35 -5
- kailash/adapters/__init__.py +5 -0
- kailash/adapters/mcp_platform_adapter.py +273 -0
- kailash/channels/__init__.py +21 -0
- kailash/channels/api_channel.py +409 -0
- kailash/channels/base.py +271 -0
- kailash/channels/cli_channel.py +661 -0
- kailash/channels/event_router.py +496 -0
- kailash/channels/mcp_channel.py +648 -0
- kailash/channels/session.py +423 -0
- kailash/mcp_server/discovery.py +1 -1
- kailash/middleware/mcp/enhanced_server.py +22 -16
- kailash/nexus/__init__.py +21 -0
- kailash/nexus/factory.py +413 -0
- kailash/nexus/gateway.py +545 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/iterative_llm_agent.py +988 -17
- kailash/nodes/ai/llm_agent.py +29 -9
- kailash/nodes/api/__init__.py +2 -2
- kailash/nodes/api/monitoring.py +1 -1
- kailash/nodes/base_async.py +54 -14
- kailash/nodes/code/async_python.py +1 -1
- kailash/nodes/data/bulk_operations.py +939 -0
- kailash/nodes/data/query_builder.py +373 -0
- kailash/nodes/data/query_cache.py +512 -0
- kailash/nodes/monitoring/__init__.py +10 -0
- kailash/nodes/monitoring/deadlock_detector.py +964 -0
- kailash/nodes/monitoring/performance_anomaly.py +1078 -0
- kailash/nodes/monitoring/race_condition_detector.py +1151 -0
- kailash/nodes/monitoring/transaction_metrics.py +790 -0
- kailash/nodes/monitoring/transaction_monitor.py +931 -0
- kailash/nodes/system/__init__.py +17 -0
- kailash/nodes/system/command_parser.py +820 -0
- kailash/nodes/transaction/__init__.py +48 -0
- kailash/nodes/transaction/distributed_transaction_manager.py +983 -0
- kailash/nodes/transaction/saga_coordinator.py +652 -0
- kailash/nodes/transaction/saga_state_storage.py +411 -0
- kailash/nodes/transaction/saga_step.py +467 -0
- kailash/nodes/transaction/transaction_context.py +756 -0
- kailash/nodes/transaction/two_phase_commit.py +978 -0
- kailash/nodes/transform/processors.py +17 -1
- kailash/nodes/validation/__init__.py +21 -0
- kailash/nodes/validation/test_executor.py +532 -0
- kailash/nodes/validation/validation_nodes.py +447 -0
- kailash/resources/factory.py +1 -1
- kailash/runtime/async_local.py +84 -21
- kailash/runtime/local.py +21 -2
- kailash/runtime/parameter_injector.py +187 -31
- kailash/security.py +16 -1
- kailash/servers/__init__.py +32 -0
- kailash/servers/durable_workflow_server.py +430 -0
- kailash/servers/enterprise_workflow_server.py +466 -0
- kailash/servers/gateway.py +183 -0
- kailash/servers/workflow_server.py +290 -0
- kailash/utils/data_validation.py +192 -0
- kailash/workflow/builder.py +291 -12
- kailash/workflow/validation.py +144 -8
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/METADATA +1 -1
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/RECORD +63 -25
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/WHEEL +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.6.dist-info → kailash-0.7.0.dist-info}/top_level.txt +0 -0
kailash/nodes/monitoring/performance_anomaly.py (new file)
@@ -0,0 +1,1078 @@
"""Performance anomaly detection node with baseline learning and statistical analysis.

This module provides comprehensive performance anomaly detection capabilities with
baseline learning, statistical analysis, and classification of performance issues.
"""

import asyncio
import logging
import statistics
import time
from collections import defaultdict, deque
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from kailash.nodes.base import NodeParameter, register_node
from kailash.nodes.base_async import AsyncNode
from kailash.sdk_exceptions import NodeExecutionError

logger = logging.getLogger(__name__)


class AnomalyType(Enum):
    """Types of performance anomalies."""

    LATENCY_SPIKE = "latency_spike"
    THROUGHPUT_DROP = "throughput_drop"
    ERROR_RATE_INCREASE = "error_rate_increase"
    RESOURCE_EXHAUSTION = "resource_exhaustion"
    RESPONSE_TIME_VARIANCE = "response_time_variance"
    CONCURRENCY_ANOMALY = "concurrency_anomaly"
    TREND_ANOMALY = "trend_anomaly"


class AnomalySeverity(Enum):
    """Severity levels for anomalies."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class DetectionMethod(Enum):
    """Anomaly detection methods."""

    STATISTICAL = "statistical"
    THRESHOLD_BASED = "threshold_based"
    ROLLING_AVERAGE = "rolling_average"
    ZSCORE = "zscore"
    IQR = "iqr"  # Interquartile Range
    EXPONENTIAL_SMOOTHING = "exponential_smoothing"
    MACHINE_LEARNING = "machine_learning"


@dataclass
class PerformanceMetric:
    """Represents a performance metric data point."""

    metric_name: str
    value: float
    timestamp: float
    tags: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class PerformanceBaseline:
    """Performance baseline for anomaly detection."""

    metric_name: str
    created_at: float
    updated_at: float
    sample_count: int

    # Statistical measures
    mean: float
    median: float
    std_dev: float
    min_value: float
    max_value: float
    percentiles: Dict[str, float] = field(default_factory=dict)

    # Trend analysis
    trend_slope: float = 0.0
    seasonal_pattern: List[float] = field(default_factory=list)

    # Detection thresholds
    upper_threshold: float = 0.0
    lower_threshold: float = 0.0
    variance_threshold: float = 0.0

    # Learning parameters
    learning_rate: float = 0.1
    decay_factor: float = 0.95


@dataclass
class PerformanceAnomaly:
    """Represents a detected performance anomaly."""

    anomaly_id: str
    anomaly_type: AnomalyType
    metric_name: str
    detected_at: float
    value: float
    expected_value: float
    deviation: float
    severity: AnomalySeverity
    confidence: float  # 0.0 to 1.0
    detection_method: DetectionMethod
    description: str
    impact_assessment: str
    recommended_actions: List[str] = field(default_factory=list)
    tags: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)


@register_node()
class PerformanceAnomalyNode(AsyncNode):
    """Node for detecting performance anomalies using baseline learning.

    This node provides comprehensive performance anomaly detection including:
    - Baseline performance learning with adaptive algorithms
    - Statistical anomaly detection (Z-score, IQR, exponential smoothing)
    - Threshold-based anomaly detection with dynamic thresholds
    - Trend analysis and seasonal pattern detection
    - Anomaly classification with severity assessment
    - Real-time monitoring with configurable sensitivity
    - Integration with alerting systems

    Design Purpose:
    - Detect performance degradation before it impacts users
    - Learn normal performance patterns automatically
    - Provide actionable insights for performance optimization
    - Support proactive performance monitoring

    Examples:
        >>> # Initialize baseline learning
        >>> anomaly_detector = PerformanceAnomalyNode()
        >>> result = await anomaly_detector.execute(
        ...     operation="initialize_baseline",
        ...     metric_name="api_response_time",
        ...     detection_methods=["statistical", "threshold_based"],
        ...     sensitivity=0.7
        ... )

        >>> # Feed performance metrics
        >>> result = await anomaly_detector.execute(
        ...     operation="add_metric",
        ...     metric_name="api_response_time",
        ...     value=250.5,
        ...     tags={"endpoint": "/api/users", "method": "GET"}
        ... )

        >>> # Detect anomalies
        >>> result = await anomaly_detector.execute(
        ...     operation="detect_anomalies",
        ...     metric_names=["api_response_time"],
        ...     detection_window=300.0  # 5 minutes
        ... )
    """

    def __init__(self, **kwargs):
        """Initialize the performance anomaly detector node."""
        super().__init__(**kwargs)
        self._baselines: Dict[str, PerformanceBaseline] = {}
        self._metrics_buffer: Dict[str, deque] = defaultdict(
            lambda: deque(maxlen=10000)
        )
        self._detected_anomalies: List[PerformanceAnomaly] = []
        self._monitoring_active = False
        self._background_tasks: set = set()
        self._detection_config = {
            "sensitivity": 0.8,
            "min_samples": 30,
            "learning_rate": 0.1,
            "zscore_threshold": 2.5,
            "iqr_multiplier": 1.5,
        }
        self.logger.info(f"Initialized PerformanceAnomalyNode: {self.id}")

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define the parameters this node accepts."""
        return {
            "operation": NodeParameter(
                name="operation",
                type=str,
                required=True,
                description="Operation (initialize_baseline, add_metric, detect_anomalies, get_baseline, get_anomalies, start_monitoring, stop_monitoring)",
            ),
            "metric_name": NodeParameter(
                name="metric_name",
                type=str,
                required=False,
                description="Name of the performance metric",
            ),
            "metric_names": NodeParameter(
                name="metric_names",
                type=list,
                required=False,
                default=[],
                description="List of metric names to process",
            ),
            "value": NodeParameter(
                name="value",
                type=float,
                required=False,
                description="Metric value to add",
            ),
            "timestamp": NodeParameter(
                name="timestamp",
                type=float,
                required=False,
                description="Timestamp for the metric (defaults to current time)",
            ),
            "tags": NodeParameter(
                name="tags",
                type=dict,
                required=False,
                default={},
                description="Tags for metric categorization",
            ),
            "detection_methods": NodeParameter(
                name="detection_methods",
                type=list,
                required=False,
                default=["statistical", "threshold_based"],
                description="Detection methods to use (statistical, threshold_based, rolling_average, zscore, iqr)",
            ),
            "sensitivity": NodeParameter(
                name="sensitivity",
                type=float,
                required=False,
                default=0.8,
                description="Detection sensitivity (0.0 to 1.0, higher = more sensitive)",
            ),
            "detection_window": NodeParameter(
                name="detection_window",
                type=float,
                required=False,
                default=300.0,
                description="Time window for anomaly detection in seconds",
            ),
            "min_samples": NodeParameter(
                name="min_samples",
                type=int,
                required=False,
                default=30,
                description="Minimum samples required for baseline learning",
            ),
            "learning_rate": NodeParameter(
                name="learning_rate",
                type=float,
                required=False,
                default=0.1,
                description="Learning rate for adaptive baseline updates",
            ),
            "zscore_threshold": NodeParameter(
                name="zscore_threshold",
                type=float,
                required=False,
                default=2.5,
                description="Z-score threshold for anomaly detection",
            ),
            "enable_monitoring": NodeParameter(
                name="enable_monitoring",
                type=bool,
                required=False,
                default=False,
                description="Enable continuous anomaly monitoring",
            ),
            "monitoring_interval": NodeParameter(
                name="monitoring_interval",
                type=float,
                required=False,
                default=30.0,
                description="Monitoring interval in seconds",
            ),
            "metadata": NodeParameter(
                name="metadata",
                type=dict,
                required=False,
                default={},
                description="Additional metadata for the operation",
            ),
        }

    def get_output_schema(self) -> Dict[str, NodeParameter]:
        """Define the output schema for this node."""
        return {
            "anomalies_detected": NodeParameter(
                name="anomalies_detected",
                type=list,
                description="List of detected anomalies",
            ),
            "anomaly_count": NodeParameter(
                name="anomaly_count",
                type=int,
                description="Number of anomalies detected",
            ),
            "baselines": NodeParameter(
                name="baselines", type=dict, description="Current performance baselines"
            ),
            "metrics_processed": NodeParameter(
                name="metrics_processed",
                type=int,
                description="Number of metrics processed",
            ),
            "detection_summary": NodeParameter(
                name="detection_summary",
                type=dict,
                description="Summary of detection results",
            ),
            "recommendations": NodeParameter(
                name="recommendations",
                type=list,
                description="Performance optimization recommendations",
            ),
            "monitoring_status": NodeParameter(
                name="monitoring_status",
                type=str,
                description="Current monitoring status",
            ),
            "timestamp": NodeParameter(
                name="timestamp", type=str, description="ISO timestamp of operation"
            ),
            "status": NodeParameter(
                name="status", type=str, description="Operation status"
            ),
        }

    async def async_run(self, **kwargs) -> Dict[str, Any]:
        """Execute performance anomaly detection operation."""
        operation = kwargs.get("operation")

        try:
            if operation == "initialize_baseline":
                return await self._initialize_baseline(**kwargs)
            elif operation == "add_metric":
                return await self._add_metric(**kwargs)
            elif operation == "detect_anomalies":
                return await self._detect_anomalies(**kwargs)
            elif operation == "get_baseline":
                return await self._get_baseline(**kwargs)
            elif operation == "get_anomalies":
                return await self._get_anomalies(**kwargs)
            elif operation == "start_monitoring":
                return await self._start_monitoring(**kwargs)
            elif operation == "stop_monitoring":
                return await self._stop_monitoring(**kwargs)
            else:
                raise ValueError(f"Unknown operation: {operation}")

        except Exception as e:
            self.logger.error(
                f"Performance anomaly detection operation failed: {str(e)}"
            )
            raise NodeExecutionError(f"Failed to execute anomaly detection: {str(e)}")

    async def _initialize_baseline(self, **kwargs) -> Dict[str, Any]:
        """Initialize baseline learning for a metric."""
        metric_name = kwargs.get("metric_name")
        if not metric_name:
            raise ValueError("metric_name is required for initialize_baseline")

        detection_methods = kwargs.get(
            "detection_methods", ["statistical", "threshold_based"]
        )
        sensitivity = kwargs.get("sensitivity", 0.8)
        min_samples = kwargs.get("min_samples", 30)
        learning_rate = kwargs.get("learning_rate", 0.1)

        # Update detection configuration
        self._detection_config.update(
            {
                "sensitivity": sensitivity,
                "min_samples": min_samples,
                "learning_rate": learning_rate,
            }
        )

        # Initialize baseline if it doesn't exist
        if metric_name not in self._baselines:
            current_time = time.time()
            baseline = PerformanceBaseline(
                metric_name=metric_name,
                created_at=current_time,
                updated_at=current_time,
                sample_count=0,
                mean=0.0,
                median=0.0,
                std_dev=0.0,
                min_value=float("inf"),
                max_value=float("-inf"),
                learning_rate=learning_rate,
            )
            self._baselines[metric_name] = baseline

        self.logger.info(f"Initialized baseline for metric: {metric_name}")

        return {
            "anomalies_detected": [],
            "anomaly_count": 0,
            "baselines": {
                metric_name: self._serialize_baseline(self._baselines[metric_name])
            },
            "metrics_processed": 0,
            "detection_summary": {"initialized": True, "methods": detection_methods},
            "recommendations": [],
            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _add_metric(self, **kwargs) -> Dict[str, Any]:
        """Add a performance metric and update baseline."""
        metric_name = kwargs.get("metric_name")
        value = kwargs.get("value")
        timestamp = kwargs.get("timestamp", time.time())
        tags = kwargs.get("tags", {})
        metadata = kwargs.get("metadata", {})

        if not metric_name or value is None:
            raise ValueError("metric_name and value are required for add_metric")

        # Create metric object
        metric = PerformanceMetric(
            metric_name=metric_name,
            value=float(value),
            timestamp=timestamp,
            tags=tags,
            metadata=metadata,
        )

        # Add to buffer
        self._metrics_buffer[metric_name].append(metric)

        # Update baseline if it exists
        if metric_name in self._baselines:
            await self._update_baseline(metric_name, metric)

        # Check for immediate anomalies
        anomalies = []
        if metric_name in self._baselines:
            anomalies = await self._check_metric_anomalies(metric)

        self.logger.debug(f"Added metric {metric_name}={value} at {timestamp}")

        return {
            "anomalies_detected": [self._serialize_anomaly(a) for a in anomalies],
            "anomaly_count": len(anomalies),
            "baselines": {},
            "metrics_processed": 1,
            "detection_summary": {
                "immediate_check": True,
                "anomalies_found": len(anomalies),
            },
            "recommendations": [],
            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _detect_anomalies(self, **kwargs) -> Dict[str, Any]:
        """Detect anomalies in performance metrics."""
        metric_names = kwargs.get("metric_names", [])
        detection_window = kwargs.get("detection_window", 300.0)
        detection_methods = kwargs.get(
            "detection_methods", ["statistical", "threshold_based"]
        )

        if not metric_names:
            metric_names = list(self._baselines.keys())

        current_time = time.time()
        window_start = current_time - detection_window

        all_anomalies = []
        detection_summary = {}

        for metric_name in metric_names:
            if metric_name not in self._baselines:
                continue

            # Get metrics within detection window
            recent_metrics = [
                m
                for m in self._metrics_buffer[metric_name]
                if m.timestamp >= window_start
            ]

            if not recent_metrics:
                continue

            # Apply different detection methods
            metric_anomalies = []
            for method in detection_methods:
                method_anomalies = await self._apply_detection_method(
                    metric_name, recent_metrics, DetectionMethod(method)
                )
                metric_anomalies.extend(method_anomalies)

            # Remove duplicates and merge similar anomalies
            unique_anomalies = self._deduplicate_anomalies(metric_anomalies)
            all_anomalies.extend(unique_anomalies)

            detection_summary[metric_name] = {
                "metrics_analyzed": len(recent_metrics),
                "anomalies_found": len(unique_anomalies),
                "methods_used": detection_methods,
            }

        # Store detected anomalies
        self._detected_anomalies.extend(all_anomalies)

        # Generate recommendations
        recommendations = self._generate_recommendations(all_anomalies)

        self.logger.info(
            f"Detected {len(all_anomalies)} anomalies across {len(metric_names)} metrics"
        )

        return {
            "anomalies_detected": [self._serialize_anomaly(a) for a in all_anomalies],
            "anomaly_count": len(all_anomalies),
            "baselines": {
                name: self._serialize_baseline(baseline)
                for name, baseline in self._baselines.items()
            },
            "metrics_processed": sum(
                s.get("metrics_analyzed", 0) for s in detection_summary.values()
            ),
            "detection_summary": detection_summary,
            "recommendations": recommendations,
            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _update_baseline(self, metric_name: str, metric: PerformanceMetric):
        """Update baseline with new metric using adaptive learning."""
        baseline = self._baselines[metric_name]
        value = metric.value

        # Update sample count
        baseline.sample_count += 1

        # Update basic statistics using online algorithms
        if baseline.sample_count == 1:
            baseline.mean = value
            baseline.median = value
            baseline.min_value = value
            baseline.max_value = value
            baseline.std_dev = 0.0
        else:
            # Update mean using exponential moving average
            baseline.mean = (
                1 - baseline.learning_rate
            ) * baseline.mean + baseline.learning_rate * value

            # Update min/max
            baseline.min_value = min(baseline.min_value, value)
            baseline.max_value = max(baseline.max_value, value)

            # Update standard deviation using Welford's online algorithm
            if baseline.sample_count >= self._detection_config["min_samples"]:
                recent_metrics = list(self._metrics_buffer[metric_name])[
                    -self._detection_config["min_samples"] :
                ]
                values = [m.value for m in recent_metrics]
                baseline.std_dev = float(np.std(values))
                baseline.median = float(np.median(values))

                # Calculate percentiles
                baseline.percentiles = {
                    "p50": float(np.percentile(values, 50)),
                    "p90": float(np.percentile(values, 90)),
                    "p95": float(np.percentile(values, 95)),
                    "p99": float(np.percentile(values, 99)),
                }

                # Update thresholds based on sensitivity
                sensitivity = self._detection_config["sensitivity"]
                baseline.upper_threshold = baseline.mean + (
                    sensitivity * 2 * baseline.std_dev
                )
                baseline.lower_threshold = baseline.mean - (
                    sensitivity * 2 * baseline.std_dev
                )
                baseline.variance_threshold = baseline.std_dev * sensitivity

        baseline.updated_at = time.time()

    async def _check_metric_anomalies(
        self, metric: PerformanceMetric
    ) -> List[PerformanceAnomaly]:
        """Check a single metric for anomalies."""
        anomalies = []
        baseline = self._baselines.get(metric.metric_name)

        if (
            not baseline
            or baseline.sample_count < self._detection_config["min_samples"]
        ):
            return anomalies

        # Threshold-based detection
        if metric.value > baseline.upper_threshold:
            anomaly = self._create_anomaly(
                metric,
                baseline,
                AnomalyType.LATENCY_SPIKE,
                DetectionMethod.THRESHOLD_BASED,
                f"Value {metric.value:.2f} exceeds upper threshold {baseline.upper_threshold:.2f}",
            )
            anomalies.append(anomaly)
        elif metric.value < baseline.lower_threshold:
            anomaly = self._create_anomaly(
                metric,
                baseline,
                AnomalyType.THROUGHPUT_DROP,
                DetectionMethod.THRESHOLD_BASED,
                f"Value {metric.value:.2f} below lower threshold {baseline.lower_threshold:.2f}",
            )
            anomalies.append(anomaly)

        # Z-score based detection
        if baseline.std_dev > 0:
            zscore = abs(metric.value - baseline.mean) / baseline.std_dev
            if zscore > self._detection_config["zscore_threshold"]:
                anomaly = self._create_anomaly(
                    metric,
                    baseline,
                    AnomalyType.RESPONSE_TIME_VARIANCE,
                    DetectionMethod.ZSCORE,
                    f"Z-score {zscore:.2f} exceeds threshold {self._detection_config['zscore_threshold']}",
                )
                anomalies.append(anomaly)

        return anomalies

    async def _apply_detection_method(
        self,
        metric_name: str,
        metrics: List[PerformanceMetric],
        method: DetectionMethod,
    ) -> List[PerformanceAnomaly]:
        """Apply a specific detection method to metrics."""
        anomalies = []
        baseline = self._baselines.get(metric_name)

        if not baseline or not metrics:
            return anomalies

        values = [m.value for m in metrics]

        if method == DetectionMethod.STATISTICAL:
            # Statistical analysis using Z-score and IQR
            if len(values) >= 10:
                mean_val = np.mean(values)
                std_val = np.std(values)

                for metric in metrics:
                    if std_val > 0:
                        zscore = abs(metric.value - mean_val) / std_val
                        if zscore > self._detection_config["zscore_threshold"]:
                            anomaly = self._create_anomaly(
                                metric,
                                baseline,
                                AnomalyType.RESPONSE_TIME_VARIANCE,
                                method,
                                f"Statistical outlier with Z-score {zscore:.2f}",
                            )
                            anomalies.append(anomaly)

        elif method == DetectionMethod.IQR:
            # Interquartile Range method
            if len(values) >= 10:
                q1 = np.percentile(values, 25)
                q3 = np.percentile(values, 75)
                iqr = q3 - q1
                multiplier = self._detection_config.get("iqr_multiplier", 1.5)

                lower_bound = q1 - multiplier * iqr
                upper_bound = q3 + multiplier * iqr

                for metric in metrics:
                    if metric.value < lower_bound or metric.value > upper_bound:
                        anomaly = self._create_anomaly(
                            metric,
                            baseline,
                            AnomalyType.RESPONSE_TIME_VARIANCE,
                            method,
                            f"IQR outlier: value {metric.value:.2f} outside [{lower_bound:.2f}, {upper_bound:.2f}]",
                        )
                        anomalies.append(anomaly)

        elif method == DetectionMethod.ROLLING_AVERAGE:
            # Rolling average deviation
            if len(values) >= 10:
                window_size = min(10, len(values) // 2)
                for i in range(window_size, len(metrics)):
                    window_values = values[i - window_size : i]
                    rolling_avg = np.mean(window_values)
                    rolling_std = np.std(window_values)

                    current_metric = metrics[i]
                    if rolling_std > 0:
                        deviation = (
                            abs(current_metric.value - rolling_avg) / rolling_std
                        )
                        if deviation > 2.0:  # 2 standard deviations
                            anomaly = self._create_anomaly(
                                current_metric,
                                baseline,
                                AnomalyType.TREND_ANOMALY,
                                method,
                                f"Rolling average deviation: {deviation:.2f}",
                            )
                            anomalies.append(anomaly)

        return anomalies

    def _create_anomaly(
        self,
        metric: PerformanceMetric,
        baseline: PerformanceBaseline,
        anomaly_type: AnomalyType,
        method: DetectionMethod,
        description: str,
    ) -> PerformanceAnomaly:
        """Create an anomaly detection object."""
        expected_value = baseline.mean
        deviation = abs(metric.value - expected_value)

        # Calculate confidence based on deviation magnitude
        if baseline.std_dev > 0:
            confidence = min(1.0, deviation / (2 * baseline.std_dev))
        else:
            confidence = 1.0 if deviation > 0 else 0.0

        # Determine severity
        severity = self._determine_severity(deviation, baseline)

        # Generate recommendations
        recommendations = self._get_anomaly_recommendations(anomaly_type, metric)

        return PerformanceAnomaly(
            anomaly_id=f"anomaly_{int(time.time() * 1000000)}",
            anomaly_type=anomaly_type,
            metric_name=metric.metric_name,
            detected_at=time.time(),
            value=metric.value,
            expected_value=expected_value,
            deviation=deviation,
            severity=severity,
            confidence=confidence,
            detection_method=method,
            description=description,
            impact_assessment=self._assess_impact(anomaly_type, deviation, baseline),
            recommended_actions=recommendations,
            tags=metric.tags,
            metadata=metric.metadata,
        )

    def _determine_severity(
        self, deviation: float, baseline: PerformanceBaseline
    ) -> AnomalySeverity:
        """Determine severity based on deviation magnitude."""
        if baseline.std_dev <= 0:
            return AnomalySeverity.MEDIUM

        zscore_equivalent = deviation / baseline.std_dev

        if zscore_equivalent > 4.0:
            return AnomalySeverity.CRITICAL
        elif zscore_equivalent > 3.0:
            return AnomalySeverity.HIGH
        elif zscore_equivalent > 2.0:
            return AnomalySeverity.MEDIUM
        else:
            return AnomalySeverity.LOW

    def _assess_impact(
        self, anomaly_type: AnomalyType, deviation: float, baseline: PerformanceBaseline
    ) -> str:
        """Assess the potential impact of an anomaly."""
        impact_map = {
            AnomalyType.LATENCY_SPIKE: f"Increased response time may impact user experience. Current deviation: {deviation:.2f}ms above baseline.",
            AnomalyType.THROUGHPUT_DROP: f"Reduced system throughput may indicate capacity issues. Current drop: {deviation:.2f} below expected.",
            AnomalyType.ERROR_RATE_INCREASE: f"Higher error rate indicates system instability. Error rate increased by {deviation:.2f}%.",
            AnomalyType.RESOURCE_EXHAUSTION: f"Resource usage spike may lead to system degradation. Usage increased by {deviation:.2f} units.",
            AnomalyType.RESPONSE_TIME_VARIANCE: f"Inconsistent response times indicate system instability. Variance deviation: {deviation:.2f}.",
            AnomalyType.CONCURRENCY_ANOMALY: f"Unusual concurrency patterns may indicate load issues. Concurrency deviation: {deviation:.2f}.",
            AnomalyType.TREND_ANOMALY: f"Performance trend anomaly detected. Pattern deviation: {deviation:.2f}.",
        }
        return impact_map.get(
            anomaly_type,
            f"Performance anomaly detected with deviation: {deviation:.2f}",
        )

    def _get_anomaly_recommendations(
        self, anomaly_type: AnomalyType, metric: PerformanceMetric
    ) -> List[str]:
        """Get recommendations for handling specific anomaly types."""
        recommendation_map = {
            AnomalyType.LATENCY_SPIKE: [
                "Check for database query optimization opportunities",
                "Review recent code deployments for performance regressions",
                "Monitor system resource utilization (CPU, memory, I/O)",
                "Consider horizontal scaling if load is high",
            ],
            AnomalyType.THROUGHPUT_DROP: [
                "Investigate potential bottlenecks in request processing",
                "Check for resource contention or lock contention",
                "Review connection pool configurations",
                "Monitor downstream service dependencies",
            ],
            AnomalyType.ERROR_RATE_INCREASE: [
                "Review application logs for error patterns",
                "Check external service dependencies",
                "Validate input data quality and format",
                "Consider implementing circuit breaker patterns",
            ],
            AnomalyType.RESOURCE_EXHAUSTION: [
                "Scale up system resources (CPU, memory)",
                "Implement resource pooling and caching",
                "Review memory leaks and resource cleanup",
                "Consider load balancing and distribution",
            ],
            AnomalyType.RESPONSE_TIME_VARIANCE: [
                "Investigate intermittent performance issues",
                "Check for garbage collection or memory pressure",
                "Review caching effectiveness",
                "Monitor network latency and stability",
            ],
        }
        return recommendation_map.get(
            anomaly_type, ["Investigate performance patterns and system metrics"]
        )

    def _deduplicate_anomalies(
        self, anomalies: List[PerformanceAnomaly]
    ) -> List[PerformanceAnomaly]:
        """Remove duplicate and similar anomalies."""
        if not anomalies:
            return []

        # Sort by confidence and severity for prioritization
        sorted_anomalies = sorted(
            anomalies, key=lambda a: (a.severity.value, a.confidence), reverse=True
        )

        unique_anomalies = []
        for anomaly in sorted_anomalies:
            # Check if similar anomaly already exists
            is_duplicate = False
            for existing in unique_anomalies:
                if (
                    existing.metric_name == anomaly.metric_name
                    and existing.anomaly_type == anomaly.anomaly_type
                    and abs(existing.detected_at - anomaly.detected_at)
                    < 60.0  # Within 1 minute
                ):
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_anomalies.append(anomaly)

        return unique_anomalies

    def _generate_recommendations(
        self, anomalies: List[PerformanceAnomaly]
    ) -> List[str]:
        """Generate overall performance optimization recommendations."""
        if not anomalies:
            return ["System performance appears normal"]

        recommendations = set()

        # Analyze anomaly patterns
        anomaly_types = [a.anomaly_type for a in anomalies]
        severity_levels = [a.severity for a in anomalies]

        # High-level recommendations based on patterns
        if AnomalyType.LATENCY_SPIKE in anomaly_types:
            recommendations.add("Implement performance monitoring and alerting")
            recommendations.add("Consider caching frequently accessed data")

        if AnomalyType.THROUGHPUT_DROP in anomaly_types:
            recommendations.add("Review system capacity and scaling policies")
            recommendations.add("Optimize database queries and connections")

        if any(s == AnomalySeverity.CRITICAL for s in severity_levels):
            recommendations.add(
                "Immediate investigation required - critical performance issue detected"
            )

        # Add specific recommendations from individual anomalies
        for anomaly in anomalies[:3]:  # Top 3 anomalies
            recommendations.update(
                anomaly.recommended_actions[:2]
            )  # Top 2 actions each

        return list(recommendations)

    async def _get_baseline(self, **kwargs) -> Dict[str, Any]:
        """Get baseline information for metrics."""
        metric_name = kwargs.get("metric_name")

        if metric_name:
            baselines = (
                {metric_name: self._serialize_baseline(self._baselines[metric_name])}
                if metric_name in self._baselines
                else {}
            )
        else:
            baselines = {
                name: self._serialize_baseline(baseline)
                for name, baseline in self._baselines.items()
            }

        return {
            "anomalies_detected": [],
            "anomaly_count": 0,
            "baselines": baselines,
            "metrics_processed": 0,
            "detection_summary": {"baselines_retrieved": len(baselines)},
            "recommendations": [],
            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _get_anomalies(self, **kwargs) -> Dict[str, Any]:
        """Get detected anomalies."""
        return {
            "anomalies_detected": [
                self._serialize_anomaly(a) for a in self._detected_anomalies
            ],
            "anomaly_count": len(self._detected_anomalies),
            "baselines": {},
            "metrics_processed": 0,
            "detection_summary": {"anomalies_retrieved": len(self._detected_anomalies)},
            "recommendations": self._generate_recommendations(self._detected_anomalies),
            "monitoring_status": "monitoring" if self._monitoring_active else "idle",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _start_monitoring(self, **kwargs) -> Dict[str, Any]:
        """Start continuous anomaly monitoring."""
        interval = kwargs.get("monitoring_interval", 30.0)

        if not self._monitoring_active:
            self._monitoring_active = True
            monitoring_task = asyncio.create_task(self._monitoring_loop(interval))
            self._background_tasks.add(monitoring_task)
            monitoring_task.add_done_callback(self._background_tasks.discard)

        return {
            "anomalies_detected": [],
            "anomaly_count": 0,
            "baselines": {},
            "metrics_processed": 0,
            "detection_summary": {"monitoring_started": True, "interval": interval},
            "recommendations": [],
            "monitoring_status": "monitoring",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _stop_monitoring(self, **kwargs) -> Dict[str, Any]:
        """Stop continuous anomaly monitoring."""
        self._monitoring_active = False

        # Cancel background tasks
        for task in self._background_tasks:
            if not task.done():
                task.cancel()

        # Wait for tasks to complete
        if self._background_tasks:
            await asyncio.gather(*self._background_tasks, return_exceptions=True)

        self._background_tasks.clear()

        return {
            "anomalies_detected": [],
            "anomaly_count": 0,
            "baselines": {},
            "metrics_processed": 0,
            "detection_summary": {"monitoring_stopped": True},
            "recommendations": [],
            "monitoring_status": "stopped",
            "timestamp": datetime.now(UTC).isoformat(),
            "status": "success",
        }

    async def _monitoring_loop(self, interval: float):
        """Background monitoring loop for continuous anomaly detection."""
        while self._monitoring_active:
            try:
                await asyncio.sleep(interval)

                # Run anomaly detection on all metrics
                metric_names = list(self._baselines.keys())
                if metric_names:
                    result = await self._detect_anomalies(
                        metric_names=metric_names,
                        detection_window=interval * 2,
                    )

                    if result["anomaly_count"] > 0:
                        self.logger.warning(
                            f"Monitoring detected {result['anomaly_count']} performance anomalies"
                        )

            except asyncio.CancelledError:
                break
            except Exception as e:
                self.logger.error(f"Monitoring loop error: {e}")

    def _serialize_baseline(self, baseline: PerformanceBaseline) -> Dict[str, Any]:
        """Serialize a baseline to dictionary."""
        return {
            "metric_name": baseline.metric_name,
            "created_at": baseline.created_at,
            "updated_at": baseline.updated_at,
            "sample_count": baseline.sample_count,
            "mean": baseline.mean,
            "median": baseline.median,
            "std_dev": baseline.std_dev,
            "min_value": baseline.min_value,
            "max_value": baseline.max_value,
            "percentiles": baseline.percentiles,
            "trend_slope": baseline.trend_slope,
            "upper_threshold": baseline.upper_threshold,
            "lower_threshold": baseline.lower_threshold,
            "variance_threshold": baseline.variance_threshold,
            "learning_rate": baseline.learning_rate,
        }

    def _serialize_anomaly(self, anomaly: PerformanceAnomaly) -> Dict[str, Any]:
        """Serialize an anomaly to dictionary."""
        return {
            "anomaly_id": anomaly.anomaly_id,
            "anomaly_type": anomaly.anomaly_type.value,
            "metric_name": anomaly.metric_name,
            "detected_at": anomaly.detected_at,
            "value": anomaly.value,
            "expected_value": anomaly.expected_value,
            "deviation": anomaly.deviation,
            "severity": anomaly.severity.value,
            "confidence": anomaly.confidence,
            "detection_method": anomaly.detection_method.value,
            "description": anomaly.description,
            "impact_assessment": anomaly.impact_assessment,
            "recommended_actions": anomaly.recommended_actions,
            "tags": anomaly.tags,
            "metadata": anomaly.metadata,
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Synchronous wrapper for compatibility."""
        import asyncio

        return asyncio.run(self.async_run(**kwargs))

    async def cleanup(self):
        """Cleanup resources when node is destroyed."""
        await self._stop_monitoring()
        await super().cleanup() if hasattr(super(), "cleanup") else None
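For orientation, a minimal, hypothetical sketch of exercising the PerformanceAnomalyNode added in 0.7.0, assembled from the usage shown in its own docstring. The import path follows the file location in this diff; the constructor-with-defaults and event-loop wiring are assumptions, not taken from the package.

import asyncio

from kailash.nodes.monitoring.performance_anomaly import PerformanceAnomalyNode


async def main():
    # Assumed usage, mirroring the class docstring:
    # learn a baseline, feed metrics, then scan a 5-minute window.
    detector = PerformanceAnomalyNode()  # assumes default construction is sufficient
    await detector.execute(
        operation="initialize_baseline",
        metric_name="api_response_time",
        detection_methods=["statistical", "threshold_based"],
        sensitivity=0.7,
    )
    for value in (240.0, 252.5, 248.1, 900.0):  # last value is an artificial spike
        await detector.execute(
            operation="add_metric",
            metric_name="api_response_time",
            value=value,
        )
    result = await detector.execute(
        operation="detect_anomalies",
        metric_names=["api_response_time"],
        detection_window=300.0,
    )
    print(result["anomaly_count"], result["recommendations"])


asyncio.run(main())

Note that detection only begins once min_samples (default 30) values have been recorded for the metric, so a short feed like the one above is illustrative rather than expected to flag the spike.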