kailash 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -7
- kailash/cli/__init__.py +11 -1
- kailash/cli/validation_audit.py +570 -0
- kailash/core/actors/supervisor.py +1 -1
- kailash/core/resilience/circuit_breaker.py +71 -1
- kailash/core/resilience/health_monitor.py +172 -0
- kailash/edge/compliance.py +33 -0
- kailash/edge/consistency.py +609 -0
- kailash/edge/coordination/__init__.py +30 -0
- kailash/edge/coordination/global_ordering.py +355 -0
- kailash/edge/coordination/leader_election.py +217 -0
- kailash/edge/coordination/partition_detector.py +296 -0
- kailash/edge/coordination/raft.py +485 -0
- kailash/edge/discovery.py +63 -1
- kailash/edge/migration/__init__.py +19 -0
- kailash/edge/migration/edge_migrator.py +832 -0
- kailash/edge/monitoring/__init__.py +21 -0
- kailash/edge/monitoring/edge_monitor.py +736 -0
- kailash/edge/prediction/__init__.py +10 -0
- kailash/edge/prediction/predictive_warmer.py +591 -0
- kailash/edge/resource/__init__.py +102 -0
- kailash/edge/resource/cloud_integration.py +796 -0
- kailash/edge/resource/cost_optimizer.py +949 -0
- kailash/edge/resource/docker_integration.py +919 -0
- kailash/edge/resource/kubernetes_integration.py +893 -0
- kailash/edge/resource/platform_integration.py +913 -0
- kailash/edge/resource/predictive_scaler.py +959 -0
- kailash/edge/resource/resource_analyzer.py +824 -0
- kailash/edge/resource/resource_pools.py +610 -0
- kailash/integrations/dataflow_edge.py +261 -0
- kailash/mcp_server/registry_integration.py +1 -1
- kailash/monitoring/__init__.py +18 -0
- kailash/monitoring/alerts.py +646 -0
- kailash/monitoring/metrics.py +677 -0
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/ai/__init__.py +17 -0
- kailash/nodes/ai/a2a.py +1914 -43
- kailash/nodes/ai/a2a_backup.py +1807 -0
- kailash/nodes/ai/hybrid_search.py +972 -0
- kailash/nodes/ai/semantic_memory.py +558 -0
- kailash/nodes/ai/streaming_analytics.py +947 -0
- kailash/nodes/base.py +545 -0
- kailash/nodes/edge/__init__.py +36 -0
- kailash/nodes/edge/base.py +240 -0
- kailash/nodes/edge/cloud_node.py +710 -0
- kailash/nodes/edge/coordination.py +239 -0
- kailash/nodes/edge/docker_node.py +825 -0
- kailash/nodes/edge/edge_data.py +582 -0
- kailash/nodes/edge/edge_migration_node.py +392 -0
- kailash/nodes/edge/edge_monitoring_node.py +421 -0
- kailash/nodes/edge/edge_state.py +673 -0
- kailash/nodes/edge/edge_warming_node.py +393 -0
- kailash/nodes/edge/kubernetes_node.py +652 -0
- kailash/nodes/edge/platform_node.py +766 -0
- kailash/nodes/edge/resource_analyzer_node.py +378 -0
- kailash/nodes/edge/resource_optimizer_node.py +501 -0
- kailash/nodes/edge/resource_scaler_node.py +397 -0
- kailash/nodes/ports.py +676 -0
- kailash/runtime/local.py +344 -1
- kailash/runtime/validation/__init__.py +20 -0
- kailash/runtime/validation/connection_context.py +119 -0
- kailash/runtime/validation/enhanced_error_formatter.py +202 -0
- kailash/runtime/validation/error_categorizer.py +164 -0
- kailash/runtime/validation/metrics.py +380 -0
- kailash/runtime/validation/performance.py +615 -0
- kailash/runtime/validation/suggestion_engine.py +212 -0
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +234 -8
- kailash/workflow/contracts.py +418 -0
- kailash/workflow/edge_infrastructure.py +369 -0
- kailash/workflow/migration.py +3 -3
- kailash/workflow/type_inference.py +669 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/METADATA +44 -27
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/RECORD +78 -28
- kailash/nexus/__init__.py +0 -21
- kailash/nexus/cli/__init__.py +0 -5
- kailash/nexus/cli/__main__.py +0 -6
- kailash/nexus/cli/main.py +0 -176
- kailash/nexus/factory.py +0 -413
- kailash/nexus/gateway.py +0 -545
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.8.3.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,824 @@
|
|
1
|
+
"""Resource analyzer for intelligent edge resource management.
|
2
|
+
|
3
|
+
This module provides real-time resource analysis, pattern identification,
|
4
|
+
and bottleneck detection for edge computing infrastructure.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import logging
|
9
|
+
from collections import defaultdict, deque
|
10
|
+
from dataclasses import dataclass, field
|
11
|
+
from datetime import datetime, timedelta
|
12
|
+
from enum import Enum
|
13
|
+
from typing import Any, Dict, List, Optional, Tuple
|
14
|
+
|
15
|
+
import numpy as np
|
16
|
+
from scipy import stats
|
17
|
+
|
18
|
+
|
19
|
+
class ResourceType(Enum):
    """Categories of edge resources that can be measured and analyzed.

    Each member's value is the lowercase string that the ``to_dict``
    serializers emit and that metric storage keys embed.
    """

    CPU = "cpu"
    MEMORY = "memory"
    GPU = "gpu"
    STORAGE = "storage"
    NETWORK = "network"
    CUSTOM = "custom"
|
28
|
+
|
29
|
+
|
30
|
+
class BottleneckType(Enum):
    """Categories of resource bottleneck the analyzer can report."""

    # The resource pool is simply too small for the load.
    CAPACITY = "capacity"
    # Resources exist but are distributed poorly.
    ALLOCATION = "allocation"
    # Consumers conflict over the same resource.
    CONTENTION = "contention"
    # Space is wasted and cannot be handed out.
    FRAGMENTATION = "fragmentation"
    # Usage is being rate limited.
    THROTTLING = "throttling"
|
38
|
+
|
39
|
+
|
40
|
+
@dataclass
class ResourceMetric:
    """One sample of a resource's usage on a single edge node."""

    timestamp: datetime
    edge_node: str
    resource_type: ResourceType
    used: float
    available: float
    total: float
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def utilization(self) -> float:
        """Share of ``total`` that is ``used``, as a percentage.

        Returns 0.0 when ``total`` is zero so callers never divide by zero.
        """
        return (self.used / self.total) * 100 if self.total != 0 else 0.0

    @property
    def free(self) -> float:
        """Amount of the resource not in use (``total`` minus ``used``)."""
        return self.total - self.used

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the sample, including the derived utilization."""
        payload: Dict[str, Any] = {
            "timestamp": self.timestamp.isoformat(),
            "edge_node": self.edge_node,
            "resource_type": self.resource_type.value,
            "used": self.used,
            "available": self.available,
            "total": self.total,
            "utilization": self.utilization,
            "metadata": self.metadata,
        }
        return payload
|
76
|
+
|
77
|
+
|
78
|
+
@dataclass
class ResourcePattern:
    """A usage pattern found by the analyzer, with advice attached."""

    pattern_type: str
    confidence: float
    edge_nodes: List[str]
    resource_types: List[ResourceType]
    characteristics: Dict[str, Any]
    recommendations: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the pattern; resource types become their string values."""
        type_values = [rt.value for rt in self.resource_types]
        payload: Dict[str, Any] = {
            "pattern_type": self.pattern_type,
            "confidence": self.confidence,
            "edge_nodes": self.edge_nodes,
            "resource_types": type_values,
            "characteristics": self.characteristics,
            "recommendations": self.recommendations,
        }
        return payload
|
99
|
+
|
100
|
+
|
101
|
+
@dataclass
class Bottleneck:
    """A resource bottleneck found on one edge node."""

    bottleneck_type: BottleneckType
    severity: float  # 0-1 scale
    edge_node: str
    resource_type: ResourceType
    description: str
    impact: Dict[str, Any]
    resolution: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the bottleneck; enum fields become their string values."""
        payload: Dict[str, Any] = {
            "bottleneck_type": self.bottleneck_type.value,
            "severity": self.severity,
            "edge_node": self.edge_node,
            "resource_type": self.resource_type.value,
            "description": self.description,
            "impact": self.impact,
            "resolution": self.resolution,
        }
        return payload
|
124
|
+
|
125
|
+
|
126
|
+
class ResourceAnalyzer:
|
127
|
+
"""Analyzes resource usage patterns and identifies bottlenecks."""
|
128
|
+
|
129
|
+
def __init__(
|
130
|
+
self,
|
131
|
+
history_window: int = 3600, # 1 hour
|
132
|
+
analysis_interval: int = 60, # 1 minute
|
133
|
+
anomaly_threshold: float = 2.5, # Standard deviations
|
134
|
+
pattern_confidence_threshold: float = 0.7,
|
135
|
+
):
|
136
|
+
"""Initialize resource analyzer.
|
137
|
+
|
138
|
+
Args:
|
139
|
+
history_window: Time window for analysis (seconds)
|
140
|
+
analysis_interval: Interval between analyses (seconds)
|
141
|
+
anomaly_threshold: Threshold for anomaly detection
|
142
|
+
pattern_confidence_threshold: Minimum confidence for patterns
|
143
|
+
"""
|
144
|
+
self.history_window = history_window
|
145
|
+
self.analysis_interval = analysis_interval
|
146
|
+
self.anomaly_threshold = anomaly_threshold
|
147
|
+
self.pattern_confidence_threshold = pattern_confidence_threshold
|
148
|
+
|
149
|
+
# Resource metrics storage
|
150
|
+
self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
|
151
|
+
|
152
|
+
# Analysis results
|
153
|
+
self.patterns: List[ResourcePattern] = []
|
154
|
+
self.bottlenecks: List[Bottleneck] = []
|
155
|
+
self.anomalies: List[Dict[str, Any]] = []
|
156
|
+
|
157
|
+
# Background task
|
158
|
+
self._analysis_task: Optional[asyncio.Task] = None
|
159
|
+
|
160
|
+
self.logger = logging.getLogger(__name__)
|
161
|
+
|
162
|
+
async def start(self):
    """Launch the background analysis loop unless a task already exists."""
    if self._analysis_task:
        # Already started; keep the existing task.
        return

    self._analysis_task = asyncio.create_task(self._analysis_loop())
    self.logger.info("Resource analyzer started")
|
167
|
+
|
168
|
+
async def stop(self):
    """Cancel the background analysis task, if any, and wait for it to end."""
    if not self._analysis_task:
        return

    self._analysis_task.cancel()
    try:
        await self._analysis_task
    except asyncio.CancelledError:
        # Expected: the task was cancelled just above.
        pass

    self._analysis_task = None
    self.logger.info("Resource analyzer stopped")
|
178
|
+
|
179
|
+
async def record_metric(self, metric: ResourceMetric):
    """Store a sample in its series and run the immediate checks on it.

    Args:
        metric: Resource metric to record.
    """
    # Series are keyed by "<edge_node>:<resource_type value>".
    storage_key = f"{metric.edge_node}:{metric.resource_type.value}"
    series = self.metrics[storage_key]
    series.append(metric)

    await self._check_immediate_issues(metric)
|
190
|
+
|
191
|
+
async def analyze_resources(self) -> Dict[str, Any]:
    """Run every analysis from scratch and return the combined results.

    Returns:
        Dict with serialized ``patterns`` and ``bottlenecks``, the raw
        ``anomalies`` list and a ``summary``.
    """
    # Discard whatever the previous pass found.
    for stale in (self.patterns, self.bottlenecks, self.anomalies):
        stale.clear()

    # The passes run one after another, in this order.
    await self._identify_patterns()
    await self._detect_bottlenecks()
    await self._detect_anomalies()

    serialized_patterns = [found.to_dict() for found in self.patterns]
    serialized_bottlenecks = [found.to_dict() for found in self.bottlenecks]
    return {
        "patterns": serialized_patterns,
        "bottlenecks": serialized_bottlenecks,
        "anomalies": self.anomalies,
        "summary": self._generate_summary(),
    }
|
213
|
+
|
214
|
+
async def get_resource_trends(
    self,
    edge_node: Optional[str] = None,
    resource_type: Optional[ResourceType] = None,
    duration_minutes: int = 60,
) -> Dict[str, Any]:
    """Summarize recent utilization per series and fit a linear trend.

    Args:
        edge_node: Only include series recorded for this node.
        resource_type: Only include series of this resource type.
        duration_minutes: How far back from now to look.

    Returns:
        Mapping of storage key ("<edge_node>:<resource_type>") to a dict
        with ``current``, ``average``, ``min``, ``max``, ``std_dev``,
        ``trend_slope`` (utilization points per second),
        ``trend_direction`` and ``prediction_1h`` (the fitted utilization
        one hour from now, or None when the fit is weak). Series with
        fewer than two samples inside the window are omitted.
    """
    trends: Dict[str, Any] = {}
    window_seconds = duration_minutes * 60
    cutoff = datetime.now() - timedelta(minutes=duration_minutes)

    for key, metrics in self.metrics.items():
        # Split on the LAST colon: resource type values never contain
        # one, but node names (e.g. "host:8080") may.
        node, rtype = key.rsplit(":", 1)

        # Apply filters
        if edge_node and node != edge_node:
            continue
        if resource_type and rtype != resource_type.value:
            continue

        recent = [m for m in metrics if m.timestamp > cutoff]
        if len(recent) < 2:
            # A trend needs at least two points.
            continue

        utilizations = [m.utilization for m in recent]
        offsets = [(m.timestamp - cutoff).total_seconds() for m in recent]

        if len(set(offsets)) > 1:
            slope, intercept, r_value, _, _ = stats.linregress(
                offsets, utilizations
            )
        else:
            # linregress cannot fit a line when every x value is the
            # same; report a flat trend with no prediction instead.
            slope, intercept, r_value = 0.0, utilizations[-1], 0.0

        if slope > 0.1:
            direction = "increasing"
        elif slope < -0.1:
            direction = "decreasing"
        else:
            direction = "stable"

        prediction = None
        if abs(r_value) > 0.5:
            # Offsets are measured from the cutoff, so "now" sits at
            # window_seconds; one hour ahead is 3600 seconds past that.
            prediction = intercept + slope * (window_seconds + 3600)

        trends[key] = {
            "current": utilizations[-1],
            "average": np.mean(utilizations),
            "min": np.min(utilizations),
            "max": np.max(utilizations),
            "std_dev": np.std(utilizations),
            "trend_slope": slope,
            "trend_direction": direction,
            "prediction_1h": prediction,
        }

    return trends
|
275
|
+
|
276
|
+
async def get_optimization_recommendations(self) -> List[Dict[str, Any]]:
    """Run a fresh analysis and turn its findings into recommendations.

    Patterns at or above ``pattern_confidence_threshold`` and bottlenecks
    with severity above 0.5 each yield one entry.

    Returns:
        Recommendation dicts ordered from most to least urgent
        (critical, high, medium, low). Entries of equal priority keep
        their discovery order, patterns before bottlenecks.
    """
    # Priority labels are strings, so they must be ranked explicitly;
    # sorting the labels themselves would order them alphabetically.
    priority_rank = {"critical": 3, "high": 2, "medium": 1, "low": 0}

    # Analyze current state
    await self.analyze_resources()

    recommendations: List[Dict[str, Any]] = []

    # Pattern-based recommendations
    for pattern in self.patterns:
        if pattern.confidence < self.pattern_confidence_threshold:
            continue
        recommendations.append(
            {
                "type": "pattern",
                "priority": self._calculate_priority(pattern.confidence),
                "pattern": pattern.pattern_type,
                "affected_nodes": pattern.edge_nodes,
                "recommendations": pattern.recommendations,
                "expected_improvement": pattern.characteristics.get(
                    "improvement", "10-20%"
                ),
            }
        )

    # Bottleneck-based recommendations
    for bottleneck in self.bottlenecks:
        if bottleneck.severity > 0.5:
            recommendations.append(
                {
                    "type": "bottleneck",
                    "priority": self._calculate_priority(bottleneck.severity),
                    "issue": bottleneck.description,
                    "node": bottleneck.edge_node,
                    "resource": bottleneck.resource_type.value,
                    "resolutions": bottleneck.resolution,
                    "impact": bottleneck.impact,
                }
            )

    # Most urgent first; list.sort is stable, so ties keep their order.
    # Unknown labels rank below "low".
    recommendations.sort(
        key=lambda rec: priority_rank.get(rec["priority"], -1), reverse=True
    )

    return recommendations
|
322
|
+
|
323
|
+
async def _analysis_loop(self):
|
324
|
+
"""Background analysis loop."""
|
325
|
+
while True:
|
326
|
+
try:
|
327
|
+
await asyncio.sleep(self.analysis_interval)
|
328
|
+
await self.analyze_resources()
|
329
|
+
except asyncio.CancelledError:
|
330
|
+
break
|
331
|
+
except Exception as e:
|
332
|
+
self.logger.error(f"Analysis error: {e}")
|
333
|
+
|
334
|
+
async def _check_immediate_issues(self, metric: ResourceMetric):
|
335
|
+
"""Check for immediate issues requiring attention.
|
336
|
+
|
337
|
+
Args:
|
338
|
+
metric: Resource metric to check
|
339
|
+
"""
|
340
|
+
# Critical utilization check
|
341
|
+
if metric.utilization > 95:
|
342
|
+
self.logger.warning(
|
343
|
+
f"Critical {metric.resource_type.value} utilization "
|
344
|
+
f"on {metric.edge_node}: {metric.utilization:.1f}%"
|
345
|
+
)
|
346
|
+
|
347
|
+
# No available resources
|
348
|
+
if metric.available == 0 and metric.resource_type in [
|
349
|
+
ResourceType.CPU,
|
350
|
+
ResourceType.MEMORY,
|
351
|
+
]:
|
352
|
+
self.logger.error(
|
353
|
+
f"No {metric.resource_type.value} available " f"on {metric.edge_node}"
|
354
|
+
)
|
355
|
+
|
356
|
+
async def _identify_patterns(self):
|
357
|
+
"""Identify resource usage patterns."""
|
358
|
+
# Periodic pattern detection
|
359
|
+
periodic_pattern = await self._detect_periodic_pattern()
|
360
|
+
if periodic_pattern:
|
361
|
+
self.patterns.append(periodic_pattern)
|
362
|
+
|
363
|
+
# Spike pattern detection
|
364
|
+
spike_pattern = await self._detect_spike_pattern()
|
365
|
+
if spike_pattern:
|
366
|
+
self.patterns.append(spike_pattern)
|
367
|
+
|
368
|
+
# Gradual increase pattern
|
369
|
+
growth_pattern = await self._detect_growth_pattern()
|
370
|
+
if growth_pattern:
|
371
|
+
self.patterns.append(growth_pattern)
|
372
|
+
|
373
|
+
# Imbalance pattern
|
374
|
+
imbalance_pattern = await self._detect_imbalance_pattern()
|
375
|
+
if imbalance_pattern:
|
376
|
+
self.patterns.append(imbalance_pattern)
|
377
|
+
|
378
|
+
async def _detect_periodic_pattern(self) -> Optional[ResourcePattern]:
    """Look for a dominant periodic component in any long-enough series.

    Returns:
        A "periodic" pattern for the first series (in storage order) that
        has at least 100 samples and whose strongest non-constant FFT
        component has a magnitude above 10% of the sample count; None
        when no series qualifies.
    """
    for key, metrics in self.metrics.items():
        if len(metrics) < 100:
            continue

        # Extract utilization time series
        utilizations = [m.utilization for m in metrics]
        sample_count = len(utilizations)

        # Simple FFT-based periodicity detection
        spectrum = np.fft.fft(utilizations)
        frequencies = np.fft.fftfreq(sample_count)

        # Skip bin 0 (the constant component) and scan only the first
        # half of the spectrum.
        dominant_idx = np.argmax(np.abs(spectrum[1 : len(spectrum) // 2])) + 1
        magnitude = np.abs(spectrum[dominant_idx])
        if not magnitude > sample_count * 0.1:
            continue

        frequency = frequencies[dominant_idx]
        period = 1 / frequency if frequency != 0 else 0
        if not period > 0:
            continue

        # NOTE(review): converting the period from samples to seconds
        # assumes one sample per analysis_interval — confirm with callers.
        period_seconds = abs(period * self.analysis_interval)

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)
        return ResourcePattern(
            pattern_type="periodic",
            confidence=min(magnitude / sample_count, 1.0),
            edge_nodes=[node],
            resource_types=[ResourceType(rtype)],
            characteristics={
                "period_seconds": period_seconds,
                "amplitude": np.std(utilizations),
                "improvement": "15-25%",
            },
            recommendations=[
                f"Implement predictive scaling with {period_seconds:.0f}s period",
                "Use time-based resource allocation",
                "Consider workload scheduling optimization",
            ],
        )

    return None
|
422
|
+
|
423
|
+
async def _detect_spike_pattern(self) -> Optional[ResourcePattern]:
    """Find series in which spikes are frequent.

    A sample counts as a spike when it exceeds the series mean by more
    than two standard deviations. A series with at least 10 samples is
    flagged when more than 10% of its samples are spikes.

    Returns:
        One "spike" pattern covering every flagged node and resource
        type (each listed once, in discovery order), or None.
    """
    # Dicts are used as insertion-ordered sets so output order is stable.
    spike_nodes = {}
    spike_resources = {}

    for key, metrics in self.metrics.items():
        if len(metrics) < 10:
            continue

        utilizations = [m.utilization for m in metrics]
        spike_floor = np.mean(utilizations) + 2 * np.std(utilizations)

        # Count spikes
        spikes = sum(1 for u in utilizations if u > spike_floor)

        if spikes > len(utilizations) * 0.1:  # More than 10% are spikes
            # Split on the LAST colon so node names containing ":" survive.
            node, rtype = key.rsplit(":", 1)
            spike_nodes[node] = None
            spike_resources[ResourceType(rtype)] = None

    if not spike_nodes:
        return None

    return ResourcePattern(
        pattern_type="spike",
        confidence=0.8,
        edge_nodes=list(spike_nodes),
        resource_types=list(spike_resources),
        characteristics={
            "frequency": "frequent",
            "impact": "high",
            "improvement": "20-30%",
        },
        recommendations=[
            "Implement burst capacity allocation",
            "Use resource pooling for spike handling",
            "Consider request rate limiting",
            "Enable auto-scaling with aggressive policies",
        ],
    )
|
464
|
+
|
465
|
+
async def _detect_growth_pattern(self) -> Optional[ResourcePattern]:
    """Find series whose utilization is climbing steadily.

    For every series with at least 50 samples, a line is fitted to the
    last 50 utilizations against their sample index. The series is
    flagged when the slope exceeds 0.1 points per sample and the
    correlation magnitude exceeds 0.7.

    Returns:
        One "gradual_growth" pattern covering every flagged node and
        resource type (each listed once, in discovery order), or None.
    """
    # Dicts are used as insertion-ordered sets so output order is stable.
    growth_nodes = {}
    growth_resources = {}

    for key, metrics in self.metrics.items():
        if len(metrics) < 50:
            continue

        # Get recent metrics
        recent = list(metrics)[-50:]
        utilizations = [m.utilization for m in recent]
        sample_index = list(range(len(utilizations)))

        # Linear regression
        slope, _, r_value, _, _ = stats.linregress(sample_index, utilizations)

        # Significant positive trend
        if slope > 0.1 and abs(r_value) > 0.7:
            # Split on the LAST colon so node names containing ":" survive.
            node, rtype = key.rsplit(":", 1)
            growth_nodes[node] = None
            growth_resources[ResourceType(rtype)] = None

    if not growth_nodes:
        return None

    return ResourcePattern(
        pattern_type="gradual_growth",
        confidence=0.75,
        edge_nodes=list(growth_nodes),
        resource_types=list(growth_resources),
        characteristics={
            "growth_rate": "steady",
            "risk": "capacity_exhaustion",
            "improvement": "25-35%",
        },
        recommendations=[
            "Plan capacity expansion",
            "Implement predictive scaling",
            "Review resource cleanup policies",
            "Consider workload migration strategies",
        ],
    )
|
508
|
+
|
509
|
+
async def _detect_imbalance_pattern(self) -> Optional[ResourcePattern]:
    """Detect resource types whose load is spread unevenly across nodes.

    Each series contributes the mean utilization of its last 10 samples.
    A resource type reported by more than one node is imbalanced when the
    coefficient of variation of those means exceeds 0.5. Within each
    imbalanced type, nodes more than 20 points above or below that
    type's mean are listed as over- or under-utilized. Nodes are compared
    only against other nodes for the same resource type, so readings of
    different resource types are never mixed.

    Returns:
        One "imbalance" pattern, or None when no type is imbalanced.
    """
    # resource type -> {node: recent mean utilization}
    by_type = defaultdict(dict)

    for key, metrics in self.metrics.items():
        if not metrics:
            continue

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)
        recent_util = np.mean([m.utilization for m in list(metrics)[-10:]])
        by_type[ResourceType(rtype)][node] = recent_util

    imbalanced_resources = []
    # Dicts are used as insertion-ordered sets: a node is listed once
    # even if it stands out for several resource types.
    over = {}
    under = {}

    for rtype, node_utils in by_type.items():
        if len(node_utils) < 2:
            continue

        utils = list(node_utils.values())
        mean_util = np.mean(utils)
        cv = np.std(utils) / mean_util if mean_util > 0 else 0
        if not cv > 0.5:  # Coefficient of variation > 0.5
            continue

        imbalanced_resources.append(rtype)
        for node, util in node_utils.items():
            if util > mean_util + 20:
                over[node] = None
            elif util < mean_util - 20:
                under[node] = None

    if not imbalanced_resources:
        return None

    over_utilized = list(over)
    under_utilized = list(under)

    return ResourcePattern(
        pattern_type="imbalance",
        confidence=0.85,
        edge_nodes=over_utilized + under_utilized,
        resource_types=imbalanced_resources,
        characteristics={
            "over_utilized": over_utilized,
            "under_utilized": under_utilized,
            "imbalance_severity": "high",
            "improvement": "30-40%",
        },
        recommendations=[
            "Implement load balancing strategies",
            "Use affinity rules for better distribution",
            "Consider workload migration from hot nodes",
            "Enable cross-node resource sharing",
        ],
    )
|
559
|
+
|
560
|
+
async def _detect_bottlenecks(self):
|
561
|
+
"""Detect resource bottlenecks."""
|
562
|
+
# Capacity bottlenecks
|
563
|
+
await self._detect_capacity_bottlenecks()
|
564
|
+
|
565
|
+
# Allocation bottlenecks
|
566
|
+
await self._detect_allocation_bottlenecks()
|
567
|
+
|
568
|
+
# Contention bottlenecks
|
569
|
+
await self._detect_contention_bottlenecks()
|
570
|
+
|
571
|
+
# Fragmentation bottlenecks
|
572
|
+
await self._detect_fragmentation_bottlenecks()
|
573
|
+
|
574
|
+
async def _detect_capacity_bottlenecks(self):
    """Report series whose utilization stays above 85%.

    Looks at the last 10 samples of each series and appends a CAPACITY
    bottleneck to ``self.bottlenecks`` when more than 80% of them exceed
    85% utilization. Severity scales from 0 at a mean of 85% to 1 at
    100%, clamped to the 0-1 range.
    """
    for key, metrics in self.metrics.items():
        if not metrics:
            continue

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)
        recent = list(metrics)[-10:]

        # Check sustained high utilization
        high_util_count = sum(1 for m in recent if m.utilization > 85)
        if not high_util_count > len(recent) * 0.8:
            continue

        avg_util = np.mean([m.utilization for m in recent])
        # A few very low samples can pull the mean under 85 even though
        # most samples are high; clamp so severity never goes negative.
        severity = max(0.0, min((avg_util - 85) / 15, 1.0))

        self.bottlenecks.append(
            Bottleneck(
                bottleneck_type=BottleneckType.CAPACITY,
                severity=severity,
                edge_node=node,
                resource_type=ResourceType(rtype),
                description=f"Sustained high {rtype} utilization ({avg_util:.1f}%)",
                impact={
                    "performance_degradation": "high",
                    "request_failures": avg_util > 95,
                    "user_impact": "significant",
                },
                resolution=[
                    f"Increase {rtype} capacity on {node}",
                    "Migrate workloads to other nodes",
                    "Optimize resource-intensive operations",
                    "Enable vertical scaling",
                ],
            )
        )
|
609
|
+
|
610
|
+
async def _detect_allocation_bottlenecks(self):
    """Report series whose utilization jumps around from sample to sample.

    For each series with at least 20 samples, the standard deviation of
    consecutive utilization differences is computed. Above 10 points, an
    ALLOCATION bottleneck is appended to ``self.bottlenecks`` with
    severity ``std / 20`` capped at 1.
    """
    for key, metrics in self.metrics.items():
        if len(metrics) < 20:
            continue

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)

        # Look for allocation/deallocation patterns
        utils = [m.utilization for m in metrics]
        change_std = np.std(np.diff(utils))

        # High variation suggests allocation issues
        if not change_std > 10:
            continue

        self.bottlenecks.append(
            Bottleneck(
                bottleneck_type=BottleneckType.ALLOCATION,
                severity=min(change_std / 20, 1.0),
                edge_node=node,
                resource_type=ResourceType(rtype),
                description=f"Inefficient {rtype} allocation patterns",
                impact={
                    "resource_waste": "moderate",
                    "response_variance": "high",
                    "efficiency": "low",
                },
                resolution=[
                    "Implement resource pooling",
                    "Use allocation caching",
                    "Optimize allocation algorithms",
                    "Review resource lifecycle management",
                ],
            )
        )
|
645
|
+
|
646
|
+
async def _detect_contention_bottlenecks(self):
    """Report series whose samples carry long wait times.

    For each series with at least 30 samples, the ``"wait_time"`` values
    found in sample metadata are averaged. When the average exceeds 100,
    a CONTENTION bottleneck is appended to ``self.bottlenecks`` with
    severity ``average / 500`` capped at 1. Samples without a
    ``"wait_time"`` entry are ignored.
    """
    for key, metrics in self.metrics.items():
        if len(metrics) < 30:
            continue

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)

        # Get wait times from metadata if available
        wait_times = [
            m.metadata["wait_time"] for m in metrics if "wait_time" in m.metadata
        ]
        if not wait_times:
            continue

        avg_wait = np.mean(wait_times)
        if not avg_wait > 100:  # 100ms average wait
            continue

        self.bottlenecks.append(
            Bottleneck(
                bottleneck_type=BottleneckType.CONTENTION,
                severity=min(avg_wait / 500, 1.0),
                edge_node=node,
                resource_type=ResourceType(rtype),
                description=f"High {rtype} contention (avg wait: {avg_wait:.0f}ms)",
                impact={
                    "latency_increase": f"{avg_wait:.0f}ms",
                    "throughput_reduction": "significant",
                    "user_experience": "degraded",
                },
                resolution=[
                    "Implement resource locking optimization",
                    "Use lock-free data structures",
                    "Increase resource pool size",
                    "Review concurrent access patterns",
                ],
            )
        )
|
682
|
+
|
683
|
+
async def _detect_fragmentation_bottlenecks(self):
    """Report series where capacity is neither used nor available.

    Examines the last 5 samples of each series. A sample is suspicious
    when ``used`` is below 70% of ``total`` while ``available`` is below
    20% of ``total``; the share of ``total`` that is neither used nor
    available is taken as the fragmentation percentage. Above 10%, a
    FRAGMENTATION bottleneck is appended to ``self.bottlenecks``
    (severity ``pct / 30`` capped at 1) and the rest of that series is
    skipped, so each series is reported at most once.
    """
    for key, metrics in self.metrics.items():
        if not metrics:
            continue

        # Split on the LAST colon so node names containing ":" survive.
        node, rtype = key.rsplit(":", 1)
        recent = list(metrics)[-5:]

        # Check for fragmentation indicators
        for m in recent:
            if m.total <= 0:
                # No capacity reported; nothing to divide by.
                continue
            if not (m.used < m.total * 0.7 and m.available < m.total * 0.2):
                continue

            # Used is low but available is also low = fragmentation
            fragmentation_pct = (1 - (m.used + m.available) / m.total) * 100
            if not fragmentation_pct > 10:
                continue

            self.bottlenecks.append(
                Bottleneck(
                    bottleneck_type=BottleneckType.FRAGMENTATION,
                    severity=min(fragmentation_pct / 30, 1.0),
                    edge_node=node,
                    resource_type=ResourceType(rtype),
                    description=f"{rtype} fragmentation ({fragmentation_pct:.1f}%)",
                    impact={
                        "wasted_resources": f"{fragmentation_pct:.1f}%",
                        "allocation_failures": fragmentation_pct > 20,
                        "efficiency": "reduced",
                    },
                    resolution=[
                        "Implement defragmentation routine",
                        "Use contiguous allocation strategies",
                        "Review resource allocation sizes",
                        "Enable resource compaction",
                    ],
                )
            )
            # One report per series is enough.
            break
|
720
|
+
|
721
|
+
async def _detect_anomalies(self):
|
722
|
+
"""Detect resource anomalies."""
|
723
|
+
cutoff = datetime.now() - timedelta(seconds=self.history_window)
|
724
|
+
|
725
|
+
for key, metrics in self.metrics.items():
|
726
|
+
if len(metrics) < 20:
|
727
|
+
continue
|
728
|
+
|
729
|
+
node, rtype = key.split(":")
|
730
|
+
|
731
|
+
# Get historical data
|
732
|
+
historical = [m for m in metrics if m.timestamp < cutoff]
|
733
|
+
recent = [m for m in metrics if m.timestamp >= cutoff]
|
734
|
+
|
735
|
+
if len(historical) < 10 or len(recent) < 3:
|
736
|
+
continue
|
737
|
+
|
738
|
+
# Calculate statistics
|
739
|
+
hist_utils = [m.utilization for m in historical]
|
740
|
+
recent_utils = [m.utilization for m in recent]
|
741
|
+
|
742
|
+
mean = np.mean(hist_utils)
|
743
|
+
std = np.std(hist_utils)
|
744
|
+
|
745
|
+
# Check for anomalies
|
746
|
+
for i, util in enumerate(recent_utils):
|
747
|
+
z_score = (util - mean) / std if std > 0 else 0
|
748
|
+
|
749
|
+
if abs(z_score) > self.anomaly_threshold:
|
750
|
+
self.anomalies.append(
|
751
|
+
{
|
752
|
+
"timestamp": recent[i].timestamp.isoformat(),
|
753
|
+
"edge_node": node,
|
754
|
+
"resource_type": rtype,
|
755
|
+
"value": util,
|
756
|
+
"expected_range": [mean - 2 * std, mean + 2 * std],
|
757
|
+
"z_score": z_score,
|
758
|
+
"severity": "high" if abs(z_score) > 4 else "medium",
|
759
|
+
"description": f"Unusual {rtype} utilization: {util:.1f}% (expected: {mean:.1f}±{std:.1f}%)",
|
760
|
+
}
|
761
|
+
)
|
762
|
+
|
763
|
+
def _generate_summary(self) -> Dict[str, Any]:
|
764
|
+
"""Generate analysis summary."""
|
765
|
+
# Calculate overall health score
|
766
|
+
pattern_score = 100 - len(self.patterns) * 10
|
767
|
+
bottleneck_score = 100 - sum(b.severity * 20 for b in self.bottlenecks)
|
768
|
+
anomaly_score = 100 - len(self.anomalies) * 5
|
769
|
+
|
770
|
+
health_score = max(
|
771
|
+
0, min(100, (pattern_score + bottleneck_score + anomaly_score) / 3)
|
772
|
+
)
|
773
|
+
|
774
|
+
return {
|
775
|
+
"health_score": health_score,
|
776
|
+
"health_status": self._get_health_status(health_score),
|
777
|
+
"total_patterns": len(self.patterns),
|
778
|
+
"total_bottlenecks": len(self.bottlenecks),
|
779
|
+
"total_anomalies": len(self.anomalies),
|
780
|
+
"critical_issues": len([b for b in self.bottlenecks if b.severity > 0.8]),
|
781
|
+
"top_recommendations": self._get_top_recommendations(),
|
782
|
+
}
|
783
|
+
|
784
|
+
def _get_health_status(self, score: float) -> str:
|
785
|
+
"""Get health status from score."""
|
786
|
+
if score >= 90:
|
787
|
+
return "excellent"
|
788
|
+
elif score >= 75:
|
789
|
+
return "good"
|
790
|
+
elif score >= 60:
|
791
|
+
return "fair"
|
792
|
+
elif score >= 40:
|
793
|
+
return "poor"
|
794
|
+
else:
|
795
|
+
return "critical"
|
796
|
+
|
797
|
+
def _get_top_recommendations(self) -> List[str]:
|
798
|
+
"""Get top recommendations."""
|
799
|
+
recommendations = []
|
800
|
+
|
801
|
+
# From patterns
|
802
|
+
for pattern in sorted(self.patterns, key=lambda p: p.confidence, reverse=True)[
|
803
|
+
:2
|
804
|
+
]:
|
805
|
+
recommendations.extend(pattern.recommendations[:1])
|
806
|
+
|
807
|
+
# From bottlenecks
|
808
|
+
for bottleneck in sorted(
|
809
|
+
self.bottlenecks, key=lambda b: b.severity, reverse=True
|
810
|
+
)[:2]:
|
811
|
+
recommendations.extend(bottleneck.resolution[:1])
|
812
|
+
|
813
|
+
return recommendations[:5] # Top 5 recommendations
|
814
|
+
|
815
|
+
def _calculate_priority(self, score: float) -> str:
|
816
|
+
"""Calculate priority from score."""
|
817
|
+
if score >= 0.8:
|
818
|
+
return "critical"
|
819
|
+
elif score >= 0.6:
|
820
|
+
return "high"
|
821
|
+
elif score >= 0.4:
|
822
|
+
return "medium"
|
823
|
+
else:
|
824
|
+
return "low"
|