kailash 0.8.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. kailash/__init__.py +1 -7
  2. kailash/cli/__init__.py +11 -1
  3. kailash/cli/validation_audit.py +570 -0
  4. kailash/core/actors/supervisor.py +1 -1
  5. kailash/core/resilience/circuit_breaker.py +71 -1
  6. kailash/core/resilience/health_monitor.py +172 -0
  7. kailash/edge/compliance.py +33 -0
  8. kailash/edge/consistency.py +609 -0
  9. kailash/edge/coordination/__init__.py +30 -0
  10. kailash/edge/coordination/global_ordering.py +355 -0
  11. kailash/edge/coordination/leader_election.py +217 -0
  12. kailash/edge/coordination/partition_detector.py +296 -0
  13. kailash/edge/coordination/raft.py +485 -0
  14. kailash/edge/discovery.py +63 -1
  15. kailash/edge/migration/__init__.py +19 -0
  16. kailash/edge/migration/edge_migrator.py +832 -0
  17. kailash/edge/monitoring/__init__.py +21 -0
  18. kailash/edge/monitoring/edge_monitor.py +736 -0
  19. kailash/edge/prediction/__init__.py +10 -0
  20. kailash/edge/prediction/predictive_warmer.py +591 -0
  21. kailash/edge/resource/__init__.py +102 -0
  22. kailash/edge/resource/cloud_integration.py +796 -0
  23. kailash/edge/resource/cost_optimizer.py +949 -0
  24. kailash/edge/resource/docker_integration.py +919 -0
  25. kailash/edge/resource/kubernetes_integration.py +893 -0
  26. kailash/edge/resource/platform_integration.py +913 -0
  27. kailash/edge/resource/predictive_scaler.py +959 -0
  28. kailash/edge/resource/resource_analyzer.py +824 -0
  29. kailash/edge/resource/resource_pools.py +610 -0
  30. kailash/integrations/dataflow_edge.py +261 -0
  31. kailash/mcp_server/registry_integration.py +1 -1
  32. kailash/monitoring/__init__.py +18 -0
  33. kailash/monitoring/alerts.py +646 -0
  34. kailash/monitoring/metrics.py +677 -0
  35. kailash/nodes/__init__.py +2 -0
  36. kailash/nodes/ai/semantic_memory.py +2 -2
  37. kailash/nodes/base.py +545 -0
  38. kailash/nodes/edge/__init__.py +36 -0
  39. kailash/nodes/edge/base.py +240 -0
  40. kailash/nodes/edge/cloud_node.py +710 -0
  41. kailash/nodes/edge/coordination.py +239 -0
  42. kailash/nodes/edge/docker_node.py +825 -0
  43. kailash/nodes/edge/edge_data.py +582 -0
  44. kailash/nodes/edge/edge_migration_node.py +392 -0
  45. kailash/nodes/edge/edge_monitoring_node.py +421 -0
  46. kailash/nodes/edge/edge_state.py +673 -0
  47. kailash/nodes/edge/edge_warming_node.py +393 -0
  48. kailash/nodes/edge/kubernetes_node.py +652 -0
  49. kailash/nodes/edge/platform_node.py +766 -0
  50. kailash/nodes/edge/resource_analyzer_node.py +378 -0
  51. kailash/nodes/edge/resource_optimizer_node.py +501 -0
  52. kailash/nodes/edge/resource_scaler_node.py +397 -0
  53. kailash/nodes/ports.py +676 -0
  54. kailash/runtime/local.py +344 -1
  55. kailash/runtime/validation/__init__.py +20 -0
  56. kailash/runtime/validation/connection_context.py +119 -0
  57. kailash/runtime/validation/enhanced_error_formatter.py +202 -0
  58. kailash/runtime/validation/error_categorizer.py +164 -0
  59. kailash/runtime/validation/metrics.py +380 -0
  60. kailash/runtime/validation/performance.py +615 -0
  61. kailash/runtime/validation/suggestion_engine.py +212 -0
  62. kailash/testing/fixtures.py +2 -2
  63. kailash/workflow/builder.py +230 -4
  64. kailash/workflow/contracts.py +418 -0
  65. kailash/workflow/edge_infrastructure.py +369 -0
  66. kailash/workflow/migration.py +3 -3
  67. kailash/workflow/type_inference.py +669 -0
  68. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/METADATA +43 -27
  69. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/RECORD +73 -27
  70. kailash/nexus/__init__.py +0 -21
  71. kailash/nexus/cli/__init__.py +0 -5
  72. kailash/nexus/cli/__main__.py +0 -6
  73. kailash/nexus/cli/main.py +0 -176
  74. kailash/nexus/factory.py +0 -413
  75. kailash/nexus/gateway.py +0 -545
  76. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/WHEEL +0 -0
  77. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/entry_points.txt +0 -0
  78. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/licenses/LICENSE +0 -0
  79. {kailash-0.8.4.dist-info → kailash-0.8.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,824 @@
+ """Resource analyzer for intelligent edge resource management.
+
+ This module provides real-time resource analysis, pattern identification,
+ and bottleneck detection for edge computing infrastructure.
+ """
+
+ import asyncio
+ import logging
+ from collections import defaultdict, deque
+ from dataclasses import dataclass, field
+ from datetime import datetime, timedelta
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import numpy as np
+ from scipy import stats
+
+
+ class ResourceType(Enum):
+     """Types of resources to analyze."""
+
+     CPU = "cpu"
+     MEMORY = "memory"
+     GPU = "gpu"
+     STORAGE = "storage"
+     NETWORK = "network"
+     CUSTOM = "custom"
+
+
+ class BottleneckType(Enum):
+     """Types of resource bottlenecks."""
+
+     CAPACITY = "capacity"  # Not enough resources
+     ALLOCATION = "allocation"  # Poor distribution
+     CONTENTION = "contention"  # Resource conflicts
+     FRAGMENTATION = "fragmentation"  # Wasted space
+     THROTTLING = "throttling"  # Rate limiting
+
+
+ @dataclass
+ class ResourceMetric:
+     """Single resource metric measurement."""
+
+     timestamp: datetime
+     edge_node: str
+     resource_type: ResourceType
+     used: float
+     available: float
+     total: float
+     metadata: Dict[str, Any] = field(default_factory=dict)
+
+     @property
+     def utilization(self) -> float:
+         """Calculate utilization percentage."""
+         if self.total == 0:
+             return 0.0
+         return (self.used / self.total) * 100
+
+     @property
+     def free(self) -> float:
+         """Calculate free resources."""
+         return self.total - self.used
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "timestamp": self.timestamp.isoformat(),
+             "edge_node": self.edge_node,
+             "resource_type": self.resource_type.value,
+             "used": self.used,
+             "available": self.available,
+             "total": self.total,
+             "utilization": self.utilization,
+             "metadata": self.metadata,
+         }
+
+
+ @dataclass
+ class ResourcePattern:
+     """Identified resource usage pattern."""
+
+     pattern_type: str
+     confidence: float
+     edge_nodes: List[str]
+     resource_types: List[ResourceType]
+     characteristics: Dict[str, Any]
+     recommendations: List[str]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "pattern_type": self.pattern_type,
+             "confidence": self.confidence,
+             "edge_nodes": self.edge_nodes,
+             "resource_types": [rt.value for rt in self.resource_types],
+             "characteristics": self.characteristics,
+             "recommendations": self.recommendations,
+         }
+
+
+ @dataclass
+ class Bottleneck:
+     """Identified resource bottleneck."""
+
+     bottleneck_type: BottleneckType
+     severity: float  # 0-1 scale
+     edge_node: str
+     resource_type: ResourceType
+     description: str
+     impact: Dict[str, Any]
+     resolution: List[str]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "bottleneck_type": self.bottleneck_type.value,
+             "severity": self.severity,
+             "edge_node": self.edge_node,
+             "resource_type": self.resource_type.value,
+             "description": self.description,
+             "impact": self.impact,
+             "resolution": self.resolution,
+         }
+
+
+ class ResourceAnalyzer:
+     """Analyzes resource usage patterns and identifies bottlenecks."""
+
+     def __init__(
+         self,
+         history_window: int = 3600,  # 1 hour
+         analysis_interval: int = 60,  # 1 minute
+         anomaly_threshold: float = 2.5,  # Standard deviations
+         pattern_confidence_threshold: float = 0.7,
+     ):
+         """Initialize resource analyzer.
+
+         Args:
+             history_window: Time window for analysis (seconds)
+             analysis_interval: Interval between analyses (seconds)
+             anomaly_threshold: Threshold for anomaly detection
+             pattern_confidence_threshold: Minimum confidence for patterns
+         """
+         self.history_window = history_window
+         self.analysis_interval = analysis_interval
+         self.anomaly_threshold = anomaly_threshold
+         self.pattern_confidence_threshold = pattern_confidence_threshold
+
+         # Resource metrics storage
+         self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
+
+         # Analysis results
+         self.patterns: List[ResourcePattern] = []
+         self.bottlenecks: List[Bottleneck] = []
+         self.anomalies: List[Dict[str, Any]] = []
+
+         # Background task
+         self._analysis_task: Optional[asyncio.Task] = None
+
+         self.logger = logging.getLogger(__name__)
+
+     async def start(self):
+         """Start background analysis."""
+         if not self._analysis_task:
+             self._analysis_task = asyncio.create_task(self._analysis_loop())
+             self.logger.info("Resource analyzer started")
+
+     async def stop(self):
+         """Stop background analysis."""
+         if self._analysis_task:
+             self._analysis_task.cancel()
+             try:
+                 await self._analysis_task
+             except asyncio.CancelledError:
+                 pass
+             self._analysis_task = None
+             self.logger.info("Resource analyzer stopped")
+
+     async def record_metric(self, metric: ResourceMetric):
+         """Record a resource metric.
+
+         Args:
+             metric: Resource metric to record
+         """
+         key = f"{metric.edge_node}:{metric.resource_type.value}"
+         self.metrics[key].append(metric)
+
+         # Check for immediate issues
+         await self._check_immediate_issues(metric)
+
+     async def analyze_resources(self) -> Dict[str, Any]:
+         """Perform comprehensive resource analysis.
+
+         Returns:
+             Analysis results
+         """
+         # Clear previous results
+         self.patterns.clear()
+         self.bottlenecks.clear()
+         self.anomalies.clear()
+
+         # Run all analyses
+         await self._identify_patterns()
+         await self._detect_bottlenecks()
+         await self._detect_anomalies()
+
+         return {
+             "patterns": [p.to_dict() for p in self.patterns],
+             "bottlenecks": [b.to_dict() for b in self.bottlenecks],
+             "anomalies": self.anomalies,
+             "summary": self._generate_summary(),
+         }
+
+     async def get_resource_trends(
+         self,
+         edge_node: Optional[str] = None,
+         resource_type: Optional[ResourceType] = None,
+         duration_minutes: int = 60,
+     ) -> Dict[str, Any]:
+         """Get resource usage trends.
+
+         Args:
+             edge_node: Filter by edge node
+             resource_type: Filter by resource type
+             duration_minutes: Duration to analyze
+
+         Returns:
+             Trend analysis
+         """
+         trends = {}
+         cutoff = datetime.now() - timedelta(minutes=duration_minutes)
+
+         for key, metrics in self.metrics.items():
+             node, rtype = key.split(":")
+
+             # Apply filters
+             if edge_node and node != edge_node:
+                 continue
+             if resource_type and rtype != resource_type.value:
+                 continue
+
+             # Get recent metrics
+             recent = [m for m in metrics if m.timestamp > cutoff]
+             if not recent:
+                 continue
+
+             # Calculate trends
+             utilizations = [m.utilization for m in recent]
+             timestamps = [(m.timestamp - cutoff).total_seconds() for m in recent]
+
+             if len(utilizations) > 1:
+                 # Linear regression for trend
+                 slope, intercept, r_value, _, _ = stats.linregress(
+                     timestamps, utilizations
+                 )
+
+                 trends[key] = {
+                     "current": utilizations[-1],
+                     "average": np.mean(utilizations),
+                     "min": np.min(utilizations),
+                     "max": np.max(utilizations),
+                     "std_dev": np.std(utilizations),
+                     "trend_slope": slope,
+                     "trend_direction": (
+                         "increasing"
+                         if slope > 0.1
+                         else "decreasing" if slope < -0.1 else "stable"
+                     ),
+                     "prediction_1h": (
+                         intercept + slope * 3600 if abs(r_value) > 0.5 else None
+                     ),
+                 }
+
+         return trends
+
+     async def get_optimization_recommendations(self) -> List[Dict[str, Any]]:
+         """Get resource optimization recommendations.
+
+         Returns:
+             List of recommendations
+         """
+         recommendations = []
+
+         # Analyze current state
+         await self.analyze_resources()
+
+         # Pattern-based recommendations
+         for pattern in self.patterns:
+             if pattern.confidence >= self.pattern_confidence_threshold:
+                 recommendations.append(
+                     {
+                         "type": "pattern",
+                         "priority": self._calculate_priority(pattern.confidence),
+                         "pattern": pattern.pattern_type,
+                         "affected_nodes": pattern.edge_nodes,
+                         "recommendations": pattern.recommendations,
+                         "expected_improvement": pattern.characteristics.get(
+                             "improvement", "10-20%"
+                         ),
+                     }
+                 )
+
+         # Bottleneck-based recommendations
+         for bottleneck in self.bottlenecks:
+             if bottleneck.severity > 0.5:
+                 recommendations.append(
+                     {
+                         "type": "bottleneck",
+                         "priority": self._calculate_priority(bottleneck.severity),
+                         "issue": bottleneck.description,
+                         "node": bottleneck.edge_node,
+                         "resource": bottleneck.resource_type.value,
+                         "resolutions": bottleneck.resolution,
+                         "impact": bottleneck.impact,
+                     }
+                 )
+
+         # Sort by priority
+         recommendations.sort(key=lambda x: x["priority"], reverse=True)
+
+         return recommendations
+
+     async def _analysis_loop(self):
+         """Background analysis loop."""
+         while True:
+             try:
+                 await asyncio.sleep(self.analysis_interval)
+                 await self.analyze_resources()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 self.logger.error(f"Analysis error: {e}")
+
+     async def _check_immediate_issues(self, metric: ResourceMetric):
+         """Check for immediate issues requiring attention.
+
+         Args:
+             metric: Resource metric to check
+         """
+         # Critical utilization check
+         if metric.utilization > 95:
+             self.logger.warning(
+                 f"Critical {metric.resource_type.value} utilization "
+                 f"on {metric.edge_node}: {metric.utilization:.1f}%"
+             )
+
+         # No available resources
+         if metric.available == 0 and metric.resource_type in [
+             ResourceType.CPU,
+             ResourceType.MEMORY,
+         ]:
+             self.logger.error(
+                 f"No {metric.resource_type.value} available " f"on {metric.edge_node}"
+             )
+
+     async def _identify_patterns(self):
+         """Identify resource usage patterns."""
+         # Periodic pattern detection
+         periodic_pattern = await self._detect_periodic_pattern()
+         if periodic_pattern:
+             self.patterns.append(periodic_pattern)
+
+         # Spike pattern detection
+         spike_pattern = await self._detect_spike_pattern()
+         if spike_pattern:
+             self.patterns.append(spike_pattern)
+
+         # Gradual increase pattern
+         growth_pattern = await self._detect_growth_pattern()
+         if growth_pattern:
+             self.patterns.append(growth_pattern)
+
+         # Imbalance pattern
+         imbalance_pattern = await self._detect_imbalance_pattern()
+         if imbalance_pattern:
+             self.patterns.append(imbalance_pattern)
+
+     async def _detect_periodic_pattern(self) -> Optional[ResourcePattern]:
+         """Detect periodic usage patterns."""
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 100:
+                 continue
+
+             # Extract utilization time series
+             utilizations = [m.utilization for m in metrics]
+
+             # Simple FFT-based periodicity detection
+             fft = np.fft.fft(utilizations)
+             frequencies = np.fft.fftfreq(len(utilizations))
+
+             # Find dominant frequency
+             dominant_idx = np.argmax(np.abs(fft[1 : len(fft) // 2])) + 1
+             if np.abs(fft[dominant_idx]) > len(utilizations) * 0.1:
+                 period = (
+                     1 / frequencies[dominant_idx]
+                     if frequencies[dominant_idx] != 0
+                     else 0
+                 )
+
+                 if period > 0:
+                     node, rtype = key.split(":")
+                     return ResourcePattern(
+                         pattern_type="periodic",
+                         confidence=min(
+                             np.abs(fft[dominant_idx]) / len(utilizations), 1.0
+                         ),
+                         edge_nodes=[node],
+                         resource_types=[ResourceType(rtype)],
+                         characteristics={
+                             "period_seconds": abs(period * self.analysis_interval),
+                             "amplitude": np.std(utilizations),
+                             "improvement": "15-25%",
+                         },
+                         recommendations=[
+                             f"Implement predictive scaling with {abs(period * self.analysis_interval):.0f}s period",
+                             "Use time-based resource allocation",
+                             "Consider workload scheduling optimization",
+                         ],
+                     )
+
+         return None
+
+     async def _detect_spike_pattern(self) -> Optional[ResourcePattern]:
+         """Detect resource usage spikes."""
+         spike_nodes = []
+         spike_resources = set()
+
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 10:
+                 continue
+
+             utilizations = [m.utilization for m in metrics]
+             mean = np.mean(utilizations)
+             std = np.std(utilizations)
+
+             # Count spikes
+             spikes = sum(1 for u in utilizations if u > mean + 2 * std)
+
+             if spikes > len(utilizations) * 0.1:  # More than 10% are spikes
+                 node, rtype = key.split(":")
+                 spike_nodes.append(node)
+                 spike_resources.add(ResourceType(rtype))
+
+         if spike_nodes:
+             return ResourcePattern(
+                 pattern_type="spike",
+                 confidence=0.8,
+                 edge_nodes=list(set(spike_nodes)),
+                 resource_types=list(spike_resources),
+                 characteristics={
+                     "frequency": "frequent",
+                     "impact": "high",
+                     "improvement": "20-30%",
+                 },
+                 recommendations=[
+                     "Implement burst capacity allocation",
+                     "Use resource pooling for spike handling",
+                     "Consider request rate limiting",
+                     "Enable auto-scaling with aggressive policies",
+                 ],
+             )
+
+         return None
+
+     async def _detect_growth_pattern(self) -> Optional[ResourcePattern]:
+         """Detect gradual resource growth patterns."""
+         growth_nodes = []
+         growth_resources = set()
+
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 50:
+                 continue
+
+             # Get recent metrics
+             recent = list(metrics)[-50:]
+             utilizations = [m.utilization for m in recent]
+             timestamps = list(range(len(utilizations)))
+
+             # Linear regression
+             slope, _, r_value, _, _ = stats.linregress(timestamps, utilizations)
+
+             # Significant positive trend
+             if slope > 0.1 and abs(r_value) > 0.7:
+                 node, rtype = key.split(":")
+                 growth_nodes.append(node)
+                 growth_resources.add(ResourceType(rtype))
+
+         if growth_nodes:
+             return ResourcePattern(
+                 pattern_type="gradual_growth",
+                 confidence=0.75,
+                 edge_nodes=list(set(growth_nodes)),
+                 resource_types=list(growth_resources),
+                 characteristics={
+                     "growth_rate": "steady",
+                     "risk": "capacity_exhaustion",
+                     "improvement": "25-35%",
+                 },
+                 recommendations=[
+                     "Plan capacity expansion",
+                     "Implement predictive scaling",
+                     "Review resource cleanup policies",
+                     "Consider workload migration strategies",
+                 ],
+             )
+
+         return None
+
+     async def _detect_imbalance_pattern(self) -> Optional[ResourcePattern]:
+         """Detect resource imbalance across nodes."""
+         # Group by resource type
+         by_type: Dict[ResourceType, List[float]] = defaultdict(list)
+         node_utils: Dict[str, float] = {}
+
+         for key, metrics in self.metrics.items():
+             if not metrics:
+                 continue
+
+             node, rtype = key.split(":")
+             recent_util = np.mean([m.utilization for m in list(metrics)[-10:]])
+
+             by_type[ResourceType(rtype)].append(recent_util)
+             node_utils[node] = recent_util
+
+         # Check for imbalance
+         imbalanced_resources = []
+         for rtype, utils in by_type.items():
+             if len(utils) > 1:
+                 cv = np.std(utils) / np.mean(utils) if np.mean(utils) > 0 else 0
+                 if cv > 0.5:  # Coefficient of variation > 0.5
+                     imbalanced_resources.append(rtype)
+
+         if imbalanced_resources:
+             # Find over and under utilized nodes
+             avg_util = np.mean(list(node_utils.values()))
+             over_utilized = [n for n, u in node_utils.items() if u > avg_util + 20]
+             under_utilized = [n for n, u in node_utils.items() if u < avg_util - 20]
+
+             return ResourcePattern(
+                 pattern_type="imbalance",
+                 confidence=0.85,
+                 edge_nodes=over_utilized + under_utilized,
+                 resource_types=imbalanced_resources,
+                 characteristics={
+                     "over_utilized": over_utilized,
+                     "under_utilized": under_utilized,
+                     "imbalance_severity": "high",
+                     "improvement": "30-40%",
+                 },
+                 recommendations=[
+                     "Implement load balancing strategies",
+                     "Use affinity rules for better distribution",
+                     "Consider workload migration from hot nodes",
+                     "Enable cross-node resource sharing",
+                 ],
+             )
+
+         return None
+
+     async def _detect_bottlenecks(self):
+         """Detect resource bottlenecks."""
+         # Capacity bottlenecks
+         await self._detect_capacity_bottlenecks()
+
+         # Allocation bottlenecks
+         await self._detect_allocation_bottlenecks()
+
+         # Contention bottlenecks
+         await self._detect_contention_bottlenecks()
+
+         # Fragmentation bottlenecks
+         await self._detect_fragmentation_bottlenecks()
+
+     async def _detect_capacity_bottlenecks(self):
+         """Detect capacity bottlenecks."""
+         for key, metrics in self.metrics.items():
+             if not metrics:
+                 continue
+
+             node, rtype = key.split(":")
+             recent = list(metrics)[-10:]
+
+             # Check sustained high utilization
+             high_util_count = sum(1 for m in recent if m.utilization > 85)
+
+             if high_util_count > len(recent) * 0.8:
+                 avg_util = np.mean([m.utilization for m in recent])
+
+                 self.bottlenecks.append(
+                     Bottleneck(
+                         bottleneck_type=BottleneckType.CAPACITY,
+                         severity=min((avg_util - 85) / 15, 1.0),
+                         edge_node=node,
+                         resource_type=ResourceType(rtype),
+                         description=f"Sustained high {rtype} utilization ({avg_util:.1f}%)",
+                         impact={
+                             "performance_degradation": "high",
+                             "request_failures": avg_util > 95,
+                             "user_impact": "significant",
+                         },
+                         resolution=[
+                             f"Increase {rtype} capacity on {node}",
+                             "Migrate workloads to other nodes",
+                             "Optimize resource-intensive operations",
+                             "Enable vertical scaling",
+                         ],
+                     )
+                 )
+
+     async def _detect_allocation_bottlenecks(self):
+         """Detect allocation bottlenecks."""
+         # Check for poor allocation patterns
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 20:
+                 continue
+
+             node, rtype = key.split(":")
+
+             # Look for allocation/deallocation patterns
+             utils = [m.utilization for m in metrics]
+             changes = np.diff(utils)
+
+             # High variation suggests allocation issues
+             if np.std(changes) > 10:
+                 self.bottlenecks.append(
+                     Bottleneck(
+                         bottleneck_type=BottleneckType.ALLOCATION,
+                         severity=min(np.std(changes) / 20, 1.0),
+                         edge_node=node,
+                         resource_type=ResourceType(rtype),
+                         description=f"Inefficient {rtype} allocation patterns",
+                         impact={
+                             "resource_waste": "moderate",
+                             "response_variance": "high",
+                             "efficiency": "low",
+                         },
+                         resolution=[
+                             "Implement resource pooling",
+                             "Use allocation caching",
+                             "Optimize allocation algorithms",
+                             "Review resource lifecycle management",
+                         ],
+                     )
+                 )
+
+     async def _detect_contention_bottlenecks(self):
+         """Detect resource contention."""
+         # Look for patterns indicating contention
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 30:
+                 continue
+
+             node, rtype = key.split(":")
+
+             # Get wait times from metadata if available
+             wait_times = []
+             for m in metrics:
+                 if "wait_time" in m.metadata:
+                     wait_times.append(m.metadata["wait_time"])
+
+             if wait_times and np.mean(wait_times) > 100:  # 100ms average wait
+                 self.bottlenecks.append(
+                     Bottleneck(
+                         bottleneck_type=BottleneckType.CONTENTION,
+                         severity=min(np.mean(wait_times) / 500, 1.0),
+                         edge_node=node,
+                         resource_type=ResourceType(rtype),
+                         description=f"High {rtype} contention (avg wait: {np.mean(wait_times):.0f}ms)",
+                         impact={
+                             "latency_increase": f"{np.mean(wait_times):.0f}ms",
+                             "throughput_reduction": "significant",
+                             "user_experience": "degraded",
+                         },
+                         resolution=[
+                             "Implement resource locking optimization",
+                             "Use lock-free data structures",
+                             "Increase resource pool size",
+                             "Review concurrent access patterns",
+                         ],
+                     )
+                 )
+
+     async def _detect_fragmentation_bottlenecks(self):
+         """Detect resource fragmentation."""
+         for key, metrics in self.metrics.items():
+             if not metrics:
+                 continue
+
+             node, rtype = key.split(":")
+             recent = list(metrics)[-5:]
+
+             # Check for fragmentation indicators
+             for m in recent:
+                 if m.used < m.total * 0.7 and m.available < m.total * 0.2:
+                     # Used is low but available is also low = fragmentation
+                     fragmentation_pct = (1 - (m.used + m.available) / m.total) * 100
+
+                     if fragmentation_pct > 10:
+                         self.bottlenecks.append(
+                             Bottleneck(
+                                 bottleneck_type=BottleneckType.FRAGMENTATION,
+                                 severity=min(fragmentation_pct / 30, 1.0),
+                                 edge_node=node,
+                                 resource_type=ResourceType(rtype),
+                                 description=f"{rtype} fragmentation ({fragmentation_pct:.1f}%)",
+                                 impact={
+                                     "wasted_resources": f"{fragmentation_pct:.1f}%",
+                                     "allocation_failures": fragmentation_pct > 20,
+                                     "efficiency": "reduced",
+                                 },
+                                 resolution=[
+                                     "Implement defragmentation routine",
+                                     "Use contiguous allocation strategies",
+                                     "Review resource allocation sizes",
+                                     "Enable resource compaction",
+                                 ],
+                             )
+                         )
+                         break
+
+     async def _detect_anomalies(self):
+         """Detect resource anomalies."""
+         cutoff = datetime.now() - timedelta(seconds=self.history_window)
+
+         for key, metrics in self.metrics.items():
+             if len(metrics) < 20:
+                 continue
+
+             node, rtype = key.split(":")
+
+             # Get historical data
+             historical = [m for m in metrics if m.timestamp < cutoff]
+             recent = [m for m in metrics if m.timestamp >= cutoff]
+
+             if len(historical) < 10 or len(recent) < 3:
+                 continue
+
+             # Calculate statistics
+             hist_utils = [m.utilization for m in historical]
+             recent_utils = [m.utilization for m in recent]
+
+             mean = np.mean(hist_utils)
+             std = np.std(hist_utils)
+
+             # Check for anomalies
+             for i, util in enumerate(recent_utils):
+                 z_score = (util - mean) / std if std > 0 else 0
+
+                 if abs(z_score) > self.anomaly_threshold:
+                     self.anomalies.append(
+                         {
+                             "timestamp": recent[i].timestamp.isoformat(),
+                             "edge_node": node,
+                             "resource_type": rtype,
+                             "value": util,
+                             "expected_range": [mean - 2 * std, mean + 2 * std],
+                             "z_score": z_score,
+                             "severity": "high" if abs(z_score) > 4 else "medium",
+                             "description": f"Unusual {rtype} utilization: {util:.1f}% (expected: {mean:.1f}±{std:.1f}%)",
+                         }
+                     )
+
+     def _generate_summary(self) -> Dict[str, Any]:
+         """Generate analysis summary."""
+         # Calculate overall health score
+         pattern_score = 100 - len(self.patterns) * 10
+         bottleneck_score = 100 - sum(b.severity * 20 for b in self.bottlenecks)
+         anomaly_score = 100 - len(self.anomalies) * 5
+
+         health_score = max(
+             0, min(100, (pattern_score + bottleneck_score + anomaly_score) / 3)
+         )
+
+         return {
+             "health_score": health_score,
+             "health_status": self._get_health_status(health_score),
+             "total_patterns": len(self.patterns),
+             "total_bottlenecks": len(self.bottlenecks),
+             "total_anomalies": len(self.anomalies),
+             "critical_issues": len([b for b in self.bottlenecks if b.severity > 0.8]),
+             "top_recommendations": self._get_top_recommendations(),
+         }
+
+     def _get_health_status(self, score: float) -> str:
+         """Get health status from score."""
+         if score >= 90:
+             return "excellent"
+         elif score >= 75:
+             return "good"
+         elif score >= 60:
+             return "fair"
+         elif score >= 40:
+             return "poor"
+         else:
+             return "critical"
+
+     def _get_top_recommendations(self) -> List[str]:
+         """Get top recommendations."""
+         recommendations = []
+
+         # From patterns
+         for pattern in sorted(self.patterns, key=lambda p: p.confidence, reverse=True)[
+             :2
+         ]:
+             recommendations.extend(pattern.recommendations[:1])
+
+         # From bottlenecks
+         for bottleneck in sorted(
+             self.bottlenecks, key=lambda b: b.severity, reverse=True
+         )[:2]:
+             recommendations.extend(bottleneck.resolution[:1])
+
+         return recommendations[:5]  # Top 5 recommendations
+
+     def _calculate_priority(self, score: float) -> str:
+         """Calculate priority from score."""
+         if score >= 0.8:
+             return "critical"
+         elif score >= 0.6:
+             return "high"
+         elif score >= 0.4:
+             return "medium"
+         else:
+             return "low"