hqde-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hqde might be problematic.

@@ -0,0 +1,465 @@
+"""
+Performance monitoring utilities for HQDE framework.
+
+This module provides comprehensive performance monitoring, metrics collection,
+and system resource tracking for distributed ensemble learning.
+"""
+
+import torch
+import psutil
+import time
+import threading
+import numpy as np
+from typing import Dict, List, Optional, Any, Callable
+from collections import defaultdict, deque
+import logging
+import json
+
+
+class SystemMetrics:
+    """Container for system performance metrics."""
+
+    def __init__(self):
+        self.timestamp = time.time()
+        self.cpu_percent = 0.0
+        self.memory_percent = 0.0
+        self.memory_used_gb = 0.0
+        self.disk_io_read_mb = 0.0
+        self.disk_io_write_mb = 0.0
+        self.network_sent_mb = 0.0
+        self.network_recv_mb = 0.0
+        self.gpu_memory_used_gb = 0.0
+        self.gpu_utilization = 0.0
+        self.load_average = 0.0
+
+    def to_dict(self) -> Dict[str, float]:
+        """Convert metrics to dictionary."""
+        return {
+            'timestamp': self.timestamp,
+            'cpu_percent': self.cpu_percent,
+            'memory_percent': self.memory_percent,
+            'memory_used_gb': self.memory_used_gb,
+            'disk_io_read_mb': self.disk_io_read_mb,
+            'disk_io_write_mb': self.disk_io_write_mb,
+            'network_sent_mb': self.network_sent_mb,
+            'network_recv_mb': self.network_recv_mb,
+            'gpu_memory_used_gb': self.gpu_memory_used_gb,
+            'gpu_utilization': self.gpu_utilization,
+            'load_average': self.load_average
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, float]) -> 'SystemMetrics':
+        """Create SystemMetrics from dictionary."""
+        metrics = cls()
+        for key, value in data.items():
+            if hasattr(metrics, key):
+                setattr(metrics, key, value)
+        return metrics
+
+
+class PerformanceMonitor:
+    """Comprehensive performance monitor for HQDE systems."""
+
+    def __init__(self,
+                 monitoring_interval: float = 1.0,
+                 history_size: int = 1000,
+                 enable_gpu_monitoring: bool = True):
+        """
+        Initialize performance monitor.
+
+        Args:
+            monitoring_interval: Interval between metric collections (seconds)
+            history_size: Maximum number of historical metrics to keep
+            enable_gpu_monitoring: Whether to monitor GPU metrics
+        """
+        self.monitoring_interval = monitoring_interval
+        self.history_size = history_size
+        self.enable_gpu_monitoring = enable_gpu_monitoring
+
+        # Metrics storage
+        self.metrics_history = deque(maxlen=history_size)
+        self.ensemble_metrics = defaultdict(list)
+        self.training_metrics = defaultdict(list)
+
+        # Monitoring state
+        self.is_monitoring = False
+        self.monitoring_thread = None
+
+        # Performance baselines
+        self.baseline_metrics = None
+        self.performance_alerts = []
+
+        # Event tracking
+        self.events = []
+        self.custom_metrics = defaultdict(list)
+
+        # Initialize baseline
+        self._establish_baseline()
+
+    def _establish_baseline(self):
+        """Establish performance baseline."""
+        baseline_samples = []
+        for _ in range(10):
+            metrics = self._collect_system_metrics()
+            baseline_samples.append(metrics)
+            time.sleep(0.1)
+
+        if baseline_samples:
+            self.baseline_metrics = self._calculate_average_metrics(baseline_samples)
+
+    def _collect_system_metrics(self) -> SystemMetrics:
+        """Collect current system metrics."""
+        metrics = SystemMetrics()
+
+        try:
+            # CPU metrics
+            metrics.cpu_percent = psutil.cpu_percent(interval=0.1)
+
+            # Memory metrics
+            memory = psutil.virtual_memory()
+            metrics.memory_percent = memory.percent
+            metrics.memory_used_gb = memory.used / (1024**3)
+
+            # Disk I/O metrics
+            disk_io = psutil.disk_io_counters()
+            if disk_io:
+                metrics.disk_io_read_mb = disk_io.read_bytes / (1024**2)
+                metrics.disk_io_write_mb = disk_io.write_bytes / (1024**2)
+
+            # Network metrics
+            network_io = psutil.net_io_counters()
+            if network_io:
+                metrics.network_sent_mb = network_io.bytes_sent / (1024**2)
+                metrics.network_recv_mb = network_io.bytes_recv / (1024**2)
+
+            # Load average (Unix-like systems)
+            if hasattr(psutil, 'getloadavg'):
+                metrics.load_average = psutil.getloadavg()[0]
+
+            # GPU metrics
+            if self.enable_gpu_monitoring and torch.cuda.is_available():
+                try:
+                    metrics.gpu_memory_used_gb = torch.cuda.memory_allocated() / (1024**3)
+                    # GPU utilization would require nvidia-ml-py or similar
+                    metrics.gpu_utilization = 0.0  # Placeholder
+                except Exception:
+                    pass
+
+        except Exception as e:
+            logging.warning(f"Error collecting system metrics: {e}")
+
+        return metrics
+
+    def _calculate_average_metrics(self, metrics_list: List[SystemMetrics]) -> SystemMetrics:
+        """Calculate average of multiple SystemMetrics."""
+        if not metrics_list:
+            return SystemMetrics()
+
+        avg_metrics = SystemMetrics()
+        fields = ['cpu_percent', 'memory_percent', 'memory_used_gb',
+                  'disk_io_read_mb', 'disk_io_write_mb', 'network_sent_mb',
+                  'network_recv_mb', 'gpu_memory_used_gb', 'gpu_utilization',
+                  'load_average']
+
+        for field in fields:
+            values = [getattr(m, field) for m in metrics_list]
+            setattr(avg_metrics, field, np.mean(values))
+
+        return avg_metrics
+
+    def start_monitoring(self):
+        """Start continuous performance monitoring."""
+        if self.is_monitoring:
+            return
+
+        self.is_monitoring = True
+        self.monitoring_thread = threading.Thread(target=self._monitoring_loop)
+        self.monitoring_thread.daemon = True
+        self.monitoring_thread.start()
+
+        logging.info("Performance monitoring started")
+
+    def stop_monitoring(self):
+        """Stop performance monitoring."""
+        self.is_monitoring = False
+        if self.monitoring_thread:
+            self.monitoring_thread.join()
+
+        logging.info("Performance monitoring stopped")
+
+    def _monitoring_loop(self):
+        """Main monitoring loop."""
+        while self.is_monitoring:
+            try:
+                metrics = self._collect_system_metrics()
+                self.metrics_history.append(metrics)
+
+                # Check for performance alerts
+                self._check_performance_alerts(metrics)
+
+                time.sleep(self.monitoring_interval)
+            except Exception as e:
+                logging.error(f"Error in monitoring loop: {e}")
+
+    def _check_performance_alerts(self, metrics: SystemMetrics):
+        """Check for performance alerts based on current metrics."""
+        alerts = []
+
+        # High CPU usage alert
+        if metrics.cpu_percent > 90:
+            alerts.append({
+                'type': 'high_cpu',
+                'message': f'High CPU usage: {metrics.cpu_percent:.1f}%',
+                'timestamp': metrics.timestamp,
+                'severity': 'warning'
+            })
+
+        # High memory usage alert
+        if metrics.memory_percent > 85:
+            alerts.append({
+                'type': 'high_memory',
+                'message': f'High memory usage: {metrics.memory_percent:.1f}%',
+                'timestamp': metrics.timestamp,
+                'severity': 'warning'
+            })
+
+        # GPU memory alert
+        if metrics.gpu_memory_used_gb > 8:  # Assuming 8GB+ is high usage
+            alerts.append({
+                'type': 'high_gpu_memory',
+                'message': f'High GPU memory usage: {metrics.gpu_memory_used_gb:.1f}GB',
+                'timestamp': metrics.timestamp,
+                'severity': 'warning'
+            })
+
+        # Add alerts to history
+        self.performance_alerts.extend(alerts)
+
+        # Keep only recent alerts
+        cutoff_time = time.time() - 3600  # Last hour
+        self.performance_alerts = [
+            alert for alert in self.performance_alerts
+            if alert['timestamp'] > cutoff_time
+        ]
+
+    def record_ensemble_metric(self,
+                               metric_name: str,
+                               value: float,
+                               ensemble_id: Optional[str] = None,
+                               metadata: Optional[Dict[str, Any]] = None):
+        """Record an ensemble-specific metric."""
+        metric_data = {
+            'timestamp': time.time(),
+            'value': value,
+            'ensemble_id': ensemble_id,
+            'metadata': metadata or {}
+        }
+
+        self.ensemble_metrics[metric_name].append(metric_data)
+
+        # Keep only recent metrics
+        if len(self.ensemble_metrics[metric_name]) > self.history_size:
+            self.ensemble_metrics[metric_name] = \
+                self.ensemble_metrics[metric_name][-self.history_size:]
+
+    def record_training_metric(self,
+                               metric_name: str,
+                               value: float,
+                               epoch: Optional[int] = None,
+                               batch: Optional[int] = None):
+        """Record a training-specific metric."""
+        metric_data = {
+            'timestamp': time.time(),
+            'value': value,
+            'epoch': epoch,
+            'batch': batch
+        }
+
+        self.training_metrics[metric_name].append(metric_data)
+
+        # Keep only recent metrics
+        if len(self.training_metrics[metric_name]) > self.history_size:
+            self.training_metrics[metric_name] = \
+                self.training_metrics[metric_name][-self.history_size:]
+
+    def record_custom_metric(self,
+                             metric_name: str,
+                             value: Any,
+                             tags: Optional[Dict[str, str]] = None):
+        """Record a custom metric."""
+        metric_data = {
+            'timestamp': time.time(),
+            'value': value,
+            'tags': tags or {}
+        }
+
+        self.custom_metrics[metric_name].append(metric_data)
+
+        # Keep only recent metrics
+        if len(self.custom_metrics[metric_name]) > self.history_size:
+            self.custom_metrics[metric_name] = \
+                self.custom_metrics[metric_name][-self.history_size:]
+
+    def record_event(self,
+                     event_type: str,
+                     description: str,
+                     metadata: Optional[Dict[str, Any]] = None):
+        """Record a system event."""
+        event = {
+            'timestamp': time.time(),
+            'type': event_type,
+            'description': description,
+            'metadata': metadata or {}
+        }
+
+        self.events.append(event)
+
+        # Keep only recent events
+        if len(self.events) > self.history_size:
+            self.events = self.events[-self.history_size:]
+
+    def get_current_metrics(self) -> Optional[SystemMetrics]:
+        """Get the most recent system metrics."""
+        if self.metrics_history:
+            return self.metrics_history[-1]
+        return None
+
+    def get_metrics_summary(self, window_minutes: float = 60) -> Dict[str, Any]:
+        """Get summary statistics for metrics within a time window."""
+        cutoff_time = time.time() - (window_minutes * 60)
+        recent_metrics = [
+            m for m in self.metrics_history
+            if m.timestamp > cutoff_time
+        ]
+
+        if not recent_metrics:
+            return {}
+
+        summary = {}
+        fields = ['cpu_percent', 'memory_percent', 'memory_used_gb',
+                  'gpu_memory_used_gb', 'load_average']
+
+        for field in fields:
+            values = [getattr(m, field) for m in recent_metrics]
+            summary[field] = {
+                'mean': np.mean(values),
+                'std': np.std(values),
+                'min': np.min(values),
+                'max': np.max(values),
+                'current': values[-1] if values else 0
+            }
+
+        return summary
+
+    def get_performance_report(self) -> Dict[str, Any]:
+        """Generate comprehensive performance report."""
+        current_metrics = self.get_current_metrics()
+        metrics_summary = self.get_metrics_summary()
+
+        # Calculate performance compared to baseline
+        performance_comparison = {}
+        if self.baseline_metrics and current_metrics:
+            fields = ['cpu_percent', 'memory_percent', 'gpu_memory_used_gb']
+            for field in fields:
+                baseline_val = getattr(self.baseline_metrics, field)
+                current_val = getattr(current_metrics, field)
+                if baseline_val > 0:
+                    change_percent = ((current_val - baseline_val) / baseline_val) * 100
+                    performance_comparison[field] = {
+                        'baseline': baseline_val,
+                        'current': current_val,
+                        'change_percent': change_percent
+                    }
+
+        return {
+            'current_metrics': current_metrics.to_dict() if current_metrics else {},
+            'metrics_summary': metrics_summary,
+            'performance_comparison': performance_comparison,
+            'recent_alerts': self.performance_alerts[-10:],  # Last 10 alerts
+            'ensemble_metrics_count': {k: len(v) for k, v in self.ensemble_metrics.items()},
+            'training_metrics_count': {k: len(v) for k, v in self.training_metrics.items()},
+            'recent_events': self.events[-10:],  # Last 10 events
+            'monitoring_status': self.is_monitoring
+        }
+
+    def export_metrics(self, filepath: str, format: str = 'json'):
+        """Export metrics to file."""
+        if format.lower() == 'json':
+            self._export_json(filepath)
+        elif format.lower() == 'csv':
+            self._export_csv(filepath)
+        else:
+            raise ValueError(f"Unsupported export format: {format}")
+
+    def _export_json(self, filepath: str):
+        """Export metrics to JSON file."""
+        export_data = {
+            'system_metrics': [m.to_dict() for m in self.metrics_history],
+            'ensemble_metrics': dict(self.ensemble_metrics),
+            'training_metrics': dict(self.training_metrics),
+            'custom_metrics': dict(self.custom_metrics),
+            'events': self.events,
+            'performance_alerts': self.performance_alerts
+        }
+
+        with open(filepath, 'w') as f:
+            json.dump(export_data, f, indent=2, default=str)
+
+    def _export_csv(self, filepath: str):
+        """Export system metrics to CSV file."""
+        import csv
+
+        with open(filepath, 'w', newline='') as f:
+            if not self.metrics_history:
+                return
+
+            fieldnames = self.metrics_history[0].to_dict().keys()
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+
+            writer.writeheader()
+            for metrics in self.metrics_history:
+                writer.writerow(metrics.to_dict())
+
+    def cleanup(self):
+        """Cleanup monitoring resources."""
+        self.stop_monitoring()
+        self.metrics_history.clear()
+        self.ensemble_metrics.clear()
+        self.training_metrics.clear()
+        self.custom_metrics.clear()
+        self.events.clear()
+        self.performance_alerts.clear()
+
+
+class TimingContext:
+    """Context manager for measuring execution time."""
+
+    def __init__(self, monitor: PerformanceMonitor, metric_name: str, **kwargs):
+        self.monitor = monitor
+        self.metric_name = metric_name
+        self.kwargs = kwargs
+        self.start_time = None
+
+    def __enter__(self):
+        self.start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.start_time is not None:
+            execution_time = time.time() - self.start_time
+            self.monitor.record_custom_metric(
+                self.metric_name,
+                execution_time,
+                tags={'unit': 'seconds', **self.kwargs}
+            )
+
+
+def monitor_performance(monitor: PerformanceMonitor, metric_name: str):
+    """Decorator for monitoring function performance."""
+    def decorator(func: Callable):
+        def wrapper(*args, **kwargs):
+            with TimingContext(monitor, f"{metric_name}_execution_time"):
+                return func(*args, **kwargs)
+        return wrapper
+    return decorator
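
Usage sketch (not part of the packaged file above): a minimal example of how the monitor, timing context, and decorator appear intended to fit together. The import path is an assumption, since the diff does not show where this module lives inside the package; adjust it to wherever hqde actually exposes these names.

    # Hypothetical import path -- adjust to the real module location in the hqde package.
    from hqde.utils.performance import PerformanceMonitor, TimingContext, monitor_performance
    import time

    monitor = PerformanceMonitor(monitoring_interval=0.5, history_size=200)
    monitor.start_monitoring()

    # Time an arbitrary block of work; the duration is stored as a custom metric.
    with TimingContext(monitor, "aggregation_time", phase="demo"):
        time.sleep(0.2)

    # Wrapped functions record a "<name>_execution_time" custom metric on every call.
    @monitor_performance(monitor, "train_step")
    def train_step():
        time.sleep(0.05)

    train_step()

    # Record an ensemble-level metric by hand, then inspect and export everything.
    monitor.record_ensemble_metric("ensemble_accuracy", 0.91, ensemble_id="demo-0")
    time.sleep(2)  # let the background thread collect a few samples
    print(monitor.get_performance_report()["metrics_summary"])
    monitor.export_metrics("hqde_metrics.json", format="json")
    monitor.cleanup()
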
@@ -0,0 +1,9 @@
+"""
+Visualization utilities for HQDE framework.
+
+Placeholder implementation for visualization components.
+"""
+
+class HQDEVisualizer:
+    """Placeholder HQDEVisualizer class."""
+    pass