hqde 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hqde might be problematic.

@@ -0,0 +1,498 @@
+"""
+Dynamic load balancing module for the HQDE framework.
+
+This module implements intelligent workload distribution, performance monitoring,
+and adaptive task scheduling for optimal resource utilization.
+"""
+
+import torch
+import ray
+import numpy as np
+import psutil
+from typing import Dict, List, Optional, Tuple, Any, Callable
+import time
+import threading
+import logging
+from collections import defaultdict, deque
+
+
+@ray.remote
+class WorkerNode:
+    """Individual worker node with performance monitoring."""
+
+    def __init__(self, node_id: str, capabilities: Dict[str, Any]):
+        self.node_id = node_id
+        self.capabilities = capabilities
+        self.current_load = 0.0
+        self.task_queue = deque()
+        self.performance_history = deque(maxlen=100)
+        self.is_active = True
+
+    def get_system_metrics(self) -> Dict[str, float]:
+        """Get current system metrics for this node."""
+        return {
+            'cpu_percent': psutil.cpu_percent(interval=0.1),
+            'memory_percent': psutil.virtual_memory().percent,
+            'disk_usage_percent': psutil.disk_usage('/').percent if hasattr(psutil, 'disk_usage') else 0.0,
+            'load_average': psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else 0.0,
+            'current_task_load': self.current_load
+        }
+
+    def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute a task and return results with performance metrics."""
+        start_time = time.time()
+        task_id = task_data.get('task_id', 'unknown')
+        task_type = task_data.get('type', 'default')
+
+        try:
+            # Simulate task execution
+            self.current_load += task_data.get('estimated_load', 0.1)
+
+            # Process the task based on type
+            if task_type == 'weight_aggregation':
+                result = self._execute_weight_aggregation(task_data)
+            elif task_type == 'quantization':
+                result = self._execute_quantization(task_data)
+            elif task_type == 'ensemble_training':
+                result = self._execute_ensemble_training(task_data)
+            else:
+                result = {'status': 'completed', 'data': task_data.get('data', {})}
+
+            execution_time = time.time() - start_time
+
+            # Update performance history
+            self.performance_history.append({
+                'task_id': task_id,
+                'execution_time': execution_time,
+                'task_type': task_type,
+                'success': True,
+                'timestamp': time.time()
+            })
+
+            self.current_load = max(0.0, self.current_load - task_data.get('estimated_load', 0.1))
+
+            return {
+                'node_id': self.node_id,
+                'task_id': task_id,
+                'result': result,
+                'execution_time': execution_time,
+                'status': 'success'
+            }
+
+        except Exception as e:
+            execution_time = time.time() - start_time
+            self.performance_history.append({
+                'task_id': task_id,
+                'execution_time': execution_time,
+                'task_type': task_type,
+                'success': False,
+                'error': str(e),
+                'timestamp': time.time()
+            })
+
+            self.current_load = max(0.0, self.current_load - task_data.get('estimated_load', 0.1))
+
+            return {
+                'node_id': self.node_id,
+                'task_id': task_id,
+                'result': None,
+                'execution_time': execution_time,
+                'status': 'error',
+                'error': str(e)
+            }
+
+    def _execute_weight_aggregation(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute weight aggregation task."""
+        weights = task_data.get('weights', [])
+        if not weights:
+            return {'aggregated_weights': {}}
+
+        # Simple aggregation for demonstration
+        aggregated = {}
+        for param_name in weights[0].keys():
+            param_tensors = [w[param_name] for w in weights if param_name in w]
+            if param_tensors:
+                aggregated[param_name] = torch.stack(param_tensors).mean(dim=0)
+
+        return {'aggregated_weights': aggregated}
+
+    def _execute_quantization(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute quantization task."""
+        weights = task_data.get('weights', {})
+        quantized_weights = {}
+
+        for param_name, weight_tensor in weights.items():
+            # Simple quantization simulation
+            quantized_weights[param_name] = torch.round(weight_tensor * 255) / 255
+
+        return {'quantized_weights': quantized_weights}
+
+    def _execute_ensemble_training(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute ensemble training task."""
+        # Simulate training step
+        training_time = np.random.uniform(0.1, 1.0)
+        time.sleep(training_time / 10)  # Scaled down for simulation
+
+        return {
+            'training_loss': np.random.uniform(0.1, 2.0),
+            'accuracy': np.random.uniform(0.7, 0.95),
+            'training_time': training_time
+        }
+
+    def get_performance_stats(self) -> Dict[str, Any]:
+        """Get performance statistics for this node."""
+        if not self.performance_history:
+            return {
+                'node_id': self.node_id,
+                'avg_execution_time': 0.0,
+                'success_rate': 1.0,
+                'total_tasks': 0,
+                'current_load': self.current_load
+            }
+
+        successful_tasks = [task for task in self.performance_history if task['success']]
+        total_tasks = len(self.performance_history)
+
+        avg_execution_time = np.mean([task['execution_time'] for task in self.performance_history])
+        success_rate = len(successful_tasks) / total_tasks if total_tasks > 0 else 1.0
+
+        return {
+            'node_id': self.node_id,
+            'avg_execution_time': avg_execution_time,
+            'success_rate': success_rate,
+            'total_tasks': total_tasks,
+            'current_load': self.current_load,
+            'capabilities': self.capabilities
+        }
+
+    def set_active_status(self, is_active: bool):
+        """Set the active status of this node."""
+        self.is_active = is_active
+
+    def get_active_status(self) -> bool:
+        """Get the active status of this node."""
+        return self.is_active
+
+
+class DynamicLoadBalancer:
+    """Dynamic load balancer for HQDE distributed ensemble learning."""
+
+    def __init__(self,
+                 balancing_strategy: str = "adaptive",
+                 monitoring_interval: float = 5.0,
+                 load_threshold: float = 0.8):
+        """
+        Initialize dynamic load balancer.
+
+        Args:
+            balancing_strategy: Load balancing strategy ("round_robin", "least_loaded", "adaptive")
+            monitoring_interval: Interval in seconds between performance-monitoring sweeps
+            load_threshold: Load-imbalance threshold that triggers load redistribution
+        """
+        self.balancing_strategy = balancing_strategy
+        self.monitoring_interval = monitoring_interval
+        self.load_threshold = load_threshold
+
+        self.worker_nodes = {}
+        self.task_queue = deque()
+        self.task_history = defaultdict(list)
+        self.performance_predictor = PerformancePredictor()
+
+        # Monitoring thread
+        self.monitoring_active = False
+        self.monitoring_thread = None
+
+        # Load balancing metrics
+        self.balancing_metrics = {
+            'total_tasks_scheduled': 0,
+            'load_redistributions': 0,
+            'average_response_time': 0.0,
+            'node_utilization': {}
+        }
+
+    def add_worker_node(self, node_id: str, capabilities: Dict[str, Any]):
+        """Add a worker node to the load balancer."""
+        worker_node = WorkerNode.remote(node_id, capabilities)
+        self.worker_nodes[node_id] = worker_node
+        self.balancing_metrics['node_utilization'][node_id] = 0.0
+
+    def remove_worker_node(self, node_id: str):
+        """Remove a worker node from the load balancer."""
+        if node_id in self.worker_nodes:
+            # Set node as inactive first
+            ray.get(self.worker_nodes[node_id].set_active_status.remote(False))
+            del self.worker_nodes[node_id]
+            if node_id in self.balancing_metrics['node_utilization']:
+                del self.balancing_metrics['node_utilization'][node_id]
+
+    def schedule_task(self, task_data: Dict[str, Any]) -> str:
+        """Schedule a task for execution."""
+        if not self.worker_nodes:
+            raise RuntimeError("No worker nodes available")
+
+        # Select best node for the task
+        selected_node_id = self._select_node_for_task(task_data)
+
+        if selected_node_id is None:
+            raise RuntimeError("No suitable node found for task")
+
+        # Schedule task on selected node
+        task_future = self.worker_nodes[selected_node_id].execute_task.remote(task_data)
+
+        # Track task
+        task_id = task_data.get('task_id', f"task_{int(time.time())}")
+        self.task_history[selected_node_id].append({
+            'task_id': task_id,
+            'future': task_future,
+            'scheduled_time': time.time(),
+            'task_data': task_data
+        })
+
+        self.balancing_metrics['total_tasks_scheduled'] += 1
+
+        return task_id
+
+    def _select_node_for_task(self, task_data: Dict[str, Any]) -> Optional[str]:
+        """Select the best node for executing a task."""
+        if self.balancing_strategy == "round_robin":
+            return self._round_robin_selection()
+        elif self.balancing_strategy == "least_loaded":
+            return self._least_loaded_selection()
+        elif self.balancing_strategy == "adaptive":
+            return self._adaptive_selection(task_data)
+        else:
+            return self._round_robin_selection()
+
+    def _round_robin_selection(self) -> Optional[str]:
+        """Simple round-robin node selection."""
+        active_nodes = list(self.worker_nodes.keys())
+        if not active_nodes:
+            return None
+
+        # Simple counter-based round robin
+        selection_index = self.balancing_metrics['total_tasks_scheduled'] % len(active_nodes)
+        return active_nodes[selection_index]
+
+    def _least_loaded_selection(self) -> Optional[str]:
+        """Select the least loaded node."""
+        if not self.worker_nodes:
+            return None
+
+        # Get current load for all nodes
+        load_futures = {
+            node_id: node.get_system_metrics.remote()
+            for node_id, node in self.worker_nodes.items()
+        }
+
+        node_loads = {}
+        for node_id, future in load_futures.items():
+            try:
+                metrics = ray.get(future)
+                node_loads[node_id] = metrics.get('current_task_load', 0.0)
+            except Exception:
+                node_loads[node_id] = float('inf')  # Exclude failed nodes
+
+        # Select node with minimum load
+        return min(node_loads.keys(), key=lambda x: node_loads[x])
+
+    def _adaptive_selection(self, task_data: Dict[str, Any]) -> Optional[str]:
+        """Adaptive node selection based on task requirements and node capabilities."""
+        if not self.worker_nodes:
+            return None
+
+        # Get performance stats for all nodes
+        perf_futures = {
+            node_id: node.get_performance_stats.remote()
+            for node_id, node in self.worker_nodes.items()
+        }
+
+        node_scores = {}
+        for node_id, future in perf_futures.items():
+            try:
+                stats = ray.get(future)
+
+                # Calculate suitability score
+                score = self._calculate_node_suitability_score(task_data, stats)
+                node_scores[node_id] = score
+            except Exception:
+                node_scores[node_id] = 0.0  # Exclude failed nodes
+
+        if not node_scores:
+            return None
+
+        # Select node with highest suitability score
+        return max(node_scores.keys(), key=lambda x: node_scores[x])
+
+    def _calculate_node_suitability_score(self,
+                                          task_data: Dict[str, Any],
+                                          node_stats: Dict[str, Any]) -> float:
+        """Calculate suitability score for a node given a task."""
+        score = 0.0
+
+        # Factor in success rate
+        success_rate = node_stats.get('success_rate', 1.0)
+        score += success_rate * 0.4
+
+        # Factor in current load (lower is better)
+        current_load = node_stats.get('current_load', 0.0)
+        load_factor = max(0.0, 1.0 - current_load)
+        score += load_factor * 0.3
+
+        # Factor in average execution time (lower is better)
+        avg_time = node_stats.get('avg_execution_time', 1.0)
+        time_factor = max(0.0, 1.0 - min(avg_time / 10.0, 1.0))  # Normalize to 10 seconds max
+        score += time_factor * 0.2
+
+        # Factor in capabilities match
+        task_requirements = task_data.get('requirements', {})
+        node_capabilities = node_stats.get('capabilities', {})
+        capability_match = self._calculate_capability_match(task_requirements, node_capabilities)
+        score += capability_match * 0.1
+
+        return score
+
+    def _calculate_capability_match(self,
+                                    requirements: Dict[str, Any],
+                                    capabilities: Dict[str, Any]) -> float:
+        """Calculate how well node capabilities match task requirements."""
+        if not requirements:
+            return 1.0
+
+        matches = 0
+        total_requirements = 0
+
+        for req_key, req_value in requirements.items():
+            total_requirements += 1
+            if req_key in capabilities:
+                cap_value = capabilities[req_key]
+                if isinstance(req_value, (int, float)) and isinstance(cap_value, (int, float)):
+                    if cap_value >= req_value:
+                        matches += 1
+                elif req_value == cap_value:
+                    matches += 1
+
+        return matches / total_requirements if total_requirements > 0 else 1.0
+
+    def start_monitoring(self):
+        """Start performance monitoring."""
+        self.monitoring_active = True
+        self.monitoring_thread = threading.Thread(target=self._monitoring_loop)
+        self.monitoring_thread.daemon = True
+        self.monitoring_thread.start()
+
+    def stop_monitoring(self):
+        """Stop performance monitoring."""
+        self.monitoring_active = False
+        if self.monitoring_thread:
+            self.monitoring_thread.join()
+
+    def _monitoring_loop(self):
+        """Main monitoring loop."""
+        while self.monitoring_active:
+            try:
+                self._collect_performance_metrics()
+                self._check_load_balance()
+                time.sleep(self.monitoring_interval)
+            except Exception as e:
+                logging.error(f"Error in monitoring loop: {e}")
+
+    def _collect_performance_metrics(self):
+        """Collect performance metrics from all nodes."""
+        metric_futures = {
+            node_id: node.get_system_metrics.remote()
+            for node_id, node in self.worker_nodes.items()
+        }
+
+        for node_id, future in metric_futures.items():
+            try:
+                metrics = ray.get(future)
+                current_load = metrics.get('current_task_load', 0.0)
+                self.balancing_metrics['node_utilization'][node_id] = current_load
+            except Exception:
+                # Node might be unavailable
+                pass
+
+    def _check_load_balance(self):
+        """Check if load rebalancing is needed."""
+        node_loads = list(self.balancing_metrics['node_utilization'].values())
+
+        if len(node_loads) < 2:
+            return
+
+        max_load = max(node_loads)
+        min_load = min(node_loads)
+        load_imbalance = max_load - min_load
+
+        if load_imbalance > self.load_threshold:
+            self._rebalance_load()
+
+    def _rebalance_load(self):
+        """Perform load rebalancing."""
+        # This is a simplified rebalancing strategy
+        # In practice, this would involve more sophisticated task migration
+        self.balancing_metrics['load_redistributions'] += 1
+        logging.info("Load rebalancing triggered")
+
+    def get_balancing_statistics(self) -> Dict[str, Any]:
+        """Get load balancing statistics."""
+        # Calculate average response time
+        all_tasks = []
+        for node_tasks in self.task_history.values():
+            all_tasks.extend(node_tasks)
+
+        if all_tasks:
+            # This is simplified - in practice, you'd track completion times
+            self.balancing_metrics['average_response_time'] = 1.0  # Placeholder
+
+        return {
+            'balancing_metrics': self.balancing_metrics.copy(),
+            'active_nodes': len(self.worker_nodes),
+            'balancing_strategy': self.balancing_strategy,
+            'monitoring_interval': self.monitoring_interval,
+            'load_threshold': self.load_threshold
+        }
+
+    def cleanup(self):
+        """Cleanup load balancer resources."""
+        self.stop_monitoring()
+        # Ray will automatically clean up remote actors
+
+
+class PerformancePredictor:
+    """Simple performance predictor for task scheduling."""
+
+    def __init__(self):
+        self.task_performance_history = defaultdict(list)
+
+    def predict_completion_time(self,
+                                task_features: Dict[str, Any],
+                                node_features: Dict[str, Any]) -> float:
+        """Predict task completion time based on features."""
+        # Simplified prediction based on task type and node performance
+        task_type = task_features.get('type', 'default')
+        node_avg_time = node_features.get('avg_execution_time', 1.0)
+
+        # Task type multipliers
+        type_multipliers = {
+            'weight_aggregation': 0.5,
+            'quantization': 0.8,
+            'ensemble_training': 2.0,
+            'default': 1.0
+        }
+
+        base_time = type_multipliers.get(task_type, 1.0)
+        predicted_time = base_time * node_avg_time
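+        # For example, an 'ensemble_training' task (multiplier 2.0) scheduled on a
+        # node averaging 0.5 s per task is predicted to take 1.0 s.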
+
+        return predicted_time
+
+    def update_performance_history(self,
+                                   task_features: Dict[str, Any],
+                                   actual_time: float):
+        """Update performance history with actual completion time."""
+        task_type = task_features.get('type', 'default')
+        self.task_performance_history[task_type].append(actual_time)
+
+        # Keep only recent history
+        if len(self.task_performance_history[task_type]) > 100:
+            self.task_performance_history[task_type] = \
+                self.task_performance_history[task_type][-100:]
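
For reference, a minimal usage sketch of the API defined in this file, assuming a local Ray runtime and that DynamicLoadBalancer has been imported from this module; the node IDs, capabilities, and task payload below are illustrative:

import ray

ray.init(ignore_reinit_error=True)

balancer = DynamicLoadBalancer(balancing_strategy="adaptive",
                               monitoring_interval=5.0,
                               load_threshold=0.8)
balancer.add_worker_node("node-0", {"memory_gb": 16, "gpu": 1})
balancer.add_worker_node("node-1", {"memory_gb": 8, "gpu": 0})
balancer.start_monitoring()

# Schedule a quantization task; the adaptive strategy picks the most suitable node.
task_id = balancer.schedule_task({
    "task_id": "quant-001",
    "type": "quantization",
    "estimated_load": 0.2,
    "requirements": {"memory_gb": 8},
    "weights": {},  # parameter-name -> tensor mapping in real use
})

print(task_id)
print(balancer.get_balancing_statistics())

balancer.cleanup()
ray.shutdown()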