hqde-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hqde might be problematic.
- hqde/__init__.py +62 -0
- hqde/__main__.py +0 -0
- hqde/core/__init__.py +23 -0
- hqde/core/hqde_system.py +380 -0
- hqde/distributed/__init__.py +18 -0
- hqde/distributed/fault_tolerance.py +346 -0
- hqde/distributed/hierarchical_aggregator.py +399 -0
- hqde/distributed/load_balancer.py +498 -0
- hqde/distributed/mapreduce_ensemble.py +394 -0
- hqde/py.typed +0 -0
- hqde/quantum/__init__.py +17 -0
- hqde/quantum/quantum_aggregator.py +291 -0
- hqde/quantum/quantum_noise.py +284 -0
- hqde/quantum/quantum_optimization.py +336 -0
- hqde/utils/__init__.py +20 -0
- hqde/utils/config_manager.py +9 -0
- hqde/utils/data_utils.py +13 -0
- hqde/utils/performance_monitor.py +465 -0
- hqde/utils/visualization.py +9 -0
- hqde-0.1.0.dist-info/METADATA +237 -0
- hqde-0.1.0.dist-info/RECORD +24 -0
- hqde-0.1.0.dist-info/WHEEL +5 -0
- hqde-0.1.0.dist-info/licenses/LICENSE +21 -0
- hqde-0.1.0.dist-info/top_level.txt +1 -0
hqde/distributed/load_balancer.py
@@ -0,0 +1,498 @@

"""
Dynamic load balancing module for HQDE framework.

This module implements intelligent workload distribution, performance monitoring,
and adaptive task scheduling for optimal resource utilization.
"""

import torch
import ray
import numpy as np
import psutil
from typing import Dict, List, Optional, Tuple, Any, Callable
import time
import threading
import logging
from collections import defaultdict, deque


@ray.remote
class WorkerNode:
    """Individual worker node with performance monitoring."""

    def __init__(self, node_id: str, capabilities: Dict[str, Any]):
        self.node_id = node_id
        self.capabilities = capabilities
        self.current_load = 0.0
        self.task_queue = deque()
        self.performance_history = deque(maxlen=100)
        self.is_active = True

    def get_system_metrics(self) -> Dict[str, float]:
        """Get current system metrics for this node."""
        return {
            'cpu_percent': psutil.cpu_percent(interval=0.1),
            # Note: this reports disk usage of '/', not true I/O utilization
            'disk_io_percent': psutil.disk_usage('/').percent if hasattr(psutil, 'disk_usage') else 0.0,
            'memory_percent': psutil.virtual_memory().percent,
            'load_average': psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else 0.0,
            'current_task_load': self.current_load
        }

    def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute a task and return results with performance metrics."""
        start_time = time.time()
        task_id = task_data.get('task_id', 'unknown')
        # Resolve the task type up front so the except block can record it safely
        task_type = task_data.get('type', 'default')

        try:
            # Simulate task execution
            self.current_load += task_data.get('estimated_load', 0.1)

            # Process the task based on type
            if task_type == 'weight_aggregation':
                result = self._execute_weight_aggregation(task_data)
            elif task_type == 'quantization':
                result = self._execute_quantization(task_data)
            elif task_type == 'ensemble_training':
                result = self._execute_ensemble_training(task_data)
            else:
                result = {'status': 'completed', 'data': task_data.get('data', {})}

            execution_time = time.time() - start_time

            # Update performance history
            self.performance_history.append({
                'task_id': task_id,
                'execution_time': execution_time,
                'task_type': task_type,
                'success': True,
                'timestamp': time.time()
            })

            self.current_load = max(0.0, self.current_load - task_data.get('estimated_load', 0.1))

            return {
                'node_id': self.node_id,
                'task_id': task_id,
                'result': result,
                'execution_time': execution_time,
                'status': 'success'
            }

        except Exception as e:
            execution_time = time.time() - start_time
            self.performance_history.append({
                'task_id': task_id,
                'execution_time': execution_time,
                'task_type': task_type,
                'success': False,
                'error': str(e),
                'timestamp': time.time()
            })

            self.current_load = max(0.0, self.current_load - task_data.get('estimated_load', 0.1))

            return {
                'node_id': self.node_id,
                'task_id': task_id,
                'result': None,
                'execution_time': execution_time,
                'status': 'error',
                'error': str(e)
            }

    def _execute_weight_aggregation(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute weight aggregation task."""
        weights = task_data.get('weights', [])
        if not weights:
            return {'aggregated_weights': {}}

        # Simple aggregation for demonstration
        aggregated = {}
        for param_name in weights[0].keys():
            param_tensors = [w[param_name] for w in weights if param_name in w]
            if param_tensors:
                aggregated[param_name] = torch.stack(param_tensors).mean(dim=0)

        return {'aggregated_weights': aggregated}

    def _execute_quantization(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute quantization task."""
        weights = task_data.get('weights', {})
        quantized_weights = {}

        for param_name, weight_tensor in weights.items():
            # Simple quantization simulation
            quantized_weights[param_name] = torch.round(weight_tensor * 255) / 255

        return {'quantized_weights': quantized_weights}

    def _execute_ensemble_training(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute ensemble training task."""
        # Simulate training step
        training_time = np.random.uniform(0.1, 1.0)
        time.sleep(training_time / 10)  # Scaled down for simulation

        return {
            'training_loss': np.random.uniform(0.1, 2.0),
            'accuracy': np.random.uniform(0.7, 0.95),
            'training_time': training_time
        }

    def get_performance_stats(self) -> Dict[str, Any]:
        """Get performance statistics for this node."""
        if not self.performance_history:
            return {
                'node_id': self.node_id,
                'avg_execution_time': 0.0,
                'success_rate': 1.0,
                'total_tasks': 0,
                'current_load': self.current_load
            }

        successful_tasks = [task for task in self.performance_history if task['success']]
        total_tasks = len(self.performance_history)

        avg_execution_time = np.mean([task['execution_time'] for task in self.performance_history])
        success_rate = len(successful_tasks) / total_tasks if total_tasks > 0 else 1.0

        return {
            'node_id': self.node_id,
            'avg_execution_time': avg_execution_time,
            'success_rate': success_rate,
            'total_tasks': total_tasks,
            'current_load': self.current_load,
            'capabilities': self.capabilities
        }

    def set_active_status(self, is_active: bool):
        """Set the active status of this node."""
        self.is_active = is_active

    def get_active_status(self) -> bool:
        """Get the active status of this node."""
        return self.is_active


class DynamicLoadBalancer:
    """Dynamic load balancer for HQDE distributed ensemble learning."""

    def __init__(self,
                 balancing_strategy: str = "adaptive",
                 monitoring_interval: float = 5.0,
                 load_threshold: float = 0.8):
        """
        Initialize dynamic load balancer.

        Args:
            balancing_strategy: Load balancing strategy ("round_robin", "least_loaded", "adaptive")
            monitoring_interval: Interval for monitoring node performance
            load_threshold: Threshold for triggering load redistribution
        """
        self.balancing_strategy = balancing_strategy
        self.monitoring_interval = monitoring_interval
        self.load_threshold = load_threshold

        self.worker_nodes = {}
        self.task_queue = deque()
        self.task_history = defaultdict(list)
        self.performance_predictor = PerformancePredictor()

        # Monitoring thread
        self.monitoring_active = False
        self.monitoring_thread = None

        # Load balancing metrics
        self.balancing_metrics = {
            'total_tasks_scheduled': 0,
            'load_redistributions': 0,
            'average_response_time': 0.0,
            'node_utilization': {}
        }

    def add_worker_node(self, node_id: str, capabilities: Dict[str, Any]):
        """Add a worker node to the load balancer."""
        worker_node = WorkerNode.remote(node_id, capabilities)
        self.worker_nodes[node_id] = worker_node
        self.balancing_metrics['node_utilization'][node_id] = 0.0

    def remove_worker_node(self, node_id: str):
        """Remove a worker node from the load balancer."""
        if node_id in self.worker_nodes:
            # Set node as inactive first
            ray.get(self.worker_nodes[node_id].set_active_status.remote(False))
            del self.worker_nodes[node_id]
            if node_id in self.balancing_metrics['node_utilization']:
                del self.balancing_metrics['node_utilization'][node_id]

    def schedule_task(self, task_data: Dict[str, Any]) -> str:
        """Schedule a task for execution."""
        if not self.worker_nodes:
            raise RuntimeError("No worker nodes available")

        # Select best node for the task
        selected_node_id = self._select_node_for_task(task_data)

        if selected_node_id is None:
            raise RuntimeError("No suitable node found for task")

        # Schedule task on selected node
        task_future = self.worker_nodes[selected_node_id].execute_task.remote(task_data)

        # Track task
        task_id = task_data.get('task_id', f"task_{int(time.time())}")
        self.task_history[selected_node_id].append({
            'task_id': task_id,
            'future': task_future,
            'scheduled_time': time.time(),
            'task_data': task_data
        })

        self.balancing_metrics['total_tasks_scheduled'] += 1

        return task_id

    def _select_node_for_task(self, task_data: Dict[str, Any]) -> Optional[str]:
        """Select the best node for executing a task."""
        if self.balancing_strategy == "round_robin":
            return self._round_robin_selection()
        elif self.balancing_strategy == "least_loaded":
            return self._least_loaded_selection()
        elif self.balancing_strategy == "adaptive":
            return self._adaptive_selection(task_data)
        else:
            return self._round_robin_selection()

    def _round_robin_selection(self) -> Optional[str]:
        """Simple round-robin node selection."""
        active_nodes = list(self.worker_nodes.keys())
        if not active_nodes:
            return None

        # Simple counter-based round robin
        selection_index = self.balancing_metrics['total_tasks_scheduled'] % len(active_nodes)
        return active_nodes[selection_index]

    def _least_loaded_selection(self) -> Optional[str]:
        """Select the least loaded node."""
        if not self.worker_nodes:
            return None

        # Get current load for all nodes
        load_futures = {
            node_id: node.get_system_metrics.remote()
            for node_id, node in self.worker_nodes.items()
        }

        node_loads = {}
        for node_id, future in load_futures.items():
            try:
                metrics = ray.get(future)
                node_loads[node_id] = metrics.get('current_task_load', 0.0)
            except Exception:
                node_loads[node_id] = float('inf')  # Exclude failed nodes

        # Select node with minimum load
        return min(node_loads.keys(), key=lambda x: node_loads[x])

    def _adaptive_selection(self, task_data: Dict[str, Any]) -> Optional[str]:
        """Adaptive node selection based on task requirements and node capabilities."""
        if not self.worker_nodes:
            return None

        # Get performance stats for all nodes
        perf_futures = {
            node_id: node.get_performance_stats.remote()
            for node_id, node in self.worker_nodes.items()
        }

        node_scores = {}
        for node_id, future in perf_futures.items():
            try:
                stats = ray.get(future)

                # Calculate suitability score
                score = self._calculate_node_suitability_score(task_data, stats)
                node_scores[node_id] = score
            except Exception:
                node_scores[node_id] = 0.0  # Exclude failed nodes

        if not node_scores:
            return None

        # Select node with highest suitability score
        return max(node_scores.keys(), key=lambda x: node_scores[x])

    def _calculate_node_suitability_score(self,
                                          task_data: Dict[str, Any],
                                          node_stats: Dict[str, Any]) -> float:
        """Calculate suitability score for a node given a task."""
        score = 0.0

        # Factor in success rate
        success_rate = node_stats.get('success_rate', 1.0)
        score += success_rate * 0.4

        # Factor in current load (lower is better)
        current_load = node_stats.get('current_load', 0.0)
        load_factor = max(0.0, 1.0 - current_load)
        score += load_factor * 0.3

        # Factor in average execution time (lower is better)
        avg_time = node_stats.get('avg_execution_time', 1.0)
        time_factor = max(0.0, 1.0 - min(avg_time / 10.0, 1.0))  # Normalize to 10 seconds max
        score += time_factor * 0.2

        # Factor in capabilities match
        task_requirements = task_data.get('requirements', {})
        node_capabilities = node_stats.get('capabilities', {})
        capability_match = self._calculate_capability_match(task_requirements, node_capabilities)
        score += capability_match * 0.1

        return score

    def _calculate_capability_match(self,
                                    requirements: Dict[str, Any],
                                    capabilities: Dict[str, Any]) -> float:
        """Calculate how well node capabilities match task requirements."""
        if not requirements:
            return 1.0

        matches = 0
        total_requirements = 0

        for req_key, req_value in requirements.items():
            total_requirements += 1
            if req_key in capabilities:
                cap_value = capabilities[req_key]
                if isinstance(req_value, (int, float)) and isinstance(cap_value, (int, float)):
                    if cap_value >= req_value:
                        matches += 1
                elif req_value == cap_value:
                    matches += 1

        return matches / total_requirements if total_requirements > 0 else 1.0

    def start_monitoring(self):
        """Start performance monitoring."""
        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(target=self._monitoring_loop)
        self.monitoring_thread.daemon = True
        self.monitoring_thread.start()

    def stop_monitoring(self):
        """Stop performance monitoring."""
        self.monitoring_active = False
        if self.monitoring_thread:
            self.monitoring_thread.join()

    def _monitoring_loop(self):
        """Main monitoring loop."""
        while self.monitoring_active:
            try:
                self._collect_performance_metrics()
                self._check_load_balance()
                time.sleep(self.monitoring_interval)
            except Exception as e:
                logging.error(f"Error in monitoring loop: {e}")

    def _collect_performance_metrics(self):
        """Collect performance metrics from all nodes."""
        metric_futures = {
            node_id: node.get_system_metrics.remote()
            for node_id, node in self.worker_nodes.items()
        }

        for node_id, future in metric_futures.items():
            try:
                metrics = ray.get(future)
                current_load = metrics.get('current_task_load', 0.0)
                self.balancing_metrics['node_utilization'][node_id] = current_load
            except Exception:
                # Node might be unavailable
                pass

    def _check_load_balance(self):
        """Check if load rebalancing is needed."""
        node_loads = list(self.balancing_metrics['node_utilization'].values())

        if len(node_loads) < 2:
            return

        max_load = max(node_loads)
        min_load = min(node_loads)
        load_imbalance = max_load - min_load

        if load_imbalance > self.load_threshold:
            self._rebalance_load()

    def _rebalance_load(self):
        """Perform load rebalancing."""
        # This is a simplified rebalancing strategy
        # In practice, this would involve more sophisticated task migration
        self.balancing_metrics['load_redistributions'] += 1
        logging.info("Load rebalancing triggered")

    def get_balancing_statistics(self) -> Dict[str, Any]:
        """Get load balancing statistics."""
        # Calculate average response time
        all_tasks = []
        for node_tasks in self.task_history.values():
            all_tasks.extend(node_tasks)

        if all_tasks:
            # This is simplified - in practice, you'd track completion times
            self.balancing_metrics['average_response_time'] = 1.0  # Placeholder

        return {
            'balancing_metrics': self.balancing_metrics.copy(),
            'active_nodes': len(self.worker_nodes),
            'balancing_strategy': self.balancing_strategy,
            'monitoring_interval': self.monitoring_interval,
            'load_threshold': self.load_threshold
        }

    def cleanup(self):
        """Cleanup load balancer resources."""
        self.stop_monitoring()
        # Ray will automatically clean up remote actors


class PerformancePredictor:
    """Simple performance predictor for task scheduling."""

    def __init__(self):
        self.task_performance_history = defaultdict(list)

    def predict_completion_time(self,
                                task_features: Dict[str, Any],
                                node_features: Dict[str, Any]) -> float:
        """Predict task completion time based on features."""
        # Simplified prediction based on task type and node performance
        task_type = task_features.get('type', 'default')
        node_avg_time = node_features.get('avg_execution_time', 1.0)

        # Task type multipliers
        type_multipliers = {
            'weight_aggregation': 0.5,
            'quantization': 0.8,
            'ensemble_training': 2.0,
            'default': 1.0
        }

        base_time = type_multipliers.get(task_type, 1.0)
        predicted_time = base_time * node_avg_time

        return predicted_time

    def update_performance_history(self,
                                   task_features: Dict[str, Any],
                                   actual_time: float):
        """Update performance history with actual completion time."""
        task_type = task_features.get('type', 'default')
        self.task_performance_history[task_type].append(actual_time)

        # Keep only recent history
        if len(self.task_performance_history[task_type]) > 100:
            self.task_performance_history[task_type] = \
                self.task_performance_history[task_type][-100:]
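For orientation, below is a minimal usage sketch of the module shown above. It is not taken from the package's documentation; the node IDs, capability keys, and task fields are illustrative assumptions, and it assumes a local Ray runtime started with ray.init().

    import ray
    import torch
    from hqde.distributed.load_balancer import DynamicLoadBalancer

    ray.init()  # assumes a local Ray runtime; cluster configuration is out of scope

    # Hypothetical node IDs and capability keys (the package does not prescribe them)
    balancer = DynamicLoadBalancer(balancing_strategy="adaptive")
    balancer.add_worker_node("node_0", {"num_cpus": 8})
    balancer.add_worker_node("node_1", {"num_cpus": 4})
    balancer.start_monitoring()

    # Schedule a weight-aggregation task; 'weights' is a list of state_dict-like dicts
    task_id = balancer.schedule_task({
        "task_id": "agg_001",
        "type": "weight_aggregation",
        "estimated_load": 0.2,
        "weights": [{"w": torch.ones(2, 2)}, {"w": torch.zeros(2, 2)}],
    })
    print(task_id, balancer.get_balancing_statistics())

    balancer.cleanup()
    ray.shutdown()

The result of each scheduled task is held as a Ray future in the balancer's task_history; a caller that needs the aggregated weights would retrieve it with ray.get on that future.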