morphml 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of morphml might be problematic. Click here for more details.
- morphml/__init__.py +14 -0
- morphml/api/__init__.py +26 -0
- morphml/api/app.py +326 -0
- morphml/api/auth.py +193 -0
- morphml/api/client.py +338 -0
- morphml/api/models.py +132 -0
- morphml/api/rate_limit.py +192 -0
- morphml/benchmarking/__init__.py +36 -0
- morphml/benchmarking/comparison.py +430 -0
- morphml/benchmarks/__init__.py +56 -0
- morphml/benchmarks/comparator.py +409 -0
- morphml/benchmarks/datasets.py +280 -0
- morphml/benchmarks/metrics.py +199 -0
- morphml/benchmarks/openml_suite.py +201 -0
- morphml/benchmarks/problems.py +289 -0
- morphml/benchmarks/suite.py +318 -0
- morphml/cli/__init__.py +5 -0
- morphml/cli/commands/experiment.py +329 -0
- morphml/cli/main.py +457 -0
- morphml/cli/quickstart.py +312 -0
- morphml/config.py +278 -0
- morphml/constraints/__init__.py +19 -0
- morphml/constraints/handler.py +205 -0
- morphml/constraints/predicates.py +285 -0
- morphml/core/__init__.py +3 -0
- morphml/core/crossover.py +449 -0
- morphml/core/dsl/README.md +359 -0
- morphml/core/dsl/__init__.py +72 -0
- morphml/core/dsl/ast_nodes.py +364 -0
- morphml/core/dsl/compiler.py +318 -0
- morphml/core/dsl/layers.py +368 -0
- morphml/core/dsl/lexer.py +336 -0
- morphml/core/dsl/parser.py +455 -0
- morphml/core/dsl/search_space.py +386 -0
- morphml/core/dsl/syntax.py +199 -0
- morphml/core/dsl/type_system.py +361 -0
- morphml/core/dsl/validator.py +386 -0
- morphml/core/graph/__init__.py +40 -0
- morphml/core/graph/edge.py +124 -0
- morphml/core/graph/graph.py +507 -0
- morphml/core/graph/mutations.py +409 -0
- morphml/core/graph/node.py +196 -0
- morphml/core/graph/serialization.py +361 -0
- morphml/core/graph/visualization.py +431 -0
- morphml/core/objectives/__init__.py +20 -0
- morphml/core/search/__init__.py +33 -0
- morphml/core/search/individual.py +252 -0
- morphml/core/search/parameters.py +453 -0
- morphml/core/search/population.py +375 -0
- morphml/core/search/search_engine.py +340 -0
- morphml/distributed/__init__.py +76 -0
- morphml/distributed/fault_tolerance.py +497 -0
- morphml/distributed/health_monitor.py +348 -0
- morphml/distributed/master.py +709 -0
- morphml/distributed/proto/README.md +224 -0
- morphml/distributed/proto/__init__.py +74 -0
- morphml/distributed/proto/worker.proto +170 -0
- morphml/distributed/proto/worker_pb2.py +79 -0
- morphml/distributed/proto/worker_pb2_grpc.py +423 -0
- morphml/distributed/resource_manager.py +416 -0
- morphml/distributed/scheduler.py +567 -0
- morphml/distributed/storage/__init__.py +33 -0
- morphml/distributed/storage/artifacts.py +381 -0
- morphml/distributed/storage/cache.py +366 -0
- morphml/distributed/storage/checkpointing.py +329 -0
- morphml/distributed/storage/database.py +459 -0
- morphml/distributed/worker.py +549 -0
- morphml/evaluation/__init__.py +5 -0
- morphml/evaluation/heuristic.py +237 -0
- morphml/exceptions.py +55 -0
- morphml/execution/__init__.py +5 -0
- morphml/execution/local_executor.py +350 -0
- morphml/integrations/__init__.py +28 -0
- morphml/integrations/jax_adapter.py +206 -0
- morphml/integrations/pytorch_adapter.py +530 -0
- morphml/integrations/sklearn_adapter.py +206 -0
- morphml/integrations/tensorflow_adapter.py +230 -0
- morphml/logging_config.py +93 -0
- morphml/meta_learning/__init__.py +66 -0
- morphml/meta_learning/architecture_similarity.py +277 -0
- morphml/meta_learning/experiment_database.py +240 -0
- morphml/meta_learning/knowledge_base/__init__.py +19 -0
- morphml/meta_learning/knowledge_base/embedder.py +179 -0
- morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
- morphml/meta_learning/knowledge_base/meta_features.py +265 -0
- morphml/meta_learning/knowledge_base/vector_store.py +271 -0
- morphml/meta_learning/predictors/__init__.py +27 -0
- morphml/meta_learning/predictors/ensemble.py +221 -0
- morphml/meta_learning/predictors/gnn_predictor.py +552 -0
- morphml/meta_learning/predictors/learning_curve.py +231 -0
- morphml/meta_learning/predictors/proxy_metrics.py +261 -0
- morphml/meta_learning/strategy_evolution/__init__.py +27 -0
- morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
- morphml/meta_learning/strategy_evolution/bandit.py +276 -0
- morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
- morphml/meta_learning/transfer.py +581 -0
- morphml/meta_learning/warm_start.py +286 -0
- morphml/optimizers/__init__.py +74 -0
- morphml/optimizers/adaptive_operators.py +399 -0
- morphml/optimizers/bayesian/__init__.py +52 -0
- morphml/optimizers/bayesian/acquisition.py +387 -0
- morphml/optimizers/bayesian/base.py +319 -0
- morphml/optimizers/bayesian/gaussian_process.py +635 -0
- morphml/optimizers/bayesian/smac.py +534 -0
- morphml/optimizers/bayesian/tpe.py +411 -0
- morphml/optimizers/differential_evolution.py +220 -0
- morphml/optimizers/evolutionary/__init__.py +61 -0
- morphml/optimizers/evolutionary/cma_es.py +416 -0
- morphml/optimizers/evolutionary/differential_evolution.py +556 -0
- morphml/optimizers/evolutionary/encoding.py +426 -0
- morphml/optimizers/evolutionary/particle_swarm.py +449 -0
- morphml/optimizers/genetic_algorithm.py +486 -0
- morphml/optimizers/gradient_based/__init__.py +22 -0
- morphml/optimizers/gradient_based/darts.py +550 -0
- morphml/optimizers/gradient_based/enas.py +585 -0
- morphml/optimizers/gradient_based/operations.py +474 -0
- morphml/optimizers/gradient_based/utils.py +601 -0
- morphml/optimizers/hill_climbing.py +169 -0
- morphml/optimizers/multi_objective/__init__.py +56 -0
- morphml/optimizers/multi_objective/indicators.py +504 -0
- morphml/optimizers/multi_objective/nsga2.py +647 -0
- morphml/optimizers/multi_objective/visualization.py +427 -0
- morphml/optimizers/nsga2.py +308 -0
- morphml/optimizers/random_search.py +172 -0
- morphml/optimizers/simulated_annealing.py +181 -0
- morphml/plugins/__init__.py +35 -0
- morphml/plugins/custom_evaluator_example.py +81 -0
- morphml/plugins/custom_optimizer_example.py +63 -0
- morphml/plugins/plugin_system.py +454 -0
- morphml/reports/__init__.py +30 -0
- morphml/reports/generator.py +362 -0
- morphml/tracking/__init__.py +7 -0
- morphml/tracking/experiment.py +309 -0
- morphml/tracking/logger.py +301 -0
- morphml/tracking/reporter.py +357 -0
- morphml/utils/__init__.py +6 -0
- morphml/utils/checkpoint.py +189 -0
- morphml/utils/comparison.py +390 -0
- morphml/utils/export.py +407 -0
- morphml/utils/progress.py +392 -0
- morphml/utils/validation.py +392 -0
- morphml/version.py +7 -0
- morphml/visualization/__init__.py +50 -0
- morphml/visualization/analytics.py +423 -0
- morphml/visualization/architecture_diagrams.py +353 -0
- morphml/visualization/architecture_plot.py +223 -0
- morphml/visualization/convergence_plot.py +174 -0
- morphml/visualization/crossover_viz.py +386 -0
- morphml/visualization/graph_viz.py +338 -0
- morphml/visualization/pareto_plot.py +149 -0
- morphml/visualization/plotly_dashboards.py +422 -0
- morphml/visualization/population.py +309 -0
- morphml/visualization/progress.py +260 -0
- morphml-1.0.0.dist-info/METADATA +434 -0
- morphml-1.0.0.dist-info/RECORD +158 -0
- morphml-1.0.0.dist-info/WHEEL +4 -0
- morphml-1.0.0.dist-info/entry_points.txt +3 -0
- morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
"""Resource management for distributed workers.
|
|
2
|
+
|
|
3
|
+
Tracks and manages computational resources across worker nodes:
|
|
4
|
+
- GPU availability and memory
|
|
5
|
+
- CPU utilization
|
|
6
|
+
- Memory usage
|
|
7
|
+
- Task placement based on requirements
|
|
8
|
+
|
|
9
|
+
Author: Eshan Roy <eshanized@proton.me>
|
|
10
|
+
Organization: TONMOY INFRASTRUCTURE & VISION
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from morphml.logging_config import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class WorkerResources:
    """
    Snapshot of one worker node's computational resources.

    Records both totals and currently-available amounts so a scheduler
    can decide whether a task fits on this node.

    Attributes:
        worker_id: Unique worker identifier
        total_gpus: Total number of GPUs
        available_gpus: Number of available GPUs
        gpu_memory_total: Total GPU memory per GPU (GB)
        gpu_memory_available: Available GPU memory (GB)
        cpu_percent: CPU utilization percentage
        memory_percent: RAM utilization percentage
        network_bandwidth: Network bandwidth (Mbps)
        metadata: Arbitrary extra worker information
    """

    worker_id: str
    total_gpus: int = 0
    available_gpus: int = 0
    gpu_memory_total: float = 0.0  # GB
    gpu_memory_available: float = 0.0  # GB
    cpu_percent: float = 0.0
    memory_percent: float = 0.0
    network_bandwidth: float = 1000.0  # Mbps
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def gpu_utilization(self) -> float:
        """Percentage of GPUs currently in use (0.0 when the node has none)."""
        if self.total_gpus == 0:
            return 0.0
        # Same arithmetic as the memory property below, for consistent results.
        return (1.0 - self.available_gpus / self.total_gpus) * 100

    @property
    def gpu_memory_utilization(self) -> float:
        """Percentage of GPU memory currently in use (0.0 when none exists)."""
        if self.gpu_memory_total == 0:
            return 0.0
        return (1.0 - self.gpu_memory_available / self.gpu_memory_total) * 100

    def can_run_task(self, requirements: "TaskRequirements") -> bool:
        """
        Check whether this worker currently satisfies *requirements*.

        Only GPU count, free GPU memory, and RAM headroom are checked;
        CPU-core and RAM-size requirements are not enforced here.

        Args:
            requirements: Task resource requirements

        Returns:
            True if worker has sufficient resources
        """
        if self.available_gpus < requirements.min_gpus:
            return False
        if self.gpu_memory_available < requirements.min_gpu_memory:
            return False
        # Keep some RAM headroom — refuse work on nearly-full nodes.
        return self.memory_percent < 90.0

    def allocate(self, requirements: "TaskRequirements") -> bool:
        """
        Reserve GPUs and GPU memory for a task.

        Args:
            requirements: Task requirements

        Returns:
            True if allocation successful
        """
        fits = self.can_run_task(requirements)
        if fits:
            self.available_gpus -= requirements.min_gpus
            self.gpu_memory_available -= requirements.min_gpu_memory
        return fits

    def release(self, requirements: "TaskRequirements") -> None:
        """
        Return previously allocated resources, clamped to the node totals.

        Args:
            requirements: Task requirements to release
        """
        restored_gpus = self.available_gpus + requirements.min_gpus
        self.available_gpus = min(self.total_gpus, restored_gpus)
        restored_memory = self.gpu_memory_available + requirements.min_gpu_memory
        self.gpu_memory_available = min(self.gpu_memory_total, restored_memory)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass
class TaskRequirements:
    """
    Minimum resources a task needs in order to execute.

    Attributes:
        min_gpus: Minimum number of GPUs
        min_gpu_memory: Minimum GPU memory per GPU (GB)
        min_cpu_cores: Minimum CPU cores
        min_memory: Minimum RAM (GB)
        estimated_time: Estimated execution time (seconds)
        priority: Task priority (higher = more important)
    """

    min_gpus: int = 1
    min_gpu_memory: float = 2.0  # GB
    min_cpu_cores: int = 1
    min_memory: float = 4.0  # GB
    estimated_time: float = 300.0  # seconds
    priority: float = 1.0

    def __post_init__(self) -> None:
        """Reject negative values for fields that must be non-negative."""
        non_negative = (
            ("min_gpus", self.min_gpus),
            ("min_gpu_memory", self.min_gpu_memory),
            ("estimated_time", self.estimated_time),
        )
        for name, value in non_negative:
            if value < 0:
                raise ValueError(f"{name} must be >= 0")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class ResourceManager:
    """
    Track and allocate computational resources across worker nodes.

    Keeps a per-worker ``WorkerResources`` record and supports task
    placement via several strategies.

    Example:
        >>> manager = ResourceManager()
        >>> manager.update_resources('worker-1', {
        ...     'total_gpus': 4,
        ...     'available_gpus': 3,
        ...     'gpu_memory_total': 16.0,
        ...     'gpu_memory_available': 12.0,
        ... })
        >>> requirements = TaskRequirements(min_gpus=1, min_gpu_memory=4.0)
        >>> worker_id = manager.find_suitable_worker(requirements)
    """

    def __init__(self) -> None:
        """Start with no known workers and an empty allocation log."""
        self.resources: Dict[str, WorkerResources] = {}
        self.allocation_history: List[Dict[str, Any]] = []

    def register_worker(self, worker_id: str, resources: Dict[str, Any]) -> None:
        """
        Register a worker and its initial resource state.

        Args:
            worker_id: Unique worker identifier
            resources: Initial resource state (WorkerResources fields)
        """
        record = WorkerResources(worker_id=worker_id, **resources)
        self.resources[worker_id] = record

        logger.info(
            f"Registered worker {worker_id}: "
            f"{resources.get('total_gpus', 0)} GPUs, "
            f"{resources.get('gpu_memory_total', 0):.1f}GB GPU memory"
        )

    def update_resources(self, worker_id: str, resources: Dict[str, Any]) -> None:
        """
        Refresh a worker's resource state, registering it when unknown.

        Args:
            worker_id: Worker identifier
            resources: Updated resource state
        """
        record = self.resources.get(worker_id)
        if record is None:
            self.register_worker(worker_id, resources)
            return
        # Only copy keys that WorkerResources actually defines; ignore extras.
        for key, value in resources.items():
            if hasattr(record, key):
                setattr(record, key, value)

    def find_suitable_worker(
        self, requirements: TaskRequirements, strategy: str = "best_fit"
    ) -> Optional[str]:
        """
        Pick a worker able to run a task with the given requirements.

        Args:
            requirements: Task resource requirements
            strategy: Placement strategy
                - 'first_fit': First worker that fits
                - 'best_fit': Worker with least excess capacity
                - 'worst_fit': Worker with most excess capacity

        Returns:
            worker_id or None if no suitable worker

        Raises:
            ValueError: If *strategy* is not a known placement strategy.
        """
        candidates = []
        for wid, res in self.resources.items():
            if res.can_run_task(requirements):
                candidates.append((wid, res))

        if not candidates:
            logger.debug("No suitable worker found for task requirements")
            return None

        if strategy == "first_fit":
            chosen = candidates[0]
        elif strategy == "best_fit":
            # Smallest leftover GPU capacity after placement.
            chosen = min(
                candidates,
                key=lambda pair: pair[1].available_gpus - requirements.min_gpus,
            )
        elif strategy == "worst_fit":
            # Most free GPUs — spreads load across the cluster.
            chosen = max(candidates, key=lambda pair: pair[1].available_gpus)
        else:
            raise ValueError(f"Unknown placement strategy: {strategy}")

        return chosen[0]

    def find_all_suitable_workers(self, requirements: TaskRequirements) -> List[str]:
        """
        List every worker currently able to satisfy the requirements.

        Args:
            requirements: Task requirements

        Returns:
            List of suitable worker IDs
        """
        matches: List[str] = []
        for wid, res in self.resources.items():
            if res.can_run_task(requirements):
                matches.append(wid)
        return matches

    def allocate_resources(self, worker_id: str, requirements: TaskRequirements) -> bool:
        """
        Reserve resources for a task on a specific worker.

        Args:
            worker_id: Worker to allocate on
            requirements: Resource requirements

        Returns:
            True if allocation successful
        """
        record = self.resources.get(worker_id)
        if record is None:
            logger.error(f"Unknown worker: {worker_id}")
            return False

        if not record.allocate(requirements):
            return False

        self.allocation_history.append(
            {
                "worker_id": worker_id,
                "gpus": requirements.min_gpus,
                "memory": requirements.min_gpu_memory,
            }
        )
        logger.debug(
            f"Allocated resources on {worker_id}: "
            f"{requirements.min_gpus} GPUs, "
            f"{requirements.min_gpu_memory:.1f}GB memory"
        )
        return True

    def release_resources(self, worker_id: str, requirements: TaskRequirements) -> None:
        """
        Give back resources previously allocated on a worker.

        Unknown workers are ignored silently.

        Args:
            worker_id: Worker to release resources from
            requirements: Resource requirements to release
        """
        record = self.resources.get(worker_id)
        if record is None:
            return
        record.release(requirements)
        logger.debug(
            f"Released resources on {worker_id}: "
            f"{requirements.min_gpus} GPUs, "
            f"{requirements.min_gpu_memory:.1f}GB memory"
        )

    def get_total_resources(self) -> Dict[str, Any]:
        """
        Aggregate resource statistics over every registered worker.

        Returns:
            Dictionary with aggregate resource statistics
        """
        workers = list(self.resources.values())
        total_gpus = sum(w.total_gpus for w in workers)
        free_gpus = sum(w.available_gpus for w in workers)
        total_memory = sum(w.gpu_memory_total for w in workers)
        free_memory = sum(w.gpu_memory_available for w in workers)

        gpu_util = (1.0 - free_gpus / total_gpus) * 100 if total_gpus > 0 else 0.0
        mem_util = (1.0 - free_memory / total_memory) * 100 if total_memory > 0 else 0.0

        return {
            "total_workers": len(self.resources),
            "total_gpus": total_gpus,
            "available_gpus": free_gpus,
            "gpu_utilization": gpu_util,
            "total_gpu_memory": total_memory,
            "available_gpu_memory": free_memory,
            "memory_utilization": mem_util,
        }

    def get_worker_resources(self, worker_id: str) -> Optional[WorkerResources]:
        """
        Get resources for a specific worker.

        Args:
            worker_id: Worker identifier

        Returns:
            WorkerResources or None if worker not found
        """
        return self.resources.get(worker_id)

    def get_statistics(self) -> Dict[str, Any]:
        """Aggregate stats plus allocation count and per-worker details."""
        stats = self.get_total_resources()
        stats["allocations"] = len(self.allocation_history)
        details: Dict[str, Any] = {}
        for wid, res in self.resources.items():
            details[wid] = {
                "total_gpus": res.total_gpus,
                "available_gpus": res.available_gpus,
                "gpu_utilization": res.gpu_utilization,
                "cpu_percent": res.cpu_percent,
                "memory_percent": res.memory_percent,
            }
        stats["worker_details"] = details
        return stats
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
class GPUAffinityManager:
    """
    Pin tasks to specific GPUs on a worker.

    Keeps a mapping from "worker:task" keys to assigned GPU id lists so
    tasks get stable GPU affinity for better performance.
    """

    def __init__(self) -> None:
        """Start with no GPU assignments."""
        self.gpu_assignments: Dict[str, List[int]] = {}

    @staticmethod
    def _key(worker_id: str, task_id: str) -> str:
        # Composite lookup key: one assignment per (worker, task) pair.
        return f"{worker_id}:{task_id}"

    def assign_gpus(self, worker_id: str, task_id: str, gpu_ids: List[int]) -> None:
        """
        Pin a task on a worker to the given GPU ids.

        Args:
            worker_id: Worker identifier
            task_id: Task identifier
            gpu_ids: List of GPU IDs to assign
        """
        self.gpu_assignments[self._key(worker_id, task_id)] = gpu_ids

        logger.debug(f"Assigned GPUs {gpu_ids} to task {task_id} on {worker_id}")

    def get_assigned_gpus(self, worker_id: str, task_id: str) -> Optional[List[int]]:
        """
        Look up the GPUs pinned to a task.

        Args:
            worker_id: Worker identifier
            task_id: Task identifier

        Returns:
            List of GPU IDs or None
        """
        return self.gpu_assignments.get(self._key(worker_id, task_id))

    def release_gpus(self, worker_id: str, task_id: str) -> None:
        """
        Drop the GPU assignment for a task, if one exists.

        Args:
            worker_id: Worker identifier
            task_id: Task identifier
        """
        try:
            del self.gpu_assignments[self._key(worker_id, task_id)]
        except KeyError:
            # No assignment for this task — nothing to release, no log.
            return
        logger.debug(f"Released GPU assignment for task {task_id}")
|