morphml-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of morphml might be problematic.
- morphml/__init__.py +14 -0
- morphml/api/__init__.py +26 -0
- morphml/api/app.py +326 -0
- morphml/api/auth.py +193 -0
- morphml/api/client.py +338 -0
- morphml/api/models.py +132 -0
- morphml/api/rate_limit.py +192 -0
- morphml/benchmarking/__init__.py +36 -0
- morphml/benchmarking/comparison.py +430 -0
- morphml/benchmarks/__init__.py +56 -0
- morphml/benchmarks/comparator.py +409 -0
- morphml/benchmarks/datasets.py +280 -0
- morphml/benchmarks/metrics.py +199 -0
- morphml/benchmarks/openml_suite.py +201 -0
- morphml/benchmarks/problems.py +289 -0
- morphml/benchmarks/suite.py +318 -0
- morphml/cli/__init__.py +5 -0
- morphml/cli/commands/experiment.py +329 -0
- morphml/cli/main.py +457 -0
- morphml/cli/quickstart.py +312 -0
- morphml/config.py +278 -0
- morphml/constraints/__init__.py +19 -0
- morphml/constraints/handler.py +205 -0
- morphml/constraints/predicates.py +285 -0
- morphml/core/__init__.py +3 -0
- morphml/core/crossover.py +449 -0
- morphml/core/dsl/README.md +359 -0
- morphml/core/dsl/__init__.py +72 -0
- morphml/core/dsl/ast_nodes.py +364 -0
- morphml/core/dsl/compiler.py +318 -0
- morphml/core/dsl/layers.py +368 -0
- morphml/core/dsl/lexer.py +336 -0
- morphml/core/dsl/parser.py +455 -0
- morphml/core/dsl/search_space.py +386 -0
- morphml/core/dsl/syntax.py +199 -0
- morphml/core/dsl/type_system.py +361 -0
- morphml/core/dsl/validator.py +386 -0
- morphml/core/graph/__init__.py +40 -0
- morphml/core/graph/edge.py +124 -0
- morphml/core/graph/graph.py +507 -0
- morphml/core/graph/mutations.py +409 -0
- morphml/core/graph/node.py +196 -0
- morphml/core/graph/serialization.py +361 -0
- morphml/core/graph/visualization.py +431 -0
- morphml/core/objectives/__init__.py +20 -0
- morphml/core/search/__init__.py +33 -0
- morphml/core/search/individual.py +252 -0
- morphml/core/search/parameters.py +453 -0
- morphml/core/search/population.py +375 -0
- morphml/core/search/search_engine.py +340 -0
- morphml/distributed/__init__.py +76 -0
- morphml/distributed/fault_tolerance.py +497 -0
- morphml/distributed/health_monitor.py +348 -0
- morphml/distributed/master.py +709 -0
- morphml/distributed/proto/README.md +224 -0
- morphml/distributed/proto/__init__.py +74 -0
- morphml/distributed/proto/worker.proto +170 -0
- morphml/distributed/proto/worker_pb2.py +79 -0
- morphml/distributed/proto/worker_pb2_grpc.py +423 -0
- morphml/distributed/resource_manager.py +416 -0
- morphml/distributed/scheduler.py +567 -0
- morphml/distributed/storage/__init__.py +33 -0
- morphml/distributed/storage/artifacts.py +381 -0
- morphml/distributed/storage/cache.py +366 -0
- morphml/distributed/storage/checkpointing.py +329 -0
- morphml/distributed/storage/database.py +459 -0
- morphml/distributed/worker.py +549 -0
- morphml/evaluation/__init__.py +5 -0
- morphml/evaluation/heuristic.py +237 -0
- morphml/exceptions.py +55 -0
- morphml/execution/__init__.py +5 -0
- morphml/execution/local_executor.py +350 -0
- morphml/integrations/__init__.py +28 -0
- morphml/integrations/jax_adapter.py +206 -0
- morphml/integrations/pytorch_adapter.py +530 -0
- morphml/integrations/sklearn_adapter.py +206 -0
- morphml/integrations/tensorflow_adapter.py +230 -0
- morphml/logging_config.py +93 -0
- morphml/meta_learning/__init__.py +66 -0
- morphml/meta_learning/architecture_similarity.py +277 -0
- morphml/meta_learning/experiment_database.py +240 -0
- morphml/meta_learning/knowledge_base/__init__.py +19 -0
- morphml/meta_learning/knowledge_base/embedder.py +179 -0
- morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
- morphml/meta_learning/knowledge_base/meta_features.py +265 -0
- morphml/meta_learning/knowledge_base/vector_store.py +271 -0
- morphml/meta_learning/predictors/__init__.py +27 -0
- morphml/meta_learning/predictors/ensemble.py +221 -0
- morphml/meta_learning/predictors/gnn_predictor.py +552 -0
- morphml/meta_learning/predictors/learning_curve.py +231 -0
- morphml/meta_learning/predictors/proxy_metrics.py +261 -0
- morphml/meta_learning/strategy_evolution/__init__.py +27 -0
- morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
- morphml/meta_learning/strategy_evolution/bandit.py +276 -0
- morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
- morphml/meta_learning/transfer.py +581 -0
- morphml/meta_learning/warm_start.py +286 -0
- morphml/optimizers/__init__.py +74 -0
- morphml/optimizers/adaptive_operators.py +399 -0
- morphml/optimizers/bayesian/__init__.py +52 -0
- morphml/optimizers/bayesian/acquisition.py +387 -0
- morphml/optimizers/bayesian/base.py +319 -0
- morphml/optimizers/bayesian/gaussian_process.py +635 -0
- morphml/optimizers/bayesian/smac.py +534 -0
- morphml/optimizers/bayesian/tpe.py +411 -0
- morphml/optimizers/differential_evolution.py +220 -0
- morphml/optimizers/evolutionary/__init__.py +61 -0
- morphml/optimizers/evolutionary/cma_es.py +416 -0
- morphml/optimizers/evolutionary/differential_evolution.py +556 -0
- morphml/optimizers/evolutionary/encoding.py +426 -0
- morphml/optimizers/evolutionary/particle_swarm.py +449 -0
- morphml/optimizers/genetic_algorithm.py +486 -0
- morphml/optimizers/gradient_based/__init__.py +22 -0
- morphml/optimizers/gradient_based/darts.py +550 -0
- morphml/optimizers/gradient_based/enas.py +585 -0
- morphml/optimizers/gradient_based/operations.py +474 -0
- morphml/optimizers/gradient_based/utils.py +601 -0
- morphml/optimizers/hill_climbing.py +169 -0
- morphml/optimizers/multi_objective/__init__.py +56 -0
- morphml/optimizers/multi_objective/indicators.py +504 -0
- morphml/optimizers/multi_objective/nsga2.py +647 -0
- morphml/optimizers/multi_objective/visualization.py +427 -0
- morphml/optimizers/nsga2.py +308 -0
- morphml/optimizers/random_search.py +172 -0
- morphml/optimizers/simulated_annealing.py +181 -0
- morphml/plugins/__init__.py +35 -0
- morphml/plugins/custom_evaluator_example.py +81 -0
- morphml/plugins/custom_optimizer_example.py +63 -0
- morphml/plugins/plugin_system.py +454 -0
- morphml/reports/__init__.py +30 -0
- morphml/reports/generator.py +362 -0
- morphml/tracking/__init__.py +7 -0
- morphml/tracking/experiment.py +309 -0
- morphml/tracking/logger.py +301 -0
- morphml/tracking/reporter.py +357 -0
- morphml/utils/__init__.py +6 -0
- morphml/utils/checkpoint.py +189 -0
- morphml/utils/comparison.py +390 -0
- morphml/utils/export.py +407 -0
- morphml/utils/progress.py +392 -0
- morphml/utils/validation.py +392 -0
- morphml/version.py +7 -0
- morphml/visualization/__init__.py +50 -0
- morphml/visualization/analytics.py +423 -0
- morphml/visualization/architecture_diagrams.py +353 -0
- morphml/visualization/architecture_plot.py +223 -0
- morphml/visualization/convergence_plot.py +174 -0
- morphml/visualization/crossover_viz.py +386 -0
- morphml/visualization/graph_viz.py +338 -0
- morphml/visualization/pareto_plot.py +149 -0
- morphml/visualization/plotly_dashboards.py +422 -0
- morphml/visualization/population.py +309 -0
- morphml/visualization/progress.py +260 -0
- morphml-1.0.0.dist-info/METADATA +434 -0
- morphml-1.0.0.dist-info/RECORD +158 -0
- morphml-1.0.0.dist-info/WHEEL +4 -0
- morphml-1.0.0.dist-info/entry_points.txt +3 -0
- morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
morphml/distributed/scheduler.py

@@ -0,0 +1,567 @@
"""Task scheduling strategies for distributed execution.

Implements various scheduling algorithms:
- FIFO (First-In-First-Out)
- Priority-based scheduling
- Load balancing
- Work stealing
- Adaptive learning scheduler

Author: Eshan Roy <eshanized@proton.me>
Organization: TONMOY INFRASTRUCTURE & VISION
"""

import time
from abc import ABC, abstractmethod
from collections import deque
from dataclasses import dataclass
from queue import PriorityQueue
from typing import Any, Dict, List, Optional, Tuple

from morphml.distributed.master import Task, WorkerInfo
from morphml.logging_config import get_logger

logger = get_logger(__name__)


class TaskScheduler(ABC):
    """
    Base class for task schedulers.

    A scheduler decides which worker should execute which task,
    optimizing for different objectives (throughput, fairness, etc.).
    """

    @abstractmethod
    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """
        Assign task to a worker.

        Args:
            task: Task to assign
            workers: Available workers

        Returns:
            Worker to assign task to, or None if no suitable worker
        """
        pass

    def get_statistics(self) -> Dict[str, Any]:
        """Get scheduler statistics."""
        return {}


class FIFOScheduler(TaskScheduler):
    """
    First-In-First-Out scheduler.

    Assigns tasks to the first available idle worker.
    Simple but effective for homogeneous workloads.

    Example:
        >>> scheduler = FIFOScheduler()
        >>> worker = scheduler.assign_task(task, workers)
    """

    def __init__(self) -> None:
        """Initialize FIFO scheduler."""
        self.assignments = 0

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """Assign to first idle worker."""
        for worker in workers:
            if worker.is_available():
                self.assignments += 1
                return worker

        return None

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        return {"assignments": self.assignments, "strategy": "FIFO"}


class PriorityScheduler(TaskScheduler):
    """
    Priority-based scheduler.

    Tasks with higher priority are scheduled first.
    Useful for multi-fidelity optimization where promising
    architectures receive more computational resources.

    Args:
        max_queue_size: Maximum priority queue size

    Example:
        >>> scheduler = PriorityScheduler()
        >>> scheduler.enqueue(task, priority=0.95)
        >>> worker = scheduler.assign_task(task, workers)
    """

    def __init__(self, max_queue_size: int = 10000) -> None:
        """Initialize priority scheduler."""
        self.task_queue: PriorityQueue = PriorityQueue(maxsize=max_queue_size)
        self.assignments = 0
        self.task_priorities: Dict[str, float] = {}

    def enqueue(self, task: Task, priority: float) -> None:
        """
        Add task with priority.

        Args:
            task: Task to enqueue
            priority: Task priority (higher = more important)
        """
        # Negative priority for max-heap behavior
        self.task_queue.put((-priority, time.time(), task))
        self.task_priorities[task.task_id] = priority

        logger.debug(f"Enqueued task {task.task_id} with priority {priority:.4f}")

    def dequeue(self) -> Optional[Task]:
        """Get highest priority task."""
        if not self.task_queue.empty():
            _, _, task = self.task_queue.get()
            return task
        return None

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """Assign to worker with least load."""
        idle_workers = [w for w in workers if w.is_available()]

        if idle_workers:
            # Assign to worker with least completed tasks
            worker = min(idle_workers, key=lambda w: w.tasks_completed)
            self.assignments += 1
            return worker

        return None

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        return {
            "assignments": self.assignments,
            "queue_size": self.task_queue.qsize(),
            "strategy": "Priority",
        }


class LoadBalancingScheduler(TaskScheduler):
    """
    Load balancing scheduler.

    Distributes tasks evenly based on worker capacity and current load.
    Considers GPU count and task queue length.

    Example:
        >>> scheduler = LoadBalancingScheduler()
        >>> worker = scheduler.assign_task(task, workers)
    """

    def __init__(self) -> None:
        """Initialize load balancing scheduler."""
        self.assignments = 0
        self.worker_loads: Dict[str, float] = {}

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """Assign to least loaded worker."""
        available_workers = [w for w in workers if w.status != "dead"]

        if not available_workers:
            return None

        # Calculate load for each worker
        worker_loads = {w.worker_id: self._calculate_load(w) for w in available_workers}

        # Assign to least loaded
        min_worker_id = min(worker_loads, key=worker_loads.get)  # type: ignore
        worker = next(w for w in available_workers if w.worker_id == min_worker_id)

        self.assignments += 1
        self.worker_loads = worker_loads

        logger.debug(
            f"Assigned task {task.task_id} to {worker.worker_id} "
            f"(load: {worker_loads[min_worker_id]:.2f})"
        )

        return worker

    def _calculate_load(self, worker: WorkerInfo) -> float:
        """
        Calculate worker load score.

        Load = (running_tasks / num_gpus) + idle_penalty

        Args:
            worker: Worker to calculate load for

        Returns:
            Load score (lower = less loaded)
        """
        if worker.num_gpus == 0:
            return float("inf")

        # Current task load
        running_load = (1.0 if worker.status == "busy" else 0.0) / worker.num_gpus

        # Penalize failed tasks
        failure_penalty = worker.tasks_failed * 0.1

        # Total load
        load = running_load + failure_penalty

        return load

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        return {
            "assignments": self.assignments,
            "worker_loads": dict(self.worker_loads),
            "strategy": "LoadBalancing",
        }


class WorkStealingScheduler(TaskScheduler):
    """
    Work stealing scheduler.

    Idle workers can steal tasks from busy workers' queues.
    Improves load balancing for heterogeneous workloads.

    Args:
        steal_threshold: Minimum queue length to allow stealing
        max_steal_attempts: Maximum steal attempts per cycle

    Example:
        >>> scheduler = WorkStealingScheduler(steal_threshold=2)
        >>> task = scheduler.steal_task(idle_worker, all_workers)
    """

    def __init__(self, steal_threshold: int = 2, max_steal_attempts: int = 3) -> None:
        """Initialize work stealing scheduler."""
        self.steal_threshold = steal_threshold
        self.max_steal_attempts = max_steal_attempts
        self.assignments = 0
        self.steals = 0
        self.worker_queues: Dict[str, deque] = {}

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """Assign task to least loaded worker."""
        idle_workers = [w for w in workers if w.is_available()]

        if idle_workers:
            worker = idle_workers[0]

            # Add to worker's queue
            if worker.worker_id not in self.worker_queues:
                self.worker_queues[worker.worker_id] = deque()

            self.worker_queues[worker.worker_id].append(task)
            self.assignments += 1

            return worker

        return None

    def steal_task(self, idle_worker: WorkerInfo, workers: List[WorkerInfo]) -> Optional[Task]:
        """
        Idle worker steals task from busiest worker.

        Args:
            idle_worker: Worker looking for work
            workers: All workers

        Returns:
            Stolen task or None
        """
        # Find workers with tasks above threshold
        busy_workers = [
            w
            for w in workers
            if w.worker_id in self.worker_queues
            and len(self.worker_queues[w.worker_id]) > self.steal_threshold
        ]

        if not busy_workers:
            return None

        # Steal from busiest
        busiest = max(busy_workers, key=lambda w: len(self.worker_queues[w.worker_id]))

        queue = self.worker_queues[busiest.worker_id]

        if queue:
            # Steal from end (LIFO for better locality)
            stolen_task = queue.pop()
            self.steals += 1

            logger.info(
                f"Worker {idle_worker.worker_id} stole task {stolen_task.task_id} "
                f"from {busiest.worker_id}"
            )

            return stolen_task

        return None

    def remove_task(self, worker_id: str, task: Task) -> None:
        """Remove completed task from queue."""
        if worker_id in self.worker_queues:
            try:
                self.worker_queues[worker_id].remove(task)
            except ValueError:
                pass

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        queue_lengths = {wid: len(queue) for wid, queue in self.worker_queues.items()}

        return {
            "assignments": self.assignments,
            "steals": self.steals,
            "queue_lengths": queue_lengths,
            "strategy": "WorkStealing",
        }


class AdaptiveScheduler(TaskScheduler):
    """
    Adaptive scheduler using performance history.

    Learns optimal assignment policy based on:
    - Worker performance history
    - Task characteristics
    - System state

    Uses exponential moving average to track worker performance.

    Args:
        learning_rate: Learning rate for EMA (0-1)

    Example:
        >>> scheduler = AdaptiveScheduler(learning_rate=0.1)
        >>> worker = scheduler.assign_task(task, workers)
        >>> scheduler.record_completion(worker.worker_id, task, duration=15.2, success=True)
    """

    def __init__(self, learning_rate: float = 0.1) -> None:
        """Initialize adaptive scheduler."""
        self.learning_rate = learning_rate
        self.worker_performance: Dict[str, PerformanceStats] = {}
        self.assignments = 0
        self.history: List[Tuple[str, Task, float, bool]] = []

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """
        Assign task using learned policy.

        Computes assignment score based on:
        - Worker speed (inverse of avg completion time)
        - Worker success rate
        - Worker availability
        """
        available_workers = [w for w in workers if w.is_available()]

        if not available_workers:
            return None

        # Compute scores for each worker
        scores = {}
        for worker in available_workers:
            score = self._compute_assignment_score(task, worker)
            scores[worker.worker_id] = score

        # Assign to best worker
        best_worker_id = max(scores, key=scores.get)  # type: ignore
        worker = next(w for w in available_workers if w.worker_id == best_worker_id)

        self.assignments += 1

        logger.debug(
            f"Assigned task {task.task_id} to {worker.worker_id} "
            f"(score: {scores[best_worker_id]:.4f})"
        )

        return worker

    def _compute_assignment_score(self, task: Task, worker: WorkerInfo) -> float:
        """
        Compute assignment score for worker.

        Higher score = better assignment.
        """
        perf = self.worker_performance.get(worker.worker_id)

        if perf is None:
            # New worker: default score based on capacity
            return float(worker.num_gpus)

        # Speed score (inverse of completion time)
        speed_score = 1.0 / (perf.avg_completion_time + 1e-6)

        # Success score
        success_score = perf.success_rate

        # GPU capacity bonus
        capacity_bonus = worker.num_gpus / 4.0  # Normalize to typical 4 GPUs

        # Combined score
        score = speed_score * 0.5 + success_score * 0.3 + capacity_bonus * 0.2

        return score

    def record_completion(self, worker_id: str, task: Task, duration: float, success: bool) -> None:
        """
        Record task completion for learning.

        Args:
            worker_id: Worker that completed task
            task: Completed task
            duration: Execution duration (seconds)
            success: Whether task succeeded
        """
        # Initialize stats if needed
        if worker_id not in self.worker_performance:
            self.worker_performance[worker_id] = PerformanceStats()

        # Update statistics
        stats = self.worker_performance[worker_id]
        stats.update(duration, success, self.learning_rate)

        # Add to history
        self.history.append((worker_id, task, duration, success))

        # Keep history bounded
        if len(self.history) > 10000:
            self.history = self.history[-5000:]

        logger.debug(
            f"Updated stats for {worker_id}: "
            f"avg_time={stats.avg_completion_time:.2f}s, "
            f"success_rate={stats.success_rate:.2%}"
        )

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        return {
            "assignments": self.assignments,
            "workers_tracked": len(self.worker_performance),
            "history_size": len(self.history),
            "worker_performance": {
                wid: {
                    "avg_time": stats.avg_completion_time,
                    "success_rate": stats.success_rate,
                    "total_tasks": stats.total_tasks,
                }
                for wid, stats in self.worker_performance.items()
            },
            "strategy": "Adaptive",
        }


@dataclass
class PerformanceStats:
    """Worker performance statistics."""

    avg_completion_time: float = 10.0  # Default 10s
    success_rate: float = 1.0
    total_tasks: int = 0

    def update(self, duration: float, success: bool, alpha: float = 0.1) -> None:
        """
        Update statistics with new measurement.

        Uses exponential moving average for smooth adaptation.

        Args:
            duration: Task duration
            success: Whether task succeeded
            alpha: Learning rate (0-1)
        """
        self.total_tasks += 1

        # Exponential moving average for completion time
        self.avg_completion_time = alpha * duration + (1 - alpha) * self.avg_completion_time

        # Exponential moving average for success rate
        success_value = 1.0 if success else 0.0
        self.success_rate = alpha * success_value + (1 - alpha) * self.success_rate


class RoundRobinScheduler(TaskScheduler):
    """
    Round-robin scheduler.

    Distributes tasks in circular order across workers.
    Simple and fair for homogeneous workers.

    Example:
        >>> scheduler = RoundRobinScheduler()
        >>> worker = scheduler.assign_task(task, workers)
    """

    def __init__(self) -> None:
        """Initialize round-robin scheduler."""
        self.current_index = 0
        self.assignments = 0

    def assign_task(self, task: Task, workers: List[WorkerInfo]) -> Optional[WorkerInfo]:
        """Assign task in round-robin fashion."""
        available_workers = [w for w in workers if w.is_available()]

        if not available_workers:
            return None

        # Select worker in round-robin
        worker = available_workers[self.current_index % len(available_workers)]

        self.current_index += 1
        self.assignments += 1

        return worker

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics."""
        return {
            "assignments": self.assignments,
            "current_index": self.current_index,
            "strategy": "RoundRobin",
        }


def create_scheduler(strategy: str, **kwargs: Any) -> TaskScheduler:
    """
    Factory function to create scheduler.

    Args:
        strategy: Scheduler strategy name
            - 'fifo': First-In-First-Out
            - 'priority': Priority-based
            - 'load_balancing': Load balancing
            - 'work_stealing': Work stealing
            - 'adaptive': Adaptive learning
            - 'round_robin': Round-robin
        **kwargs: Additional scheduler-specific arguments

    Returns:
        TaskScheduler instance

    Example:
        >>> scheduler = create_scheduler('adaptive', learning_rate=0.15)
    """
    schedulers = {
        "fifo": FIFOScheduler,
        "priority": PriorityScheduler,
        "load_balancing": LoadBalancingScheduler,
        "work_stealing": WorkStealingScheduler,
        "adaptive": AdaptiveScheduler,
        "round_robin": RoundRobinScheduler,
    }

    if strategy not in schedulers:
        raise ValueError(
            f"Unknown scheduler strategy: {strategy}. " f"Available: {list(schedulers.keys())}"
        )

    return schedulers[strategy](**kwargs)
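For orientation only (this sketch is not part of the wheel): a minimal dispatch loop built from the APIs visible in the hunk above, using AdaptiveScheduler, assign_task, record_completion, and get_statistics. Task and WorkerInfo instances are assumed to be constructed elsewhere by the master (morphml.distributed.master); the hard-coded duration and success values stand in for results a worker would report back.

from typing import List

from morphml.distributed.master import Task, WorkerInfo
from morphml.distributed.scheduler import AdaptiveScheduler


def dispatch_round(tasks: List[Task], workers: List[WorkerInfo]) -> None:
    """Assign pending tasks adaptively; create_scheduler('adaptive', ...) builds the same object."""
    scheduler = AdaptiveScheduler(learning_rate=0.15)

    for task in tasks:
        worker = scheduler.assign_task(task, workers)
        if worker is None:
            # No available worker; a real master loop would re-queue and retry.
            continue

        # Submit the task to the worker here (the gRPC plumbing lives in
        # master.py/worker.py and is not shown). Once the result comes back,
        # feed it to the scheduler so its moving averages update:
        scheduler.record_completion(worker.worker_id, task, duration=12.0, success=True)

    print(scheduler.get_statistics())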
morphml/distributed/storage/__init__.py

@@ -0,0 +1,33 @@
"""Distributed storage backends for MorphML.

Provides persistent storage for:
- Experiment results (PostgreSQL)
- Fast caching (Redis)
- Model artifacts (S3/MinIO)
- Checkpoints (combined storage)

Author: Eshan Roy <eshanized@proton.me>
Organization: TONMOY INFRASTRUCTURE & VISION
"""

from morphml.distributed.storage.artifacts import ArtifactStore
from morphml.distributed.storage.cache import DistributedCache
from morphml.distributed.storage.checkpointing import CheckpointManager
from morphml.distributed.storage.database import (
    Architecture,
    DatabaseManager,
    Experiment,
)

__all__ = [
    # Database
    "DatabaseManager",
    "Experiment",
    "Architecture",
    # Cache
    "DistributedCache",
    # Artifacts
    "ArtifactStore",
    # Checkpointing
    "CheckpointManager",
]
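Again for orientation only (not part of the wheel): the re-exports above let callers import the storage backends from the subpackage root rather than from the individual submodules. Constructor arguments are not shown in this diff, so only the imports are sketched.

# Names re-exported by morphml.distributed.storage, per the __all__ above.
from morphml.distributed.storage import (
    Architecture,
    ArtifactStore,
    CheckpointManager,
    DatabaseManager,
    DistributedCache,
    Experiment,
)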