morphml 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of morphml might be problematic. Click here for more details.
- morphml/__init__.py +14 -0
- morphml/api/__init__.py +26 -0
- morphml/api/app.py +326 -0
- morphml/api/auth.py +193 -0
- morphml/api/client.py +338 -0
- morphml/api/models.py +132 -0
- morphml/api/rate_limit.py +192 -0
- morphml/benchmarking/__init__.py +36 -0
- morphml/benchmarking/comparison.py +430 -0
- morphml/benchmarks/__init__.py +56 -0
- morphml/benchmarks/comparator.py +409 -0
- morphml/benchmarks/datasets.py +280 -0
- morphml/benchmarks/metrics.py +199 -0
- morphml/benchmarks/openml_suite.py +201 -0
- morphml/benchmarks/problems.py +289 -0
- morphml/benchmarks/suite.py +318 -0
- morphml/cli/__init__.py +5 -0
- morphml/cli/commands/experiment.py +329 -0
- morphml/cli/main.py +457 -0
- morphml/cli/quickstart.py +312 -0
- morphml/config.py +278 -0
- morphml/constraints/__init__.py +19 -0
- morphml/constraints/handler.py +205 -0
- morphml/constraints/predicates.py +285 -0
- morphml/core/__init__.py +3 -0
- morphml/core/crossover.py +449 -0
- morphml/core/dsl/README.md +359 -0
- morphml/core/dsl/__init__.py +72 -0
- morphml/core/dsl/ast_nodes.py +364 -0
- morphml/core/dsl/compiler.py +318 -0
- morphml/core/dsl/layers.py +368 -0
- morphml/core/dsl/lexer.py +336 -0
- morphml/core/dsl/parser.py +455 -0
- morphml/core/dsl/search_space.py +386 -0
- morphml/core/dsl/syntax.py +199 -0
- morphml/core/dsl/type_system.py +361 -0
- morphml/core/dsl/validator.py +386 -0
- morphml/core/graph/__init__.py +40 -0
- morphml/core/graph/edge.py +124 -0
- morphml/core/graph/graph.py +507 -0
- morphml/core/graph/mutations.py +409 -0
- morphml/core/graph/node.py +196 -0
- morphml/core/graph/serialization.py +361 -0
- morphml/core/graph/visualization.py +431 -0
- morphml/core/objectives/__init__.py +20 -0
- morphml/core/search/__init__.py +33 -0
- morphml/core/search/individual.py +252 -0
- morphml/core/search/parameters.py +453 -0
- morphml/core/search/population.py +375 -0
- morphml/core/search/search_engine.py +340 -0
- morphml/distributed/__init__.py +76 -0
- morphml/distributed/fault_tolerance.py +497 -0
- morphml/distributed/health_monitor.py +348 -0
- morphml/distributed/master.py +709 -0
- morphml/distributed/proto/README.md +224 -0
- morphml/distributed/proto/__init__.py +74 -0
- morphml/distributed/proto/worker.proto +170 -0
- morphml/distributed/proto/worker_pb2.py +79 -0
- morphml/distributed/proto/worker_pb2_grpc.py +423 -0
- morphml/distributed/resource_manager.py +416 -0
- morphml/distributed/scheduler.py +567 -0
- morphml/distributed/storage/__init__.py +33 -0
- morphml/distributed/storage/artifacts.py +381 -0
- morphml/distributed/storage/cache.py +366 -0
- morphml/distributed/storage/checkpointing.py +329 -0
- morphml/distributed/storage/database.py +459 -0
- morphml/distributed/worker.py +549 -0
- morphml/evaluation/__init__.py +5 -0
- morphml/evaluation/heuristic.py +237 -0
- morphml/exceptions.py +55 -0
- morphml/execution/__init__.py +5 -0
- morphml/execution/local_executor.py +350 -0
- morphml/integrations/__init__.py +28 -0
- morphml/integrations/jax_adapter.py +206 -0
- morphml/integrations/pytorch_adapter.py +530 -0
- morphml/integrations/sklearn_adapter.py +206 -0
- morphml/integrations/tensorflow_adapter.py +230 -0
- morphml/logging_config.py +93 -0
- morphml/meta_learning/__init__.py +66 -0
- morphml/meta_learning/architecture_similarity.py +277 -0
- morphml/meta_learning/experiment_database.py +240 -0
- morphml/meta_learning/knowledge_base/__init__.py +19 -0
- morphml/meta_learning/knowledge_base/embedder.py +179 -0
- morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
- morphml/meta_learning/knowledge_base/meta_features.py +265 -0
- morphml/meta_learning/knowledge_base/vector_store.py +271 -0
- morphml/meta_learning/predictors/__init__.py +27 -0
- morphml/meta_learning/predictors/ensemble.py +221 -0
- morphml/meta_learning/predictors/gnn_predictor.py +552 -0
- morphml/meta_learning/predictors/learning_curve.py +231 -0
- morphml/meta_learning/predictors/proxy_metrics.py +261 -0
- morphml/meta_learning/strategy_evolution/__init__.py +27 -0
- morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
- morphml/meta_learning/strategy_evolution/bandit.py +276 -0
- morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
- morphml/meta_learning/transfer.py +581 -0
- morphml/meta_learning/warm_start.py +286 -0
- morphml/optimizers/__init__.py +74 -0
- morphml/optimizers/adaptive_operators.py +399 -0
- morphml/optimizers/bayesian/__init__.py +52 -0
- morphml/optimizers/bayesian/acquisition.py +387 -0
- morphml/optimizers/bayesian/base.py +319 -0
- morphml/optimizers/bayesian/gaussian_process.py +635 -0
- morphml/optimizers/bayesian/smac.py +534 -0
- morphml/optimizers/bayesian/tpe.py +411 -0
- morphml/optimizers/differential_evolution.py +220 -0
- morphml/optimizers/evolutionary/__init__.py +61 -0
- morphml/optimizers/evolutionary/cma_es.py +416 -0
- morphml/optimizers/evolutionary/differential_evolution.py +556 -0
- morphml/optimizers/evolutionary/encoding.py +426 -0
- morphml/optimizers/evolutionary/particle_swarm.py +449 -0
- morphml/optimizers/genetic_algorithm.py +486 -0
- morphml/optimizers/gradient_based/__init__.py +22 -0
- morphml/optimizers/gradient_based/darts.py +550 -0
- morphml/optimizers/gradient_based/enas.py +585 -0
- morphml/optimizers/gradient_based/operations.py +474 -0
- morphml/optimizers/gradient_based/utils.py +601 -0
- morphml/optimizers/hill_climbing.py +169 -0
- morphml/optimizers/multi_objective/__init__.py +56 -0
- morphml/optimizers/multi_objective/indicators.py +504 -0
- morphml/optimizers/multi_objective/nsga2.py +647 -0
- morphml/optimizers/multi_objective/visualization.py +427 -0
- morphml/optimizers/nsga2.py +308 -0
- morphml/optimizers/random_search.py +172 -0
- morphml/optimizers/simulated_annealing.py +181 -0
- morphml/plugins/__init__.py +35 -0
- morphml/plugins/custom_evaluator_example.py +81 -0
- morphml/plugins/custom_optimizer_example.py +63 -0
- morphml/plugins/plugin_system.py +454 -0
- morphml/reports/__init__.py +30 -0
- morphml/reports/generator.py +362 -0
- morphml/tracking/__init__.py +7 -0
- morphml/tracking/experiment.py +309 -0
- morphml/tracking/logger.py +301 -0
- morphml/tracking/reporter.py +357 -0
- morphml/utils/__init__.py +6 -0
- morphml/utils/checkpoint.py +189 -0
- morphml/utils/comparison.py +390 -0
- morphml/utils/export.py +407 -0
- morphml/utils/progress.py +392 -0
- morphml/utils/validation.py +392 -0
- morphml/version.py +7 -0
- morphml/visualization/__init__.py +50 -0
- morphml/visualization/analytics.py +423 -0
- morphml/visualization/architecture_diagrams.py +353 -0
- morphml/visualization/architecture_plot.py +223 -0
- morphml/visualization/convergence_plot.py +174 -0
- morphml/visualization/crossover_viz.py +386 -0
- morphml/visualization/graph_viz.py +338 -0
- morphml/visualization/pareto_plot.py +149 -0
- morphml/visualization/plotly_dashboards.py +422 -0
- morphml/visualization/population.py +309 -0
- morphml/visualization/progress.py +260 -0
- morphml-1.0.0.dist-info/METADATA +434 -0
- morphml-1.0.0.dist-info/RECORD +158 -0
- morphml-1.0.0.dist-info/WHEEL +4 -0
- morphml-1.0.0.dist-info/entry_points.txt +3 -0
- morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,549 @@
|
|
|
1
|
+
"""Worker node for distributed architecture evaluation.
|
|
2
|
+
|
|
3
|
+
Workers execute architecture evaluation tasks assigned by the master node.
|
|
4
|
+
|
|
5
|
+
Author: Eshan Roy <eshanized@proton.me>
|
|
6
|
+
Organization: TONMOY INFRASTRUCTURE & VISION
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import socket
|
|
10
|
+
import threading
|
|
11
|
+
import time
|
|
12
|
+
import uuid
|
|
13
|
+
from concurrent import futures
|
|
14
|
+
from typing import Any, Dict, Optional
|
|
15
|
+
|
|
16
|
+
try:
    import grpc

    from morphml.distributed.proto import worker_pb2, worker_pb2_grpc

    GRPC_AVAILABLE = True
except ImportError:
    GRPC_AVAILABLE = False

    # Create stub modules when grpc is not available
    class _StubModule:
        """Placeholder bound to the gRPC names when the imports above fail.

        Any ordinary attribute access raises ``ImportError`` with install
        instructions, so the failure surfaces at the point of use rather
        than at import time of this module.
        """

        def __getattr__(self, name):
            # Dunder lookups come from protocol probing (hasattr, copy,
            # inspect, ...). Those callers expect AttributeError to mean
            # "not present"; raising ImportError there would crash them.
            if name.startswith("__") and name.endswith("__"):
                raise AttributeError(name)
            raise ImportError(
                "grpc is not installed. Install with: pip install grpcio grpcio-tools"
            )

    worker_pb2 = _StubModule()
    worker_pb2_grpc = _StubModule()
    grpc = _StubModule()
|
|
35
|
+
|
|
36
|
+
from morphml.core.graph import ModelGraph
|
|
37
|
+
from morphml.exceptions import DistributedError
|
|
38
|
+
from morphml.logging_config import get_logger
|
|
39
|
+
|
|
40
|
+
logger = get_logger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class WorkerNode:
    """
    Worker node for distributed architecture evaluation.

    Responsibilities:
    1. Register with master node
    2. Receive evaluation tasks
    3. Train and evaluate architectures
    4. Send results back to master
    5. Send periodic heartbeat
    6. Handle graceful shutdown

    Args:
        config: Worker configuration
            - worker_id: Unique worker ID (generated if not provided)
            - master_host: Master node hostname/IP (required)
            - master_port: Master node port (default: 50051)
            - port: Worker port (default: 50052)
            - num_gpus: Number of GPUs available (default: 1)
            - gpu_ids: Specific GPU IDs to use (default: range(num_gpus))
            - heartbeat_interval: Heartbeat interval (seconds, default: 10)
            - evaluator: Custom evaluation function

    Example:
        >>> worker = WorkerNode({
        ...     'master_host': 'localhost',
        ...     'master_port': 50051,
        ...     'port': 50052,
        ...     'num_gpus': 1,
        ...     'evaluator': my_eval_function
        ... })
        >>> worker.start()
        >>> # Worker runs until stopped
        >>> worker.stop()
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize worker node.

        Args:
            config: Worker configuration mapping (keys listed on the class).

        Raises:
            DistributedError: If the gRPC imports failed at module load.
            KeyError: If ``config`` has no ``master_host`` entry.
        """
        if not GRPC_AVAILABLE:
            raise DistributedError(
                "gRPC not available. Install with: pip install grpcio grpcio-tools"
            )

        self.config = config

        # Worker identification
        self.worker_id = config.get("worker_id", str(uuid.uuid4()))
        self.master_host = config["master_host"]
        self.master_port = config.get("master_port", 50051)
        self.port = config.get("port", 50052)

        # GPU configuration
        self.num_gpus = config.get("num_gpus", 1)
        self.gpu_ids = config.get("gpu_ids", list(range(self.num_gpus)))

        # Evaluation configuration
        self.evaluator = config.get("evaluator")
        self.heartbeat_interval = config.get("heartbeat_interval", 10)

        # State
        self.running = False
        self.current_task_id: Optional[str] = None
        self.tasks_completed = 0
        self.tasks_failed = 0
        self.start_time = time.time()

        # gRPC server
        self.server: Optional[grpc.Server] = None

        logger.info(
            f"Initialized WorkerNode (id={self.worker_id[:12]}) " f"with {self.num_gpus} GPU(s)"
        )

    def start(self) -> None:
        """Start worker node.

        Brings up the worker's gRPC server, registers with the master and
        starts the heartbeat thread.

        Raises:
            DistributedError: If the worker port cannot be bound or the
                registration with the master fails.
        """
        logger.info(f"Starting worker {self.worker_id[:12]} on port {self.port}")

        # Build the gRPC server
        server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=5),
            options=[
                ("grpc.max_send_message_length", 100 * 1024 * 1024),
                ("grpc.max_receive_message_length", 100 * 1024 * 1024),
            ],
        )

        # Add servicer
        worker_pb2_grpc.add_WorkerServiceServicer_to_server(WorkerServicer(self), server)

        # A returned port of 0 means the bind did not succeed.
        bound_port = server.add_insecure_port(f"[::]:{self.port}")
        if not bound_port:
            raise DistributedError(f"Could not bind worker port {self.port}")

        # Listen *before* registering: once the master knows this worker it
        # may dispatch a task right away, and the server must be able to
        # accept it.
        server.start()
        self.server = server

        try:
            self._register_with_master()
        except Exception:
            # Do not leave a listening server behind for a worker the master
            # never accepted.
            server.stop(grace=None)
            self.server = None
            raise

        self.running = True
        self.start_time = time.time()

        # Start heartbeat
        self._start_heartbeat()

        logger.info(f"Worker {self.worker_id[:12]} started successfully")

    def stop(self) -> None:
        """Stop worker gracefully.

        Clears ``running`` (which ends the heartbeat loop on its next check)
        and stops the gRPC server with a 5 second grace period.
        """
        logger.info(f"Stopping worker {self.worker_id[:12]}")
        self.running = False

        if self.server:
            self.server.stop(grace=5)

        logger.info(f"Worker {self.worker_id[:12]} stopped")

    def wait_for_shutdown(self) -> None:
        """Block until worker is stopped (no-op if the server never started)."""
        if self.server:
            self.server.wait_for_termination()

    def evaluate_architecture(
        self,
        architecture: "ModelGraph",
        config: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Evaluate architecture.

        Args:
            architecture: ModelGraph to evaluate
            config: Evaluation configuration (epochs, batch_size, etc.).
                Only forwarded to the built-in heuristic path; a custom
                evaluator is called with the architecture alone.

        Returns:
            Dictionary with metrics (accuracy, params, latency, etc.) plus
            the metadata keys ``worker_id``, ``evaluation_time`` and
            ``gpu_id`` (``-1`` when no GPU ids are configured).

            NOTE(review): ``worker_id`` is a string, so the mapping is not
            purely numeric — confirm the protobuf ``metrics`` field used
            when reporting results accepts it.

        Raises:
            Exception: Whatever the evaluator raises; it is logged and
                re-raised unchanged.

        Example:
            >>> result = worker.evaluate_architecture(graph)
            >>> print(result['val_accuracy'])
        """
        logger.info(f"Evaluating architecture on worker {self.worker_id[:12]}")

        start_time = time.time()

        try:
            # Use custom evaluator if provided
            if self.evaluator:
                result = self.evaluator(architecture)

                if isinstance(result, dict):
                    # Work on a copy so the metadata added below does not
                    # leak into a dict the evaluator may still hold on to.
                    result = dict(result)
                else:
                    # Ensure result is a dictionary
                    result = {"fitness": float(result)}

            else:
                # Default evaluation (heuristic)
                result = self._default_evaluation(architecture, config or {})

            # Add metadata
            result["worker_id"] = self.worker_id
            result["evaluation_time"] = time.time() - start_time
            result["gpu_id"] = self.gpu_ids[0] if self.gpu_ids else -1

            logger.debug(
                f"Evaluation complete in {result['evaluation_time']:.2f}s: "
                f"fitness={result.get('fitness', 'N/A')}"
            )

            return result

        except Exception as e:
            logger.error(f"Evaluation failed: {e}")
            raise

    def _default_evaluation(
        self, architecture: "ModelGraph", config: Dict[str, Any]
    ) -> Dict[str, float]:
        """
        Default heuristic evaluation when no evaluator provided.

        This is a fast proxy evaluation based on architecture properties.
        For actual training, provide a custom evaluator.

        Args:
            architecture: Graph to score.
            config: Accepted for signature symmetry; not read here.

        Returns:
            ``fitness`` and ``val_accuracy`` (both the heuristic score),
            ``params`` and ``depth``.
        """
        # Imported lazily, as in the rest of this module's optional paths.
        from morphml.evaluation import HeuristicEvaluator

        evaluator = HeuristicEvaluator()
        fitness = evaluator(architecture)

        # Estimate other metrics
        params = architecture.estimate_parameters()
        # "depth" is the number of items yielded by the topological sort.
        depth = len(list(architecture.topological_sort()))

        return {
            "fitness": fitness,
            "val_accuracy": fitness,
            "params": params,
            "depth": depth,
        }

    def _register_with_master(self) -> None:
        """Register with master node, retrying on RPC errors.

        Makes up to 10 attempts, 2 seconds apart.

        Raises:
            DistributedError: If the master rejects the registration, or if
                every attempt fails with an RPC error (the last RPC error is
                attached as the cause).
        """
        logger.info(f"Registering with master at {self.master_host}:{self.master_port}")

        max_retries = 10
        retry_delay = 2

        for attempt in range(max_retries):
            try:
                # A fresh channel per attempt forces a new connection try;
                # the ``with`` block closes it so failed attempts do not
                # accumulate open channels.
                with grpc.insecure_channel(f"{self.master_host}:{self.master_port}") as channel:
                    stub = worker_pb2_grpc.MasterServiceStub(channel)

                    request = worker_pb2.RegisterRequest(
                        worker_id=self.worker_id,
                        host=socket.gethostname(),
                        port=self.port,
                        num_gpus=self.num_gpus,
                        gpu_ids=self.gpu_ids,
                    )

                    response = stub.RegisterWorker(request, timeout=10)

                if response.success:
                    logger.info(f"Successfully registered with master (id={response.master_id})")
                    return

                # An explicit rejection is not retried.
                raise DistributedError(f"Registration failed: {response.message}")

            except grpc.RpcError as e:
                if attempt < max_retries - 1:
                    logger.warning(
                        f"Registration attempt {attempt + 1}/{max_retries} failed: {e}. "
                        f"Retrying in {retry_delay}s..."
                    )
                    time.sleep(retry_delay)
                else:
                    raise DistributedError(
                        f"Failed to register with master after {max_retries} attempts"
                    ) from e

    def _start_heartbeat(self) -> None:
        """Start periodic heartbeat to master in a daemon thread.

        The loop runs while ``running`` is true. If the master answers with
        ``should_continue`` unset, the worker is stopped.
        """

        def heartbeat_loop() -> None:
            while self.running:
                try:
                    # Channel is closed at the end of every beat instead of
                    # leaking one per interval.
                    with grpc.insecure_channel(
                        f"{self.master_host}:{self.master_port}"
                    ) as channel:
                        stub = worker_pb2_grpc.MasterServiceStub(channel)

                        # Determine status
                        status = "busy" if self.current_task_id else "idle"

                        # Create metrics
                        metrics = worker_pb2.WorkerMetrics(
                            cpu_usage=self._get_cpu_usage(),
                            memory_usage=self._get_memory_usage(),
                            gpu_usage=self._get_gpu_usage(),
                            gpu_memory=self._get_gpu_memory(),
                            tasks_completed=self.tasks_completed,
                            tasks_failed=self.tasks_failed,
                        )

                        request = worker_pb2.HeartbeatRequest(
                            worker_id=self.worker_id,
                            status=status,
                            current_task_id=self.current_task_id or "",
                            metrics=metrics,
                        )

                        response = stub.Heartbeat(request, timeout=5)

                    # Check if master wants us to shutdown
                    if not response.should_continue:
                        logger.info("Master requested shutdown")
                        # Stop the server too, so wait_for_shutdown() returns
                        # instead of blocking on a worker that no longer
                        # heartbeats.
                        self.stop()
                        break

                except grpc.RpcError as e:
                    logger.error(f"Heartbeat failed: {e}")

                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")

                time.sleep(self.heartbeat_interval)

        thread = threading.Thread(target=heartbeat_loop, daemon=True, name="HeartbeatThread")
        thread.start()
        logger.debug("Heartbeat thread started")

    def _submit_result(
        self,
        task_id: str,
        success: bool,
        metrics: Dict[str, float],
        error: str = "",
        duration: float = 0.0,
    ) -> None:
        """Submit task result to master.

        RPC errors are logged and swallowed (best effort); a missing
        acknowledgement is logged as a warning.

        Args:
            task_id: Task the result belongs to.
            success: Whether the evaluation succeeded.
            metrics: Metric mapping to report.
            error: Error text for failed tasks.
            duration: Task duration in seconds.
        """
        try:
            # Channel is closed as soon as the call completes.
            with grpc.insecure_channel(f"{self.master_host}:{self.master_port}") as channel:
                stub = worker_pb2_grpc.MasterServiceStub(channel)

                request = worker_pb2.ResultRequest(
                    task_id=task_id,
                    worker_id=self.worker_id,
                    success=success,
                    metrics=metrics,
                    error=error,
                    duration=duration,
                )

                response = stub.SubmitResult(request, timeout=10)

            if not response.acknowledged:
                logger.warning(f"Master did not acknowledge result for task {task_id}")

        except grpc.RpcError as e:
            logger.error(f"Failed to submit result for task {task_id}: {e}")

    def _get_cpu_usage(self) -> float:
        """Get CPU usage percentage (0.0 if psutil is unavailable or fails)."""
        try:
            import psutil

            return psutil.cpu_percent(interval=0.1)
        except Exception:
            return 0.0

    def _get_memory_usage(self) -> float:
        """Get memory usage percentage (0.0 if psutil is unavailable or fails)."""
        try:
            import psutil

            return psutil.virtual_memory().percent
        except Exception:
            return 0.0

    def _read_gpu_metric(self, reader) -> float:
        """Run ``reader(pynvml, handle)`` for the first configured GPU.

        Returns 0.0 when no GPU id is configured, when pynvml is missing,
        or when any NVML call fails (best-effort monitoring).
        """
        if not self.gpu_ids:
            return 0.0
        try:
            import pynvml

            pynvml.nvmlInit()
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_ids[0])
                return float(reader(pynvml, handle))
            finally:
                # Pair every nvmlInit with a shutdown so repeated polling
                # does not leave NVML initialised.
                pynvml.nvmlShutdown()
        except Exception:
            return 0.0

    def _get_gpu_usage(self) -> float:
        """Get GPU usage percentage of the first configured GPU."""
        return self._read_gpu_metric(
            lambda nvml, handle: nvml.nvmlDeviceGetUtilizationRates(handle).gpu
        )

    def _get_gpu_memory(self) -> float:
        """Get GPU memory usage percentage of the first configured GPU."""

        def used_percent(nvml, handle) -> float:
            mem_info = nvml.nvmlDeviceGetMemoryInfo(handle)
            return (mem_info.used / mem_info.total) * 100

        return self._read_gpu_metric(used_percent)

    def get_status(self) -> Dict[str, Any]:
        """Get worker status.

        Returns:
            Mapping with the worker id, ``"busy"``/``"idle"`` status, current
            task id, task counters, uptime in seconds since ``start_time``,
            and the CPU, memory and GPU usage readings.
        """
        return {
            "worker_id": self.worker_id,
            "status": "busy" if self.current_task_id else "idle",
            "current_task": self.current_task_id,
            "tasks_completed": self.tasks_completed,
            "tasks_failed": self.tasks_failed,
            "uptime_seconds": time.time() - self.start_time,
            "cpu_usage": self._get_cpu_usage(),
            "memory_usage": self._get_memory_usage(),
            "gpu_usage": self._get_gpu_usage(),
            "gpu_memory": self._get_gpu_memory(),
        }
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
if GRPC_AVAILABLE:

    class WorkerServicer(worker_pb2_grpc.WorkerServiceServicer):
        """gRPC servicer for worker node.

        Thin RPC front-end that delegates to the wrapped ``WorkerNode`` and
        keeps its task counters and ``current_task_id`` up to date.
        """

        def __init__(self, worker: WorkerNode):
            """Initialize servicer with the worker it serves."""
            self.worker = worker

        def _release_task(self, task_id: str) -> None:
            """Mark the worker idle, but only if it still owns ``task_id``.

            The slot may already have been cleared by ``CancelTask`` or
            taken over by another ``Evaluate`` call; in that case it must
            not be touched.
            """
            if self.worker.current_task_id == task_id:
                self.worker.current_task_id = None

        def Evaluate(
            self, request: worker_pb2.EvaluateRequest, context: grpc.ServicerContext
        ) -> worker_pb2.EvaluateResponse:
            """Handle evaluation task.

            Deserializes the architecture, evaluates it, reports the result
            to the master and returns the same outcome to the caller. Any
            exception is converted into a ``success=False`` response whose
            ``error`` carries ``str(exception)``.
            """
            task_id = request.task_id

            logger.info(f"Received evaluation task: {task_id}")

            self.worker.current_task_id = task_id
            start_time = time.time()

            try:
                # Deserialize architecture
                architecture = ModelGraph.from_json(request.architecture)

                # Evaluate
                result = self.worker.evaluate_architecture(architecture)

                duration = time.time() - start_time

                self._release_task(task_id)

                # Submit result to master
                self.worker._submit_result(
                    task_id=task_id,
                    success=True,
                    metrics=result,
                    duration=duration,
                )

                response = worker_pb2.EvaluateResponse(
                    task_id=task_id,
                    success=True,
                    metrics=result,
                    error="",
                    duration=duration,
                )

            except Exception as e:
                duration = time.time() - start_time
                error_msg = str(e)

                logger.error(f"Evaluation failed for task {task_id}: {error_msg}")

                # Update stats
                self.worker.tasks_failed += 1
                self._release_task(task_id)

                # Submit failure to master
                self.worker._submit_result(
                    task_id=task_id,
                    success=False,
                    metrics={},
                    error=error_msg,
                    duration=duration,
                )

                return worker_pb2.EvaluateResponse(
                    task_id=task_id,
                    success=False,
                    metrics={},
                    error=error_msg,
                    duration=duration,
                )

            else:
                # Count the success only after reporting worked, so a task
                # whose reporting raises is counted as failed exactly once
                # rather than as both completed and failed.
                self.worker.tasks_completed += 1
                return response

        def GetStatus(
            self, request: worker_pb2.StatusRequest, context: grpc.ServicerContext
        ) -> worker_pb2.StatusResponse:
            """Handle status request by packaging ``WorkerNode.get_status()``."""
            status = self.worker.get_status()

            metrics = worker_pb2.WorkerMetrics(
                cpu_usage=status["cpu_usage"],
                memory_usage=status["memory_usage"],
                gpu_usage=status["gpu_usage"],
                gpu_memory=status["gpu_memory"],
                tasks_completed=status["tasks_completed"],
                tasks_failed=status["tasks_failed"],
            )

            return worker_pb2.StatusResponse(
                status=status["status"],
                # Proto string field: send "" when no task is running.
                current_task_id=status["current_task"] or "",
                metrics=metrics,
                uptime_seconds=int(status["uptime_seconds"]),
            )

        def Shutdown(
            self, request: worker_pb2.ShutdownRequest, context: grpc.ServicerContext
        ) -> worker_pb2.ShutdownResponse:
            """Handle shutdown request.

            For a graceful shutdown this polls once per second until the
            worker has no current task, then stops the worker.
            NOTE(review): the wait has no upper bound — confirm that is
            intended for tasks that never finish.
            """
            logger.info(
                f"Shutdown requested (graceful={request.graceful}) for worker {request.worker_id}"
            )

            if request.graceful:
                # Wait for current task to finish
                while self.worker.current_task_id:
                    time.sleep(1)

            # Stop worker
            self.worker.stop()

            return worker_pb2.ShutdownResponse(acknowledged=True)

        def CancelTask(
            self, request: worker_pb2.CancelRequest, context: grpc.ServicerContext
        ) -> worker_pb2.CancelResponse:
            """Handle task cancellation.

            Only the bookkeeping is cancelled: the task id is cleared, the
            evaluation itself is not interrupted.
            """
            task_id = request.task_id

            if self.worker.current_task_id == task_id:
                logger.warning(f"Cancelling task {task_id}")
                # Note: Actual cancellation would require more complex logic
                # For now, just clear the task ID
                self.worker.current_task_id = None

                return worker_pb2.CancelResponse(success=True, message="Task cancelled")
            else:
                return worker_pb2.CancelResponse(
                    success=False, message=f"Task {task_id} not running on this worker"
                )
|