morphml 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of morphml might be problematic. Click here for more details.

Files changed (158) hide show
  1. morphml/__init__.py +14 -0
  2. morphml/api/__init__.py +26 -0
  3. morphml/api/app.py +326 -0
  4. morphml/api/auth.py +193 -0
  5. morphml/api/client.py +338 -0
  6. morphml/api/models.py +132 -0
  7. morphml/api/rate_limit.py +192 -0
  8. morphml/benchmarking/__init__.py +36 -0
  9. morphml/benchmarking/comparison.py +430 -0
  10. morphml/benchmarks/__init__.py +56 -0
  11. morphml/benchmarks/comparator.py +409 -0
  12. morphml/benchmarks/datasets.py +280 -0
  13. morphml/benchmarks/metrics.py +199 -0
  14. morphml/benchmarks/openml_suite.py +201 -0
  15. morphml/benchmarks/problems.py +289 -0
  16. morphml/benchmarks/suite.py +318 -0
  17. morphml/cli/__init__.py +5 -0
  18. morphml/cli/commands/experiment.py +329 -0
  19. morphml/cli/main.py +457 -0
  20. morphml/cli/quickstart.py +312 -0
  21. morphml/config.py +278 -0
  22. morphml/constraints/__init__.py +19 -0
  23. morphml/constraints/handler.py +205 -0
  24. morphml/constraints/predicates.py +285 -0
  25. morphml/core/__init__.py +3 -0
  26. morphml/core/crossover.py +449 -0
  27. morphml/core/dsl/README.md +359 -0
  28. morphml/core/dsl/__init__.py +72 -0
  29. morphml/core/dsl/ast_nodes.py +364 -0
  30. morphml/core/dsl/compiler.py +318 -0
  31. morphml/core/dsl/layers.py +368 -0
  32. morphml/core/dsl/lexer.py +336 -0
  33. morphml/core/dsl/parser.py +455 -0
  34. morphml/core/dsl/search_space.py +386 -0
  35. morphml/core/dsl/syntax.py +199 -0
  36. morphml/core/dsl/type_system.py +361 -0
  37. morphml/core/dsl/validator.py +386 -0
  38. morphml/core/graph/__init__.py +40 -0
  39. morphml/core/graph/edge.py +124 -0
  40. morphml/core/graph/graph.py +507 -0
  41. morphml/core/graph/mutations.py +409 -0
  42. morphml/core/graph/node.py +196 -0
  43. morphml/core/graph/serialization.py +361 -0
  44. morphml/core/graph/visualization.py +431 -0
  45. morphml/core/objectives/__init__.py +20 -0
  46. morphml/core/search/__init__.py +33 -0
  47. morphml/core/search/individual.py +252 -0
  48. morphml/core/search/parameters.py +453 -0
  49. morphml/core/search/population.py +375 -0
  50. morphml/core/search/search_engine.py +340 -0
  51. morphml/distributed/__init__.py +76 -0
  52. morphml/distributed/fault_tolerance.py +497 -0
  53. morphml/distributed/health_monitor.py +348 -0
  54. morphml/distributed/master.py +709 -0
  55. morphml/distributed/proto/README.md +224 -0
  56. morphml/distributed/proto/__init__.py +74 -0
  57. morphml/distributed/proto/worker.proto +170 -0
  58. morphml/distributed/proto/worker_pb2.py +79 -0
  59. morphml/distributed/proto/worker_pb2_grpc.py +423 -0
  60. morphml/distributed/resource_manager.py +416 -0
  61. morphml/distributed/scheduler.py +567 -0
  62. morphml/distributed/storage/__init__.py +33 -0
  63. morphml/distributed/storage/artifacts.py +381 -0
  64. morphml/distributed/storage/cache.py +366 -0
  65. morphml/distributed/storage/checkpointing.py +329 -0
  66. morphml/distributed/storage/database.py +459 -0
  67. morphml/distributed/worker.py +549 -0
  68. morphml/evaluation/__init__.py +5 -0
  69. morphml/evaluation/heuristic.py +237 -0
  70. morphml/exceptions.py +55 -0
  71. morphml/execution/__init__.py +5 -0
  72. morphml/execution/local_executor.py +350 -0
  73. morphml/integrations/__init__.py +28 -0
  74. morphml/integrations/jax_adapter.py +206 -0
  75. morphml/integrations/pytorch_adapter.py +530 -0
  76. morphml/integrations/sklearn_adapter.py +206 -0
  77. morphml/integrations/tensorflow_adapter.py +230 -0
  78. morphml/logging_config.py +93 -0
  79. morphml/meta_learning/__init__.py +66 -0
  80. morphml/meta_learning/architecture_similarity.py +277 -0
  81. morphml/meta_learning/experiment_database.py +240 -0
  82. morphml/meta_learning/knowledge_base/__init__.py +19 -0
  83. morphml/meta_learning/knowledge_base/embedder.py +179 -0
  84. morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
  85. morphml/meta_learning/knowledge_base/meta_features.py +265 -0
  86. morphml/meta_learning/knowledge_base/vector_store.py +271 -0
  87. morphml/meta_learning/predictors/__init__.py +27 -0
  88. morphml/meta_learning/predictors/ensemble.py +221 -0
  89. morphml/meta_learning/predictors/gnn_predictor.py +552 -0
  90. morphml/meta_learning/predictors/learning_curve.py +231 -0
  91. morphml/meta_learning/predictors/proxy_metrics.py +261 -0
  92. morphml/meta_learning/strategy_evolution/__init__.py +27 -0
  93. morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
  94. morphml/meta_learning/strategy_evolution/bandit.py +276 -0
  95. morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
  96. morphml/meta_learning/transfer.py +581 -0
  97. morphml/meta_learning/warm_start.py +286 -0
  98. morphml/optimizers/__init__.py +74 -0
  99. morphml/optimizers/adaptive_operators.py +399 -0
  100. morphml/optimizers/bayesian/__init__.py +52 -0
  101. morphml/optimizers/bayesian/acquisition.py +387 -0
  102. morphml/optimizers/bayesian/base.py +319 -0
  103. morphml/optimizers/bayesian/gaussian_process.py +635 -0
  104. morphml/optimizers/bayesian/smac.py +534 -0
  105. morphml/optimizers/bayesian/tpe.py +411 -0
  106. morphml/optimizers/differential_evolution.py +220 -0
  107. morphml/optimizers/evolutionary/__init__.py +61 -0
  108. morphml/optimizers/evolutionary/cma_es.py +416 -0
  109. morphml/optimizers/evolutionary/differential_evolution.py +556 -0
  110. morphml/optimizers/evolutionary/encoding.py +426 -0
  111. morphml/optimizers/evolutionary/particle_swarm.py +449 -0
  112. morphml/optimizers/genetic_algorithm.py +486 -0
  113. morphml/optimizers/gradient_based/__init__.py +22 -0
  114. morphml/optimizers/gradient_based/darts.py +550 -0
  115. morphml/optimizers/gradient_based/enas.py +585 -0
  116. morphml/optimizers/gradient_based/operations.py +474 -0
  117. morphml/optimizers/gradient_based/utils.py +601 -0
  118. morphml/optimizers/hill_climbing.py +169 -0
  119. morphml/optimizers/multi_objective/__init__.py +56 -0
  120. morphml/optimizers/multi_objective/indicators.py +504 -0
  121. morphml/optimizers/multi_objective/nsga2.py +647 -0
  122. morphml/optimizers/multi_objective/visualization.py +427 -0
  123. morphml/optimizers/nsga2.py +308 -0
  124. morphml/optimizers/random_search.py +172 -0
  125. morphml/optimizers/simulated_annealing.py +181 -0
  126. morphml/plugins/__init__.py +35 -0
  127. morphml/plugins/custom_evaluator_example.py +81 -0
  128. morphml/plugins/custom_optimizer_example.py +63 -0
  129. morphml/plugins/plugin_system.py +454 -0
  130. morphml/reports/__init__.py +30 -0
  131. morphml/reports/generator.py +362 -0
  132. morphml/tracking/__init__.py +7 -0
  133. morphml/tracking/experiment.py +309 -0
  134. morphml/tracking/logger.py +301 -0
  135. morphml/tracking/reporter.py +357 -0
  136. morphml/utils/__init__.py +6 -0
  137. morphml/utils/checkpoint.py +189 -0
  138. morphml/utils/comparison.py +390 -0
  139. morphml/utils/export.py +407 -0
  140. morphml/utils/progress.py +392 -0
  141. morphml/utils/validation.py +392 -0
  142. morphml/version.py +7 -0
  143. morphml/visualization/__init__.py +50 -0
  144. morphml/visualization/analytics.py +423 -0
  145. morphml/visualization/architecture_diagrams.py +353 -0
  146. morphml/visualization/architecture_plot.py +223 -0
  147. morphml/visualization/convergence_plot.py +174 -0
  148. morphml/visualization/crossover_viz.py +386 -0
  149. morphml/visualization/graph_viz.py +338 -0
  150. morphml/visualization/pareto_plot.py +149 -0
  151. morphml/visualization/plotly_dashboards.py +422 -0
  152. morphml/visualization/population.py +309 -0
  153. morphml/visualization/progress.py +260 -0
  154. morphml-1.0.0.dist-info/METADATA +434 -0
  155. morphml-1.0.0.dist-info/RECORD +158 -0
  156. morphml-1.0.0.dist-info/WHEEL +4 -0
  157. morphml-1.0.0.dist-info/entry_points.txt +3 -0
  158. morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,76 @@
1
+ """Distributed execution module for MorphML.
2
+
3
+ This module provides distributed architecture search capabilities with:
4
+ - Master-worker coordination
5
+ - Task scheduling and distribution
6
+ - Fault tolerance and recovery
7
+ - Distributed storage and caching
8
+
9
+ Author: Eshan Roy <eshanized@proton.me>
10
+ Organization: TONMOY INFRASTRUCTURE & VISION
11
+ """
12
+
13
+ from morphml.distributed.fault_tolerance import (
14
+ CircuitBreaker,
15
+ FailureEvent,
16
+ FailureType,
17
+ FaultToleranceManager,
18
+ )
19
+ from morphml.distributed.health_monitor import (
20
+ HealthMetrics,
21
+ HealthMonitor,
22
+ get_system_health,
23
+ is_system_healthy,
24
+ )
25
+ from morphml.distributed.master import MasterNode, Task, WorkerInfo
26
+ from morphml.distributed.resource_manager import (
27
+ GPUAffinityManager,
28
+ ResourceManager,
29
+ TaskRequirements,
30
+ WorkerResources,
31
+ )
32
+ from morphml.distributed.scheduler import (
33
+ AdaptiveScheduler,
34
+ FIFOScheduler,
35
+ LoadBalancingScheduler,
36
+ PerformanceStats,
37
+ PriorityScheduler,
38
+ RoundRobinScheduler,
39
+ TaskScheduler,
40
+ WorkStealingScheduler,
41
+ create_scheduler,
42
+ )
43
+ from morphml.distributed.worker import WorkerNode
44
+
45
+ __all__ = [
46
+ # Core components
47
+ "MasterNode",
48
+ "WorkerNode",
49
+ "WorkerInfo",
50
+ "Task",
51
+ # Schedulers
52
+ "TaskScheduler",
53
+ "FIFOScheduler",
54
+ "PriorityScheduler",
55
+ "LoadBalancingScheduler",
56
+ "WorkStealingScheduler",
57
+ "AdaptiveScheduler",
58
+ "RoundRobinScheduler",
59
+ "PerformanceStats",
60
+ "create_scheduler",
61
+ # Resource management
62
+ "ResourceManager",
63
+ "WorkerResources",
64
+ "TaskRequirements",
65
+ "GPUAffinityManager",
66
+ # Fault tolerance
67
+ "FaultToleranceManager",
68
+ "CircuitBreaker",
69
+ "FailureType",
70
+ "FailureEvent",
71
+ # Health monitoring
72
+ "HealthMonitor",
73
+ "HealthMetrics",
74
+ "get_system_health",
75
+ "is_system_healthy",
76
+ ]
@@ -0,0 +1,497 @@
1
+ """Fault tolerance and recovery mechanisms.
2
+
3
+ Handles worker failures, task retries, and checkpoint-based recovery.
4
+
5
+ Author: Eshan Roy <eshanized@proton.me>
6
+ Organization: TONMOY INFRASTRUCTURE & VISION
7
+ """
8
+
9
+ import time
10
+ from dataclasses import dataclass
11
+ from enum import Enum
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from morphml.core.search import Individual
15
+ from morphml.distributed.master import Task, WorkerInfo
16
+ from morphml.logging_config import get_logger
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
class FailureType(Enum):
    """Closed set of failure categories recorded by the fault-tolerance layer.

    Each member's value is the lowercase string form of its name; that string
    is what appears in log messages built from ``failure_type.value``.
    """

    # Failures attributed to a worker as a whole.
    WORKER_CRASH = "worker_crash"
    # Failures attributed to a task, its transport or its resources.
    TASK_TIMEOUT = "task_timeout"
    NETWORK_ERROR = "network_error"
    OUT_OF_MEMORY = "out_of_memory"
    GPU_ERROR = "gpu_error"
    EVALUATION_ERROR = "evaluation_error"
    # Fallback when the cause could not be classified.
    UNKNOWN = "unknown"
31
+
32
+
33
@dataclass
class FailureEvent:
    """One recorded failure, tied to a worker, a task, or both.

    Only ``timestamp`` and ``failure_type`` are required; the remaining
    fields default to "not set".
    """

    # When the failure was recorded (this module fills it from time.time()).
    timestamp: float
    # Category of the failure.
    failure_type: FailureType
    # Worker involved in the failure, if any.
    worker_id: Optional[str] = None
    # Task involved in the failure, if any.
    task_id: Optional[str] = None
    # Free-form description, e.g. the error message.
    details: Optional[str] = None
    # Recovery flag; defaults to False.
    recovered: bool = False
43
+
44
+
45
class CircuitBreaker:
    """
    Three-state circuit breaker used to sideline a failing worker.

    The breaker keeps its state as a plain string in ``self.state``:

    - ``"CLOSED"``: normal operation; failures are counted.
    - ``"OPEN"``: the worker is disabled; ``opened_at`` holds the time the
      breaker tripped.
    - ``"HALF_OPEN"``: the cool-down has elapsed and the worker is being
      tested; enough successes close the breaker, one failure re-opens it.

    The OPEN -> HALF_OPEN transition is lazy: it only happens when
    ``is_open()`` is called after ``timeout`` seconds have passed.

    Args:
        failure_threshold: Failure count at which a CLOSED breaker opens.
        timeout: Seconds an OPEN breaker waits before moving to HALF_OPEN.
        success_threshold: Successes required in HALF_OPEN to close again.

    Example:
        >>> breaker = CircuitBreaker(failure_threshold=3, timeout=300)
        >>> breaker.record_failure()
        >>> if breaker.is_open():
        ...     print("Circuit open, worker disabled")
    """

    def __init__(
        self,
        failure_threshold: int = 3,
        timeout: float = 300.0,
        success_threshold: int = 2,
    ):
        """Store the thresholds and start in the CLOSED state."""
        # Configuration
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.success_threshold = success_threshold

        # Mutable state
        self.state = "CLOSED"
        self.failure_count = 0
        self.success_count = 0
        self.opened_at: Optional[float] = None

    def record_failure(self) -> None:
        """Register a failure; has no effect while the breaker is OPEN."""
        if self.state == "HALF_OPEN":
            # The probe failed: trip again and discard probe successes.
            self._open()
            self.success_count = 0
            return

        if self.state != "CLOSED":
            return

        self.failure_count += 1
        if self.failure_count >= self.failure_threshold:
            self._open()

    def record_success(self) -> None:
        """Register a success; has no effect while the breaker is OPEN."""
        if self.state == "CLOSED":
            # A success pays off one earlier failure, never going below zero.
            self.failure_count = max(0, self.failure_count - 1)
        elif self.state == "HALF_OPEN":
            self.success_count += 1
            if self.success_count >= self.success_threshold:
                self._close()

    def is_open(self) -> bool:
        """
        Report whether the worker is currently disabled.

        Calling this on an OPEN breaker whose cool-down has elapsed moves it
        to HALF_OPEN as a side effect and returns False.

        Returns:
            True only while the breaker is in the OPEN state.
        """
        if self.state == "CLOSED":
            return False

        # The wall clock is consulted only for an OPEN breaker with a
        # recorded (truthy) opening time.
        has_open_timestamp = self.state == "OPEN" and self.opened_at
        if has_open_timestamp and (time.time() - self.opened_at) > self.timeout:
            self._half_open()
            return False

        return self.state == "OPEN"

    def _open(self) -> None:
        """Switch to OPEN and remember when that happened."""
        self.state = "OPEN"
        self.opened_at = time.time()
        logger.warning("Circuit breaker opened")

    def _half_open(self) -> None:
        """Switch to HALF_OPEN and restart the success counter."""
        self.state = "HALF_OPEN"
        self.success_count = 0
        logger.info("Circuit breaker half-open (testing recovery)")

    def _close(self) -> None:
        """Switch to CLOSED and clear all counters and the opening time."""
        self.state = "CLOSED"
        self.failure_count = 0
        self.success_count = 0
        self.opened_at = None
        logger.info("Circuit breaker closed (worker recovered)")

    def get_state(self) -> str:
        """Return the raw state string without triggering any transition."""
        return self.state
150
+
151
+
152
class FaultToleranceManager:
    """
    Bookkeeping and decision logic for failures in a distributed run.

    Responsibilities implemented by this class:
    - Counting task failures and deciding whether a task may be retried
      (the exponential backoff delay is computed and logged only; waiting
      is left to the caller)
    - Recording worker failures and tracking which workers are failed
    - Proposing a round-robin reassignment of tasks to healthy workers
    - One ``CircuitBreaker`` per worker
    - Restoring optimizer state and population from a checkpoint dict

    Args:
        config: Optional configuration dictionary. Recognised keys:
            - max_retries: Retry budget per task (default: 3)
            - retry_delay: Base backoff delay in seconds (default: 5.0)
            - health_check_interval: Stored as-is (default: 30.0)
            - checkpoint_interval: Stored as-is (default: 10)
            - circuit_breaker_threshold: Failures to open a breaker
              (default: 3)
            - circuit_breaker_timeout: Breaker cool-down in seconds
              (default: 300.0)

    Example:
        >>> manager = FaultToleranceManager({'max_retries': 3, 'retry_delay': 5})
        >>>
        >>> # Handle task failure
        >>> should_retry = manager.handle_task_failure(
        ...     task, FailureType.NETWORK_ERROR, "Connection timeout"
        ... )
        >>>
        >>> # Handle worker failure
        >>> manager.handle_worker_failure('worker-1')
        >>>
        >>> # Reassign tasks
        >>> reassignment = manager.reassign_tasks('worker-1', task_ids, workers)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read settings from ``config`` and create empty tracking state."""
        settings = config or {}

        # Tunables (see the class docstring for their meaning).
        self.max_retries = settings.get("max_retries", 3)
        self.retry_delay = settings.get("retry_delay", 5.0)
        self.health_check_interval = settings.get("health_check_interval", 30.0)
        self.checkpoint_interval = settings.get("checkpoint_interval", 10)
        self.circuit_breaker_threshold = settings.get("circuit_breaker_threshold", 3)
        self.circuit_breaker_timeout = settings.get("circuit_breaker_timeout", 300.0)

        # Per-task state: failure counter, failure history, exhausted tasks.
        self.task_retries: Dict[str, int] = {}
        self.task_failures: Dict[str, List[FailureEvent]] = {}
        self.failed_tasks: List[Task] = []

        # Per-worker state: failure history and the currently failed set.
        self.worker_failures: Dict[str, List[FailureEvent]] = {}
        self.failed_workers: set = set()

        # Breakers are created lazily, one per worker id.
        self.circuit_breakers: Dict[str, CircuitBreaker] = {}

        # Number of checkpoint recoveries per experiment id.
        self.recovery_attempts: Dict[str, int] = {}

        logger.info(
            f"Initialized FaultToleranceManager "
            f"(max_retries={self.max_retries}, retry_delay={self.retry_delay}s)"
        )

    def handle_task_failure(self, task: Task, failure_type: FailureType, error: str) -> bool:
        """
        Record a task failure and decide whether the task may run again.

        Args:
            task: The task that failed; ``task_id`` and ``worker_id`` are read.
            failure_type: Category of the failure.
            error: Error message stored with the failure event.

        Returns:
            True if the caller should retry the task, False if the task has
            been added to ``failed_tasks``.
        """
        task_id = task.task_id

        # Append to this task's failure history, creating it on first use.
        history = self.task_failures.setdefault(task_id, [])
        history.append(
            FailureEvent(
                timestamp=time.time(),
                failure_type=failure_type,
                task_id=task_id,
                worker_id=task.worker_id,
                details=error,
            )
        )

        # NOTE(review): the counter is bumped before it is compared, so a
        # task gets at most ``max_retries - 1`` retries (none at all when
        # max_retries is 1) - confirm this is the intended meaning.
        attempt = self.task_retries.get(task_id, 0) + 1
        self.task_retries[task_id] = attempt

        logger.warning(
            f"Task {task_id} failed: {failure_type.value} - {error} "
            f"(retry {attempt}/{self.max_retries})"
        )

        if attempt < self.max_retries:
            # Exponential backoff: base delay doubled for every prior failure.
            # The value is only logged; the caller performs the actual wait.
            backoff = self.retry_delay * (2 ** (attempt - 1))
            logger.info(f"Will retry task {task_id} after {backoff:.1f}s")
            return True

        logger.error(f"Task {task_id} exceeded max retries, marking as failed")
        self.failed_tasks.append(task)
        return False

    def handle_worker_failure(
        self, worker_id: str, failure_type: FailureType = FailureType.WORKER_CRASH
    ) -> None:
        """
        Record a worker failure and update its circuit breaker.

        Steps, in order: log the failure, append a ``FailureEvent``, add the
        worker to ``failed_workers``, notify the worker's breaker, and warn
        if the failures of the last hour exceed the breaker threshold.

        Args:
            worker_id: ID of the worker that failed.
            failure_type: Category of the failure.
        """
        logger.error(f"Worker {worker_id} failed: {failure_type.value}")

        history = self.worker_failures.setdefault(worker_id, [])
        history.append(
            FailureEvent(
                timestamp=time.time(),
                failure_type=failure_type,
                worker_id=worker_id,
            )
        )

        self.failed_workers.add(worker_id)

        breaker = self._get_circuit_breaker(worker_id)
        breaker.record_failure()

        failures_last_hour = self._count_recent_failures(worker_id, window=3600)
        if failures_last_hour > self.circuit_breaker_threshold:
            logger.warning(
                f"Worker {worker_id} has {failures_last_hour} failures in last hour, "
                f"circuit breaker: {breaker.get_state()}"
            )

    def handle_worker_recovery(self, worker_id: str) -> None:
        """
        Mark a worker as recovered.

        Removes the worker from ``failed_workers`` and, if a breaker already
        exists for it, reports a success to that breaker.

        Args:
            worker_id: ID of the recovered worker.
        """
        logger.info(f"Worker {worker_id} recovered")

        self.failed_workers.discard(worker_id)

        breaker = self.circuit_breakers.get(worker_id)
        if breaker is not None:
            breaker.record_success()

    def reassign_tasks(
        self,
        failed_worker_id: str,
        task_ids: List[str],
        available_workers: List[WorkerInfo],
    ) -> Dict[str, str]:
        """
        Propose new workers for the tasks of a failed worker.

        Tasks are spread round-robin over the workers that are not the
        failed one, whose ``status`` is not ``"dead"`` and that
        ``is_worker_unhealthy`` does not reject.

        Args:
            failed_worker_id: Worker the tasks are taken from.
            task_ids: IDs of the tasks to reassign.
            available_workers: Candidate workers.

        Returns:
            Mapping of task_id -> new worker_id; empty when no candidate
            worker is healthy.
        """
        reassignment: Dict[str, str] = {}

        # Checks run in this order so the health check (which may move a
        # breaker to half-open) is only reached for live, other workers.
        candidates = []
        for info in available_workers:
            if info.worker_id == failed_worker_id:
                continue
            if info.status == "dead":
                continue
            if self.is_worker_unhealthy(info.worker_id):
                continue
            candidates.append(info)

        if not candidates:
            logger.error("No healthy workers available for task reassignment")
            return reassignment

        pool_size = len(candidates)
        for position, task_id in enumerate(task_ids):
            target = candidates[position % pool_size]
            reassignment[task_id] = target.worker_id

            logger.info(
                f"Reassigned task {task_id} from {failed_worker_id} to {target.worker_id}"
            )

        return reassignment

    def is_worker_unhealthy(self, worker_id: str) -> bool:
        """
        Tell whether a worker should be excluded from task assignment.

        Args:
            worker_id: Worker ID to check.

        Returns:
            True if the worker is in ``failed_workers`` or its circuit
            breaker reports open; False otherwise, including for workers
            that have no breaker yet.
        """
        if worker_id in self.failed_workers:
            return True

        breaker = self.circuit_breakers.get(worker_id)
        if breaker is None:
            return False

        return breaker.is_open()

    def recover_from_checkpoint(self, checkpoint: Dict[str, Any], optimizer: Any) -> int:
        """
        Restore an optimizer from a checkpoint dictionary.

        Reads the ``generation``, ``optimizer_state``, ``population`` and
        ``experiment_id`` keys, each with a fallback when missing. A failure
        while rebuilding the population is logged and otherwise ignored.

        Args:
            checkpoint: Checkpoint dictionary.
            optimizer: Object to restore; ``load_state`` and ``population``
                are used only if the object has them.

        Returns:
            Generation number stored in the checkpoint (0 if absent).
        """
        generation = checkpoint.get("generation", 0)

        logger.info(f"Recovering from checkpoint at generation {generation}")

        # Optimizer internals, only when the optimizer supports loading.
        saved_state = checkpoint.get("optimizer_state", {})
        if hasattr(optimizer, "load_state"):
            optimizer.load_state(saved_state)

        # Population: best effort, the optimizer keeps its own on failure.
        serialized_population = checkpoint.get("population", [])
        if serialized_population and hasattr(optimizer, "population"):
            try:
                restored = [Individual.from_dict(entry) for entry in serialized_population]
                optimizer.population = restored
                logger.info(f"Restored population of {len(restored)} individuals")
            except Exception as e:
                logger.warning(f"Failed to restore population: {e}")

        # Count this recovery against the experiment it belongs to.
        exp_id = checkpoint.get("experiment_id", "unknown")
        self.recovery_attempts[exp_id] = self.recovery_attempts.get(exp_id, 0) + 1

        logger.info(f"Recovery complete, resuming from generation {generation}")

        return generation

    def _get_circuit_breaker(self, worker_id: str) -> CircuitBreaker:
        """Return the worker's breaker, creating it with the configured limits."""
        breaker = self.circuit_breakers.get(worker_id)
        if breaker is None:
            breaker = CircuitBreaker(
                failure_threshold=self.circuit_breaker_threshold,
                timeout=self.circuit_breaker_timeout,
            )
            self.circuit_breakers[worker_id] = breaker

        return breaker

    def _count_recent_failures(self, worker_id: str, window: float = 3600.0) -> int:
        """
        Count a worker's failures that are younger than ``window`` seconds.

        Args:
            worker_id: Worker ID.
            window: Length of the look-back window in seconds.

        Returns:
            Number of recorded failures inside the window; 0 for a worker
            with no failure history.
        """
        events = self.worker_failures.get(worker_id)
        if events is None:
            return 0

        now = time.time()
        return sum(1 for event in events if (now - event.timestamp) < window)

    def get_statistics(self) -> Dict[str, Any]:
        """
        Summarise the current fault-tolerance state.

        Returns:
            Dictionary with failure totals, counts of failed tasks and
            workers, breaker state per worker, the number of tasks that have
            a retry counter, and a copy of the recovery counters.
        """
        task_failure_total = sum(len(events) for events in self.task_failures.values())
        worker_failure_total = sum(len(events) for events in self.worker_failures.values())

        breaker_states = {}
        for wid, breaker in self.circuit_breakers.items():
            breaker_states[wid] = breaker.get_state()

        return {
            "total_task_failures": task_failure_total,
            "total_worker_failures": worker_failure_total,
            "failed_tasks": len(self.failed_tasks),
            "failed_workers": len(self.failed_workers),
            "circuit_breakers": breaker_states,
            "tasks_with_retries": len(self.task_retries),
            "recovery_attempts": dict(self.recovery_attempts),
        }

    def reset(self) -> None:
        """Empty every tracking container; configuration values are kept."""
        containers = (
            self.task_retries,
            self.task_failures,
            self.failed_tasks,
            self.worker_failures,
            self.failed_workers,
            self.circuit_breakers,
            self.recovery_attempts,
        )
        for container in containers:
            container.clear()

        logger.info("Reset fault tolerance state")