morphml 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of morphml might be problematic. Click here for more details.
- morphml/__init__.py +14 -0
- morphml/api/__init__.py +26 -0
- morphml/api/app.py +326 -0
- morphml/api/auth.py +193 -0
- morphml/api/client.py +338 -0
- morphml/api/models.py +132 -0
- morphml/api/rate_limit.py +192 -0
- morphml/benchmarking/__init__.py +36 -0
- morphml/benchmarking/comparison.py +430 -0
- morphml/benchmarks/__init__.py +56 -0
- morphml/benchmarks/comparator.py +409 -0
- morphml/benchmarks/datasets.py +280 -0
- morphml/benchmarks/metrics.py +199 -0
- morphml/benchmarks/openml_suite.py +201 -0
- morphml/benchmarks/problems.py +289 -0
- morphml/benchmarks/suite.py +318 -0
- morphml/cli/__init__.py +5 -0
- morphml/cli/commands/experiment.py +329 -0
- morphml/cli/main.py +457 -0
- morphml/cli/quickstart.py +312 -0
- morphml/config.py +278 -0
- morphml/constraints/__init__.py +19 -0
- morphml/constraints/handler.py +205 -0
- morphml/constraints/predicates.py +285 -0
- morphml/core/__init__.py +3 -0
- morphml/core/crossover.py +449 -0
- morphml/core/dsl/README.md +359 -0
- morphml/core/dsl/__init__.py +72 -0
- morphml/core/dsl/ast_nodes.py +364 -0
- morphml/core/dsl/compiler.py +318 -0
- morphml/core/dsl/layers.py +368 -0
- morphml/core/dsl/lexer.py +336 -0
- morphml/core/dsl/parser.py +455 -0
- morphml/core/dsl/search_space.py +386 -0
- morphml/core/dsl/syntax.py +199 -0
- morphml/core/dsl/type_system.py +361 -0
- morphml/core/dsl/validator.py +386 -0
- morphml/core/graph/__init__.py +40 -0
- morphml/core/graph/edge.py +124 -0
- morphml/core/graph/graph.py +507 -0
- morphml/core/graph/mutations.py +409 -0
- morphml/core/graph/node.py +196 -0
- morphml/core/graph/serialization.py +361 -0
- morphml/core/graph/visualization.py +431 -0
- morphml/core/objectives/__init__.py +20 -0
- morphml/core/search/__init__.py +33 -0
- morphml/core/search/individual.py +252 -0
- morphml/core/search/parameters.py +453 -0
- morphml/core/search/population.py +375 -0
- morphml/core/search/search_engine.py +340 -0
- morphml/distributed/__init__.py +76 -0
- morphml/distributed/fault_tolerance.py +497 -0
- morphml/distributed/health_monitor.py +348 -0
- morphml/distributed/master.py +709 -0
- morphml/distributed/proto/README.md +224 -0
- morphml/distributed/proto/__init__.py +74 -0
- morphml/distributed/proto/worker.proto +170 -0
- morphml/distributed/proto/worker_pb2.py +79 -0
- morphml/distributed/proto/worker_pb2_grpc.py +423 -0
- morphml/distributed/resource_manager.py +416 -0
- morphml/distributed/scheduler.py +567 -0
- morphml/distributed/storage/__init__.py +33 -0
- morphml/distributed/storage/artifacts.py +381 -0
- morphml/distributed/storage/cache.py +366 -0
- morphml/distributed/storage/checkpointing.py +329 -0
- morphml/distributed/storage/database.py +459 -0
- morphml/distributed/worker.py +549 -0
- morphml/evaluation/__init__.py +5 -0
- morphml/evaluation/heuristic.py +237 -0
- morphml/exceptions.py +55 -0
- morphml/execution/__init__.py +5 -0
- morphml/execution/local_executor.py +350 -0
- morphml/integrations/__init__.py +28 -0
- morphml/integrations/jax_adapter.py +206 -0
- morphml/integrations/pytorch_adapter.py +530 -0
- morphml/integrations/sklearn_adapter.py +206 -0
- morphml/integrations/tensorflow_adapter.py +230 -0
- morphml/logging_config.py +93 -0
- morphml/meta_learning/__init__.py +66 -0
- morphml/meta_learning/architecture_similarity.py +277 -0
- morphml/meta_learning/experiment_database.py +240 -0
- morphml/meta_learning/knowledge_base/__init__.py +19 -0
- morphml/meta_learning/knowledge_base/embedder.py +179 -0
- morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
- morphml/meta_learning/knowledge_base/meta_features.py +265 -0
- morphml/meta_learning/knowledge_base/vector_store.py +271 -0
- morphml/meta_learning/predictors/__init__.py +27 -0
- morphml/meta_learning/predictors/ensemble.py +221 -0
- morphml/meta_learning/predictors/gnn_predictor.py +552 -0
- morphml/meta_learning/predictors/learning_curve.py +231 -0
- morphml/meta_learning/predictors/proxy_metrics.py +261 -0
- morphml/meta_learning/strategy_evolution/__init__.py +27 -0
- morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
- morphml/meta_learning/strategy_evolution/bandit.py +276 -0
- morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
- morphml/meta_learning/transfer.py +581 -0
- morphml/meta_learning/warm_start.py +286 -0
- morphml/optimizers/__init__.py +74 -0
- morphml/optimizers/adaptive_operators.py +399 -0
- morphml/optimizers/bayesian/__init__.py +52 -0
- morphml/optimizers/bayesian/acquisition.py +387 -0
- morphml/optimizers/bayesian/base.py +319 -0
- morphml/optimizers/bayesian/gaussian_process.py +635 -0
- morphml/optimizers/bayesian/smac.py +534 -0
- morphml/optimizers/bayesian/tpe.py +411 -0
- morphml/optimizers/differential_evolution.py +220 -0
- morphml/optimizers/evolutionary/__init__.py +61 -0
- morphml/optimizers/evolutionary/cma_es.py +416 -0
- morphml/optimizers/evolutionary/differential_evolution.py +556 -0
- morphml/optimizers/evolutionary/encoding.py +426 -0
- morphml/optimizers/evolutionary/particle_swarm.py +449 -0
- morphml/optimizers/genetic_algorithm.py +486 -0
- morphml/optimizers/gradient_based/__init__.py +22 -0
- morphml/optimizers/gradient_based/darts.py +550 -0
- morphml/optimizers/gradient_based/enas.py +585 -0
- morphml/optimizers/gradient_based/operations.py +474 -0
- morphml/optimizers/gradient_based/utils.py +601 -0
- morphml/optimizers/hill_climbing.py +169 -0
- morphml/optimizers/multi_objective/__init__.py +56 -0
- morphml/optimizers/multi_objective/indicators.py +504 -0
- morphml/optimizers/multi_objective/nsga2.py +647 -0
- morphml/optimizers/multi_objective/visualization.py +427 -0
- morphml/optimizers/nsga2.py +308 -0
- morphml/optimizers/random_search.py +172 -0
- morphml/optimizers/simulated_annealing.py +181 -0
- morphml/plugins/__init__.py +35 -0
- morphml/plugins/custom_evaluator_example.py +81 -0
- morphml/plugins/custom_optimizer_example.py +63 -0
- morphml/plugins/plugin_system.py +454 -0
- morphml/reports/__init__.py +30 -0
- morphml/reports/generator.py +362 -0
- morphml/tracking/__init__.py +7 -0
- morphml/tracking/experiment.py +309 -0
- morphml/tracking/logger.py +301 -0
- morphml/tracking/reporter.py +357 -0
- morphml/utils/__init__.py +6 -0
- morphml/utils/checkpoint.py +189 -0
- morphml/utils/comparison.py +390 -0
- morphml/utils/export.py +407 -0
- morphml/utils/progress.py +392 -0
- morphml/utils/validation.py +392 -0
- morphml/version.py +7 -0
- morphml/visualization/__init__.py +50 -0
- morphml/visualization/analytics.py +423 -0
- morphml/visualization/architecture_diagrams.py +353 -0
- morphml/visualization/architecture_plot.py +223 -0
- morphml/visualization/convergence_plot.py +174 -0
- morphml/visualization/crossover_viz.py +386 -0
- morphml/visualization/graph_viz.py +338 -0
- morphml/visualization/pareto_plot.py +149 -0
- morphml/visualization/plotly_dashboards.py +422 -0
- morphml/visualization/population.py +309 -0
- morphml/visualization/progress.py +260 -0
- morphml-1.0.0.dist-info/METADATA +434 -0
- morphml-1.0.0.dist-info/RECORD +158 -0
- morphml-1.0.0.dist-info/WHEEL +4 -0
- morphml-1.0.0.dist-info/entry_points.txt +3 -0
- morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Distributed execution module for MorphML.
|
|
2
|
+
|
|
3
|
+
This module provides distributed architecture search capabilities with:
|
|
4
|
+
- Master-worker coordination
|
|
5
|
+
- Task scheduling and distribution
|
|
6
|
+
- Fault tolerance and recovery
|
|
7
|
+
- Distributed storage and caching
|
|
8
|
+
|
|
9
|
+
Author: Eshan Roy <eshanized@proton.me>
|
|
10
|
+
Organization: TONMOY INFRASTRUCTURE & VISION
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from morphml.distributed.fault_tolerance import (
|
|
14
|
+
CircuitBreaker,
|
|
15
|
+
FailureEvent,
|
|
16
|
+
FailureType,
|
|
17
|
+
FaultToleranceManager,
|
|
18
|
+
)
|
|
19
|
+
from morphml.distributed.health_monitor import (
|
|
20
|
+
HealthMetrics,
|
|
21
|
+
HealthMonitor,
|
|
22
|
+
get_system_health,
|
|
23
|
+
is_system_healthy,
|
|
24
|
+
)
|
|
25
|
+
from morphml.distributed.master import MasterNode, Task, WorkerInfo
|
|
26
|
+
from morphml.distributed.resource_manager import (
|
|
27
|
+
GPUAffinityManager,
|
|
28
|
+
ResourceManager,
|
|
29
|
+
TaskRequirements,
|
|
30
|
+
WorkerResources,
|
|
31
|
+
)
|
|
32
|
+
from morphml.distributed.scheduler import (
|
|
33
|
+
AdaptiveScheduler,
|
|
34
|
+
FIFOScheduler,
|
|
35
|
+
LoadBalancingScheduler,
|
|
36
|
+
PerformanceStats,
|
|
37
|
+
PriorityScheduler,
|
|
38
|
+
RoundRobinScheduler,
|
|
39
|
+
TaskScheduler,
|
|
40
|
+
WorkStealingScheduler,
|
|
41
|
+
create_scheduler,
|
|
42
|
+
)
|
|
43
|
+
from morphml.distributed.worker import WorkerNode
|
|
44
|
+
|
|
45
|
+
# Public API of the distributed package. Every name listed here is imported
# at the top of this module; the grouping comments mirror the submodule each
# name comes from.
__all__ = [
    # Core components
    "MasterNode",
    "WorkerNode",
    "WorkerInfo",
    "Task",
    # Schedulers
    "TaskScheduler",
    "FIFOScheduler",
    "PriorityScheduler",
    "LoadBalancingScheduler",
    "WorkStealingScheduler",
    "AdaptiveScheduler",
    "RoundRobinScheduler",
    "PerformanceStats",
    "create_scheduler",
    # Resource management
    "ResourceManager",
    "WorkerResources",
    "TaskRequirements",
    "GPUAffinityManager",
    # Fault tolerance
    "FaultToleranceManager",
    "CircuitBreaker",
    "FailureType",
    "FailureEvent",
    # Health monitoring
    "HealthMonitor",
    "HealthMetrics",
    "get_system_health",
    "is_system_healthy",
]
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
"""Fault tolerance and recovery mechanisms.
|
|
2
|
+
|
|
3
|
+
Handles worker failures, task retries, and checkpoint-based recovery.
|
|
4
|
+
|
|
5
|
+
Author: Eshan Roy <eshanized@proton.me>
|
|
6
|
+
Organization: TONMOY INFRASTRUCTURE & VISION
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import time
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from morphml.core.search import Individual
|
|
15
|
+
from morphml.distributed.master import Task, WorkerInfo
|
|
16
|
+
from morphml.logging_config import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FailureType(Enum):
    """Types of failures that can occur.

    Each member's value is a lowercase string label; the fault tolerance
    manager in this module writes ``failure_type.value`` into its log
    messages.
    """

    WORKER_CRASH = "worker_crash"
    TASK_TIMEOUT = "task_timeout"
    NETWORK_ERROR = "network_error"
    OUT_OF_MEMORY = "out_of_memory"
    GPU_ERROR = "gpu_error"
    EVALUATION_ERROR = "evaluation_error"
    UNKNOWN = "unknown"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class FailureEvent:
    """Record of a failure event.

    Instances are created by the fault tolerance manager in this module,
    once per reported task failure and once per reported worker failure.
    """

    # Time the failure was recorded; the manager fills this with time.time().
    timestamp: float
    # Category of the failure.
    failure_type: FailureType
    # Worker involved, when known.
    worker_id: Optional[str] = None
    # Task involved, when known (left unset for worker-level failures).
    task_id: Optional[str] = None
    # Free-form error text; the manager stores the reported error message here.
    details: Optional[str] = None
    # Defaults to False. NOTE(review): no code visible in this module ever
    # sets it to True -- confirm whether another module updates it.
    recovered: bool = False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CircuitBreaker:
    """
    Circuit breaker pattern for worker management.

    Prevents repeated use of failing workers by temporarily disabling them.

    States:
        - CLOSED: Normal operation
        - OPEN: Worker disabled due to failures
        - HALF_OPEN: Testing if worker has recovered

    The OPEN -> HALF_OPEN transition is evaluated lazily: it happens the next
    time ``is_open()``, ``record_success()`` or ``record_failure()`` is called
    after ``timeout`` seconds have elapsed since the circuit opened.
    ``get_state()`` is a pure accessor and never changes the state.

    Args:
        failure_threshold: Number of failures to trigger open state
        timeout: Seconds before transitioning to HALF_OPEN
        success_threshold: Successes needed in HALF_OPEN to close

    Example:
        >>> breaker = CircuitBreaker(failure_threshold=3, timeout=300)
        >>> breaker.record_failure()
        >>> if breaker.is_open():
        ...     print("Circuit open, worker disabled")
    """

    def __init__(
        self,
        failure_threshold: int = 3,
        timeout: float = 300.0,
        success_threshold: int = 2,
    ):
        """Initialize circuit breaker in the CLOSED state with zeroed counters."""
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.success_threshold = success_threshold

        self.state = "CLOSED"
        self.failure_count = 0
        self.success_count = 0
        # time.time() value at which the circuit last opened; None while closed.
        self.opened_at: Optional[float] = None

    def record_failure(self) -> None:
        """Record a failure.

        In CLOSED, increments the failure counter and opens the circuit once
        ``failure_threshold`` is reached. In HALF_OPEN, a single failure
        re-opens the circuit. In OPEN (cool-down still running) it is a no-op.
        """
        # Apply a pending OPEN -> HALF_OPEN transition first so a failure
        # reported after the cool-down restarts it instead of being dropped.
        self._refresh_state()

        if self.state == "CLOSED":
            self.failure_count += 1

            if self.failure_count >= self.failure_threshold:
                self._open()

        elif self.state == "HALF_OPEN":
            # Failed during testing, re-open
            self._open()
            self.success_count = 0

    def record_success(self) -> None:
        """Record a success.

        In HALF_OPEN, counts toward ``success_threshold`` and closes the
        circuit when it is reached. In CLOSED, decays the failure counter by
        one (never below zero). In OPEN (cool-down still running) it is a
        no-op.
        """
        # Apply a pending OPEN -> HALF_OPEN transition first so a success
        # reported after the cool-down counts toward closing the circuit.
        self._refresh_state()

        if self.state == "HALF_OPEN":
            self.success_count += 1

            if self.success_count >= self.success_threshold:
                self._close()

        elif self.state == "CLOSED":
            # Decay (not reset) the failure count: one success cancels one failure
            self.failure_count = max(0, self.failure_count - 1)

    def is_open(self) -> bool:
        """
        Check if circuit is open.

        May move the breaker from OPEN to HALF_OPEN as a side effect when the
        timeout has elapsed.

        Returns:
            True if circuit is open (worker disabled)
        """
        self._refresh_state()
        return self.state == "OPEN"

    def _refresh_state(self) -> None:
        """Move from OPEN to HALF_OPEN once ``timeout`` seconds have elapsed."""
        # Compare against None explicitly: a timestamp of 0.0 is falsy but valid.
        if (
            self.state == "OPEN"
            and self.opened_at is not None
            and (time.time() - self.opened_at) > self.timeout
        ):
            self._half_open()

    def _open(self) -> None:
        """Open the circuit and stamp the opening time."""
        self.state = "OPEN"
        self.opened_at = time.time()
        logger.warning("Circuit breaker opened")

    def _half_open(self) -> None:
        """Transition to half-open state and restart the success count."""
        self.state = "HALF_OPEN"
        self.success_count = 0
        logger.info("Circuit breaker half-open (testing recovery)")

    def _close(self) -> None:
        """Close the circuit and clear all counters."""
        self.state = "CLOSED"
        self.failure_count = 0
        self.success_count = 0
        self.opened_at = None
        logger.info("Circuit breaker closed (worker recovered)")

    def get_state(self) -> str:
        """Get current state ("CLOSED", "OPEN" or "HALF_OPEN") without updating it."""
        return self.state
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class FaultToleranceManager:
    """
    Manage fault tolerance and recovery.

    Provides automatic recovery from failures:
    - Task retry with exponential backoff
    - Worker failure detection and handling
    - Task reassignment from failed workers
    - Circuit breaker pattern for unreliable workers
    - Checkpoint-based experiment recovery

    The manager only does bookkeeping and decision making: it never sleeps,
    dispatches tasks, or contacts workers itself. Callers act on the values
    it returns.

    Args:
        config: Configuration dictionary
            - max_retries: Maximum task retries (default: 3)
            - retry_delay: Base delay between retries in seconds (default: 5)
            - health_check_interval: Health check frequency (default: 30)
            - checkpoint_interval: Checkpoint every N generations (default: 10)
            - circuit_breaker_threshold: Failures to open circuit (default: 3)
            - circuit_breaker_timeout: Circuit breaker timeout (default: 300)

    Example:
        >>> manager = FaultToleranceManager({'max_retries': 3, 'retry_delay': 5})
        >>>
        >>> # Handle task failure
        >>> should_retry = manager.handle_task_failure(
        ...     task, FailureType.NETWORK_ERROR, "Connection timeout"
        ... )
        >>>
        >>> # Handle worker failure
        >>> manager.handle_worker_failure('worker-1')
        >>>
        >>> # Reassign tasks
        >>> reassignment = manager.reassign_tasks('worker-1', task_ids, workers)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize fault tolerance manager from an optional config dict."""
        config = config or {}

        self.max_retries = config.get("max_retries", 3)
        self.retry_delay = config.get("retry_delay", 5.0)
        self.health_check_interval = config.get("health_check_interval", 30.0)
        self.checkpoint_interval = config.get("checkpoint_interval", 10)
        self.circuit_breaker_threshold = config.get("circuit_breaker_threshold", 3)
        self.circuit_breaker_timeout = config.get("circuit_breaker_timeout", 300.0)

        # Task retry tracking (all keyed by task_id)
        self.task_retries: Dict[str, int] = {}
        self.task_failures: Dict[str, List[FailureEvent]] = {}
        self.failed_tasks: List[Task] = []

        # Worker failure tracking (keyed by worker_id)
        self.worker_failures: Dict[str, List[FailureEvent]] = {}
        self.failed_workers: set = set()

        # Circuit breakers per worker, created lazily on first failure
        self.circuit_breakers: Dict[str, CircuitBreaker] = {}

        # Recovery tracking (keyed by experiment id)
        self.recovery_attempts: Dict[str, int] = {}

        logger.info(
            f"Initialized FaultToleranceManager "
            f"(max_retries={self.max_retries}, retry_delay={self.retry_delay}s)"
        )

    def handle_task_failure(self, task: Task, failure_type: FailureType, error: str) -> bool:
        """
        Handle task failure with automatic retry logic.

        Records a FailureEvent for the task, bumps its failure counter and
        decides whether the caller should retry. The backoff delay is only
        logged here; use ``get_retry_delay()`` to obtain it.

        Args:
            task: Failed task (its ``task_id`` and ``worker_id`` are read)
            failure_type: Type of failure
            error: Error message

        Returns:
            True if task should be retried, False otherwise
        """
        task_id = task.task_id

        # Record failure event
        self.task_failures.setdefault(task_id, []).append(
            FailureEvent(
                timestamp=time.time(),
                failure_type=failure_type,
                task_id=task_id,
                worker_id=task.worker_id,
                details=error,
            )
        )

        # Increment retry count
        retry_count = self.task_retries.get(task_id, 0) + 1
        self.task_retries[task_id] = retry_count

        logger.warning(
            f"Task {task_id} failed: {failure_type.value} - {error} "
            f"(retry {retry_count}/{self.max_retries})"
        )

        # NOTE(review): the counter tracks failures, so with max_retries=3 a
        # task is retried twice and given up on at its third failure. Confirm
        # whether max_retries is meant as "total attempts" or "retries".
        if retry_count < self.max_retries:
            delay = self._backoff_delay(retry_count)
            logger.info(f"Will retry task {task_id} after {delay:.1f}s")

            # Note: Actual sleep happens in the caller
            return True

        logger.error(f"Task {task_id} exceeded max retries, marking as failed")
        # Keep one entry per task even if the caller reports the same
        # exhausted task again, so the failed-task statistic stays accurate.
        if all(failed.task_id != task_id for failed in self.failed_tasks):
            self.failed_tasks.append(task)
        return False

    def get_retry_delay(self, task_id: str) -> float:
        """
        Get the exponential-backoff delay for a task's next retry.

        Args:
            task_id: Task ID

        Returns:
            Delay in seconds based on the failures recorded so far
            (retry_delay, 2*retry_delay, 4*retry_delay, ...), or 0.0 when no
            failure has been recorded for the task
        """
        retry_count = self.task_retries.get(task_id, 0)
        if retry_count <= 0:
            return 0.0
        return self._backoff_delay(retry_count)

    def _backoff_delay(self, retry_count: int) -> float:
        """Return the base retry delay doubled for each failure after the first."""
        return self.retry_delay * (2 ** (retry_count - 1))

    def handle_worker_failure(
        self, worker_id: str, failure_type: FailureType = FailureType.WORKER_CRASH
    ) -> None:
        """
        Handle worker failure.

        1. Record failure event
        2. Mark worker as failed
        3. Update circuit breaker
        4. Warn when the worker keeps failing

        Args:
            worker_id: Failed worker ID
            failure_type: Type of failure
        """
        logger.error(f"Worker {worker_id} failed: {failure_type.value}")

        # Record failure event
        self.worker_failures.setdefault(worker_id, []).append(
            FailureEvent(
                timestamp=time.time(),
                failure_type=failure_type,
                worker_id=worker_id,
            )
        )

        # Mark as failed
        self.failed_workers.add(worker_id)

        # Update circuit breaker
        breaker = self._get_circuit_breaker(worker_id)
        breaker.record_failure()

        # Check failure rate. Uses >= so the warning fires at the same count
        # at which the circuit breaker itself opens.
        recent_failures = self._count_recent_failures(worker_id, window=3600)

        if recent_failures >= self.circuit_breaker_threshold:
            logger.warning(
                f"Worker {worker_id} has {recent_failures} failures in last hour, "
                f"circuit breaker: {breaker.get_state()}"
            )

    def handle_worker_recovery(self, worker_id: str) -> None:
        """
        Handle worker recovery.

        Removes the worker from the failed set and reports a success to its
        circuit breaker, if one exists.

        Args:
            worker_id: Recovered worker ID
        """
        logger.info(f"Worker {worker_id} recovered")

        # Remove from failed set
        self.failed_workers.discard(worker_id)

        # Record success in circuit breaker
        breaker = self.circuit_breakers.get(worker_id)
        if breaker is not None:
            # The breaker only leaves OPEN when is_open() is polled; poll it
            # first so a recovery reported after the cool-down is counted
            # instead of being ignored.
            breaker.is_open()
            breaker.record_success()

    def reassign_tasks(
        self,
        failed_worker_id: str,
        task_ids: List[str],
        available_workers: List[WorkerInfo],
    ) -> Dict[str, str]:
        """
        Reassign tasks from failed worker to healthy workers.

        Tasks are spread round-robin over the workers that are not the failed
        worker, do not have status "dead", and are not flagged unhealthy by
        this manager. Only the mapping is computed; dispatching is left to
        the caller.

        Args:
            failed_worker_id: Worker that failed
            task_ids: Tasks to reassign
            available_workers: Available workers

        Returns:
            Mapping of task_id -> new_worker_id (empty when no healthy worker
            is available)
        """
        reassignment: Dict[str, str] = {}

        # Filter healthy workers
        healthy_workers = [
            w
            for w in available_workers
            if w.worker_id != failed_worker_id
            and w.status != "dead"
            and not self.is_worker_unhealthy(w.worker_id)
        ]

        if not healthy_workers:
            logger.error("No healthy workers available for task reassignment")
            return reassignment

        # Round-robin assignment
        for position, task_id in enumerate(task_ids):
            worker = healthy_workers[position % len(healthy_workers)]
            reassignment[task_id] = worker.worker_id

            logger.info(
                f"Reassigned task {task_id} from {failed_worker_id} to {worker.worker_id}"
            )

        return reassignment

    def is_worker_unhealthy(self, worker_id: str) -> bool:
        """
        Check if worker is unhealthy.

        A worker is unhealthy when it is in the failed set or its circuit
        breaker is open. Polling the breaker may move it from OPEN to
        HALF_OPEN once its timeout has elapsed.

        Args:
            worker_id: Worker ID

        Returns:
            True if worker should not receive tasks
        """
        # Check if in failed set
        if worker_id in self.failed_workers:
            return True

        # Check circuit breaker
        breaker = self.circuit_breakers.get(worker_id)
        if breaker is not None:
            return breaker.is_open()

        return False

    def recover_from_checkpoint(self, checkpoint: Dict[str, Any], optimizer: Any) -> int:
        """
        Recover experiment state from checkpoint.

        Reads the "generation", "optimizer_state", "population" and
        "experiment_id" keys of the checkpoint; all of them are optional.

        Args:
            checkpoint: Checkpoint dictionary
            optimizer: Optimizer to restore; ``load_state`` is called and
                ``population`` is assigned only when the optimizer has them

        Returns:
            Generation to resume from (0 when the checkpoint has none)
        """
        generation = checkpoint.get("generation", 0)

        logger.info(f"Recovering from checkpoint at generation {generation}")

        # Restore optimizer state
        optimizer_state = checkpoint.get("optimizer_state", {})
        if hasattr(optimizer, "load_state"):
            optimizer.load_state(optimizer_state)

        # Restore population
        population_data = checkpoint.get("population", [])
        if population_data and hasattr(optimizer, "population"):
            # Deliberately best-effort: a population that cannot be rebuilt
            # is logged and skipped so recovery can still resume.
            try:
                population = [Individual.from_dict(ind_dict) for ind_dict in population_data]
                optimizer.population = population
                logger.info(f"Restored population of {len(population)} individuals")
            except Exception as e:
                logger.warning(f"Failed to restore population: {e}")

        # Record recovery
        exp_id = checkpoint.get("experiment_id", "unknown")
        self.recovery_attempts[exp_id] = self.recovery_attempts.get(exp_id, 0) + 1

        logger.info(f"Recovery complete, resuming from generation {generation}")

        return generation

    def _get_circuit_breaker(self, worker_id: str) -> CircuitBreaker:
        """Get or create circuit breaker for worker."""
        if worker_id not in self.circuit_breakers:
            self.circuit_breakers[worker_id] = CircuitBreaker(
                failure_threshold=self.circuit_breaker_threshold,
                timeout=self.circuit_breaker_timeout,
            )

        return self.circuit_breakers[worker_id]

    def _count_recent_failures(self, worker_id: str, window: float = 3600.0) -> int:
        """
        Count worker failures within time window.

        Args:
            worker_id: Worker ID
            window: Time window in seconds

        Returns:
            Number of failures recorded less than ``window`` seconds ago
        """
        current_time = time.time()
        return sum(
            1
            for event in self.worker_failures.get(worker_id, ())
            if (current_time - event.timestamp) < window
        )

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get fault tolerance statistics.

        Returns:
            Statistics dictionary with failure totals, failed task/worker
            counts, per-worker circuit breaker states, the number of tasks
            that failed at least once, and recovery attempts per experiment
        """
        total_task_failures = sum(len(events) for events in self.task_failures.values())
        total_worker_failures = sum(len(events) for events in self.worker_failures.values())

        circuit_breaker_states = {
            wid: breaker.get_state() for wid, breaker in self.circuit_breakers.items()
        }

        return {
            "total_task_failures": total_task_failures,
            "total_worker_failures": total_worker_failures,
            "failed_tasks": len(self.failed_tasks),
            "failed_workers": len(self.failed_workers),
            "circuit_breakers": circuit_breaker_states,
            "tasks_with_retries": len(self.task_retries),
            "recovery_attempts": dict(self.recovery_attempts),
        }

    def reset(self) -> None:
        """Reset all fault tolerance state (configuration values are kept)."""
        self.task_retries.clear()
        self.task_failures.clear()
        self.failed_tasks.clear()
        self.worker_failures.clear()
        self.failed_workers.clear()
        self.circuit_breakers.clear()
        self.recovery_attempts.clear()

        logger.info("Reset fault tolerance state")
|