claude-mpm 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/cli/commands/memory.py +6 -1
- claude_mpm/core/config.py +160 -0
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +1 -1
- claude_mpm/scripts/socketio_daemon.py +49 -9
- claude_mpm/scripts/socketio_server_manager.py +370 -45
- claude_mpm/services/__init__.py +18 -0
- claude_mpm/services/agent_memory_manager.py +7 -5
- claude_mpm/services/exceptions.py +677 -0
- claude_mpm/services/health_monitor.py +892 -0
- claude_mpm/services/memory_builder.py +4 -2
- claude_mpm/services/memory_optimizer.py +6 -2
- claude_mpm/services/recovery_manager.py +670 -0
- claude_mpm/services/socketio_server.py +188 -11
- claude_mpm/services/standalone_socketio_server.py +703 -34
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/METADATA +1 -1
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/RECORD +21 -18
- /claude_mpm/{web → dashboard}/open_dashboard.py +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/WHEEL +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/top_level.txt +0 -0
claude_mpm/services/recovery_manager.py (new file)
@@ -0,0 +1,670 @@
"""Automatic recovery manager for claude-mpm Socket.IO server.

This module provides comprehensive recovery mechanisms including:
- Circuit breaker pattern to prevent restart loops
- Configurable recovery strategies
- Graceful restart and recovery procedures
- Recovery event logging and notifications
- Integration with health monitoring system

Design Principles:
- Prevention of restart loops through circuit breaker
- Graduated recovery responses based on failure severity
- Comprehensive logging for recovery events
- Configurable recovery policies and thresholds
- Integration with existing service lifecycle
"""

import asyncio
import logging
import time
import signal
import os
import threading
from abc import ABC, abstractmethod
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional, Callable, Union
import json

from .health_monitor import HealthStatus, HealthCheckResult


class RecoveryAction(Enum):
    """Types of recovery actions that can be performed."""
    NONE = "none"
    LOG_WARNING = "log_warning"
    CLEAR_CONNECTIONS = "clear_connections"
    RESTART_SERVICE = "restart_service"
    EMERGENCY_STOP = "emergency_stop"


class CircuitState(Enum):
    """Circuit breaker states."""
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Recovery blocked due to failures
    HALF_OPEN = "half_open"  # Testing if recovery is working


@dataclass
class RecoveryEvent:
    """Recovery event record."""
    timestamp: float
    action: RecoveryAction
    trigger: str
    health_status: HealthStatus
    success: bool
    duration_ms: float
    error_message: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert recovery event to dictionary."""
        return {
            'timestamp': self.timestamp,
            'timestamp_iso': datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat(),
            'action': self.action.value,
            'trigger': self.trigger,
            'health_status': self.health_status.value,
            'success': self.success,
            'duration_ms': self.duration_ms,
            'error_message': self.error_message
        }
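`RecoveryEvent.to_dict()` adds an ISO-8601 timestamp alongside the raw epoch value, so events serialize cleanly to JSON. A minimal sketch, assuming claude-mpm 3.4.2 is installed so the module paths added in this release are importable:

```python
import json
import time

from claude_mpm.services.health_monitor import HealthStatus
from claude_mpm.services.recovery_manager import RecoveryAction, RecoveryEvent

# Build an event by hand and serialize it the same way the manager does for its history.
event = RecoveryEvent(
    timestamp=time.time(),
    action=RecoveryAction.CLEAR_CONNECTIONS,
    trigger="health_check",
    health_status=HealthStatus.WARNING,
    success=True,
    duration_ms=12.5,
)

# to_dict() adds a 'timestamp_iso' field, so the record drops straight into JSON logs.
print(json.dumps(event.to_dict(), indent=2))
```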
class RecoveryStrategy(ABC):
    """Abstract base class for recovery strategies."""

    @abstractmethod
    def should_recover(self, health_result: HealthCheckResult) -> bool:
        """Determine if recovery should be triggered based on health result."""
        pass

    @abstractmethod
    def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
        """Determine the appropriate recovery action."""
        pass

    @abstractmethod
    def get_name(self) -> str:
        """Get the name of this recovery strategy."""
        pass
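Because `RecoveryStrategy` is an ABC with three abstract methods, alternative policies can be plugged in. A hypothetical strategy (not part of the package) that restarts on any CRITICAL result and otherwise stands down:

```python
from claude_mpm.services.health_monitor import HealthStatus, HealthCheckResult
from claude_mpm.services.recovery_manager import RecoveryAction, RecoveryStrategy


class RestartOnCriticalStrategy(RecoveryStrategy):
    """Hypothetical strategy: restart on any CRITICAL result, otherwise do nothing."""

    def get_name(self) -> str:
        return "restart_on_critical"

    def should_recover(self, health_result: HealthCheckResult) -> bool:
        return health_result.overall_status == HealthStatus.CRITICAL

    def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
        if health_result.overall_status == HealthStatus.CRITICAL:
            return RecoveryAction.RESTART_SERVICE
        return RecoveryAction.NONE
```

Note that `RecoveryManager.__init__` below always constructs a `GradedRecoveryStrategy`, so a custom strategy would have to be assigned to `manager.recovery_strategy` after construction.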
class GradedRecoveryStrategy(RecoveryStrategy):
    """Recovery strategy with graduated response based on health status and history.

    Recovery actions are escalated based on:
    - Current health status severity
    - Number of recent failures
    - Time since last recovery attempt
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize graded recovery strategy.

        Args:
            config: Configuration dictionary for recovery thresholds
        """
        self.config = config or {}
        self.logger = logging.getLogger(f"{__name__}.GradedRecoveryStrategy")

        # Configuration with defaults
        self.warning_threshold = self.config.get('warning_threshold', 2)
        self.critical_threshold = self.config.get('critical_threshold', 1)
        self.failure_window_seconds = self.config.get('failure_window_seconds', 300)
        self.min_recovery_interval = self.config.get('min_recovery_interval', 60)

        # Track recent failures
        self.recent_failures: deque = deque(maxlen=10)
        self.last_recovery_time = 0

    def get_name(self) -> str:
        return "graded_recovery"

    def should_recover(self, health_result: HealthCheckResult) -> bool:
        """Determine if recovery should be triggered."""
        current_time = time.time()

        # Don't trigger recovery too frequently
        if current_time - self.last_recovery_time < self.min_recovery_interval:
            self.logger.debug("Recovery suppressed due to min interval")
            return False

        # Check current health status
        if health_result.overall_status in [HealthStatus.CRITICAL]:
            return True

        if health_result.overall_status == HealthStatus.WARNING:
            # Count recent warnings in time window
            cutoff_time = current_time - self.failure_window_seconds
            recent_warnings = [
                event for event in self.recent_failures
                if event >= cutoff_time
            ]

            if len(recent_warnings) >= self.warning_threshold:
                return True

        return False

    def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
        """Determine the appropriate recovery action based on health status."""
        current_time = time.time()

        # Count recent failures
        cutoff_time = current_time - self.failure_window_seconds
        recent_failures = [
            event for event in self.recent_failures
            if event >= cutoff_time
        ]

        failure_count = len(recent_failures)

        # Record this failure
        if health_result.overall_status in [HealthStatus.WARNING, HealthStatus.CRITICAL]:
            self.recent_failures.append(current_time)

        # Determine action based on status and failure history
        if health_result.overall_status == HealthStatus.CRITICAL:
            if failure_count >= 3:
                return RecoveryAction.EMERGENCY_STOP
            elif failure_count >= 2:
                return RecoveryAction.RESTART_SERVICE
            else:
                return RecoveryAction.CLEAR_CONNECTIONS

        elif health_result.overall_status == HealthStatus.WARNING:
            if failure_count >= self.warning_threshold:
                return RecoveryAction.CLEAR_CONNECTIONS
            else:
                return RecoveryAction.LOG_WARNING

        return RecoveryAction.NONE
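The graded strategy escalates as failures accumulate inside `failure_window_seconds`. A sketch of that ladder; only `overall_status` is read by this strategy, so a tiny stand-in object is used in place of a real `HealthCheckResult`:

```python
from claude_mpm.services.health_monitor import HealthStatus
from claude_mpm.services.recovery_manager import GradedRecoveryStrategy


class FakeResult:
    """Illustrative stand-in for HealthCheckResult; only overall_status is read here."""
    def __init__(self, overall_status):
        self.overall_status = overall_status


strategy = GradedRecoveryStrategy()
critical = FakeResult(HealthStatus.CRITICAL)

# Each call records another failure, so repeated CRITICAL results escalate from
# clear_connections through restart_service to emergency_stop within the window.
for _ in range(4):
    if strategy.should_recover(critical):
        print(strategy.get_recovery_action(critical).value)
```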
class CircuitBreaker:
    """Circuit breaker to prevent recovery loops and cascading failures.

    Implements the circuit breaker pattern to:
    - Prevent excessive recovery attempts
    - Allow time for systems to stabilize
    - Gradually re-enable recovery after failures
    """

    def __init__(self, failure_threshold: int = 5, timeout_seconds: int = 300,
                 success_threshold: int = 3):
        """Initialize circuit breaker.

        Args:
            failure_threshold: Number of failures before opening circuit
            timeout_seconds: Time to wait in OPEN state before trying HALF_OPEN
            success_threshold: Number of successes needed in HALF_OPEN to close circuit
        """
        self.failure_threshold = failure_threshold
        self.timeout_seconds = timeout_seconds
        self.success_threshold = success_threshold

        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.success_count = 0
        self.last_failure_time = 0
        self.state_change_time = time.time()

        self.logger = logging.getLogger(f"{__name__}.CircuitBreaker")
        self.logger.info(f"Circuit breaker initialized: failure_threshold={failure_threshold}, "
                         f"timeout={timeout_seconds}s, success_threshold={success_threshold}")

    def can_proceed(self) -> bool:
        """Check if recovery operations can proceed."""
        current_time = time.time()

        if self.state == CircuitState.CLOSED:
            return True

        elif self.state == CircuitState.OPEN:
            # Check if timeout has elapsed
            if current_time - self.last_failure_time >= self.timeout_seconds:
                self._transition_to_half_open()
                return True
            return False

        elif self.state == CircuitState.HALF_OPEN:
            return True

        return False

    def record_success(self) -> None:
        """Record a successful recovery operation."""
        if self.state == CircuitState.CLOSED:
            # Reset failure count on success in normal state
            self.failure_count = 0

        elif self.state == CircuitState.HALF_OPEN:
            self.success_count += 1
            self.logger.debug(f"Circuit breaker success count: {self.success_count}/{self.success_threshold}")

            if self.success_count >= self.success_threshold:
                self._transition_to_closed()

    def record_failure(self) -> None:
        """Record a failed recovery operation."""
        current_time = time.time()
        self.last_failure_time = current_time

        if self.state == CircuitState.CLOSED:
            self.failure_count += 1
            self.logger.warning(f"Circuit breaker failure count: {self.failure_count}/{self.failure_threshold}")

            if self.failure_count >= self.failure_threshold:
                self._transition_to_open()

        elif self.state == CircuitState.HALF_OPEN:
            # Failure in half-open state goes back to open
            self._transition_to_open()

    def _transition_to_open(self) -> None:
        """Transition circuit to OPEN state."""
        self.state = CircuitState.OPEN
        self.state_change_time = time.time()
        self.success_count = 0
        self.logger.warning(f"Circuit breaker OPENED due to {self.failure_count} failures. "
                            f"Recovery blocked for {self.timeout_seconds} seconds.")

    def _transition_to_half_open(self) -> None:
        """Transition circuit to HALF_OPEN state."""
        self.state = CircuitState.HALF_OPEN
        self.state_change_time = time.time()
        self.success_count = 0
        self.logger.info("Circuit breaker transitioned to HALF_OPEN. Testing recovery...")

    def _transition_to_closed(self) -> None:
        """Transition circuit to CLOSED state."""
        self.state = CircuitState.CLOSED
        self.state_change_time = time.time()
        self.failure_count = 0
        self.success_count = 0
        self.logger.info("Circuit breaker CLOSED. Normal recovery operations resumed.")

    def get_status(self) -> Dict[str, Any]:
        """Get current circuit breaker status."""
        current_time = time.time()
        return {
            'state': self.state.value,
            'failure_count': self.failure_count,
            'success_count': self.success_count,
            'last_failure_time': self.last_failure_time,
            'state_change_time': self.state_change_time,
            'time_in_current_state': current_time - self.state_change_time,
            'can_proceed': self.can_proceed(),
            'config': {
                'failure_threshold': self.failure_threshold,
                'timeout_seconds': self.timeout_seconds,
                'success_threshold': self.success_threshold
            }
        }
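A short sketch of the breaker's lifecycle with deliberately small thresholds (assuming claude-mpm 3.4.2 is installed): two failures open the circuit, the timeout moves it to half-open on the next check, and one success closes it again.

```python
import time

from claude_mpm.services.recovery_manager import CircuitBreaker, CircuitState

breaker = CircuitBreaker(failure_threshold=2, timeout_seconds=1, success_threshold=1)

breaker.record_failure()
breaker.record_failure()              # second failure opens the circuit
assert breaker.state == CircuitState.OPEN
assert not breaker.can_proceed()      # recovery is blocked while OPEN

time.sleep(1.1)                       # wait out the OPEN timeout
assert breaker.can_proceed()          # moves to HALF_OPEN and allows a trial recovery
assert breaker.state == CircuitState.HALF_OPEN

breaker.record_success()              # enough successes re-close the circuit
assert breaker.state == CircuitState.CLOSED
```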
class RecoveryManager:
    """Advanced recovery manager with circuit breaker and configurable strategies.

    Provides comprehensive recovery capabilities including:
    - Health-based recovery triggering
    - Circuit breaker protection
    - Multiple recovery strategies
    - Recovery event logging and history
    - Integration with service lifecycle
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None,
                 server_instance=None):
        """Initialize recovery manager.

        Args:
            config: Configuration dictionary for recovery settings
            server_instance: Reference to the Socket.IO server instance
        """
        self.config = config or {}
        self.server_instance = server_instance
        self.logger = logging.getLogger(f"{__name__}.RecoveryManager")

        # Configuration with defaults
        self.enabled = self.config.get('enabled', True)
        self.check_interval = self.config.get('check_interval', 60)
        self.max_recovery_attempts = self.config.get('max_recovery_attempts', 5)
        self.recovery_timeout = self.config.get('recovery_timeout', 30)

        # Initialize circuit breaker
        circuit_config = self.config.get('circuit_breaker', {})
        self.circuit_breaker = CircuitBreaker(
            failure_threshold=circuit_config.get('failure_threshold', 5),
            timeout_seconds=circuit_config.get('timeout_seconds', 300),
            success_threshold=circuit_config.get('success_threshold', 3)
        )

        # Initialize recovery strategy
        strategy_config = self.config.get('strategy', {})
        self.recovery_strategy = GradedRecoveryStrategy(strategy_config)

        # Recovery event history
        self.recovery_history: deque = deque(maxlen=100)

        # Recovery state
        self.recovery_in_progress = False
        self.last_recovery_time = 0
        self.recovery_count = 0

        # Recovery callbacks
        self.recovery_callbacks: List[Callable[[RecoveryEvent], None]] = []

        # Statistics
        self.recovery_stats = {
            'total_recoveries': 0,
            'successful_recoveries': 0,
            'failed_recoveries': 0,
            'actions_performed': {action.value: 0 for action in RecoveryAction},
            'average_recovery_duration_ms': 0
        }

        self.logger.info(f"Recovery manager initialized with strategy: {self.recovery_strategy.get_name()}")
    def add_recovery_callback(self, callback: Callable[[RecoveryEvent], None]) -> None:
        """Add a callback to be notified of recovery events."""
        self.recovery_callbacks.append(callback)
        self.logger.debug(f"Added recovery callback: {callback.__name__}")

    def handle_health_result(self, health_result: HealthCheckResult) -> Optional[RecoveryEvent]:
        """Handle health check result and trigger recovery if needed.

        Args:
            health_result: Health check result to evaluate

        Returns:
            RecoveryEvent if recovery was triggered, None otherwise
        """
        if not self.enabled:
            return None

        if self.recovery_in_progress:
            self.logger.debug("Recovery already in progress, skipping")
            return None

        # Check if recovery should be triggered
        if not self.recovery_strategy.should_recover(health_result):
            return None

        # Check circuit breaker
        if not self.circuit_breaker.can_proceed():
            self.logger.warning("Recovery suppressed by circuit breaker")
            return None

        # Determine recovery action
        action = self.recovery_strategy.get_recovery_action(health_result)

        if action == RecoveryAction.NONE:
            return None

        # Trigger recovery
        return asyncio.create_task(self._perform_recovery(action, health_result, "health_check"))
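Although the annotation says `Optional[RecoveryEvent]`, `handle_health_result()` returns the `asyncio.Task` it schedules (or `None` when nothing is triggered), so it must be called while an event loop is running, and the task can be awaited to obtain the finished event. A sketch, using a stand-in health result since only `overall_status` is consulted on this path:

```python
import asyncio

from claude_mpm.services.health_monitor import HealthStatus
from claude_mpm.services.recovery_manager import RecoveryManager


class FakeResult:
    """Illustrative stand-in for HealthCheckResult; only overall_status is used here."""
    def __init__(self, overall_status):
        self.overall_status = overall_status
        self.metrics = []              # present for completeness; unused on this path


async def main() -> None:
    manager = RecoveryManager()        # no server_instance: clear/restart actions just log and fail
    manager.add_recovery_callback(
        lambda event: print("recovery:", event.action.value, "success:", event.success)
    )

    task = manager.handle_health_result(FakeResult(HealthStatus.CRITICAL))
    if task is not None:               # None means no recovery was triggered
        event = await task             # the Task resolves to the finished RecoveryEvent
        print(event.to_dict())


asyncio.run(main())
```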
    async def _perform_recovery(self, action: RecoveryAction,
                                health_result: HealthCheckResult,
                                trigger: str) -> RecoveryEvent:
        """Perform recovery action and record the event.

        Args:
            action: Recovery action to perform
            health_result: Health result that triggered recovery
            trigger: Description of what triggered the recovery

        Returns:
            RecoveryEvent record of the recovery attempt
        """
        if self.recovery_in_progress:
            raise RuntimeError("Recovery already in progress")

        self.recovery_in_progress = True
        start_time = time.time()
        success = False
        error_message = None

        try:
            self.logger.info(f"Starting recovery action: {action.value} (trigger: {trigger})")

            if action == RecoveryAction.LOG_WARNING:
                success = await self._log_warning(health_result)
            elif action == RecoveryAction.CLEAR_CONNECTIONS:
                success = await self._clear_connections()
            elif action == RecoveryAction.RESTART_SERVICE:
                success = await self._restart_service()
            elif action == RecoveryAction.EMERGENCY_STOP:
                success = await self._emergency_stop()
            else:
                error_message = f"Unknown recovery action: {action}"
                self.logger.error(error_message)

        except Exception as e:
            error_message = f"Recovery action failed: {e}"
            self.logger.error(error_message)
            success = False

        finally:
            self.recovery_in_progress = False
            duration_ms = (time.time() - start_time) * 1000

        # Create recovery event
        event = RecoveryEvent(
            timestamp=start_time,
            action=action,
            trigger=trigger,
            health_status=health_result.overall_status,
            success=success,
            duration_ms=duration_ms,
            error_message=error_message
        )

        # Update statistics
        self._update_recovery_stats(event)

        # Record in circuit breaker
        if success:
            self.circuit_breaker.record_success()
        else:
            self.circuit_breaker.record_failure()

        # Store event
        self.recovery_history.append(event)
        self.last_recovery_time = start_time
        self.recovery_count += 1

        # Notify callbacks
        for callback in self.recovery_callbacks:
            try:
                callback(event)
            except Exception as e:
                self.logger.error(f"Recovery callback {callback.__name__} failed: {e}")

        result_msg = "succeeded" if success else "failed"
        self.logger.info(f"Recovery action {action.value} {result_msg} in {duration_ms:.2f}ms")

        return event

    async def _log_warning(self, health_result: HealthCheckResult) -> bool:
        """Log a warning about health issues."""
        try:
            warning_metrics = [m for m in health_result.metrics if m.status == HealthStatus.WARNING]
            critical_metrics = [m for m in health_result.metrics if m.status == HealthStatus.CRITICAL]

            self.logger.warning(f"Health warning detected: {len(warning_metrics)} warning metrics, "
                                f"{len(critical_metrics)} critical metrics")

            for metric in warning_metrics + critical_metrics:
                self.logger.warning(f" {metric.name}: {metric.value} ({metric.status.value}) - {metric.message}")

            return True
        except Exception as e:
            self.logger.error(f"Failed to log warning: {e}")
            return False

    async def _clear_connections(self) -> bool:
        """Clear all client connections to reset connection state."""
        try:
            if not self.server_instance or not hasattr(self.server_instance, 'sio'):
                self.logger.warning("No server instance available for connection clearing")
                return False

            sio = self.server_instance.sio
            if not sio:
                self.logger.warning("Socket.IO instance not available")
                return False

            # Get current clients
            clients = list(self.server_instance.clients) if hasattr(self.server_instance, 'clients') else []

            self.logger.info(f"Clearing {len(clients)} client connections")

            # Disconnect all clients
            for client_id in clients:
                try:
                    await sio.disconnect(client_id)
                except Exception as e:
                    self.logger.warning(f"Failed to disconnect client {client_id}: {e}")

            # Clear client tracking
            if hasattr(self.server_instance, 'clients'):
                self.server_instance.clients.clear()
            if hasattr(self.server_instance, 'client_versions'):
                self.server_instance.client_versions.clear()

            self.logger.info("Client connections cleared successfully")
            return True

        except Exception as e:
            self.logger.error(f"Failed to clear connections: {e}")
            return False

    async def _restart_service(self) -> bool:
        """Restart the Socket.IO service."""
        try:
            if not self.server_instance:
                self.logger.error("No server instance available for restart")
                return False

            self.logger.info("Attempting graceful service restart")

            # Save current configuration
            host = getattr(self.server_instance, 'host', 'localhost')
            port = getattr(self.server_instance, 'port', 8765)

            # Stop current server
            try:
                await self.server_instance._shutdown_async()
                self.logger.info("Server shutdown completed")
            except Exception as e:
                self.logger.warning(f"Error during shutdown: {e}")

            # Wait a moment for cleanup
            await asyncio.sleep(1)

            # Restart server
            await self.server_instance.start_async()
            self.logger.info("Server restart completed successfully")

            return True

        except Exception as e:
            self.logger.error(f"Failed to restart service: {e}")
            return False

    async def _emergency_stop(self) -> bool:
        """Perform emergency stop of the service."""
        try:
            self.logger.critical("Performing emergency stop due to critical health issues")

            if self.server_instance:
                try:
                    # Force immediate shutdown
                    await self.server_instance._shutdown_async()
                except Exception as e:
                    self.logger.error(f"Error during emergency shutdown: {e}")

            # Send termination signal to process
            try:
                os.kill(os.getpid(), signal.SIGTERM)
            except Exception as e:
                self.logger.error(f"Failed to send termination signal: {e}")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Emergency stop failed: {e}")
            return False
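Every attempt ends with `_perform_recovery` notifying the registered callbacks with the finished `RecoveryEvent`, which makes it easy to keep an audit trail outside the in-memory deque. A minimal sketch of such a callback; the file path is arbitrary:

```python
import json
from pathlib import Path

from claude_mpm.services.recovery_manager import RecoveryEvent

AUDIT_LOG = Path("recovery_events.jsonl")        # hypothetical location


def audit_recovery_event(event: RecoveryEvent) -> None:
    """Append each event as one JSON line; keep errors out of the recovery path."""
    try:
        with AUDIT_LOG.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(event.to_dict()) + "\n")
    except OSError:
        pass                                     # _perform_recovery also guards callback errors


# manager.add_recovery_callback(audit_recovery_event)
```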
    def _update_recovery_stats(self, event: RecoveryEvent) -> None:
        """Update recovery statistics with new event."""
        self.recovery_stats['total_recoveries'] += 1

        if event.success:
            self.recovery_stats['successful_recoveries'] += 1
        else:
            self.recovery_stats['failed_recoveries'] += 1

        self.recovery_stats['actions_performed'][event.action.value] += 1

        # Update average duration
        total_recoveries = self.recovery_stats['total_recoveries']
        current_avg = self.recovery_stats['average_recovery_duration_ms']
        self.recovery_stats['average_recovery_duration_ms'] = (
            (current_avg * (total_recoveries - 1) + event.duration_ms) / total_recoveries
        )

    def get_recovery_status(self) -> Dict[str, Any]:
        """Get comprehensive recovery manager status."""
        return {
            'enabled': self.enabled,
            'recovery_in_progress': self.recovery_in_progress,
            'last_recovery_time': self.last_recovery_time,
            'recovery_count': self.recovery_count,
            'strategy': self.recovery_strategy.get_name(),
            'circuit_breaker': self.circuit_breaker.get_status(),
            'recovery_stats': dict(self.recovery_stats),
            'recent_recoveries': [event.to_dict() for event in list(self.recovery_history)[-10:]],
            'config': {
                'check_interval': self.check_interval,
                'max_recovery_attempts': self.max_recovery_attempts,
                'recovery_timeout': self.recovery_timeout
            }
        }

    def get_recovery_history(self, limit: Optional[int] = None) -> List[RecoveryEvent]:
        """Get recovery event history.

        Args:
            limit: Maximum number of events to return

        Returns:
            List of recovery events, newest first
        """
        history = list(self.recovery_history)
        history.reverse()  # Newest first

        if limit:
            history = history[:limit]

        return history

    def is_enabled(self) -> bool:
        """Check if recovery manager is enabled."""
        return self.enabled

    def enable(self) -> None:
        """Enable recovery manager."""
        self.enabled = True
        self.logger.info("Recovery manager enabled")

    def disable(self) -> None:
        """Disable recovery manager."""
        self.enabled = False
        self.logger.info("Recovery manager disabled")
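The public accessors make the recovery state easy to surface in a dashboard or CLI. A sketch of polling them (assuming claude-mpm 3.4.2 is installed):

```python
from claude_mpm.services.recovery_manager import RecoveryManager

manager = RecoveryManager()

status = manager.get_recovery_status()
print("enabled:", status['enabled'])
print("circuit state:", status['circuit_breaker']['state'])
print("total recoveries:", status['recovery_stats']['total_recoveries'])

# Newest-first history, capped at the last five events.
for event in manager.get_recovery_history(limit=5):
    print(event.to_dict()['timestamp_iso'], event.action.value, event.success)
```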