claude-mpm 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,670 @@
+ """Automatic recovery manager for claude-mpm Socket.IO server.
+
+ This module provides comprehensive recovery mechanisms including:
+ - Circuit breaker pattern to prevent restart loops
+ - Configurable recovery strategies
+ - Graceful restart and recovery procedures
+ - Recovery event logging and notifications
+ - Integration with health monitoring system
+
+ Design Principles:
+ - Prevention of restart loops through circuit breaker
+ - Graduated recovery responses based on failure severity
+ - Comprehensive logging for recovery events
+ - Configurable recovery policies and thresholds
+ - Integration with existing service lifecycle
+ """
+
+ import asyncio
+ import logging
+ import time
+ import signal
+ import os
+ import threading
+ from abc import ABC, abstractmethod
+ from collections import deque
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Callable, Union
+ import json
+
+ from .health_monitor import HealthStatus, HealthCheckResult
+
+
+ class RecoveryAction(Enum):
+     """Types of recovery actions that can be performed."""
+     NONE = "none"
+     LOG_WARNING = "log_warning"
+     CLEAR_CONNECTIONS = "clear_connections"
+     RESTART_SERVICE = "restart_service"
+     EMERGENCY_STOP = "emergency_stop"
+
+
+ class CircuitState(Enum):
+     """Circuit breaker states."""
+     CLOSED = "closed"          # Normal operation
+     OPEN = "open"              # Recovery blocked due to failures
+     HALF_OPEN = "half_open"    # Testing if recovery is working
+
+
+ @dataclass
+ class RecoveryEvent:
+     """Recovery event record."""
+     timestamp: float
+     action: RecoveryAction
+     trigger: str
+     health_status: HealthStatus
+     success: bool
+     duration_ms: float
+     error_message: Optional[str] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert recovery event to dictionary."""
+         return {
+             'timestamp': self.timestamp,
+             'timestamp_iso': datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat(),
+             'action': self.action.value,
+             'trigger': self.trigger,
+             'health_status': self.health_status.value,
+             'success': self.success,
+             'duration_ms': self.duration_ms,
+             'error_message': self.error_message
+         }
+
+
+ class RecoveryStrategy(ABC):
+     """Abstract base class for recovery strategies."""
+
+     @abstractmethod
+     def should_recover(self, health_result: HealthCheckResult) -> bool:
+         """Determine if recovery should be triggered based on health result."""
+         pass
+
+     @abstractmethod
+     def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
+         """Determine the appropriate recovery action."""
+         pass
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Get the name of this recovery strategy."""
+         pass
+
+
+ class GradedRecoveryStrategy(RecoveryStrategy):
+     """Recovery strategy with graduated response based on health status and history.
+
+     Recovery actions are escalated based on:
+     - Current health status severity
+     - Number of recent failures
+     - Time since last recovery attempt
+     """
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """Initialize graded recovery strategy.
+
+         Args:
+             config: Configuration dictionary for recovery thresholds
+         """
+         self.config = config or {}
+         self.logger = logging.getLogger(f"{__name__}.GradedRecoveryStrategy")
+
+         # Configuration with defaults
+         self.warning_threshold = self.config.get('warning_threshold', 2)
+         self.critical_threshold = self.config.get('critical_threshold', 1)
+         self.failure_window_seconds = self.config.get('failure_window_seconds', 300)
+         self.min_recovery_interval = self.config.get('min_recovery_interval', 60)
+
+         # Track recent failures
+         self.recent_failures: deque = deque(maxlen=10)
+         self.last_recovery_time = 0
+
+     def get_name(self) -> str:
+         return "graded_recovery"
+
+     def should_recover(self, health_result: HealthCheckResult) -> bool:
+         """Determine if recovery should be triggered."""
+         current_time = time.time()
+
+         # Don't trigger recovery too frequently
+         if current_time - self.last_recovery_time < self.min_recovery_interval:
+             self.logger.debug("Recovery suppressed due to min interval")
+             return False
+
+         # Check current health status
+         if health_result.overall_status in [HealthStatus.CRITICAL]:
+             return True
+
+         if health_result.overall_status == HealthStatus.WARNING:
+             # Count recent warnings in time window
+             cutoff_time = current_time - self.failure_window_seconds
+             recent_warnings = [
+                 event for event in self.recent_failures
+                 if event >= cutoff_time
+             ]
+
+             if len(recent_warnings) >= self.warning_threshold:
+                 return True
+
+         return False
+
+     def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
+         """Determine the appropriate recovery action based on health status."""
+         current_time = time.time()
+
+         # Count recent failures
+         cutoff_time = current_time - self.failure_window_seconds
+         recent_failures = [
+             event for event in self.recent_failures
+             if event >= cutoff_time
+         ]
+
+         failure_count = len(recent_failures)
+
+         # Record this failure
+         if health_result.overall_status in [HealthStatus.WARNING, HealthStatus.CRITICAL]:
+             self.recent_failures.append(current_time)
+
+         # Determine action based on status and failure history
+         if health_result.overall_status == HealthStatus.CRITICAL:
+             if failure_count >= 3:
+                 return RecoveryAction.EMERGENCY_STOP
+             elif failure_count >= 2:
+                 return RecoveryAction.RESTART_SERVICE
+             else:
+                 return RecoveryAction.CLEAR_CONNECTIONS
+
+         elif health_result.overall_status == HealthStatus.WARNING:
+             if failure_count >= self.warning_threshold:
+                 return RecoveryAction.CLEAR_CONNECTIONS
+             else:
+                 return RecoveryAction.LOG_WARNING
+
+         return RecoveryAction.NONE
+
+
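A minimal sketch of how the escalation above plays out, assuming only the overall_status attribute of HealthCheckResult is consulted (as in the code above); the SimpleNamespace stand-in is hypothetical, not the real health_monitor type:

    from types import SimpleNamespace
    # Assumes GradedRecoveryStrategy, RecoveryAction and HealthStatus are imported from this module.

    strategy = GradedRecoveryStrategy({'failure_window_seconds': 300})
    critical = SimpleNamespace(overall_status=HealthStatus.CRITICAL)  # stand-in health result

    # get_recovery_action() records each failure after counting the prior ones,
    # so repeated criticals inside the window escalate the response step by step.
    strategy.get_recovery_action(critical)  # CLEAR_CONNECTIONS (no prior failures)
    strategy.get_recovery_action(critical)  # CLEAR_CONNECTIONS (1 prior failure)
    strategy.get_recovery_action(critical)  # RESTART_SERVICE   (2 prior failures)
    strategy.get_recovery_action(critical)  # EMERGENCY_STOP    (3 prior failures)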
+ class CircuitBreaker:
+     """Circuit breaker to prevent recovery loops and cascading failures.
+
+     Implements the circuit breaker pattern to:
+     - Prevent excessive recovery attempts
+     - Allow time for systems to stabilize
+     - Gradually re-enable recovery after failures
+     """
+
+     def __init__(self, failure_threshold: int = 5, timeout_seconds: int = 300,
+                  success_threshold: int = 3):
+         """Initialize circuit breaker.
+
+         Args:
+             failure_threshold: Number of failures before opening circuit
+             timeout_seconds: Time to wait in OPEN state before trying HALF_OPEN
+             success_threshold: Number of successes needed in HALF_OPEN to close circuit
+         """
+         self.failure_threshold = failure_threshold
+         self.timeout_seconds = timeout_seconds
+         self.success_threshold = success_threshold
+
+         self.state = CircuitState.CLOSED
+         self.failure_count = 0
+         self.success_count = 0
+         self.last_failure_time = 0
+         self.state_change_time = time.time()
+
+         self.logger = logging.getLogger(f"{__name__}.CircuitBreaker")
+         self.logger.info(f"Circuit breaker initialized: failure_threshold={failure_threshold}, "
+                          f"timeout={timeout_seconds}s, success_threshold={success_threshold}")
+
+     def can_proceed(self) -> bool:
+         """Check if recovery operations can proceed."""
+         current_time = time.time()
+
+         if self.state == CircuitState.CLOSED:
+             return True
+
+         elif self.state == CircuitState.OPEN:
+             # Check if timeout has elapsed
+             if current_time - self.last_failure_time >= self.timeout_seconds:
+                 self._transition_to_half_open()
+                 return True
+             return False
+
+         elif self.state == CircuitState.HALF_OPEN:
+             return True
+
+         return False
+
+     def record_success(self) -> None:
+         """Record a successful recovery operation."""
+         if self.state == CircuitState.CLOSED:
+             # Reset failure count on success in normal state
+             self.failure_count = 0
+
+         elif self.state == CircuitState.HALF_OPEN:
+             self.success_count += 1
+             self.logger.debug(f"Circuit breaker success count: {self.success_count}/{self.success_threshold}")
+
+             if self.success_count >= self.success_threshold:
+                 self._transition_to_closed()
+
+     def record_failure(self) -> None:
+         """Record a failed recovery operation."""
+         current_time = time.time()
+         self.last_failure_time = current_time
+
+         if self.state == CircuitState.CLOSED:
+             self.failure_count += 1
+             self.logger.warning(f"Circuit breaker failure count: {self.failure_count}/{self.failure_threshold}")
+
+             if self.failure_count >= self.failure_threshold:
+                 self._transition_to_open()
+
+         elif self.state == CircuitState.HALF_OPEN:
+             # Failure in half-open state goes back to open
+             self._transition_to_open()
+
+     def _transition_to_open(self) -> None:
+         """Transition circuit to OPEN state."""
+         self.state = CircuitState.OPEN
+         self.state_change_time = time.time()
+         self.success_count = 0
+         self.logger.warning(f"Circuit breaker OPENED due to {self.failure_count} failures. "
+                             f"Recovery blocked for {self.timeout_seconds} seconds.")
+
+     def _transition_to_half_open(self) -> None:
+         """Transition circuit to HALF_OPEN state."""
+         self.state = CircuitState.HALF_OPEN
+         self.state_change_time = time.time()
+         self.success_count = 0
+         self.logger.info("Circuit breaker transitioned to HALF_OPEN. Testing recovery...")
+
+     def _transition_to_closed(self) -> None:
+         """Transition circuit to CLOSED state."""
+         self.state = CircuitState.CLOSED
+         self.state_change_time = time.time()
+         self.failure_count = 0
+         self.success_count = 0
+         self.logger.info("Circuit breaker CLOSED. Normal recovery operations resumed.")
+
+     def get_status(self) -> Dict[str, Any]:
+         """Get current circuit breaker status."""
+         current_time = time.time()
+         return {
+             'state': self.state.value,
+             'failure_count': self.failure_count,
+             'success_count': self.success_count,
+             'last_failure_time': self.last_failure_time,
+             'state_change_time': self.state_change_time,
+             'time_in_current_state': current_time - self.state_change_time,
+             'can_proceed': self.can_proceed(),
+             'config': {
+                 'failure_threshold': self.failure_threshold,
+                 'timeout_seconds': self.timeout_seconds,
+                 'success_threshold': self.success_threshold
+             }
+         }
+
+
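And a short sketch of the state machine above, using only the methods defined on CircuitBreaker (small thresholds chosen just to make the transitions visible):

    import time

    breaker = CircuitBreaker(failure_threshold=2, timeout_seconds=1, success_threshold=1)

    breaker.record_failure()
    breaker.record_failure()               # threshold reached -> circuit OPENs
    assert breaker.can_proceed() is False  # recovery blocked while OPEN

    time.sleep(1.1)                        # wait out timeout_seconds
    assert breaker.can_proceed() is True   # probe allowed -> circuit goes HALF_OPEN
    breaker.record_success()               # success_threshold met -> circuit CLOSEs again
    assert breaker.get_status()['state'] == 'closed'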
+ class RecoveryManager:
+     """Advanced recovery manager with circuit breaker and configurable strategies.
+
+     Provides comprehensive recovery capabilities including:
+     - Health-based recovery triggering
+     - Circuit breaker protection
+     - Multiple recovery strategies
+     - Recovery event logging and history
+     - Integration with service lifecycle
+     """
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None,
+                  server_instance=None):
+         """Initialize recovery manager.
+
+         Args:
+             config: Configuration dictionary for recovery settings
+             server_instance: Reference to the Socket.IO server instance
+         """
+         self.config = config or {}
+         self.server_instance = server_instance
+         self.logger = logging.getLogger(f"{__name__}.RecoveryManager")
+
+         # Configuration with defaults
+         self.enabled = self.config.get('enabled', True)
+         self.check_interval = self.config.get('check_interval', 60)
+         self.max_recovery_attempts = self.config.get('max_recovery_attempts', 5)
+         self.recovery_timeout = self.config.get('recovery_timeout', 30)
+
+         # Initialize circuit breaker
+         circuit_config = self.config.get('circuit_breaker', {})
+         self.circuit_breaker = CircuitBreaker(
+             failure_threshold=circuit_config.get('failure_threshold', 5),
+             timeout_seconds=circuit_config.get('timeout_seconds', 300),
+             success_threshold=circuit_config.get('success_threshold', 3)
+         )
+
+         # Initialize recovery strategy
+         strategy_config = self.config.get('strategy', {})
+         self.recovery_strategy = GradedRecoveryStrategy(strategy_config)
+
+         # Recovery event history
+         self.recovery_history: deque = deque(maxlen=100)
+
+         # Recovery state
+         self.recovery_in_progress = False
+         self.last_recovery_time = 0
+         self.recovery_count = 0
+
+         # Recovery callbacks
+         self.recovery_callbacks: List[Callable[[RecoveryEvent], None]] = []
+
+         # Statistics
+         self.recovery_stats = {
+             'total_recoveries': 0,
+             'successful_recoveries': 0,
+             'failed_recoveries': 0,
+             'actions_performed': {action.value: 0 for action in RecoveryAction},
+             'average_recovery_duration_ms': 0
+         }
+
+         self.logger.info(f"Recovery manager initialized with strategy: {self.recovery_strategy.get_name()}")
+
+     def add_recovery_callback(self, callback: Callable[[RecoveryEvent], None]) -> None:
+         """Add a callback to be notified of recovery events."""
+         self.recovery_callbacks.append(callback)
+         self.logger.debug(f"Added recovery callback: {callback.__name__}")
+
+     def handle_health_result(self, health_result: HealthCheckResult) -> Optional[RecoveryEvent]:
+         """Handle health check result and trigger recovery if needed.
+
+         Args:
+             health_result: Health check result to evaluate
+
+         Returns:
+             RecoveryEvent if recovery was triggered, None otherwise
+         """
+         if not self.enabled:
+             return None
+
+         if self.recovery_in_progress:
+             self.logger.debug("Recovery already in progress, skipping")
+             return None
+
+         # Check if recovery should be triggered
+         if not self.recovery_strategy.should_recover(health_result):
+             return None
+
+         # Check circuit breaker
+         if not self.circuit_breaker.can_proceed():
+             self.logger.warning("Recovery suppressed by circuit breaker")
+             return None
+
+         # Determine recovery action
+         action = self.recovery_strategy.get_recovery_action(health_result)
+
+         if action == RecoveryAction.NONE:
+             return None
+
+         # Trigger recovery
+         return asyncio.create_task(self._perform_recovery(action, health_result, "health_check"))
+
+     async def _perform_recovery(self, action: RecoveryAction,
+                                 health_result: HealthCheckResult,
+                                 trigger: str) -> RecoveryEvent:
+         """Perform recovery action and record the event.
+
+         Args:
+             action: Recovery action to perform
+             health_result: Health result that triggered recovery
+             trigger: Description of what triggered the recovery
+
+         Returns:
+             RecoveryEvent record of the recovery attempt
+         """
+         if self.recovery_in_progress:
+             raise RuntimeError("Recovery already in progress")
+
+         self.recovery_in_progress = True
+         start_time = time.time()
+         success = False
+         error_message = None
+
+         try:
+             self.logger.info(f"Starting recovery action: {action.value} (trigger: {trigger})")
+
+             if action == RecoveryAction.LOG_WARNING:
+                 success = await self._log_warning(health_result)
+             elif action == RecoveryAction.CLEAR_CONNECTIONS:
+                 success = await self._clear_connections()
+             elif action == RecoveryAction.RESTART_SERVICE:
+                 success = await self._restart_service()
+             elif action == RecoveryAction.EMERGENCY_STOP:
+                 success = await self._emergency_stop()
+             else:
+                 error_message = f"Unknown recovery action: {action}"
+                 self.logger.error(error_message)
+
+         except Exception as e:
+             error_message = f"Recovery action failed: {e}"
+             self.logger.error(error_message)
+             success = False
+
+         finally:
+             self.recovery_in_progress = False
+             duration_ms = (time.time() - start_time) * 1000
+
+             # Create recovery event
+             event = RecoveryEvent(
+                 timestamp=start_time,
+                 action=action,
+                 trigger=trigger,
+                 health_status=health_result.overall_status,
+                 success=success,
+                 duration_ms=duration_ms,
+                 error_message=error_message
+             )
+
+             # Update statistics
+             self._update_recovery_stats(event)
+
+             # Record in circuit breaker
+             if success:
+                 self.circuit_breaker.record_success()
+             else:
+                 self.circuit_breaker.record_failure()
+
+             # Store event
+             self.recovery_history.append(event)
+             self.last_recovery_time = start_time
+             self.recovery_count += 1
+
+             # Notify callbacks
+             for callback in self.recovery_callbacks:
+                 try:
+                     callback(event)
+                 except Exception as e:
+                     self.logger.error(f"Recovery callback {callback.__name__} failed: {e}")
+
+             result_msg = "succeeded" if success else "failed"
+             self.logger.info(f"Recovery action {action.value} {result_msg} in {duration_ms:.2f}ms")
+
+             return event
+
+     async def _log_warning(self, health_result: HealthCheckResult) -> bool:
+         """Log a warning about health issues."""
+         try:
+             warning_metrics = [m for m in health_result.metrics if m.status == HealthStatus.WARNING]
+             critical_metrics = [m for m in health_result.metrics if m.status == HealthStatus.CRITICAL]
+
+             self.logger.warning(f"Health warning detected: {len(warning_metrics)} warning metrics, "
+                                 f"{len(critical_metrics)} critical metrics")
+
+             for metric in warning_metrics + critical_metrics:
+                 self.logger.warning(f" {metric.name}: {metric.value} ({metric.status.value}) - {metric.message}")
+
+             return True
+         except Exception as e:
+             self.logger.error(f"Failed to log warning: {e}")
+             return False
+
+     async def _clear_connections(self) -> bool:
+         """Clear all client connections to reset connection state."""
+         try:
+             if not self.server_instance or not hasattr(self.server_instance, 'sio'):
+                 self.logger.warning("No server instance available for connection clearing")
+                 return False
+
+             sio = self.server_instance.sio
+             if not sio:
+                 self.logger.warning("Socket.IO instance not available")
+                 return False
+
+             # Get current clients
+             clients = list(self.server_instance.clients) if hasattr(self.server_instance, 'clients') else []
+
+             self.logger.info(f"Clearing {len(clients)} client connections")
+
+             # Disconnect all clients
+             for client_id in clients:
+                 try:
+                     await sio.disconnect(client_id)
+                 except Exception as e:
+                     self.logger.warning(f"Failed to disconnect client {client_id}: {e}")
+
+             # Clear client tracking
+             if hasattr(self.server_instance, 'clients'):
+                 self.server_instance.clients.clear()
+             if hasattr(self.server_instance, 'client_versions'):
+                 self.server_instance.client_versions.clear()
+
+             self.logger.info("Client connections cleared successfully")
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Failed to clear connections: {e}")
+             return False
+
+     async def _restart_service(self) -> bool:
+         """Restart the Socket.IO service."""
+         try:
+             if not self.server_instance:
+                 self.logger.error("No server instance available for restart")
+                 return False
+
+             self.logger.info("Attempting graceful service restart")
+
+             # Save current configuration
+             host = getattr(self.server_instance, 'host', 'localhost')
+             port = getattr(self.server_instance, 'port', 8765)
+
+             # Stop current server
+             try:
+                 await self.server_instance._shutdown_async()
+                 self.logger.info("Server shutdown completed")
+             except Exception as e:
+                 self.logger.warning(f"Error during shutdown: {e}")
+
+             # Wait a moment for cleanup
+             await asyncio.sleep(1)
+
+             # Restart server
+             await self.server_instance.start_async()
+             self.logger.info("Server restart completed successfully")
+
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Failed to restart service: {e}")
+             return False
+
+     async def _emergency_stop(self) -> bool:
+         """Perform emergency stop of the service."""
+         try:
+             self.logger.critical("Performing emergency stop due to critical health issues")
+
+             if self.server_instance:
+                 try:
+                     # Force immediate shutdown
+                     await self.server_instance._shutdown_async()
+                 except Exception as e:
+                     self.logger.error(f"Error during emergency shutdown: {e}")
+
+             # Send termination signal to process
+             try:
+                 os.kill(os.getpid(), signal.SIGTERM)
+             except Exception as e:
+                 self.logger.error(f"Failed to send termination signal: {e}")
+                 return False
+
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Emergency stop failed: {e}")
+             return False
+
+     def _update_recovery_stats(self, event: RecoveryEvent) -> None:
+         """Update recovery statistics with new event."""
+         self.recovery_stats['total_recoveries'] += 1
+
+         if event.success:
+             self.recovery_stats['successful_recoveries'] += 1
+         else:
+             self.recovery_stats['failed_recoveries'] += 1
+
+         self.recovery_stats['actions_performed'][event.action.value] += 1
+
+         # Update average duration
+         total_recoveries = self.recovery_stats['total_recoveries']
+         current_avg = self.recovery_stats['average_recovery_duration_ms']
+         self.recovery_stats['average_recovery_duration_ms'] = (
+             (current_avg * (total_recoveries - 1) + event.duration_ms) / total_recoveries
+         )
+
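The average duration above is maintained as a standard incremental mean; with illustrative numbers:

    # Previous average 100 ms over 4 recoveries, new event takes 200 ms:
    current_avg, total_recoveries, new_duration_ms = 100.0, 5, 200.0
    (current_avg * (total_recoveries - 1) + new_duration_ms) / total_recoveries  # -> 120.0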
+     def get_recovery_status(self) -> Dict[str, Any]:
+         """Get comprehensive recovery manager status."""
+         return {
+             'enabled': self.enabled,
+             'recovery_in_progress': self.recovery_in_progress,
+             'last_recovery_time': self.last_recovery_time,
+             'recovery_count': self.recovery_count,
+             'strategy': self.recovery_strategy.get_name(),
+             'circuit_breaker': self.circuit_breaker.get_status(),
+             'recovery_stats': dict(self.recovery_stats),
+             'recent_recoveries': [event.to_dict() for event in list(self.recovery_history)[-10:]],
+             'config': {
+                 'check_interval': self.check_interval,
+                 'max_recovery_attempts': self.max_recovery_attempts,
+                 'recovery_timeout': self.recovery_timeout
+             }
+         }
+
+     def get_recovery_history(self, limit: Optional[int] = None) -> List[RecoveryEvent]:
+         """Get recovery event history.
+
+         Args:
+             limit: Maximum number of events to return
+
+         Returns:
+             List of recovery events, newest first
+         """
+         history = list(self.recovery_history)
+         history.reverse()  # Newest first
+
+         if limit:
+             history = history[:limit]
+
+         return history
+
+     def is_enabled(self) -> bool:
+         """Check if recovery manager is enabled."""
+         return self.enabled
+
+     def enable(self) -> None:
+         """Enable recovery manager."""
+         self.enabled = True
+         self.logger.info("Recovery manager enabled")
+
+     def disable(self) -> None:
+         """Disable recovery manager."""
+         self.enabled = False
+         self.logger.info("Recovery manager disabled")