claude-mpm 4.13.2-py3-none-any.whl → 4.14.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of claude-mpm might be problematic.

Files changed (44)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/cli/__init__.py +10 -0
  3. claude_mpm/cli/commands/local_deploy.py +536 -0
  4. claude_mpm/cli/parsers/base_parser.py +7 -0
  5. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  6. claude_mpm/config/model_config.py +428 -0
  7. claude_mpm/core/interactive_session.py +3 -0
  8. claude_mpm/services/core/interfaces/__init__.py +74 -2
  9. claude_mpm/services/core/interfaces/health.py +172 -0
  10. claude_mpm/services/core/interfaces/model.py +281 -0
  11. claude_mpm/services/core/interfaces/process.py +372 -0
  12. claude_mpm/services/core/interfaces/restart.py +307 -0
  13. claude_mpm/services/core/interfaces/stability.py +260 -0
  14. claude_mpm/services/core/models/__init__.py +35 -0
  15. claude_mpm/services/core/models/health.py +189 -0
  16. claude_mpm/services/core/models/process.py +258 -0
  17. claude_mpm/services/core/models/restart.py +302 -0
  18. claude_mpm/services/core/models/stability.py +264 -0
  19. claude_mpm/services/local_ops/__init__.py +163 -0
  20. claude_mpm/services/local_ops/crash_detector.py +257 -0
  21. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  22. claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
  23. claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
  24. claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
  25. claude_mpm/services/local_ops/health_manager.py +430 -0
  26. claude_mpm/services/local_ops/log_monitor.py +396 -0
  27. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  28. claude_mpm/services/local_ops/process_manager.py +595 -0
  29. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  30. claude_mpm/services/local_ops/restart_manager.py +401 -0
  31. claude_mpm/services/local_ops/restart_policy.py +387 -0
  32. claude_mpm/services/local_ops/state_manager.py +371 -0
  33. claude_mpm/services/local_ops/unified_manager.py +600 -0
  34. claude_mpm/services/model/__init__.py +147 -0
  35. claude_mpm/services/model/base_provider.py +365 -0
  36. claude_mpm/services/model/claude_provider.py +412 -0
  37. claude_mpm/services/model/model_router.py +453 -0
  38. claude_mpm/services/model/ollama_provider.py +415 -0
  39. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
  40. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +44 -12
  41. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
  42. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
  43. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
  44. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
claude_mpm/services/core/models/stability.py
@@ -0,0 +1,264 @@
+"""
+Stability Monitoring Data Models for Claude MPM Framework
+===========================================================
+
+WHY: This module defines data structures for stability monitoring operations,
+including memory leak detection, log pattern matching, and resource usage tracking.
+
+DESIGN DECISION: Uses dataclasses for immutability and type safety. Provides
+clear data structures for proactive monitoring and crash prevention.
+
+ARCHITECTURE:
+- MemoryTrend: Memory usage trend analysis with leak detection
+- LogPatternMatch: Log pattern match with severity and context
+- ResourceUsage: Comprehensive resource usage snapshot
+"""
+
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List
+
+
+@dataclass
+class MemoryTrend:
+    """
+    Memory usage trend analysis result.
+
+    WHY: Provides structured data for memory leak detection, including
+    historical measurements, slope calculation, and leak detection status.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        timestamps: List of measurement timestamps
+        memory_mb: List of memory measurements in megabytes
+        slope_mb_per_minute: Calculated memory growth rate (MB/minute)
+        is_leaking: Whether a memory leak was detected
+        window_size: Number of measurements in the analysis window
+        threshold_mb_per_minute: Leak detection threshold used
+    """
+
+    deployment_id: str
+    timestamps: List[datetime] = field(default_factory=list)
+    memory_mb: List[float] = field(default_factory=list)
+    slope_mb_per_minute: float = 0.0
+    is_leaking: bool = False
+    window_size: int = 0
+    threshold_mb_per_minute: float = 10.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetimes converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamps"] = [ts.isoformat() for ts in self.timestamps]
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MemoryTrend":
+        """
+        Create MemoryTrend from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            MemoryTrend instance
+        """
+        # Convert ISO strings to datetime
+        if isinstance(data.get("timestamps"), list):
+            data["timestamps"] = [
+                datetime.fromisoformat(ts) if isinstance(ts, str) else ts
+                for ts in data["timestamps"]
+            ]
+
+        return cls(**data)
+
+    @property
+    def latest_memory_mb(self) -> float:
+        """Get the most recent memory measurement."""
+        return self.memory_mb[-1] if self.memory_mb else 0.0
+
+    @property
+    def oldest_memory_mb(self) -> float:
+        """Get the oldest memory measurement in the window."""
+        return self.memory_mb[0] if self.memory_mb else 0.0
+
+    @property
+    def time_span_minutes(self) -> float:
+        """Get the time span covered by the measurements in minutes."""
+        if len(self.timestamps) < 2:
+            return 0.0
+        delta = self.timestamps[-1] - self.timestamps[0]
+        return delta.total_seconds() / 60.0
+
+
+@dataclass
+class LogPatternMatch:
+    """
+    Result of a log pattern match.
+
+    WHY: Contains all information about a detected error pattern in logs,
+    enabling analysis, alerting, and debugging of issues before they cause crashes.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        pattern: Regex pattern that matched
+        line: The log line that matched
+        timestamp: When the match was detected
+        severity: Error severity level (ERROR, CRITICAL, WARNING)
+        line_number: Line number in log file (if available)
+        context: Additional context lines (before/after)
+    """
+
+    deployment_id: str
+    pattern: str
+    line: str
+    timestamp: datetime = field(default_factory=datetime.now)
+    severity: str = "ERROR"
+    line_number: int = 0
+    context: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetime converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamp"] = self.timestamp.isoformat()
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LogPatternMatch":
+        """
+        Create LogPatternMatch from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            LogPatternMatch instance
+        """
+        # Convert ISO string to datetime
+        if isinstance(data.get("timestamp"), str):
+            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
+
+        return cls(**data)
+
+    @property
+    def is_critical(self) -> bool:
+        """Check if this match represents a critical error."""
+        return self.severity == "CRITICAL"
+
+
+@dataclass
+class ResourceUsage:
+    """
+    Comprehensive resource usage snapshot.
+
+    WHY: Provides detailed resource consumption metrics across multiple
+    resource types to enable preemptive action before exhaustion.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        file_descriptors: Current file descriptor count
+        max_file_descriptors: Maximum file descriptors allowed (ulimit -n)
+        threads: Current thread count
+        connections: Current network connection count
+        disk_free_mb: Free disk space in working directory (MB)
+        is_critical: Whether any resource exceeds 80% threshold
+        timestamp: When the measurement was taken
+        details: Additional resource-specific details
+    """
+
+    deployment_id: str
+    file_descriptors: int = 0
+    max_file_descriptors: int = 0
+    threads: int = 0
+    connections: int = 0
+    disk_free_mb: float = 0.0
+    is_critical: bool = False
+    timestamp: datetime = field(default_factory=datetime.now)
+    details: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetime converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamp"] = self.timestamp.isoformat()
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ResourceUsage":
+        """
+        Create ResourceUsage from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            ResourceUsage instance
+        """
+        # Convert ISO string to datetime
+        if isinstance(data.get("timestamp"), str):
+            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
+
+        return cls(**data)
+
+    @property
+    def fd_usage_percent(self) -> float:
+        """Calculate file descriptor usage percentage."""
+        if self.max_file_descriptors == 0:
+            return 0.0
+        return (self.file_descriptors / self.max_file_descriptors) * 100.0
+
+    @property
+    def is_fd_critical(self) -> bool:
+        """Check if file descriptor usage is critical (>=80%)."""
+        return self.fd_usage_percent >= 80.0  # >= so that exactly 80% counts as critical
+
+    def get_critical_resources(self) -> List[str]:
+        """
+        Get list of resources at critical levels.

+        Returns:
+            List of resource names exceeding 80% threshold
+        """
+        critical = []
+
+        if self.is_fd_critical:
+            critical.append(
+                f"file_descriptors ({self.file_descriptors}/{self.max_file_descriptors})"
+            )
+
+        # Check thread count (threshold from details if available)
+        thread_threshold = self.details.get("thread_threshold", 1000)
+        if self.threads > thread_threshold * 0.8:
+            critical.append(f"threads ({self.threads})")
+
+        # Check connection count (threshold from details if available)
+        connection_threshold = self.details.get("connection_threshold", 500)
+        if self.connections > connection_threshold * 0.8:
+            critical.append(f"connections ({self.connections})")
+
+        # Check disk space (threshold from details if available)
+        disk_threshold_mb = self.details.get("disk_threshold_mb", 100)
+        if self.disk_free_mb < disk_threshold_mb:
+            critical.append(f"disk_space ({self.disk_free_mb:.1f}MB free)")
+
+        return critical
+
+
+__all__ = [
+    "LogPatternMatch",
+    "MemoryTrend",
+    "ResourceUsage",
+]
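
The to_dict()/from_dict() helpers above exist so monitoring state survives JSON round-trips. A minimal sketch (not part of the diff) of how the MemoryTrend model is meant to be filled and serialized; the deployment id and sample values are invented, and the least-squares slope here is illustrative only, since the package's actual detector lives in memory_leak_detector.py and may compute it differently:

    from datetime import datetime, timedelta

    from claude_mpm.services.core.models.stability import MemoryTrend

    # Five samples one minute apart, growing ~12 MB/minute.
    start = datetime.now()
    timestamps = [start + timedelta(minutes=i) for i in range(5)]
    memory_mb = [200.0 + 12.0 * i for i in range(5)]

    # Illustrative least-squares slope in MB/minute over the window.
    xs = [(ts - timestamps[0]).total_seconds() / 60.0 for ts in timestamps]
    n = len(xs)
    mean_x, mean_y = sum(xs) / n, sum(memory_mb) / n
    slope = sum(
        (x - mean_x) * (y - mean_y) for x, y in zip(xs, memory_mb)
    ) / sum((x - mean_x) ** 2 for x in xs)

    trend = MemoryTrend(
        deployment_id="web-1",      # hypothetical deployment id
        timestamps=timestamps,
        memory_mb=memory_mb,
        slope_mb_per_minute=slope,  # 12.0 for this data
        is_leaking=slope > 10.0,    # default threshold_mb_per_minute
        window_size=n,
    )

    # Datetimes survive the JSON round-trip as ISO-8601 strings.
    restored = MemoryTrend.from_dict(trend.to_dict())
    assert restored.timestamps == trend.timestamps
    assert restored.time_span_minutes == 4.0
    assert restored.is_leaking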
claude_mpm/services/local_ops/__init__.py
@@ -0,0 +1,163 @@
+"""
+Local Operations Service Package
+=================================
+
+WHY: Provides process management and health monitoring capabilities for local
+development deployments. This package implements the core infrastructure needed
+by the local-ops-agent to spawn, track, manage, and monitor background processes.
+
+DESIGN DECISION: Organized as a service package under services/ to integrate
+with the existing service architecture and dependency injection system.
+
+ARCHITECTURE:
+- StateManager: Persistent deployment state tracking
+- ProcessManager: Process lifecycle management with isolation
+- HealthCheckManager: Three-tier health monitoring (HTTP, process, resource)
+- HealthChecks: HTTP, process, and resource health check implementations
+- CrashDetector: Crash detection via health status monitoring
+- RestartPolicy: Intelligent restart policies with exponential backoff
+- RestartManager: Auto-restart orchestration with circuit breaker
+- MemoryLeakDetector: Proactive memory leak detection using trend analysis
+- LogMonitor: Real-time log file monitoring for error patterns
+- ResourceMonitor: Comprehensive resource exhaustion prevention
+- Data Models: Process status, deployment state, health status, restart history,
+  stability metrics (memory trends, log matches, resource usage)
+- Interfaces: ILocalProcessManager, IDeploymentStateManager, IHealthCheckManager,
+  ICrashDetector, IRestartPolicy, IRestartManager, IMemoryLeakDetector,
+  ILogMonitor, IResourceMonitor
+
+USAGE:
+    from claude_mpm.services.local_ops import (
+        LocalProcessManager,
+        DeploymentStateManager,
+        HealthCheckManager,
+        StartConfig,
+        ProcessStatus,
+        HealthStatus,
+    )
+
+    # Initialize managers
+    state_manager = DeploymentStateManager(".claude-mpm/deployment-state.json")
+    process_manager = LocalProcessManager(state_manager)
+    health_manager = HealthCheckManager(process_manager, check_interval=30)
+
+    # Start a process
+    config = StartConfig(
+        command=["npm", "run", "dev"],
+        working_directory="/path/to/project",
+        port=3000
+    )
+    deployment = process_manager.start(config)
+
+    # Check health
+    health = health_manager.check_health(deployment.deployment_id)
+
+    # Start background monitoring
+    health_manager.start_monitoring()
+"""
+
+# Re-export data models and interfaces for convenience
+from claude_mpm.services.core.interfaces.health import (
+    IHealthCheck,
+    IHealthCheckManager,
+)
+from claude_mpm.services.core.interfaces.process import (
+    IDeploymentStateManager,
+    ILocalProcessManager,
+)
+from claude_mpm.services.core.interfaces.restart import (
+    ICrashDetector,
+    IRestartManager,
+    IRestartPolicy,
+)
+from claude_mpm.services.core.interfaces.stability import (
+    ILogMonitor,
+    IMemoryLeakDetector,
+    IResourceMonitor,
+)
+from claude_mpm.services.core.models.health import (
+    DeploymentHealth,
+    HealthCheckResult,
+    HealthStatus,
+)
+from claude_mpm.services.core.models.process import (
+    PROTECTED_PORT_RANGES,
+    DeploymentState,
+    ProcessInfo,
+    ProcessStatus,
+    StartConfig,
+    is_port_protected,
+)
+from claude_mpm.services.core.models.restart import (
+    CircuitBreakerState,
+    RestartAttempt,
+    RestartConfig,
+    RestartHistory,
+)
+from claude_mpm.services.core.models.stability import (
+    LogPatternMatch,
+    MemoryTrend,
+    ResourceUsage,
+)
+
+# Import service implementations (crash/restart services re-exported via __all__)
+from .crash_detector import CrashDetector
+from .health_manager import HealthCheckManager
+from .log_monitor import LogMonitor
+from .memory_leak_detector import MemoryLeakDetector
+from .process_manager import (
+    LocalProcessManager,
+    PortConflictError,
+    ProcessSpawnError,
+)
+from .resource_monitor import ResourceMonitor
+from .restart_manager import RestartManager
+from .restart_policy import RestartPolicy
+from .state_manager import DeploymentStateManager, StateCorruptionError
+from .unified_manager import UnifiedLocalOpsManager
+
+__all__ = [
+    "PROTECTED_PORT_RANGES",
+    # Data models - Restart
+    "CircuitBreakerState",
+    "CrashDetector",
+    "DeploymentHealth",
+    "DeploymentState",
+    # Service implementations
+    "DeploymentStateManager",
+    "HealthCheckManager",
+    "HealthCheckResult",
+    # Data models - Health
+    "HealthStatus",
+    "ICrashDetector",
+    # Interfaces
+    "IDeploymentStateManager",
+    "IHealthCheck",
+    "IHealthCheckManager",
+    "ILocalProcessManager",
+    "ILogMonitor",
+    "IMemoryLeakDetector",
+    "IResourceMonitor",
+    "IRestartManager",
+    "IRestartPolicy",
+    "LocalProcessManager",
+    "LogMonitor",
+    "LogPatternMatch",
+    "MemoryLeakDetector",
+    # Data models - Stability
+    "MemoryTrend",
+    "PortConflictError",
+    "ProcessInfo",
+    "ProcessSpawnError",
+    # Data models - Process
+    "ProcessStatus",
+    "ResourceMonitor",
+    "ResourceUsage",
+    "RestartAttempt",
+    "RestartConfig",
+    "RestartHistory",
+    "RestartManager",
+    "RestartPolicy",
+    "StartConfig",
+    # Exceptions
+    "StateCorruptionError",
+    "UnifiedLocalOpsManager",
+    "is_port_protected",
+]
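
Because the package re-exports the stability models, the threshold logic defined in stability.py can be exercised directly from claude_mpm.services.local_ops. A short sketch (not part of the diff) with invented numbers, showing which resources get_critical_resources() flags against the defaults (80% of file descriptors, 80% of the 1000-thread and 500-connection thresholds, 100 MB free disk):

    from claude_mpm.services.local_ops import ResourceUsage

    usage = ResourceUsage(
        deployment_id="web-1",      # hypothetical deployment id
        file_descriptors=900,
        max_file_descriptors=1024,  # ~87.9% usage -> critical (>= 80%)
        threads=150,                # below 0.8 * 1000 default -> OK
        connections=600,            # above 0.8 * 500 default -> critical
        disk_free_mb=50.0,          # below 100 MB default -> critical
    )

    print(usage.fd_usage_percent)   # 87.890625
    print(usage.get_critical_resources())
    # ['file_descriptors (900/1024)', 'connections (600)',
    #  'disk_space (50.0MB free)']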
claude_mpm/services/local_ops/crash_detector.py
@@ -0,0 +1,257 @@
+"""
+Crash Detector for Claude MPM Framework
+========================================
+
+WHY: Detects process crashes and failures by monitoring health status changes,
+process exits, and zombie states. Integrates with HealthCheckManager to receive
+real-time status updates.
+
+DESIGN DECISION: Uses callback-based architecture to receive health status
+changes from HealthCheckManager. Tracks crash history per deployment to
+enable pattern detection and intelligent restart policies.
+
+ARCHITECTURE:
+- Subscribes to HealthCheckManager status change callbacks
+- Detects crashes when status transitions to UNHEALTHY
+- Tracks crash count per deployment
+- Invokes registered crash callbacks when crash detected
+
+USAGE:
+    crash_detector = CrashDetector(health_manager)
+    crash_detector.register_crash_callback(handle_crash)
+    crash_detector.start_monitoring(deployment_id)
+"""
+
+import threading
+from collections import defaultdict
+from typing import Callable, Dict, List, Set
+
+from claude_mpm.services.core.base import SyncBaseService
+from claude_mpm.services.core.interfaces.health import IHealthCheckManager
+from claude_mpm.services.core.interfaces.restart import ICrashDetector
+from claude_mpm.services.core.models.health import HealthStatus
+
+
+class CrashDetector(SyncBaseService, ICrashDetector):
+    """
+    Detects process crashes via health status monitoring.
+
+    WHY: Provides automated crash detection by monitoring health status
+    changes. Enables reactive restart policies based on crash events.
+
+    Thread Safety: All public methods are thread-safe with proper locking.
+    """
+
+    def __init__(self, health_manager: IHealthCheckManager):
+        """
+        Initialize crash detector.
+
+        Args:
+            health_manager: Health check manager for status monitoring
+        """
+        super().__init__("CrashDetector")
+        self.health_manager = health_manager
+        self._lock = threading.Lock()
+
+        # Deployments being monitored
+        self._monitored_deployments: Set[str] = set()
+
+        # Last known health status per deployment
+        self._last_health_status: Dict[str, HealthStatus] = {}
+
+        # Crash count per deployment
+        self._crash_count: Dict[str, int] = defaultdict(int)
+
+        # Crash callbacks: List of functions called with (deployment_id, reason)
+        self._crash_callbacks: List[Callable[[str, str], None]] = []
+
+    def initialize(self) -> bool:
+        """
+        Initialize the crash detector.
+
+        Returns:
+            True if initialization successful
+        """
+        self.logger.info("Initializing CrashDetector")
+
+        # Register with health manager to receive status change callbacks
+        self.health_manager.register_status_callback(self._on_health_status_change)
+
+        self.logger.info("CrashDetector initialized successfully")
+        return True
+
+    def register_crash_callback(self, callback: Callable[[str, str], None]) -> None:
+        """
+        Register a callback to be invoked when a crash is detected.
+
+        Args:
+            callback: Function called with (deployment_id, reason)
+        """
+        with self._lock:
+            self._crash_callbacks.append(callback)
+            callback_name = getattr(callback, "__name__", repr(callback))
+            self.logger.debug(f"Registered crash callback: {callback_name}")
+
+    def start_monitoring(self, deployment_id: str) -> None:
+        """
+        Start monitoring a deployment for crashes.
+
+        Args:
+            deployment_id: Unique deployment identifier
+        """
+        with self._lock:
+            self._monitored_deployments.add(deployment_id)
+            self.logger.info(
+                f"Started crash monitoring for deployment: {deployment_id}"
+            )
+
+            # Get initial health status
+            try:
+                health = self.health_manager.check_health(deployment_id)
+                self._last_health_status[deployment_id] = health.overall_status
+                self.logger.debug(
+                    f"Initial health status for {deployment_id}: {health.overall_status.value}"
+                )
+            except Exception as e:
+                self.logger.warning(
+                    f"Failed to get initial health status for {deployment_id}: {e}"
+                )
+                self._last_health_status[deployment_id] = HealthStatus.UNKNOWN
+
+    def stop_monitoring(self, deployment_id: str) -> None:
+        """
+        Stop monitoring a deployment.
+
+        Args:
+            deployment_id: Unique deployment identifier
+        """
+        with self._lock:
+            self._monitored_deployments.discard(deployment_id)
+            self._last_health_status.pop(deployment_id, None)
+            self.logger.info(
+                f"Stopped crash monitoring for deployment: {deployment_id}"
+            )
+
+    def is_monitoring(self, deployment_id: str) -> bool:
+        """
+        Check if a deployment is being monitored.
+
+        Args:
+            deployment_id: Unique deployment identifier
+
+        Returns:
+            True if deployment is being monitored
+        """
+        with self._lock:
+            return deployment_id in self._monitored_deployments
+
+    def get_crash_count(self, deployment_id: str) -> int:
+        """
+        Get the number of crashes detected for a deployment.
+
+        Args:
+            deployment_id: Unique deployment identifier
+
+        Returns:
+            Number of crashes detected
+        """
+        with self._lock:
+            return self._crash_count.get(deployment_id, 0)
+
+    def reset_crash_count(self, deployment_id: str) -> None:
+        """
+        Reset crash count for a deployment.
+
+        WHY: Allows manual intervention to clear crash history.
+
+        Args:
+            deployment_id: Unique deployment identifier
+        """
+        with self._lock:
+            self._crash_count[deployment_id] = 0
+            self.logger.debug(f"Reset crash count for deployment: {deployment_id}")
+
+    def shutdown(self) -> bool:
+        """
+        Shutdown the crash detector.
+
+        Returns:
+            True if shutdown successful
+        """
+        with self._lock:
+            self._monitored_deployments.clear()
+            self._last_health_status.clear()
+            self._crash_count.clear()
+            self._crash_callbacks.clear()
+        self.logger.info("CrashDetector shutdown successfully")
+        return True
+
+    def _on_health_status_change(
+        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
+    ) -> None:
+        """
+        Handle health status changes from HealthCheckManager.
+
+        WHY: Callback invoked by HealthCheckManager when status changes.
+        Detects crashes when status transitions to UNHEALTHY.
+
+        Args:
+            deployment_id: Unique deployment identifier
+            old_status: Previous health status
+            new_status: New health status
+        """
+        with self._lock:
+            # Only process if we're monitoring this deployment
+            if deployment_id not in self._monitored_deployments:
+                return
+
+            # Update last known status
+            self._last_health_status[deployment_id] = new_status
+
+            # Detect crash: transition from operational to UNHEALTHY
+            if old_status.is_operational() and new_status.is_critical():
+                self._handle_crash(
+                    deployment_id, "Health status transitioned to UNHEALTHY"
+                )
+
+            # Also detect: transition from UNKNOWN to UNHEALTHY (process died)
+            elif (
+                old_status == HealthStatus.UNKNOWN
+                and new_status == HealthStatus.UNHEALTHY
+            ):
+                self._handle_crash(deployment_id, "Process became unhealthy")
+
+    def _handle_crash(self, deployment_id: str, reason: str) -> None:
+        """
+        Handle detected crash.
+
+        WHY: Increments crash count and invokes all registered callbacks.
+
+        Args:
+            deployment_id: Unique deployment identifier
+            reason: Reason for crash detection
+        """
+        # Increment crash count
+        self._crash_count[deployment_id] += 1
+        crash_count = self._crash_count[deployment_id]
+
+        self.logger.warning(
+            f"Crash detected for deployment {deployment_id} "
+            f"(count: {crash_count}): {reason}"
+        )
+
+        # Invoke all crash callbacks
+        for callback in self._crash_callbacks:
+            try:
+                callback(deployment_id, reason)
+            except Exception as e:
+                callback_name = getattr(callback, "__name__", repr(callback))
+                self.logger.error(
+                    f"Error invoking crash callback {callback_name}: {e}",
+                    exc_info=True,
+                )
+
+
+__all__ = ["CrashDetector"]
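
Taken together with the __init__.py usage shown above, wiring the detector looks roughly like the sketch below (not part of the diff). The manager constructor signatures come from the package's own USAGE docstring; the deployment id and callback body are invented, and RestartManager integration is omitted since its module is not excerpted here:

    from claude_mpm.services.local_ops import (
        DeploymentStateManager,
        HealthCheckManager,
        LocalProcessManager,
    )
    from claude_mpm.services.local_ops.crash_detector import CrashDetector

    state_manager = DeploymentStateManager(".claude-mpm/deployment-state.json")
    process_manager = LocalProcessManager(state_manager)
    health_manager = HealthCheckManager(process_manager, check_interval=30)

    detector = CrashDetector(health_manager)
    detector.initialize()  # subscribes to health status-change callbacks

    def handle_crash(deployment_id: str, reason: str) -> None:
        # Matches the registered callback signature: (deployment_id, reason).
        print(f"crash on {deployment_id}: {reason}")

    detector.register_crash_callback(handle_crash)
    detector.start_monitoring("web-1")  # hypothetical deployment id

    # Later: inspect or reset the per-deployment crash counter.
    if detector.get_crash_count("web-1") >= 3:
        detector.reset_crash_count("web-1")
    detector.shutdown()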