claude-mpm 4.13.1__py3-none-any.whl → 4.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/PM_INSTRUCTIONS.md +68 -0
- claude_mpm/cli/__init__.py +10 -0
- claude_mpm/cli/commands/local_deploy.py +536 -0
- claude_mpm/cli/parsers/base_parser.py +7 -0
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/commands/mpm-agents-detect.md +168 -0
- claude_mpm/commands/mpm-agents-recommend.md +214 -0
- claude_mpm/commands/mpm-agents.md +75 -1
- claude_mpm/commands/mpm-auto-configure.md +217 -0
- claude_mpm/commands/mpm-help.md +160 -0
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/interactive_session.py +3 -0
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +35 -0
- claude_mpm/services/core/models/health.py +189 -0
- claude_mpm/services/core/models/process.py +258 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +371 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +50 -15
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
claude_mpm/services/core/models/stability.py
@@ -0,0 +1,264 @@
+"""
+Stability Monitoring Data Models for Claude MPM Framework
+===========================================================
+
+WHY: This module defines data structures for stability monitoring operations,
+including memory leak detection, log pattern matching, and resource usage tracking.
+
+DESIGN DECISION: Uses dataclasses for immutability and type safety. Provides
+clear data structures for proactive monitoring and crash prevention.
+
+ARCHITECTURE:
+- MemoryTrend: Memory usage trend analysis with leak detection
+- LogPatternMatch: Log pattern match with severity and context
+- ResourceUsage: Comprehensive resource usage snapshot
+"""
+
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List
+
+
+@dataclass
+class MemoryTrend:
+    """
+    Memory usage trend analysis result.
+
+    WHY: Provides structured data for memory leak detection, including
+    historical measurements, slope calculation, and leak detection status.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        timestamps: List of measurement timestamps
+        memory_mb: List of memory measurements in megabytes
+        slope_mb_per_minute: Calculated memory growth rate (MB/minute)
+        is_leaking: Whether a memory leak was detected
+        window_size: Number of measurements in the analysis window
+        threshold_mb_per_minute: Leak detection threshold used
+    """
+
+    deployment_id: str
+    timestamps: List[datetime] = field(default_factory=list)
+    memory_mb: List[float] = field(default_factory=list)
+    slope_mb_per_minute: float = 0.0
+    is_leaking: bool = False
+    window_size: int = 0
+    threshold_mb_per_minute: float = 10.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetimes converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamps"] = [ts.isoformat() for ts in self.timestamps]
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MemoryTrend":
+        """
+        Create MemoryTrend from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            MemoryTrend instance
+        """
+        # Convert ISO strings to datetime
+        if isinstance(data.get("timestamps"), list):
+            data["timestamps"] = [
+                datetime.fromisoformat(ts) if isinstance(ts, str) else ts
+                for ts in data["timestamps"]
+            ]
+
+        return cls(**data)
+
+    @property
+    def latest_memory_mb(self) -> float:
+        """Get the most recent memory measurement."""
+        return self.memory_mb[-1] if self.memory_mb else 0.0
+
+    @property
+    def oldest_memory_mb(self) -> float:
+        """Get the oldest memory measurement in the window."""
+        return self.memory_mb[0] if self.memory_mb else 0.0
+
+    @property
+    def time_span_minutes(self) -> float:
+        """Get the time span covered by the measurements in minutes."""
+        if len(self.timestamps) < 2:
+            return 0.0
+        delta = self.timestamps[-1] - self.timestamps[0]
+        return delta.total_seconds() / 60.0
+
+
+@dataclass
+class LogPatternMatch:
+    """
+    Result of a log pattern match.
+
+    WHY: Contains all information about a detected error pattern in logs,
+    enabling analysis, alerting, and debugging of issues before they cause crashes.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        pattern: Regex pattern that matched
+        line: The log line that matched
+        timestamp: When the match was detected
+        severity: Error severity level (ERROR, CRITICAL, WARNING)
+        line_number: Line number in log file (if available)
+        context: Additional context lines (before/after)
+    """
+
+    deployment_id: str
+    pattern: str
+    line: str
+    timestamp: datetime = field(default_factory=datetime.now)
+    severity: str = "ERROR"
+    line_number: int = 0
+    context: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetime converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamp"] = self.timestamp.isoformat()
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LogPatternMatch":
+        """
+        Create LogPatternMatch from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            LogPatternMatch instance
+        """
+        # Convert ISO string to datetime
+        if isinstance(data.get("timestamp"), str):
+            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
+
+        return cls(**data)
+
+    @property
+    def is_critical(self) -> bool:
+        """Check if this match represents a critical error."""
+        return self.severity == "CRITICAL"
+
+
+@dataclass
+class ResourceUsage:
+    """
+    Comprehensive resource usage snapshot.
+
+    WHY: Provides detailed resource consumption metrics across multiple
+    resource types to enable preemptive action before exhaustion.
+
+    Attributes:
+        deployment_id: Unique deployment identifier
+        file_descriptors: Current file descriptor count
+        max_file_descriptors: Maximum file descriptors allowed (ulimit -n)
+        threads: Current thread count
+        connections: Current network connection count
+        disk_free_mb: Free disk space in working directory (MB)
+        is_critical: Whether any resource exceeds 80% threshold
+        timestamp: When the measurement was taken
+        details: Additional resource-specific details
+    """
+
+    deployment_id: str
+    file_descriptors: int = 0
+    max_file_descriptors: int = 0
+    threads: int = 0
+    connections: int = 0
+    disk_free_mb: float = 0.0
+    is_critical: bool = False
+    timestamp: datetime = field(default_factory=datetime.now)
+    details: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary representation with datetime converted to ISO format
+        """
+        data = asdict(self)
+        data["timestamp"] = self.timestamp.isoformat()
+        return data
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ResourceUsage":
+        """
+        Create ResourceUsage from dictionary.
+
+        Args:
+            data: Dictionary from JSON deserialization
+
+        Returns:
+            ResourceUsage instance
+        """
+        # Convert ISO string to datetime
+        if isinstance(data.get("timestamp"), str):
+            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
+
+        return cls(**data)
+
+    @property
+    def fd_usage_percent(self) -> float:
+        """Calculate file descriptor usage percentage."""
+        if self.max_file_descriptors == 0:
+            return 0.0
+        return (self.file_descriptors / self.max_file_descriptors) * 100.0
+
+    @property
+    def is_fd_critical(self) -> bool:
+        """Check if file descriptor usage is critical (>80%)."""
+        return self.fd_usage_percent >= 80.0  # >= instead of > for 80% exactly
+
+    def get_critical_resources(self) -> List[str]:
+        """
+        Get list of resources at critical levels.
+
+        Returns:
+            List of resource names exceeding 80% threshold
+        """
+        critical = []
+
+        if self.is_fd_critical:
+            critical.append(
+                f"file_descriptors ({self.file_descriptors}/{self.max_file_descriptors})"
+            )
+
+        # Check thread count (threshold from details if available)
+        thread_threshold = self.details.get("thread_threshold", 1000)
+        if self.threads > thread_threshold * 0.8:
+            critical.append(f"threads ({self.threads})")
+
+        # Check connection count (threshold from details if available)
+        connection_threshold = self.details.get("connection_threshold", 500)
+        if self.connections > connection_threshold * 0.8:
+            critical.append(f"connections ({self.connections})")
+
+        # Check disk space (threshold from details if available)
+        disk_threshold_mb = self.details.get("disk_threshold_mb", 100)
+        if self.disk_free_mb < disk_threshold_mb:
+            critical.append(f"disk_space ({self.disk_free_mb:.1f}MB free)")
+
+        return critical
+
+
+__all__ = [
+    "LogPatternMatch",
+    "MemoryTrend",
+    "ResourceUsage",
+]
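A minimal usage sketch of the new stability models (not part of the diff; it assumes claude-mpm 4.14.0 is installed and uses made-up measurement values): a MemoryTrend is round-tripped through its to_dict/from_dict helpers, and a ResourceUsage snapshot is checked for critical resources.

    from datetime import datetime, timedelta

    from claude_mpm.services.core.models.stability import MemoryTrend, ResourceUsage

    # Two hypothetical measurements five minutes apart, growing fast enough to flag a leak.
    now = datetime.now()
    trend = MemoryTrend(
        deployment_id="demo-deploy",
        timestamps=[now - timedelta(minutes=5), now],
        memory_mb=[120.0, 180.0],
        slope_mb_per_minute=12.0,
        is_leaking=True,
        window_size=2,
    )

    # Serialization keeps timestamps as ISO strings; from_dict restores datetimes.
    restored = MemoryTrend.from_dict(trend.to_dict())
    print(restored.latest_memory_mb, restored.time_span_minutes)  # 180.0 5.0

    # A snapshot at 90% file-descriptor usage and low disk reports both as critical.
    usage = ResourceUsage(
        deployment_id="demo-deploy",
        file_descriptors=900,
        max_file_descriptors=1000,
        disk_free_mb=50.0,
    )
    print(usage.is_fd_critical)            # True
    print(usage.get_critical_resources())  # file_descriptors and disk_space entries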
claude_mpm/services/local_ops/__init__.py
@@ -0,0 +1,163 @@
+"""
+Local Operations Service Package
+=================================
+
+WHY: Provides process management and health monitoring capabilities for local
+development deployments. This package implements the core infrastructure needed
+by the local-ops-agent to spawn, track, manage, and monitor background processes.
+
+DESIGN DECISION: Organized as a service package under services/ to integrate
+with the existing service architecture and dependency injection system.
+
+ARCHITECTURE:
+- StateManager: Persistent deployment state tracking
+- ProcessManager: Process lifecycle management with isolation
+- HealthCheckManager: Three-tier health monitoring (HTTP, process, resource)
+- HealthChecks: HTTP, process, and resource health check implementations
+- CrashDetector: Crash detection via health status monitoring
+- RestartPolicy: Intelligent restart policies with exponential backoff
+- RestartManager: Auto-restart orchestration with circuit breaker
+- MemoryLeakDetector: Proactive memory leak detection using trend analysis
+- LogMonitor: Real-time log file monitoring for error patterns
+- ResourceMonitor: Comprehensive resource exhaustion prevention
+- Data Models: Process status, deployment state, health status, restart history,
+  stability metrics (memory trends, log matches, resource usage)
+- Interfaces: ILocalProcessManager, IDeploymentStateManager, IHealthCheckManager,
+  ICrashDetector, IRestartPolicy, IRestartManager, IMemoryLeakDetector,
+  ILogMonitor, IResourceMonitor
+
+USAGE:
+    from claude_mpm.services.local_ops import (
+        LocalProcessManager,
+        DeploymentStateManager,
+        HealthCheckManager,
+        StartConfig,
+        ProcessStatus,
+        HealthStatus,
+    )
+
+    # Initialize managers
+    state_manager = DeploymentStateManager(".claude-mpm/deployment-state.json")
+    process_manager = LocalProcessManager(state_manager)
+    health_manager = HealthCheckManager(process_manager, check_interval=30)
+
+    # Start a process
+    config = StartConfig(
+        command=["npm", "run", "dev"],
+        working_directory="/path/to/project",
+        port=3000
+    )
+    deployment = process_manager.start(config)
+
+    # Check health
+    health = health_manager.check_health(deployment.deployment_id)
+
+    # Start background monitoring
+    health_manager.start_monitoring()
+"""
+
+# Re-export data models and interfaces for convenience
+from claude_mpm.services.core.interfaces.health import (
+    IHealthCheck,
+    IHealthCheckManager,
+)
+from claude_mpm.services.core.interfaces.process import (
+    IDeploymentStateManager,
+    ILocalProcessManager,
+)
+from claude_mpm.services.core.interfaces.restart import (
+    ICrashDetector,
+    IRestartManager,
+    IRestartPolicy,
+)
+from claude_mpm.services.core.interfaces.stability import (
+    ILogMonitor,
+    IMemoryLeakDetector,
+    IResourceMonitor,
+)
+from claude_mpm.services.core.models.health import (
+    DeploymentHealth,
+    HealthCheckResult,
+    HealthStatus,
+)
+from claude_mpm.services.core.models.process import (
+    PROTECTED_PORT_RANGES,
+    DeploymentState,
+    ProcessInfo,
+    ProcessStatus,
+    StartConfig,
+    is_port_protected,
+)
+from claude_mpm.services.core.models.restart import (
+    CircuitBreakerState,
+    RestartAttempt,
+    RestartConfig,
+    RestartHistory,
+)
+from claude_mpm.services.core.models.stability import (
+    LogPatternMatch,
+    MemoryTrend,
+    ResourceUsage,
+)
+
+# Import service implementations
+from .health_manager import HealthCheckManager
+from .log_monitor import LogMonitor
+from .memory_leak_detector import MemoryLeakDetector
+from .process_manager import (
+    LocalProcessManager,
+    PortConflictError,
+    ProcessSpawnError,
+)
+from .resource_monitor import ResourceMonitor
+from .state_manager import DeploymentStateManager, StateCorruptionError
+from .unified_manager import UnifiedLocalOpsManager
+
+__all__ = [
+    "PROTECTED_PORT_RANGES",
+    # Data models - Restart
+    "CircuitBreakerState",
+    "CrashDetector",
+    "DeploymentHealth",
+    "DeploymentState",
+    # Service implementations
+    "DeploymentStateManager",
+    "HealthCheckManager",
+    "HealthCheckResult",
+    # Data models - Health
+    "HealthStatus",
+    "ICrashDetector",
+    # Interfaces
+    "IDeploymentStateManager",
+    "IHealthCheck",
+    "IHealthCheckManager",
+    "ILocalProcessManager",
+    "ILogMonitor",
+    "IMemoryLeakDetector",
+    "IResourceMonitor",
+    "IRestartManager",
+    "IRestartPolicy",
+    "LocalProcessManager",
+    "LogMonitor",
+    "LogPatternMatch",
+    "MemoryLeakDetector",
+    # Data models - Stability
+    "MemoryTrend",
+    "PortConflictError",
+    "ProcessInfo",
+    "ProcessSpawnError",
+    # Data models - Process
+    "ProcessStatus",
+    "ResourceMonitor",
+    "ResourceUsage",
+    "RestartAttempt",
+    "RestartConfig",
+    "RestartHistory",
+    "RestartManager",
+    "RestartPolicy",
+    "StartConfig",
+    # Exceptions
+    "StateCorruptionError",
+    "UnifiedLocalOpsManager",
+    "is_port_protected",
+]
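Because the package root re-exports the interfaces, models, and implementations above, downstream code can import them from one namespace. A short sketch (the is_port_protected call signature and the subclass relationship are assumptions inferred from the names exported above, not confirmed by this diff):

    from claude_mpm.services.local_ops import (
        PROTECTED_PORT_RANGES,
        DeploymentStateManager,
        HealthStatus,
        IDeploymentStateManager,
        MemoryTrend,
        is_port_protected,
    )

    # The concrete state manager is expected to implement the re-exported interface.
    print(issubclass(DeploymentStateManager, IDeploymentStateManager))

    # Models and enums come from the same namespace as the services.
    print(HealthStatus.UNKNOWN, MemoryTrend(deployment_id="demo"))

    # Presumably checks a candidate port against PROTECTED_PORT_RANGES (assumed signature).
    print(PROTECTED_PORT_RANGES)
    print(is_port_protected(22))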
claude_mpm/services/local_ops/crash_detector.py
@@ -0,0 +1,257 @@
+"""
+Crash Detector for Claude MPM Framework
+========================================
+
+WHY: Detects process crashes and failures by monitoring health status changes,
+process exits, and zombie states. Integrates with HealthCheckManager to receive
+real-time status updates.
+
+DESIGN DECISION: Uses callback-based architecture to receive health status
+changes from HealthCheckManager. Tracks crash history per deployment to
+enable pattern detection and intelligent restart policies.
+
+ARCHITECTURE:
+- Subscribes to HealthCheckManager status change callbacks
+- Detects crashes when status transitions to UNHEALTHY
+- Tracks crash count per deployment
+- Invokes registered crash callbacks when crash detected
+
+USAGE:
+    crash_detector = CrashDetector(health_manager)
+    crash_detector.register_crash_callback(handle_crash)
+    crash_detector.start_monitoring(deployment_id)
+"""
+
+import threading
+from collections import defaultdict
+from typing import Callable, Dict, List, Set
+
+from claude_mpm.services.core.base import SyncBaseService
+from claude_mpm.services.core.interfaces.health import IHealthCheckManager
+from claude_mpm.services.core.interfaces.restart import ICrashDetector
+from claude_mpm.services.core.models.health import HealthStatus
+
+
+class CrashDetector(SyncBaseService, ICrashDetector):
+    """
+    Detects process crashes via health status monitoring.
+
+    WHY: Provides automated crash detection by monitoring health status
+    changes. Enables reactive restart policies based on crash events.
+
+    Thread Safety: All public methods are thread-safe with proper locking.
+    """
+
+    def __init__(self, health_manager: IHealthCheckManager):
+        """
+        Initialize crash detector.
+
+        Args:
+            health_manager: Health check manager for status monitoring
+        """
+        super().__init__("CrashDetector")
+        self.health_manager = health_manager
+        self._lock = threading.Lock()
+
+        # Deployments being monitored
+        self._monitored_deployments: Set[str] = set()
+
+        # Last known health status per deployment
+        self._last_health_status: Dict[str, HealthStatus] = {}
+
+        # Crash count per deployment
+        self._crash_count: Dict[str, int] = defaultdict(int)
+
+        # Crash callbacks: List of functions called with (deployment_id, reason)
+        self._crash_callbacks: List[Callable[[str, str], None]] = []
+
+    def initialize(self) -> bool:
+        """
+        Initialize the crash detector.
+
+        Returns:
+            True if initialization successful
+        """
+        self.logger.info("Initializing CrashDetector")
+
+        # Register with health manager to receive status change callbacks
+        self.health_manager.register_status_callback(self._on_health_status_change)
+
+        self.logger.info("CrashDetector initialized successfully")
+        return True
+
+    def register_crash_callback(self, callback: Callable[[str, str], None]) -> None:
+        """
+        Register a callback to be invoked when a crash is detected.
+
+        Args:
+            callback: Function called with (deployment_id, reason)
+        """
+        with self._lock:
+            self._crash_callbacks.append(callback)
+            callback_name = getattr(callback, "__name__", repr(callback))
+            self.logger.debug(f"Registered crash callback: {callback_name}")
+
+    def start_monitoring(self, deployment_id: str) -> None:
+        """
+        Start monitoring a deployment for crashes.
+
+        Args:
+            deployment_id: Unique deployment identifier
+
+        Raises:
+            ValueError: If deployment_id not found
+        """
+        with self._lock:
+            self._monitored_deployments.add(deployment_id)
+            self.logger.info(
+                f"Started crash monitoring for deployment: {deployment_id}"
+            )
+
+        # Get initial health status
+        try:
+            health = self.health_manager.check_health(deployment_id)
+            self._last_health_status[deployment_id] = health.overall_status
+            self.logger.debug(
+                f"Initial health status for {deployment_id}: {health.overall_status.value}"
+            )
+        except Exception as e:
+            self.logger.warning(
+                f"Failed to get initial health status for {deployment_id}: {e}"
+            )
+            self._last_health_status[deployment_id] = HealthStatus.UNKNOWN
+
+    def stop_monitoring(self, deployment_id: str) -> None:
+        """
+        Stop monitoring a deployment.
+
+        Args:
+            deployment_id: Unique deployment identifier
+        """
+        with self._lock:
+            self._monitored_deployments.discard(deployment_id)
+            self._last_health_status.pop(deployment_id, None)
+            self.logger.info(
+                f"Stopped crash monitoring for deployment: {deployment_id}"
+            )
+
+    def is_monitoring(self, deployment_id: str) -> bool:
+        """
+        Check if a deployment is being monitored.
+
+        Args:
+            deployment_id: Unique deployment identifier
+
+        Returns:
+            True if deployment is being monitored
+        """
+        with self._lock:
+            return deployment_id in self._monitored_deployments
+
+    def get_crash_count(self, deployment_id: str) -> int:
+        """
+        Get the number of crashes detected for a deployment.
+
+        Args:
+            deployment_id: Unique deployment identifier
+
+        Returns:
+            Number of crashes detected
+        """
+        with self._lock:
+            return self._crash_count.get(deployment_id, 0)
+
+    def reset_crash_count(self, deployment_id: str) -> None:
+        """
+        Reset crash count for a deployment.
+
+        WHY: Allows manual intervention to clear crash history.
+
+        Args:
+            deployment_id: Unique deployment identifier
+        """
+        with self._lock:
+            self._crash_count[deployment_id] = 0
+            self.logger.debug(f"Reset crash count for deployment: {deployment_id}")
+
+    def shutdown(self) -> bool:
+        """
+        Shutdown the crash detector.
+
+        Returns:
+            True if shutdown successful
+        """
+        with self._lock:
+            self._monitored_deployments.clear()
+            self._last_health_status.clear()
+            self._crash_count.clear()
+            self._crash_callbacks.clear()
+        self.logger.info("CrashDetector shutdown successfully")
+        return True
+
+    def _on_health_status_change(
+        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
+    ) -> None:
+        """
+        Handle health status changes from HealthCheckManager.
+
+        WHY: Callback invoked by HealthCheckManager when status changes.
+        Detects crashes when status transitions to UNHEALTHY.
+
+        Args:
+            deployment_id: Unique deployment identifier
+            old_status: Previous health status
+            new_status: New health status
+        """
+        with self._lock:
+            # Only process if we're monitoring this deployment
+            if deployment_id not in self._monitored_deployments:
+                return
+
+            # Update last known status
+            self._last_health_status[deployment_id] = new_status
+
+            # Detect crash: transition from operational to UNHEALTHY
+            if old_status.is_operational() and new_status.is_critical():
+                self._handle_crash(
+                    deployment_id, "Health status transitioned to UNHEALTHY"
+                )
+
+            # Also detect: transition from UNKNOWN to UNHEALTHY (process died)
+            elif (
+                old_status == HealthStatus.UNKNOWN
+                and new_status == HealthStatus.UNHEALTHY
+            ):
+                self._handle_crash(deployment_id, "Process became unhealthy")
+
+    def _handle_crash(self, deployment_id: str, reason: str) -> None:
+        """
+        Handle detected crash.
+
+        WHY: Increments crash count and invokes all registered callbacks.
+
+        Args:
+            deployment_id: Unique deployment identifier
+            reason: Reason for crash detection
+        """
+        # Increment crash count
+        self._crash_count[deployment_id] += 1
+        crash_count = self._crash_count[deployment_id]
+
+        self.logger.warning(
+            f"Crash detected for deployment {deployment_id} "
+            f"(count: {crash_count}): {reason}"
+        )
+
+        # Invoke all crash callbacks
+        for callback in self._crash_callbacks:
+            try:
+                callback(deployment_id, reason)
+            except Exception as e:
+                self.logger.error(
+                    f"Error invoking crash callback {callback.__name__}: {e}",
+                    exc_info=True,
+                )
+
+
+__all__ = ["CrashDetector"]
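A hedged sketch of how CrashDetector might be wired to the managers introduced in this release, following the USAGE notes in the module docstrings above; the state-file path, working directory, and dev-server command are placeholders. CrashDetector is imported from its own module because the package __init__ shown above lists it in __all__ without importing it.

    from claude_mpm.services.local_ops import (
        DeploymentStateManager,
        HealthCheckManager,
        LocalProcessManager,
        StartConfig,
    )
    from claude_mpm.services.local_ops.crash_detector import CrashDetector


    def handle_crash(deployment_id: str, reason: str) -> None:
        # Matches the (deployment_id, reason) signature expected by register_crash_callback.
        print(f"crash on {deployment_id}: {reason}")


    # Wiring mirrors the package docstring: state -> process -> health managers.
    state_manager = DeploymentStateManager(".claude-mpm/deployment-state.json")
    process_manager = LocalProcessManager(state_manager)
    health_manager = HealthCheckManager(process_manager, check_interval=30)

    detector = CrashDetector(health_manager)
    detector.initialize()  # subscribes to health status change callbacks
    detector.register_crash_callback(handle_crash)

    # Start a placeholder deployment and watch it for crashes.
    config = StartConfig(command=["npm", "run", "dev"], working_directory=".", port=3000)
    deployment = process_manager.start(config)
    detector.start_monitoring(deployment.deployment_id)
    health_manager.start_monitoring()

    print(detector.get_crash_count(deployment.deployment_id))  # 0 until a crash is detected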