claude-mpm 4.13.2__py3-none-any.whl → 4.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of claude-mpm might be problematic. Click here for more details.
- claude_mpm/VERSION +1 -1
- claude_mpm/cli/__init__.py +10 -0
- claude_mpm/cli/commands/local_deploy.py +536 -0
- claude_mpm/cli/parsers/base_parser.py +7 -0
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/interactive_session.py +3 -0
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +35 -0
- claude_mpm/services/core/models/health.py +189 -0
- claude_mpm/services/core/models/process.py +258 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +371 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +44 -12
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resource Health Check for Claude MPM Framework
|
|
3
|
+
===============================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides resource usage monitoring including CPU, memory, file descriptors,
|
|
6
|
+
threads, and network connections to detect resource exhaustion issues.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses psutil for cross-platform resource monitoring with
|
|
9
|
+
configurable thresholds for different resource types.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- CPU usage monitoring (threshold: 80%)
|
|
13
|
+
- Memory usage monitoring (threshold: 500MB)
|
|
14
|
+
- File descriptor count (threshold: 1000, Unix only)
|
|
15
|
+
- Thread count monitoring
|
|
16
|
+
- Network connection count (open sockets)
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
resource_check = ResourceHealthCheck(process_manager)
|
|
20
|
+
result = resource_check.check(
|
|
21
|
+
deployment_id="my-app",
|
|
22
|
+
cpu_threshold=80.0,
|
|
23
|
+
memory_threshold_mb=500.0
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import platform
|
|
28
|
+
|
|
29
|
+
import psutil
|
|
30
|
+
|
|
31
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
32
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
33
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
34
|
+
from claude_mpm.services.core.models.health import HealthCheckResult, HealthStatus
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ResourceHealthCheck(SyncBaseService, IHealthCheck):
    """
    Resource usage health check implementation.

    WHY: Monitors resource consumption to detect issues before they
    cause service degradation or failures.

    Probes performed per check (each metric recorded in the result details;
    threshold violations are collected and degrade the overall status):
    - CPU usage (default threshold: 80%)
    - Memory usage, RSS (default threshold: 500MB)
    - File descriptor count (default threshold: 1000, Unix only)
    - Thread count (default threshold: 100)
    - Network connection count and per-state breakdown (reported only,
      no threshold applied)

    Thread Safety: Stateless, safe for concurrent execution.
    """

    # Default thresholds
    DEFAULT_CPU_THRESHOLD = 80.0  # Percentage
    DEFAULT_MEMORY_THRESHOLD_MB = 500.0  # Megabytes
    DEFAULT_FD_THRESHOLD = 1000  # File descriptors (Unix only)
    DEFAULT_THREAD_THRESHOLD = 100  # Threads

    def __init__(self, process_manager: ILocalProcessManager):
        """
        Initialize resource health check.

        Args:
            process_manager: Process manager for deployment lookup
        """
        super().__init__("ResourceHealthCheck")
        self.process_manager = process_manager
        # psutil's num_fds() is unavailable on Windows; used to skip the FD probe.
        self.is_windows = platform.system() == "Windows"

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("Resource health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "resource"

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute resource health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - cpu_threshold: CPU usage threshold percentage (default: 80.0)
                - memory_threshold_mb: Memory usage threshold in MB (default: 500.0)
                - fd_threshold: File descriptor threshold (default: 1000, Unix only)
                - thread_threshold: Thread count threshold (default: 100)

        Returns:
            HealthCheckResult with check status and details

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Get thresholds from kwargs
        cpu_threshold = kwargs.get("cpu_threshold", self.DEFAULT_CPU_THRESHOLD)
        memory_threshold_mb = kwargs.get(
            "memory_threshold_mb", self.DEFAULT_MEMORY_THRESHOLD_MB
        )
        fd_threshold = kwargs.get("fd_threshold", self.DEFAULT_FD_THRESHOLD)
        thread_threshold = kwargs.get("thread_threshold", self.DEFAULT_THREAD_THRESHOLD)

        try:
            process = psutil.Process(deployment.process_id)

            # Collect resource metrics
            details = {
                "pid": deployment.process_id,
                "deployment_id": deployment_id,
            }
            issues = []

            # Run each resource probe. Each helper appends its metrics to
            # `details` and any threshold violation to `issues`; probes that
            # fail with NoSuchProcess/AccessDenied are silently skipped so a
            # single inaccessible metric does not abort the whole check.
            self._check_cpu(process, cpu_threshold, details, issues)
            self._check_memory(process, memory_threshold_mb, details, issues)
            if not self.is_windows:
                self._check_file_descriptors(process, fd_threshold, details, issues)
            self._check_threads(process, thread_threshold, details, issues)
            self._collect_connections(process, details)

            # Any threshold violation degrades (but does not fail) the check:
            # the process is still alive, only its resource usage is abnormal.
            if issues:
                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message=f"Resource usage issues detected: {'; '.join(issues)}",
                    details=details,
                )
            return HealthCheckResult(
                status=HealthStatus.HEALTHY,
                check_type=self.get_check_type(),
                message="Resource usage within normal limits",
                details=details,
            )

        except psutil.NoSuchProcess:
            # Process does not exist
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                check_type=self.get_check_type(),
                message="Process no longer exists",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                },
            )

        except psutil.AccessDenied as e:
            # Cannot access process information
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Cannot access process resource information",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )

        except Exception as e:
            # Unexpected error
            self.log_error(f"Unexpected error in resource health check: {e}")
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Health check failed with error",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )

    def _check_cpu(self, process, cpu_threshold, details, issues) -> None:
        """Record CPU usage and flag it when above threshold."""
        try:
            # interval=0.1 blocks briefly so psutil can compute a real delta
            # instead of returning 0.0 on the first sample.
            cpu_percent = process.cpu_percent(interval=0.1)
            details["cpu_percent"] = round(cpu_percent, 2)
            details["cpu_threshold"] = cpu_threshold

            if cpu_percent > cpu_threshold:
                issues.append(
                    f"High CPU usage: {cpu_percent:.1f}% (threshold: {cpu_threshold}%)"
                )
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    def _check_memory(self, process, memory_threshold_mb, details, issues) -> None:
        """Record RSS memory usage (MB) and flag it when above threshold."""
        try:
            memory_info = process.memory_info()
            memory_mb = memory_info.rss / (1024 * 1024)
            details["memory_mb"] = round(memory_mb, 2)
            details["memory_threshold_mb"] = memory_threshold_mb

            if memory_mb > memory_threshold_mb:
                issues.append(
                    f"High memory usage: {memory_mb:.1f}MB (threshold: {memory_threshold_mb}MB)"
                )
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    def _check_file_descriptors(self, process, fd_threshold, details, issues) -> None:
        """Record open file descriptor count and flag it when above threshold (Unix only)."""
        try:
            num_fds = process.num_fds()
            details["num_fds"] = num_fds
            details["fd_threshold"] = fd_threshold

            if num_fds > fd_threshold:
                issues.append(
                    f"High file descriptor count: {num_fds} (threshold: {fd_threshold})"
                )
        except (
            psutil.NoSuchProcess,
            psutil.AccessDenied,
            AttributeError,
        ):
            # num_fds() not available on all platforms
            pass

    def _check_threads(self, process, thread_threshold, details, issues) -> None:
        """Record thread count and flag it when above threshold."""
        try:
            num_threads = process.num_threads()
            details["num_threads"] = num_threads
            details["thread_threshold"] = thread_threshold

            if num_threads > thread_threshold:
                issues.append(
                    f"High thread count: {num_threads} (threshold: {thread_threshold})"
                )
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    def _collect_connections(self, process, details) -> None:
        """Record open socket count and a per-state breakdown (informational, no threshold)."""
        try:
            connections = process.net_connections()
            details["num_connections"] = len(connections)

            # Add connection breakdown by state
            connection_states = {}
            for conn in connections:
                state = conn.status
                connection_states[state] = connection_states.get(state, 0) + 1
            details["connection_states"] = connection_states
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass


__all__ = ["ResourceHealthCheck"]
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health Check Manager for Claude MPM Framework
|
|
3
|
+
==============================================
|
|
4
|
+
|
|
5
|
+
WHY: Orchestrates multiple health check types, provides background monitoring,
|
|
6
|
+
and maintains historical health data for local deployments.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses background daemon thread for continuous monitoring with
|
|
9
|
+
configurable check intervals. Aggregates results from all health check types
|
|
10
|
+
using defined priority rules.
|
|
11
|
+
|
|
12
|
+
ARCHITECTURE:
|
|
13
|
+
- Orchestrates HTTP, process, and resource health checks
|
|
14
|
+
- Background monitoring thread with configurable interval (default: 30s)
|
|
15
|
+
- Thread-safe status tracking with threading.Lock
|
|
16
|
+
- Historical health data (last 100 checks per deployment)
|
|
17
|
+
- Health status aggregation with priority:
|
|
18
|
+
1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
|
|
19
|
+
2. Any check UNHEALTHY = Deployment DEGRADED (service issues)
|
|
20
|
+
3. All checks HEALTHY = Deployment HEALTHY
|
|
21
|
+
4. Otherwise = UNKNOWN
|
|
22
|
+
- Event callbacks for status changes
|
|
23
|
+
|
|
24
|
+
USAGE:
|
|
25
|
+
health_manager = HealthCheckManager(
|
|
26
|
+
process_manager=process_manager,
|
|
27
|
+
check_interval=30,
|
|
28
|
+
)
|
|
29
|
+
health_manager.start_monitoring()
|
|
30
|
+
|
|
31
|
+
# Check health on-demand
|
|
32
|
+
health = health_manager.check_health(deployment_id)
|
|
33
|
+
|
|
34
|
+
# Stop monitoring
|
|
35
|
+
health_manager.stop_monitoring()
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
import threading
|
|
39
|
+
from collections import defaultdict
|
|
40
|
+
from typing import Callable, Dict, List, Optional
|
|
41
|
+
|
|
42
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
43
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheckManager
|
|
44
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
45
|
+
from claude_mpm.services.core.models.health import (
|
|
46
|
+
DeploymentHealth,
|
|
47
|
+
HealthCheckResult,
|
|
48
|
+
HealthStatus,
|
|
49
|
+
)
|
|
50
|
+
from claude_mpm.services.local_ops.health_checks import (
|
|
51
|
+
HttpHealthCheck,
|
|
52
|
+
ProcessHealthCheck,
|
|
53
|
+
ResourceHealthCheck,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class HealthCheckManager(SyncBaseService, IHealthCheckManager):
    """
    Health check orchestration and monitoring service.

    WHY: Provides comprehensive health monitoring by coordinating multiple
    check types, maintaining historical data, and enabling background monitoring.

    Thread Safety: All public methods are thread-safe with proper locking.
    Status-change callbacks are invoked OUTSIDE the internal lock, so a
    callback may safely call back into this manager.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        check_interval: int = 30,
        history_limit: int = 100,
    ):
        """
        Initialize health check manager.

        Args:
            process_manager: Process manager for deployment lookup
            check_interval: Background check interval in seconds (default: 30)
            history_limit: Maximum historical entries per deployment (default: 100)
        """
        super().__init__("HealthCheckManager")
        self.process_manager = process_manager
        self.check_interval = check_interval
        self.history_limit = history_limit

        # Initialize health check implementations
        self.http_check = HttpHealthCheck(process_manager)
        self.process_check = ProcessHealthCheck(process_manager)
        self.resource_check = ResourceHealthCheck(process_manager)

        # Background monitoring state
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        # Non-reentrant lock guarding _monitoring, _health_history and
        # _status_callbacks. Never invoke user callbacks while holding it.
        self._lock = threading.Lock()

        # Health history: deployment_id -> List[DeploymentHealth]
        self._health_history: Dict[str, List[DeploymentHealth]] = defaultdict(list)

        # Status change callbacks
        self._status_callbacks: List[Callable] = []

    def initialize(self) -> bool:
        """
        Initialize the health check manager.

        Returns:
            True if initialization successful
        """
        try:
            # Initialize all health check implementations
            if not self.http_check.initialize():
                self.log_error("Failed to initialize HTTP health check")
                return False

            if not self.process_check.initialize():
                self.log_error("Failed to initialize process health check")
                return False

            if not self.resource_check.initialize():
                self.log_error("Failed to initialize resource health check")
                return False

            self._initialized = True
            self.log_info("Health check manager initialized")
            return True

        except Exception as e:
            self.log_error(f"Failed to initialize: {e}")
            return False

    def shutdown(self) -> None:
        """Shutdown health check manager and stop monitoring."""
        if self._monitoring:
            self.stop_monitoring()

        # Shutdown health check implementations
        self.http_check.shutdown()
        self.process_check.shutdown()
        self.resource_check.shutdown()

        self._shutdown = True
        self.log_info("Health check manager shutdown complete")

    def check_health(self, deployment_id: str, **kwargs) -> DeploymentHealth:
        """
        Execute all health checks for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters passed to health checks:
                - endpoint: HTTP endpoint URL
                - timeout: HTTP timeout in seconds
                - cpu_threshold: CPU usage threshold percentage
                - memory_threshold_mb: Memory usage threshold in MB

        Returns:
            DeploymentHealth with aggregated status and check results

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Execute all health checks. A failing check implementation is
        # converted to an UNKNOWN result rather than aborting the others.
        checks: List[HealthCheckResult] = []

        # 1. Process health check (most critical)
        checks.append(
            self._run_check(self.process_check, "process", deployment_id, **kwargs)
        )

        # 2. Resource health check
        checks.append(
            self._run_check(self.resource_check, "resource", deployment_id, **kwargs)
        )

        # 3. HTTP health check (optional, only if endpoint configured)
        http_result = self._run_check(self.http_check, "http", deployment_id, **kwargs)
        # Only add if check was actually performed (not UNKNOWN due to no endpoint)
        if http_result.status != HealthStatus.UNKNOWN or kwargs.get("endpoint"):
            checks.append(http_result)

        # Aggregate health status
        overall_status = self._aggregate_health_status(checks)

        # Create deployment health
        deployment_health = DeploymentHealth(
            deployment_id=deployment_id,
            overall_status=overall_status,
            checks=checks,
        )

        # Update health history and detect a status transition under the lock.
        # WHY: callbacks are fired AFTER releasing the lock — invoking them
        # while holding the non-reentrant lock would deadlock any callback
        # that calls back into this manager (e.g. get_health_history()).
        previous_status: Optional[HealthStatus] = None
        with self._lock:
            history = self._health_history[deployment_id]
            if history:
                previous_status = history[-1].overall_status
            history.append(deployment_health)
            # Trim history to limit
            if len(history) > self.history_limit:
                self._health_history[deployment_id] = history[-self.history_limit :]

        # Trigger callbacks outside the lock
        if previous_status is not None and previous_status != overall_status:
            self._trigger_status_callbacks(
                deployment_id, previous_status, overall_status
            )

        return deployment_health

    def _run_check(
        self, check, check_type: str, deployment_id: str, **kwargs
    ) -> HealthCheckResult:
        """Run a single health check, mapping any exception to an UNKNOWN result."""
        try:
            return check.check(deployment_id, **kwargs)
        except Exception as e:
            self.log_error(f"{check_type.capitalize()} health check failed: {e}")
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=check_type,
                message=f"Check failed: {e}",
                details={"error": str(e)},
            )

    def start_monitoring(self) -> None:
        """
        Start background health monitoring.

        WHY: Enables continuous health tracking without manual polling.
        Creates a daemon thread that performs periodic checks.
        """
        with self._lock:
            if self._monitoring:
                self.log_warning("Health monitoring already running")
                return

            self._monitoring = True
            self._stop_event.clear()

            # Create and start monitoring thread
            self._monitor_thread = threading.Thread(
                target=self._monitor_loop, daemon=True, name="HealthMonitorThread"
            )
            self._monitor_thread.start()

        self.log_info(
            f"Started health monitoring with {self.check_interval}s interval"
        )

    def stop_monitoring(self) -> None:
        """
        Stop background health monitoring.

        WHY: Gracefully stops the monitoring thread and releases resources.
        """
        with self._lock:
            if not self._monitoring:
                return

            self._monitoring = False
            self._stop_event.set()
            monitor_thread = self._monitor_thread

        # Join OUTSIDE the lock: the monitor thread acquires the same lock
        # inside check_health(), so joining while holding it could stall the
        # thread until the join timeout instead of letting it finish cleanly.
        if monitor_thread and monitor_thread.is_alive():
            monitor_thread.join(timeout=5.0)

        self.log_info("Stopped health monitoring")

    def is_monitoring(self) -> bool:
        """
        Check if background monitoring is active.

        Returns:
            True if monitoring thread is running
        """
        with self._lock:
            return self._monitoring

    def get_health_history(
        self, deployment_id: str, limit: int = 10
    ) -> List[DeploymentHealth]:
        """
        Get historical health check results for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            limit: Maximum number of historical entries to return

        Returns:
            List of DeploymentHealth objects, newest first
        """
        with self._lock:
            history = self._health_history.get(deployment_id, [])
            return list(reversed(history[-limit:]))

    def register_status_callback(
        self, callback: Callable[[str, HealthStatus, HealthStatus], None]
    ) -> None:
        """
        Register a callback for health status changes.

        Args:
            callback: Function called with (deployment_id, old_status, new_status)
        """
        with self._lock:
            self._status_callbacks.append(callback)
        # getattr fallback: partials/callable objects have no __name__.
        self.log_debug(
            f"Registered status callback: {getattr(callback, '__name__', repr(callback))}"
        )

    def _monitor_loop(self) -> None:
        """
        Background monitoring loop.

        WHY: Runs in a separate thread to perform periodic health checks
        on all active deployments.
        """
        self.log_debug("Health monitoring loop started")

        while not self._stop_event.is_set():
            try:
                # Get all active deployments
                deployments = self.process_manager.state_manager.get_all_deployments()

                for deployment in deployments:
                    if self._stop_event.is_set():
                        break

                    try:
                        # Perform health check
                        self.check_health(deployment.deployment_id)
                    except Exception as e:
                        self.log_error(
                            f"Error checking health for {deployment.deployment_id}: {e}"
                        )

                # Sleep until next check interval
                # Use wait() instead of sleep() for faster shutdown response
                self._stop_event.wait(timeout=self.check_interval)

            except Exception as e:
                self.log_error(f"Error in health monitoring loop: {e}")
                # Don't crash the thread, just continue
                self._stop_event.wait(timeout=1.0)

        self.log_debug("Health monitoring loop stopped")

    def _aggregate_health_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """
        Aggregate health status from multiple check results.

        WHY: Combines results from different check types using priority rules
        to determine overall deployment health.

        Priority Rules:
        1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
        2. Any check UNHEALTHY = Deployment DEGRADED (service issues but process alive)
        3. Any check DEGRADED = Deployment DEGRADED
        4. All checks HEALTHY = Deployment HEALTHY
        5. Otherwise = UNKNOWN

        Args:
            checks: List of health check results

        Returns:
            Aggregated HealthStatus
        """
        if not checks:
            return HealthStatus.UNKNOWN

        # Get process check result (most critical)
        process_check = next((c for c in checks if c.check_type == "process"), None)

        # Rule 1: Process UNHEALTHY = Deployment UNHEALTHY
        if process_check and process_check.status == HealthStatus.UNHEALTHY:
            return HealthStatus.UNHEALTHY

        # Rule 2: Any check UNHEALTHY (but process alive) = DEGRADED
        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
            return HealthStatus.DEGRADED

        # Rule 3: Any check DEGRADED = DEGRADED
        if any(c.status == HealthStatus.DEGRADED for c in checks):
            return HealthStatus.DEGRADED

        # Rule 4: All checks HEALTHY = Deployment HEALTHY
        if all(c.status == HealthStatus.HEALTHY for c in checks):
            return HealthStatus.HEALTHY

        # Rule 5: Otherwise = UNKNOWN
        return HealthStatus.UNKNOWN

    def _trigger_status_callbacks(
        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
    ) -> None:
        """
        Trigger registered callbacks for status changes.

        WHY: Snapshots the callback list under the lock, then invokes the
        callbacks lock-free so they may re-enter this manager safely.

        Args:
            deployment_id: Deployment that changed status
            old_status: Previous health status
            new_status: New health status
        """
        with self._lock:
            callbacks = list(self._status_callbacks)

        for callback in callbacks:
            try:
                callback(deployment_id, old_status, new_status)
            except Exception as e:
                name = getattr(callback, "__name__", repr(callback))
                self.log_error(f"Error in status callback {name}: {e}")


__all__ = ["HealthCheckManager"]
|