claude-mpm 4.13.1__py3-none-any.whl → 4.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of claude-mpm might be problematic. Click here for more details.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/PM_INSTRUCTIONS.md +68 -0
- claude_mpm/cli/__init__.py +10 -0
- claude_mpm/cli/commands/local_deploy.py +536 -0
- claude_mpm/cli/parsers/base_parser.py +7 -0
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/commands/mpm-agents-detect.md +168 -0
- claude_mpm/commands/mpm-agents-recommend.md +214 -0
- claude_mpm/commands/mpm-agents.md +75 -1
- claude_mpm/commands/mpm-auto-configure.md +217 -0
- claude_mpm/commands/mpm-help.md +160 -0
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/interactive_session.py +3 -0
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +35 -0
- claude_mpm/services/core/models/health.py +189 -0
- claude_mpm/services/core/models/process.py +258 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +371 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +50 -15
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health Check Implementations for Local Operations
|
|
3
|
+
==================================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides three-tier health monitoring for local deployments:
|
|
6
|
+
- HTTP health checks for endpoint availability
|
|
7
|
+
- Process health checks for process status
|
|
8
|
+
- Resource health checks for CPU/memory/connections
|
|
9
|
+
|
|
10
|
+
ARCHITECTURE:
|
|
11
|
+
- HttpHealthCheck: HTTP endpoint availability and response time
|
|
12
|
+
- ProcessHealthCheck: Process existence and status validation
|
|
13
|
+
- ResourceHealthCheck: CPU, memory, and connection monitoring
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from claude_mpm.services.local_ops.health_checks.http_check import HttpHealthCheck
|
|
17
|
+
from claude_mpm.services.local_ops.health_checks.process_check import (
|
|
18
|
+
ProcessHealthCheck,
|
|
19
|
+
)
|
|
20
|
+
from claude_mpm.services.local_ops.health_checks.resource_check import (
|
|
21
|
+
ResourceHealthCheck,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"HttpHealthCheck",
|
|
26
|
+
"ProcessHealthCheck",
|
|
27
|
+
"ResourceHealthCheck",
|
|
28
|
+
]
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTTP Health Check for Claude MPM Framework
|
|
3
|
+
===========================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides HTTP endpoint health monitoring with response time measurement,
|
|
6
|
+
status code validation, and timeout handling.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses requests library with configurable timeout and retry logic.
|
|
9
|
+
Supports custom headers and SSL/TLS validation.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- Synchronous HTTP GET requests
|
|
13
|
+
- Response time measurement with time.perf_counter()
|
|
14
|
+
- Status code validation (2xx/3xx = healthy)
|
|
15
|
+
- Timeout and connection error handling
|
|
16
|
+
- Retry logic with exponential backoff
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
http_check = HttpHealthCheck(process_manager)
|
|
20
|
+
result = http_check.check(
|
|
21
|
+
deployment_id="my-app",
|
|
22
|
+
endpoint="http://localhost:3000/health",
|
|
23
|
+
timeout=5.0
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import time
|
|
28
|
+
|
|
29
|
+
import requests
|
|
30
|
+
from requests.exceptions import ConnectionError, RequestException, Timeout
|
|
31
|
+
|
|
32
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
33
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
34
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
35
|
+
from claude_mpm.services.core.models.health import HealthCheckResult, HealthStatus
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class HttpHealthCheck(SyncBaseService, IHealthCheck):
    """
    HTTP endpoint health check implementation.

    WHY: Validates that deployed services are accessible via HTTP and
    responding within acceptable timeframes.

    Outcome mapping used by check():
    - response with the expected status code, or any 2xx/3xx -> HEALTHY
    - response with any other status code -> DEGRADED (not retried)
    - timeout on every attempt -> DEGRADED
    - connection error on every attempt -> UNHEALTHY
    - any other requests failure -> UNHEALTHY (not retried)
    - no endpoint supplied and no deployment port to derive one -> UNKNOWN

    Thread Safety: Stateless, safe for concurrent execution.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        default_timeout: float = 5.0,
        max_retries: int = 2,
    ):
        """
        Initialize HTTP health check.

        Args:
            process_manager: Process manager for deployment lookup
            default_timeout: Default timeout in seconds
            max_retries: Maximum number of retry attempts after the first
                request. Negative values are treated as 0 by check().
        """
        super().__init__("HttpHealthCheck")
        self.process_manager = process_manager
        self.default_timeout = default_timeout
        self.max_retries = max_retries

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("HTTP health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "http"

    def _backoff_before_retry(
        self, attempt: int, total_attempts: int, failure: str, deployment_id: str
    ) -> bool:
        """
        Sleep with exponential backoff if another attempt remains.

        Args:
            attempt: Zero-based index of the attempt that just failed
            total_attempts: Total number of attempts check() will make
            failure: Short failure description used in the debug log line
            deployment_id: Deployment being checked (for logging only)

        Returns:
            True if the caller should retry, False if attempts are exhausted
        """
        retries = total_attempts - 1
        if attempt >= retries:
            return False

        self.log_debug(
            f"HTTP {failure} for {deployment_id}, "
            f"retrying (attempt {attempt + 1}/{retries})"
        )
        time.sleep(0.5 * (2**attempt))  # Exponential backoff: 0.5s, 1s, 2s, ...
        return True

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute HTTP health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - endpoint: HTTP endpoint URL. When omitted, defaults to
                  http://localhost:<deployment.port>/health if the
                  deployment has a port.
                - timeout: Request timeout in seconds
                  (default: the instance's default_timeout)
                - headers: Custom HTTP headers
                - verify_ssl: Verify SSL certificates (default: True)
                - expected_status: Status code that is accepted as healthy
                  in addition to any 2xx/3xx response (default: 200)

        Returns:
            HealthCheckResult with check status and details. An UNKNOWN
            result is returned when no endpoint is given and none can be
            derived from the deployment port.

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Resolve the endpoint: explicit kwarg first, then the deployment port
        endpoint = kwargs.get("endpoint")
        if not endpoint:
            if not deployment.port:
                return HealthCheckResult(
                    status=HealthStatus.UNKNOWN,
                    check_type=self.get_check_type(),
                    message="No HTTP endpoint configured for deployment",
                    details={"deployment_id": deployment_id},
                )
            endpoint = f"http://localhost:{deployment.port}/health"

        # Get optional parameters
        timeout = kwargs.get("timeout", self.default_timeout)
        headers = kwargs.get("headers", {})
        verify_ssl = kwargs.get("verify_ssl", True)
        expected_status = kwargs.get("expected_status", 200)

        # Always make at least one request, even if max_retries was
        # configured as a negative number.
        total_attempts = max(0, self.max_retries) + 1

        for attempt in range(total_attempts):
            try:
                start_time = time.perf_counter()
                response = requests.get(
                    endpoint, timeout=timeout, headers=headers, verify=verify_ssl
                )
                response_time = time.perf_counter() - start_time

            except Timeout:
                if self._backoff_before_retry(
                    attempt, total_attempts, "check timeout", deployment_id
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message=f"HTTP endpoint timeout after {total_attempts} attempts",
                    details={
                        "endpoint": endpoint,
                        "timeout_seconds": timeout,
                        "attempts": total_attempts,
                    },
                )

            except ConnectionError as e:
                if self._backoff_before_retry(
                    attempt, total_attempts, "connection error", deployment_id
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Cannot connect to HTTP endpoint",
                    details={
                        "endpoint": endpoint,
                        "error": str(e),
                        "attempts": total_attempts,
                    },
                )

            except RequestException as e:
                # Any other requests failure is reported immediately (no retry)
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="HTTP request failed",
                    details={"endpoint": endpoint, "error": str(e)},
                )

            # A response was received: the expected code or any 2xx/3xx is healthy
            if response.status_code == expected_status or (
                200 <= response.status_code < 400
            ):
                return HealthCheckResult(
                    status=HealthStatus.HEALTHY,
                    check_type=self.get_check_type(),
                    message="HTTP endpoint responding normally",
                    details={
                        "endpoint": endpoint,
                        "status_code": response.status_code,
                        "response_time_ms": round(response_time * 1000, 2),
                        "attempt": attempt + 1,
                    },
                )

            # The endpoint answered, but with a status outside the accepted set
            return HealthCheckResult(
                status=HealthStatus.DEGRADED,
                check_type=self.get_check_type(),
                message="HTTP endpoint returned unexpected status code",
                details={
                    "endpoint": endpoint,
                    "status_code": response.status_code,
                    "expected_status": expected_status,
                    "response_time_ms": round(response_time * 1000, 2),
                },
            )

        # Should not reach here (every iteration returns or retries),
        # but return unknown as a defensive fallback
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=self.get_check_type(),
            message="HTTP check completed with unknown result",
            details={"endpoint": endpoint},
        )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# Explicit public API: a star-import of this module exposes only HttpHealthCheck.
__all__ = ["HttpHealthCheck"]
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Process Health Check for Claude MPM Framework
|
|
3
|
+
==============================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides process-level health monitoring including existence validation,
|
|
6
|
+
status checking (running/zombie/stopped), and exit code detection.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses psutil for cross-platform process monitoring. Validates
|
|
9
|
+
process existence, status, and parent-child relationships.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- Process existence verification with psutil.Process(pid)
|
|
13
|
+
- Process status checking (running, zombie, stopped, sleeping)
|
|
14
|
+
- Exit code detection for dead processes
|
|
15
|
+
- Parent-child relationship validation
|
|
16
|
+
- Process responsiveness checking (not hung)
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
process_check = ProcessHealthCheck(process_manager)
|
|
20
|
+
result = process_check.check(deployment_id="my-app")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import psutil
|
|
24
|
+
|
|
25
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
26
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
27
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
28
|
+
from claude_mpm.services.core.models.health import HealthCheckResult, HealthStatus
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ProcessHealthCheck(SyncBaseService, IHealthCheck):
    """
    Process status health check implementation.

    WHY: Validates that the process is running properly and not in a
    degraded state (zombie, stopped, etc.).

    Outcome mapping used by check():
    - process missing, not running, zombie, stopped or dead -> UNHEALTHY
    - running but failing the responsiveness heuristic -> DEGRADED
    - psutil denies access to the process -> UNKNOWN
    - any other unexpected error -> UNKNOWN (and logged)
    - otherwise -> HEALTHY

    Thread Safety: Stateless, safe for concurrent execution.
    """

    def __init__(self, process_manager: ILocalProcessManager):
        """
        Initialize process health check.

        Args:
            process_manager: Process manager for deployment lookup
        """
        super().__init__("ProcessHealthCheck")
        self.process_manager = process_manager

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("Process health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "process"

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute process health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - check_responsiveness: Check if process is responsive (default: True)

        Returns:
            HealthCheckResult with check status and details

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        check_responsiveness = kwargs.get("check_responsiveness", True)

        try:
            process = psutil.Process(deployment.process_id)

            # Check if process exists and is running
            if not process.is_running():
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Process is not running",
                    details={
                        "pid": deployment.process_id,
                        "deployment_id": deployment_id,
                    },
                )

            # Get process status
            process_status = process.status()

            # Check for zombie process
            if process_status == psutil.STATUS_ZOMBIE:
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Process is a zombie",
                    details={
                        "pid": deployment.process_id,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check for stopped process
            if process_status in (psutil.STATUS_STOPPED, psutil.STATUS_DEAD):
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message=f"Process is {process_status}",
                    details={
                        "pid": deployment.process_id,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check responsiveness (CPU activity)
            if check_responsiveness:
                try:
                    # Blocks for the 0.1s sampling interval
                    cpu_percent = process.cpu_percent(interval=0.1)
                except psutil.NoSuchProcess:
                    # Process disappeared between the status read and the
                    # CPU sample. AccessDenied is deliberately NOT caught
                    # here: a permission failure does not mean the process
                    # is gone, so it falls through to the outer handler and
                    # is reported as UNKNOWN.
                    return HealthCheckResult(
                        status=HealthStatus.UNHEALTHY,
                        check_type=self.get_check_type(),
                        message="Process disappeared during check",
                        details={
                            "pid": deployment.process_id,
                            "deployment_id": deployment_id,
                        },
                    )

                # Process should have some CPU activity or be idle/sleeping.
                # NOTE(review): heuristic - any other status (e.g. running)
                # with 0% CPU over the 0.1s sample is flagged as degraded.
                is_responsive = cpu_percent > 0 or process_status in (
                    psutil.STATUS_SLEEPING,
                    psutil.STATUS_IDLE,
                )

                if not is_responsive:
                    return HealthCheckResult(
                        status=HealthStatus.DEGRADED,
                        check_type=self.get_check_type(),
                        message="Process may be unresponsive",
                        details={
                            "pid": deployment.process_id,
                            "status": process_status,
                            "cpu_percent": cpu_percent,
                            "deployment_id": deployment_id,
                        },
                    )

            # Process is healthy
            # Get additional process info for details (best effort: fall
            # back to the basic fields if the extra lookups fail)
            try:
                process_info = {
                    "pid": deployment.process_id,
                    "status": process_status,
                    "deployment_id": deployment_id,
                    "name": process.name(),
                    "num_threads": process.num_threads(),
                }
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                process_info = {
                    "pid": deployment.process_id,
                    "status": process_status,
                    "deployment_id": deployment_id,
                }

            return HealthCheckResult(
                status=HealthStatus.HEALTHY,
                check_type=self.get_check_type(),
                message="Process is running normally",
                details=process_info,
            )

        except psutil.NoSuchProcess:
            # Process does not exist
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                check_type=self.get_check_type(),
                message="Process no longer exists",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                },
            )

        except psutil.AccessDenied as e:
            # Cannot access process information
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Cannot access process information",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )

        except Exception as e:
            # Unexpected error: report UNKNOWN instead of crashing the caller
            self.log_error(f"Unexpected error in process health check: {e}")
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Health check failed with error",
                details={
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# Explicit public API: a star-import of this module exposes only ProcessHealthCheck.
__all__ = ["ProcessHealthCheck"]
|