claude-mpm 4.13.1__py3-none-any.whl → 4.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of claude-mpm might be problematic. Click here for more details.

Files changed (50) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/PM_INSTRUCTIONS.md +68 -0
  3. claude_mpm/cli/__init__.py +10 -0
  4. claude_mpm/cli/commands/local_deploy.py +536 -0
  5. claude_mpm/cli/parsers/base_parser.py +7 -0
  6. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  7. claude_mpm/commands/mpm-agents-detect.md +168 -0
  8. claude_mpm/commands/mpm-agents-recommend.md +214 -0
  9. claude_mpm/commands/mpm-agents.md +75 -1
  10. claude_mpm/commands/mpm-auto-configure.md +217 -0
  11. claude_mpm/commands/mpm-help.md +160 -0
  12. claude_mpm/config/model_config.py +428 -0
  13. claude_mpm/core/interactive_session.py +3 -0
  14. claude_mpm/services/core/interfaces/__init__.py +74 -2
  15. claude_mpm/services/core/interfaces/health.py +172 -0
  16. claude_mpm/services/core/interfaces/model.py +281 -0
  17. claude_mpm/services/core/interfaces/process.py +372 -0
  18. claude_mpm/services/core/interfaces/restart.py +307 -0
  19. claude_mpm/services/core/interfaces/stability.py +260 -0
  20. claude_mpm/services/core/models/__init__.py +35 -0
  21. claude_mpm/services/core/models/health.py +189 -0
  22. claude_mpm/services/core/models/process.py +258 -0
  23. claude_mpm/services/core/models/restart.py +302 -0
  24. claude_mpm/services/core/models/stability.py +264 -0
  25. claude_mpm/services/local_ops/__init__.py +163 -0
  26. claude_mpm/services/local_ops/crash_detector.py +257 -0
  27. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  28. claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
  29. claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
  30. claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
  31. claude_mpm/services/local_ops/health_manager.py +430 -0
  32. claude_mpm/services/local_ops/log_monitor.py +396 -0
  33. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  34. claude_mpm/services/local_ops/process_manager.py +595 -0
  35. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  36. claude_mpm/services/local_ops/restart_manager.py +401 -0
  37. claude_mpm/services/local_ops/restart_policy.py +387 -0
  38. claude_mpm/services/local_ops/state_manager.py +371 -0
  39. claude_mpm/services/local_ops/unified_manager.py +600 -0
  40. claude_mpm/services/model/__init__.py +147 -0
  41. claude_mpm/services/model/base_provider.py +365 -0
  42. claude_mpm/services/model/claude_provider.py +412 -0
  43. claude_mpm/services/model/model_router.py +453 -0
  44. claude_mpm/services/model/ollama_provider.py +415 -0
  45. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
  46. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +50 -15
  47. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
  48. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
  49. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
  50. {claude_mpm-4.13.1.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,28 @@
1
+ """
2
+ Health Check Implementations for Local Operations
3
+ ==================================================
4
+
5
+ WHY: Provides three-tier health monitoring for local deployments:
6
+ - HTTP health checks for endpoint availability
7
+ - Process health checks for process status
8
+ - Resource health checks for CPU/memory/connections
9
+
10
+ ARCHITECTURE:
11
+ - HttpHealthCheck: HTTP endpoint availability and response time
12
+ - ProcessHealthCheck: Process existence and status validation
13
+ - ResourceHealthCheck: CPU, memory, and connection monitoring
14
+ """
15
+
16
+ from claude_mpm.services.local_ops.health_checks.http_check import HttpHealthCheck
17
+ from claude_mpm.services.local_ops.health_checks.process_check import (
18
+ ProcessHealthCheck,
19
+ )
20
+ from claude_mpm.services.local_ops.health_checks.resource_check import (
21
+ ResourceHealthCheck,
22
+ )
23
+
24
+ __all__ = [
25
+ "HttpHealthCheck",
26
+ "ProcessHealthCheck",
27
+ "ResourceHealthCheck",
28
+ ]
@@ -0,0 +1,223 @@
1
+ """
2
+ HTTP Health Check for Claude MPM Framework
3
+ ===========================================
4
+
5
+ WHY: Provides HTTP endpoint health monitoring with response time measurement,
6
+ status code validation, and timeout handling.
7
+
8
+ DESIGN DECISION: Uses requests library with configurable timeout and retry logic.
9
+ Supports custom headers and SSL/TLS validation.
10
+
11
+ ARCHITECTURE:
12
+ - Synchronous HTTP GET requests
13
+ - Response time measurement with time.perf_counter()
14
+ - Status code validation (2xx/3xx = healthy)
15
+ - Timeout and connection error handling
16
+ - Retry logic with exponential backoff
17
+
18
+ USAGE:
19
+ http_check = HttpHealthCheck(process_manager)
20
+ result = http_check.check(
21
+ deployment_id="my-app",
22
+ endpoint="http://localhost:3000/health",
23
+ timeout=5.0
24
+ )
25
+ """
26
+
27
+ import time
28
+
29
+ import requests
30
+ from requests.exceptions import ConnectionError, RequestException, Timeout
31
+
32
+ from claude_mpm.services.core.base import SyncBaseService
33
+ from claude_mpm.services.core.interfaces.health import IHealthCheck
34
+ from claude_mpm.services.core.interfaces.process import ILocalProcessManager
35
+ from claude_mpm.services.core.models.health import HealthCheckResult, HealthStatus
36
+
37
+
38
class HttpHealthCheck(SyncBaseService, IHealthCheck):
    """
    HTTP endpoint health check implementation.

    WHY: Validates that deployed services are accessible via HTTP and
    responding within acceptable timeframes.

    Thread Safety: check() only reads the configuration stored by
    __init__ (process manager, default timeout, retry budget); it does not
    mutate instance state.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        default_timeout: float = 5.0,
        max_retries: int = 2,
    ):
        """
        Initialize HTTP health check.

        Args:
            process_manager: Process manager for deployment lookup
            default_timeout: Timeout in seconds used when check() is not
                given an explicit ``timeout``
            max_retries: Maximum number of retry attempts after the first
                request (so at most ``max_retries + 1`` requests are sent)
        """
        super().__init__("HttpHealthCheck")
        self.process_manager = process_manager
        self.default_timeout = default_timeout
        self.max_retries = max_retries

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("HTTP health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "http"

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute HTTP health check for a deployment.

        Sends a GET request to the endpoint. Timeouts and connection errors
        are retried with exponential backoff (0.5s, 1s, 2s, ...); any HTTP
        response, expected or not, ends the check immediately.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - endpoint: HTTP endpoint URL. When omitted, falls back to
                  ``http://localhost:<deployment.port>/health``.
                - timeout: Request timeout in seconds. ``None`` or omitted
                  means "use the default timeout" (5.0 unless overridden in
                  the constructor); the request is never sent unbounded.
                - headers: Custom HTTP headers
                - verify_ssl: Verify SSL certificates (default: True)
                - expected_status: Expected status code (default: 200)

        Returns:
            HealthCheckResult with check status and details:
                - HEALTHY: status equals ``expected_status`` or is any
                  2xx/3xx code
                - DEGRADED: any other status code, or every attempt timed out
                - UNHEALTHY: connection could not be established, or the
                  request failed with another ``RequestException``
                - UNKNOWN: no endpoint given and the deployment has no port

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Resolve the endpoint: explicit kwarg first, then the deployment port
        endpoint = kwargs.get("endpoint")
        if not endpoint:
            if not deployment.port:
                return HealthCheckResult(
                    status=HealthStatus.UNKNOWN,
                    check_type=self.get_check_type(),
                    message="No HTTP endpoint configured for deployment",
                    details={"deployment_id": deployment_id},
                )
            endpoint = f"http://localhost:{deployment.port}/health"

        # An explicit timeout=None would make requests wait forever, which
        # would hang the health check; treat it as "use the default".
        timeout = kwargs.get("timeout")
        if timeout is None:
            timeout = self.default_timeout
        headers = kwargs.get("headers", {})
        verify_ssl = kwargs.get("verify_ssl", True)
        expected_status = kwargs.get("expected_status", 200)

        # A negative retry budget would otherwise skip the loop entirely and
        # report UNKNOWN without ever contacting the endpoint.
        max_retries = max(0, self.max_retries)
        total_attempts = max_retries + 1

        for attempt in range(total_attempts):
            try:
                start_time = time.perf_counter()
                response = requests.get(
                    endpoint, timeout=timeout, headers=headers, verify=verify_ssl
                )
                response_time = time.perf_counter() - start_time
                response_time_ms = round(response_time * 1000, 2)

                # NOTE(review): any 2xx/3xx response counts as healthy even
                # when it differs from expected_status; expected_status only
                # widens the accepted set. Confirm this is intended.
                if response.status_code == expected_status or (
                    200 <= response.status_code < 400
                ):
                    return HealthCheckResult(
                        status=HealthStatus.HEALTHY,
                        check_type=self.get_check_type(),
                        message="HTTP endpoint responding normally",
                        details={
                            "endpoint": endpoint,
                            "status_code": response.status_code,
                            "response_time_ms": response_time_ms,
                            "attempt": attempt + 1,
                        },
                    )

                # The service answered, just not as expected: no retry.
                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message="HTTP endpoint returned unexpected status code",
                    details={
                        "endpoint": endpoint,
                        "status_code": response.status_code,
                        "expected_status": expected_status,
                        "response_time_ms": response_time_ms,
                    },
                )

            except Timeout:
                # Must be handled before ConnectionError: requests'
                # ConnectTimeout derives from both exception types.
                if self._wait_before_retry(
                    attempt, max_retries, deployment_id, "HTTP check timeout"
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message=f"HTTP endpoint timeout after {total_attempts} attempts",
                    details={
                        "endpoint": endpoint,
                        "timeout_seconds": timeout,
                        "attempts": total_attempts,
                    },
                )

            except ConnectionError as e:
                if self._wait_before_retry(
                    attempt, max_retries, deployment_id, "HTTP connection error"
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Cannot connect to HTTP endpoint",
                    details={
                        "endpoint": endpoint,
                        "error": str(e),
                        "attempts": total_attempts,
                    },
                )

            except RequestException as e:
                # Any other requests failure (invalid URL, too many
                # redirects, ...) is not retried.
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="HTTP request failed",
                    details={"endpoint": endpoint, "error": str(e)},
                )

        # Defensive fallback: every loop iteration above either returns or
        # continues to a further attempt, so this should not be reached.
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=self.get_check_type(),
            message="HTTP check completed with unknown result",
            details={"endpoint": endpoint},
        )

    def _wait_before_retry(
        self, attempt: int, max_retries: int, deployment_id: str, reason: str
    ) -> bool:
        """
        Sleep with exponential backoff if another attempt is allowed.

        Args:
            attempt: Zero-based index of the attempt that just failed
            max_retries: Number of retries permitted after the first attempt
            deployment_id: Deployment identifier, used in the debug log
            reason: Short failure description used as the log message prefix

        Returns:
            True if the caller should retry (the backoff sleep has already
            happened), False if the retry budget is exhausted
        """
        if attempt >= max_retries:
            return False

        self.log_debug(
            f"{reason} for {deployment_id}, "
            f"retrying (attempt {attempt + 1}/{max_retries})"
        )
        time.sleep(0.5 * (2**attempt))  # Exponential backoff
        return True
221
+
222
+
223
+ __all__ = ["HttpHealthCheck"]
@@ -0,0 +1,235 @@
1
+ """
2
+ Process Health Check for Claude MPM Framework
3
+ ==============================================
4
+
5
+ WHY: Provides process-level health monitoring including existence validation,
6
+ status checking (running/zombie/stopped), and exit code detection.
7
+
8
+ DESIGN DECISION: Uses psutil for cross-platform process monitoring. Validates
9
+ process existence, status, and parent-child relationships.
10
+
11
+ ARCHITECTURE:
12
+ - Process existence verification with psutil.Process(pid)
13
+ - Process status checking (running, zombie, stopped, sleeping)
14
+ - Exit code detection for dead processes
15
+ - Parent-child relationship validation
16
+ - Process responsiveness checking (not hung)
17
+
18
+ USAGE:
19
+ process_check = ProcessHealthCheck(process_manager)
20
+ result = process_check.check(deployment_id="my-app")
21
+ """
22
+
23
+ import psutil
24
+
25
+ from claude_mpm.services.core.base import SyncBaseService
26
+ from claude_mpm.services.core.interfaces.health import IHealthCheck
27
+ from claude_mpm.services.core.interfaces.process import ILocalProcessManager
28
+ from claude_mpm.services.core.models.health import HealthCheckResult, HealthStatus
29
+
30
+
31
class ProcessHealthCheck(SyncBaseService, IHealthCheck):
    """
    Process status health check implementation.

    WHY: Validates that the process is running properly and not in a
    degraded state (zombie, stopped, etc.).

    Thread Safety: check() only reads the process manager stored by
    __init__; it does not mutate instance state.
    """

    def __init__(self, process_manager: ILocalProcessManager):
        """
        Initialize process health check.

        Args:
            process_manager: Process manager for deployment lookup
        """
        super().__init__("ProcessHealthCheck")
        self.process_manager = process_manager

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("Process health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "process"

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute process health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - check_responsiveness: Check if process is responsive
                  (default: True). Samples CPU usage over a 0.1s interval.

        Returns:
            HealthCheckResult with check status and details:
                - HEALTHY: process exists and passed every check
                - DEGRADED: 0% CPU during the sample while not sleeping/idle
                - UNHEALTHY: process missing, not running, zombie, stopped
                  or dead, or it vanished during the responsiveness probe
                - UNKNOWN: no PID recorded, access to the process was
                  denied, or an unexpected error occurred

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        check_responsiveness = kwargs.get("check_responsiveness", True)
        pid = deployment.process_id

        # psutil.Process(None) silently inspects the *current* process, which
        # would report the health checker's own status for this deployment.
        if pid is None:
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="No process ID recorded for deployment",
                details={
                    "pid": pid,
                    "deployment_id": deployment_id,
                },
            )

        try:
            process = psutil.Process(pid)

            # Check if process exists and is running
            if not process.is_running():
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Process is not running",
                    details={
                        "pid": pid,
                        "deployment_id": deployment_id,
                    },
                )

            # Get process status
            process_status = process.status()

            # Check for zombie process
            if process_status == psutil.STATUS_ZOMBIE:
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Process is a zombie",
                    details={
                        "pid": pid,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check for stopped process
            if process_status in (psutil.STATUS_STOPPED, psutil.STATUS_DEAD):
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message=f"Process is {process_status}",
                    details={
                        "pid": pid,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check responsiveness (CPU activity)
            if check_responsiveness:
                try:
                    cpu_percent = process.cpu_percent(interval=0.1)
                except psutil.NoSuchProcess:
                    # Process exited between the status read and the probe.
                    # AccessDenied is deliberately NOT caught here: a
                    # permission error says nothing about liveness, so it
                    # propagates to the outer handler and yields UNKNOWN.
                    return HealthCheckResult(
                        status=HealthStatus.UNHEALTHY,
                        check_type=self.get_check_type(),
                        message="Process disappeared during check",
                        details={
                            "pid": pid,
                            "deployment_id": deployment_id,
                        },
                    )

                # Heuristic: zero CPU over the sample is only suspicious
                # when the process claims to be doing something, i.e. its
                # status is neither sleeping nor idle.
                is_responsive = cpu_percent > 0 or process_status in (
                    psutil.STATUS_SLEEPING,
                    psutil.STATUS_IDLE,
                )

                if not is_responsive:
                    return HealthCheckResult(
                        status=HealthStatus.DEGRADED,
                        check_type=self.get_check_type(),
                        message="Process may be unresponsive",
                        details={
                            "pid": pid,
                            "status": process_status,
                            "cpu_percent": cpu_percent,
                            "deployment_id": deployment_id,
                        },
                    )

            # Process is healthy; enrich the details on a best-effort basis.
            process_info = {
                "pid": pid,
                "status": process_status,
                "deployment_id": deployment_id,
            }
            try:
                extra_info = {
                    "name": process.name(),
                    "num_threads": process.num_threads(),
                }
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Extra details are optional; report healthy without them.
                extra_info = {}
            process_info.update(extra_info)

            return HealthCheckResult(
                status=HealthStatus.HEALTHY,
                check_type=self.get_check_type(),
                message="Process is running normally",
                details=process_info,
            )

        except psutil.NoSuchProcess:
            # Process does not exist
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                check_type=self.get_check_type(),
                message="Process no longer exists",
                details={
                    "pid": pid,
                    "deployment_id": deployment_id,
                },
            )

        except psutil.AccessDenied as e:
            # Cannot access process information
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Cannot access process information",
                details={
                    "pid": pid,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )

        except Exception as e:
            # Top-level boundary: a health check must report, not crash.
            self.log_error(f"Unexpected error in process health check: {e}")
            return HealthCheckResult(
                status=HealthStatus.UNKNOWN,
                check_type=self.get_check_type(),
                message="Health check failed with error",
                details={
                    "pid": pid,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )
233
+
234
+
235
+ __all__ = ["ProcessHealthCheck"]