claude-mpm 4.13.2__py3-none-any.whl → 4.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of claude-mpm might be problematic. Click here for more details.

Files changed (44) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/cli/__init__.py +10 -0
  3. claude_mpm/cli/commands/local_deploy.py +536 -0
  4. claude_mpm/cli/parsers/base_parser.py +7 -0
  5. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  6. claude_mpm/config/model_config.py +428 -0
  7. claude_mpm/core/interactive_session.py +3 -0
  8. claude_mpm/services/core/interfaces/__init__.py +74 -2
  9. claude_mpm/services/core/interfaces/health.py +172 -0
  10. claude_mpm/services/core/interfaces/model.py +281 -0
  11. claude_mpm/services/core/interfaces/process.py +372 -0
  12. claude_mpm/services/core/interfaces/restart.py +307 -0
  13. claude_mpm/services/core/interfaces/stability.py +260 -0
  14. claude_mpm/services/core/models/__init__.py +35 -0
  15. claude_mpm/services/core/models/health.py +189 -0
  16. claude_mpm/services/core/models/process.py +258 -0
  17. claude_mpm/services/core/models/restart.py +302 -0
  18. claude_mpm/services/core/models/stability.py +264 -0
  19. claude_mpm/services/local_ops/__init__.py +163 -0
  20. claude_mpm/services/local_ops/crash_detector.py +257 -0
  21. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  22. claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
  23. claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
  24. claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
  25. claude_mpm/services/local_ops/health_manager.py +430 -0
  26. claude_mpm/services/local_ops/log_monitor.py +396 -0
  27. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  28. claude_mpm/services/local_ops/process_manager.py +595 -0
  29. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  30. claude_mpm/services/local_ops/restart_manager.py +401 -0
  31. claude_mpm/services/local_ops/restart_policy.py +387 -0
  32. claude_mpm/services/local_ops/state_manager.py +371 -0
  33. claude_mpm/services/local_ops/unified_manager.py +600 -0
  34. claude_mpm/services/model/__init__.py +147 -0
  35. claude_mpm/services/model/base_provider.py +365 -0
  36. claude_mpm/services/model/claude_provider.py +412 -0
  37. claude_mpm/services/model/model_router.py +453 -0
  38. claude_mpm/services/model/ollama_provider.py +415 -0
  39. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
  40. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +44 -12
  41. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
  42. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
  43. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
  44. {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,260 @@
1
+ """
2
+ Stability Monitoring Interfaces for Claude MPM Framework
3
+ ==========================================================
4
+
5
+ WHY: This module defines interfaces for proactive stability monitoring including
6
+ memory leak detection, log monitoring, and resource exhaustion prevention.
7
+
8
+ DESIGN DECISION: Separated from health checks to enable preventive monitoring
9
+ that triggers actions BEFORE crashes occur. Provides early warning systems.
10
+
11
+ ARCHITECTURE:
12
+ - IMemoryLeakDetector: Interface for memory leak detection using trend analysis
13
+ - ILogMonitor: Interface for real-time log file monitoring and pattern matching
14
+ - IResourceMonitor: Interface for comprehensive resource usage tracking
15
+
16
+ USAGE:
17
+ memory_detector = MemoryLeakDetector(leak_threshold_mb_per_minute=10.0)
18
+ log_monitor = LogMonitor(log_file="/var/log/app.log")
19
+ resource_monitor = ResourceMonitor(fd_threshold_percent=0.8)
20
+
21
+ # Integrate with health monitoring
22
+ health_manager.add_stability_monitors(
23
+ memory_detector=memory_detector,
24
+ log_monitor=log_monitor,
25
+ resource_monitor=resource_monitor,
26
+ )
27
+ """
28
+
29
+ from abc import ABC, abstractmethod
30
+ from typing import Callable, List
31
+
32
+ from claude_mpm.services.core.models.stability import (
33
+ LogPatternMatch,
34
+ MemoryTrend,
35
+ ResourceUsage,
36
+ )
37
+
38
+
class IMemoryLeakDetector(ABC):
    """
    Contract for memory leak detection based on trend analysis.

    WHY: Leaking processes commonly end in a crash. Catching the leak early
    lets the framework restart the process pre-emptively, BEFORE the OOM
    killer terminates it.

    DESIGN DECISION: Sustained growth is recognised from the slope of memory
    usage across a rolling window, which filters out normal variation.

    Algorithm:
        1. Keep a rolling window of measurements (timestamp, memory_mb)
        2. Compute the linear regression slope (MB per minute)
        3. Report a leak when the slope exceeds the threshold
           (default: 10 MB/minute)
        4. Trigger an alert when a leak is detected and memory > 80% of limit

    NOTE(review): step 2 names a regression slope, whereas ``analyze_trend``
    documents a two-point slope; confirm which one implementations follow.

    Thread Safety: Implementations must be thread-safe for concurrent access.
    """

    @abstractmethod
    def record_memory_usage(self, deployment_id: str, memory_mb: float) -> None:
        """
        Store one memory usage sample for a deployment.

        WHY: Trend analysis needs history. Call this periodically (e.g. every
        30s) so that enough data points accumulate.

        Args:
            deployment_id: Deployment identifier
            memory_mb: Current memory usage in megabytes
        """

    @abstractmethod
    def analyze_trend(self, deployment_id: str) -> MemoryTrend:
        """
        Evaluate the memory usage trend of a deployment.

        WHY: The slope of memory usage over time exposes the sustained growth
        pattern that is characteristic of a memory leak.

        Args:
            deployment_id: Deployment identifier

        Returns:
            MemoryTrend carrying the slope analysis and the leak verdict

        Algorithm:
            slope_mb_per_minute = (recent_memory - old_memory) / time_delta_minutes
            is_leaking = slope_mb_per_minute > threshold
        """

    @abstractmethod
    def is_leaking(self, deployment_id: str) -> bool:
        """
        Tell whether a memory leak has been detected for a deployment.

        Args:
            deployment_id: Deployment identifier

        Returns:
            True if a leak was detected (sustained memory growth)
        """

    @abstractmethod
    def register_leak_callback(
        self, callback: Callable[[str, MemoryTrend], None]
    ) -> None:
        """
        Subscribe a callback to leak detection events.

        Args:
            callback: Function invoked with (deployment_id, trend) when a
                leak is detected
        """
109
+
110
+
class ILogMonitor(ABC):
    """
    Contract for real-time log file monitoring with pattern matching.

    WHY: Logs carry early warning signals (exceptions, OOM errors, segfaults)
    that precede imminent crashes. Watching them in real time makes proactive
    intervention possible.

    DESIGN DECISION: File system monitoring is done with the watchdog library,
    which receives file modification events from the OS instead of polling.

    Pattern Matching:
        - Regex-based patterns for flexibility
        - Configurable patterns per deployment
        - Built-in patterns for common errors:
            * OutOfMemoryError
            * Segmentation fault
            * Exception: / Traceback
            * Database connection errors
            * Network timeouts

    Thread Safety: Uses watchdog's thread-safe event handling.
    """

    @abstractmethod
    def start_monitoring(self, log_file: str, deployment_id: str) -> None:
        """
        Begin watching a log file for error patterns.

        WHY: Starts observing the file for new entries, relying on OS-level
        file system events for efficiency.

        Args:
            log_file: Path to log file to monitor
            deployment_id: Deployment identifier for callbacks
        """

    @abstractmethod
    def stop_monitoring(self, deployment_id: str) -> None:
        """
        Stop watching the log file of a deployment.

        Args:
            deployment_id: Deployment identifier
        """

    @abstractmethod
    def add_pattern(self, pattern: str, severity: str = "ERROR") -> None:
        """
        Register an additional error pattern to look for.

        Args:
            pattern: Regex pattern to match
            severity: Error severity (ERROR, CRITICAL, WARNING)
        """

    @abstractmethod
    def get_recent_matches(
        self, deployment_id: str, limit: int = 10
    ) -> List[LogPatternMatch]:
        """
        Return the latest pattern matches recorded for a deployment.

        Args:
            deployment_id: Deployment identifier
            limit: Maximum number of matches to return

        Returns:
            List of LogPatternMatch objects, newest first
        """

    @abstractmethod
    def register_match_callback(
        self, callback: Callable[[str, LogPatternMatch], None]
    ) -> None:
        """
        Subscribe a callback to pattern match events.

        Args:
            callback: Function invoked with (deployment_id, match) when a
                pattern is detected
        """
192
+
193
+
class IResourceMonitor(ABC):
    """
    Contract for comprehensive resource usage monitoring.

    WHY: Running out of file descriptors, threads, connections, or disk space
    leads to crashes and degradation. Monitoring allows pre-emptive action at
    80% thresholds, before hard limits are reached.

    DESIGN DECISION: Builds on the basic resource health checks by adding:
        - Higher granularity (more frequent checks)
        - Percentage-based thresholds (80% of ulimit)
        - Trend analysis for growth rate
        - Integration with restart manager for preemptive restarts

    Resource Types:
        1. File Descriptors: Critical for I/O operations (Unix: ulimit -n)
        2. Threads: Memory and scheduling overhead
        3. Network Connections: Socket exhaustion
        4. Disk Space: Working directory availability

    Thread Safety: Implementations must be thread-safe.
    """

    @abstractmethod
    def check_resources(self, deployment_id: str) -> ResourceUsage:
        """
        Take a resource usage snapshot for a deployment.

        WHY: Gives one comprehensive view of consumption across every
        monitored resource type.

        Args:
            deployment_id: Deployment identifier

        Returns:
            ResourceUsage with current metrics and critical status

        Raises:
            ValueError: If deployment not found
        """

    @abstractmethod
    def is_critical(self, deployment_id: str) -> bool:
        """
        Tell whether any resource sits at the critical threshold (>80%).

        Args:
            deployment_id: Deployment identifier

        Returns:
            True if any resource exceeds 80% of limit
        """

    @abstractmethod
    def register_critical_callback(
        self, callback: Callable[[str, ResourceUsage], None]
    ) -> None:
        """
        Subscribe a callback to critical resource usage events.

        Args:
            callback: Function invoked with (deployment_id, usage) when
                usage is critical
        """
254
+
255
+
# Public API of this module: the three stability monitoring interfaces.
__all__ = ["ILogMonitor", "IMemoryLeakDetector", "IResourceMonitor"]
@@ -20,6 +20,25 @@ from .agent_config import (
20
20
  ConfigurationResult,
21
21
  ValidationResult,
22
22
  )
23
+ from .process import (
24
+ PROTECTED_PORT_RANGES,
25
+ DeploymentState,
26
+ ProcessInfo,
27
+ ProcessStatus,
28
+ StartConfig,
29
+ is_port_protected,
30
+ )
31
+ from .restart import (
32
+ CircuitBreakerState,
33
+ RestartAttempt,
34
+ RestartConfig,
35
+ RestartHistory,
36
+ )
37
+ from .stability import (
38
+ LogPatternMatch,
39
+ MemoryTrend,
40
+ ResourceUsage,
41
+ )
23
42
  from .toolchain import (
24
43
  ConfidenceLevel,
25
44
  DeploymentTarget,
@@ -43,4 +62,20 @@ __all__ = [ # noqa: RUF022 - Grouped by category with comments for clarity
43
62
  "ConfigurationResult",
44
63
  "ValidationResult",
45
64
  "ConfigurationPreview",
65
+ # Process management models
66
+ "ProcessStatus",
67
+ "DeploymentState",
68
+ "ProcessInfo",
69
+ "StartConfig",
70
+ "PROTECTED_PORT_RANGES",
71
+ "is_port_protected",
72
+ # Restart management models
73
+ "CircuitBreakerState",
74
+ "RestartAttempt",
75
+ "RestartHistory",
76
+ "RestartConfig",
77
+ # Stability monitoring models
78
+ "MemoryTrend",
79
+ "LogPatternMatch",
80
+ "ResourceUsage",
46
81
  ]
@@ -0,0 +1,189 @@
1
+ """
2
+ Health Monitoring Data Models for Claude MPM Framework
3
+ =======================================================
4
+
5
+ WHY: This module defines data structures for health monitoring operations,
6
+ including health status, check results, and deployment health aggregations.
7
+
8
+ DESIGN DECISION: Uses dataclasses (not frozen) for structured, type-annotated records. Provides
9
+ clear health status enum and structured check results.
10
+
11
+ ARCHITECTURE:
12
+ - HealthStatus: Enum of health states (HEALTHY, DEGRADED, UNHEALTHY, UNKNOWN)
13
+ - HealthCheckResult: Result of a single health check
14
+ - DeploymentHealth: Aggregated health status for a deployment
15
+ """
16
+
17
+ from dataclasses import asdict, dataclass, field
18
+ from datetime import datetime
19
+ from enum import Enum
20
+ from typing import Any, Dict, List
21
+
22
+
23
+ class HealthStatus(Enum):
24
+ """
25
+ Health status levels.
26
+
27
+ WHY: Provides granular health states to distinguish between different
28
+ levels of service degradation.
29
+
30
+ States:
31
+ HEALTHY: All checks passing, process operating normally
32
+ DEGRADED: Process running but with issues (high resource usage, slow responses)
33
+ UNHEALTHY: Critical failure (process dead, crashed, or unresponsive)
34
+ UNKNOWN: Cannot determine health status
35
+ """
36
+
37
+ HEALTHY = "healthy"
38
+ DEGRADED = "degraded"
39
+ UNHEALTHY = "unhealthy"
40
+ UNKNOWN = "unknown"
41
+
42
+ def is_operational(self) -> bool:
43
+ """Check if status indicates operational service."""
44
+ return self in (HealthStatus.HEALTHY, HealthStatus.DEGRADED)
45
+
46
+ def is_critical(self) -> bool:
47
+ """Check if status indicates critical failure."""
48
+ return self == HealthStatus.UNHEALTHY
49
+
50
+
@dataclass
class HealthCheckResult:
    """
    Result of a single health check.

    WHY: Contains all information about a specific health check execution,
    enabling detailed analysis and debugging of health issues.

    Attributes:
        status: HealthStatus of the check
        check_type: Type of health check (http, process, resource)
        message: Human-readable description of the result
        details: Additional check-specific data (defaults to an empty dict)
        checked_at: Timestamp when check was performed (defaults to
            ``datetime.now()`` at construction time)
    """

    status: HealthStatus
    check_type: str
    message: str
    details: Dict[str, Any] = field(default_factory=dict)
    checked_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for JSON serialization.

        Returns:
            Dictionary representation with the status replaced by its enum
            value and the timestamp converted to an ISO 8601 string
        """
        data = asdict(self)
        # asdict() leaves the enum and datetime objects in place; swap them
        # for JSON-friendly primitives.
        data["status"] = self.status.value
        data["checked_at"] = self.checked_at.isoformat()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HealthCheckResult":
        """
        Create HealthCheckResult from dictionary.

        The input dictionary is not modified: conversions are applied to a
        shallow copy, so the same payload can be reused or parsed again.

        Args:
            data: Dictionary from JSON deserialization. ``checked_at`` may be
                an ISO 8601 string and ``status`` may be a status value
                string; values of any other type are passed through as-is.

        Returns:
            HealthCheckResult instance

        Raises:
            ValueError: If ``checked_at`` is a string that is not valid ISO
                format, or ``status`` is a string that is not a HealthStatus
                value
            TypeError: If required fields are missing or unknown keys are
                present
        """
        # Shallow copy so that the caller's dictionary is left untouched.
        values = dict(data)

        # Convert ISO string to datetime
        checked_at = values.get("checked_at")
        if isinstance(checked_at, str):
            values["checked_at"] = datetime.fromisoformat(checked_at)

        # Convert status string to enum
        status = values.get("status")
        if isinstance(status, str):
            values["status"] = HealthStatus(status)

        return cls(**values)
105
+
106
+
@dataclass
class DeploymentHealth:
    """
    Aggregated health status for a deployment.

    WHY: Combines results from multiple health checks to provide a
    comprehensive health assessment of a deployment.

    Attributes:
        deployment_id: Unique deployment identifier
        overall_status: Aggregated health status
        checks: List of individual health check results (defaults to an
            empty list)
        last_check: Timestamp of the most recent health check (defaults to
            ``datetime.now()`` at construction time)
    """

    deployment_id: str
    overall_status: HealthStatus
    checks: List[HealthCheckResult] = field(default_factory=list)
    last_check: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for JSON serialization.

        Returns:
            Dictionary representation with the status as its enum value, each
            check serialized through its own ``to_dict``, and the timestamp
            as an ISO 8601 string
        """
        serialized_checks = [check.to_dict() for check in self.checks]
        return {
            "deployment_id": self.deployment_id,
            "overall_status": self.overall_status.value,
            "checks": serialized_checks,
            "last_check": self.last_check.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeploymentHealth":
        """
        Create DeploymentHealth from dictionary.

        Neither the input dictionary nor the nested check dictionaries are
        modified: all conversions are applied to copies, so the same payload
        can be reused or parsed again.

        Args:
            data: Dictionary from JSON deserialization. ``last_check`` may be
                an ISO 8601 string, ``overall_status`` a status value string,
                and ``checks`` a list whose dict entries are converted to
                HealthCheckResult (non-dict entries are kept as-is).

        Returns:
            DeploymentHealth instance

        Raises:
            ValueError: If a string timestamp is not valid ISO format or a
                string status is not a HealthStatus value
            TypeError: If required fields are missing or unknown keys are
                present
        """
        # Shallow copy so that the caller's dictionary is left untouched.
        values = dict(data)

        # Convert ISO string to datetime
        last_check = values.get("last_check")
        if isinstance(last_check, str):
            values["last_check"] = datetime.fromisoformat(last_check)

        # Convert status string to enum
        overall_status = values.get("overall_status")
        if isinstance(overall_status, str):
            values["overall_status"] = HealthStatus(overall_status)

        # Convert check dicts to HealthCheckResult objects. Each dict is
        # copied first so the nested payload is protected as well.
        checks = values.get("checks")
        if isinstance(checks, list):
            values["checks"] = [
                HealthCheckResult.from_dict(dict(check))
                if isinstance(check, dict)
                else check
                for check in checks
            ]

        return cls(**values)

    def get_check_by_type(self, check_type: str) -> HealthCheckResult | None:
        """
        Get the result of a specific check type.

        Args:
            check_type: Type of health check to retrieve

        Returns:
            The first HealthCheckResult whose ``check_type`` matches, or None
            if there is no such check
        """
        matches = (check for check in self.checks if check.check_type == check_type)
        return next(matches, None)
183
+
184
+
# Public API of this module: the health status enum and its result models.
__all__ = ["DeploymentHealth", "HealthCheckResult", "HealthStatus"]