claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  3. claude_mpm/agents/BASE_PM.md +48 -17
  4. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  5. claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
  6. claude_mpm/agents/agent_loader.py +17 -5
  7. claude_mpm/agents/frontmatter_validator.py +284 -253
  8. claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
  9. claude_mpm/agents/templates/api_qa.json +7 -1
  10. claude_mpm/agents/templates/clerk-ops.json +8 -1
  11. claude_mpm/agents/templates/code_analyzer.json +4 -1
  12. claude_mpm/agents/templates/dart_engineer.json +11 -1
  13. claude_mpm/agents/templates/data_engineer.json +11 -1
  14. claude_mpm/agents/templates/documentation.json +6 -1
  15. claude_mpm/agents/templates/engineer.json +18 -1
  16. claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
  17. claude_mpm/agents/templates/golang_engineer.json +11 -1
  18. claude_mpm/agents/templates/java_engineer.json +12 -2
  19. claude_mpm/agents/templates/local_ops_agent.json +1217 -6
  20. claude_mpm/agents/templates/nextjs_engineer.json +11 -1
  21. claude_mpm/agents/templates/ops.json +8 -1
  22. claude_mpm/agents/templates/php-engineer.json +11 -1
  23. claude_mpm/agents/templates/project_organizer.json +10 -3
  24. claude_mpm/agents/templates/prompt-engineer.json +5 -1
  25. claude_mpm/agents/templates/python_engineer.json +11 -1
  26. claude_mpm/agents/templates/qa.json +7 -1
  27. claude_mpm/agents/templates/react_engineer.json +11 -1
  28. claude_mpm/agents/templates/refactoring_engineer.json +8 -1
  29. claude_mpm/agents/templates/research.json +4 -1
  30. claude_mpm/agents/templates/ruby-engineer.json +11 -1
  31. claude_mpm/agents/templates/rust_engineer.json +11 -1
  32. claude_mpm/agents/templates/security.json +6 -1
  33. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  34. claude_mpm/agents/templates/ticketing.json +6 -1
  35. claude_mpm/agents/templates/typescript_engineer.json +11 -1
  36. claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
  37. claude_mpm/agents/templates/version_control.json +8 -1
  38. claude_mpm/agents/templates/web_qa.json +7 -1
  39. claude_mpm/agents/templates/web_ui.json +11 -1
  40. claude_mpm/cli/__init__.py +34 -706
  41. claude_mpm/cli/commands/agent_manager.py +25 -12
  42. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  43. claude_mpm/cli/commands/agents.py +204 -148
  44. claude_mpm/cli/commands/aggregate.py +7 -3
  45. claude_mpm/cli/commands/analyze.py +9 -4
  46. claude_mpm/cli/commands/analyze_code.py +7 -2
  47. claude_mpm/cli/commands/auto_configure.py +7 -9
  48. claude_mpm/cli/commands/config.py +47 -13
  49. claude_mpm/cli/commands/configure.py +294 -1788
  50. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  51. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  52. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  53. claude_mpm/cli/commands/configure_models.py +18 -0
  54. claude_mpm/cli/commands/configure_navigation.py +167 -0
  55. claude_mpm/cli/commands/configure_paths.py +104 -0
  56. claude_mpm/cli/commands/configure_persistence.py +254 -0
  57. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  58. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  59. claude_mpm/cli/commands/configure_validators.py +73 -0
  60. claude_mpm/cli/commands/local_deploy.py +537 -0
  61. claude_mpm/cli/commands/memory.py +54 -20
  62. claude_mpm/cli/commands/mpm_init.py +39 -25
  63. claude_mpm/cli/commands/mpm_init_handler.py +8 -3
  64. claude_mpm/cli/executor.py +202 -0
  65. claude_mpm/cli/helpers.py +105 -0
  66. claude_mpm/cli/interactive/__init__.py +3 -0
  67. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  68. claude_mpm/cli/parsers/__init__.py +7 -1
  69. claude_mpm/cli/parsers/base_parser.py +98 -3
  70. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  71. claude_mpm/cli/shared/output_formatters.py +28 -19
  72. claude_mpm/cli/startup.py +481 -0
  73. claude_mpm/cli/utils.py +52 -1
  74. claude_mpm/commands/mpm-help.md +3 -0
  75. claude_mpm/commands/mpm-version.md +113 -0
  76. claude_mpm/commands/mpm.md +1 -0
  77. claude_mpm/config/agent_config.py +2 -2
  78. claude_mpm/config/model_config.py +428 -0
  79. claude_mpm/core/base_service.py +13 -12
  80. claude_mpm/core/enums.py +452 -0
  81. claude_mpm/core/factories.py +1 -1
  82. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  83. claude_mpm/core/interactive_session.py +9 -3
  84. claude_mpm/core/logging_config.py +6 -2
  85. claude_mpm/core/oneshot_session.py +8 -4
  86. claude_mpm/core/optimized_agent_loader.py +3 -3
  87. claude_mpm/core/output_style_manager.py +12 -192
  88. claude_mpm/core/service_registry.py +5 -1
  89. claude_mpm/core/types.py +2 -9
  90. claude_mpm/core/typing_utils.py +7 -6
  91. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  92. claude_mpm/dashboard/templates/index.html +3 -41
  93. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  94. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  95. claude_mpm/models/resume_log.py +340 -0
  96. claude_mpm/services/agents/auto_config_manager.py +10 -11
  97. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  98. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  99. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  100. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  101. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  102. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  103. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  104. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  105. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  106. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  107. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  108. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  109. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  110. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  111. claude_mpm/services/agents/local_template_manager.py +1 -1
  112. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  113. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  114. claude_mpm/services/command_handler_service.py +11 -5
  115. claude_mpm/services/core/interfaces/__init__.py +74 -2
  116. claude_mpm/services/core/interfaces/health.py +172 -0
  117. claude_mpm/services/core/interfaces/model.py +281 -0
  118. claude_mpm/services/core/interfaces/process.py +372 -0
  119. claude_mpm/services/core/interfaces/restart.py +307 -0
  120. claude_mpm/services/core/interfaces/stability.py +260 -0
  121. claude_mpm/services/core/models/__init__.py +33 -0
  122. claude_mpm/services/core/models/agent_config.py +12 -28
  123. claude_mpm/services/core/models/health.py +162 -0
  124. claude_mpm/services/core/models/process.py +235 -0
  125. claude_mpm/services/core/models/restart.py +302 -0
  126. claude_mpm/services/core/models/stability.py +264 -0
  127. claude_mpm/services/core/path_resolver.py +23 -7
  128. claude_mpm/services/diagnostics/__init__.py +2 -2
  129. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  130. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  131. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  132. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  133. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  134. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  135. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  136. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  137. claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
  138. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  139. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  140. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  141. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  142. claude_mpm/services/diagnostics/models.py +19 -24
  143. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  144. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  145. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  146. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  147. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  148. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  149. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  150. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  151. claude_mpm/services/local_ops/__init__.py +163 -0
  152. claude_mpm/services/local_ops/crash_detector.py +257 -0
  153. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  154. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  155. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  156. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  157. claude_mpm/services/local_ops/health_manager.py +430 -0
  158. claude_mpm/services/local_ops/log_monitor.py +396 -0
  159. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  160. claude_mpm/services/local_ops/process_manager.py +595 -0
  161. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  162. claude_mpm/services/local_ops/restart_manager.py +401 -0
  163. claude_mpm/services/local_ops/restart_policy.py +387 -0
  164. claude_mpm/services/local_ops/state_manager.py +372 -0
  165. claude_mpm/services/local_ops/unified_manager.py +600 -0
  166. claude_mpm/services/mcp_config_manager.py +9 -4
  167. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  168. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  169. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
  170. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  171. claude_mpm/services/memory_hook_service.py +4 -1
  172. claude_mpm/services/model/__init__.py +147 -0
  173. claude_mpm/services/model/base_provider.py +365 -0
  174. claude_mpm/services/model/claude_provider.py +412 -0
  175. claude_mpm/services/model/model_router.py +453 -0
  176. claude_mpm/services/model/ollama_provider.py +415 -0
  177. claude_mpm/services/monitor/daemon_manager.py +3 -2
  178. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  179. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  180. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  181. claude_mpm/services/monitor/server.py +2 -1
  182. claude_mpm/services/session_management_service.py +3 -2
  183. claude_mpm/services/session_manager.py +205 -1
  184. claude_mpm/services/shared/async_service_base.py +16 -27
  185. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  186. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  187. claude_mpm/services/socketio/handlers/hook.py +13 -2
  188. claude_mpm/services/socketio/handlers/registry.py +4 -2
  189. claude_mpm/services/socketio/server/main.py +10 -8
  190. claude_mpm/services/subprocess_launcher_service.py +14 -5
  191. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  192. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  193. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  194. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  195. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  196. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  197. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  198. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  199. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  200. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  201. claude_mpm/services/unified/interfaces.py +3 -1
  202. claude_mpm/services/unified/unified_analyzer.py +14 -10
  203. claude_mpm/services/unified/unified_config.py +2 -1
  204. claude_mpm/services/unified/unified_deployment.py +9 -4
  205. claude_mpm/services/version_service.py +104 -1
  206. claude_mpm/skills/__init__.py +21 -0
  207. claude_mpm/skills/bundled/__init__.py +6 -0
  208. claude_mpm/skills/bundled/api-documentation.md +393 -0
  209. claude_mpm/skills/bundled/async-testing.md +571 -0
  210. claude_mpm/skills/bundled/code-review.md +143 -0
  211. claude_mpm/skills/bundled/database-migration.md +199 -0
  212. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  213. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  214. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  215. claude_mpm/skills/bundled/git-workflow.md +414 -0
  216. claude_mpm/skills/bundled/imagemagick.md +204 -0
  217. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  218. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  219. claude_mpm/skills/bundled/pdf.md +141 -0
  220. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  221. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  222. claude_mpm/skills/bundled/security-scanning.md +327 -0
  223. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  224. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  225. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  226. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  227. claude_mpm/skills/bundled/xlsx.md +157 -0
  228. claude_mpm/skills/registry.py +286 -0
  229. claude_mpm/skills/skill_manager.py +310 -0
  230. claude_mpm/tools/code_tree_analyzer.py +177 -141
  231. claude_mpm/tools/code_tree_events.py +4 -2
  232. claude_mpm/utils/agent_dependency_loader.py +2 -2
  233. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
  234. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
  235. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  236. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  237. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  238. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  239. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  240. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  241. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  242. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  243. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  244. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  245. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  246. claude_mpm/services/project/analyzer_refactored.py +0 -450
  247. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  248. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  249. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  250. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,307 @@
1
+ """
2
+ Restart Management Interfaces for Claude MPM Framework
3
+ ========================================================
4
+
5
+ WHY: This module defines interfaces for auto-restart functionality with crash
6
+ detection, intelligent restart policies, and circuit breaker patterns.
7
+
8
+ DESIGN DECISION: Restart interfaces are separated to enable modular restart
9
+ management with different crash detection strategies and restart policies.
10
+
11
+ ARCHITECTURE:
12
+ - ICrashDetector: Interface for detecting process crashes and failures
13
+ - IRestartPolicy: Interface for restart decision logic with backoff
14
+ - IRestartManager: Interface for orchestrating the full restart workflow
15
+
16
+ USAGE:
17
+ crash_detector = CrashDetector(health_manager)
18
+ restart_policy = RestartPolicy(config)
19
+ restart_manager = RestartManager(
20
+ process_manager=process_manager,
21
+ health_manager=health_manager,
22
+ config=config
23
+ )
24
+ restart_manager.enable_auto_restart(deployment_id)
25
+ """
26
+
27
+ from abc import ABC, abstractmethod
28
+ from typing import TYPE_CHECKING, Callable, Optional
29
+
30
+ if TYPE_CHECKING:
31
+ from claude_mpm.services.core.models.restart import RestartHistory
32
+
33
+
34
+ class ICrashDetector(ABC):
35
+ """
36
+ Interface for detecting process crashes and failures.
37
+
38
+ WHY: Crash detection requires monitoring health status changes, process
39
+ exits, and zombie states. This interface abstracts different detection
40
+ strategies to enable flexible crash monitoring.
41
+
42
+ DESIGN DECISION: Integrates with IHealthCheckManager via callbacks to
43
+ receive real-time status updates. Tracks crash history per deployment
44
+ to enable pattern detection.
45
+
46
+ Thread Safety: Implementations must be thread-safe for concurrent monitoring.
47
+ """
48
+
49
+ @abstractmethod
50
+ def register_crash_callback(self, callback: Callable[[str, str], None]) -> None:
51
+ """
52
+ Register a callback to be invoked when a crash is detected.
53
+
54
+ Args:
55
+ callback: Function called with (deployment_id, reason)
56
+ """
57
+
58
+ @abstractmethod
59
+ def start_monitoring(self, deployment_id: str) -> None:
60
+ """
61
+ Start monitoring a deployment for crashes.
62
+
63
+ WHY: Enables targeted monitoring for specific deployments.
64
+
65
+ Args:
66
+ deployment_id: Unique deployment identifier
67
+
68
+ Raises:
69
+ ValueError: If deployment_id not found
70
+ """
71
+
72
+ @abstractmethod
73
+ def stop_monitoring(self, deployment_id: str) -> None:
74
+ """
75
+ Stop monitoring a deployment.
76
+
77
+ Args:
78
+ deployment_id: Unique deployment identifier
79
+ """
80
+
81
+ @abstractmethod
82
+ def is_monitoring(self, deployment_id: str) -> bool:
83
+ """
84
+ Check if a deployment is being monitored.
85
+
86
+ Args:
87
+ deployment_id: Unique deployment identifier
88
+
89
+ Returns:
90
+ True if deployment is being monitored
91
+ """
92
+
93
+ @abstractmethod
94
+ def get_crash_count(self, deployment_id: str) -> int:
95
+ """
96
+ Get the number of crashes detected for a deployment.
97
+
98
+ Args:
99
+ deployment_id: Unique deployment identifier
100
+
101
+ Returns:
102
+ Number of crashes detected
103
+ """
104
+
105
+
106
+ class IRestartPolicy(ABC):
107
+ """
108
+ Interface for restart decision logic with exponential backoff.
109
+
110
+ WHY: Restart policies prevent restart loops through exponential backoff,
111
+ max attempts, and circuit breaker patterns. This interface abstracts
112
+ the decision-making logic to enable different strategies.
113
+
114
+ DESIGN DECISION: Implements exponential backoff with configurable
115
+ parameters and circuit breaker state transitions (CLOSED → OPEN → HALF_OPEN).
116
+
117
+ Circuit Breaker States:
118
+ - CLOSED: Normal operation, restarts allowed
119
+ - OPEN: Circuit breaker tripped, restarts blocked
120
+ - HALF_OPEN: Testing if service recovered
121
+ """
122
+
123
+ @abstractmethod
124
+ def should_restart(self, deployment_id: str) -> bool:
125
+ """
126
+ Determine if a deployment should be restarted.
127
+
128
+ WHY: Central decision point that considers attempt count, circuit
129
+ breaker state, and backoff timing.
130
+
131
+ Args:
132
+ deployment_id: Unique deployment identifier
133
+
134
+ Returns:
135
+ True if restart should proceed
136
+ """
137
+
138
+ @abstractmethod
139
+ def calculate_backoff(self, deployment_id: str) -> float:
140
+ """
141
+ Calculate backoff time in seconds for next restart.
142
+
143
+ WHY: Implements exponential backoff to prevent restart storms.
144
+ Formula: min(initial * (multiplier ** (attempt - 1)), max_backoff)
145
+
146
+ Args:
147
+ deployment_id: Unique deployment identifier
148
+
149
+ Returns:
150
+ Backoff time in seconds (0 if first attempt)
151
+ """
152
+
153
+ @abstractmethod
154
+ def record_restart_attempt(
155
+ self, deployment_id: str, success: bool, failure_reason: Optional[str] = None
156
+ ) -> None:
157
+ """
158
+ Record a restart attempt and update circuit breaker state.
159
+
160
+ Args:
161
+ deployment_id: Unique deployment identifier
162
+ success: Whether restart succeeded
163
+ failure_reason: Optional reason for failure
164
+ """
165
+
166
+ @abstractmethod
167
+ def reset_restart_history(self, deployment_id: str) -> None:
168
+ """
169
+ Reset restart history for a deployment.
170
+
171
+ WHY: Clears restart attempts after successful recovery or manual
172
+ intervention.
173
+
174
+ Args:
175
+ deployment_id: Unique deployment identifier
176
+ """
177
+
178
+ @abstractmethod
179
+ def get_circuit_breaker_state(self, deployment_id: str) -> str:
180
+ """
181
+ Get current circuit breaker state.
182
+
183
+ Args:
184
+ deployment_id: Unique deployment identifier
185
+
186
+ Returns:
187
+ Circuit breaker state (CLOSED, OPEN, HALF_OPEN)
188
+ """
189
+
190
+ @abstractmethod
191
+ def get_restart_attempt_count(self, deployment_id: str) -> int:
192
+ """
193
+ Get number of restart attempts for a deployment.
194
+
195
+ Args:
196
+ deployment_id: Unique deployment identifier
197
+
198
+ Returns:
199
+ Number of restart attempts
200
+ """
201
+
202
+
203
+ class IRestartManager(ABC):
204
+ """
205
+ Interface for orchestrating the complete restart workflow.
206
+
207
+ WHY: Restart management requires coordinating crash detection, policy
208
+ evaluation, process restart, and health verification. This interface
209
+ provides a high-level API for automatic and manual restarts.
210
+
211
+ DESIGN DECISION: Provides both automatic (background) and manual
212
+ (on-demand) restart operations. Integrates with all components:
213
+ CrashDetector, RestartPolicy, ProcessManager, and HealthCheckManager.
214
+
215
+ Restart Workflow:
216
+ 1. Detect crash (via CrashDetector callback)
217
+ 2. Check restart policy (max attempts, circuit breaker)
218
+ 3. Wait for backoff period
219
+ 4. Execute restart (preserve original StartConfig)
220
+ 5. Verify health after restart
221
+ 6. Record attempt and update circuit breaker
222
+ """
223
+
224
+ @abstractmethod
225
+ def enable_auto_restart(self, deployment_id: str) -> None:
226
+ """
227
+ Enable automatic restarts for a deployment.
228
+
229
+ WHY: Enables hands-free recovery from crashes. Starts monitoring
230
+ via CrashDetector and registers restart callbacks.
231
+
232
+ Args:
233
+ deployment_id: Unique deployment identifier
234
+
235
+ Raises:
236
+ ValueError: If deployment_id not found
237
+ """
238
+
239
+ @abstractmethod
240
+ def disable_auto_restart(self, deployment_id: str) -> None:
241
+ """
242
+ Disable automatic restarts for a deployment.
243
+
244
+ Args:
245
+ deployment_id: Unique deployment identifier
246
+ """
247
+
248
+ @abstractmethod
249
+ def is_auto_restart_enabled(self, deployment_id: str) -> bool:
250
+ """
251
+ Check if auto-restart is enabled for a deployment.
252
+
253
+ Args:
254
+ deployment_id: Unique deployment identifier
255
+
256
+ Returns:
257
+ True if auto-restart is enabled
258
+ """
259
+
260
+ @abstractmethod
261
+ def restart_deployment(self, deployment_id: str, manual: bool = False) -> bool:
262
+ """
263
+ Restart a deployment (manual or automatic trigger).
264
+
265
+ WHY: Provides unified restart operation that respects policy
266
+ constraints and performs health verification.
267
+
268
+ Args:
269
+ deployment_id: Unique deployment identifier
270
+ manual: If True, bypass some policy checks (e.g., circuit breaker)
271
+
272
+ Returns:
273
+ True if restart succeeded
274
+
275
+ Raises:
276
+ ValueError: If deployment_id not found
277
+ """
278
+
279
+ @abstractmethod
280
+ def get_restart_history(self, deployment_id: str) -> Optional["RestartHistory"]:
281
+ """
282
+ Get restart history for a deployment.
283
+
284
+ Args:
285
+ deployment_id: Unique deployment identifier
286
+
287
+ Returns:
288
+ RestartHistory if found, None otherwise
289
+ """
290
+
291
+ @abstractmethod
292
+ def clear_restart_history(self, deployment_id: str) -> None:
293
+ """
294
+ Clear restart history and reset circuit breaker.
295
+
296
+ WHY: Allows manual intervention to clear failed restart state.
297
+
298
+ Args:
299
+ deployment_id: Unique deployment identifier
300
+ """
301
+
302
+
303
+ __all__ = [
304
+ "ICrashDetector",
305
+ "IRestartManager",
306
+ "IRestartPolicy",
307
+ ]
@@ -0,0 +1,260 @@
1
+ """
2
+ Stability Monitoring Interfaces for Claude MPM Framework
3
+ ==========================================================
4
+
5
+ WHY: This module defines interfaces for proactive stability monitoring including
6
+ memory leak detection, log monitoring, and resource exhaustion prevention.
7
+
8
+ DESIGN DECISION: Separated from health checks to enable preventive monitoring
9
+ that triggers actions BEFORE crashes occur. Provides early warning systems.
10
+
11
+ ARCHITECTURE:
12
+ - IMemoryLeakDetector: Interface for memory leak detection using trend analysis
13
+ - ILogMonitor: Interface for real-time log file monitoring and pattern matching
14
+ - IResourceMonitor: Interface for comprehensive resource usage tracking
15
+
16
+ USAGE:
17
+ memory_detector = MemoryLeakDetector(leak_threshold_mb_per_minute=10.0)
18
+ log_monitor = LogMonitor(log_file="/var/log/app.log")
19
+ resource_monitor = ResourceMonitor(fd_threshold_percent=0.8)
20
+
21
+ # Integrate with health monitoring
22
+ health_manager.add_stability_monitors(
23
+ memory_detector=memory_detector,
24
+ log_monitor=log_monitor,
25
+ resource_monitor=resource_monitor,
26
+ )
27
+ """
28
+
29
+ from abc import ABC, abstractmethod
30
+ from typing import Callable, List
31
+
32
+ from claude_mpm.services.core.models.stability import (
33
+ LogPatternMatch,
34
+ MemoryTrend,
35
+ ResourceUsage,
36
+ )
37
+
38
+
39
+ class IMemoryLeakDetector(ABC):
40
+ """
41
+ Interface for memory leak detection using trend analysis.
42
+
43
+ WHY: Memory leaks are a common cause of process crashes. Early detection
44
+ enables preemptive restarts BEFORE the OOM killer terminates the process.
45
+
46
+ DESIGN DECISION: Uses slope-based trend analysis over a rolling window
47
+ to detect sustained memory growth patterns, filtering out normal variations.
48
+
49
+ Algorithm:
50
+ 1. Maintain rolling window of memory measurements (timestamp, memory_mb)
51
+ 2. Calculate linear regression slope (MB per minute)
52
+ 3. Detect leak if slope exceeds threshold (default: 10 MB/minute)
53
+ 4. Trigger alert when a leak is detected and memory exceeds 80% of the limit
54
+
55
+ Thread Safety: Implementations must be thread-safe for concurrent access.
56
+ """
57
+
58
+ @abstractmethod
59
+ def record_memory_usage(self, deployment_id: str, memory_mb: float) -> None:
60
+ """
61
+ Record a memory usage measurement.
62
+
63
+ WHY: Builds historical data for trend analysis. Should be called
64
+ periodically (e.g., every 30s) to collect sufficient data points.
65
+
66
+ Args:
67
+ deployment_id: Deployment identifier
68
+ memory_mb: Current memory usage in megabytes
69
+ """
70
+
71
+ @abstractmethod
72
+ def analyze_trend(self, deployment_id: str) -> MemoryTrend:
73
+ """
74
+ Analyze memory usage trend for leak detection.
75
+
76
+ WHY: Computes slope of memory usage over time to detect sustained
77
+ growth patterns characteristic of memory leaks.
78
+
79
+ Args:
80
+ deployment_id: Deployment identifier
81
+
82
+ Returns:
83
+ MemoryTrend with slope analysis and leak detection result
84
+
85
+ Algorithm:
86
+ slope_mb_per_minute = (recent_memory - old_memory) / time_delta_minutes
87
+ is_leaking = slope_mb_per_minute > threshold
88
+ """
89
+
90
+ @abstractmethod
91
+ def is_leaking(self, deployment_id: str) -> bool:
92
+ """
93
+ Check if deployment has a detected memory leak.
94
+
95
+ Returns:
96
+ True if leak detected (sustained memory growth)
97
+ """
98
+
99
+ @abstractmethod
100
+ def register_leak_callback(
101
+ self, callback: Callable[[str, MemoryTrend], None]
102
+ ) -> None:
103
+ """
104
+ Register callback for leak detection events.
105
+
106
+ Args:
107
+ callback: Function called with (deployment_id, trend) when leak detected
108
+ """
109
+
110
+
111
+ class ILogMonitor(ABC):
112
+ """
113
+ Interface for real-time log file monitoring and pattern matching.
114
+
115
+ WHY: Application logs contain early warning signals (exceptions, OOM errors,
116
+ segfaults) that predict imminent crashes. Real-time monitoring enables
117
+ proactive intervention.
118
+
119
+ DESIGN DECISION: Uses watchdog library for efficient file system monitoring.
120
+ Avoids polling by receiving file modification events from the OS.
121
+
122
+ Pattern Matching:
123
+ - Regex-based patterns for flexibility
124
+ - Configurable patterns per deployment
125
+ - Built-in patterns for common errors:
126
+ * OutOfMemoryError
127
+ * Segmentation fault
128
+ * Exception: / Traceback
129
+ * Database connection errors
130
+ * Network timeouts
131
+
132
+ Thread Safety: Uses watchdog's thread-safe event handling.
133
+ """
134
+
135
+ @abstractmethod
136
+ def start_monitoring(self, log_file: str, deployment_id: str) -> None:
137
+ """
138
+ Start monitoring a log file for error patterns.
139
+
140
+ WHY: Begins watching the log file for new entries. Uses OS-level
141
+ file system events for efficiency.
142
+
143
+ Args:
144
+ log_file: Path to log file to monitor
145
+ deployment_id: Deployment identifier for callbacks
146
+ """
147
+
148
+ @abstractmethod
149
+ def stop_monitoring(self, deployment_id: str) -> None:
150
+ """
151
+ Stop monitoring a deployment's log file.
152
+
153
+ Args:
154
+ deployment_id: Deployment identifier
155
+ """
156
+
157
+ @abstractmethod
158
+ def add_pattern(self, pattern: str, severity: str = "ERROR") -> None:
159
+ """
160
+ Add an error pattern to monitor.
161
+
162
+ Args:
163
+ pattern: Regex pattern to match
164
+ severity: Error severity (ERROR, CRITICAL, WARNING)
165
+ """
166
+
167
+ @abstractmethod
168
+ def get_recent_matches(
169
+ self, deployment_id: str, limit: int = 10
170
+ ) -> List[LogPatternMatch]:
171
+ """
172
+ Get recent pattern matches for a deployment.
173
+
174
+ Args:
175
+ deployment_id: Deployment identifier
176
+ limit: Maximum number of matches to return
177
+
178
+ Returns:
179
+ List of LogPatternMatch objects, newest first
180
+ """
181
+
182
+ @abstractmethod
183
+ def register_match_callback(
184
+ self, callback: Callable[[str, LogPatternMatch], None]
185
+ ) -> None:
186
+ """
187
+ Register callback for pattern matches.
188
+
189
+ Args:
190
+ callback: Function called with (deployment_id, match) when pattern detected
191
+ """
192
+
193
+
194
+ class IResourceMonitor(ABC):
195
+ """
196
+ Interface for comprehensive resource usage monitoring.
197
+
198
+ WHY: Resource exhaustion (file descriptors, threads, connections, disk space)
199
+ causes crashes and degradation. Monitoring enables preemptive action at 80%
200
+ thresholds before hitting hard limits.
201
+
202
+ DESIGN DECISION: Extends basic resource health checks with:
203
+ - Higher granularity (more frequent checks)
204
+ - Percentage-based thresholds (80% of ulimit)
205
+ - Trend analysis for growth rate
206
+ - Integration with restart manager for preemptive restarts
207
+
208
+ Resource Types:
209
+ 1. File Descriptors: Critical for I/O operations (Unix: ulimit -n)
210
+ 2. Threads: Memory and scheduling overhead
211
+ 3. Network Connections: Socket exhaustion
212
+ 4. Disk Space: Working directory availability
213
+
214
+ Thread Safety: Implementations must be thread-safe.
215
+ """
216
+
217
+ @abstractmethod
218
+ def check_resources(self, deployment_id: str) -> ResourceUsage:
219
+ """
220
+ Check resource usage for a deployment.
221
+
222
+ WHY: Provides comprehensive snapshot of resource consumption across
223
+ all monitored resource types.
224
+
225
+ Args:
226
+ deployment_id: Deployment identifier
227
+
228
+ Returns:
229
+ ResourceUsage with current metrics and critical status
230
+
231
+ Raises:
232
+ ValueError: If deployment not found
233
+ """
234
+
235
+ @abstractmethod
236
+ def is_critical(self, deployment_id: str) -> bool:
237
+ """
238
+ Check if any resource exceeds the critical threshold (>80% of its limit).
239
+
240
+ Returns:
241
+ True if any resource exceeds 80% of limit
242
+ """
243
+
244
+ @abstractmethod
245
+ def register_critical_callback(
246
+ self, callback: Callable[[str, ResourceUsage], None]
247
+ ) -> None:
248
+ """
249
+ Register callback for critical resource usage.
250
+
251
+ Args:
252
+ callback: Function called with (deployment_id, usage) when critical
253
+ """
254
+
255
+
256
+ __all__ = [
257
+ "ILogMonitor",
258
+ "IMemoryLeakDetector",
259
+ "IResourceMonitor",
260
+ ]
@@ -20,6 +20,24 @@ from .agent_config import (
20
20
  ConfigurationResult,
21
21
  ValidationResult,
22
22
  )
23
+ from .process import (
24
+ PROTECTED_PORT_RANGES,
25
+ DeploymentState,
26
+ ProcessInfo,
27
+ StartConfig,
28
+ is_port_protected,
29
+ )
30
+ from .restart import (
31
+ CircuitBreakerState,
32
+ RestartAttempt,
33
+ RestartConfig,
34
+ RestartHistory,
35
+ )
36
+ from .stability import (
37
+ LogPatternMatch,
38
+ MemoryTrend,
39
+ ResourceUsage,
40
+ )
23
41
  from .toolchain import (
24
42
  ConfidenceLevel,
25
43
  DeploymentTarget,
@@ -43,4 +61,19 @@ __all__ = [ # noqa: RUF022 - Grouped by category with comments for clarity
43
61
  "ConfigurationResult",
44
62
  "ValidationResult",
45
63
  "ConfigurationPreview",
64
+ # Process management models
65
+ "DeploymentState",
66
+ "ProcessInfo",
67
+ "StartConfig",
68
+ "PROTECTED_PORT_RANGES",
69
+ "is_port_protected",
70
+ # Restart management models
71
+ "CircuitBreakerState",
72
+ "RestartAttempt",
73
+ "RestartHistory",
74
+ "RestartConfig",
75
+ # Stability monitoring models
76
+ "MemoryTrend",
77
+ "LogPatternMatch",
78
+ "ResourceUsage",
46
79
  ]
@@ -17,6 +17,8 @@ from dataclasses import dataclass, field
17
17
  from enum import Enum
18
18
  from typing import Any, Dict, List, Optional
19
19
 
20
+ from ....core.enums import OperationResult, ValidationSeverity
21
+
20
22
 
21
23
  class AgentSpecialization(str, Enum):
22
24
  """Agent specialization categories.
@@ -154,20 +156,6 @@ class AgentRecommendation:
154
156
  }
155
157
 
156
158
 
157
- class ConfigurationStatus(str, Enum):
158
- """Status of configuration operation.
159
-
160
- WHY: Configuration can succeed, fail, or partially succeed. This enum
161
- provides a standardized way to communicate operation outcomes.
162
- """
163
-
164
- SUCCESS = "success"
165
- PARTIAL_SUCCESS = "partial_success"
166
- FAILURE = "failure"
167
- VALIDATION_ERROR = "validation_error"
168
- USER_CANCELLED = "user_cancelled"
169
-
170
-
171
159
  @dataclass
172
160
  class ConfigurationResult:
173
161
  """Result of automated configuration operation.
@@ -179,9 +167,17 @@ class ConfigurationResult:
179
167
  DESIGN DECISION: Separates successful and failed deployments to enable
180
168
  proper error handling. Includes validation results and user-facing
181
169
  messages for transparency.
170
+
171
+ NOTE: Uses core OperationResult enum (consolidated from ConfigurationStatus
172
+ in Phase 3A Batch 25). Mappings:
173
+ - SUCCESS → OperationResult.SUCCESS
174
+ - PARTIAL_SUCCESS → OperationResult.WARNING (partial success with issues)
175
+ - FAILURE → OperationResult.FAILED
176
+ - VALIDATION_ERROR → OperationResult.ERROR
177
+ - USER_CANCELLED → OperationResult.CANCELLED
182
178
  """
183
179
 
184
- status: ConfigurationStatus
180
+ status: OperationResult
185
181
  deployed_agents: List[str] = field(default_factory=list)
186
182
  failed_agents: List[str] = field(default_factory=list)
187
183
  validation_warnings: List[str] = field(default_factory=list)
@@ -193,7 +189,7 @@ class ConfigurationResult:
193
189
  @property
194
190
  def is_successful(self) -> bool:
195
191
  """Check if configuration was completely successful."""
196
- return self.status == ConfigurationStatus.SUCCESS
192
+ return self.status == OperationResult.SUCCESS
197
193
 
198
194
  @property
199
195
  def has_failures(self) -> bool:
@@ -223,18 +219,6 @@ class ConfigurationResult:
223
219
  }
224
220
 
225
221
 
226
- class ValidationSeverity(str, Enum):
227
- """Severity level for validation issues.
228
-
229
- WHY: Not all validation issues are equally critical. This enum enables
230
- categorization of issues by severity to support appropriate handling.
231
- """
232
-
233
- ERROR = "error" # Blocks deployment
234
- WARNING = "warning" # Should be reviewed but doesn't block
235
- INFO = "info" # Informational only
236
-
237
-
238
222
  @dataclass
239
223
  class ValidationIssue:
240
224
  """Represents a validation issue.