claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  3. claude_mpm/agents/BASE_PM.md +48 -17
  4. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  5. claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
  6. claude_mpm/agents/agent_loader.py +17 -5
  7. claude_mpm/agents/frontmatter_validator.py +284 -253
  8. claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
  9. claude_mpm/agents/templates/api_qa.json +7 -1
  10. claude_mpm/agents/templates/clerk-ops.json +8 -1
  11. claude_mpm/agents/templates/code_analyzer.json +4 -1
  12. claude_mpm/agents/templates/dart_engineer.json +11 -1
  13. claude_mpm/agents/templates/data_engineer.json +11 -1
  14. claude_mpm/agents/templates/documentation.json +6 -1
  15. claude_mpm/agents/templates/engineer.json +18 -1
  16. claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
  17. claude_mpm/agents/templates/golang_engineer.json +11 -1
  18. claude_mpm/agents/templates/java_engineer.json +12 -2
  19. claude_mpm/agents/templates/local_ops_agent.json +1217 -6
  20. claude_mpm/agents/templates/nextjs_engineer.json +11 -1
  21. claude_mpm/agents/templates/ops.json +8 -1
  22. claude_mpm/agents/templates/php-engineer.json +11 -1
  23. claude_mpm/agents/templates/project_organizer.json +10 -3
  24. claude_mpm/agents/templates/prompt-engineer.json +5 -1
  25. claude_mpm/agents/templates/python_engineer.json +11 -1
  26. claude_mpm/agents/templates/qa.json +7 -1
  27. claude_mpm/agents/templates/react_engineer.json +11 -1
  28. claude_mpm/agents/templates/refactoring_engineer.json +8 -1
  29. claude_mpm/agents/templates/research.json +4 -1
  30. claude_mpm/agents/templates/ruby-engineer.json +11 -1
  31. claude_mpm/agents/templates/rust_engineer.json +11 -1
  32. claude_mpm/agents/templates/security.json +6 -1
  33. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  34. claude_mpm/agents/templates/ticketing.json +6 -1
  35. claude_mpm/agents/templates/typescript_engineer.json +11 -1
  36. claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
  37. claude_mpm/agents/templates/version_control.json +8 -1
  38. claude_mpm/agents/templates/web_qa.json +7 -1
  39. claude_mpm/agents/templates/web_ui.json +11 -1
  40. claude_mpm/cli/__init__.py +34 -706
  41. claude_mpm/cli/commands/agent_manager.py +25 -12
  42. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  43. claude_mpm/cli/commands/agents.py +204 -148
  44. claude_mpm/cli/commands/aggregate.py +7 -3
  45. claude_mpm/cli/commands/analyze.py +9 -4
  46. claude_mpm/cli/commands/analyze_code.py +7 -2
  47. claude_mpm/cli/commands/auto_configure.py +7 -9
  48. claude_mpm/cli/commands/config.py +47 -13
  49. claude_mpm/cli/commands/configure.py +294 -1788
  50. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  51. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  52. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  53. claude_mpm/cli/commands/configure_models.py +18 -0
  54. claude_mpm/cli/commands/configure_navigation.py +167 -0
  55. claude_mpm/cli/commands/configure_paths.py +104 -0
  56. claude_mpm/cli/commands/configure_persistence.py +254 -0
  57. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  58. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  59. claude_mpm/cli/commands/configure_validators.py +73 -0
  60. claude_mpm/cli/commands/local_deploy.py +537 -0
  61. claude_mpm/cli/commands/memory.py +54 -20
  62. claude_mpm/cli/commands/mpm_init.py +39 -25
  63. claude_mpm/cli/commands/mpm_init_handler.py +8 -3
  64. claude_mpm/cli/executor.py +202 -0
  65. claude_mpm/cli/helpers.py +105 -0
  66. claude_mpm/cli/interactive/__init__.py +3 -0
  67. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  68. claude_mpm/cli/parsers/__init__.py +7 -1
  69. claude_mpm/cli/parsers/base_parser.py +98 -3
  70. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  71. claude_mpm/cli/shared/output_formatters.py +28 -19
  72. claude_mpm/cli/startup.py +481 -0
  73. claude_mpm/cli/utils.py +52 -1
  74. claude_mpm/commands/mpm-help.md +3 -0
  75. claude_mpm/commands/mpm-version.md +113 -0
  76. claude_mpm/commands/mpm.md +1 -0
  77. claude_mpm/config/agent_config.py +2 -2
  78. claude_mpm/config/model_config.py +428 -0
  79. claude_mpm/core/base_service.py +13 -12
  80. claude_mpm/core/enums.py +452 -0
  81. claude_mpm/core/factories.py +1 -1
  82. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  83. claude_mpm/core/interactive_session.py +9 -3
  84. claude_mpm/core/logging_config.py +6 -2
  85. claude_mpm/core/oneshot_session.py +8 -4
  86. claude_mpm/core/optimized_agent_loader.py +3 -3
  87. claude_mpm/core/output_style_manager.py +12 -192
  88. claude_mpm/core/service_registry.py +5 -1
  89. claude_mpm/core/types.py +2 -9
  90. claude_mpm/core/typing_utils.py +7 -6
  91. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  92. claude_mpm/dashboard/templates/index.html +3 -41
  93. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  94. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  95. claude_mpm/models/resume_log.py +340 -0
  96. claude_mpm/services/agents/auto_config_manager.py +10 -11
  97. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  98. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  99. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  100. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  101. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  102. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  103. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  104. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  105. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  106. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  107. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  108. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  109. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  110. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  111. claude_mpm/services/agents/local_template_manager.py +1 -1
  112. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  113. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  114. claude_mpm/services/command_handler_service.py +11 -5
  115. claude_mpm/services/core/interfaces/__init__.py +74 -2
  116. claude_mpm/services/core/interfaces/health.py +172 -0
  117. claude_mpm/services/core/interfaces/model.py +281 -0
  118. claude_mpm/services/core/interfaces/process.py +372 -0
  119. claude_mpm/services/core/interfaces/restart.py +307 -0
  120. claude_mpm/services/core/interfaces/stability.py +260 -0
  121. claude_mpm/services/core/models/__init__.py +33 -0
  122. claude_mpm/services/core/models/agent_config.py +12 -28
  123. claude_mpm/services/core/models/health.py +162 -0
  124. claude_mpm/services/core/models/process.py +235 -0
  125. claude_mpm/services/core/models/restart.py +302 -0
  126. claude_mpm/services/core/models/stability.py +264 -0
  127. claude_mpm/services/core/path_resolver.py +23 -7
  128. claude_mpm/services/diagnostics/__init__.py +2 -2
  129. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  130. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  131. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  132. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  133. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  134. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  135. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  136. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  137. claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
  138. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  139. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  140. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  141. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  142. claude_mpm/services/diagnostics/models.py +19 -24
  143. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  144. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  145. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  146. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  147. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  148. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  149. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  150. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  151. claude_mpm/services/local_ops/__init__.py +163 -0
  152. claude_mpm/services/local_ops/crash_detector.py +257 -0
  153. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  154. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  155. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  156. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  157. claude_mpm/services/local_ops/health_manager.py +430 -0
  158. claude_mpm/services/local_ops/log_monitor.py +396 -0
  159. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  160. claude_mpm/services/local_ops/process_manager.py +595 -0
  161. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  162. claude_mpm/services/local_ops/restart_manager.py +401 -0
  163. claude_mpm/services/local_ops/restart_policy.py +387 -0
  164. claude_mpm/services/local_ops/state_manager.py +372 -0
  165. claude_mpm/services/local_ops/unified_manager.py +600 -0
  166. claude_mpm/services/mcp_config_manager.py +9 -4
  167. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  168. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  169. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
  170. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  171. claude_mpm/services/memory_hook_service.py +4 -1
  172. claude_mpm/services/model/__init__.py +147 -0
  173. claude_mpm/services/model/base_provider.py +365 -0
  174. claude_mpm/services/model/claude_provider.py +412 -0
  175. claude_mpm/services/model/model_router.py +453 -0
  176. claude_mpm/services/model/ollama_provider.py +415 -0
  177. claude_mpm/services/monitor/daemon_manager.py +3 -2
  178. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  179. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  180. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  181. claude_mpm/services/monitor/server.py +2 -1
  182. claude_mpm/services/session_management_service.py +3 -2
  183. claude_mpm/services/session_manager.py +205 -1
  184. claude_mpm/services/shared/async_service_base.py +16 -27
  185. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  186. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  187. claude_mpm/services/socketio/handlers/hook.py +13 -2
  188. claude_mpm/services/socketio/handlers/registry.py +4 -2
  189. claude_mpm/services/socketio/server/main.py +10 -8
  190. claude_mpm/services/subprocess_launcher_service.py +14 -5
  191. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  192. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  193. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  194. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  195. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  196. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  197. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  198. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  199. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  200. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  201. claude_mpm/services/unified/interfaces.py +3 -1
  202. claude_mpm/services/unified/unified_analyzer.py +14 -10
  203. claude_mpm/services/unified/unified_config.py +2 -1
  204. claude_mpm/services/unified/unified_deployment.py +9 -4
  205. claude_mpm/services/version_service.py +104 -1
  206. claude_mpm/skills/__init__.py +21 -0
  207. claude_mpm/skills/bundled/__init__.py +6 -0
  208. claude_mpm/skills/bundled/api-documentation.md +393 -0
  209. claude_mpm/skills/bundled/async-testing.md +571 -0
  210. claude_mpm/skills/bundled/code-review.md +143 -0
  211. claude_mpm/skills/bundled/database-migration.md +199 -0
  212. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  213. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  214. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  215. claude_mpm/skills/bundled/git-workflow.md +414 -0
  216. claude_mpm/skills/bundled/imagemagick.md +204 -0
  217. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  218. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  219. claude_mpm/skills/bundled/pdf.md +141 -0
  220. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  221. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  222. claude_mpm/skills/bundled/security-scanning.md +327 -0
  223. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  224. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  225. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  226. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  227. claude_mpm/skills/bundled/xlsx.md +157 -0
  228. claude_mpm/skills/registry.py +286 -0
  229. claude_mpm/skills/skill_manager.py +310 -0
  230. claude_mpm/tools/code_tree_analyzer.py +177 -141
  231. claude_mpm/tools/code_tree_events.py +4 -2
  232. claude_mpm/utils/agent_dependency_loader.py +2 -2
  233. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
  234. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
  235. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  236. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  237. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  238. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  239. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  240. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  241. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  242. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  243. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  244. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  245. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  246. claude_mpm/services/project/analyzer_refactored.py +0 -450
  247. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  248. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  249. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  250. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,257 @@
1
+ """
2
+ Crash Detector for Claude MPM Framework
3
+ ========================================
4
+
5
+ WHY: Detects process crashes and failures by monitoring health status changes,
6
+ process exits, and zombie states. Integrates with HealthCheckManager to receive
7
+ real-time status updates.
8
+
9
+ DESIGN DECISION: Uses callback-based architecture to receive health status
10
+ changes from HealthCheckManager. Tracks crash history per deployment to
11
+ enable pattern detection and intelligent restart policies.
12
+
13
+ ARCHITECTURE:
14
+ - Subscribes to HealthCheckManager status change callbacks
15
+ - Detects crashes when status transitions to UNHEALTHY
16
+ - Tracks crash count per deployment
17
+ - Invokes registered crash callbacks when crash detected
18
+
19
+ USAGE:
20
+ crash_detector = CrashDetector(health_manager)
21
+ crash_detector.register_crash_callback(handle_crash)
22
+ crash_detector.start_monitoring(deployment_id)
23
+ """
24
+
25
+ import threading
26
+ from collections import defaultdict
27
+ from typing import Callable, Dict, List, Set
28
+
29
+ from claude_mpm.core.enums import HealthStatus
30
+ from claude_mpm.services.core.base import SyncBaseService
31
+ from claude_mpm.services.core.interfaces.health import IHealthCheckManager
32
+ from claude_mpm.services.core.interfaces.restart import ICrashDetector
33
+
34
+
35
class CrashDetector(SyncBaseService, ICrashDetector):
    """
    Detects process crashes via health status monitoring.

    WHY: Provides automated crash detection by monitoring health status
    changes. Enables reactive restart policies based on crash events.

    Thread Safety: All shared state is guarded by a single non-reentrant
    ``threading.Lock``. The lock is never held while calling out to the
    health manager or to user-supplied crash callbacks, so those callees may
    safely call back into this detector without deadlocking.
    """

    def __init__(self, health_manager: IHealthCheckManager):
        """
        Initialize crash detector.

        Args:
            health_manager: Health check manager for status monitoring
        """
        super().__init__("CrashDetector")
        self.health_manager = health_manager
        self._lock = threading.Lock()

        # Deployments being monitored
        self._monitored_deployments: Set[str] = set()

        # Last known health status per deployment
        self._last_health_status: Dict[str, HealthStatus] = {}

        # Crash count per deployment
        self._crash_count: Dict[str, int] = defaultdict(int)

        # Crash callbacks: List of functions called with (deployment_id, reason)
        self._crash_callbacks: List[Callable[[str, str], None]] = []

    def initialize(self) -> bool:
        """
        Initialize the crash detector.

        Subscribes ``_on_health_status_change`` to the health manager so that
        status transitions are delivered to this detector.

        Returns:
            True if initialization successful
        """
        self.logger.info("Initializing CrashDetector")

        # Register with health manager to receive status change callbacks
        self.health_manager.register_status_callback(self._on_health_status_change)

        self.logger.info("CrashDetector initialized successfully")
        return True

    def register_crash_callback(self, callback: Callable[[str, str], None]) -> None:
        """
        Register a callback to be invoked when a crash is detected.

        Args:
            callback: Function called with (deployment_id, reason)
        """
        with self._lock:
            self._crash_callbacks.append(callback)

        # Callables such as functools.partial have no __name__; fall back to repr.
        callback_name = getattr(callback, "__name__", repr(callback))
        self.logger.debug(f"Registered crash callback: {callback_name}")

    def start_monitoring(self, deployment_id: str) -> None:
        """
        Start monitoring a deployment for crashes.

        The deployment is added to the monitored set and its current health
        is queried once to seed the last-known status. Any exception raised
        by the health manager during that query is logged and the status is
        recorded as ``HealthStatus.UNKNOWN``; nothing is propagated.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._monitored_deployments.add(deployment_id)
        self.logger.info(f"Started crash monitoring for deployment: {deployment_id}")

        # Query the initial status WITHOUT holding the lock: the check may be
        # slow, and a health manager that dispatches status callbacks
        # synchronously would re-enter _on_health_status_change, which needs
        # the same (non-reentrant) lock.
        try:
            health = self.health_manager.check_health(deployment_id)
            initial_status = health.overall_status
            self.logger.debug(
                f"Initial health status for {deployment_id}: {initial_status.value}"
            )
        except Exception as e:
            self.logger.warning(
                f"Failed to get initial health status for {deployment_id}: {e}"
            )
            initial_status = HealthStatus.UNKNOWN

        with self._lock:
            # Skip the write if monitoring was stopped while the check ran, so
            # no stale entry is left behind for an unmonitored deployment.
            if deployment_id in self._monitored_deployments:
                self._last_health_status[deployment_id] = initial_status

    def stop_monitoring(self, deployment_id: str) -> None:
        """
        Stop monitoring a deployment.

        Removes the deployment from the monitored set and forgets its last
        known status. The crash count is left untouched.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._monitored_deployments.discard(deployment_id)
            self._last_health_status.pop(deployment_id, None)
        self.logger.info(f"Stopped crash monitoring for deployment: {deployment_id}")

    def is_monitoring(self, deployment_id: str) -> bool:
        """
        Check if a deployment is being monitored.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if deployment is being monitored
        """
        with self._lock:
            return deployment_id in self._monitored_deployments

    def get_crash_count(self, deployment_id: str) -> int:
        """
        Get the number of crashes detected for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Number of crashes detected (0 if none were recorded)
        """
        with self._lock:
            # .get() avoids creating a defaultdict entry for unknown ids.
            return self._crash_count.get(deployment_id, 0)

    def reset_crash_count(self, deployment_id: str) -> None:
        """
        Reset crash count for a deployment.

        WHY: Allows manual intervention to clear crash history.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._crash_count[deployment_id] = 0
        self.logger.debug(f"Reset crash count for deployment: {deployment_id}")

    def shutdown(self) -> bool:
        """
        Shutdown the crash detector.

        Clears the monitored set, last-known statuses, crash counts and
        registered callbacks.

        Returns:
            True if shutdown successful
        """
        with self._lock:
            self._monitored_deployments.clear()
            self._last_health_status.clear()
            self._crash_count.clear()
            self._crash_callbacks.clear()
        self.logger.info("CrashDetector shutdown successfully")
        return True

    def _on_health_status_change(
        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
    ) -> None:
        """
        Handle health status changes from HealthCheckManager.

        WHY: Callback invoked by HealthCheckManager when status changes.
        Detects crashes when status transitions to UNHEALTHY.

        Args:
            deployment_id: Unique deployment identifier
            old_status: Previous health status
            new_status: New health status
        """
        crash_reason = ""

        with self._lock:
            # Only process if we're monitoring this deployment
            if deployment_id not in self._monitored_deployments:
                return

            # Update last known status
            self._last_health_status[deployment_id] = new_status

            # Detect crash: transition from operational to critical
            # (is_operational / is_critical are defined on HealthStatus).
            if old_status.is_operational() and new_status.is_critical():
                crash_reason = "Health status transitioned to UNHEALTHY"

            # Also detect: transition from UNKNOWN to UNHEALTHY (process died)
            elif (
                old_status == HealthStatus.UNKNOWN
                and new_status == HealthStatus.UNHEALTHY
            ):
                crash_reason = "Process became unhealthy"

        # Dispatch after releasing the lock so crash callbacks can call back
        # into this detector (e.g. get_crash_count) without deadlocking.
        if crash_reason:
            self._handle_crash(deployment_id, crash_reason)

    def _handle_crash(self, deployment_id: str, reason: str) -> None:
        """
        Handle detected crash.

        WHY: Increments crash count and invokes all registered callbacks.

        Must be called WITHOUT ``self._lock`` held: the lock is taken
        internally for the bookkeeping and released before callbacks run.
        An exception raised by one callback is logged and does not prevent
        the remaining callbacks from running.

        Args:
            deployment_id: Unique deployment identifier
            reason: Reason for crash detection
        """
        with self._lock:
            # Increment crash count
            self._crash_count[deployment_id] += 1
            crash_count = self._crash_count[deployment_id]
            # Snapshot so callbacks may register/clear callbacks while we iterate.
            callbacks = list(self._crash_callbacks)

        self.logger.warning(
            f"Crash detected for deployment {deployment_id} "
            f"(count: {crash_count}): {reason}"
        )

        # Invoke all crash callbacks
        for callback in callbacks:
            try:
                callback(deployment_id, reason)
            except Exception as e:
                # Not every callable has __name__ (partials, callable objects).
                callback_name = getattr(callback, "__name__", repr(callback))
                self.logger.error(
                    f"Error invoking crash callback {callback_name}: {e}",
                    exc_info=True,
                )
255
+
256
+
257
+ __all__ = ["CrashDetector"]
@@ -0,0 +1,28 @@
1
+ """
2
+ Health Check Implementations for Local Operations
3
+ ==================================================
4
+
5
+ WHY: Provides three-tier health monitoring for local deployments:
6
+ - HTTP health checks for endpoint availability
7
+ - Process health checks for process status
8
+ - Resource health checks for CPU/memory/connections
9
+
10
+ ARCHITECTURE:
11
+ - HttpHealthCheck: HTTP endpoint availability and response time
12
+ - ProcessHealthCheck: Process existence and status validation
13
+ - ResourceHealthCheck: CPU, memory, and connection monitoring
14
+ """
15
+
16
+ from claude_mpm.services.local_ops.health_checks.http_check import HttpHealthCheck
17
+ from claude_mpm.services.local_ops.health_checks.process_check import (
18
+ ProcessHealthCheck,
19
+ )
20
+ from claude_mpm.services.local_ops.health_checks.resource_check import (
21
+ ResourceHealthCheck,
22
+ )
23
+
24
+ __all__ = [
25
+ "HttpHealthCheck",
26
+ "ProcessHealthCheck",
27
+ "ResourceHealthCheck",
28
+ ]
@@ -0,0 +1,224 @@
1
+ """
2
+ HTTP Health Check for Claude MPM Framework
3
+ ===========================================
4
+
5
+ WHY: Provides HTTP endpoint health monitoring with response time measurement,
6
+ status code validation, and timeout handling.
7
+
8
+ DESIGN DECISION: Uses requests library with configurable timeout and retry logic.
9
+ Supports custom headers and SSL/TLS validation.
10
+
11
+ ARCHITECTURE:
12
+ - Synchronous HTTP GET requests
13
+ - Response time measurement with time.perf_counter()
14
+ - Status code validation (2xx/3xx = healthy)
15
+ - Timeout and connection error handling
16
+ - Retry logic with exponential backoff
17
+
18
+ USAGE:
19
+ http_check = HttpHealthCheck(process_manager)
20
+ result = http_check.check(
21
+ deployment_id="my-app",
22
+ endpoint="http://localhost:3000/health",
23
+ timeout=5.0
24
+ )
25
+ """
26
+
27
+ import time
28
+
29
+ import requests
30
+ from requests.exceptions import ConnectionError, RequestException, Timeout
31
+
32
+ from claude_mpm.core.enums import HealthStatus
33
+ from claude_mpm.services.core.base import SyncBaseService
34
+ from claude_mpm.services.core.interfaces.health import IHealthCheck
35
+ from claude_mpm.services.core.interfaces.process import ILocalProcessManager
36
+ from claude_mpm.services.core.models.health import HealthCheckResult
37
+
38
+
39
class HttpHealthCheck(SyncBaseService, IHealthCheck):
    """
    HTTP endpoint health check implementation.

    WHY: Validates that deployed services are accessible via HTTP and
    responding within acceptable timeframes.

    Thread Safety: ``check`` keeps no per-call state on the instance; it only
    reads the configuration set in ``__init__``.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        default_timeout: float = 5.0,
        max_retries: int = 2,
    ):
        """
        Initialize HTTP health check.

        Args:
            process_manager: Process manager for deployment lookup
            default_timeout: Default timeout in seconds
            max_retries: Maximum number of retry attempts after the first try
        """
        super().__init__("HttpHealthCheck")
        self.process_manager = process_manager
        self.default_timeout = default_timeout
        self.max_retries = max_retries

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("HTTP health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "http"

    def _retry_after_backoff(
        self, deployment_id: str, attempt: int, problem: str
    ) -> bool:
        """
        Sleep with exponential backoff if another attempt is allowed.

        Args:
            deployment_id: Deployment being checked (for the log message)
            attempt: Zero-based index of the attempt that just failed
            problem: Short description of the failure used as log prefix

        Returns:
            True if the caller should retry, False if retries are exhausted
        """
        if attempt >= self.max_retries:
            return False

        self.log_debug(
            f"{problem} for {deployment_id}, "
            f"retrying (attempt {attempt + 1}/{self.max_retries})"
        )
        time.sleep(0.5 * (2**attempt))  # Exponential backoff: 0.5s, 1s, 2s, ...
        return True

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute HTTP health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - endpoint: HTTP endpoint URL. When omitted, falls back to
                  ``http://localhost:<deployment.port>/health`` if the
                  deployment has a port.
                - timeout: Request timeout in seconds (default:
                  ``default_timeout``; ``None`` also selects the default)
                - headers: Custom HTTP headers
                - verify_ssl: Verify SSL certificates (default: True)
                - expected_status: Status code accepted as healthy in
                  addition to any 2xx/3xx response (default: 200)

        Returns:
            HealthCheckResult with check status and details:
                - HEALTHY: status equals ``expected_status`` or is 2xx/3xx
                - DEGRADED: any other status code, or timeout on every attempt
                - UNHEALTHY: connection failure on every attempt, or any
                  other ``RequestException``
                - UNKNOWN: no endpoint given and the deployment has no port

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        check_type = self.get_check_type()

        # Get endpoint from kwargs
        endpoint = kwargs.get("endpoint")
        if not endpoint:
            # Try to construct from deployment port
            if deployment.port:
                endpoint = f"http://localhost:{deployment.port}/health"
            else:
                return HealthCheckResult(
                    status=HealthStatus.UNKNOWN,
                    check_type=check_type,
                    message="No HTTP endpoint configured for deployment",
                    details={"deployment_id": deployment_id},
                )

        # Get optional parameters
        timeout = kwargs.get("timeout", self.default_timeout)
        if timeout is None:
            # requests treats timeout=None as "wait forever"; a health check
            # must never block indefinitely, so fall back to the default.
            timeout = self.default_timeout
        headers = kwargs.get("headers", {})
        verify_ssl = kwargs.get("verify_ssl", True)
        expected_status = kwargs.get("expected_status", 200)

        total_attempts = self.max_retries + 1

        # Perform HTTP check with retries
        for attempt in range(total_attempts):
            try:
                start_time = time.perf_counter()
                response = requests.get(
                    endpoint, timeout=timeout, headers=headers, verify=verify_ssl
                )
                response_time = time.perf_counter() - start_time
                response_time_ms = round(response_time * 1000, 2)

                # Check status code
                if response.status_code == expected_status or (
                    200 <= response.status_code < 400
                ):
                    return HealthCheckResult(
                        status=HealthStatus.HEALTHY,
                        check_type=check_type,
                        message="HTTP endpoint responding normally",
                        details={
                            "endpoint": endpoint,
                            "status_code": response.status_code,
                            "response_time_ms": response_time_ms,
                            "attempt": attempt + 1,
                        },
                    )

                # The server answered, but with a status outside the accepted
                # set; this is reported immediately without retrying.
                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=check_type,
                    message="HTTP endpoint returned unexpected status code",
                    details={
                        "endpoint": endpoint,
                        "status_code": response.status_code,
                        "expected_status": expected_status,
                        "response_time_ms": response_time_ms,
                    },
                )

            # Timeout is handled before ConnectionError on purpose: the order
            # decides which handler sees exceptions that derive from both.
            except Timeout:
                if self._retry_after_backoff(
                    deployment_id, attempt, "HTTP check timeout"
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=check_type,
                    message=f"HTTP endpoint timeout after {total_attempts} attempts",
                    details={
                        "endpoint": endpoint,
                        "timeout_seconds": timeout,
                        "attempts": total_attempts,
                    },
                )

            except ConnectionError as e:
                if self._retry_after_backoff(
                    deployment_id, attempt, "HTTP connection error"
                ):
                    continue

                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=check_type,
                    message="Cannot connect to HTTP endpoint",
                    details={
                        "endpoint": endpoint,
                        "error": str(e),
                        "attempts": total_attempts,
                    },
                )

            except RequestException as e:
                # Any other requests failure is reported without retrying.
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=check_type,
                    message="HTTP request failed",
                    details={"endpoint": endpoint, "error": str(e)},
                )

        # Every loop iteration returns or retries, so this is reached only
        # when the loop body never ran (i.e. max_retries is negative).
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=check_type,
            message="HTTP check completed with unknown result",
            details={"endpoint": endpoint},
        )
222
+
223
+
224
+ __all__ = ["HttpHealthCheck"]