claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  3. claude_mpm/agents/BASE_PM.md +48 -17
  4. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  5. claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
  6. claude_mpm/agents/agent_loader.py +17 -5
  7. claude_mpm/agents/frontmatter_validator.py +284 -253
  8. claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
  9. claude_mpm/agents/templates/api_qa.json +7 -1
  10. claude_mpm/agents/templates/clerk-ops.json +8 -1
  11. claude_mpm/agents/templates/code_analyzer.json +4 -1
  12. claude_mpm/agents/templates/dart_engineer.json +11 -1
  13. claude_mpm/agents/templates/data_engineer.json +11 -1
  14. claude_mpm/agents/templates/documentation.json +6 -1
  15. claude_mpm/agents/templates/engineer.json +18 -1
  16. claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
  17. claude_mpm/agents/templates/golang_engineer.json +11 -1
  18. claude_mpm/agents/templates/java_engineer.json +12 -2
  19. claude_mpm/agents/templates/local_ops_agent.json +1217 -6
  20. claude_mpm/agents/templates/nextjs_engineer.json +11 -1
  21. claude_mpm/agents/templates/ops.json +8 -1
  22. claude_mpm/agents/templates/php-engineer.json +11 -1
  23. claude_mpm/agents/templates/project_organizer.json +10 -3
  24. claude_mpm/agents/templates/prompt-engineer.json +5 -1
  25. claude_mpm/agents/templates/python_engineer.json +11 -1
  26. claude_mpm/agents/templates/qa.json +7 -1
  27. claude_mpm/agents/templates/react_engineer.json +11 -1
  28. claude_mpm/agents/templates/refactoring_engineer.json +8 -1
  29. claude_mpm/agents/templates/research.json +4 -1
  30. claude_mpm/agents/templates/ruby-engineer.json +11 -1
  31. claude_mpm/agents/templates/rust_engineer.json +11 -1
  32. claude_mpm/agents/templates/security.json +6 -1
  33. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  34. claude_mpm/agents/templates/ticketing.json +6 -1
  35. claude_mpm/agents/templates/typescript_engineer.json +11 -1
  36. claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
  37. claude_mpm/agents/templates/version_control.json +8 -1
  38. claude_mpm/agents/templates/web_qa.json +7 -1
  39. claude_mpm/agents/templates/web_ui.json +11 -1
  40. claude_mpm/cli/__init__.py +34 -706
  41. claude_mpm/cli/commands/agent_manager.py +25 -12
  42. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  43. claude_mpm/cli/commands/agents.py +204 -148
  44. claude_mpm/cli/commands/aggregate.py +7 -3
  45. claude_mpm/cli/commands/analyze.py +9 -4
  46. claude_mpm/cli/commands/analyze_code.py +7 -2
  47. claude_mpm/cli/commands/auto_configure.py +7 -9
  48. claude_mpm/cli/commands/config.py +47 -13
  49. claude_mpm/cli/commands/configure.py +294 -1788
  50. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  51. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  52. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  53. claude_mpm/cli/commands/configure_models.py +18 -0
  54. claude_mpm/cli/commands/configure_navigation.py +167 -0
  55. claude_mpm/cli/commands/configure_paths.py +104 -0
  56. claude_mpm/cli/commands/configure_persistence.py +254 -0
  57. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  58. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  59. claude_mpm/cli/commands/configure_validators.py +73 -0
  60. claude_mpm/cli/commands/local_deploy.py +537 -0
  61. claude_mpm/cli/commands/memory.py +54 -20
  62. claude_mpm/cli/commands/mpm_init.py +39 -25
  63. claude_mpm/cli/commands/mpm_init_handler.py +8 -3
  64. claude_mpm/cli/executor.py +202 -0
  65. claude_mpm/cli/helpers.py +105 -0
  66. claude_mpm/cli/interactive/__init__.py +3 -0
  67. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  68. claude_mpm/cli/parsers/__init__.py +7 -1
  69. claude_mpm/cli/parsers/base_parser.py +98 -3
  70. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  71. claude_mpm/cli/shared/output_formatters.py +28 -19
  72. claude_mpm/cli/startup.py +481 -0
  73. claude_mpm/cli/utils.py +52 -1
  74. claude_mpm/commands/mpm-help.md +3 -0
  75. claude_mpm/commands/mpm-version.md +113 -0
  76. claude_mpm/commands/mpm.md +1 -0
  77. claude_mpm/config/agent_config.py +2 -2
  78. claude_mpm/config/model_config.py +428 -0
  79. claude_mpm/core/base_service.py +13 -12
  80. claude_mpm/core/enums.py +452 -0
  81. claude_mpm/core/factories.py +1 -1
  82. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  83. claude_mpm/core/interactive_session.py +9 -3
  84. claude_mpm/core/logging_config.py +6 -2
  85. claude_mpm/core/oneshot_session.py +8 -4
  86. claude_mpm/core/optimized_agent_loader.py +3 -3
  87. claude_mpm/core/output_style_manager.py +12 -192
  88. claude_mpm/core/service_registry.py +5 -1
  89. claude_mpm/core/types.py +2 -9
  90. claude_mpm/core/typing_utils.py +7 -6
  91. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  92. claude_mpm/dashboard/templates/index.html +3 -41
  93. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  94. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  95. claude_mpm/models/resume_log.py +340 -0
  96. claude_mpm/services/agents/auto_config_manager.py +10 -11
  97. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  98. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  99. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  100. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  101. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  102. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  103. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  104. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  105. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  106. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  107. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  108. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  109. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  110. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  111. claude_mpm/services/agents/local_template_manager.py +1 -1
  112. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  113. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  114. claude_mpm/services/command_handler_service.py +11 -5
  115. claude_mpm/services/core/interfaces/__init__.py +74 -2
  116. claude_mpm/services/core/interfaces/health.py +172 -0
  117. claude_mpm/services/core/interfaces/model.py +281 -0
  118. claude_mpm/services/core/interfaces/process.py +372 -0
  119. claude_mpm/services/core/interfaces/restart.py +307 -0
  120. claude_mpm/services/core/interfaces/stability.py +260 -0
  121. claude_mpm/services/core/models/__init__.py +33 -0
  122. claude_mpm/services/core/models/agent_config.py +12 -28
  123. claude_mpm/services/core/models/health.py +162 -0
  124. claude_mpm/services/core/models/process.py +235 -0
  125. claude_mpm/services/core/models/restart.py +302 -0
  126. claude_mpm/services/core/models/stability.py +264 -0
  127. claude_mpm/services/core/path_resolver.py +23 -7
  128. claude_mpm/services/diagnostics/__init__.py +2 -2
  129. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  130. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  131. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  132. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  133. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  134. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  135. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  136. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  137. claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
  138. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  139. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  140. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  141. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  142. claude_mpm/services/diagnostics/models.py +19 -24
  143. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  144. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  145. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  146. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  147. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  148. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  149. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  150. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  151. claude_mpm/services/local_ops/__init__.py +163 -0
  152. claude_mpm/services/local_ops/crash_detector.py +257 -0
  153. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  154. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  155. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  156. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  157. claude_mpm/services/local_ops/health_manager.py +430 -0
  158. claude_mpm/services/local_ops/log_monitor.py +396 -0
  159. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  160. claude_mpm/services/local_ops/process_manager.py +595 -0
  161. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  162. claude_mpm/services/local_ops/restart_manager.py +401 -0
  163. claude_mpm/services/local_ops/restart_policy.py +387 -0
  164. claude_mpm/services/local_ops/state_manager.py +372 -0
  165. claude_mpm/services/local_ops/unified_manager.py +600 -0
  166. claude_mpm/services/mcp_config_manager.py +9 -4
  167. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  168. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  169. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
  170. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  171. claude_mpm/services/memory_hook_service.py +4 -1
  172. claude_mpm/services/model/__init__.py +147 -0
  173. claude_mpm/services/model/base_provider.py +365 -0
  174. claude_mpm/services/model/claude_provider.py +412 -0
  175. claude_mpm/services/model/model_router.py +453 -0
  176. claude_mpm/services/model/ollama_provider.py +415 -0
  177. claude_mpm/services/monitor/daemon_manager.py +3 -2
  178. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  179. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  180. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  181. claude_mpm/services/monitor/server.py +2 -1
  182. claude_mpm/services/session_management_service.py +3 -2
  183. claude_mpm/services/session_manager.py +205 -1
  184. claude_mpm/services/shared/async_service_base.py +16 -27
  185. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  186. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  187. claude_mpm/services/socketio/handlers/hook.py +13 -2
  188. claude_mpm/services/socketio/handlers/registry.py +4 -2
  189. claude_mpm/services/socketio/server/main.py +10 -8
  190. claude_mpm/services/subprocess_launcher_service.py +14 -5
  191. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  192. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  193. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  194. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  195. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  196. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  197. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  198. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  199. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  200. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  201. claude_mpm/services/unified/interfaces.py +3 -1
  202. claude_mpm/services/unified/unified_analyzer.py +14 -10
  203. claude_mpm/services/unified/unified_config.py +2 -1
  204. claude_mpm/services/unified/unified_deployment.py +9 -4
  205. claude_mpm/services/version_service.py +104 -1
  206. claude_mpm/skills/__init__.py +21 -0
  207. claude_mpm/skills/bundled/__init__.py +6 -0
  208. claude_mpm/skills/bundled/api-documentation.md +393 -0
  209. claude_mpm/skills/bundled/async-testing.md +571 -0
  210. claude_mpm/skills/bundled/code-review.md +143 -0
  211. claude_mpm/skills/bundled/database-migration.md +199 -0
  212. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  213. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  214. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  215. claude_mpm/skills/bundled/git-workflow.md +414 -0
  216. claude_mpm/skills/bundled/imagemagick.md +204 -0
  217. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  218. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  219. claude_mpm/skills/bundled/pdf.md +141 -0
  220. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  221. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  222. claude_mpm/skills/bundled/security-scanning.md +327 -0
  223. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  224. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  225. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  226. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  227. claude_mpm/skills/bundled/xlsx.md +157 -0
  228. claude_mpm/skills/registry.py +286 -0
  229. claude_mpm/skills/skill_manager.py +310 -0
  230. claude_mpm/tools/code_tree_analyzer.py +177 -141
  231. claude_mpm/tools/code_tree_events.py +4 -2
  232. claude_mpm/utils/agent_dependency_loader.py +2 -2
  233. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
  234. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
  235. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  236. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  237. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  238. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  239. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  240. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  241. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  242. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  243. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  244. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  245. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  246. claude_mpm/services/project/analyzer_refactored.py +0 -450
  247. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  248. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  249. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  250. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,430 @@
1
+ """
2
+ Health Check Manager for Claude MPM Framework
3
+ ==============================================
4
+
5
+ WHY: Orchestrates multiple health check types, provides background monitoring,
6
+ and maintains historical health data for local deployments.
7
+
8
+ DESIGN DECISION: Uses background daemon thread for continuous monitoring with
9
+ configurable check intervals. Aggregates results from all health check types
10
+ using defined priority rules.
11
+
12
+ ARCHITECTURE:
13
+ - Orchestrates HTTP, process, and resource health checks
14
+ - Background monitoring thread with configurable interval (default: 30s)
15
+ - Thread-safe status tracking with threading.Lock
16
+ - Historical health data (last 100 checks per deployment)
17
+ - Health status aggregation with priority:
18
+ 1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
19
+ 2. Any check UNHEALTHY = Deployment DEGRADED (service issues)
20
+ 3. All checks HEALTHY = Deployment HEALTHY
21
+ 4. Otherwise = UNKNOWN
22
+ - Event callbacks for status changes
23
+
24
+ USAGE:
25
+ health_manager = HealthCheckManager(
26
+ process_manager=process_manager,
27
+ check_interval=30,
28
+ )
29
+ health_manager.start_monitoring()
30
+
31
+ # Check health on-demand
32
+ health = health_manager.check_health(deployment_id)
33
+
34
+ # Stop monitoring
35
+ health_manager.stop_monitoring()
36
+ """
37
+
38
+ import threading
39
+ from collections import defaultdict
40
+ from typing import Callable, Dict, List, Optional
41
+
42
+ from claude_mpm.core.enums import HealthStatus
43
+ from claude_mpm.services.core.base import SyncBaseService
44
+ from claude_mpm.services.core.interfaces.health import IHealthCheckManager
45
+ from claude_mpm.services.core.interfaces.process import ILocalProcessManager
46
+ from claude_mpm.services.core.models.health import (
47
+ DeploymentHealth,
48
+ HealthCheckResult,
49
+ )
50
+ from claude_mpm.services.local_ops.health_checks import (
51
+ HttpHealthCheck,
52
+ ProcessHealthCheck,
53
+ ResourceHealthCheck,
54
+ )
55
+
56
+
57
class HealthCheckManager(SyncBaseService, IHealthCheckManager):
    """
    Health check orchestration and monitoring service.

    WHY: Provides comprehensive health monitoring by coordinating multiple
    check types, maintaining historical data, and enabling background monitoring.

    Thread Safety: the health history, the callback registry and the
    monitoring flag are guarded by ``self._lock`` (a non-reentrant
    ``threading.Lock``). Status callbacks are always invoked with the lock
    released so that they may safely call back into this manager.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        check_interval: int = 30,
        history_limit: int = 100,
    ):
        """
        Initialize health check manager.

        Args:
            process_manager: Process manager for deployment lookup
            check_interval: Background check interval in seconds (default: 30)
            history_limit: Maximum historical entries per deployment (default: 100)
        """
        super().__init__("HealthCheckManager")
        self.process_manager = process_manager
        self.check_interval = check_interval
        self.history_limit = history_limit

        # Initialize health check implementations
        self.http_check = HttpHealthCheck(process_manager)
        self.process_check = ProcessHealthCheck(process_manager)
        self.resource_check = ResourceHealthCheck(process_manager)

        # Background monitoring state
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        self._lock = threading.Lock()

        # Health history: deployment_id -> List[DeploymentHealth] (oldest first)
        self._health_history: Dict[str, List[DeploymentHealth]] = defaultdict(list)

        # Status change callbacks
        self._status_callbacks: List[Callable] = []

    def initialize(self) -> bool:
        """
        Initialize the health check manager.

        Initializes the HTTP, process and resource checks in that order and
        stops at the first one that reports failure.

        Returns:
            True if initialization successful, False if any check failed to
            initialize or an exception was raised
        """
        try:
            # Initialize all health check implementations
            if not self.http_check.initialize():
                self.log_error("Failed to initialize HTTP health check")
                return False

            if not self.process_check.initialize():
                self.log_error("Failed to initialize process health check")
                return False

            if not self.resource_check.initialize():
                self.log_error("Failed to initialize resource health check")
                return False

            self._initialized = True
            self.log_info("Health check manager initialized")
            return True

        except Exception as e:
            self.log_error(f"Failed to initialize: {e}")
            return False

    def shutdown(self) -> None:
        """Shutdown health check manager and stop monitoring."""
        if self._monitoring:
            self.stop_monitoring()

        # Shutdown health check implementations
        self.http_check.shutdown()
        self.process_check.shutdown()
        self.resource_check.shutdown()

        self._shutdown = True
        self.log_info("Health check manager shutdown complete")

    def check_health(self, deployment_id: str, **kwargs) -> DeploymentHealth:
        """
        Execute all health checks for a deployment.

        The result is appended to the deployment's history, and registered
        status callbacks are fired when the overall status differs from the
        previously recorded one.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters passed to health checks:
                - endpoint: HTTP endpoint URL
                - timeout: HTTP timeout in seconds
                - cpu_threshold: CPU usage threshold percentage
                - memory_threshold_mb: Memory usage threshold in MB

        Returns:
            DeploymentHealth with aggregated status and check results

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Execute all health checks; a check that raises is recorded as UNKNOWN
        # instead of aborting the remaining checks.
        checks: List[HealthCheckResult] = []

        # 1. Process health check (most critical)
        try:
            checks.append(self.process_check.check(deployment_id, **kwargs))
        except Exception as e:
            self.log_error(f"Process health check failed: {e}")
            checks.append(self._failed_check_result("process", e))

        # 2. Resource health check
        try:
            checks.append(self.resource_check.check(deployment_id, **kwargs))
        except Exception as e:
            self.log_error(f"Resource health check failed: {e}")
            checks.append(self._failed_check_result("resource", e))

        # 3. HTTP health check (optional, only if endpoint configured)
        try:
            http_result = self.http_check.check(deployment_id, **kwargs)
            # Only add if check was actually performed (not UNKNOWN due to no endpoint)
            if http_result.status != HealthStatus.UNKNOWN or kwargs.get("endpoint"):
                checks.append(http_result)
        except Exception as e:
            self.log_error(f"HTTP health check failed: {e}")
            checks.append(self._failed_check_result("http", e))

        # Aggregate health status
        overall_status = self._aggregate_health_status(checks)

        # Create deployment health
        deployment_health = DeploymentHealth(
            deployment_id=deployment_id,
            overall_status=overall_status,
            checks=checks,
        )

        # Update health history
        with self._lock:
            history = self._health_history[deployment_id]
            # Capture the previous entry BEFORE appending/trimming so that
            # change detection still works when history_limit is 1.
            previous_health = history[-1] if history else None
            history.append(deployment_health)
            # Trim by explicit count: a `[-limit:]` slice keeps everything
            # when limit is 0 because `-0 == 0`.
            overflow = len(history) - self.history_limit
            if overflow > 0:
                del history[:overflow]

        # Fire callbacks only after releasing the lock: threading.Lock is not
        # reentrant, so a callback calling get_health_history(),
        # is_monitoring() or register_status_callback() would deadlock if it
        # were invoked while the lock is held.
        if (
            previous_health is not None
            and previous_health.overall_status != overall_status
        ):
            self._trigger_status_callbacks(
                deployment_id, previous_health.overall_status, overall_status
            )

        return deployment_health

    def start_monitoring(self) -> None:
        """
        Start background health monitoring.

        WHY: Enables continuous health tracking without manual polling.
        Creates a daemon thread that performs periodic checks. Calling this
        while monitoring is already active only logs a warning.
        """
        with self._lock:
            if self._monitoring:
                self.log_warning("Health monitoring already running")
                return

            self._monitoring = True
            self._stop_event.clear()

            # Create and start monitoring thread
            self._monitor_thread = threading.Thread(
                target=self._monitor_loop, daemon=True, name="HealthMonitorThread"
            )
            self._monitor_thread.start()

            self.log_info(
                f"Started health monitoring with {self.check_interval}s interval"
            )

    def stop_monitoring(self) -> None:
        """
        Stop background health monitoring.

        WHY: Gracefully stops the monitoring thread and releases resources.
        Waits up to 5 seconds for the thread to exit; does nothing if
        monitoring is not active.
        """
        with self._lock:
            if not self._monitoring:
                return

            self._monitoring = False
            self._stop_event.set()

        # Wait for monitoring thread to stop. This happens outside the lock
        # because the monitor thread acquires it inside check_health().
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=5.0)

        self.log_info("Stopped health monitoring")

    def is_monitoring(self) -> bool:
        """
        Check if background monitoring is active.

        Returns:
            True if monitoring has been started and not yet stopped
        """
        with self._lock:
            return self._monitoring

    def get_health_history(
        self, deployment_id: str, limit: int = 10
    ) -> List[DeploymentHealth]:
        """
        Get historical health check results for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            limit: Maximum number of historical entries to return; zero or a
                negative value yields an empty list

        Returns:
            List of DeploymentHealth objects, newest first (a new list; the
            internal history is not exposed)
        """
        # Guard explicitly: `history[-0:]` is the whole list, not an empty one.
        if limit <= 0:
            return []

        with self._lock:
            # .get() avoids creating an empty defaultdict entry for unknown ids
            history = self._health_history.get(deployment_id, [])
            return list(reversed(history[-limit:]))

    def register_status_callback(
        self, callback: Callable[[str, HealthStatus, HealthStatus], None]
    ) -> None:
        """
        Register a callback for health status changes.

        Args:
            callback: Function called with (deployment_id, old_status, new_status)
        """
        with self._lock:
            self._status_callbacks.append(callback)
        self.log_debug(
            f"Registered status callback: {self._callback_name(callback)}"
        )

    def _monitor_loop(self) -> None:
        """
        Background monitoring loop.

        WHY: Runs in a separate thread to perform periodic health checks
        on all deployments returned by the state manager, until the stop
        event is set.
        """
        self.log_debug("Health monitoring loop started")

        while not self._stop_event.is_set():
            try:
                # Get all active deployments
                deployments = self.process_manager.state_manager.get_all_deployments()

                for deployment in deployments:
                    if self._stop_event.is_set():
                        break

                    try:
                        # Perform health check
                        self.check_health(deployment.deployment_id)
                    except Exception as e:
                        self.log_error(
                            f"Error checking health for {deployment.deployment_id}: {e}"
                        )

                # Sleep until next check interval
                # Use wait() instead of sleep() for faster shutdown response
                self._stop_event.wait(timeout=self.check_interval)

            except Exception as e:
                self.log_error(f"Error in health monitoring loop: {e}")
                # Don't crash the thread, just continue
                self._stop_event.wait(timeout=1.0)

        self.log_debug("Health monitoring loop stopped")

    def _aggregate_health_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """
        Aggregate health status from multiple check results.

        WHY: Combines results from different check types using priority rules
        to determine overall deployment health.

        Priority Rules:
            1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
            2. Any check UNHEALTHY = Deployment DEGRADED (service issues but process alive)
            3. Any check DEGRADED = Deployment DEGRADED
            4. All checks HEALTHY = Deployment HEALTHY
            5. Otherwise (including an empty list) = UNKNOWN

        Args:
            checks: List of health check results

        Returns:
            Aggregated HealthStatus
        """
        if not checks:
            return HealthStatus.UNKNOWN

        # Get process check result (most critical)
        process_check = next((c for c in checks if c.check_type == "process"), None)

        # Process UNHEALTHY = Deployment UNHEALTHY
        if process_check and process_check.status == HealthStatus.UNHEALTHY:
            return HealthStatus.UNHEALTHY

        # Any check UNHEALTHY (but process alive) = DEGRADED
        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
            return HealthStatus.DEGRADED

        # A DEGRADED check propagates to the deployment
        if any(c.status == HealthStatus.DEGRADED for c in checks):
            return HealthStatus.DEGRADED

        # All checks HEALTHY = Deployment HEALTHY
        if all(c.status == HealthStatus.HEALTHY for c in checks):
            return HealthStatus.HEALTHY

        # Otherwise = UNKNOWN
        return HealthStatus.UNKNOWN

    def _trigger_status_callbacks(
        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
    ) -> None:
        """
        Trigger registered callbacks for status changes.

        Must be called WITHOUT ``self._lock`` held: the lock is taken briefly
        to snapshot the callback list and released before any callback runs.
        An exception raised by one callback is logged and does not prevent
        the remaining callbacks from running.

        Args:
            deployment_id: Deployment that changed status
            old_status: Previous health status
            new_status: New health status
        """
        with self._lock:
            callbacks = list(self._status_callbacks)

        for callback in callbacks:
            try:
                callback(deployment_id, old_status, new_status)
            except Exception as e:
                self.log_error(
                    f"Error in status callback {self._callback_name(callback)}: {e}"
                )

    @staticmethod
    def _failed_check_result(check_type: str, error: Exception) -> HealthCheckResult:
        """Build the UNKNOWN result recorded when a health check raises."""
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=check_type,
            message=f"Check failed: {error}",
            details={"error": str(error)},
        )

    @staticmethod
    def _callback_name(callback: Callable) -> str:
        """Return a printable name for a callback, even one without ``__name__``."""
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so logging never raises AttributeError.
        return getattr(callback, "__name__", None) or repr(callback)
428
+
429
+
430
+ __all__ = ["HealthCheckManager"]