claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  3. claude_mpm/agents/BASE_PM.md +48 -17
  4. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  5. claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
  6. claude_mpm/agents/agent_loader.py +17 -5
  7. claude_mpm/agents/frontmatter_validator.py +284 -253
  8. claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
  9. claude_mpm/agents/templates/api_qa.json +7 -1
  10. claude_mpm/agents/templates/clerk-ops.json +8 -1
  11. claude_mpm/agents/templates/code_analyzer.json +4 -1
  12. claude_mpm/agents/templates/dart_engineer.json +11 -1
  13. claude_mpm/agents/templates/data_engineer.json +11 -1
  14. claude_mpm/agents/templates/documentation.json +6 -1
  15. claude_mpm/agents/templates/engineer.json +18 -1
  16. claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
  17. claude_mpm/agents/templates/golang_engineer.json +11 -1
  18. claude_mpm/agents/templates/java_engineer.json +12 -2
  19. claude_mpm/agents/templates/local_ops_agent.json +1217 -6
  20. claude_mpm/agents/templates/nextjs_engineer.json +11 -1
  21. claude_mpm/agents/templates/ops.json +8 -1
  22. claude_mpm/agents/templates/php-engineer.json +11 -1
  23. claude_mpm/agents/templates/project_organizer.json +10 -3
  24. claude_mpm/agents/templates/prompt-engineer.json +5 -1
  25. claude_mpm/agents/templates/python_engineer.json +11 -1
  26. claude_mpm/agents/templates/qa.json +7 -1
  27. claude_mpm/agents/templates/react_engineer.json +11 -1
  28. claude_mpm/agents/templates/refactoring_engineer.json +8 -1
  29. claude_mpm/agents/templates/research.json +4 -1
  30. claude_mpm/agents/templates/ruby-engineer.json +11 -1
  31. claude_mpm/agents/templates/rust_engineer.json +11 -1
  32. claude_mpm/agents/templates/security.json +6 -1
  33. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  34. claude_mpm/agents/templates/ticketing.json +6 -1
  35. claude_mpm/agents/templates/typescript_engineer.json +11 -1
  36. claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
  37. claude_mpm/agents/templates/version_control.json +8 -1
  38. claude_mpm/agents/templates/web_qa.json +7 -1
  39. claude_mpm/agents/templates/web_ui.json +11 -1
  40. claude_mpm/cli/__init__.py +34 -706
  41. claude_mpm/cli/commands/agent_manager.py +25 -12
  42. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  43. claude_mpm/cli/commands/agents.py +204 -148
  44. claude_mpm/cli/commands/aggregate.py +7 -3
  45. claude_mpm/cli/commands/analyze.py +9 -4
  46. claude_mpm/cli/commands/analyze_code.py +7 -2
  47. claude_mpm/cli/commands/auto_configure.py +7 -9
  48. claude_mpm/cli/commands/config.py +47 -13
  49. claude_mpm/cli/commands/configure.py +294 -1788
  50. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  51. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  52. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  53. claude_mpm/cli/commands/configure_models.py +18 -0
  54. claude_mpm/cli/commands/configure_navigation.py +167 -0
  55. claude_mpm/cli/commands/configure_paths.py +104 -0
  56. claude_mpm/cli/commands/configure_persistence.py +254 -0
  57. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  58. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  59. claude_mpm/cli/commands/configure_validators.py +73 -0
  60. claude_mpm/cli/commands/local_deploy.py +537 -0
  61. claude_mpm/cli/commands/memory.py +54 -20
  62. claude_mpm/cli/commands/mpm_init.py +39 -25
  63. claude_mpm/cli/commands/mpm_init_handler.py +8 -3
  64. claude_mpm/cli/executor.py +202 -0
  65. claude_mpm/cli/helpers.py +105 -0
  66. claude_mpm/cli/interactive/__init__.py +3 -0
  67. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  68. claude_mpm/cli/parsers/__init__.py +7 -1
  69. claude_mpm/cli/parsers/base_parser.py +98 -3
  70. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  71. claude_mpm/cli/shared/output_formatters.py +28 -19
  72. claude_mpm/cli/startup.py +481 -0
  73. claude_mpm/cli/utils.py +52 -1
  74. claude_mpm/commands/mpm-help.md +3 -0
  75. claude_mpm/commands/mpm-version.md +113 -0
  76. claude_mpm/commands/mpm.md +1 -0
  77. claude_mpm/config/agent_config.py +2 -2
  78. claude_mpm/config/model_config.py +428 -0
  79. claude_mpm/core/base_service.py +13 -12
  80. claude_mpm/core/enums.py +452 -0
  81. claude_mpm/core/factories.py +1 -1
  82. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  83. claude_mpm/core/interactive_session.py +9 -3
  84. claude_mpm/core/logging_config.py +6 -2
  85. claude_mpm/core/oneshot_session.py +8 -4
  86. claude_mpm/core/optimized_agent_loader.py +3 -3
  87. claude_mpm/core/output_style_manager.py +12 -192
  88. claude_mpm/core/service_registry.py +5 -1
  89. claude_mpm/core/types.py +2 -9
  90. claude_mpm/core/typing_utils.py +7 -6
  91. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  92. claude_mpm/dashboard/templates/index.html +3 -41
  93. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  94. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  95. claude_mpm/models/resume_log.py +340 -0
  96. claude_mpm/services/agents/auto_config_manager.py +10 -11
  97. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  98. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  99. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  100. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  101. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  102. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  103. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  104. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  105. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  106. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  107. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  108. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  109. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  110. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  111. claude_mpm/services/agents/local_template_manager.py +1 -1
  112. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  113. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  114. claude_mpm/services/command_handler_service.py +11 -5
  115. claude_mpm/services/core/interfaces/__init__.py +74 -2
  116. claude_mpm/services/core/interfaces/health.py +172 -0
  117. claude_mpm/services/core/interfaces/model.py +281 -0
  118. claude_mpm/services/core/interfaces/process.py +372 -0
  119. claude_mpm/services/core/interfaces/restart.py +307 -0
  120. claude_mpm/services/core/interfaces/stability.py +260 -0
  121. claude_mpm/services/core/models/__init__.py +33 -0
  122. claude_mpm/services/core/models/agent_config.py +12 -28
  123. claude_mpm/services/core/models/health.py +162 -0
  124. claude_mpm/services/core/models/process.py +235 -0
  125. claude_mpm/services/core/models/restart.py +302 -0
  126. claude_mpm/services/core/models/stability.py +264 -0
  127. claude_mpm/services/core/path_resolver.py +23 -7
  128. claude_mpm/services/diagnostics/__init__.py +2 -2
  129. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  130. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  131. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  132. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  133. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  134. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  135. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  136. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  137. claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
  138. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  139. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  140. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  141. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  142. claude_mpm/services/diagnostics/models.py +19 -24
  143. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  144. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  145. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  146. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  147. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  148. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  149. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  150. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  151. claude_mpm/services/local_ops/__init__.py +163 -0
  152. claude_mpm/services/local_ops/crash_detector.py +257 -0
  153. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  154. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  155. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  156. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  157. claude_mpm/services/local_ops/health_manager.py +430 -0
  158. claude_mpm/services/local_ops/log_monitor.py +396 -0
  159. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  160. claude_mpm/services/local_ops/process_manager.py +595 -0
  161. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  162. claude_mpm/services/local_ops/restart_manager.py +401 -0
  163. claude_mpm/services/local_ops/restart_policy.py +387 -0
  164. claude_mpm/services/local_ops/state_manager.py +372 -0
  165. claude_mpm/services/local_ops/unified_manager.py +600 -0
  166. claude_mpm/services/mcp_config_manager.py +9 -4
  167. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  168. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  169. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
  170. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  171. claude_mpm/services/memory_hook_service.py +4 -1
  172. claude_mpm/services/model/__init__.py +147 -0
  173. claude_mpm/services/model/base_provider.py +365 -0
  174. claude_mpm/services/model/claude_provider.py +412 -0
  175. claude_mpm/services/model/model_router.py +453 -0
  176. claude_mpm/services/model/ollama_provider.py +415 -0
  177. claude_mpm/services/monitor/daemon_manager.py +3 -2
  178. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  179. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  180. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  181. claude_mpm/services/monitor/server.py +2 -1
  182. claude_mpm/services/session_management_service.py +3 -2
  183. claude_mpm/services/session_manager.py +205 -1
  184. claude_mpm/services/shared/async_service_base.py +16 -27
  185. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  186. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  187. claude_mpm/services/socketio/handlers/hook.py +13 -2
  188. claude_mpm/services/socketio/handlers/registry.py +4 -2
  189. claude_mpm/services/socketio/server/main.py +10 -8
  190. claude_mpm/services/subprocess_launcher_service.py +14 -5
  191. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  192. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  193. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  194. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  195. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  196. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  197. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  198. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  199. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  200. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  201. claude_mpm/services/unified/interfaces.py +3 -1
  202. claude_mpm/services/unified/unified_analyzer.py +14 -10
  203. claude_mpm/services/unified/unified_config.py +2 -1
  204. claude_mpm/services/unified/unified_deployment.py +9 -4
  205. claude_mpm/services/version_service.py +104 -1
  206. claude_mpm/skills/__init__.py +21 -0
  207. claude_mpm/skills/bundled/__init__.py +6 -0
  208. claude_mpm/skills/bundled/api-documentation.md +393 -0
  209. claude_mpm/skills/bundled/async-testing.md +571 -0
  210. claude_mpm/skills/bundled/code-review.md +143 -0
  211. claude_mpm/skills/bundled/database-migration.md +199 -0
  212. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  213. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  214. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  215. claude_mpm/skills/bundled/git-workflow.md +414 -0
  216. claude_mpm/skills/bundled/imagemagick.md +204 -0
  217. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  218. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  219. claude_mpm/skills/bundled/pdf.md +141 -0
  220. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  221. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  222. claude_mpm/skills/bundled/security-scanning.md +327 -0
  223. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  224. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  225. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  226. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  227. claude_mpm/skills/bundled/xlsx.md +157 -0
  228. claude_mpm/skills/registry.py +286 -0
  229. claude_mpm/skills/skill_manager.py +310 -0
  230. claude_mpm/tools/code_tree_analyzer.py +177 -141
  231. claude_mpm/tools/code_tree_events.py +4 -2
  232. claude_mpm/utils/agent_dependency_loader.py +2 -2
  233. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
  234. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
  235. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  236. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  237. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  238. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  239. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  240. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  241. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  242. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  243. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  244. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  245. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  246. claude_mpm/services/project/analyzer_refactored.py +0 -450
  247. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  248. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  249. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  250. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,401 @@
1
+ """
2
+ Restart Manager for Claude MPM Framework
3
+ =========================================
4
+
5
+ WHY: Orchestrates the complete restart workflow including crash detection,
6
+ policy evaluation, process restart, and health verification.
7
+
8
+ DESIGN DECISION: Integrates all restart components (CrashDetector,
9
+ RestartPolicy, ProcessManager, HealthCheckManager) to provide automatic
10
+ and manual restart operations with proper verification.
11
+
12
+ ARCHITECTURE:
13
+ - Auto-restart workflow:
14
+ 1. CrashDetector detects crash → triggers callback
15
+ 2. RestartManager checks policy (max attempts, circuit breaker)
16
+ 3. Calculate and wait for backoff period
17
+ 4. Execute restart via ProcessManager
18
+ 5. Wait for health check verification
19
+ 6. Record attempt and update circuit breaker
20
+ - Manual restart: bypasses some policy checks
21
+ - Thread-safe operations with proper locking
22
+ - State persistence for restart history
23
+
24
+ USAGE:
25
+ config = RestartConfig(max_attempts=5, circuit_breaker_threshold=3)
26
+ restart_manager = RestartManager(
27
+ process_manager=process_manager,
28
+ health_manager=health_manager,
29
+ crash_detector=crash_detector,
30
+ restart_policy=restart_policy
31
+ )
32
+
33
+ # Enable auto-restart
34
+ restart_manager.enable_auto_restart(deployment_id)
35
+
36
+ # Manual restart
37
+ success = restart_manager.restart_deployment(deployment_id, manual=True)
38
+ """
39
+
40
+ import json
41
+ import threading
42
+ import time
43
+ from pathlib import Path
44
+ from typing import Optional, Set
45
+
46
+ from claude_mpm.core.enums import HealthStatus
47
+ from claude_mpm.services.core.base import SyncBaseService
48
+ from claude_mpm.services.core.interfaces.health import IHealthCheckManager
49
+ from claude_mpm.services.core.interfaces.process import ILocalProcessManager
50
+ from claude_mpm.services.core.interfaces.restart import (
51
+ ICrashDetector,
52
+ IRestartManager,
53
+ IRestartPolicy,
54
+ )
55
+ from claude_mpm.services.core.models.restart import RestartHistory
56
+
57
+
58
class RestartManager(SyncBaseService, IRestartManager):
    """
    Orchestrates automatic and manual restart operations.

    WHY: Provides complete restart workflow by coordinating crash detection,
    policy evaluation, process restart, and health verification.

    Thread Safety: The auto-restart and in-progress sets are guarded by
    ``_lock``; writes of the history file are serialized by
    ``_history_io_lock``.
    """

    # Default pause (seconds) between restarting a process and probing its
    # health, giving the new process a moment to initialize.
    DEFAULT_HEALTH_CHECK_DELAY = 5.0

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        health_manager: IHealthCheckManager,
        crash_detector: ICrashDetector,
        restart_policy: IRestartPolicy,
        state_dir: Optional[Path] = None,
        health_check_delay: float = DEFAULT_HEALTH_CHECK_DELAY,
    ):
        """
        Initialize restart manager.

        Args:
            process_manager: Process manager for restart operations
            health_manager: Health check manager for verification
            crash_detector: Crash detector for automatic restarts
            restart_policy: Restart policy for decision making
            state_dir: Directory for restart history persistence
                (defaults to ``~/.claude-mpm``; created if missing)
            health_check_delay: Seconds to wait after a restart before the
                health check is run; values <= 0 skip the wait
        """
        super().__init__("RestartManager")
        self.process_manager = process_manager
        self.health_manager = health_manager
        self.crash_detector = crash_detector
        self.restart_policy = restart_policy
        self.health_check_delay = float(health_check_delay)

        # State persistence
        if state_dir is None:
            state_dir = Path.home() / ".claude-mpm"
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.history_file = self.state_dir / "restart-history.json"

        # Auto-restart tracking
        self._lock = threading.Lock()
        self._auto_restart_enabled: Set[str] = set()

        # In-progress restart tracking (prevent concurrent restarts)
        self._restart_in_progress: Set[str] = set()

        # Serializes history-file writes: restarts of *different* deployments
        # may finish concurrently and each persists the full history.
        self._history_io_lock = threading.Lock()

    def initialize(self) -> bool:
        """
        Initialize the restart manager.

        Registers the crash callback with the crash detector and loads any
        persisted restart history.

        Returns:
            True if initialization successful
        """
        self.logger.info("Initializing RestartManager")

        # Register crash callback
        self.crash_detector.register_crash_callback(self._on_crash_detected)

        # Load restart history from disk
        self._load_restart_history()

        self.logger.info("RestartManager initialized successfully")
        return True

    def enable_auto_restart(self, deployment_id: str) -> None:
        """
        Enable automatic restarts for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Raises:
            ValueError: If deployment_id not found
        """
        # Verify deployment exists
        deployment = self.process_manager.get_status(deployment_id)
        if deployment is None:
            raise ValueError(f"Deployment not found: {deployment_id}")

        with self._lock:
            if deployment_id in self._auto_restart_enabled:
                self.logger.debug(f"Auto-restart already enabled for {deployment_id}")
                return

            # Enable auto-restart
            self._auto_restart_enabled.add(deployment_id)

            # Start crash monitoring
            # NOTE(review): _lock is not re-entrant; this assumes the detector
            # never invokes the crash callback synchronously from
            # start_monitoring() — confirm against the detector implementation.
            self.crash_detector.start_monitoring(deployment_id)

        self.logger.info(f"Enabled auto-restart for deployment: {deployment_id}")

    def disable_auto_restart(self, deployment_id: str) -> None:
        """
        Disable automatic restarts for a deployment.

        Monitoring is stopped even if auto-restart was not enabled for the
        deployment.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._auto_restart_enabled.discard(deployment_id)

            # Stop crash monitoring
            self.crash_detector.stop_monitoring(deployment_id)

        self.logger.info(f"Disabled auto-restart for deployment: {deployment_id}")

    def is_auto_restart_enabled(self, deployment_id: str) -> bool:
        """
        Check if auto-restart is enabled for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if auto-restart is enabled
        """
        with self._lock:
            return deployment_id in self._auto_restart_enabled

    def restart_deployment(self, deployment_id: str, manual: bool = False) -> bool:
        """
        Restart a deployment (manual or automatic trigger).

        Args:
            deployment_id: Unique deployment identifier
            manual: If True, bypass the restart policy check (circuit
                breaker / max attempts) and the backoff wait

        Returns:
            True if the restart ran and the subsequent health check did not
            report UNHEALTHY. False if a restart for this deployment is
            already in progress, the policy blocked it, the restart or health
            check raised, or the deployment came back unhealthy.

        Note:
            Exceptions raised by the process restart or the health check are
            caught, logged and recorded as a failed attempt; they are not
            re-raised. Exceptions raised by the policy evaluation itself
            (``should_restart`` / ``calculate_backoff``) propagate.
        """
        with self._lock:
            # Check if restart already in progress
            if deployment_id in self._restart_in_progress:
                self.logger.warning(
                    f"Restart already in progress for {deployment_id}, skipping"
                )
                return False

            # Mark restart in progress
            self._restart_in_progress.add(deployment_id)

        # The lock is released here on purpose: the restart sleeps and does
        # blocking work, and the ``finally`` below must be able to re-acquire it.
        try:
            # Check restart policy (unless manual override)
            if not manual:
                if not self.restart_policy.should_restart(deployment_id):
                    self.logger.warning(
                        f"Restart policy blocked restart for {deployment_id}"
                    )
                    return False

                # Calculate and wait for backoff
                backoff = self.restart_policy.calculate_backoff(deployment_id)
                if backoff > 0:
                    self.logger.info(
                        f"Waiting {backoff:.1f}s backoff before restarting {deployment_id}"
                    )
                    time.sleep(backoff)

            # Execute restart
            self.logger.info(f"Restarting deployment: {deployment_id}")
            try:
                new_deployment = self.process_manager.restart(deployment_id)

                # Wait for initial health check
                self.logger.debug(
                    f"Waiting for health check verification for {deployment_id}"
                )
                if self.health_check_delay > 0:
                    # Brief wait for process to initialize
                    time.sleep(self.health_check_delay)

                # Verify health status (anything other than UNHEALTHY counts
                # as a successful restart)
                health = self.health_manager.check_health(new_deployment.deployment_id)
                success = health.overall_status != HealthStatus.UNHEALTHY

                if success:
                    self.logger.info(
                        f"Restart succeeded for {deployment_id}, "
                        f"health status: {health.overall_status.value}"
                    )
                else:
                    self.logger.warning(
                        f"Restart completed but deployment unhealthy: {deployment_id}"
                    )

                # Record attempt
                failure_reason = (
                    None
                    if success
                    else f"Health check failed: {health.overall_status.value}"
                )
                self.restart_policy.record_restart_attempt(
                    deployment_id, success, failure_reason
                )

                # Persist restart history
                self._save_restart_history()

                return success

            except Exception as e:
                self.logger.error(
                    f"Restart failed for {deployment_id}: {e}", exc_info=True
                )

                # Record failed attempt
                self.restart_policy.record_restart_attempt(
                    deployment_id, success=False, failure_reason=str(e)
                )

                # Persist restart history
                self._save_restart_history()

                return False

        finally:
            # Clear in-progress flag
            with self._lock:
                self._restart_in_progress.discard(deployment_id)

    def get_restart_history(self, deployment_id: str) -> Optional[RestartHistory]:
        """
        Get restart history for a deployment.

        Delegates to the restart policy.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            RestartHistory if found, None otherwise
        """
        return self.restart_policy.get_history(deployment_id)

    def clear_restart_history(self, deployment_id: str) -> None:
        """
        Clear restart history and reset circuit breaker.

        Resets the history held by the restart policy and persists the
        updated state to disk.

        Args:
            deployment_id: Unique deployment identifier
        """
        self.restart_policy.reset_restart_history(deployment_id)
        self._save_restart_history()
        self.logger.info(f"Cleared restart history for deployment: {deployment_id}")

    def _on_crash_detected(self, deployment_id: str, reason: str) -> None:
        """
        Handle crash detection callback.

        WHY: Invoked by CrashDetector when a crash is detected.
        Triggers automatic restart if enabled.

        Args:
            deployment_id: Unique deployment identifier
            reason: Reason for crash detection
        """
        self.logger.warning(f"Crash detected for {deployment_id}: {reason}")

        # Check if auto-restart is enabled
        with self._lock:
            if deployment_id not in self._auto_restart_enabled:
                self.logger.debug(
                    f"Auto-restart not enabled for {deployment_id}, ignoring crash"
                )
                return

        # Trigger automatic restart (outside the lock: restart_deployment
        # acquires the same non-reentrant lock itself)
        self.logger.info(f"Triggering automatic restart for {deployment_id}")
        self.restart_deployment(deployment_id, manual=False)

    def _load_restart_history(self) -> None:
        """
        Load restart history from disk.

        WHY: Persists restart state across service restarts to maintain
        circuit breaker state and attempt counts.

        Failures are logged, never raised. A single malformed entry is
        skipped so that it does not discard the remaining history.
        """
        if not self.history_file.exists():
            self.logger.debug("No restart history file found, starting fresh")
            return

        try:
            with self.history_file.open(encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load restart history: {e}", exc_info=True)
            return

        if not isinstance(data, dict):
            self.logger.error(
                "Failed to load restart history: "
                f"expected a JSON object, got {type(data).__name__}"
            )
            return

        # History is injected into the policy's internal ``_history`` mapping;
        # a policy that does not expose one simply cannot be pre-populated.
        history_store = getattr(self.restart_policy, "_history", None)
        if history_store is None:
            self.logger.debug(
                "Restart policy exposes no history store, skipping history load"
            )
            return

        loaded = 0
        for deployment_id, history_data in data.items():
            try:
                history_store[deployment_id] = RestartHistory.from_dict(history_data)
            except Exception as e:
                self.logger.warning(
                    f"Skipping invalid restart history for {deployment_id}: {e}"
                )
                continue
            loaded += 1

        self.logger.info(f"Loaded restart history for {loaded} deployments")

    def _save_restart_history(self) -> None:
        """
        Save restart history to disk.

        WHY: Persists restart state to maintain circuit breaker and
        attempt counts across service restarts.

        The file is written to a sibling temp file and then moved over the
        real one, so a crash or a concurrent save never leaves a truncated
        history file behind. Failures are logged, never raised.
        """
        try:
            with self._history_io_lock:
                # Collect all restart histories from restart policy. Snapshot
                # the items first so a concurrent update of the mapping cannot
                # break the iteration.
                history_store = getattr(self.restart_policy, "_history", None)
                snapshot = list(history_store.items()) if history_store is not None else []
                data = {
                    deployment_id: history.to_dict()
                    for deployment_id, history in snapshot
                }

                # Write to a temp file in the same directory, then replace.
                tmp_file = self.history_file.with_name(self.history_file.name + ".tmp")
                try:
                    with tmp_file.open("w", encoding="utf-8") as f:
                        json.dump(data, f, indent=2)
                    tmp_file.replace(self.history_file)
                finally:
                    # Only present if the dump or the replace failed.
                    if tmp_file.exists():
                        tmp_file.unlink()

            self.logger.debug(f"Saved restart history for {len(data)} deployments")

        except Exception as e:
            self.logger.error(f"Failed to save restart history: {e}", exc_info=True)

    def shutdown(self) -> bool:
        """
        Shutdown the restart manager.

        Persists restart history, stops crash monitoring for every deployment
        with auto-restart enabled, and clears all tracking state.

        Returns:
            True if shutdown successful
        """
        # Save restart history before shutdown
        self._save_restart_history()

        with self._lock:
            # Disable all auto-restarts
            for deployment_id in list(self._auto_restart_enabled):
                self.crash_detector.stop_monitoring(deployment_id)
            self._auto_restart_enabled.clear()
            self._restart_in_progress.clear()

        self.logger.info("RestartManager shutdown successfully")
        return True
399
+
400
+
401
+ __all__ = ["RestartManager"]