claude-mpm 4.7.4__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_AGENT_TEMPLATE.md +118 -0
  3. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  4. claude_mpm/agents/BASE_PM.md +106 -1
  5. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  6. claude_mpm/agents/PM_INSTRUCTIONS.md +397 -459
  7. claude_mpm/agents/agent_loader.py +17 -5
  8. claude_mpm/agents/frontmatter_validator.py +284 -253
  9. claude_mpm/agents/templates/README.md +465 -0
  10. claude_mpm/agents/templates/agent-manager.json +4 -1
  11. claude_mpm/agents/templates/agentic-coder-optimizer.json +13 -3
  12. claude_mpm/agents/templates/api_qa.json +11 -2
  13. claude_mpm/agents/templates/circuit_breakers.md +638 -0
  14. claude_mpm/agents/templates/clerk-ops.json +12 -2
  15. claude_mpm/agents/templates/code_analyzer.json +8 -2
  16. claude_mpm/agents/templates/content-agent.json +358 -0
  17. claude_mpm/agents/templates/dart_engineer.json +15 -2
  18. claude_mpm/agents/templates/data_engineer.json +15 -2
  19. claude_mpm/agents/templates/documentation.json +10 -2
  20. claude_mpm/agents/templates/engineer.json +21 -1
  21. claude_mpm/agents/templates/gcp_ops_agent.json +12 -2
  22. claude_mpm/agents/templates/git_file_tracking.md +584 -0
  23. claude_mpm/agents/templates/golang_engineer.json +270 -0
  24. claude_mpm/agents/templates/imagemagick.json +4 -1
  25. claude_mpm/agents/templates/java_engineer.json +346 -0
  26. claude_mpm/agents/templates/local_ops_agent.json +1227 -6
  27. claude_mpm/agents/templates/memory_manager.json +4 -1
  28. claude_mpm/agents/templates/nextjs_engineer.json +141 -133
  29. claude_mpm/agents/templates/ops.json +12 -2
  30. claude_mpm/agents/templates/php-engineer.json +270 -174
  31. claude_mpm/agents/templates/pm_examples.md +474 -0
  32. claude_mpm/agents/templates/pm_red_flags.md +240 -0
  33. claude_mpm/agents/templates/product_owner.json +338 -0
  34. claude_mpm/agents/templates/project_organizer.json +14 -4
  35. claude_mpm/agents/templates/prompt-engineer.json +13 -2
  36. claude_mpm/agents/templates/python_engineer.json +174 -81
  37. claude_mpm/agents/templates/qa.json +11 -2
  38. claude_mpm/agents/templates/react_engineer.json +16 -3
  39. claude_mpm/agents/templates/refactoring_engineer.json +12 -2
  40. claude_mpm/agents/templates/research.json +34 -21
  41. claude_mpm/agents/templates/response_format.md +583 -0
  42. claude_mpm/agents/templates/ruby-engineer.json +129 -192
  43. claude_mpm/agents/templates/rust_engineer.json +270 -0
  44. claude_mpm/agents/templates/security.json +10 -2
  45. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  46. claude_mpm/agents/templates/ticketing.json +10 -2
  47. claude_mpm/agents/templates/typescript_engineer.json +116 -125
  48. claude_mpm/agents/templates/validation_templates.md +312 -0
  49. claude_mpm/agents/templates/vercel_ops_agent.json +12 -2
  50. claude_mpm/agents/templates/version_control.json +12 -2
  51. claude_mpm/agents/templates/web_qa.json +11 -2
  52. claude_mpm/agents/templates/web_ui.json +15 -2
  53. claude_mpm/cli/__init__.py +34 -614
  54. claude_mpm/cli/commands/agent_manager.py +25 -12
  55. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  56. claude_mpm/cli/commands/agents.py +235 -148
  57. claude_mpm/cli/commands/agents_detect.py +380 -0
  58. claude_mpm/cli/commands/agents_recommend.py +309 -0
  59. claude_mpm/cli/commands/aggregate.py +7 -3
  60. claude_mpm/cli/commands/analyze.py +9 -4
  61. claude_mpm/cli/commands/analyze_code.py +7 -2
  62. claude_mpm/cli/commands/auto_configure.py +570 -0
  63. claude_mpm/cli/commands/config.py +47 -13
  64. claude_mpm/cli/commands/configure.py +419 -1571
  65. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  66. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  67. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  68. claude_mpm/cli/commands/configure_models.py +18 -0
  69. claude_mpm/cli/commands/configure_navigation.py +167 -0
  70. claude_mpm/cli/commands/configure_paths.py +104 -0
  71. claude_mpm/cli/commands/configure_persistence.py +254 -0
  72. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  73. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  74. claude_mpm/cli/commands/configure_validators.py +73 -0
  75. claude_mpm/cli/commands/local_deploy.py +537 -0
  76. claude_mpm/cli/commands/memory.py +54 -20
  77. claude_mpm/cli/commands/mpm_init.py +585 -196
  78. claude_mpm/cli/commands/mpm_init_handler.py +37 -3
  79. claude_mpm/cli/commands/search.py +170 -4
  80. claude_mpm/cli/commands/upgrade.py +152 -0
  81. claude_mpm/cli/executor.py +202 -0
  82. claude_mpm/cli/helpers.py +105 -0
  83. claude_mpm/cli/interactive/__init__.py +3 -0
  84. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  85. claude_mpm/cli/parsers/__init__.py +7 -1
  86. claude_mpm/cli/parsers/agents_parser.py +9 -0
  87. claude_mpm/cli/parsers/auto_configure_parser.py +245 -0
  88. claude_mpm/cli/parsers/base_parser.py +110 -3
  89. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  90. claude_mpm/cli/parsers/mpm_init_parser.py +65 -5
  91. claude_mpm/cli/shared/output_formatters.py +28 -19
  92. claude_mpm/cli/startup.py +481 -0
  93. claude_mpm/cli/utils.py +52 -1
  94. claude_mpm/commands/mpm-agents-detect.md +168 -0
  95. claude_mpm/commands/mpm-agents-recommend.md +214 -0
  96. claude_mpm/commands/mpm-agents.md +75 -1
  97. claude_mpm/commands/mpm-auto-configure.md +217 -0
  98. claude_mpm/commands/mpm-help.md +163 -0
  99. claude_mpm/commands/mpm-init.md +148 -3
  100. claude_mpm/commands/mpm-version.md +113 -0
  101. claude_mpm/commands/mpm.md +1 -0
  102. claude_mpm/config/agent_config.py +2 -2
  103. claude_mpm/config/model_config.py +428 -0
  104. claude_mpm/constants.py +1 -0
  105. claude_mpm/core/base_service.py +13 -12
  106. claude_mpm/core/enums.py +452 -0
  107. claude_mpm/core/factories.py +1 -1
  108. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  109. claude_mpm/core/interactive_session.py +9 -3
  110. claude_mpm/core/log_manager.py +2 -0
  111. claude_mpm/core/logging_config.py +6 -2
  112. claude_mpm/core/oneshot_session.py +8 -4
  113. claude_mpm/core/optimized_agent_loader.py +3 -3
  114. claude_mpm/core/output_style_manager.py +12 -192
  115. claude_mpm/core/service_registry.py +5 -1
  116. claude_mpm/core/types.py +2 -9
  117. claude_mpm/core/typing_utils.py +7 -6
  118. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  119. claude_mpm/dashboard/templates/index.html +3 -41
  120. claude_mpm/hooks/__init__.py +20 -0
  121. claude_mpm/hooks/claude_hooks/event_handlers.py +4 -2
  122. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  123. claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +23 -2
  124. claude_mpm/hooks/failure_learning/__init__.py +60 -0
  125. claude_mpm/hooks/failure_learning/failure_detection_hook.py +235 -0
  126. claude_mpm/hooks/failure_learning/fix_detection_hook.py +217 -0
  127. claude_mpm/hooks/failure_learning/learning_extraction_hook.py +286 -0
  128. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  129. claude_mpm/hooks/kuzu_enrichment_hook.py +263 -0
  130. claude_mpm/hooks/kuzu_memory_hook.py +37 -12
  131. claude_mpm/hooks/kuzu_response_hook.py +183 -0
  132. claude_mpm/models/resume_log.py +340 -0
  133. claude_mpm/services/agents/__init__.py +18 -5
  134. claude_mpm/services/agents/auto_config_manager.py +796 -0
  135. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  136. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  137. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  138. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  139. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  140. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  141. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  142. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  143. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  144. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  145. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  146. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  147. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  148. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  149. claude_mpm/services/agents/local_template_manager.py +1 -1
  150. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  151. claude_mpm/services/agents/observers.py +547 -0
  152. claude_mpm/services/agents/recommender.py +568 -0
  153. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  154. claude_mpm/services/command_handler_service.py +11 -5
  155. claude_mpm/services/core/__init__.py +33 -1
  156. claude_mpm/services/core/interfaces/__init__.py +90 -3
  157. claude_mpm/services/core/interfaces/agent.py +184 -0
  158. claude_mpm/services/core/interfaces/health.py +172 -0
  159. claude_mpm/services/core/interfaces/model.py +281 -0
  160. claude_mpm/services/core/interfaces/process.py +372 -0
  161. claude_mpm/services/core/interfaces/project.py +121 -0
  162. claude_mpm/services/core/interfaces/restart.py +307 -0
  163. claude_mpm/services/core/interfaces/stability.py +260 -0
  164. claude_mpm/services/core/memory_manager.py +11 -24
  165. claude_mpm/services/core/models/__init__.py +79 -0
  166. claude_mpm/services/core/models/agent_config.py +381 -0
  167. claude_mpm/services/core/models/health.py +162 -0
  168. claude_mpm/services/core/models/process.py +235 -0
  169. claude_mpm/services/core/models/restart.py +302 -0
  170. claude_mpm/services/core/models/stability.py +264 -0
  171. claude_mpm/services/core/models/toolchain.py +306 -0
  172. claude_mpm/services/core/path_resolver.py +23 -7
  173. claude_mpm/services/diagnostics/__init__.py +2 -2
  174. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  175. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  176. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  177. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  178. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  179. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  180. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  181. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  182. claude_mpm/services/diagnostics/checks/mcp_services_check.py +38 -33
  183. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  184. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  185. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  186. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  187. claude_mpm/services/diagnostics/models.py +19 -24
  188. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  189. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  190. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  191. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  192. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  193. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  194. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  195. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  196. claude_mpm/services/local_ops/__init__.py +163 -0
  197. claude_mpm/services/local_ops/crash_detector.py +257 -0
  198. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  199. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  200. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  201. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  202. claude_mpm/services/local_ops/health_manager.py +430 -0
  203. claude_mpm/services/local_ops/log_monitor.py +396 -0
  204. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  205. claude_mpm/services/local_ops/process_manager.py +595 -0
  206. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  207. claude_mpm/services/local_ops/restart_manager.py +401 -0
  208. claude_mpm/services/local_ops/restart_policy.py +387 -0
  209. claude_mpm/services/local_ops/state_manager.py +372 -0
  210. claude_mpm/services/local_ops/unified_manager.py +600 -0
  211. claude_mpm/services/mcp_config_manager.py +9 -4
  212. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  213. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  214. claude_mpm/services/mcp_gateway/main.py +30 -0
  215. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +206 -32
  216. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  217. claude_mpm/services/mcp_gateway/tools/kuzu_memory_service.py +25 -5
  218. claude_mpm/services/mcp_service_verifier.py +1 -1
  219. claude_mpm/services/memory/failure_tracker.py +563 -0
  220. claude_mpm/services/memory_hook_service.py +165 -4
  221. claude_mpm/services/model/__init__.py +147 -0
  222. claude_mpm/services/model/base_provider.py +365 -0
  223. claude_mpm/services/model/claude_provider.py +412 -0
  224. claude_mpm/services/model/model_router.py +453 -0
  225. claude_mpm/services/model/ollama_provider.py +415 -0
  226. claude_mpm/services/monitor/daemon_manager.py +3 -2
  227. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  228. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  229. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  230. claude_mpm/services/monitor/server.py +2 -1
  231. claude_mpm/services/project/__init__.py +23 -0
  232. claude_mpm/services/project/detection_strategies.py +719 -0
  233. claude_mpm/services/project/toolchain_analyzer.py +581 -0
  234. claude_mpm/services/self_upgrade_service.py +342 -0
  235. claude_mpm/services/session_management_service.py +3 -2
  236. claude_mpm/services/session_manager.py +205 -1
  237. claude_mpm/services/shared/async_service_base.py +16 -27
  238. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  239. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  240. claude_mpm/services/socketio/handlers/hook.py +13 -2
  241. claude_mpm/services/socketio/handlers/registry.py +4 -2
  242. claude_mpm/services/socketio/server/main.py +10 -8
  243. claude_mpm/services/subprocess_launcher_service.py +14 -5
  244. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  245. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  246. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  247. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  248. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  249. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  250. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  251. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  252. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  253. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  254. claude_mpm/services/unified/interfaces.py +3 -1
  255. claude_mpm/services/unified/unified_analyzer.py +14 -10
  256. claude_mpm/services/unified/unified_config.py +2 -1
  257. claude_mpm/services/unified/unified_deployment.py +9 -4
  258. claude_mpm/services/version_service.py +104 -1
  259. claude_mpm/skills/__init__.py +21 -0
  260. claude_mpm/skills/bundled/__init__.py +6 -0
  261. claude_mpm/skills/bundled/api-documentation.md +393 -0
  262. claude_mpm/skills/bundled/async-testing.md +571 -0
  263. claude_mpm/skills/bundled/code-review.md +143 -0
  264. claude_mpm/skills/bundled/database-migration.md +199 -0
  265. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  266. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  267. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  268. claude_mpm/skills/bundled/git-workflow.md +414 -0
  269. claude_mpm/skills/bundled/imagemagick.md +204 -0
  270. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  271. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  272. claude_mpm/skills/bundled/pdf.md +141 -0
  273. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  274. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  275. claude_mpm/skills/bundled/security-scanning.md +327 -0
  276. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  277. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  278. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  279. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  280. claude_mpm/skills/bundled/xlsx.md +157 -0
  281. claude_mpm/skills/registry.py +286 -0
  282. claude_mpm/skills/skill_manager.py +310 -0
  283. claude_mpm/storage/state_storage.py +15 -15
  284. claude_mpm/tools/code_tree_analyzer.py +177 -141
  285. claude_mpm/tools/code_tree_events.py +4 -2
  286. claude_mpm/utils/agent_dependency_loader.py +40 -20
  287. claude_mpm/utils/display_helper.py +260 -0
  288. claude_mpm/utils/git_analyzer.py +407 -0
  289. claude_mpm/utils/robust_installer.py +73 -19
  290. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +129 -12
  291. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +295 -193
  292. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  293. claude_mpm/dashboard/static/index-hub-backup.html +0 -713
  294. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  295. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  296. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  297. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  298. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  299. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  300. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  301. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  302. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  303. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  304. claude_mpm/services/project/analyzer_refactored.py +0 -450
  305. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  306. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  307. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  308. {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,387 @@
1
+ """
2
+ Restart Policy for Claude MPM Framework
3
+ ========================================
4
+
5
+ WHY: Implements intelligent restart policies with exponential backoff,
6
+ max attempts, and circuit breaker patterns to prevent restart loops.
7
+
8
+ DESIGN DECISION: Uses exponential backoff with configurable parameters
9
+ and circuit breaker state transitions (CLOSED → OPEN → HALF_OPEN).
10
+ Tracks restart history per deployment for policy decisions.
11
+
12
+ ARCHITECTURE:
13
+ - Exponential backoff: initial * (multiplier ** (attempt - 2)) from attempt 2 onward; attempt 1 has no backoff
14
+ - Circuit breaker states: CLOSED, OPEN, HALF_OPEN
15
+ - Failure window tracking for circuit breaker trip detection
16
+ - Thread-safe restart history management
17
+
18
+ USAGE:
19
+ config = RestartConfig(
20
+ max_attempts=5,
21
+ initial_backoff_seconds=2.0,
22
+ circuit_breaker_threshold=3
23
+ )
24
+ policy = RestartPolicy(config)
25
+
26
+ if policy.should_restart(deployment_id):
27
+ backoff = policy.calculate_backoff(deployment_id)
28
+ time.sleep(backoff)
29
+ # Perform restart
30
+ policy.record_restart_attempt(deployment_id, success=True)
31
+ """
32
+
33
+ import threading
34
+ from datetime import datetime, timedelta, timezone
35
+ from typing import Dict, Optional
36
+
37
+ from claude_mpm.services.core.base import SyncBaseService
38
+ from claude_mpm.services.core.interfaces.restart import IRestartPolicy
39
+ from claude_mpm.services.core.models.restart import (
40
+ CircuitBreakerState,
41
+ RestartAttempt,
42
+ RestartConfig,
43
+ RestartHistory,
44
+ )
45
+
46
+
47
class RestartPolicy(SyncBaseService, IRestartPolicy):
    """
    Restart policy with exponential backoff and circuit breaker.

    WHY: Prevents restart loops through intelligent policy decisions.
    Implements exponential backoff to give services time to recover
    and circuit breaker to block restarts after repeated failures.

    Circuit breaker transitions implemented by this class:
    - any state -> OPEN when ``failure_count_in_window`` reaches
      ``config.circuit_breaker_threshold``.
    - OPEN -> HALF_OPEN once ``config.circuit_breaker_reset_seconds`` have
      elapsed since ``last_failure_window_start``. The cooldown is evaluated
      by ``should_restart()`` and when a failure is recorded while OPEN.
    - HALF_OPEN -> CLOSED on the next successful attempt.

    Thread Safety: every public method that reads or mutates the restart
    history holds ``self._lock``. The private ``_``-prefixed helpers do not
    take the lock themselves; they must be called with it already held
    (the lock is a non-reentrant ``threading.Lock``).
    """

    def __init__(self, config: RestartConfig):
        """
        Initialize restart policy.

        Args:
            config: Restart configuration
        """
        super().__init__("RestartPolicy")
        self.config = config
        self._lock = threading.Lock()

        # Restart history per deployment, keyed by deployment_id
        self._history: Dict[str, RestartHistory] = {}

    def initialize(self) -> bool:
        """
        Initialize the restart policy.

        Only logs the effective configuration; no state is created here.

        Returns:
            True if initialization successful
        """
        self.logger.info(
            f"Initializing RestartPolicy with config: "
            f"max_attempts={self.config.max_attempts}, "
            f"backoff={self.config.initial_backoff_seconds}s-{self.config.max_backoff_seconds}s, "
            f"circuit_breaker={self.config.circuit_breaker_threshold} failures"
        )
        return True

    def should_restart(self, deployment_id: str) -> bool:
        """
        Determine if a deployment should be restarted.

        A restart is refused when the circuit breaker is OPEN (and its
        cooldown has not elapsed yet) or when the recorded attempt count
        has reached ``config.max_attempts``.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if restart should proceed
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)

            # An OPEN breaker blocks every attempt, so no failure can be
            # recorded to trigger the cooldown check. Evaluate it here,
            # otherwise OPEN would be permanent until the history is reset.
            if history.circuit_breaker_state == CircuitBreakerState.OPEN:
                self._check_circuit_breaker_reset(
                    history, datetime.now(timezone.utc)
                )

            # Check circuit breaker state (still OPEN => cooldown pending)
            if history.circuit_breaker_state == CircuitBreakerState.OPEN:
                self.logger.warning(
                    f"Restart blocked for {deployment_id}: circuit breaker OPEN"
                )
                return False

            # Check max attempts
            attempt_count = history.get_attempt_count()
            if attempt_count >= self.config.max_attempts:
                self.logger.warning(
                    f"Restart blocked for {deployment_id}: "
                    f"max attempts reached ({attempt_count}/{self.config.max_attempts})"
                )
                return False

            # Allow restart
            self.logger.debug(
                f"Restart allowed for {deployment_id}: "
                f"attempt {attempt_count + 1}/{self.config.max_attempts}, "
                f"circuit breaker {history.circuit_breaker_state.value}"
            )
            return True

    def calculate_backoff(self, deployment_id: str) -> float:
        """
        Calculate backoff time in seconds for next restart.

        WHY: Implements exponential backoff. For attempt N (N >= 2),
        backoff = initial * (multiplier ^ (N-2)), capped at
        ``config.max_backoff_seconds``. Attempt 1 has no backoff (0),
        attempt 2 gets the initial backoff, etc.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Backoff time in seconds (0 if first attempt)
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            attempt_number = history.get_attempt_count() + 1

            # First attempt has no backoff (and is not logged)
            if attempt_number == 1:
                return 0.0

            backoff = self._compute_backoff(attempt_number)

            self.logger.debug(
                f"Calculated backoff for {deployment_id} "
                f"(attempt {attempt_number}): {backoff:.1f}s"
            )
            return backoff

    def record_restart_attempt(
        self, deployment_id: str, success: bool, failure_reason: Optional[str] = None
    ) -> None:
        """
        Record a restart attempt and update circuit breaker state.

        The attempt is stored with the backoff that applied to it and with
        ``started_at`` and ``completed_at`` both set to the time of
        recording (the actual restart duration is not known here).

        Args:
            deployment_id: Unique deployment identifier
            success: Whether restart succeeded
            failure_reason: Optional reason for failure
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            now = datetime.now(timezone.utc)

            # Number of the attempt being recorded (1-based)
            attempt_number = history.get_attempt_count() + 1

            # Create restart attempt record
            attempt = RestartAttempt(
                attempt_number=attempt_number,
                deployment_id=deployment_id,
                started_at=now,
                completed_at=now,
                success=success,
                failure_reason=failure_reason,
                backoff_seconds=self._compute_backoff(attempt_number),
            )

            # Add to history (prepend for newest-first ordering)
            history.attempts.insert(0, attempt)

            # Update circuit breaker based on result
            if success:
                self._handle_successful_restart(history)
            else:
                self._handle_failed_restart(history, now)

            self.logger.info(
                f"Recorded restart attempt for {deployment_id}: "
                f"attempt {attempt.attempt_number}, success={success}, "
                f"circuit breaker={history.circuit_breaker_state.value}"
            )

    def reset_restart_history(self, deployment_id: str) -> None:
        """
        Reset restart history for a deployment.

        Drops the stored history (attempts and circuit breaker state);
        unknown deployment ids are ignored silently.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            if deployment_id in self._history:
                del self._history[deployment_id]
                self.logger.info(
                    f"Reset restart history for deployment: {deployment_id}"
                )

    def get_circuit_breaker_state(self, deployment_id: str) -> str:
        """
        Get current circuit breaker state.

        Note: creates an empty history entry for unknown deployment ids.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Circuit breaker state value (CLOSED, OPEN, HALF_OPEN)
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            return history.circuit_breaker_state.value

    def get_restart_attempt_count(self, deployment_id: str) -> int:
        """
        Get number of restart attempts for a deployment.

        Note: creates an empty history entry for unknown deployment ids.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Number of restart attempts
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            return history.get_attempt_count()

    def get_history(self, deployment_id: str) -> Optional[RestartHistory]:
        """
        Get restart history for a deployment.

        The returned object is the live internal record, not a copy; it
        may change under the caller when further attempts are recorded.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            RestartHistory if exists, None otherwise
        """
        with self._lock:
            return self._history.get(deployment_id)

    def shutdown(self) -> bool:
        """
        Shutdown the restart policy.

        Discards the restart history of every deployment.

        Returns:
            True if shutdown successful
        """
        with self._lock:
            self._history.clear()
        self.logger.info("RestartPolicy shutdown successfully")
        return True

    def _compute_backoff(self, attempt_number: int) -> float:
        """
        Compute the capped exponential backoff for a 1-based attempt number.

        Must not log or lock: it is shared by calculate_backoff() and
        record_restart_attempt(), which both already hold the lock.

        Args:
            attempt_number: 1-based number of the attempt

        Returns:
            0.0 for the first attempt, otherwise
            initial * multiplier ** (attempt_number - 2) capped at
            ``config.max_backoff_seconds``
        """
        if attempt_number <= 1:
            return 0.0

        try:
            # attempt 2 = initial, attempt 3 = initial*multiplier, etc.
            backoff = self.config.initial_backoff_seconds * (
                self.config.backoff_multiplier ** (attempt_number - 2)
            )
        except OverflowError:
            # The exponential exceeded float range after very many recorded
            # attempts; the capped value is the configured maximum anyway.
            return self.config.max_backoff_seconds

        # Cap at max backoff
        return min(backoff, self.config.max_backoff_seconds)

    def _get_or_create_history(self, deployment_id: str) -> RestartHistory:
        """
        Get or create restart history for a deployment.

        Caller must hold ``self._lock``.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            RestartHistory instance
        """
        if deployment_id not in self._history:
            self._history[deployment_id] = RestartHistory(deployment_id=deployment_id)
        return self._history[deployment_id]

    def _handle_successful_restart(self, history: RestartHistory) -> None:
        """
        Handle successful restart attempt.

        WHY: Success transitions circuit breaker from HALF_OPEN -> CLOSED
        and resets failure window tracking.

        Args:
            history: Restart history to update
        """
        # Reset circuit breaker on success (only a HALF_OPEN probe closes it)
        if history.circuit_breaker_state == CircuitBreakerState.HALF_OPEN:
            history.circuit_breaker_state = CircuitBreakerState.CLOSED
            self.logger.info(
                f"Circuit breaker CLOSED for {history.deployment_id} after successful restart"
            )

        # Reset failure window
        history.failure_count_in_window = 0
        history.last_failure_window_start = None

    def _handle_failed_restart(self, history: RestartHistory, now: datetime) -> None:
        """
        Handle failed restart attempt.

        WHY: Tracks failures in time window and trips circuit breaker
        if threshold exceeded.

        Args:
            history: Restart history to update
            now: Current timestamp
        """
        window_start = history.last_failure_window_start
        if window_start is None:
            # First failure: open a fresh failure window
            history.last_failure_window_start = now
            history.failure_count_in_window = 1
        else:
            window_end = window_start + timedelta(
                seconds=self.config.circuit_breaker_window_seconds
            )
            if now <= window_end:
                # Still in window, increment count
                history.failure_count_in_window += 1
            else:
                # Window expired, start new window
                history.last_failure_window_start = now
                history.failure_count_in_window = 1

        # Check if we should trip the circuit breaker
        if history.failure_count_in_window >= self.config.circuit_breaker_threshold:
            if history.circuit_breaker_state != CircuitBreakerState.OPEN:
                history.circuit_breaker_state = CircuitBreakerState.OPEN
                self.logger.warning(
                    f"Circuit breaker OPEN for {history.deployment_id}: "
                    f"{history.failure_count_in_window} failures in "
                    f"{self.config.circuit_breaker_window_seconds}s window"
                )

        # Check if we should transition to HALF_OPEN
        elif history.circuit_breaker_state == CircuitBreakerState.OPEN:
            self._check_circuit_breaker_reset(history, now)

    def _check_circuit_breaker_reset(
        self, history: RestartHistory, now: datetime
    ) -> None:
        """
        Check if circuit breaker should reset to HALF_OPEN.

        WHY: After cooldown period, allow one restart attempt to test
        if service has recovered. The cooldown is measured from
        ``history.last_failure_window_start``.

        Args:
            history: Restart history to check
            now: Current timestamp
        """
        if history.last_failure_window_start is None:
            return

        # Calculate reset time
        reset_time = history.last_failure_window_start + timedelta(
            seconds=self.config.circuit_breaker_reset_seconds
        )

        # Check if cooldown period has elapsed
        if now >= reset_time:
            history.circuit_breaker_state = CircuitBreakerState.HALF_OPEN
            self.logger.info(
                f"Circuit breaker HALF_OPEN for {history.deployment_id} "
                f"after {self.config.circuit_breaker_reset_seconds}s cooldown"
            )
386
+
387
+ __all__ = ["RestartPolicy"]