claude-mpm 3.9.11__py3-none-any.whl → 4.0.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (419)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/__init__.py +2 -2
  3. claude_mpm/__main__.py +3 -2
  4. claude_mpm/agents/__init__.py +85 -79
  5. claude_mpm/agents/agent_loader.py +464 -1003
  6. claude_mpm/agents/agent_loader_integration.py +45 -45
  7. claude_mpm/agents/agents_metadata.py +29 -30
  8. claude_mpm/agents/async_agent_loader.py +156 -138
  9. claude_mpm/agents/base_agent.json +1 -1
  10. claude_mpm/agents/base_agent_loader.py +179 -151
  11. claude_mpm/agents/frontmatter_validator.py +229 -130
  12. claude_mpm/agents/schema/agent_schema.json +1 -1
  13. claude_mpm/agents/system_agent_config.py +213 -147
  14. claude_mpm/agents/templates/__init__.py +13 -13
  15. claude_mpm/agents/templates/code_analyzer.json +2 -2
  16. claude_mpm/agents/templates/data_engineer.json +1 -1
  17. claude_mpm/agents/templates/documentation.json +23 -11
  18. claude_mpm/agents/templates/engineer.json +22 -6
  19. claude_mpm/agents/templates/memory_manager.json +1 -1
  20. claude_mpm/agents/templates/ops.json +2 -2
  21. claude_mpm/agents/templates/project_organizer.json +1 -1
  22. claude_mpm/agents/templates/qa.json +1 -1
  23. claude_mpm/agents/templates/refactoring_engineer.json +222 -0
  24. claude_mpm/agents/templates/research.json +20 -14
  25. claude_mpm/agents/templates/security.json +1 -1
  26. claude_mpm/agents/templates/ticketing.json +1 -1
  27. claude_mpm/agents/templates/version_control.json +1 -1
  28. claude_mpm/agents/templates/web_qa.json +3 -1
  29. claude_mpm/agents/templates/web_ui.json +2 -2
  30. claude_mpm/cli/__init__.py +79 -51
  31. claude_mpm/cli/__main__.py +3 -2
  32. claude_mpm/cli/commands/__init__.py +20 -20
  33. claude_mpm/cli/commands/agents.py +279 -247
  34. claude_mpm/cli/commands/aggregate.py +138 -157
  35. claude_mpm/cli/commands/cleanup.py +147 -147
  36. claude_mpm/cli/commands/config.py +93 -76
  37. claude_mpm/cli/commands/info.py +17 -16
  38. claude_mpm/cli/commands/mcp.py +140 -905
  39. claude_mpm/cli/commands/mcp_command_router.py +139 -0
  40. claude_mpm/cli/commands/mcp_config_commands.py +20 -0
  41. claude_mpm/cli/commands/mcp_install_commands.py +20 -0
  42. claude_mpm/cli/commands/mcp_server_commands.py +175 -0
  43. claude_mpm/cli/commands/mcp_tool_commands.py +34 -0
  44. claude_mpm/cli/commands/memory.py +239 -203
  45. claude_mpm/cli/commands/monitor.py +203 -81
  46. claude_mpm/cli/commands/run.py +380 -429
  47. claude_mpm/cli/commands/run_config_checker.py +160 -0
  48. claude_mpm/cli/commands/socketio_monitor.py +235 -0
  49. claude_mpm/cli/commands/tickets.py +305 -197
  50. claude_mpm/cli/parser.py +24 -1156
  51. claude_mpm/cli/parsers/__init__.py +29 -0
  52. claude_mpm/cli/parsers/agents_parser.py +136 -0
  53. claude_mpm/cli/parsers/base_parser.py +331 -0
  54. claude_mpm/cli/parsers/config_parser.py +85 -0
  55. claude_mpm/cli/parsers/mcp_parser.py +152 -0
  56. claude_mpm/cli/parsers/memory_parser.py +138 -0
  57. claude_mpm/cli/parsers/monitor_parser.py +104 -0
  58. claude_mpm/cli/parsers/run_parser.py +147 -0
  59. claude_mpm/cli/parsers/tickets_parser.py +203 -0
  60. claude_mpm/cli/ticket_cli.py +7 -3
  61. claude_mpm/cli/utils.py +55 -37
  62. claude_mpm/cli_module/__init__.py +6 -6
  63. claude_mpm/cli_module/args.py +188 -140
  64. claude_mpm/cli_module/commands.py +79 -70
  65. claude_mpm/cli_module/migration_example.py +38 -60
  66. claude_mpm/config/__init__.py +32 -25
  67. claude_mpm/config/agent_config.py +151 -119
  68. claude_mpm/config/experimental_features.py +71 -73
  69. claude_mpm/config/paths.py +94 -208
  70. claude_mpm/config/socketio_config.py +84 -73
  71. claude_mpm/constants.py +35 -18
  72. claude_mpm/core/__init__.py +9 -6
  73. claude_mpm/core/agent_name_normalizer.py +68 -71
  74. claude_mpm/core/agent_registry.py +372 -521
  75. claude_mpm/core/agent_session_manager.py +74 -63
  76. claude_mpm/core/base_service.py +116 -87
  77. claude_mpm/core/cache.py +119 -153
  78. claude_mpm/core/claude_runner.py +425 -1120
  79. claude_mpm/core/config.py +263 -168
  80. claude_mpm/core/config_aliases.py +69 -61
  81. claude_mpm/core/config_constants.py +292 -0
  82. claude_mpm/core/constants.py +57 -99
  83. claude_mpm/core/container.py +211 -178
  84. claude_mpm/core/exceptions.py +233 -89
  85. claude_mpm/core/factories.py +92 -54
  86. claude_mpm/core/framework_loader.py +378 -220
  87. claude_mpm/core/hook_manager.py +198 -83
  88. claude_mpm/core/hook_performance_config.py +136 -0
  89. claude_mpm/core/injectable_service.py +61 -55
  90. claude_mpm/core/interactive_session.py +165 -155
  91. claude_mpm/core/interfaces.py +221 -195
  92. claude_mpm/core/lazy.py +96 -96
  93. claude_mpm/core/logger.py +133 -107
  94. claude_mpm/core/logging_config.py +185 -157
  95. claude_mpm/core/minimal_framework_loader.py +20 -15
  96. claude_mpm/core/mixins.py +30 -29
  97. claude_mpm/core/oneshot_session.py +215 -181
  98. claude_mpm/core/optimized_agent_loader.py +134 -138
  99. claude_mpm/core/optimized_startup.py +159 -157
  100. claude_mpm/core/pm_hook_interceptor.py +85 -72
  101. claude_mpm/core/service_registry.py +103 -101
  102. claude_mpm/core/session_manager.py +97 -87
  103. claude_mpm/core/socketio_pool.py +212 -158
  104. claude_mpm/core/tool_access_control.py +58 -51
  105. claude_mpm/core/types.py +46 -24
  106. claude_mpm/core/typing_utils.py +166 -82
  107. claude_mpm/core/unified_agent_registry.py +721 -0
  108. claude_mpm/core/unified_config.py +550 -0
  109. claude_mpm/core/unified_paths.py +549 -0
  110. claude_mpm/dashboard/index.html +1 -1
  111. claude_mpm/dashboard/open_dashboard.py +51 -17
  112. claude_mpm/dashboard/static/css/dashboard.css +27 -8
  113. claude_mpm/dashboard/static/dist/components/agent-inference.js +2 -0
  114. claude_mpm/dashboard/static/dist/components/event-processor.js +2 -0
  115. claude_mpm/dashboard/static/dist/components/event-viewer.js +2 -0
  116. claude_mpm/dashboard/static/dist/components/export-manager.js +2 -0
  117. claude_mpm/dashboard/static/dist/components/file-tool-tracker.js +2 -0
  118. claude_mpm/dashboard/static/dist/components/hud-library-loader.js +2 -0
  119. claude_mpm/dashboard/static/dist/components/hud-manager.js +2 -0
  120. claude_mpm/dashboard/static/dist/components/hud-visualizer.js +2 -0
  121. claude_mpm/dashboard/static/dist/components/module-viewer.js +2 -0
  122. claude_mpm/dashboard/static/dist/components/session-manager.js +2 -0
  123. claude_mpm/dashboard/static/dist/components/socket-manager.js +2 -0
  124. claude_mpm/dashboard/static/dist/components/ui-state-manager.js +2 -0
  125. claude_mpm/dashboard/static/dist/components/working-directory.js +2 -0
  126. claude_mpm/dashboard/static/dist/dashboard.js +2 -0
  127. claude_mpm/dashboard/static/dist/socket-client.js +2 -0
  128. claude_mpm/dashboard/static/js/components/agent-inference.js +80 -76
  129. claude_mpm/dashboard/static/js/components/event-processor.js +71 -67
  130. claude_mpm/dashboard/static/js/components/event-viewer.js +74 -70
  131. claude_mpm/dashboard/static/js/components/export-manager.js +31 -28
  132. claude_mpm/dashboard/static/js/components/file-tool-tracker.js +106 -92
  133. claude_mpm/dashboard/static/js/components/hud-library-loader.js +11 -11
  134. claude_mpm/dashboard/static/js/components/hud-manager.js +73 -73
  135. claude_mpm/dashboard/static/js/components/hud-visualizer.js +163 -163
  136. claude_mpm/dashboard/static/js/components/module-viewer.js +305 -233
  137. claude_mpm/dashboard/static/js/components/session-manager.js +32 -29
  138. claude_mpm/dashboard/static/js/components/socket-manager.js +27 -20
  139. claude_mpm/dashboard/static/js/components/ui-state-manager.js +21 -18
  140. claude_mpm/dashboard/static/js/components/working-directory.js +74 -71
  141. claude_mpm/dashboard/static/js/dashboard.js +178 -453
  142. claude_mpm/dashboard/static/js/extension-error-handler.js +164 -0
  143. claude_mpm/dashboard/static/js/socket-client.js +120 -54
  144. claude_mpm/dashboard/templates/index.html +40 -50
  145. claude_mpm/experimental/cli_enhancements.py +60 -58
  146. claude_mpm/generators/__init__.py +1 -1
  147. claude_mpm/generators/agent_profile_generator.py +75 -65
  148. claude_mpm/hooks/__init__.py +1 -1
  149. claude_mpm/hooks/base_hook.py +33 -28
  150. claude_mpm/hooks/claude_hooks/__init__.py +1 -1
  151. claude_mpm/hooks/claude_hooks/connection_pool.py +120 -0
  152. claude_mpm/hooks/claude_hooks/event_handlers.py +743 -0
  153. claude_mpm/hooks/claude_hooks/hook_handler.py +415 -1331
  154. claude_mpm/hooks/claude_hooks/hook_wrapper.sh +4 -4
  155. claude_mpm/hooks/claude_hooks/memory_integration.py +221 -0
  156. claude_mpm/hooks/claude_hooks/response_tracking.py +348 -0
  157. claude_mpm/hooks/claude_hooks/tool_analysis.py +230 -0
  158. claude_mpm/hooks/memory_integration_hook.py +140 -100
  159. claude_mpm/hooks/tool_call_interceptor.py +89 -76
  160. claude_mpm/hooks/validation_hooks.py +57 -49
  161. claude_mpm/init.py +145 -121
  162. claude_mpm/models/__init__.py +9 -9
  163. claude_mpm/models/agent_definition.py +33 -23
  164. claude_mpm/models/agent_session.py +228 -200
  165. claude_mpm/scripts/__init__.py +1 -1
  166. claude_mpm/scripts/socketio_daemon.py +192 -75
  167. claude_mpm/scripts/socketio_server_manager.py +328 -0
  168. claude_mpm/scripts/start_activity_logging.py +25 -22
  169. claude_mpm/services/__init__.py +68 -43
  170. claude_mpm/services/agent_capabilities_service.py +271 -0
  171. claude_mpm/services/agents/__init__.py +23 -32
  172. claude_mpm/services/agents/deployment/__init__.py +3 -3
  173. claude_mpm/services/agents/deployment/agent_config_provider.py +310 -0
  174. claude_mpm/services/agents/deployment/agent_configuration_manager.py +359 -0
  175. claude_mpm/services/agents/deployment/agent_definition_factory.py +84 -0
  176. claude_mpm/services/agents/deployment/agent_deployment.py +415 -2113
  177. claude_mpm/services/agents/deployment/agent_discovery_service.py +387 -0
  178. claude_mpm/services/agents/deployment/agent_environment_manager.py +293 -0
  179. claude_mpm/services/agents/deployment/agent_filesystem_manager.py +387 -0
  180. claude_mpm/services/agents/deployment/agent_format_converter.py +453 -0
  181. claude_mpm/services/agents/deployment/agent_frontmatter_validator.py +161 -0
  182. claude_mpm/services/agents/deployment/agent_lifecycle_manager.py +345 -495
  183. claude_mpm/services/agents/deployment/agent_metrics_collector.py +279 -0
  184. claude_mpm/services/agents/deployment/agent_restore_handler.py +88 -0
  185. claude_mpm/services/agents/deployment/agent_template_builder.py +406 -0
  186. claude_mpm/services/agents/deployment/agent_validator.py +352 -0
  187. claude_mpm/services/agents/deployment/agent_version_manager.py +313 -0
  188. claude_mpm/services/agents/deployment/agent_versioning.py +6 -9
  189. claude_mpm/services/agents/deployment/agents_directory_resolver.py +79 -0
  190. claude_mpm/services/agents/deployment/async_agent_deployment.py +298 -234
  191. claude_mpm/services/agents/deployment/config/__init__.py +13 -0
  192. claude_mpm/services/agents/deployment/config/deployment_config.py +182 -0
  193. claude_mpm/services/agents/deployment/config/deployment_config_manager.py +200 -0
  194. claude_mpm/services/agents/deployment/deployment_config_loader.py +54 -0
  195. claude_mpm/services/agents/deployment/deployment_type_detector.py +124 -0
  196. claude_mpm/services/agents/deployment/facade/__init__.py +18 -0
  197. claude_mpm/services/agents/deployment/facade/async_deployment_executor.py +159 -0
  198. claude_mpm/services/agents/deployment/facade/deployment_executor.py +73 -0
  199. claude_mpm/services/agents/deployment/facade/deployment_facade.py +270 -0
  200. claude_mpm/services/agents/deployment/facade/sync_deployment_executor.py +178 -0
  201. claude_mpm/services/agents/deployment/interface_adapter.py +227 -0
  202. claude_mpm/services/agents/deployment/lifecycle_health_checker.py +85 -0
  203. claude_mpm/services/agents/deployment/lifecycle_performance_tracker.py +100 -0
  204. claude_mpm/services/agents/deployment/pipeline/__init__.py +32 -0
  205. claude_mpm/services/agents/deployment/pipeline/pipeline_builder.py +158 -0
  206. claude_mpm/services/agents/deployment/pipeline/pipeline_context.py +159 -0
  207. claude_mpm/services/agents/deployment/pipeline/pipeline_executor.py +169 -0
  208. claude_mpm/services/agents/deployment/pipeline/steps/__init__.py +19 -0
  209. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +195 -0
  210. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +119 -0
  211. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +79 -0
  212. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +90 -0
  213. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +100 -0
  214. claude_mpm/services/agents/deployment/processors/__init__.py +15 -0
  215. claude_mpm/services/agents/deployment/processors/agent_deployment_context.py +98 -0
  216. claude_mpm/services/agents/deployment/processors/agent_deployment_result.py +235 -0
  217. claude_mpm/services/agents/deployment/processors/agent_processor.py +258 -0
  218. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +318 -0
  219. claude_mpm/services/agents/deployment/results/__init__.py +13 -0
  220. claude_mpm/services/agents/deployment/results/deployment_metrics.py +200 -0
  221. claude_mpm/services/agents/deployment/results/deployment_result_builder.py +249 -0
  222. claude_mpm/services/agents/deployment/strategies/__init__.py +25 -0
  223. claude_mpm/services/agents/deployment/strategies/base_strategy.py +119 -0
  224. claude_mpm/services/agents/deployment/strategies/project_strategy.py +150 -0
  225. claude_mpm/services/agents/deployment/strategies/strategy_selector.py +117 -0
  226. claude_mpm/services/agents/deployment/strategies/system_strategy.py +116 -0
  227. claude_mpm/services/agents/deployment/strategies/user_strategy.py +137 -0
  228. claude_mpm/services/agents/deployment/system_instructions_deployer.py +108 -0
  229. claude_mpm/services/agents/deployment/validation/__init__.py +19 -0
  230. claude_mpm/services/agents/deployment/validation/agent_validator.py +323 -0
  231. claude_mpm/services/agents/deployment/validation/deployment_validator.py +238 -0
  232. claude_mpm/services/agents/deployment/validation/template_validator.py +299 -0
  233. claude_mpm/services/agents/deployment/validation/validation_result.py +226 -0
  234. claude_mpm/services/agents/loading/__init__.py +2 -2
  235. claude_mpm/services/agents/loading/agent_profile_loader.py +259 -229
  236. claude_mpm/services/agents/loading/base_agent_manager.py +90 -81
  237. claude_mpm/services/agents/loading/framework_agent_loader.py +154 -129
  238. claude_mpm/services/agents/management/__init__.py +2 -2
  239. claude_mpm/services/agents/management/agent_capabilities_generator.py +72 -58
  240. claude_mpm/services/agents/management/agent_management_service.py +209 -156
  241. claude_mpm/services/agents/memory/__init__.py +9 -6
  242. claude_mpm/services/agents/memory/agent_memory_manager.py +218 -1152
  243. claude_mpm/services/agents/memory/agent_persistence_service.py +20 -16
  244. claude_mpm/services/agents/memory/analyzer.py +430 -0
  245. claude_mpm/services/agents/memory/content_manager.py +376 -0
  246. claude_mpm/services/agents/memory/template_generator.py +468 -0
  247. claude_mpm/services/agents/registry/__init__.py +7 -10
  248. claude_mpm/services/agents/registry/deployed_agent_discovery.py +122 -97
  249. claude_mpm/services/agents/registry/modification_tracker.py +351 -285
  250. claude_mpm/services/async_session_logger.py +187 -153
  251. claude_mpm/services/claude_session_logger.py +87 -72
  252. claude_mpm/services/command_handler_service.py +217 -0
  253. claude_mpm/services/communication/__init__.py +3 -2
  254. claude_mpm/services/core/__init__.py +50 -97
  255. claude_mpm/services/core/base.py +60 -53
  256. claude_mpm/services/core/interfaces/__init__.py +188 -0
  257. claude_mpm/services/core/interfaces/agent.py +351 -0
  258. claude_mpm/services/core/interfaces/communication.py +343 -0
  259. claude_mpm/services/core/interfaces/infrastructure.py +413 -0
  260. claude_mpm/services/core/interfaces/service.py +434 -0
  261. claude_mpm/services/core/interfaces.py +19 -944
  262. claude_mpm/services/event_aggregator.py +208 -170
  263. claude_mpm/services/exceptions.py +387 -308
  264. claude_mpm/services/framework_claude_md_generator/__init__.py +75 -79
  265. claude_mpm/services/framework_claude_md_generator/content_assembler.py +69 -60
  266. claude_mpm/services/framework_claude_md_generator/content_validator.py +65 -61
  267. claude_mpm/services/framework_claude_md_generator/deployment_manager.py +68 -49
  268. claude_mpm/services/framework_claude_md_generator/section_generators/__init__.py +34 -34
  269. claude_mpm/services/framework_claude_md_generator/section_generators/agents.py +25 -22
  270. claude_mpm/services/framework_claude_md_generator/section_generators/claude_pm_init.py +10 -10
  271. claude_mpm/services/framework_claude_md_generator/section_generators/core_responsibilities.py +4 -3
  272. claude_mpm/services/framework_claude_md_generator/section_generators/delegation_constraints.py +4 -3
  273. claude_mpm/services/framework_claude_md_generator/section_generators/environment_config.py +4 -3
  274. claude_mpm/services/framework_claude_md_generator/section_generators/footer.py +6 -5
  275. claude_mpm/services/framework_claude_md_generator/section_generators/header.py +8 -7
  276. claude_mpm/services/framework_claude_md_generator/section_generators/orchestration_principles.py +4 -3
  277. claude_mpm/services/framework_claude_md_generator/section_generators/role_designation.py +6 -5
  278. claude_mpm/services/framework_claude_md_generator/section_generators/subprocess_validation.py +9 -8
  279. claude_mpm/services/framework_claude_md_generator/section_generators/todo_task_tools.py +4 -3
  280. claude_mpm/services/framework_claude_md_generator/section_generators/troubleshooting.py +5 -4
  281. claude_mpm/services/framework_claude_md_generator/section_manager.py +28 -27
  282. claude_mpm/services/framework_claude_md_generator/version_manager.py +30 -28
  283. claude_mpm/services/hook_service.py +106 -114
  284. claude_mpm/services/infrastructure/__init__.py +7 -5
  285. claude_mpm/services/infrastructure/context_preservation.py +233 -199
  286. claude_mpm/services/infrastructure/daemon_manager.py +279 -0
  287. claude_mpm/services/infrastructure/logging.py +83 -76
  288. claude_mpm/services/infrastructure/monitoring.py +547 -404
  289. claude_mpm/services/mcp_gateway/__init__.py +30 -13
  290. claude_mpm/services/mcp_gateway/config/__init__.py +2 -2
  291. claude_mpm/services/mcp_gateway/config/config_loader.py +61 -56
  292. claude_mpm/services/mcp_gateway/config/config_schema.py +50 -41
  293. claude_mpm/services/mcp_gateway/config/configuration.py +82 -75
  294. claude_mpm/services/mcp_gateway/core/__init__.py +13 -20
  295. claude_mpm/services/mcp_gateway/core/base.py +80 -67
  296. claude_mpm/services/mcp_gateway/core/exceptions.py +60 -46
  297. claude_mpm/services/mcp_gateway/core/interfaces.py +87 -84
  298. claude_mpm/services/mcp_gateway/main.py +287 -137
  299. claude_mpm/services/mcp_gateway/registry/__init__.py +1 -1
  300. claude_mpm/services/mcp_gateway/registry/service_registry.py +97 -94
  301. claude_mpm/services/mcp_gateway/registry/tool_registry.py +135 -126
  302. claude_mpm/services/mcp_gateway/server/__init__.py +2 -2
  303. claude_mpm/services/mcp_gateway/server/mcp_gateway.py +105 -110
  304. claude_mpm/services/mcp_gateway/server/stdio_handler.py +105 -107
  305. claude_mpm/services/mcp_gateway/server/stdio_server.py +691 -0
  306. claude_mpm/services/mcp_gateway/tools/__init__.py +4 -2
  307. claude_mpm/services/mcp_gateway/tools/base_adapter.py +109 -119
  308. claude_mpm/services/mcp_gateway/tools/document_summarizer.py +283 -215
  309. claude_mpm/services/mcp_gateway/tools/hello_world.py +122 -120
  310. claude_mpm/services/mcp_gateway/tools/ticket_tools.py +652 -0
  311. claude_mpm/services/mcp_gateway/tools/unified_ticket_tool.py +606 -0
  312. claude_mpm/services/memory/__init__.py +2 -2
  313. claude_mpm/services/memory/builder.py +451 -362
  314. claude_mpm/services/memory/cache/__init__.py +2 -2
  315. claude_mpm/services/memory/cache/shared_prompt_cache.py +232 -194
  316. claude_mpm/services/memory/cache/simple_cache.py +107 -93
  317. claude_mpm/services/memory/indexed_memory.py +195 -193
  318. claude_mpm/services/memory/optimizer.py +267 -234
  319. claude_mpm/services/memory/router.py +571 -263
  320. claude_mpm/services/memory_hook_service.py +237 -0
  321. claude_mpm/services/port_manager.py +223 -0
  322. claude_mpm/services/project/__init__.py +3 -3
  323. claude_mpm/services/project/analyzer.py +451 -305
  324. claude_mpm/services/project/registry.py +262 -240
  325. claude_mpm/services/recovery_manager.py +287 -231
  326. claude_mpm/services/response_tracker.py +87 -67
  327. claude_mpm/services/runner_configuration_service.py +587 -0
  328. claude_mpm/services/session_management_service.py +304 -0
  329. claude_mpm/services/socketio/__init__.py +4 -4
  330. claude_mpm/services/socketio/client_proxy.py +174 -0
  331. claude_mpm/services/socketio/handlers/__init__.py +3 -3
  332. claude_mpm/services/socketio/handlers/base.py +44 -30
  333. claude_mpm/services/socketio/handlers/connection.py +145 -65
  334. claude_mpm/services/socketio/handlers/file.py +123 -108
  335. claude_mpm/services/socketio/handlers/git.py +607 -373
  336. claude_mpm/services/socketio/handlers/hook.py +170 -0
  337. claude_mpm/services/socketio/handlers/memory.py +4 -4
  338. claude_mpm/services/socketio/handlers/project.py +4 -4
  339. claude_mpm/services/socketio/handlers/registry.py +53 -38
  340. claude_mpm/services/socketio/server/__init__.py +18 -0
  341. claude_mpm/services/socketio/server/broadcaster.py +252 -0
  342. claude_mpm/services/socketio/server/core.py +399 -0
  343. claude_mpm/services/socketio/server/main.py +323 -0
  344. claude_mpm/services/socketio_client_manager.py +160 -133
  345. claude_mpm/services/socketio_server.py +36 -1885
  346. claude_mpm/services/subprocess_launcher_service.py +316 -0
  347. claude_mpm/services/system_instructions_service.py +258 -0
  348. claude_mpm/services/ticket_manager.py +19 -533
  349. claude_mpm/services/utility_service.py +285 -0
  350. claude_mpm/services/version_control/__init__.py +18 -21
  351. claude_mpm/services/version_control/branch_strategy.py +20 -10
  352. claude_mpm/services/version_control/conflict_resolution.py +37 -13
  353. claude_mpm/services/version_control/git_operations.py +52 -21
  354. claude_mpm/services/version_control/semantic_versioning.py +92 -53
  355. claude_mpm/services/version_control/version_parser.py +145 -125
  356. claude_mpm/services/version_service.py +270 -0
  357. claude_mpm/storage/__init__.py +2 -2
  358. claude_mpm/storage/state_storage.py +177 -181
  359. claude_mpm/ticket_wrapper.py +2 -2
  360. claude_mpm/utils/__init__.py +2 -2
  361. claude_mpm/utils/agent_dependency_loader.py +453 -243
  362. claude_mpm/utils/config_manager.py +157 -118
  363. claude_mpm/utils/console.py +1 -1
  364. claude_mpm/utils/dependency_cache.py +102 -107
  365. claude_mpm/utils/dependency_manager.py +52 -47
  366. claude_mpm/utils/dependency_strategies.py +131 -96
  367. claude_mpm/utils/environment_context.py +110 -102
  368. claude_mpm/utils/error_handler.py +75 -55
  369. claude_mpm/utils/file_utils.py +80 -67
  370. claude_mpm/utils/framework_detection.py +12 -11
  371. claude_mpm/utils/import_migration_example.py +12 -60
  372. claude_mpm/utils/imports.py +48 -45
  373. claude_mpm/utils/path_operations.py +100 -93
  374. claude_mpm/utils/robust_installer.py +172 -164
  375. claude_mpm/utils/session_logging.py +30 -23
  376. claude_mpm/utils/subprocess_utils.py +99 -61
  377. claude_mpm/validation/__init__.py +1 -1
  378. claude_mpm/validation/agent_validator.py +151 -111
  379. claude_mpm/validation/frontmatter_validator.py +92 -71
  380. {claude_mpm-3.9.11.dist-info → claude_mpm-4.0.3.dist-info}/METADATA +27 -1
  381. claude_mpm-4.0.3.dist-info/RECORD +402 -0
  382. {claude_mpm-3.9.11.dist-info → claude_mpm-4.0.3.dist-info}/entry_points.txt +1 -0
  383. {claude_mpm-3.9.11.dist-info → claude_mpm-4.0.3.dist-info}/licenses/LICENSE +1 -1
  384. claude_mpm/cli/commands/run_guarded.py +0 -511
  385. claude_mpm/config/memory_guardian_config.py +0 -325
  386. claude_mpm/config/memory_guardian_yaml.py +0 -335
  387. claude_mpm/core/config_paths.py +0 -150
  388. claude_mpm/core/memory_aware_runner.py +0 -353
  389. claude_mpm/dashboard/static/js/dashboard-original.js +0 -4134
  390. claude_mpm/deployment_paths.py +0 -261
  391. claude_mpm/hooks/claude_hooks/hook_handler_fixed.py +0 -454
  392. claude_mpm/models/state_models.py +0 -433
  393. claude_mpm/services/agent/__init__.py +0 -24
  394. claude_mpm/services/agent/deployment.py +0 -2548
  395. claude_mpm/services/agent/management.py +0 -598
  396. claude_mpm/services/agent/registry.py +0 -813
  397. claude_mpm/services/agents/registry/agent_registry.py +0 -813
  398. claude_mpm/services/communication/socketio.py +0 -1935
  399. claude_mpm/services/communication/websocket.py +0 -479
  400. claude_mpm/services/framework_claude_md_generator.py +0 -624
  401. claude_mpm/services/health_monitor.py +0 -893
  402. claude_mpm/services/infrastructure/graceful_degradation.py +0 -616
  403. claude_mpm/services/infrastructure/health_monitor.py +0 -775
  404. claude_mpm/services/infrastructure/memory_dashboard.py +0 -479
  405. claude_mpm/services/infrastructure/memory_guardian.py +0 -944
  406. claude_mpm/services/infrastructure/restart_protection.py +0 -642
  407. claude_mpm/services/infrastructure/state_manager.py +0 -774
  408. claude_mpm/services/mcp_gateway/manager.py +0 -334
  409. claude_mpm/services/optimized_hook_service.py +0 -542
  410. claude_mpm/services/project_analyzer.py +0 -864
  411. claude_mpm/services/project_registry.py +0 -608
  412. claude_mpm/services/standalone_socketio_server.py +0 -1300
  413. claude_mpm/services/ticket_manager_di.py +0 -318
  414. claude_mpm/services/ticketing_service_original.py +0 -510
  415. claude_mpm/utils/paths.py +0 -395
  416. claude_mpm/utils/platform_memory.py +0 -524
  417. claude_mpm-3.9.11.dist-info/RECORD +0 -306
  418. {claude_mpm-3.9.11.dist-info → claude_mpm-4.0.3.dist-info}/WHEEL +0 -0
  419. {claude_mpm-3.9.11.dist-info → claude_mpm-4.0.3.dist-info}/top_level.txt +0 -0
@@ -17,28 +17,25 @@ Design Principles:
17
17
 
18
18
  import asyncio
19
19
  import logging
20
- import time
21
- import signal
22
20
  import os
21
+ import signal
23
22
  import threading
23
+ import time
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import deque
26
26
  from dataclasses import dataclass
27
27
  from datetime import datetime, timezone
28
28
  from enum import Enum
29
- from typing import Any, Dict, List, Optional, Callable, Union
30
- import json
31
- from claude_mpm.core.constants import (
32
- RetryConfig,
33
- TimeoutConfig,
34
- PerformanceConfig
35
- )
29
+ from typing import Any, Callable, Dict, List, Optional, Union
30
+
31
+ from claude_mpm.core.constants import PerformanceConfig, RetryConfig, TimeoutConfig
36
32
 
37
- from .health_monitor import HealthStatus, HealthCheckResult
33
+ from .infrastructure.monitoring import HealthCheckResult, HealthStatus
38
34
 
39
35
 
40
36
  class RecoveryAction(Enum):
41
37
  """Types of recovery actions that can be performed."""
38
+
42
39
  NONE = "none"
43
40
  LOG_WARNING = "log_warning"
44
41
  CLEAR_CONNECTIONS = "clear_connections"
@@ -48,14 +45,16 @@ class RecoveryAction(Enum):
48
45
 
49
46
  class CircuitState(Enum):
50
47
  """Circuit breaker states."""
51
- CLOSED = "closed" # Normal operation
52
- OPEN = "open" # Recovery blocked due to failures
48
+
49
+ CLOSED = "closed" # Normal operation
50
+ OPEN = "open" # Recovery blocked due to failures
53
51
  HALF_OPEN = "half_open" # Testing if recovery is working
54
52
 
55
53
 
56
54
  @dataclass
57
55
  class RecoveryEvent:
58
56
  """Recovery event record."""
57
+
59
58
  timestamp: float
60
59
  action: RecoveryAction
61
60
  trigger: str
@@ -63,34 +62,36 @@ class RecoveryEvent:
63
62
  success: bool
64
63
  duration_ms: float
65
64
  error_message: Optional[str] = None
66
-
65
+
67
66
  def to_dict(self) -> Dict[str, Any]:
68
67
  """Convert recovery event to dictionary."""
69
68
  return {
70
- 'timestamp': self.timestamp,
71
- 'timestamp_iso': datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat(),
72
- 'action': self.action.value,
73
- 'trigger': self.trigger,
74
- 'health_status': self.health_status.value,
75
- 'success': self.success,
76
- 'duration_ms': self.duration_ms,
77
- 'error_message': self.error_message
69
+ "timestamp": self.timestamp,
70
+ "timestamp_iso": datetime.fromtimestamp(
71
+ self.timestamp, timezone.utc
72
+ ).isoformat(),
73
+ "action": self.action.value,
74
+ "trigger": self.trigger,
75
+ "health_status": self.health_status.value,
76
+ "success": self.success,
77
+ "duration_ms": self.duration_ms,
78
+ "error_message": self.error_message,
78
79
  }
79
80
 
80
81
 
81
82
  class RecoveryStrategy(ABC):
82
83
  """Abstract base class for recovery strategies."""
83
-
84
+
84
85
  @abstractmethod
85
86
  def should_recover(self, health_result: HealthCheckResult) -> bool:
86
87
  """Determine if recovery should be triggered based on health result."""
87
88
  pass
88
-
89
+
89
90
  @abstractmethod
90
91
  def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
91
92
  """Determine the appropriate recovery action."""
92
93
  pass
93
-
94
+
94
95
  @abstractmethod
95
96
  def get_name(self) -> str:
96
97
  """Get the name of this recovery strategy."""
@@ -99,78 +100,85 @@ class RecoveryStrategy(ABC):
99
100
 
100
101
  class GradedRecoveryStrategy(RecoveryStrategy):
101
102
  """Recovery strategy with graduated response based on health status and history.
102
-
103
+
103
104
  Recovery actions are escalated based on:
104
105
  - Current health status severity
105
106
  - Number of recent failures
106
107
  - Time since last recovery attempt
107
108
  """
108
-
109
+
109
110
  def __init__(self, config: Optional[Dict[str, Any]] = None):
110
111
  """Initialize graded recovery strategy.
111
-
112
+
112
113
  Args:
113
114
  config: Configuration dictionary for recovery thresholds
114
115
  """
115
116
  self.config = config or {}
116
117
  self.logger = logging.getLogger(f"{__name__}.GradedRecoveryStrategy")
117
-
118
+
118
119
  # Configuration with defaults
119
- self.warning_threshold = self.config.get('warning_threshold', 2)
120
- self.critical_threshold = self.config.get('critical_threshold', RetryConfig.CRITICAL_THRESHOLD)
121
- self.failure_window_seconds = self.config.get('failure_window_seconds', RetryConfig.FAILURE_WINDOW)
122
- self.min_recovery_interval = self.config.get('min_recovery_interval', RetryConfig.MIN_RECOVERY_INTERVAL)
123
-
120
+ self.warning_threshold = self.config.get("warning_threshold", 2)
121
+ self.critical_threshold = self.config.get(
122
+ "critical_threshold", RetryConfig.CRITICAL_THRESHOLD
123
+ )
124
+ self.failure_window_seconds = self.config.get(
125
+ "failure_window_seconds", RetryConfig.FAILURE_WINDOW
126
+ )
127
+ self.min_recovery_interval = self.config.get(
128
+ "min_recovery_interval", RetryConfig.MIN_RECOVERY_INTERVAL
129
+ )
130
+
124
131
  # Track recent failures
125
132
  self.recent_failures: deque = deque(maxlen=10)
126
133
  self.last_recovery_time = 0
127
-
134
+
128
135
  def get_name(self) -> str:
129
136
  return "graded_recovery"
130
-
137
+
131
138
  def should_recover(self, health_result: HealthCheckResult) -> bool:
132
139
  """Determine if recovery should be triggered."""
133
140
  current_time = time.time()
134
-
141
+
135
142
  # Don't trigger recovery too frequently
136
143
  if current_time - self.last_recovery_time < self.min_recovery_interval:
137
144
  self.logger.debug("Recovery suppressed due to min interval")
138
145
  return False
139
-
146
+
140
147
  # Check current health status
141
148
  if health_result.overall_status in [HealthStatus.CRITICAL]:
142
149
  return True
143
-
150
+
144
151
  if health_result.overall_status == HealthStatus.WARNING:
145
152
  # Count recent warnings in time window
146
153
  cutoff_time = current_time - self.failure_window_seconds
147
154
  recent_warnings = [
148
- event for event in self.recent_failures
149
- if event >= cutoff_time
155
+ event for event in self.recent_failures if event >= cutoff_time
150
156
  ]
151
-
157
+
152
158
  if len(recent_warnings) >= self.warning_threshold:
153
159
  return True
154
-
160
+
155
161
  return False
156
-
162
+
157
163
  def get_recovery_action(self, health_result: HealthCheckResult) -> RecoveryAction:
158
164
  """Determine the appropriate recovery action based on health status."""
159
165
  current_time = time.time()
160
-
166
+
161
167
  # Count recent failures
162
168
  cutoff_time = current_time - self.failure_window_seconds
163
169
  recent_failures = [
164
- event for event in self.recent_failures
165
- if event >= cutoff_time
170
+ event for event in self.recent_failures if event >= cutoff_time
166
171
  ]
167
-
172
+
168
173
  failure_count = len(recent_failures)
169
-
174
+
170
175
  # Record this failure
171
- if health_result.overall_status in [HealthStatus.WARNING, HealthStatus.CRITICAL]:
176
+ if health_result.overall_status in [
177
+ HealthStatus.WARNING,
178
+ HealthStatus.CRITICAL,
179
+ ]:
172
180
  self.recent_failures.append(current_time)
173
-
181
+
174
182
  # Determine action based on status and failure history
175
183
  if health_result.overall_status == HealthStatus.CRITICAL:
176
184
  if failure_count >= 3:
@@ -179,30 +187,33 @@ class GradedRecoveryStrategy(RecoveryStrategy):
179
187
  return RecoveryAction.RESTART_SERVICE
180
188
  else:
181
189
  return RecoveryAction.CLEAR_CONNECTIONS
182
-
190
+
183
191
  elif health_result.overall_status == HealthStatus.WARNING:
184
192
  if failure_count >= self.warning_threshold:
185
193
  return RecoveryAction.CLEAR_CONNECTIONS
186
194
  else:
187
195
  return RecoveryAction.LOG_WARNING
188
-
196
+
189
197
  return RecoveryAction.NONE
190
198
 
191
199
 
192
200
  class CircuitBreaker:
193
201
  """Circuit breaker to prevent recovery loops and cascading failures.
194
-
202
+
195
203
  Implements the circuit breaker pattern to:
196
204
  - Prevent excessive recovery attempts
197
205
  - Allow time for systems to stabilize
198
206
  - Gradually re-enable recovery after failures
199
207
  """
200
-
201
- def __init__(self, failure_threshold: int = RetryConfig.FAILURE_THRESHOLD,
202
- timeout_seconds: int = RetryConfig.CIRCUIT_TIMEOUT,
203
- success_threshold: int = RetryConfig.SUCCESS_THRESHOLD):
208
+
209
+ def __init__(
210
+ self,
211
+ failure_threshold: int = RetryConfig.FAILURE_THRESHOLD,
212
+ timeout_seconds: int = RetryConfig.CIRCUIT_TIMEOUT,
213
+ success_threshold: int = RetryConfig.SUCCESS_THRESHOLD,
214
+ ):
204
215
  """Initialize circuit breaker.
205
-
216
+
206
217
  Args:
207
218
  failure_threshold: Number of failures before opening circuit
208
219
  timeout_seconds: Time to wait in OPEN state before trying HALF_OPEN
@@ -211,80 +222,90 @@ class CircuitBreaker:
211
222
  self.failure_threshold = failure_threshold
212
223
  self.timeout_seconds = timeout_seconds
213
224
  self.success_threshold = success_threshold
214
-
225
+
215
226
  self.state = CircuitState.CLOSED
216
227
  self.failure_count = 0
217
228
  self.success_count = 0
218
229
  self.last_failure_time = 0
219
230
  self.state_change_time = time.time()
220
-
231
+
221
232
  self.logger = logging.getLogger(f"{__name__}.CircuitBreaker")
222
- self.logger.info(f"Circuit breaker initialized: failure_threshold={failure_threshold}, "
223
- f"timeout={timeout_seconds}s, success_threshold={success_threshold}")
224
-
233
+ self.logger.info(
234
+ f"Circuit breaker initialized: failure_threshold={failure_threshold}, "
235
+ f"timeout={timeout_seconds}s, success_threshold={success_threshold}"
236
+ )
237
+
225
238
  def can_proceed(self) -> bool:
226
239
  """Check if recovery operations can proceed."""
227
240
  current_time = time.time()
228
-
241
+
229
242
  if self.state == CircuitState.CLOSED:
230
243
  return True
231
-
244
+
232
245
  elif self.state == CircuitState.OPEN:
233
246
  # Check if timeout has elapsed
234
247
  if current_time - self.last_failure_time >= self.timeout_seconds:
235
248
  self._transition_to_half_open()
236
249
  return True
237
250
  return False
238
-
251
+
239
252
  elif self.state == CircuitState.HALF_OPEN:
240
253
  return True
241
-
254
+
242
255
  return False
243
-
256
+
244
257
  def record_success(self) -> None:
245
258
  """Record a successful recovery operation."""
246
259
  if self.state == CircuitState.CLOSED:
247
260
  # Reset failure count on success in normal state
248
261
  self.failure_count = 0
249
-
262
+
250
263
  elif self.state == CircuitState.HALF_OPEN:
251
264
  self.success_count += 1
252
- self.logger.debug(f"Circuit breaker success count: {self.success_count}/{self.success_threshold}")
253
-
265
+ self.logger.debug(
266
+ f"Circuit breaker success count: {self.success_count}/{self.success_threshold}"
267
+ )
268
+
254
269
  if self.success_count >= self.success_threshold:
255
270
  self._transition_to_closed()
256
-
271
+
257
272
  def record_failure(self) -> None:
258
273
  """Record a failed recovery operation."""
259
274
  current_time = time.time()
260
275
  self.last_failure_time = current_time
261
-
276
+
262
277
  if self.state == CircuitState.CLOSED:
263
278
  self.failure_count += 1
264
- self.logger.warning(f"Circuit breaker failure count: {self.failure_count}/{self.failure_threshold}")
265
-
279
+ self.logger.warning(
280
+ f"Circuit breaker failure count: {self.failure_count}/{self.failure_threshold}"
281
+ )
282
+
266
283
  if self.failure_count >= self.failure_threshold:
267
284
  self._transition_to_open()
268
-
285
+
269
286
  elif self.state == CircuitState.HALF_OPEN:
270
287
  # Failure in half-open state goes back to open
271
288
  self._transition_to_open()
272
-
289
+
273
290
  def _transition_to_open(self) -> None:
274
291
  """Transition circuit to OPEN state."""
275
292
  self.state = CircuitState.OPEN
276
293
  self.state_change_time = time.time()
277
294
  self.success_count = 0
278
- self.logger.warning(f"Circuit breaker OPENED due to {self.failure_count} failures. "
279
- f"Recovery blocked for {self.timeout_seconds} seconds.")
280
-
295
+ self.logger.warning(
296
+ f"Circuit breaker OPENED due to {self.failure_count} failures. "
297
+ f"Recovery blocked for {self.timeout_seconds} seconds."
298
+ )
299
+
281
300
  def _transition_to_half_open(self) -> None:
282
301
  """Transition circuit to HALF_OPEN state."""
283
302
  self.state = CircuitState.HALF_OPEN
284
303
  self.state_change_time = time.time()
285
304
  self.success_count = 0
286
- self.logger.info("Circuit breaker transitioned to HALF_OPEN. Testing recovery...")
287
-
305
+ self.logger.info(
306
+ "Circuit breaker transitioned to HALF_OPEN. Testing recovery..."
307
+ )
308
+
288
309
  def _transition_to_closed(self) -> None:
289
310
  """Transition circuit to CLOSED state."""
290
311
  self.state = CircuitState.CLOSED
@@ -292,29 +313,29 @@ class CircuitBreaker:
292
313
  self.failure_count = 0
293
314
  self.success_count = 0
294
315
  self.logger.info("Circuit breaker CLOSED. Normal recovery operations resumed.")
295
-
316
+
296
317
  def get_status(self) -> Dict[str, Any]:
297
318
  """Get current circuit breaker status."""
298
319
  current_time = time.time()
299
320
  return {
300
- 'state': self.state.value,
301
- 'failure_count': self.failure_count,
302
- 'success_count': self.success_count,
303
- 'last_failure_time': self.last_failure_time,
304
- 'state_change_time': self.state_change_time,
305
- 'time_in_current_state': current_time - self.state_change_time,
306
- 'can_proceed': self.can_proceed(),
307
- 'config': {
308
- 'failure_threshold': self.failure_threshold,
309
- 'timeout_seconds': self.timeout_seconds,
310
- 'success_threshold': self.success_threshold
311
- }
321
+ "state": self.state.value,
322
+ "failure_count": self.failure_count,
323
+ "success_count": self.success_count,
324
+ "last_failure_time": self.last_failure_time,
325
+ "state_change_time": self.state_change_time,
326
+ "time_in_current_state": current_time - self.state_change_time,
327
+ "can_proceed": self.can_proceed(),
328
+ "config": {
329
+ "failure_threshold": self.failure_threshold,
330
+ "timeout_seconds": self.timeout_seconds,
331
+ "success_threshold": self.success_threshold,
332
+ },
312
333
  }
313
334
 
314
335
 
315
336
  class RecoveryManager:
316
337
  """Advanced recovery manager with circuit breaker and configurable strategies.
317
-
338
+
318
339
  Provides comprehensive recovery capabilities including:
319
340
  - Health-based recovery triggering
320
341
  - Circuit breaker protection
@@ -322,11 +343,10 @@ class RecoveryManager:
322
343
  - Recovery event logging and history
323
344
  - Integration with service lifecycle
324
345
  """
325
-
326
- def __init__(self, config: Optional[Dict[str, Any]] = None,
327
- server_instance=None):
346
+
347
+ def __init__(self, config: Optional[Dict[str, Any]] = None, server_instance=None):
328
348
  """Initialize recovery manager.
329
-
349
+
330
350
  Args:
331
351
  config: Configuration dictionary for recovery settings
332
352
  server_instance: Reference to the Socket.IO server instance
@@ -334,110 +354,124 @@ class RecoveryManager:
334
354
  self.config = config or {}
335
355
  self.server_instance = server_instance
336
356
  self.logger = logging.getLogger(f"{__name__}.RecoveryManager")
337
-
357
+
338
358
  # Configuration with defaults
339
- self.enabled = self.config.get('enabled', True)
340
- self.check_interval = self.config.get('check_interval', 60)
341
- self.max_recovery_attempts = self.config.get('max_recovery_attempts', 5)
342
- self.recovery_timeout = self.config.get('recovery_timeout', 30)
343
-
359
+ self.enabled = self.config.get("enabled", True)
360
+ self.check_interval = self.config.get("check_interval", 60)
361
+ self.max_recovery_attempts = self.config.get("max_recovery_attempts", 5)
362
+ self.recovery_timeout = self.config.get("recovery_timeout", 30)
363
+
344
364
  # Initialize circuit breaker
345
- circuit_config = self.config.get('circuit_breaker', {})
365
+ circuit_config = self.config.get("circuit_breaker", {})
346
366
  self.circuit_breaker = CircuitBreaker(
347
- failure_threshold=circuit_config.get('failure_threshold', RetryConfig.FAILURE_THRESHOLD),
348
- timeout_seconds=circuit_config.get('timeout_seconds', RetryConfig.CIRCUIT_TIMEOUT),
349
- success_threshold=circuit_config.get('success_threshold', RetryConfig.SUCCESS_THRESHOLD)
367
+ failure_threshold=circuit_config.get(
368
+ "failure_threshold", RetryConfig.FAILURE_THRESHOLD
369
+ ),
370
+ timeout_seconds=circuit_config.get(
371
+ "timeout_seconds", RetryConfig.CIRCUIT_TIMEOUT
372
+ ),
373
+ success_threshold=circuit_config.get(
374
+ "success_threshold", RetryConfig.SUCCESS_THRESHOLD
375
+ ),
350
376
  )
351
-
377
+
352
378
  # Initialize recovery strategy
353
- strategy_config = self.config.get('strategy', {})
379
+ strategy_config = self.config.get("strategy", {})
354
380
  self.recovery_strategy = GradedRecoveryStrategy(strategy_config)
355
-
381
+
356
382
  # Recovery event history
357
383
  self.recovery_history: deque = deque(maxlen=100)
358
-
384
+
359
385
  # Recovery state
360
386
  self.recovery_in_progress = False
361
387
  self.last_recovery_time = 0
362
388
  self.recovery_count = 0
363
-
389
+
364
390
  # Recovery callbacks
365
391
  self.recovery_callbacks: List[Callable[[RecoveryEvent], None]] = []
366
-
392
+
367
393
  # Statistics
368
394
  self.recovery_stats = {
369
- 'total_recoveries': 0,
370
- 'successful_recoveries': 0,
371
- 'failed_recoveries': 0,
372
- 'actions_performed': {action.value: 0 for action in RecoveryAction},
373
- 'average_recovery_duration_ms': 0
395
+ "total_recoveries": 0,
396
+ "successful_recoveries": 0,
397
+ "failed_recoveries": 0,
398
+ "actions_performed": {action.value: 0 for action in RecoveryAction},
399
+ "average_recovery_duration_ms": 0,
374
400
  }
375
-
376
- self.logger.info(f"Recovery manager initialized with strategy: {self.recovery_strategy.get_name()}")
377
-
401
+
402
+ self.logger.info(
403
+ f"Recovery manager initialized with strategy: {self.recovery_strategy.get_name()}"
404
+ )
405
+
378
406
  def add_recovery_callback(self, callback: Callable[[RecoveryEvent], None]) -> None:
379
407
  """Add a callback to be notified of recovery events."""
380
408
  self.recovery_callbacks.append(callback)
381
409
  self.logger.debug(f"Added recovery callback: {callback.__name__}")
382
-
383
- def handle_health_result(self, health_result: HealthCheckResult) -> Optional[RecoveryEvent]:
410
+
411
+ def handle_health_result(
412
+ self, health_result: HealthCheckResult
413
+ ) -> Optional[RecoveryEvent]:
384
414
  """Handle health check result and trigger recovery if needed.
385
-
415
+
386
416
  Args:
387
417
  health_result: Health check result to evaluate
388
-
418
+
389
419
  Returns:
390
420
  RecoveryEvent if recovery was triggered, None otherwise
391
421
  """
392
422
  if not self.enabled:
393
423
  return None
394
-
424
+
395
425
  if self.recovery_in_progress:
396
426
  self.logger.debug("Recovery already in progress, skipping")
397
427
  return None
398
-
428
+
399
429
  # Check if recovery should be triggered
400
430
  if not self.recovery_strategy.should_recover(health_result):
401
431
  return None
402
-
432
+
403
433
  # Check circuit breaker
404
434
  if not self.circuit_breaker.can_proceed():
405
435
  self.logger.warning("Recovery suppressed by circuit breaker")
406
436
  return None
407
-
437
+
408
438
  # Determine recovery action
409
439
  action = self.recovery_strategy.get_recovery_action(health_result)
410
-
440
+
411
441
  if action == RecoveryAction.NONE:
412
442
  return None
413
-
443
+
414
444
  # Trigger recovery
415
- return asyncio.create_task(self._perform_recovery(action, health_result, "health_check"))
416
-
417
- async def _perform_recovery(self, action: RecoveryAction,
418
- health_result: HealthCheckResult,
419
- trigger: str) -> RecoveryEvent:
445
+ return asyncio.create_task(
446
+ self._perform_recovery(action, health_result, "health_check")
447
+ )
448
+
449
+ async def _perform_recovery(
450
+ self, action: RecoveryAction, health_result: HealthCheckResult, trigger: str
451
+ ) -> RecoveryEvent:
420
452
  """Perform recovery action and record the event.
421
-
453
+
422
454
  Args:
423
455
  action: Recovery action to perform
424
456
  health_result: Health result that triggered recovery
425
457
  trigger: Description of what triggered the recovery
426
-
458
+
427
459
  Returns:
428
460
  RecoveryEvent record of the recovery attempt
429
461
  """
430
462
  if self.recovery_in_progress:
431
463
  raise RuntimeError("Recovery already in progress")
432
-
464
+
433
465
  self.recovery_in_progress = True
434
466
  start_time = time.time()
435
467
  success = False
436
468
  error_message = None
437
-
469
+
438
470
  try:
439
- self.logger.info(f"Starting recovery action: {action.value} (trigger: {trigger})")
440
-
471
+ self.logger.info(
472
+ f"Starting recovery action: {action.value} (trigger: {trigger})"
473
+ )
474
+
441
475
  if action == RecoveryAction.LOG_WARNING:
442
476
  success = await self._log_warning(health_result)
443
477
  elif action == RecoveryAction.CLEAR_CONNECTIONS:
@@ -449,16 +483,16 @@ class RecoveryManager:
449
483
  else:
450
484
  error_message = f"Unknown recovery action: {action}"
451
485
  self.logger.error(error_message)
452
-
486
+
453
487
  except Exception as e:
454
488
  error_message = f"Recovery action failed: {e}"
455
489
  self.logger.error(error_message)
456
490
  success = False
457
-
491
+
458
492
  finally:
459
493
  self.recovery_in_progress = False
460
494
  duration_ms = (time.time() - start_time) * PerformanceConfig.SECONDS_TO_MS
461
-
495
+
462
496
  # Create recovery event
463
497
  event = RecoveryEvent(
464
498
  timestamp=start_time,
@@ -467,210 +501,232 @@ class RecoveryManager:
467
501
  health_status=health_result.overall_status,
468
502
  success=success,
469
503
  duration_ms=duration_ms,
470
- error_message=error_message
504
+ error_message=error_message,
471
505
  )
472
-
506
+
473
507
  # Update statistics
474
508
  self._update_recovery_stats(event)
475
-
509
+
476
510
  # Record in circuit breaker
477
511
  if success:
478
512
  self.circuit_breaker.record_success()
479
513
  else:
480
514
  self.circuit_breaker.record_failure()
481
-
515
+
482
516
  # Store event
483
517
  self.recovery_history.append(event)
484
518
  self.last_recovery_time = start_time
485
519
  self.recovery_count += 1
486
-
520
+
487
521
  # Notify callbacks
488
522
  for callback in self.recovery_callbacks:
489
523
  try:
490
524
  callback(event)
491
525
  except Exception as e:
492
- self.logger.error(f"Recovery callback {callback.__name__} failed: {e}")
493
-
526
+ self.logger.error(
527
+ f"Recovery callback {callback.__name__} failed: {e}"
528
+ )
529
+
494
530
  result_msg = "succeeded" if success else "failed"
495
- self.logger.info(f"Recovery action {action.value} {result_msg} in {duration_ms:.2f}ms")
496
-
531
+ self.logger.info(
532
+ f"Recovery action {action.value} {result_msg} in {duration_ms:.2f}ms"
533
+ )
534
+
497
535
  return event
498
-
536
+
499
537
  async def _log_warning(self, health_result: HealthCheckResult) -> bool:
500
538
  """Log a warning about health issues."""
501
539
  try:
502
- warning_metrics = [m for m in health_result.metrics if m.status == HealthStatus.WARNING]
503
- critical_metrics = [m for m in health_result.metrics if m.status == HealthStatus.CRITICAL]
504
-
505
- self.logger.warning(f"Health warning detected: {len(warning_metrics)} warning metrics, "
506
- f"{len(critical_metrics)} critical metrics")
507
-
540
+ warning_metrics = [
541
+ m for m in health_result.metrics if m.status == HealthStatus.WARNING
542
+ ]
543
+ critical_metrics = [
544
+ m for m in health_result.metrics if m.status == HealthStatus.CRITICAL
545
+ ]
546
+
547
+ self.logger.warning(
548
+ f"Health warning detected: {len(warning_metrics)} warning metrics, "
549
+ f"{len(critical_metrics)} critical metrics"
550
+ )
551
+
508
552
  for metric in warning_metrics + critical_metrics:
509
- self.logger.warning(f" {metric.name}: {metric.value} ({metric.status.value}) - {metric.message}")
510
-
553
+ self.logger.warning(
554
+ f" {metric.name}: {metric.value} ({metric.status.value}) - {metric.message}"
555
+ )
556
+
511
557
  return True
512
558
  except Exception as e:
513
559
  self.logger.error(f"Failed to log warning: {e}")
514
560
  return False
515
-
561
+
516
562
  async def _clear_connections(self) -> bool:
517
563
  """Clear all client connections to reset connection state."""
518
564
  try:
519
- if not self.server_instance or not hasattr(self.server_instance, 'sio'):
520
- self.logger.warning("No server instance available for connection clearing")
565
+ if not self.server_instance or not hasattr(self.server_instance, "sio"):
566
+ self.logger.warning(
567
+ "No server instance available for connection clearing"
568
+ )
521
569
  return False
522
-
570
+
523
571
  sio = self.server_instance.sio
524
572
  if not sio:
525
573
  self.logger.warning("Socket.IO instance not available")
526
574
  return False
527
-
575
+
528
576
  # Get current clients
529
- clients = list(self.server_instance.clients) if hasattr(self.server_instance, 'clients') else []
530
-
577
+ clients = (
578
+ list(self.server_instance.clients)
579
+ if hasattr(self.server_instance, "clients")
580
+ else []
581
+ )
582
+
531
583
  self.logger.info(f"Clearing {len(clients)} client connections")
532
-
584
+
533
585
  # Disconnect all clients
534
586
  for client_id in clients:
535
587
  try:
536
588
  await sio.disconnect(client_id)
537
589
  except Exception as e:
538
590
  self.logger.warning(f"Failed to disconnect client {client_id}: {e}")
539
-
591
+
540
592
  # Clear client tracking
541
- if hasattr(self.server_instance, 'clients'):
593
+ if hasattr(self.server_instance, "clients"):
542
594
  self.server_instance.clients.clear()
543
- if hasattr(self.server_instance, 'client_versions'):
595
+ if hasattr(self.server_instance, "client_versions"):
544
596
  self.server_instance.client_versions.clear()
545
-
597
+
546
598
  self.logger.info("Client connections cleared successfully")
547
599
  return True
548
-
600
+
549
601
  except Exception as e:
550
602
  self.logger.error(f"Failed to clear connections: {e}")
551
603
  return False
552
-
604
+
553
605
  async def _restart_service(self) -> bool:
554
606
  """Restart the Socket.IO service."""
555
607
  try:
556
608
  if not self.server_instance:
557
609
  self.logger.error("No server instance available for restart")
558
610
  return False
559
-
611
+
560
612
  self.logger.info("Attempting graceful service restart")
561
-
613
+
562
614
  # Save current configuration
563
- host = getattr(self.server_instance, 'host', 'localhost')
564
- port = getattr(self.server_instance, 'port', 8765)
565
-
615
+ host = getattr(self.server_instance, "host", "localhost")
616
+ port = getattr(self.server_instance, "port", 8765)
617
+
566
618
  # Stop current server
567
619
  try:
568
620
  await self.server_instance._shutdown_async()
569
621
  self.logger.info("Server shutdown completed")
570
622
  except Exception as e:
571
623
  self.logger.warning(f"Error during shutdown: {e}")
572
-
624
+
573
625
  # Wait a moment for cleanup
574
626
  await asyncio.sleep(1)
575
-
627
+
576
628
  # Restart server
577
629
  await self.server_instance.start_async()
578
630
  self.logger.info("Server restart completed successfully")
579
-
631
+
580
632
  return True
581
-
633
+
582
634
  except Exception as e:
583
635
  self.logger.error(f"Failed to restart service: {e}")
584
636
  return False
585
-
637
+
586
638
  async def _emergency_stop(self) -> bool:
587
639
  """Perform emergency stop of the service."""
588
640
  try:
589
- self.logger.critical("Performing emergency stop due to critical health issues")
590
-
641
+ self.logger.critical(
642
+ "Performing emergency stop due to critical health issues"
643
+ )
644
+
591
645
  if self.server_instance:
592
646
  try:
593
647
  # Force immediate shutdown
594
648
  await self.server_instance._shutdown_async()
595
649
  except Exception as e:
596
650
  self.logger.error(f"Error during emergency shutdown: {e}")
597
-
651
+
598
652
  # Send termination signal to process
599
653
  try:
600
654
  os.kill(os.getpid(), signal.SIGTERM)
601
655
  except Exception as e:
602
656
  self.logger.error(f"Failed to send termination signal: {e}")
603
657
  return False
604
-
658
+
605
659
  return True
606
-
660
+
607
661
  except Exception as e:
608
662
  self.logger.error(f"Emergency stop failed: {e}")
609
663
  return False
610
-
664
+
611
665
  def _update_recovery_stats(self, event: RecoveryEvent) -> None:
612
666
  """Update recovery statistics with new event."""
613
- self.recovery_stats['total_recoveries'] += 1
614
-
667
+ self.recovery_stats["total_recoveries"] += 1
668
+
615
669
  if event.success:
616
- self.recovery_stats['successful_recoveries'] += 1
670
+ self.recovery_stats["successful_recoveries"] += 1
617
671
  else:
618
- self.recovery_stats['failed_recoveries'] += 1
619
-
620
- self.recovery_stats['actions_performed'][event.action.value] += 1
621
-
672
+ self.recovery_stats["failed_recoveries"] += 1
673
+
674
+ self.recovery_stats["actions_performed"][event.action.value] += 1
675
+
622
676
  # Update average duration
623
- total_recoveries = self.recovery_stats['total_recoveries']
624
- current_avg = self.recovery_stats['average_recovery_duration_ms']
625
- self.recovery_stats['average_recovery_duration_ms'] = (
626
- (current_avg * (total_recoveries - 1) + event.duration_ms) / total_recoveries
627
- )
628
-
677
+ total_recoveries = self.recovery_stats["total_recoveries"]
678
+ current_avg = self.recovery_stats["average_recovery_duration_ms"]
679
+ self.recovery_stats["average_recovery_duration_ms"] = (
680
+ current_avg * (total_recoveries - 1) + event.duration_ms
681
+ ) / total_recoveries
682
+
629
683
  def get_recovery_status(self) -> Dict[str, Any]:
630
684
  """Get comprehensive recovery manager status."""
631
685
  return {
632
- 'enabled': self.enabled,
633
- 'recovery_in_progress': self.recovery_in_progress,
634
- 'last_recovery_time': self.last_recovery_time,
635
- 'recovery_count': self.recovery_count,
636
- 'strategy': self.recovery_strategy.get_name(),
637
- 'circuit_breaker': self.circuit_breaker.get_status(),
638
- 'recovery_stats': dict(self.recovery_stats),
639
- 'recent_recoveries': [event.to_dict() for event in list(self.recovery_history)[-10:]],
640
- 'config': {
641
- 'check_interval': self.check_interval,
642
- 'max_recovery_attempts': self.max_recovery_attempts,
643
- 'recovery_timeout': self.recovery_timeout
644
- }
686
+ "enabled": self.enabled,
687
+ "recovery_in_progress": self.recovery_in_progress,
688
+ "last_recovery_time": self.last_recovery_time,
689
+ "recovery_count": self.recovery_count,
690
+ "strategy": self.recovery_strategy.get_name(),
691
+ "circuit_breaker": self.circuit_breaker.get_status(),
692
+ "recovery_stats": dict(self.recovery_stats),
693
+ "recent_recoveries": [
694
+ event.to_dict() for event in list(self.recovery_history)[-10:]
695
+ ],
696
+ "config": {
697
+ "check_interval": self.check_interval,
698
+ "max_recovery_attempts": self.max_recovery_attempts,
699
+ "recovery_timeout": self.recovery_timeout,
700
+ },
645
701
  }
646
-
702
+
647
703
  def get_recovery_history(self, limit: Optional[int] = None) -> List[RecoveryEvent]:
648
704
  """Get recovery event history.
649
-
705
+
650
706
  Args:
651
707
  limit: Maximum number of events to return
652
-
708
+
653
709
  Returns:
654
710
  List of recovery events, newest first
655
711
  """
656
712
  history = list(self.recovery_history)
657
713
  history.reverse() # Newest first
658
-
714
+
659
715
  if limit:
660
716
  history = history[:limit]
661
-
717
+
662
718
  return history
663
-
719
+
664
720
  def is_enabled(self) -> bool:
665
721
  """Check if recovery manager is enabled."""
666
722
  return self.enabled
667
-
723
+
668
724
  def enable(self) -> None:
669
725
  """Enable recovery manager."""
670
726
  self.enabled = True
671
727
  self.logger.info("Recovery manager enabled")
672
-
728
+
673
729
  def disable(self) -> None:
674
730
  """Disable recovery manager."""
675
731
  self.enabled = False
676
- self.logger.info("Recovery manager disabled")
732
+ self.logger.info("Recovery manager disabled")