moai-adk 0.35.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of moai-adk might be problematic. Click here for more details.

Files changed (502)
  1. moai_adk/__init__.py +10 -0
  2. moai_adk/__main__.py +199 -0
  3. moai_adk/cli/__init__.py +6 -0
  4. moai_adk/cli/commands/__init__.py +17 -0
  5. moai_adk/cli/commands/analyze.py +116 -0
  6. moai_adk/cli/commands/doctor.py +272 -0
  7. moai_adk/cli/commands/init.py +372 -0
  8. moai_adk/cli/commands/language.py +248 -0
  9. moai_adk/cli/commands/status.py +104 -0
  10. moai_adk/cli/commands/update.py +2686 -0
  11. moai_adk/cli/main.py +13 -0
  12. moai_adk/cli/prompts/__init__.py +5 -0
  13. moai_adk/cli/prompts/init_prompts.py +219 -0
  14. moai_adk/cli/spec_status.py +263 -0
  15. moai_adk/cli/ui/__init__.py +44 -0
  16. moai_adk/cli/ui/progress.py +422 -0
  17. moai_adk/cli/ui/prompts.py +389 -0
  18. moai_adk/cli/ui/theme.py +129 -0
  19. moai_adk/cli/worktree/__init__.py +27 -0
  20. moai_adk/cli/worktree/__main__.py +31 -0
  21. moai_adk/cli/worktree/cli.py +683 -0
  22. moai_adk/cli/worktree/exceptions.py +89 -0
  23. moai_adk/cli/worktree/manager.py +493 -0
  24. moai_adk/cli/worktree/models.py +65 -0
  25. moai_adk/cli/worktree/registry.py +422 -0
  26. moai_adk/core/PHASE2_OPTIMIZATIONS.md +467 -0
  27. moai_adk/core/__init__.py +1 -0
  28. moai_adk/core/analysis/__init__.py +9 -0
  29. moai_adk/core/analysis/session_analyzer.py +400 -0
  30. moai_adk/core/claude_integration.py +393 -0
  31. moai_adk/core/command_helpers.py +270 -0
  32. moai_adk/core/comprehensive_monitoring_system.py +1183 -0
  33. moai_adk/core/config/__init__.py +19 -0
  34. moai_adk/core/config/auto_spec_config.py +340 -0
  35. moai_adk/core/config/migration.py +244 -0
  36. moai_adk/core/config/unified.py +436 -0
  37. moai_adk/core/context_manager.py +273 -0
  38. moai_adk/core/diagnostics/__init__.py +19 -0
  39. moai_adk/core/diagnostics/slash_commands.py +159 -0
  40. moai_adk/core/enterprise_features.py +1404 -0
  41. moai_adk/core/error_recovery_system.py +1902 -0
  42. moai_adk/core/event_driven_hook_system.py +1371 -0
  43. moai_adk/core/git/__init__.py +31 -0
  44. moai_adk/core/git/branch.py +25 -0
  45. moai_adk/core/git/branch_manager.py +129 -0
  46. moai_adk/core/git/checkpoint.py +134 -0
  47. moai_adk/core/git/commit.py +67 -0
  48. moai_adk/core/git/conflict_detector.py +413 -0
  49. moai_adk/core/git/event_detector.py +79 -0
  50. moai_adk/core/git/manager.py +216 -0
  51. moai_adk/core/hooks/post_tool_auto_spec_completion.py +901 -0
  52. moai_adk/core/input_validation_middleware.py +1006 -0
  53. moai_adk/core/integration/__init__.py +22 -0
  54. moai_adk/core/integration/engine.py +157 -0
  55. moai_adk/core/integration/integration_tester.py +226 -0
  56. moai_adk/core/integration/models.py +88 -0
  57. moai_adk/core/integration/utils.py +211 -0
  58. moai_adk/core/issue_creator.py +305 -0
  59. moai_adk/core/jit_context_loader.py +956 -0
  60. moai_adk/core/jit_enhanced_hook_manager.py +1987 -0
  61. moai_adk/core/language_config.py +202 -0
  62. moai_adk/core/language_config_resolver.py +572 -0
  63. moai_adk/core/language_validator.py +543 -0
  64. moai_adk/core/mcp/setup.py +116 -0
  65. moai_adk/core/merge/__init__.py +9 -0
  66. moai_adk/core/merge/analyzer.py +605 -0
  67. moai_adk/core/migration/__init__.py +18 -0
  68. moai_adk/core/migration/alfred_to_moai_migrator.py +383 -0
  69. moai_adk/core/migration/backup_manager.py +277 -0
  70. moai_adk/core/migration/custom_element_scanner.py +358 -0
  71. moai_adk/core/migration/file_migrator.py +209 -0
  72. moai_adk/core/migration/interactive_checkbox_ui.py +488 -0
  73. moai_adk/core/migration/selective_restorer.py +470 -0
  74. moai_adk/core/migration/template_utils.py +74 -0
  75. moai_adk/core/migration/user_selection_ui.py +338 -0
  76. moai_adk/core/migration/version_detector.py +139 -0
  77. moai_adk/core/migration/version_migrator.py +228 -0
  78. moai_adk/core/performance/__init__.py +6 -0
  79. moai_adk/core/performance/cache_system.py +316 -0
  80. moai_adk/core/performance/parallel_processor.py +116 -0
  81. moai_adk/core/phase_optimized_hook_scheduler.py +879 -0
  82. moai_adk/core/project/__init__.py +1 -0
  83. moai_adk/core/project/backup_utils.py +70 -0
  84. moai_adk/core/project/checker.py +300 -0
  85. moai_adk/core/project/detector.py +293 -0
  86. moai_adk/core/project/initializer.py +387 -0
  87. moai_adk/core/project/phase_executor.py +716 -0
  88. moai_adk/core/project/validator.py +139 -0
  89. moai_adk/core/quality/__init__.py +6 -0
  90. moai_adk/core/quality/trust_checker.py +377 -0
  91. moai_adk/core/quality/validators/__init__.py +6 -0
  92. moai_adk/core/quality/validators/base_validator.py +19 -0
  93. moai_adk/core/realtime_monitoring_dashboard.py +1724 -0
  94. moai_adk/core/robust_json_parser.py +611 -0
  95. moai_adk/core/rollback_manager.py +918 -0
  96. moai_adk/core/session_manager.py +651 -0
  97. moai_adk/core/skill_loading_system.py +579 -0
  98. moai_adk/core/spec/confidence_scoring.py +680 -0
  99. moai_adk/core/spec/ears_template_engine.py +1247 -0
  100. moai_adk/core/spec/quality_validator.py +687 -0
  101. moai_adk/core/spec_status_manager.py +478 -0
  102. moai_adk/core/template/__init__.py +7 -0
  103. moai_adk/core/template/backup.py +174 -0
  104. moai_adk/core/template/config.py +191 -0
  105. moai_adk/core/template/languages.py +43 -0
  106. moai_adk/core/template/merger.py +233 -0
  107. moai_adk/core/template/processor.py +1200 -0
  108. moai_adk/core/template_engine.py +310 -0
  109. moai_adk/core/template_variable_synchronizer.py +417 -0
  110. moai_adk/core/unified_permission_manager.py +745 -0
  111. moai_adk/core/user_behavior_analytics.py +851 -0
  112. moai_adk/core/version_sync.py +429 -0
  113. moai_adk/foundation/__init__.py +56 -0
  114. moai_adk/foundation/backend.py +1027 -0
  115. moai_adk/foundation/database.py +1115 -0
  116. moai_adk/foundation/devops.py +1585 -0
  117. moai_adk/foundation/ears.py +431 -0
  118. moai_adk/foundation/frontend.py +870 -0
  119. moai_adk/foundation/git/commit_templates.py +557 -0
  120. moai_adk/foundation/git.py +376 -0
  121. moai_adk/foundation/langs.py +484 -0
  122. moai_adk/foundation/ml_ops.py +1162 -0
  123. moai_adk/foundation/testing.py +1524 -0
  124. moai_adk/foundation/trust/trust_principles.py +676 -0
  125. moai_adk/foundation/trust/validation_checklist.py +1573 -0
  126. moai_adk/project/__init__.py +0 -0
  127. moai_adk/project/configuration.py +1084 -0
  128. moai_adk/project/documentation.py +566 -0
  129. moai_adk/project/schema.py +447 -0
  130. moai_adk/statusline/__init__.py +38 -0
  131. moai_adk/statusline/alfred_detector.py +105 -0
  132. moai_adk/statusline/config.py +376 -0
  133. moai_adk/statusline/enhanced_output_style_detector.py +372 -0
  134. moai_adk/statusline/git_collector.py +190 -0
  135. moai_adk/statusline/main.py +322 -0
  136. moai_adk/statusline/metrics_tracker.py +78 -0
  137. moai_adk/statusline/renderer.py +343 -0
  138. moai_adk/statusline/update_checker.py +129 -0
  139. moai_adk/statusline/version_reader.py +741 -0
  140. moai_adk/templates/.claude/agents/moai/ai-nano-banana.md +714 -0
  141. moai_adk/templates/.claude/agents/moai/builder-agent.md +474 -0
  142. moai_adk/templates/.claude/agents/moai/builder-command.md +1172 -0
  143. moai_adk/templates/.claude/agents/moai/builder-plugin.md +637 -0
  144. moai_adk/templates/.claude/agents/moai/builder-skill.md +666 -0
  145. moai_adk/templates/.claude/agents/moai/expert-backend.md +899 -0
  146. moai_adk/templates/.claude/agents/moai/expert-database.md +777 -0
  147. moai_adk/templates/.claude/agents/moai/expert-debug.md +401 -0
  148. moai_adk/templates/.claude/agents/moai/expert-devops.md +720 -0
  149. moai_adk/templates/.claude/agents/moai/expert-frontend.md +734 -0
  150. moai_adk/templates/.claude/agents/moai/expert-performance.md +657 -0
  151. moai_adk/templates/.claude/agents/moai/expert-security.md +513 -0
  152. moai_adk/templates/.claude/agents/moai/expert-testing.md +733 -0
  153. moai_adk/templates/.claude/agents/moai/expert-uiux.md +1041 -0
  154. moai_adk/templates/.claude/agents/moai/manager-claude-code.md +432 -0
  155. moai_adk/templates/.claude/agents/moai/manager-docs.md +573 -0
  156. moai_adk/templates/.claude/agents/moai/manager-git.md +1060 -0
  157. moai_adk/templates/.claude/agents/moai/manager-project.md +891 -0
  158. moai_adk/templates/.claude/agents/moai/manager-quality.md +624 -0
  159. moai_adk/templates/.claude/agents/moai/manager-spec.md +809 -0
  160. moai_adk/templates/.claude/agents/moai/manager-strategy.md +780 -0
  161. moai_adk/templates/.claude/agents/moai/manager-tdd.md +784 -0
  162. moai_adk/templates/.claude/agents/moai/mcp-context7.md +458 -0
  163. moai_adk/templates/.claude/agents/moai/mcp-figma.md +1607 -0
  164. moai_adk/templates/.claude/agents/moai/mcp-notion.md +789 -0
  165. moai_adk/templates/.claude/agents/moai/mcp-playwright.md +469 -0
  166. moai_adk/templates/.claude/agents/moai/mcp-sequential-thinking.md +1032 -0
  167. moai_adk/templates/.claude/commands/moai/0-project.md +1386 -0
  168. moai_adk/templates/.claude/commands/moai/1-plan.md +1427 -0
  169. moai_adk/templates/.claude/commands/moai/2-run.md +943 -0
  170. moai_adk/templates/.claude/commands/moai/3-sync.md +1324 -0
  171. moai_adk/templates/.claude/commands/moai/9-feedback.md +314 -0
  172. moai_adk/templates/.claude/hooks/__init__.py +8 -0
  173. moai_adk/templates/.claude/hooks/moai/__init__.py +8 -0
  174. moai_adk/templates/.claude/hooks/moai/lib/__init__.py +85 -0
  175. moai_adk/templates/.claude/hooks/moai/lib/checkpoint.py +244 -0
  176. moai_adk/templates/.claude/hooks/moai/lib/common.py +131 -0
  177. moai_adk/templates/.claude/hooks/moai/lib/config_manager.py +446 -0
  178. moai_adk/templates/.claude/hooks/moai/lib/config_validator.py +639 -0
  179. moai_adk/templates/.claude/hooks/moai/lib/example_config.json +104 -0
  180. moai_adk/templates/.claude/hooks/moai/lib/git_operations_manager.py +590 -0
  181. moai_adk/templates/.claude/hooks/moai/lib/language_validator.py +317 -0
  182. moai_adk/templates/.claude/hooks/moai/lib/models.py +102 -0
  183. moai_adk/templates/.claude/hooks/moai/lib/path_utils.py +28 -0
  184. moai_adk/templates/.claude/hooks/moai/lib/project.py +768 -0
  185. moai_adk/templates/.claude/hooks/moai/lib/test_hooks_improvements.py +443 -0
  186. moai_adk/templates/.claude/hooks/moai/lib/timeout.py +160 -0
  187. moai_adk/templates/.claude/hooks/moai/lib/unified_timeout_manager.py +530 -0
  188. moai_adk/templates/.claude/hooks/moai/session_end__auto_cleanup.py +862 -0
  189. moai_adk/templates/.claude/hooks/moai/session_start__show_project_info.py +1083 -0
  190. moai_adk/templates/.claude/output-styles/moai/r2d2.md +560 -0
  191. moai_adk/templates/.claude/output-styles/moai/yoda.md +359 -0
  192. moai_adk/templates/.claude/settings.json +172 -0
  193. moai_adk/templates/.claude/skills/moai-ai-nano-banana/SKILL.md +307 -0
  194. moai_adk/templates/.claude/skills/moai-ai-nano-banana/examples.md +431 -0
  195. moai_adk/templates/.claude/skills/moai-ai-nano-banana/scripts/batch_generate.py +560 -0
  196. moai_adk/templates/.claude/skills/moai-ai-nano-banana/scripts/generate_image.py +362 -0
  197. moai_adk/templates/.claude/skills/moai-docs-generation/SKILL.md +249 -0
  198. moai_adk/templates/.claude/skills/moai-docs-generation/examples.md +406 -0
  199. moai_adk/templates/.claude/skills/moai-docs-generation/modules/README.md +44 -0
  200. moai_adk/templates/.claude/skills/moai-docs-generation/modules/api-documentation.md +130 -0
  201. moai_adk/templates/.claude/skills/moai-docs-generation/modules/code-documentation.md +152 -0
  202. moai_adk/templates/.claude/skills/moai-docs-generation/modules/multi-format-output.md +178 -0
  203. moai_adk/templates/.claude/skills/moai-docs-generation/modules/user-guides.md +147 -0
  204. moai_adk/templates/.claude/skills/moai-docs-generation/reference.md +328 -0
  205. moai_adk/templates/.claude/skills/moai-domain-backend/SKILL.md +320 -0
  206. moai_adk/templates/.claude/skills/moai-domain-backend/examples.md +718 -0
  207. moai_adk/templates/.claude/skills/moai-domain-backend/reference.md +464 -0
  208. moai_adk/templates/.claude/skills/moai-domain-database/SKILL.md +323 -0
  209. moai_adk/templates/.claude/skills/moai-domain-database/examples.md +830 -0
  210. moai_adk/templates/.claude/skills/moai-domain-database/modules/README.md +53 -0
  211. moai_adk/templates/.claude/skills/moai-domain-database/modules/mongodb.md +231 -0
  212. moai_adk/templates/.claude/skills/moai-domain-database/modules/postgresql.md +169 -0
  213. moai_adk/templates/.claude/skills/moai-domain-database/modules/redis.md +262 -0
  214. moai_adk/templates/.claude/skills/moai-domain-database/reference.md +545 -0
  215. moai_adk/templates/.claude/skills/moai-domain-frontend/SKILL.md +497 -0
  216. moai_adk/templates/.claude/skills/moai-domain-frontend/examples.md +968 -0
  217. moai_adk/templates/.claude/skills/moai-domain-frontend/reference.md +664 -0
  218. moai_adk/templates/.claude/skills/moai-domain-uiux/SKILL.md +455 -0
  219. moai_adk/templates/.claude/skills/moai-domain-uiux/examples.md +560 -0
  220. moai_adk/templates/.claude/skills/moai-domain-uiux/modules/accessibility-wcag.md +260 -0
  221. moai_adk/templates/.claude/skills/moai-domain-uiux/modules/component-architecture.md +228 -0
  222. moai_adk/templates/.claude/skills/moai-domain-uiux/modules/icon-libraries.md +401 -0
  223. moai_adk/templates/.claude/skills/moai-domain-uiux/modules/theming-system.md +373 -0
  224. moai_adk/templates/.claude/skills/moai-domain-uiux/reference.md +243 -0
  225. moai_adk/templates/.claude/skills/moai-formats-data/SKILL.md +492 -0
  226. moai_adk/templates/.claude/skills/moai-formats-data/examples.md +804 -0
  227. moai_adk/templates/.claude/skills/moai-formats-data/modules/README.md +98 -0
  228. moai_adk/templates/.claude/skills/moai-formats-data/modules/SKILL-MODULARIZATION-TEMPLATE.md +278 -0
  229. moai_adk/templates/.claude/skills/moai-formats-data/modules/caching-performance.md +459 -0
  230. moai_adk/templates/.claude/skills/moai-formats-data/modules/data-validation.md +485 -0
  231. moai_adk/templates/.claude/skills/moai-formats-data/modules/json-optimization.md +374 -0
  232. moai_adk/templates/.claude/skills/moai-formats-data/modules/toon-encoding.md +308 -0
  233. moai_adk/templates/.claude/skills/moai-formats-data/reference.md +585 -0
  234. moai_adk/templates/.claude/skills/moai-foundation-claude/SKILL.md +202 -0
  235. moai_adk/templates/.claude/skills/moai-foundation-claude/examples.md +732 -0
  236. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/best-practices-checklist.md +616 -0
  237. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-custom-slash-commands-official.md +729 -0
  238. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-hooks-official.md +560 -0
  239. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-iam-official.md +635 -0
  240. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-memory-official.md +543 -0
  241. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-settings-official.md +663 -0
  242. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-skills-official.md +113 -0
  243. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/claude-code-sub-agents-official.md +238 -0
  244. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/complete-configuration-guide.md +175 -0
  245. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/skill-examples.md +1674 -0
  246. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/skill-formatting-guide.md +729 -0
  247. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/sub-agents/sub-agent-examples.md +1513 -0
  248. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/sub-agents/sub-agent-formatting-guide.md +1086 -0
  249. moai_adk/templates/.claude/skills/moai-foundation-claude/reference/sub-agents/sub-agent-integration-patterns.md +1100 -0
  250. moai_adk/templates/.claude/skills/moai-foundation-claude/reference.md +209 -0
  251. moai_adk/templates/.claude/skills/moai-foundation-context/SKILL.md +441 -0
  252. moai_adk/templates/.claude/skills/moai-foundation-context/examples.md +1048 -0
  253. moai_adk/templates/.claude/skills/moai-foundation-context/reference.md +246 -0
  254. moai_adk/templates/.claude/skills/moai-foundation-core/SKILL.md +420 -0
  255. moai_adk/templates/.claude/skills/moai-foundation-core/examples.md +358 -0
  256. moai_adk/templates/.claude/skills/moai-foundation-core/modules/README.md +296 -0
  257. moai_adk/templates/.claude/skills/moai-foundation-core/modules/agents-reference.md +359 -0
  258. moai_adk/templates/.claude/skills/moai-foundation-core/modules/commands-reference.md +432 -0
  259. moai_adk/templates/.claude/skills/moai-foundation-core/modules/delegation-patterns.md +757 -0
  260. moai_adk/templates/.claude/skills/moai-foundation-core/modules/execution-rules.md +687 -0
  261. moai_adk/templates/.claude/skills/moai-foundation-core/modules/modular-system.md +665 -0
  262. moai_adk/templates/.claude/skills/moai-foundation-core/modules/progressive-disclosure.md +649 -0
  263. moai_adk/templates/.claude/skills/moai-foundation-core/modules/spec-first-tdd.md +864 -0
  264. moai_adk/templates/.claude/skills/moai-foundation-core/modules/token-optimization.md +708 -0
  265. moai_adk/templates/.claude/skills/moai-foundation-core/modules/trust-5-framework.md +981 -0
  266. moai_adk/templates/.claude/skills/moai-foundation-core/reference.md +478 -0
  267. moai_adk/templates/.claude/skills/moai-foundation-philosopher/SKILL.md +315 -0
  268. moai_adk/templates/.claude/skills/moai-foundation-philosopher/examples.md +228 -0
  269. moai_adk/templates/.claude/skills/moai-foundation-philosopher/modules/assumption-matrix.md +80 -0
  270. moai_adk/templates/.claude/skills/moai-foundation-philosopher/modules/cognitive-bias.md +199 -0
  271. moai_adk/templates/.claude/skills/moai-foundation-philosopher/modules/first-principles.md +140 -0
  272. moai_adk/templates/.claude/skills/moai-foundation-philosopher/modules/trade-off-analysis.md +154 -0
  273. moai_adk/templates/.claude/skills/moai-foundation-philosopher/reference.md +157 -0
  274. moai_adk/templates/.claude/skills/moai-foundation-quality/SKILL.md +364 -0
  275. moai_adk/templates/.claude/skills/moai-foundation-quality/examples.md +1232 -0
  276. moai_adk/templates/.claude/skills/moai-foundation-quality/modules/best-practices.md +261 -0
  277. moai_adk/templates/.claude/skills/moai-foundation-quality/modules/integration-patterns.md +194 -0
  278. moai_adk/templates/.claude/skills/moai-foundation-quality/modules/proactive-analysis.md +229 -0
  279. moai_adk/templates/.claude/skills/moai-foundation-quality/modules/trust5-validation.md +169 -0
  280. moai_adk/templates/.claude/skills/moai-foundation-quality/reference.md +1266 -0
  281. moai_adk/templates/.claude/skills/moai-foundation-quality/scripts/quality-gate.sh +668 -0
  282. moai_adk/templates/.claude/skills/moai-foundation-quality/templates/github-actions-quality.yml +481 -0
  283. moai_adk/templates/.claude/skills/moai-foundation-quality/templates/quality-config.yaml +519 -0
  284. moai_adk/templates/.claude/skills/moai-lang-cpp/SKILL.md +649 -0
  285. moai_adk/templates/.claude/skills/moai-lang-csharp/SKILL.md +478 -0
  286. moai_adk/templates/.claude/skills/moai-lang-elixir/SKILL.md +612 -0
  287. moai_adk/templates/.claude/skills/moai-lang-flutter/SKILL.md +477 -0
  288. moai_adk/templates/.claude/skills/moai-lang-flutter/examples.md +1090 -0
  289. moai_adk/templates/.claude/skills/moai-lang-flutter/reference.md +686 -0
  290. moai_adk/templates/.claude/skills/moai-lang-go/SKILL.md +376 -0
  291. moai_adk/templates/.claude/skills/moai-lang-go/examples.md +919 -0
  292. moai_adk/templates/.claude/skills/moai-lang-go/reference.md +737 -0
  293. moai_adk/templates/.claude/skills/moai-lang-java/SKILL.md +385 -0
  294. moai_adk/templates/.claude/skills/moai-lang-java/examples.md +864 -0
  295. moai_adk/templates/.claude/skills/moai-lang-java/reference.md +291 -0
  296. moai_adk/templates/.claude/skills/moai-lang-kotlin/SKILL.md +382 -0
  297. moai_adk/templates/.claude/skills/moai-lang-kotlin/examples.md +1006 -0
  298. moai_adk/templates/.claude/skills/moai-lang-kotlin/reference.md +562 -0
  299. moai_adk/templates/.claude/skills/moai-lang-php/SKILL.md +644 -0
  300. moai_adk/templates/.claude/skills/moai-lang-python/SKILL.md +481 -0
  301. moai_adk/templates/.claude/skills/moai-lang-python/examples.md +977 -0
  302. moai_adk/templates/.claude/skills/moai-lang-python/reference.md +804 -0
  303. moai_adk/templates/.claude/skills/moai-lang-r/SKILL.md +579 -0
  304. moai_adk/templates/.claude/skills/moai-lang-ruby/SKILL.md +687 -0
  305. moai_adk/templates/.claude/skills/moai-lang-rust/SKILL.md +372 -0
  306. moai_adk/templates/.claude/skills/moai-lang-rust/examples.md +659 -0
  307. moai_adk/templates/.claude/skills/moai-lang-rust/reference.md +504 -0
  308. moai_adk/templates/.claude/skills/moai-lang-scala/SKILL.md +497 -0
  309. moai_adk/templates/.claude/skills/moai-lang-scala/examples.md +633 -0
  310. moai_adk/templates/.claude/skills/moai-lang-scala/reference.md +423 -0
  311. moai_adk/templates/.claude/skills/moai-lang-swift/SKILL.md +497 -0
  312. moai_adk/templates/.claude/skills/moai-lang-swift/examples.md +918 -0
  313. moai_adk/templates/.claude/skills/moai-lang-swift/reference.md +672 -0
  314. moai_adk/templates/.claude/skills/moai-lang-typescript/SKILL.md +368 -0
  315. moai_adk/templates/.claude/skills/moai-lang-typescript/examples.md +1089 -0
  316. moai_adk/templates/.claude/skills/moai-lang-typescript/reference.md +731 -0
  317. moai_adk/templates/.claude/skills/moai-library-mermaid/SKILL.md +300 -0
  318. moai_adk/templates/.claude/skills/moai-library-mermaid/advanced-patterns.md +465 -0
  319. moai_adk/templates/.claude/skills/moai-library-mermaid/examples.md +270 -0
  320. moai_adk/templates/.claude/skills/moai-library-mermaid/optimization.md +440 -0
  321. moai_adk/templates/.claude/skills/moai-library-mermaid/reference.md +228 -0
  322. moai_adk/templates/.claude/skills/moai-library-nextra/SKILL.md +319 -0
  323. moai_adk/templates/.claude/skills/moai-library-nextra/advanced-patterns.md +336 -0
  324. moai_adk/templates/.claude/skills/moai-library-nextra/examples.md +592 -0
  325. moai_adk/templates/.claude/skills/moai-library-nextra/modules/advanced-deployment-patterns.md +182 -0
  326. moai_adk/templates/.claude/skills/moai-library-nextra/modules/advanced-patterns.md +17 -0
  327. moai_adk/templates/.claude/skills/moai-library-nextra/modules/configuration.md +57 -0
  328. moai_adk/templates/.claude/skills/moai-library-nextra/modules/content-architecture-optimization.md +162 -0
  329. moai_adk/templates/.claude/skills/moai-library-nextra/modules/deployment.md +52 -0
  330. moai_adk/templates/.claude/skills/moai-library-nextra/modules/framework-core-configuration.md +186 -0
  331. moai_adk/templates/.claude/skills/moai-library-nextra/modules/i18n-setup.md +55 -0
  332. moai_adk/templates/.claude/skills/moai-library-nextra/modules/mdx-components.md +52 -0
  333. moai_adk/templates/.claude/skills/moai-library-nextra/optimization.md +303 -0
  334. moai_adk/templates/.claude/skills/moai-library-nextra/reference.md +379 -0
  335. moai_adk/templates/.claude/skills/moai-library-shadcn/SKILL.md +372 -0
  336. moai_adk/templates/.claude/skills/moai-library-shadcn/examples.md +575 -0
  337. moai_adk/templates/.claude/skills/moai-library-shadcn/modules/advanced-patterns.md +394 -0
  338. moai_adk/templates/.claude/skills/moai-library-shadcn/modules/optimization.md +278 -0
  339. moai_adk/templates/.claude/skills/moai-library-shadcn/modules/shadcn-components.md +457 -0
  340. moai_adk/templates/.claude/skills/moai-library-shadcn/modules/shadcn-theming.md +373 -0
  341. moai_adk/templates/.claude/skills/moai-library-shadcn/reference.md +74 -0
  342. moai_adk/templates/.claude/skills/moai-mcp-figma/SKILL.md +402 -0
  343. moai_adk/templates/.claude/skills/moai-mcp-figma/advanced-patterns.md +607 -0
  344. moai_adk/templates/.claude/skills/moai-mcp-notion/SKILL.md +300 -0
  345. moai_adk/templates/.claude/skills/moai-mcp-notion/advanced-patterns.md +537 -0
  346. moai_adk/templates/.claude/skills/moai-platform-auth0/SKILL.md +291 -0
  347. moai_adk/templates/.claude/skills/moai-platform-clerk/SKILL.md +390 -0
  348. moai_adk/templates/.claude/skills/moai-platform-convex/SKILL.md +398 -0
  349. moai_adk/templates/.claude/skills/moai-platform-firebase-auth/SKILL.md +379 -0
  350. moai_adk/templates/.claude/skills/moai-platform-firestore/SKILL.md +358 -0
  351. moai_adk/templates/.claude/skills/moai-platform-neon/SKILL.md +467 -0
  352. moai_adk/templates/.claude/skills/moai-platform-railway/SKILL.md +377 -0
  353. moai_adk/templates/.claude/skills/moai-platform-supabase/SKILL.md +466 -0
  354. moai_adk/templates/.claude/skills/moai-platform-vercel/SKILL.md +482 -0
  355. moai_adk/templates/.claude/skills/moai-plugin-builder/SKILL.md +474 -0
  356. moai_adk/templates/.claude/skills/moai-plugin-builder/examples.md +621 -0
  357. moai_adk/templates/.claude/skills/moai-plugin-builder/migration.md +341 -0
  358. moai_adk/templates/.claude/skills/moai-plugin-builder/reference.md +463 -0
  359. moai_adk/templates/.claude/skills/moai-plugin-builder/validation.md +373 -0
  360. moai_adk/templates/.claude/skills/moai-security-auth0/SKILL.md +275 -0
  361. moai_adk/templates/.claude/skills/moai-security-auth0/modules/adaptive-mfa.md +233 -0
  362. moai_adk/templates/.claude/skills/moai-security-auth0/modules/akamai-integration.md +215 -0
  363. moai_adk/templates/.claude/skills/moai-security-auth0/modules/application-credentials.md +280 -0
  364. moai_adk/templates/.claude/skills/moai-security-auth0/modules/attack-protection-log-events.md +225 -0
  365. moai_adk/templates/.claude/skills/moai-security-auth0/modules/attack-protection-overview.md +140 -0
  366. moai_adk/templates/.claude/skills/moai-security-auth0/modules/bot-detection.md +144 -0
  367. moai_adk/templates/.claude/skills/moai-security-auth0/modules/breached-password-detection.md +187 -0
  368. moai_adk/templates/.claude/skills/moai-security-auth0/modules/brute-force-protection.md +189 -0
  369. moai_adk/templates/.claude/skills/moai-security-auth0/modules/certifications.md +282 -0
  370. moai_adk/templates/.claude/skills/moai-security-auth0/modules/compliance-overview.md +263 -0
  371. moai_adk/templates/.claude/skills/moai-security-auth0/modules/continuous-session-protection.md +307 -0
  372. moai_adk/templates/.claude/skills/moai-security-auth0/modules/customize-mfa.md +178 -0
  373. moai_adk/templates/.claude/skills/moai-security-auth0/modules/dpop-implementation.md +283 -0
  374. moai_adk/templates/.claude/skills/moai-security-auth0/modules/fapi-implementation.md +259 -0
  375. moai_adk/templates/.claude/skills/moai-security-auth0/modules/gdpr-compliance.md +313 -0
  376. moai_adk/templates/.claude/skills/moai-security-auth0/modules/guardian-configuration.md +269 -0
  377. moai_adk/templates/.claude/skills/moai-security-auth0/modules/highly-regulated-identity.md +272 -0
  378. moai_adk/templates/.claude/skills/moai-security-auth0/modules/jwt-fundamentals.md +248 -0
  379. moai_adk/templates/.claude/skills/moai-security-auth0/modules/mdl-verification.md +211 -0
  380. moai_adk/templates/.claude/skills/moai-security-auth0/modules/mfa-api-management.md +278 -0
  381. moai_adk/templates/.claude/skills/moai-security-auth0/modules/mfa-factors.md +226 -0
  382. moai_adk/templates/.claude/skills/moai-security-auth0/modules/mfa-overview.md +174 -0
  383. moai_adk/templates/.claude/skills/moai-security-auth0/modules/mtls-sender-constraining.md +316 -0
  384. moai_adk/templates/.claude/skills/moai-security-auth0/modules/ropg-flow-mfa.md +217 -0
  385. moai_adk/templates/.claude/skills/moai-security-auth0/modules/security-center.md +325 -0
  386. moai_adk/templates/.claude/skills/moai-security-auth0/modules/security-guidance.md +277 -0
  387. moai_adk/templates/.claude/skills/moai-security-auth0/modules/state-parameters.md +178 -0
  388. moai_adk/templates/.claude/skills/moai-security-auth0/modules/step-up-authentication.md +251 -0
  389. moai_adk/templates/.claude/skills/moai-security-auth0/modules/suspicious-ip-throttling.md +240 -0
  390. moai_adk/templates/.claude/skills/moai-security-auth0/modules/tenant-access-control.md +180 -0
  391. moai_adk/templates/.claude/skills/moai-security-auth0/modules/webauthn-fido.md +235 -0
  392. moai_adk/templates/.claude/skills/moai-workflow-jit-docs/SKILL.md +449 -0
  393. moai_adk/templates/.claude/skills/moai-workflow-jit-docs/advanced-patterns.md +379 -0
  394. moai_adk/templates/.claude/skills/moai-workflow-jit-docs/examples.md +544 -0
  395. moai_adk/templates/.claude/skills/moai-workflow-jit-docs/optimization.md +286 -0
  396. moai_adk/templates/.claude/skills/moai-workflow-jit-docs/reference.md +307 -0
  397. moai_adk/templates/.claude/skills/moai-workflow-project/README.md +190 -0
  398. moai_adk/templates/.claude/skills/moai-workflow-project/SKILL.md +390 -0
  399. moai_adk/templates/.claude/skills/moai-workflow-project/__init__.py +520 -0
  400. moai_adk/templates/.claude/skills/moai-workflow-project/complete_workflow_demo_fixed.py +574 -0
  401. moai_adk/templates/.claude/skills/moai-workflow-project/examples/complete_project_setup.py +317 -0
  402. moai_adk/templates/.claude/skills/moai-workflow-project/examples/complete_workflow_demo.py +663 -0
  403. moai_adk/templates/.claude/skills/moai-workflow-project/examples/config-migration-example.json +190 -0
  404. moai_adk/templates/.claude/skills/moai-workflow-project/examples/question-examples.json +175 -0
  405. moai_adk/templates/.claude/skills/moai-workflow-project/examples/quick_start.py +196 -0
  406. moai_adk/templates/.claude/skills/moai-workflow-project/examples.md +547 -0
  407. moai_adk/templates/.claude/skills/moai-workflow-project/modules/__init__.py +17 -0
  408. moai_adk/templates/.claude/skills/moai-workflow-project/modules/advanced-patterns.md +158 -0
  409. moai_adk/templates/.claude/skills/moai-workflow-project/modules/ask_user_integration.py +340 -0
  410. moai_adk/templates/.claude/skills/moai-workflow-project/modules/batch_questions.py +713 -0
  411. moai_adk/templates/.claude/skills/moai-workflow-project/modules/config_manager.py +538 -0
  412. moai_adk/templates/.claude/skills/moai-workflow-project/modules/documentation_manager.py +1336 -0
  413. moai_adk/templates/.claude/skills/moai-workflow-project/modules/language_initializer.py +730 -0
  414. moai_adk/templates/.claude/skills/moai-workflow-project/modules/migration_manager.py +608 -0
  415. moai_adk/templates/.claude/skills/moai-workflow-project/modules/template_optimizer.py +1005 -0
  416. moai_adk/templates/.claude/skills/moai-workflow-project/reference.md +275 -0
  417. moai_adk/templates/.claude/skills/moai-workflow-project/schemas/config-schema.json +316 -0
  418. moai_adk/templates/.claude/skills/moai-workflow-project/schemas/tab_schema.json +1434 -0
  419. moai_adk/templates/.claude/skills/moai-workflow-project/templates/config-template.json +71 -0
  420. moai_adk/templates/.claude/skills/moai-workflow-project/templates/doc-templates/product-template.md +44 -0
  421. moai_adk/templates/.claude/skills/moai-workflow-project/templates/doc-templates/structure-template.md +48 -0
  422. moai_adk/templates/.claude/skills/moai-workflow-project/templates/doc-templates/tech-template.md +92 -0
  423. moai_adk/templates/.claude/skills/moai-workflow-project/templates/question-templates/config-manager-setup.json +109 -0
  424. moai_adk/templates/.claude/skills/moai-workflow-project/templates/question-templates/language-initializer.json +228 -0
  425. moai_adk/templates/.claude/skills/moai-workflow-project/templates/question-templates/menu-project-config.json +130 -0
  426. moai_adk/templates/.claude/skills/moai-workflow-project/templates/question-templates/project-batch-questions.json +97 -0
  427. moai_adk/templates/.claude/skills/moai-workflow-project/templates/question-templates/spec-workflow-setup.json +150 -0
  428. moai_adk/templates/.claude/skills/moai-workflow-project/test_integration_simple.py +436 -0
  429. moai_adk/templates/.claude/skills/moai-workflow-spec/SKILL.md +534 -0
  430. moai_adk/templates/.claude/skills/moai-workflow-spec/examples.md +900 -0
  431. moai_adk/templates/.claude/skills/moai-workflow-spec/reference.md +704 -0
  432. moai_adk/templates/.claude/skills/moai-workflow-templates/SKILL.md +377 -0
  433. moai_adk/templates/.claude/skills/moai-workflow-templates/examples.md +552 -0
  434. moai_adk/templates/.claude/skills/moai-workflow-templates/modules/code-templates.md +124 -0
  435. moai_adk/templates/.claude/skills/moai-workflow-templates/modules/feedback-templates.md +100 -0
  436. moai_adk/templates/.claude/skills/moai-workflow-templates/modules/template-optimizer.md +138 -0
  437. moai_adk/templates/.claude/skills/moai-workflow-templates/reference.md +346 -0
  438. moai_adk/templates/.claude/skills/moai-workflow-testing/LICENSE.txt +202 -0
  439. moai_adk/templates/.claude/skills/moai-workflow-testing/SKILL.md +456 -0
  440. moai_adk/templates/.claude/skills/moai-workflow-testing/advanced-patterns.md +576 -0
  441. moai_adk/templates/.claude/skills/moai-workflow-testing/examples/ai-powered-testing.py +294 -0
  442. moai_adk/templates/.claude/skills/moai-workflow-testing/examples/console_logging.py +35 -0
  443. moai_adk/templates/.claude/skills/moai-workflow-testing/examples/element_discovery.py +40 -0
  444. moai_adk/templates/.claude/skills/moai-workflow-testing/examples/static_html_automation.py +34 -0
  445. moai_adk/templates/.claude/skills/moai-workflow-testing/examples.md +672 -0
  446. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/README.md +220 -0
  447. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/ai-debugging.md +845 -0
  448. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/automated-code-review.md +1416 -0
  449. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/performance-optimization.md +1234 -0
  450. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/smart-refactoring.md +1243 -0
  451. moai_adk/templates/.claude/skills/moai-workflow-testing/modules/tdd-context7.md +1260 -0
  452. moai_adk/templates/.claude/skills/moai-workflow-testing/optimization.md +505 -0
  453. moai_adk/templates/.claude/skills/moai-workflow-testing/reference/playwright-best-practices.md +57 -0
  454. moai_adk/templates/.claude/skills/moai-workflow-testing/reference.md +440 -0
  455. moai_adk/templates/.claude/skills/moai-workflow-testing/scripts/with_server.py +218 -0
  456. moai_adk/templates/.claude/skills/moai-workflow-testing/templates/alfred-integration.md +376 -0
  457. moai_adk/templates/.claude/skills/moai-workflow-testing/workflows/enterprise-testing-workflow.py +571 -0
  458. moai_adk/templates/.claude/skills/moai-worktree/SKILL.md +411 -0
  459. moai_adk/templates/.claude/skills/moai-worktree/examples.md +606 -0
  460. moai_adk/templates/.claude/skills/moai-worktree/modules/integration-patterns.md +982 -0
  461. moai_adk/templates/.claude/skills/moai-worktree/modules/parallel-development.md +778 -0
  462. moai_adk/templates/.claude/skills/moai-worktree/modules/worktree-commands.md +646 -0
  463. moai_adk/templates/.claude/skills/moai-worktree/modules/worktree-management.md +782 -0
  464. moai_adk/templates/.claude/skills/moai-worktree/reference.md +357 -0
  465. moai_adk/templates/.git-hooks/pre-commit +128 -0
  466. moai_adk/templates/.git-hooks/pre-push +365 -0
  467. moai_adk/templates/.github/workflows/ci-universal.yml +513 -0
  468. moai_adk/templates/.github/workflows/security-secrets-check.yml +179 -0
  469. moai_adk/templates/.github/workflows/spec-issue-sync.yml +337 -0
  470. moai_adk/templates/.gitignore +222 -0
  471. moai_adk/templates/.mcp.json +13 -0
  472. moai_adk/templates/.moai/config/config.yaml +58 -0
  473. moai_adk/templates/.moai/config/questions/_schema.yaml +174 -0
  474. moai_adk/templates/.moai/config/questions/tab0-init.yaml +251 -0
  475. moai_adk/templates/.moai/config/questions/tab1-user.yaml +107 -0
  476. moai_adk/templates/.moai/config/questions/tab2-project.yaml +79 -0
  477. moai_adk/templates/.moai/config/questions/tab3-git.yaml +632 -0
  478. moai_adk/templates/.moai/config/questions/tab4-quality.yaml +182 -0
  479. moai_adk/templates/.moai/config/questions/tab5-system.yaml +96 -0
  480. moai_adk/templates/.moai/config/sections/git-strategy.yaml +116 -0
  481. moai_adk/templates/.moai/config/sections/language.yaml +11 -0
  482. moai_adk/templates/.moai/config/sections/project.yaml +13 -0
  483. moai_adk/templates/.moai/config/sections/quality.yaml +17 -0
  484. moai_adk/templates/.moai/config/sections/system.yaml +24 -0
  485. moai_adk/templates/.moai/config/sections/user.yaml +5 -0
  486. moai_adk/templates/.moai/config/statusline-config.yaml +92 -0
  487. moai_adk/templates/.moai/scripts/setup-glm.py +136 -0
  488. moai_adk/templates/CLAUDE.md +642 -0
  489. moai_adk/utils/__init__.py +30 -0
  490. moai_adk/utils/banner.py +38 -0
  491. moai_adk/utils/common.py +294 -0
  492. moai_adk/utils/link_validator.py +241 -0
  493. moai_adk/utils/logger.py +147 -0
  494. moai_adk/utils/safe_file_reader.py +206 -0
  495. moai_adk/utils/timeout.py +160 -0
  496. moai_adk/utils/toon_utils.py +256 -0
  497. moai_adk/version.py +22 -0
  498. moai_adk-0.35.1.dist-info/METADATA +3018 -0
  499. moai_adk-0.35.1.dist-info/RECORD +502 -0
  500. moai_adk-0.35.1.dist-info/WHEEL +4 -0
  501. moai_adk-0.35.1.dist-info/entry_points.txt +3 -0
  502. moai_adk-0.35.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1902 @@
1
+ """
2
+ Comprehensive Error Recovery System - Phase 3 Enterprise Edition
3
+
4
+ Advanced enterprise-grade error recovery with automatic healing, data consistency checks,
5
+ rollback capabilities, state persistence, and self-healing mechanisms.
6
+
7
+ Phase 3 Features:
8
+ - Event-driven error recovery architecture
9
+ - Automatic system recovery from all failure modes
10
+ - Data consistency checks and repair mechanisms
11
+ - Comprehensive rollback and state persistence
12
+ - Self-healing capabilities with circuit breakers
13
+ - Dead letter queue handling for failed operations
14
+ - Multi-strategy recovery with exponential backoff
15
+ - Disaster recovery and business continuity
16
+ - Real-time failure mode analysis and prediction
17
+
18
+ Legacy Features:
19
+ - Error detection and classification
20
+ - Recovery procedures and fallback mechanisms
21
+ - Integration with research hooks, agents, and skills
22
+ - Documentation of error handling procedures
23
+ - Troubleshooting guides and automated recovery
24
+ - Multi-level error handling (critical, warning, info)
25
+ - Manual recovery procedures
26
+ - Error logging and tracking
27
+ - System health monitoring
28
+ - Emergency recovery procedures
29
+ """
30
+
31
+ import asyncio
32
+ import hashlib
33
+ import json
34
+ import logging
35
+ import os
36
+ import sys
37
+ import tempfile
38
+ import threading
39
+ import time
40
+ import traceback
41
+ import uuid
42
+ from collections import defaultdict, deque
43
+ from dataclasses import asdict, dataclass, field
44
+ from datetime import datetime, timedelta, timezone
45
+ from enum import Enum
46
+ from pathlib import Path
47
+ from typing import Any, Callable, Dict, List, Optional
48
+
49
# Configure comprehensive logging.
# Records at INFO and above are written both to a log file in the system
# temporary directory and to standard output.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler(Path(tempfile.gettempdir()).joinpath("moai_error_recovery.log")),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
60
+
61
+
62
class ErrorSeverity(Enum):
    """Severity levels attached to an error report, most to least urgent."""

    # System failure, immediate attention required
    CRITICAL = "critical"
    # Major functionality impacted
    HIGH = "high"
    # Partial functionality impacted
    MEDIUM = "medium"
    # Minor issue, can be deferred
    LOW = "low"
    # Informational message
    INFO = "info"
70
+
71
+
72
class ErrorCategory(Enum):
    """Classification buckets used to group errors by where they originate."""

    # System-level errors
    SYSTEM = "system"
    # Configuration errors
    CONFIGURATION = "configuration"
    # Research workflow errors
    RESEARCH = "research"
    # Integration errors
    INTEGRATION = "integration"
    # Agent/communication errors
    COMMUNICATION = "communication"
    # Validation errors
    VALIDATION = "validation"
    # Performance issues
    PERFORMANCE = "performance"
    # Resource exhaustion
    RESOURCE = "resource"
    # Network-related errors
    NETWORK = "network"
    # User input errors
    USER_INPUT = "user_input"
85
+
86
+
87
+ # Phase 3: Advanced Error Recovery Enums
88
+
89
+
90
class FailureMode(Enum):
    """Kinds of failure the recovery system distinguishes.

    Each member's value is the lower-case form of its name; that string is
    what ``FailureEvent.to_dict`` writes and ``FailureEvent.from_dict`` reads.
    """

    HOOK_EXECUTION_FAILURE = "hook_execution_failure"
    RESOURCE_EXHAUSTION = "resource_exhaustion"
    DATA_CORRUPTION = "data_corruption"
    NETWORK_FAILURE = "network_failure"
    SYSTEM_OVERLOAD = "system_overload"
    CONFIGURATION_ERROR = "configuration_error"
    TIMEOUT_FAILURE = "timeout_failure"
    MEMORY_LEAK = "memory_leak"
    DEADLOCK = "deadlock"
    AUTHENTICATION_FAILURE = "authentication_failure"
    VALIDATION_FAILURE = "validation_failure"
    EXTERNAL_SERVICE_FAILURE = "external_service_failure"
    STORAGE_FAILURE = "storage_failure"
    CONCURRENCY_ISSUE = "concurrency_issue"
    CIRCUIT_BREAKER_TRIPPED = "circuit_breaker_tripped"
    CASCADE_FAILURE = "cascade_failure"
109
+
110
+
111
class RecoveryStrategy(Enum):
    """Named strategies a recovery action can apply to a failure.

    Each member's value is the lower-case form of its name; it is the string
    emitted by ``AdvancedRecoveryAction.to_dict``.
    """

    RETRY_WITH_BACKOFF = "retry_with_backoff"
    CIRCUIT_BREAKER = "circuit_breaker"
    ROLLBACK = "rollback"
    FAILOVER = "failover"
    DEGRADE_SERVICE = "degrade_service"
    RESTART_COMPONENT = "restart_component"
    DATA_REPAIR = "data_repair"
    CLEAR_CACHE = "clear_cache"
    SCALE_RESOURCES = "scale_resources"
    NOTIFY_ADMIN = "notify_admin"
    QUARANTINE = "quarantine"
    IGNORE = "ignore"
    ISOLATE_COMPONENT = "isolate_component"
    EMERGENCY_STOP = "emergency_stop"
128
+
129
+
130
class ConsistencyLevel(Enum):
    """Data consistency guarantee recorded on a system snapshot."""

    # Immediate consistency
    STRONG = "strong"
    # Eventually consistent
    EVENTUAL = "eventual"
    # Weak consistency
    WEAK = "weak"
    # Custom consistency rules
    CUSTOM = "custom"
137
+
138
+
139
class RecoveryStatus(Enum):
    """Lifecycle states of a recovery operation.

    ``PENDING`` is the default status of a new ``AdvancedRecoveryAction``.
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    ROLLED_BACK = "rolled_back"
148
+
149
+
150
@dataclass
class ErrorReport:
    """Record describing one handled error and the outcome of its recovery.

    The first eight fields are required and positional; the recovery fields
    default to "not attempted / not successful / no resolution message".
    """

    # Identifier of this report; used as the key in the active-error mapping.
    id: str
    # When the error was recorded.
    timestamp: datetime
    severity: ErrorSeverity
    category: ErrorCategory
    # Human-readable error text.
    message: str
    # Structured details about the error.
    details: Dict[str, Any]
    # Formatted stack trace text, when one is available.
    stack_trace: Optional[str]
    # Caller-supplied context for the error.
    context: Dict[str, Any]
    # Recovery bookkeeping, updated after a recovery attempt.
    recovery_attempted: bool = False
    recovery_successful: bool = False
    resolution_message: Optional[str] = None
165
+
166
+
167
@dataclass
class RecoveryAction:
    """Definition of a registered recovery action.

    ``severity_filter`` and ``category_filter`` list the error severities and
    categories the action applies to. ``handler`` is the callable invoked to
    perform the recovery.
    """

    # Registry key for the action.
    name: str
    description: str
    # One of "automatic", "manual", "assisted".
    action_type: str
    severity_filter: List[ErrorSeverity]
    category_filter: List[ErrorCategory]
    handler: Callable
    # Optional tuning fields; defaults are no timeout, three attempts and no
    # success criteria.
    timeout: Optional[float] = None
    max_attempts: int = 3
    success_criteria: Optional[str] = None
180
+
181
+
182
@dataclass
class RecoveryResult:
    """Result of running a recovery action.

    ``details`` and ``next_actions`` are optional and default to ``None``
    (not to empty containers), so callers must check for ``None`` before
    using them. Their annotations are ``Optional[...]`` to match that default.
    """

    # Whether the recovery action succeeded.
    success: bool
    # Name of the action that produced this result.
    action_name: str
    # Human-readable outcome message.
    message: str
    # Elapsed time of the action, as a float.
    duration: float
    # Optional structured details about the outcome.
    details: Optional[Dict[str, Any]] = None
    # Optional names of follow-up actions.
    next_actions: Optional[List[str]] = None
192
+
193
+
194
+ # Phase 3: Advanced Error Recovery Dataclasses
195
+
196
+
197
@dataclass
class FailureEvent:
    """A single failure observed in the system.

    Instances can be converted to and from plain dictionaries with
    ``to_dict`` / ``from_dict``.
    """

    failure_id: str
    failure_mode: FailureMode
    timestamp: datetime
    component: str
    description: str
    # One of "low", "medium", "high", "critical".
    severity: str
    context: Dict[str, Any] = field(default_factory=dict)
    error_details: Optional[Dict[str, Any]] = None
    affected_operations: List[str] = field(default_factory=list)
    auto_recovery_eligible: bool = True
    retry_count: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    # For cascade failures.
    parent_failure_id: Optional[str] = None
    root_cause: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the event as a dictionary for serialization.

        The failure mode is written as its enum value and the timestamp as an
        ISO-8601 string; every other field is passed through unchanged (the
        container fields are not copied).
        """
        return dict(
            failure_id=self.failure_id,
            failure_mode=self.failure_mode.value,
            timestamp=self.timestamp.isoformat(),
            component=self.component,
            description=self.description,
            severity=self.severity,
            context=self.context,
            error_details=self.error_details,
            affected_operations=self.affected_operations,
            auto_recovery_eligible=self.auto_recovery_eligible,
            retry_count=self.retry_count,
            metadata=self.metadata,
            parent_failure_id=self.parent_failure_id,
            root_cause=self.root_cause,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "FailureEvent":
        """Build an event from a dictionary shaped like ``to_dict`` output.

        The six leading keys (``failure_id`` through ``severity``) are
        required and raise ``KeyError`` when missing; the remaining keys fall
        back to the same defaults the dataclass declares.
        """
        lookup = data.get
        return cls(
            failure_id=data["failure_id"],
            failure_mode=FailureMode(data["failure_mode"]),
            timestamp=datetime.fromisoformat(data["timestamp"]),
            component=data["component"],
            description=data["description"],
            severity=data["severity"],
            context=lookup("context", {}),
            error_details=lookup("error_details"),
            affected_operations=lookup("affected_operations", []),
            auto_recovery_eligible=lookup("auto_recovery_eligible", True),
            retry_count=lookup("retry_count", 0),
            metadata=lookup("metadata", {}),
            parent_failure_id=lookup("parent_failure_id"),
            root_cause=lookup("root_cause"),
        )
254
+
255
+
256
@dataclass
class AdvancedRecoveryAction:
    """A recovery action instance tied to one failure.

    Only ``action_id``, ``failure_id``, ``strategy`` and ``timestamp`` are
    required; everything else has a default.
    """

    action_id: str
    failure_id: str
    strategy: RecoveryStrategy
    timestamp: datetime
    status: RecoveryStatus = RecoveryStatus.PENDING
    description: str = ""
    parameters: Dict[str, Any] = field(default_factory=dict)
    execution_log: List[str] = field(default_factory=list)
    rollback_available: bool = True
    timeout_seconds: float = 300.0
    retry_attempts: int = 0
    max_retries: int = 3
    rollback_action_id: Optional[str] = None
    # Other actions this depends on.
    dependencies: List[str] = field(default_factory=list)
    # 1-10, lower number = higher priority.
    priority: int = 5

    def to_dict(self) -> Dict[str, Any]:
        """Return the action as a dictionary for serialization.

        ``strategy`` and ``status`` are written as their enum values and
        ``timestamp`` as an ISO-8601 string; other fields are passed through
        unchanged.
        """
        return dict(
            action_id=self.action_id,
            failure_id=self.failure_id,
            strategy=self.strategy.value,
            timestamp=self.timestamp.isoformat(),
            status=self.status.value,
            description=self.description,
            parameters=self.parameters,
            execution_log=self.execution_log,
            rollback_available=self.rollback_available,
            timeout_seconds=self.timeout_seconds,
            retry_attempts=self.retry_attempts,
            max_retries=self.max_retries,
            rollback_action_id=self.rollback_action_id,
            dependencies=self.dependencies,
            priority=self.priority,
        )
295
+
296
+
297
@dataclass
class SystemSnapshot:
    """A captured system state that can serve as a rollback target."""

    snapshot_id: str
    timestamp: datetime
    # Per-component state dictionaries, keyed by a string.
    component_states: Dict[str, Dict[str, Any]]
    configuration_hash: str
    # String-to-string mapping of checksums.
    data_checksums: Dict[str, str]
    metadata: Dict[str, Any] = field(default_factory=dict)
    parent_snapshot_id: Optional[str] = None
    is_rollback_point: bool = False
    description: str = ""
    consistency_level: ConsistencyLevel = ConsistencyLevel.EVENTUAL

    def to_dict(self) -> Dict[str, Any]:
        """Return the snapshot as a dictionary for serialization.

        ``timestamp`` is written as an ISO-8601 string and
        ``consistency_level`` as its enum value; other fields are passed
        through unchanged.
        """
        return dict(
            snapshot_id=self.snapshot_id,
            timestamp=self.timestamp.isoformat(),
            component_states=self.component_states,
            configuration_hash=self.configuration_hash,
            data_checksums=self.data_checksums,
            metadata=self.metadata,
            parent_snapshot_id=self.parent_snapshot_id,
            is_rollback_point=self.is_rollback_point,
            description=self.description,
            consistency_level=self.consistency_level.value,
        )
326
+
327
+
328
+ class ErrorRecoverySystem:
329
+ """Comprehensive error handling and recovery system"""
330
+
331
def __init__(self, project_root: Path = None):
    """Set up error tracking, health state, recovery actions and monitoring.

    Args:
        project_root: Base directory of the project. When omitted (or
            otherwise falsy) the current working directory is used.

    Side effects:
        Creates ``<project_root>/.moai/error_logs`` if it does not exist and
        starts a daemon thread running ``_background_monitoring``.
    """
    if not project_root:
        project_root = Path.cwd()
    self.project_root = project_root

    log_dir = self.project_root / ".moai" / "error_logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    self.error_log_dir = log_dir

    # Error tracking: reports still open, the full history, the registered
    # recovery actions and running counters.
    self.active_errors: Dict[str, ErrorReport] = {}
    self.error_history: List[ErrorReport] = []
    self.recovery_actions: Dict[str, RecoveryAction] = {}
    self.error_stats: Dict[str, Any] = {
        "total_errors": 0,
        "by_severity": {},
        "by_category": {},
        "recovery_success_rate": 0.0,
    }

    # System health monitoring state.
    self.system_health = {
        "status": "healthy",
        "last_check": datetime.now(timezone.utc),
        "issues": [],
        "metrics": {},
    }

    # Register the default recovery actions, then the Phase 3 components.
    self._initialize_recovery_actions()
    self._initialize_phase3_components()

    # Background monitoring runs on a daemon thread so it does not keep the
    # interpreter alive at exit.
    self.monitoring_active = True
    monitor = threading.Thread(target=self._background_monitoring, daemon=True)
    self.monitor_thread = monitor
    monitor.start()

    logger.info("Error Recovery System initialized with Phase 3 enterprise features")
367
+
368
def handle_error(
    self,
    error: Exception,
    context: Optional[Dict[str, Any]] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """
    Handle an error with comprehensive logging and recovery

    The error is logged, counted in the statistics, stored in both the
    active-error mapping and the history, and - for CRITICAL and HIGH
    severity only - passed to automatic recovery. A successful automatic
    recovery removes the error from the active-error mapping again.

    Args:
        error: Exception that occurred
        context: Additional context information (defaults to an empty dict)
        severity: Error severity level
        category: Error category

    Returns:
        ErrorReport with handling details
    """
    error_id = self._generate_error_id()
    timestamp = datetime.now(timezone.utc)

    # Format the traceback of the exception that was actually passed in.
    # traceback.format_exc() formats whichever exception is currently being
    # handled, which yields "NoneType: None" when this method is called
    # outside an ``except`` block and the wrong trace when a different
    # exception is active.
    stack_trace = "".join(traceback.format_exception(type(error), error, error.__traceback__))

    # Create error report
    error_report = ErrorReport(
        id=error_id,
        timestamp=timestamp,
        severity=severity,
        category=category,
        message=str(error),
        details={
            "exception_type": type(error).__name__,
            "exception_module": type(error).__module__,
            "error_code": getattr(error, "code", None),
        },
        stack_trace=stack_trace,
        context=context or {},
        recovery_attempted=False,
        recovery_successful=False,
    )

    # Log error
    self._log_error(error_report)

    # Update statistics
    self._update_error_stats(error_report)

    # Store error
    self.active_errors[error_id] = error_report
    self.error_history.append(error_report)

    # Attempt automatic recovery for the two most severe levels only
    if severity in (ErrorSeverity.CRITICAL, ErrorSeverity.HIGH):
        recovery_result = self._attempt_automatic_recovery(error_report)
        error_report.recovery_attempted = True
        error_report.recovery_successful = recovery_result.success
        error_report.resolution_message = recovery_result.message

        if recovery_result.success:
            logger.info(f"Automatic recovery successful for error {error_id}")
            self.active_errors.pop(error_id, None)
        else:
            logger.warning(f"Automatic recovery failed for error {error_id}: {recovery_result.message}")

    # Update system health
    self._update_system_health()

    return error_report
435
+
436
def register_recovery_action(self, action: RecoveryAction):
    """
    Add a recovery action to the registry, keyed by its name

    An action registered under an existing name replaces the earlier one.

    Args:
        action: RecoveryAction definition
    """
    action_name = action.name
    self.recovery_actions[action_name] = action
    logger.info(f"Registered recovery action: {action_name}")
445
+
446
def attempt_manual_recovery(
    self, error_id: str, action_name: str, parameters: Optional[Dict[str, Any]] = None
) -> RecoveryResult:
    """
    Attempt manual recovery for a specific error

    The named recovery action's handler is called with the error report and
    the parameters. A truthy return value counts as success: the report is
    marked recovered and removed from the active errors. A falsy return value
    or a raised exception yields an unsuccessful result; exceptions are
    logged and reported in the result rather than propagated.

    Args:
        error_id: ID of error to recover
        action_name: Name of recovery action to attempt
        parameters: Additional parameters for recovery (defaults to an empty dict)

    Returns:
        RecoveryResult with operation details
    """
    if error_id not in self.active_errors:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Error {error_id} not found in active errors",
            duration=0.0,
        )

    if action_name not in self.recovery_actions:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Recovery action {action_name} not found",
            duration=0.0,
        )

    error_report = self.active_errors[error_id]
    recovery_action = self.recovery_actions[action_name]

    logger.info(f"Attempting manual recovery {action_name} for error {error_id}")

    # Record the attempt up front, matching what handle_error does for
    # automatic recovery, so a report can never be "successful" without also
    # being "attempted".
    error_report.recovery_attempted = True

    # Bound outside the try block so the except branch can always read it.
    start_time = time.time()

    try:
        # Execute recovery action
        result = recovery_action.handler(error_report, parameters or {})
    except Exception as e:
        duration = time.time() - start_time
        logger.warning(f"Recovery action {action_name} failed: {str(e)}")
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Manual recovery failed: {str(e)}",
            duration=duration,
            details={"exception": str(e)},
        )

    duration = time.time() - start_time

    if not result:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message="Manual recovery returned unsuccessful result",
            duration=duration,
        )

    recovery_result = RecoveryResult(
        success=True,
        action_name=action_name,
        message="Manual recovery completed successfully",
        duration=duration,
        details={"result": result},
    )

    # Update error report
    error_report.recovery_successful = True
    error_report.resolution_message = recovery_result.message

    # Remove from active errors
    self.active_errors.pop(error_id, None)

    return recovery_result
524
+
525
def get_system_health(self) -> Dict[str, Any]:
    """
    Refresh and report the current system health

    ``_update_system_health`` is called first, then a new dictionary is
    built from the health state and error bookkeeping. The ``error_stats``,
    ``issues`` and ``metrics`` entries are shallow copies.

    Returns:
        System health information
    """
    self._update_system_health()

    health = self.system_health
    return {
        "status": health["status"],
        "last_check": health["last_check"].isoformat(),
        "active_errors": len(self.active_errors),
        "total_errors": len(self.error_history),
        "error_stats": self.error_stats.copy(),
        "issues": health["issues"].copy(),
        "metrics": health["metrics"].copy(),
        "recovery_actions_available": len(self.recovery_actions),
    }
549
+
550
def get_error_summary(self, limit: int = 50) -> Dict[str, Any]:
    """
    Get summary of recent errors

    Args:
        limit: Maximum number of errors to include, counted from the end of
            the history. Zero or a negative value selects no errors.

    Returns:
        Error summary information: counts per severity and per category,
        the detected patterns, the recovery rate, and an entry for each of
        the last ten errors inside the window.
    """
    # A plain ``history[-limit:]`` would return the WHOLE history for
    # limit == 0 (since -0 == 0) and skip the oldest entries for a negative
    # limit, so non-positive limits are handled explicitly.
    recent_errors = self.error_history[-limit:] if limit > 0 else []

    # Tally errors per severity and per category (first-seen key order).
    by_severity: Dict[str, int] = {}
    by_category: Dict[str, int] = {}
    for error in recent_errors:
        severity = error.severity.value
        by_severity[severity] = by_severity.get(severity, 0) + 1

        category = error.category.value
        by_category[category] = by_category.get(category, 0) + 1

    # Common error patterns
    error_patterns = self._identify_error_patterns(recent_errors)

    return {
        "total_recent_errors": len(recent_errors),
        "active_errors": len(self.active_errors),
        "by_severity": by_severity,
        "by_category": by_category,
        "common_patterns": error_patterns,
        "recovery_rate": self._calculate_recovery_rate(recent_errors),
        "recent_errors": [
            {
                "id": error.id,
                "timestamp": error.timestamp.isoformat(),
                "severity": error.severity.value,
                "category": error.category.value,
                "message": error.message,
                "recovered": error.recovery_successful,
            }
            for error in recent_errors[-10:]  # Last 10 errors
        ],
    }
601
+
602
def generate_troubleshooting_guide(self) -> Dict[str, Any]:
    """
    Build a troubleshooting guide from the error history and registry

    The guide lists every error pattern seen more than twice (with its
    severity and solutions), one entry per registered recovery action, the
    prevention tips and the emergency procedures.

    Returns:
        Troubleshooting guide with solutions
    """
    generated_at = datetime.now(timezone.utc).isoformat()

    # Patterns that occurred more than twice count as common issues.
    pattern_counts = self._identify_error_patterns(self.error_history)
    common_issues: List[Dict[str, Any]] = [
        {
            "pattern": pattern,
            "frequency": frequency,
            "severity": self._get_pattern_severity(pattern),
            "solutions": self._get_solutions_for_pattern(pattern),
        }
        for pattern, frequency in pattern_counts.items()
        if frequency > 2
    ]

    # One procedure entry per registered recovery action.
    recovery_procedures: Dict[str, Dict[str, Any]] = {
        action_name: {
            "description": action.description,
            "type": action.action_type,
            "for_severities": [level.value for level in action.severity_filter],
            "for_categories": [kind.value for kind in action.category_filter],
        }
        for action_name, action in self.recovery_actions.items()
    }

    prevention_tips: List[str] = list(self._generate_prevention_tips())
    emergency_procedures: List[Dict[str, str]] = list(self._generate_emergency_procedures())

    return {
        "generated_at": generated_at,
        "common_issues": common_issues,
        "recovery_procedures": recovery_procedures,
        "prevention_tips": prevention_tips,
        "emergency_procedures": emergency_procedures,
    }
651
+
652
def cleanup_old_errors(self, days_to_keep: int = 30) -> Dict[str, Any]:
    """
    Drop error records older than the retention window

    Records whose timestamp is earlier than "now (UTC) minus
    ``days_to_keep`` days" are removed from the error history, after which
    the history is saved via ``_save_error_history``.

    Args:
        days_to_keep: Number of days to keep error records

    Returns:
        Cleanup operation results: removed count, remaining count and the
        cutoff date as an ISO-8601 string
    """
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)

    # Single pass: count what falls before the cutoff, keep the rest.
    retained = []
    removed_count = 0
    for record in self.error_history:
        if record.timestamp < cutoff_date:
            removed_count += 1
        else:
            retained.append(record)
    self.error_history = retained

    # Save updated error history
    self._save_error_history()

    logger.info(f"Cleaned up {removed_count} old error records")

    return {
        "removed_count": removed_count,
        "remaining_count": len(self.error_history),
        "cutoff_date": cutoff_date.isoformat(),
    }
680
+
681
def _initialize_recovery_actions(self):
    """Register the built-in recovery actions.

    Each row below is (name, description, action type, severity filter,
    category filter, handler, timeout). Rows are registered in table order
    through ``register_recovery_action``.
    """
    default_actions = (
        # System recovery actions
        (
            "restart_research_engines",
            "Restart research engines and clear caches",
            "automatic",
            [ErrorSeverity.HIGH, ErrorSeverity.CRITICAL],
            [ErrorCategory.RESEARCH, ErrorCategory.SYSTEM],
            self._restart_research_engines,
            30.0,
        ),
        (
            "restore_config_backup",
            "Restore configuration from last known good backup",
            "automatic",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.CONFIGURATION],
            self._restore_config_backup,
            15.0,
        ),
        (
            "clear_agent_cache",
            "Clear agent communication cache and reset connections",
            "automatic",
            [ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            [ErrorCategory.COMMUNICATION],
            self._clear_agent_cache,
            10.0,
        ),
        (
            "validate_research_integrity",
            "Validate research component integrity and repair if needed",
            "assisted",
            [ErrorSeverity.HIGH],
            [ErrorCategory.RESEARCH, ErrorCategory.VALIDATION],
            self._validate_research_integrity,
            60.0,
        ),
        (
            "rollback_last_changes",
            "Rollback last research integration changes",
            "manual",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.INTEGRATION, ErrorCategory.RESEARCH],
            self._rollback_last_changes,
            45.0,
        ),
        (
            "reset_system_state",
            "Reset system to known good state",
            "manual",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.SYSTEM],
            self._reset_system_state,
            120.0,
        ),
        # Performance recovery actions
        (
            "optimize_performance",
            "Optimize system performance and clear bottlenecks",
            "automatic",
            [ErrorSeverity.MEDIUM],
            [ErrorCategory.PERFORMANCE],
            self._optimize_performance,
            30.0,
        ),
        # Resource recovery actions
        (
            "free_resources",
            "Free up system resources and memory",
            "automatic",
            [ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            [ErrorCategory.RESOURCE],
            self._free_resources,
            20.0,
        ),
    )

    for name, description, action_type, severities, categories, handler, timeout in default_actions:
        self.register_recovery_action(
            RecoveryAction(
                name=name,
                description=description,
                action_type=action_type,
                severity_filter=severities,
                category_filter=categories,
                handler=handler,
                timeout=timeout,
            )
        )
781
+
782
def _attempt_automatic_recovery(self, error_report: ErrorReport) -> RecoveryResult:
    """Run matching automatic recovery handlers until one reports success.

    An action is a candidate when its ``action_type`` is ``"automatic"`` and
    the report's severity and category both appear in the action's filters.
    Candidates are tried in the iteration order of ``self.recovery_actions``;
    no explicit priority sort is applied and ``action.timeout`` is not
    consulted here.

    Returns:
        A successful ``RecoveryResult`` for the first handler returning a
        truthy value, otherwise a failed result with ``action_name="none"``.
    """
    candidates = [
        candidate
        for candidate in self.recovery_actions.values()
        if candidate.action_type == "automatic"
        and error_report.severity in candidate.severity_filter
        and error_report.category in candidate.category_filter
    ]

    for candidate in candidates:
        try:
            logger.info(f"Attempting automatic recovery: {candidate.name}")

            started = time.time()
            outcome = candidate.handler(error_report, {})
            elapsed = time.time() - started

            if not outcome:
                # A falsy outcome means "did not help"; move on to the next one.
                continue

            return RecoveryResult(
                success=True,
                action_name=candidate.name,
                message=f"Automatic recovery successful: {candidate.name}",
                duration=elapsed,
                details={"result": outcome},
            )
        except Exception as e:
            # A crashing handler must not prevent later candidates from running.
            logger.warning(f"Recovery action {candidate.name} failed: {str(e)}")

    return RecoveryResult(
        success=False,
        action_name="none",
        message="No suitable automatic recovery action succeeded",
        duration=0.0,
    )
823
+
824
+ def _restart_research_engines(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
825
+ """Restart research engines and clear caches"""
826
+ try:
827
+ logger.info("Restarting research engines...")
828
+
829
+ # Clear research engine caches
830
+ cache_dirs = [
831
+ self.project_root / ".moai" / "cache",
832
+ self.project_root / ".claude" / "cache",
833
+ ]
834
+
835
+ for cache_dir in cache_dirs:
836
+ if cache_dir.exists():
837
+ import shutil
838
+
839
+ shutil.rmtree(cache_dir)
840
+ cache_dir.mkdir(parents=True, exist_ok=True)
841
+
842
+ # Reset research engine state
843
+ research_state_file = self.project_root / ".moai" / "research_state.json"
844
+ if research_state_file.exists():
845
+ research_state_file.unlink()
846
+
847
+ # Reinitialize research components
848
+ self._reinitialize_research_components()
849
+
850
+ logger.info("Research engines restarted successfully")
851
+ return True
852
+
853
+ except Exception as e:
854
+ logger.error(f"Failed to restart research engines: {str(e)}")
855
+ return False
856
+
857
+ def _restore_config_backup(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
858
+ """Restore configuration from backup"""
859
+ try:
860
+ logger.info("Restoring configuration from backup...")
861
+
862
+ backup_dir = self.project_root / ".moai" / "config_backups"
863
+ if not backup_dir.exists():
864
+ logger.warning("No configuration backup directory found")
865
+ return False
866
+
867
+ # Find most recent backup
868
+ backup_files = list(backup_dir.glob("config_*.json"))
869
+ if not backup_files:
870
+ logger.warning("No configuration backups found")
871
+ return False
872
+
873
+ latest_backup = max(backup_files, key=lambda f: f.stat().st_mtime)
874
+
875
+ # Restore configuration
876
+ config_file = self.project_root / ".moai" / "config" / "config.json"
877
+ import shutil
878
+
879
+ shutil.copy2(latest_backup, config_file)
880
+
881
+ logger.info(f"Configuration restored from {latest_backup}")
882
+ return True
883
+
884
+ except Exception as e:
885
+ logger.error(f"Failed to restore configuration: {str(e)}")
886
+ return False
887
+
888
+ def _clear_agent_cache(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
889
+ """Clear agent communication cache"""
890
+ try:
891
+ logger.info("Clearing agent cache...")
892
+
893
+ # Clear agent state files
894
+ agent_state_dir = self.project_root / ".moai" / "agent_state"
895
+ if agent_state_dir.exists():
896
+ import shutil
897
+
898
+ shutil.rmtree(agent_state_dir)
899
+ agent_state_dir.mkdir(parents=True, exist_ok=True)
900
+
901
+ # Reset communication channels
902
+ comm_cache_dir = self.project_root / ".moai" / "comm_cache"
903
+ if comm_cache_dir.exists():
904
+ import shutil
905
+
906
+ shutil.rmtree(comm_cache_dir)
907
+ comm_cache_dir.mkdir(parents=True, exist_ok=True)
908
+
909
+ logger.info("Agent cache cleared successfully")
910
+ return True
911
+
912
+ except Exception as e:
913
+ logger.error(f"Failed to clear agent cache: {str(e)}")
914
+ return False
915
+
916
+ def _validate_research_integrity(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> Dict[str, Any]:
917
+ """Validate research component integrity"""
918
+ issues_found: List[str] = []
919
+ repairs_made: List[str] = []
920
+
921
+ validation_results = {
922
+ "skills_valid": True,
923
+ "agents_valid": True,
924
+ "commands_valid": True,
925
+ "hooks_valid": True,
926
+ "issues_found": issues_found,
927
+ "repairs_made": repairs_made,
928
+ }
929
+
930
+ try:
931
+ logger.info("Validating research integrity...")
932
+
933
+ # Validate skills
934
+ skills_dir = self.project_root / ".claude" / "skills"
935
+ if skills_dir.exists():
936
+ for skill_file in skills_dir.glob("*.md"):
937
+ if not self._validate_skill_file(skill_file):
938
+ validation_results["skills_valid"] = False
939
+ issues_found.append(f"Invalid skill file: {skill_file}")
940
+
941
+ # Attempt repair
942
+ if self._repair_skill_file(skill_file):
943
+ repairs_made.append(f"Repaired: {skill_file}")
944
+
945
+ # Validate agents
946
+ agents_dir = self.project_root / ".claude" / "agents" / "alfred"
947
+ if agents_dir.exists():
948
+ for agent_file in agents_dir.glob("*.md"):
949
+ if not self._validate_agent_file(agent_file):
950
+ validation_results["agents_valid"] = False
951
+ issues_found.append(f"Invalid agent file: {agent_file}")
952
+
953
+ # Validate commands
954
+ commands_dir = self.project_root / ".claude" / "commands" / "alfred"
955
+ if commands_dir.exists():
956
+ for command_file in commands_dir.glob("*.md"):
957
+ if not self._validate_command_file(command_file):
958
+ validation_results["commands_valid"] = False
959
+ issues_found.append(f"Invalid command file: {command_file}")
960
+
961
+ logger.info(
962
+ f"Research integrity validation completed. Issues: {len(issues_found)}, Repairs: {len(repairs_made)}"
963
+ )
964
+
965
+ except Exception as e:
966
+ logger.error(f"Research integrity validation failed: {str(e)}")
967
+ validation_results["validation_error"] = str(e)
968
+
969
+ return validation_results
970
+
971
+ def _rollback_last_changes(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
972
+ """Rollback last research integration changes"""
973
+ try:
974
+ logger.info("Rolling back last research changes...")
975
+
976
+ # Import rollback manager
977
+ sys.path.insert(0, str(self.project_root / "src"))
978
+ from moai_adk.core.rollback_manager import RollbackManager
979
+
980
+ rollback_manager = RollbackManager(self.project_root)
981
+
982
+ # Find latest rollback point for research integration
983
+ rollback_points = rollback_manager.list_rollback_points(limit=5)
984
+ if not rollback_points:
985
+ logger.warning("No rollback points available")
986
+ return False
987
+
988
+ # Use the most recent rollback point
989
+ latest_rollback = rollback_points[0]
990
+ result = rollback_manager.rollback_to_point(latest_rollback["id"])
991
+
992
+ if result.success:
993
+ logger.info(f"Successfully rolled back to {latest_rollback['id']}")
994
+ return True
995
+ else:
996
+ logger.error(f"Rollback failed: {result.message}")
997
+ return False
998
+
999
+ except Exception as e:
1000
+ logger.error(f"Rollback operation failed: {str(e)}")
1001
+ return False
1002
+
1003
+ def _reset_system_state(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
1004
+ """Reset system to known good state"""
1005
+ try:
1006
+ logger.info("Resetting system to known good state...")
1007
+
1008
+ # Clear all caches
1009
+ cache_dirs = [
1010
+ self.project_root / ".moai" / "cache",
1011
+ self.project_root / ".claude" / "cache",
1012
+ self.project_root / ".moai" / "agent_state",
1013
+ self.project_root / ".moai" / "comm_cache",
1014
+ ]
1015
+
1016
+ for cache_dir in cache_dirs:
1017
+ if cache_dir.exists():
1018
+ import shutil
1019
+
1020
+ shutil.rmtree(cache_dir)
1021
+ cache_dir.mkdir(parents=True, exist_ok=True)
1022
+
1023
+ # Reset error state
1024
+ self.active_errors.clear()
1025
+
1026
+ # Reinitialize core components
1027
+ self._reinitialize_core_components()
1028
+
1029
+ logger.info("System state reset completed")
1030
+ return True
1031
+
1032
+ except Exception as e:
1033
+ logger.error(f"System state reset failed: {str(e)}")
1034
+ return False
1035
+
1036
+ def _optimize_performance(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
1037
+ """Optimize system performance"""
1038
+ try:
1039
+ logger.info("Optimizing system performance...")
1040
+
1041
+ # Clear temporary files
1042
+ temp_dirs = [
1043
+ self.project_root / ".moai" / "temp",
1044
+ self.project_root / ".claude" / "temp",
1045
+ ]
1046
+
1047
+ for temp_dir in temp_dirs:
1048
+ if temp_dir.exists():
1049
+ import shutil
1050
+
1051
+ shutil.rmtree(temp_dir)
1052
+
1053
+ # Optimize database connections if applicable
1054
+ self._optimize_connections()
1055
+
1056
+ # Clear memory caches
1057
+ import gc
1058
+
1059
+ gc.collect()
1060
+
1061
+ logger.info("Performance optimization completed")
1062
+ return True
1063
+
1064
+ except Exception as e:
1065
+ logger.error(f"Performance optimization failed: {str(e)}")
1066
+ return False
1067
+
1068
+ def _free_resources(self, error_report: ErrorReport, parameters: Dict[str, Any]) -> bool:
1069
+ """Free up system resources"""
1070
+ try:
1071
+ logger.info("Freeing up system resources...")
1072
+
1073
+ # Clear memory caches
1074
+ import gc
1075
+
1076
+ gc.collect()
1077
+
1078
+ # Close any open file handles
1079
+ self._close_file_handles()
1080
+
1081
+ # Terminate any hanging processes
1082
+ self._terminate_hanging_processes()
1083
+
1084
+ logger.info("Resource cleanup completed")
1085
+ return True
1086
+
1087
+ except Exception as e:
1088
+ logger.error(f"Resource cleanup failed: {str(e)}")
1089
+ return False
1090
+
1091
+ def _generate_error_id(self) -> str:
1092
+ """Generate unique error ID"""
1093
+ timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
1094
+ random_suffix = hashlib.md5(os.urandom(4), usedforsecurity=False).hexdigest()[:6]
1095
+ return f"ERR_{timestamp}_{random_suffix}"
1096
+
1097
def _log_error(self, error_report: ErrorReport):
    """Persist an error report as JSON and emit it through the logger.

    The report is written to ``<error_log_dir>/error_<id>.json``; a failure
    to write is logged and does not stop the log emission that follows. The
    log level is derived from the report's severity, defaulting to WARNING
    for unmapped severities.

    Args:
        error_report: The report to persist and log.
    """
    destination = self.error_log_dir / f"error_{error_report.id}.json"
    try:
        with open(destination, mode="w", encoding="utf-8") as sink:
            json.dump(asdict(error_report), sink, indent=2, default=str, ensure_ascii=False)
    except Exception as e:
        logger.error(f"Failed to log error to file: {str(e)}")

    # Severity -> stdlib logging level.
    level_by_severity = {
        ErrorSeverity.CRITICAL: logging.CRITICAL,
        ErrorSeverity.HIGH: logging.ERROR,
        ErrorSeverity.MEDIUM: logging.WARNING,
        ErrorSeverity.LOW: logging.INFO,
        ErrorSeverity.INFO: logging.INFO,
    }
    level = level_by_severity.get(error_report.severity, logging.WARNING)

    logger.log(level, f"Error {error_report.id}: {error_report.message}")
1117
+
1118
+ def _update_error_stats(self, error_report: ErrorReport):
1119
+ """Update error statistics"""
1120
+ self.error_stats["total_errors"] += 1
1121
+
1122
+ # By severity
1123
+ severity = error_report.severity.value
1124
+ if severity not in self.error_stats["by_severity"]:
1125
+ self.error_stats["by_severity"][severity] = 0
1126
+ self.error_stats["by_severity"][severity] += 1
1127
+
1128
+ # By category
1129
+ category = error_report.category.value
1130
+ if category not in self.error_stats["by_category"]:
1131
+ self.error_stats["by_category"][category] = 0
1132
+ self.error_stats["by_category"][category] += 1
1133
+
1134
def _update_system_health(self):
    """Recompute ``self.system_health`` from the active errors and history.

    Status is ``critical`` if any active error is CRITICAL, else ``degraded``
    if any is HIGH, else ``warning`` when more than five errors are active,
    else ``healthy``. Metrics and a single ``active_errors`` issue entry
    (always present, even with zero active errors) are rebuilt on each call.
    """
    checked_at = datetime.now(timezone.utc)

    # Determine system status
    if any(err.severity == ErrorSeverity.CRITICAL for err in self.active_errors.values()):
        status = "critical"
    elif any(err.severity == ErrorSeverity.HIGH for err in self.active_errors.values()):
        status = "degraded"
    elif len(self.active_errors) > 5:
        status = "warning"
    else:
        status = "healthy"
    self.system_health["status"] = status

    # Update metrics
    self.system_health["last_check"] = checked_at
    self.system_health["metrics"] = {
        "active_errors": len(self.active_errors),
        "total_errors": len(self.error_history),
        "recovery_success_rate": self._calculate_recovery_rate(self.error_history),
    }

    # Count active errors per severity value.
    distribution = {}
    for err in self.active_errors.values():
        label = err.severity.value
        distribution[label] = distribution.get(label, 0) + 1

    self.system_health["issues"] = [
        {
            "type": "active_errors",
            "count": len(self.active_errors),
            "severity_distribution": distribution,
        }
    ]
1170
+
1171
def _background_monitoring(self):
    """Loop body of the monitoring thread.

    While ``self.monitoring_active`` is truthy: sleep 30 seconds, refresh
    the system health and scan for error patterns. Any exception from one
    cycle is logged and the loop continues. Because the flag is only read
    between cycles, the loop can take up to one sleep interval to exit.
    """
    poll_interval_seconds = 30

    while self.monitoring_active:
        try:
            time.sleep(poll_interval_seconds)
            self._update_system_health()
            self._check_error_patterns()
        except Exception as e:
            logger.error(f"Background monitoring error: {str(e)}")
1184
+
1185
+ def _check_error_patterns(self):
1186
+ """Check for concerning error patterns"""
1187
+ recent_errors = [
1188
+ e for e in self.error_history if (datetime.now(timezone.utc) - e.timestamp).total_seconds() < 300
1189
+ ] # Last 5 minutes
1190
+
1191
+ # Check for error bursts
1192
+ if len(recent_errors) > 10:
1193
+ logger.warning(f"High error rate detected: {len(recent_errors)} errors in last 5 minutes")
1194
+
1195
+ # Check for repeated errors
1196
+ error_messages = [e.message for e in recent_errors]
1197
+ message_counts: Dict[str, int] = {}
1198
+ for msg in error_messages:
1199
+ message_counts[msg] = message_counts.get(msg, 0) + 1
1200
+
1201
+ repeated_errors = [msg for msg, count in message_counts.items() if count > 3]
1202
+ if repeated_errors:
1203
+ logger.warning(f"Repeated errors detected: {repeated_errors}")
1204
+
1205
+ def _calculate_recovery_rate(self, errors: List[ErrorReport]) -> float:
1206
+ """Calculate recovery success rate"""
1207
+ if not errors:
1208
+ return 0.0
1209
+
1210
+ recovered_errors = [e for e in errors if e.recovery_successful]
1211
+ return len(recovered_errors) / len(errors)
1212
+
1213
+ def _identify_error_patterns(self, errors: List[ErrorReport]) -> Dict[str, int]:
1214
+ """Identify common error patterns"""
1215
+ patterns: Dict[str, int] = {}
1216
+
1217
+ for error in errors:
1218
+ # Pattern by exception type
1219
+ pattern = f"{error.category.value}:{error.details.get('exception_type', 'unknown')}"
1220
+ patterns[pattern] = patterns.get(pattern, 0) + 1
1221
+
1222
+ return patterns
1223
+
1224
+ def _get_pattern_severity(self, pattern: str) -> str:
1225
+ """Get typical severity for an error pattern"""
1226
+ severity_map = {
1227
+ "research:Exception": "high",
1228
+ "system:Exception": "critical",
1229
+ "configuration:Exception": "high",
1230
+ "communication:Exception": "medium",
1231
+ "validation:Exception": "medium",
1232
+ }
1233
+
1234
+ for key, severity in severity_map.items():
1235
+ if key in pattern:
1236
+ return severity
1237
+
1238
+ return "medium"
1239
+
1240
+ def _get_solutions_for_pattern(self, pattern: str) -> List[str]:
1241
+ """Get common solutions for error pattern"""
1242
+ solutions = {
1243
+ "research:Exception": [
1244
+ "Restart research engines",
1245
+ "Clear research cache",
1246
+ "Validate research components",
1247
+ ],
1248
+ "system:Exception": [
1249
+ "Check system resources",
1250
+ "Restart system components",
1251
+ "Verify system configuration",
1252
+ ],
1253
+ "configuration:Exception": [
1254
+ "Restore configuration backup",
1255
+ "Validate configuration syntax",
1256
+ "Check configuration permissions",
1257
+ ],
1258
+ }
1259
+
1260
+ for key, sols in solutions.items():
1261
+ if key in pattern:
1262
+ return sols
1263
+
1264
+ return ["Contact system administrator", "Check system logs"]
1265
+
1266
+ def _generate_prevention_tips(self) -> List[str]:
1267
+ """Generate prevention tips based on error history"""
1268
+ tips = []
1269
+
1270
+ # Add tips based on common error categories
1271
+ category_counts: Dict[str, int] = {}
1272
+ for error in self.error_history:
1273
+ category = error.category.value
1274
+ category_counts[category] = category_counts.get(category, 0) + 1
1275
+
1276
+ if category_counts.get("configuration", 0) > 5:
1277
+ tips.append("Regularly validate configuration files before making changes")
1278
+
1279
+ if category_counts.get("research", 0) > 5:
1280
+ tips.append("Monitor research engine performance and clear caches regularly")
1281
+
1282
+ if category_counts.get("communication", 0) > 5:
1283
+ tips.append("Ensure stable network connections for agent communication")
1284
+
1285
+ return tips
1286
+
1287
+ def _generate_emergency_procedures(self) -> List[Dict[str, str]]:
1288
+ """Generate emergency recovery procedures"""
1289
+ return [
1290
+ {
1291
+ "condition": "System completely unresponsive",
1292
+ "procedure": "Use system_reset recovery action to restore to known good state",
1293
+ },
1294
+ {
1295
+ "condition": "Critical research engine failure",
1296
+ "procedure": "Rollback last research changes using rollback_last_changes action",
1297
+ },
1298
+ {
1299
+ "condition": "Configuration corruption",
1300
+ "procedure": "Restore configuration from backup using restore_config_backup action",
1301
+ },
1302
+ {
1303
+ "condition": "Multiple agent communication failures",
1304
+ "procedure": "Clear agent cache and restart communication channels",
1305
+ },
1306
+ ]
1307
+
1308
+ # Helper methods for component validation and repair
1309
+ def _validate_skill_file(self, skill_file: Path) -> bool:
1310
+ """Validate skill file format"""
1311
+ try:
1312
+ with open(skill_file, "r", encoding="utf-8") as f:
1313
+ content = f.read()
1314
+
1315
+ # Basic validation
1316
+ return "---" in content and len(content) > 100
1317
+ except (OSError, UnicodeDecodeError):
1318
+ return False
1319
+
1320
+ def _validate_agent_file(self, agent_file: Path) -> bool:
1321
+ """Validate agent file format"""
1322
+ try:
1323
+ with open(agent_file, "r", encoding="utf-8") as f:
1324
+ content = f.read()
1325
+
1326
+ return "role:" in content and len(content) > 200
1327
+ except (OSError, UnicodeDecodeError):
1328
+ return False
1329
+
1330
+ def _validate_command_file(self, command_file: Path) -> bool:
1331
+ """Validate command file format"""
1332
+ try:
1333
+ with open(command_file, "r", encoding="utf-8") as f:
1334
+ content = f.read()
1335
+
1336
+ return "name:" in content and "allowed-tools:" in content
1337
+ except (OSError, UnicodeDecodeError):
1338
+ return False
1339
+
1340
+ def _repair_skill_file(self, skill_file: Path) -> bool:
1341
+ """Attempt to repair skill file"""
1342
+ try:
1343
+ # Basic repair - ensure file has minimum required content
1344
+ with open(skill_file, "r", encoding="utf-8") as f:
1345
+ content = f.read()
1346
+
1347
+ if not content.startswith("---"):
1348
+ content = f"---\nname: {skill_file.stem}\ndescription: Repaired skill file\n---\n\n{content}"
1349
+
1350
+ with open(skill_file, "w", encoding="utf-8") as f:
1351
+ f.write(content)
1352
+
1353
+ return True
1354
+ except (OSError, UnicodeDecodeError):
1355
+ return False
1356
+
1357
+ def _reinitialize_research_components(self):
1358
+ """Reinitialize research components"""
1359
+ # Implementation would depend on specific research components
1360
+ pass
1361
+
1362
+ def _reinitialize_core_components(self):
1363
+ """Reinitialize core system components"""
1364
+ # Implementation would depend on specific core components
1365
+ pass
1366
+
1367
+ def _optimize_connections(self):
1368
+ """Optimize database/network connections"""
1369
+ # Implementation would depend on specific connection types
1370
+ pass
1371
+
1372
+ def _close_file_handles(self):
1373
+ """Close open file handles"""
1374
+ import gc
1375
+
1376
+ gc.collect() # Force garbage collection to close file handles
1377
+
1378
+ def _terminate_hanging_processes(self):
1379
+ """Terminate hanging processes"""
1380
+ # Implementation would identify and terminate hanging processes
1381
+ pass
1382
+
1383
+ def _save_error_history(self):
1384
+ """Save error history to file"""
1385
+ history_file = self.error_log_dir / "error_history.json"
1386
+ try:
1387
+ with open(history_file, "w") as f:
1388
+ json.dump([asdict(e) for e in self.error_history], f, indent=2, default=str)
1389
+ except Exception as e:
1390
+ logger.error(f"Failed to save error history: {str(e)}")
1391
+
1392
+ # Phase 3: Advanced Error Recovery Methods
1393
+
1394
def _initialize_phase3_components(self):
    """Create the containers and counters used by Phase 3 recovery.

    Sets up the failure/action/snapshot registries, a bounded dead-letter
    queue (10000 entries), zeroed statistics, per-component circuit breaker
    state created on first access, and the failure-mode analyzer table.
    """

    def _new_breaker_state() -> Dict[str, Any]:
        # Every component starts with a closed breaker and default limits.
        return {
            "state": "CLOSED",
            "failure_count": 0,
            "last_failure_time": None,
            "success_threshold": 5,
            "failure_threshold": 3,
            "timeout_seconds": 60,
        }

    # Phase 3 specific attributes
    self.advanced_failures: Dict[str, FailureEvent] = {}
    self.advanced_recovery_actions: Dict[str, AdvancedRecoveryAction] = {}
    self.system_snapshots: Dict[str, SystemSnapshot] = {}
    self.dead_letter_queue: deque = deque(maxlen=10000)

    # Advanced recovery statistics, all starting at zero.
    self.advanced_recovery_stats = {
        "total_failures": 0,
        "auto_recoveries_attempted": 0,
        "auto_recoveries_successful": 0,
        "cascade_failures_detected": 0,
        "rollbacks_performed": 0,
        "snapshots_created": 0,
        "dead_letter_messages": 0,
    }

    # Circuit breaker states for components
    self.circuit_breaker_states: Dict[str, Dict[str, Any]] = defaultdict(_new_breaker_state)

    # Failure mode analyzers
    self.failure_analyzers = {
        FailureMode.CASCADE_FAILURE: self._analyze_cascade_failure,
        FailureMode.CIRCUIT_BREAKER_TRIPPED: self._analyze_circuit_breaker_trip,
        FailureMode.RESOURCE_EXHAUSTION: self._analyze_resource_exhaustion,
    }

    logger.info("Phase 3 advanced recovery components initialized")
1433
+
1434
async def report_advanced_failure(
    self,
    failure_mode: FailureMode,
    component: str,
    description: str,
    severity: str = "medium",
    context: Optional[Dict[str, Any]] = None,
    error_details: Optional[Dict[str, Any]] = None,
    affected_operations: Optional[List[str]] = None,
    auto_recovery_eligible: bool = True,
    parent_failure_id: Optional[str] = None,
) -> str:
    """Record an advanced failure and run the follow-up pipeline.

    The failure is stored, analyzed when an analyzer is registered for its
    mode, handed to advanced recovery when eligible, and finally checked for
    cascading failures. The warning log line is emitted after those steps.

    Args:
        failure_mode: Kind of failure being reported.
        component: Name of the affected component.
        description: Human-readable description.
        severity: Severity label; defaults to ``"medium"``.
        context: Extra context; an empty dict is used when falsy.
        error_details: Optional error details, stored as given.
        affected_operations: Affected operations; empty list when falsy.
        auto_recovery_eligible: Whether to trigger advanced recovery.
        parent_failure_id: ID of the failure that caused this one, if any.

    Returns:
        The generated failure ID (a UUID4 string).
    """
    new_id = str(uuid.uuid4())

    event = FailureEvent(
        failure_id=new_id,
        failure_mode=failure_mode,
        timestamp=datetime.now(timezone.utc),
        component=component,
        description=description,
        severity=severity,
        context=context if context else {},
        error_details=error_details,
        affected_operations=affected_operations if affected_operations else [],
        auto_recovery_eligible=auto_recovery_eligible,
        parent_failure_id=parent_failure_id,
    )

    # Store failure
    self.advanced_failures[new_id] = event
    self.advanced_recovery_stats["total_failures"] += 1

    # Analyze failure mode (only some modes have an analyzer)
    if failure_mode in self.failure_analyzers:
        analyzer = self.failure_analyzers[failure_mode]
        await analyzer(event)

    # Trigger advanced recovery if eligible
    if auto_recovery_eligible:
        await self._trigger_advanced_recovery(event)

    # Check for cascade failures
    await self._check_cascade_failures(event)

    logger.warning(f"Advanced failure reported: {failure_mode.value} in {component} - {description}")
    return new_id
1480
+
1481
async def _trigger_advanced_recovery(self, failure: FailureEvent):
    """Create, register and execute a recovery action for ``failure``.

    The strategy is chosen from the failure mode. A successful execution
    bumps the success counter; an unsuccessful one is recorded in the
    dead-letter queue for manual intervention. Any exception is logged and
    swallowed.

    Args:
        failure: The failure event to recover from.
    """
    try:
        chosen_strategy = self._determine_advanced_recovery_strategy(failure.failure_mode)

        action = AdvancedRecoveryAction(
            action_id=str(uuid.uuid4()),
            failure_id=failure.failure_id,
            strategy=chosen_strategy,
            timestamp=datetime.now(timezone.utc),
            description=f"Advanced recovery for {failure.failure_mode.value}",
            parameters={"failure_context": failure.context},
            priority=self._calculate_recovery_priority(failure),
        )

        stats = self.advanced_recovery_stats
        self.advanced_recovery_actions[action.action_id] = action
        stats["auto_recoveries_attempted"] += 1

        # Execute recovery action
        recovered = await self._execute_advanced_recovery_action(action)

        if not recovered:
            # Add to dead letter queue for manual intervention
            dead_letter = {
                "failure_id": failure.failure_id,
                "action_id": action.action_id,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "reason": "Advanced recovery failed",
            }
            self.dead_letter_queue.append(dead_letter)
            stats["dead_letter_messages"] += 1
        else:
            stats["auto_recoveries_successful"] += 1
            logger.info(f"Advanced recovery successful for failure {failure.failure_id}")

    except Exception as e:
        logger.error(f"Error triggering advanced recovery: {e}")
1519
+
1520
async def _execute_advanced_recovery_action(self, action: AdvancedRecoveryAction) -> bool:
    """Run one recovery action according to its strategy.

    Known dependencies must all be COMPLETED first; otherwise the method
    logs the wait and returns False, leaving the status at IN_PROGRESS.
    Strategies without a dedicated executor fall back to the legacy
    recovery path.

    Args:
        action: The action to execute; its ``status`` and ``execution_log``
            are updated in place.

    Returns:
        True when the strategy executor reports success, False otherwise
        (including when an exception was raised, which is logged).
    """
    action.status = RecoveryStatus.IN_PROGRESS
    action.execution_log.append(f"Starting advanced recovery: {action.strategy.value}")

    try:
        # Check dependencies (unknown dependency IDs are ignored)
        for dependency_id in action.dependencies:
            if dependency_id not in self.advanced_recovery_actions:
                continue
            dependency = self.advanced_recovery_actions[dependency_id]
            if dependency.status != RecoveryStatus.COMPLETED:
                action.execution_log.append(f"Waiting for dependency: {dependency_id}")
                return False

        # Execute based on strategy
        executors = {
            RecoveryStrategy.RETRY_WITH_BACKOFF: self._execute_retry_with_backoff,
            RecoveryStrategy.CIRCUIT_BREAKER: self._execute_circuit_breaker_action,
            RecoveryStrategy.ROLLBACK: self._execute_rollback_action,
            RecoveryStrategy.QUARANTINE: self._execute_quarantine_action,
        }
        executor = executors.get(action.strategy, self._execute_legacy_recovery_action)
        success = await executor(action)

        if success:
            action.status = RecoveryStatus.COMPLETED
            action.execution_log.append("Advanced recovery completed successfully")
        else:
            action.status = RecoveryStatus.FAILED
            action.execution_log.append("Advanced recovery failed")

        return success

    except Exception as e:
        action.status = RecoveryStatus.FAILED
        action.execution_log.append(f"Advanced recovery error: {str(e)}")
        logger.error(f"Error executing advanced recovery action {action.action_id}: {e}")
        return False
1560
+
1561
+ async def _execute_retry_with_backoff(self, action: AdvancedRecoveryAction) -> bool:
1562
+ """Execute retry with exponential backoff"""
1563
+ self.advanced_failures[action.failure_id]
1564
+ base_delay = 1.0
1565
+ max_delay = 60.0
1566
+ backoff_factor = 2.0
1567
+
1568
+ for attempt in range(action.max_retries + 1):
1569
+ try:
1570
+ action.retry_attempts = attempt
1571
+ action.execution_log.append(f"Retry attempt {attempt + 1}/{action.max_retries + 1}")
1572
+
1573
+ # Simulate retry logic - in real implementation, this would call the failing function
1574
+ if attempt >= 2: # Simulate success after a few attempts
1575
+ action.execution_log.append("Retry successful")
1576
+ return True
1577
+ else:
1578
+ action.execution_log.append("Retry failed, will retry again")
1579
+
1580
+ # Wait with exponential backoff
1581
+ if attempt < action.max_retries:
1582
+ delay = min(base_delay * (backoff_factor**attempt), max_delay)
1583
+ await asyncio.sleep(delay)
1584
+
1585
+ except Exception as e:
1586
+ action.execution_log.append(f"Retry attempt {attempt + 1} error: {str(e)}")
1587
+
1588
+ action.execution_log.append("All retry attempts exhausted")
1589
+ return False
1590
+
1591
+ async def _execute_circuit_breaker_action(self, action: AdvancedRecoveryAction) -> bool:
1592
+ """Execute circuit breaker action"""
1593
+ failure = self.advanced_failures[action.failure_id]
1594
+ component = failure.component
1595
+
1596
+ # Update circuit breaker state
1597
+ cb_state = self.circuit_breaker_states[component]
1598
+ cb_state["state"] = "OPEN"
1599
+ cb_state["failure_count"] += 1
1600
+ cb_state["last_failure_time"] = datetime.now(timezone.utc).isoformat()
1601
+
1602
+ action.execution_log.append(f"Circuit breaker opened for component: {component}")
1603
+ return True
1604
+
1605
+ async def _execute_rollback_action(self, action: AdvancedRecoveryAction) -> bool:
1606
+ """Execute rollback action"""
1607
+ try:
1608
+ # Create a snapshot before rollback
1609
+ snapshot_id = await self._create_system_snapshot("pre_rollback_snapshot")
1610
+
1611
+ # Perform rollback logic
1612
+ action.execution_log.append("Creating rollback snapshot and performing rollback")
1613
+
1614
+ # In real implementation, this would restore system state from snapshot
1615
+ self.advanced_recovery_stats["rollbacks_performed"] += 1
1616
+
1617
+ action.rollback_action_id = snapshot_id
1618
+ return True
1619
+
1620
+ except Exception as e:
1621
+ action.execution_log.append(f"Rollback failed: {str(e)}")
1622
+ return False
1623
+
1624
+ async def _execute_quarantine_action(self, action: AdvancedRecoveryAction) -> bool:
1625
+ """Execute quarantine action"""
1626
+ failure = self.advanced_failures[action.failure_id]
1627
+ component = failure.component
1628
+
1629
+ action.execution_log.append(f"Quarantining component: {component}")
1630
+
1631
+ # In real implementation, this would isolate the component
1632
+ # For now, just log the action
1633
+ return True
1634
+
1635
+ async def _execute_legacy_recovery_action(self, action: AdvancedRecoveryAction) -> bool:
1636
+ """Execute legacy recovery action as fallback"""
1637
+ failure = self.advanced_failures[action.failure_id]
1638
+
1639
+ # Convert to legacy format and use existing recovery mechanisms
1640
+ legacy_action = self.recovery_actions.get("restart_research_engines")
1641
+ if not legacy_action:
1642
+ action.execution_log.append("No legacy recovery action available")
1643
+ return False
1644
+
1645
+ # Create legacy error report
1646
+ legacy_error = ErrorReport(
1647
+ id=failure.failure_id,
1648
+ timestamp=failure.timestamp,
1649
+ severity=getattr(ErrorSeverity, failure.severity.upper(), ErrorSeverity.MEDIUM),
1650
+ category=getattr(ErrorCategory, "SYSTEM", ErrorCategory.SYSTEM),
1651
+ message=failure.description,
1652
+ details=failure.error_details or {},
1653
+ stack_trace="",
1654
+ context=failure.context,
1655
+ )
1656
+
1657
+ try:
1658
+ result = legacy_action.handler(legacy_error, action.parameters)
1659
+ if result:
1660
+ action.execution_log.append("Legacy recovery action successful")
1661
+ return True
1662
+ else:
1663
+ action.execution_log.append("Legacy recovery action failed")
1664
+ return False
1665
+ except Exception as e:
1666
+ action.execution_log.append(f"Legacy recovery action error: {str(e)}")
1667
+ return False
1668
+
1669
def _determine_advanced_recovery_strategy(self, failure_mode: FailureMode) -> RecoveryStrategy:
    """Pick the recovery strategy associated with ``failure_mode``.

    Args:
        failure_mode: The classified failure mode.

    Returns:
        The strategy mapped to ``failure_mode``, or
        ``RecoveryStrategy.RETRY_WITH_BACKOFF`` for any mode not listed.
    """
    # Failure modes grouped under the strategy that handles them.
    modes_by_strategy = (
        (
            RecoveryStrategy.RETRY_WITH_BACKOFF,
            (
                FailureMode.HOOK_EXECUTION_FAILURE,
                FailureMode.NETWORK_FAILURE,
                FailureMode.TIMEOUT_FAILURE,
            ),
        ),
        (RecoveryStrategy.DEGRADE_SERVICE, (FailureMode.RESOURCE_EXHAUSTION,)),
        (
            RecoveryStrategy.ROLLBACK,
            (FailureMode.DATA_CORRUPTION, FailureMode.STORAGE_FAILURE),
        ),
        (
            RecoveryStrategy.CIRCUIT_BREAKER,
            (
                FailureMode.SYSTEM_OVERLOAD,
                FailureMode.CIRCUIT_BREAKER_TRIPPED,
                FailureMode.CONCURRENCY_ISSUE,
            ),
        ),
        (RecoveryStrategy.EMERGENCY_STOP, (FailureMode.CASCADE_FAILURE,)),
        (RecoveryStrategy.RESTART_COMPONENT, (FailureMode.MEMORY_LEAK,)),
        (
            RecoveryStrategy.QUARANTINE,
            (FailureMode.DEADLOCK, FailureMode.VALIDATION_FAILURE),
        ),
        (RecoveryStrategy.NOTIFY_ADMIN, (FailureMode.AUTHENTICATION_FAILURE,)),
        (RecoveryStrategy.FAILOVER, (FailureMode.EXTERNAL_SERVICE_FAILURE,)),
    )

    # Invert the grouping into a direct mode -> strategy lookup.
    strategy_for_mode = {
        mode: strategy for strategy, modes in modes_by_strategy for mode in modes
    }
    return strategy_for_mode.get(failure_mode, RecoveryStrategy.RETRY_WITH_BACKOFF)
1690
+
1691
+ def _calculate_recovery_priority(self, failure: FailureEvent) -> int:
1692
+ """Calculate recovery priority based on failure characteristics"""
1693
+ base_priority = 5
1694
+
1695
+ # Adjust based on severity
1696
+ if failure.severity == "critical":
1697
+ base_priority -= 3
1698
+ elif failure.severity == "high":
1699
+ base_priority -= 2
1700
+ elif failure.severity == "medium":
1701
+ base_priority -= 1
1702
+
1703
+ # Adjust based on number of affected operations
1704
+ if len(failure.affected_operations) > 10:
1705
+ base_priority -= 2
1706
+ elif len(failure.affected_operations) > 5:
1707
+ base_priority -= 1
1708
+
1709
+ # Ensure priority is in valid range
1710
+ return max(1, min(10, base_priority))
1711
+
1712
+ async def _check_cascade_failures(self, failure: FailureEvent):
1713
+ """Check for cascade failure patterns"""
1714
+ # Check if this failure is related to other recent failures
1715
+ recent_failures = [
1716
+ f
1717
+ for f in self.advanced_failures.values()
1718
+ if (datetime.now(timezone.utc) - f.timestamp).total_seconds() < 300 # Last 5 minutes
1719
+ and f.failure_id != failure.failure_id
1720
+ ]
1721
+
1722
+ # Simple cascade detection: same component or related components
1723
+ related_failures = [
1724
+ f
1725
+ for f in recent_failures
1726
+ if f.component == failure.component or f.component in failure.context.get("related_components", [])
1727
+ ]
1728
+
1729
+ if len(related_failures) >= 3:
1730
+ self.advanced_recovery_stats["cascade_failures_detected"] += 1
1731
+ logger.warning(f"Cascade failure detected: {len(related_failures)} related failures")
1732
+
1733
+ # Trigger emergency recovery
1734
+ await self._trigger_emergency_recovery(failure, related_failures)
1735
+
1736
async def _trigger_emergency_recovery(self, failure: FailureEvent, related_failures: List[FailureEvent]):
    """Build and execute an EMERGENCY_STOP action for a detected cascade.

    Args:
        failure: The failure that triggered cascade detection; the new
            action is attached to its ``failure_id``.
        related_failures: Failures judged to be part of the cascade. Their
            ids are passed in the action's ``cascade_failures`` parameter.
    """
    cascade_ids = [related.failure_id for related in related_failures]

    stop_action = AdvancedRecoveryAction(
        action_id=str(uuid.uuid4()),
        failure_id=failure.failure_id,
        strategy=RecoveryStrategy.EMERGENCY_STOP,
        timestamp=datetime.now(timezone.utc),
        description="Emergency recovery for cascade failure",
        parameters={"cascade_failures": cascade_ids},
        priority=1,  # 1 is the highest priority
    )
    await self._execute_advanced_recovery_action(stop_action)
1749
+
1750
+ async def _analyze_cascade_failure(self, failure: FailureEvent):
1751
+ """Analyze cascade failure patterns"""
1752
+ # Implementation would analyze failure patterns and correlations
1753
+ pass
1754
+
1755
+ async def _analyze_circuit_breaker_trip(self, failure: FailureEvent):
1756
+ """Analyze circuit breaker trip patterns"""
1757
+ # Implementation would analyze circuit breaker behavior
1758
+ pass
1759
+
1760
+ async def _analyze_resource_exhaustion(self, failure: FailureEvent):
1761
+ """Analyze resource exhaustion patterns"""
1762
+ # Implementation would analyze resource usage patterns
1763
+ pass
1764
+
1765
async def _create_system_snapshot(self, description: str = "", is_rollback_point: bool = False) -> str:
    """Capture the recovery system's current state as a ``SystemSnapshot``.

    The snapshot records error and failure counts, the health status, a
    shallow copy of the circuit breaker states and a copy of the recovery
    statistics. It is stored in ``self.system_snapshots`` and the
    ``snapshots_created`` counter is incremented.

    Args:
        description: Free-text label stored on the snapshot.
        is_rollback_point: Flag stored on the snapshot unchanged.

    Returns:
        The id (a UUID4 string) of the stored snapshot.
    """
    new_id = str(uuid.uuid4())

    recovery_overview = {
        "active_errors": len(self.active_errors),
        "advanced_failures": len(self.advanced_failures),
        "system_health": self.system_health["status"],
    }
    state = {
        "error_recovery_system": recovery_overview,
        "circuit_breakers": dict(self.circuit_breaker_states),
        "recovery_stats": self.advanced_recovery_stats.copy(),
    }

    # Both the configuration hash and the component-state checksum are the
    # SHA-256 of the same key-sorted JSON text, so it is computed once.
    serialized = json.dumps(state, sort_keys=True)
    digest = hashlib.sha256(serialized.encode()).hexdigest()

    self.system_snapshots[new_id] = SystemSnapshot(
        snapshot_id=new_id,
        timestamp=datetime.now(timezone.utc),
        component_states=state,
        configuration_hash=digest,
        data_checksums={"component_states": digest},
        description=description,
        is_rollback_point=is_rollback_point,
    )
    self.advanced_recovery_stats["snapshots_created"] += 1

    logger.info(f"Created system snapshot: {new_id}")
    return new_id
1803
+
1804
def get_advanced_system_status(self) -> Dict[str, Any]:
    """Summarize the advanced recovery subsystem as a plain dict.

    Returns:
        A dict with the fixed ``status`` and ``phase3_features`` markers,
        a copy of the recovery statistics, the number of recorded advanced
        failures, the number of recovery actions that are pending or in
        progress, a copy of the circuit breaker states, the number of stored
        snapshots, the dead-letter queue size and the registered
        failure-analyzer keys.
    """
    unfinished = (RecoveryStatus.PENDING, RecoveryStatus.IN_PROGRESS)
    pending_count = sum(
        1 for queued in self.advanced_recovery_actions.values() if queued.status in unfinished
    )

    return {
        "status": "running",
        "phase3_features": "enabled",
        # Return a copy so callers cannot mutate the live counters.
        "advanced_recovery_statistics": self.advanced_recovery_stats.copy(),
        "active_advanced_failures": len(self.advanced_failures),
        "pending_advanced_actions": pending_count,
        "circuit_breaker_states": dict(self.circuit_breaker_states),
        "system_snapshots": len(self.system_snapshots),
        "dead_letter_queue_size": len(self.dead_letter_queue),
        "failure_mode_analyzers": list(self.failure_analyzers.keys()),
    }
1823
+
1824
+
1825
# Shared ErrorRecoverySystem instance, built lazily by get_error_recovery_system()
_error_recovery_system = None


def get_error_recovery_system(project_root: Path = None) -> ErrorRecoverySystem:
    """Return the shared ErrorRecoverySystem, constructing it on first use.

    Args:
        project_root: Passed to ``ErrorRecoverySystem`` only when the shared
            instance is first built. Later calls return the existing
            instance and ignore this argument.

    Returns:
        The module-wide ``ErrorRecoverySystem`` instance.
    """
    global _error_recovery_system
    if _error_recovery_system is not None:
        return _error_recovery_system

    _error_recovery_system = ErrorRecoverySystem(project_root)
    return _error_recovery_system
1835
+
1836
+
1837
def handle_error(
    error: Exception,
    context: Dict[str, Any] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """Report ``error`` through the shared ErrorRecoverySystem instance.

    Args:
        error: The exception to report.
        context: Optional extra information forwarded with the report.
        severity: Severity forwarded with the report.
        category: Category forwarded with the report.

    Returns:
        Whatever the shared system's ``handle_error`` returns.
    """
    recovery_system = get_error_recovery_system()
    return recovery_system.handle_error(error, context, severity, category)
1845
+
1846
+
1847
+ # Decorator for automatic error handling
1848
def error_handler(
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
    context: Dict[str, Any] = None,
):
    """Build a decorator that reports exceptions to the global recovery system.

    The wrapped function is called unchanged. If it raises, the exception is
    passed to ``handle_error`` together with the function's name and module
    and truncated reprs of its arguments, then re-raised to the caller.

    Args:
        severity: Severity forwarded to ``handle_error``.
        category: Category forwarded to ``handle_error``.
        context: Optional extra entries merged into the reported context;
            they override the generated keys on collision.

    Returns:
        A decorator for synchronous callables.
    """
    # Local import keeps this block self-contained.
    import functools

    def decorator(func):
        # Preserve the wrapped function's __name__, __doc__ and __module__.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_context = {
                    "function": func.__name__,
                    "module": func.__module__,
                    "args": str(args)[:100],  # Limit length
                    "kwargs": str(kwargs)[:100],
                    **(context or {}),
                }
                handle_error(e, error_context, severity, category)
                raise

        return wrapper

    return decorator
1873
+
1874
+
1875
def _run_demo():
    """Run a small console demo: handle one error, then print health and summary."""
    recovery_system = ErrorRecoverySystem()

    print("Error Recovery System Demo")
    print("=" * 50)

    # Raise and catch a real exception so the handler sees a live traceback.
    try:
        raise ValueError("This is a test error for demonstration")
    except Exception as e:
        error_report = recovery_system.handle_error(
            e,
            context={"demo": True},
            severity=ErrorSeverity.MEDIUM,
            category=ErrorCategory.SYSTEM,
        )
        print(f"Handled error: {error_report.id}")

    # Report overall health after the simulated error.
    health = recovery_system.get_system_health()
    print(f"System health: {health['status']}")

    # Report how many recent errors were recorded.
    summary = recovery_system.get_error_summary()
    print(f"Total errors: {summary['total_recent_errors']}")

    print("\nError Recovery System demo completed")


if __name__ == "__main__":
    _run_demo()