blockmine 1.21.0 → 1.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (492)
  1. package/.claude/agents/README.md +469 -0
  2. package/.claude/agents/auth-route-debugger.md +118 -0
  3. package/.claude/agents/auth-route-tester.md +93 -0
  4. package/.claude/agents/auto-error-resolver.md +97 -0
  5. package/.claude/agents/build-optimizer.md +236 -0
  6. package/.claude/agents/code-architecture-reviewer.md +83 -0
  7. package/.claude/agents/code-refactor-master.md +94 -0
  8. package/.claude/agents/cost-optimizer.md +134 -0
  9. package/.claude/agents/deployment-orchestrator.md +113 -0
  10. package/.claude/agents/documentation-architect.md +82 -0
  11. package/.claude/agents/frontend-error-fixer.md +77 -0
  12. package/.claude/agents/iac-code-generator.md +71 -0
  13. package/.claude/agents/incident-responder.md +346 -0
  14. package/.claude/agents/infrastructure-architect.md +31 -0
  15. package/.claude/agents/kubernetes-specialist.md +56 -0
  16. package/.claude/agents/migration-planner.md +181 -0
  17. package/.claude/agents/network-architect.md +196 -0
  18. package/.claude/agents/plan-reviewer.md +52 -0
  19. package/.claude/agents/refactor-planner.md +63 -0
  20. package/.claude/agents/security-scanner.md +102 -0
  21. package/.claude/agents/web-research-specialist.md +78 -0
  22. package/.claude/commands/cost-analysis.md +315 -0
  23. package/.claude/commands/dev-docs-update.md +55 -0
  24. package/.claude/commands/dev-docs.md +51 -0
  25. package/.claude/commands/incident-debug.md +247 -0
  26. package/.claude/commands/infra-plan.md +81 -0
  27. package/.claude/commands/migration-plan.md +478 -0
  28. package/.claude/commands/route-research-for-testing.md +37 -0
  29. package/.claude/commands/security-review.md +66 -0
  30. package/.claude/hooks/CONFIG.md +448 -0
  31. package/.claude/hooks/README.md +163 -0
  32. package/.claude/hooks/SKILL_ACTIVATION_COMPLETE.md +226 -0
  33. package/.claude/hooks/WINDOWS_HOOKS_README.md +151 -0
  34. package/.claude/hooks/add-skill-activation-banners.ts +132 -0
  35. package/.claude/hooks/comprehensive-skill-test.ts +1315 -0
  36. package/.claude/hooks/error-handling-reminder.sh +12 -0
  37. package/.claude/hooks/error-handling-reminder.ts +222 -0
  38. package/.claude/hooks/k8s-manifest-validator.sh +56 -0
  39. package/.claude/hooks/package-lock.json +556 -0
  40. package/.claude/hooks/package.json +16 -0
  41. package/.claude/hooks/post-tool-use-tracker.ps1 +174 -0
  42. package/.claude/hooks/post-tool-use-tracker.sh +183 -0
  43. package/.claude/hooks/security-policy-check.sh +247 -0
  44. package/.claude/hooks/skill-activation-prompt.ps1 +10 -0
  45. package/.claude/hooks/skill-activation-prompt.sh +10 -0
  46. package/.claude/hooks/skill-activation-prompt.ts +141 -0
  47. package/.claude/hooks/stop-build-check-enhanced.sh +130 -0
  48. package/.claude/hooks/terraform-validator.sh +53 -0
  49. package/.claude/hooks/test-input.json +7 -0
  50. package/.claude/hooks/test-skill-activation.ts +427 -0
  51. package/.claude/hooks/trigger-build-resolver.sh +79 -0
  52. package/.claude/hooks/tsc-check.sh +173 -0
  53. package/.claude/hooks/tsconfig.json +19 -0
  54. package/.claude/settings.json +59 -0
  55. package/.claude/settings.local.json +36 -14
  56. package/.claude/skills/README.md +507 -0
  57. package/.claude/skills/api-engineering/SKILL.md +63 -0
  58. package/.claude/skills/api-engineering/resources/api-versioning.md +88 -0
  59. package/.claude/skills/api-engineering/resources/graphql-patterns.md +106 -0
  60. package/.claude/skills/api-engineering/resources/rate-limiting.md +118 -0
  61. package/.claude/skills/api-engineering/resources/rest-api-design.md +105 -0
  62. package/.claude/skills/backend-dev-guidelines/SKILL.md +306 -0
  63. package/.claude/skills/backend-dev-guidelines/resources/architecture-overview.md +451 -0
  64. package/.claude/skills/backend-dev-guidelines/resources/async-and-errors.md +307 -0
  65. package/.claude/skills/backend-dev-guidelines/resources/complete-examples.md +638 -0
  66. package/.claude/skills/backend-dev-guidelines/resources/configuration.md +275 -0
  67. package/.claude/skills/backend-dev-guidelines/resources/database-patterns.md +224 -0
  68. package/.claude/skills/backend-dev-guidelines/resources/middleware-guide.md +213 -0
  69. package/.claude/skills/backend-dev-guidelines/resources/routing-and-controllers.md +756 -0
  70. package/.claude/skills/backend-dev-guidelines/resources/sentry-and-monitoring.md +336 -0
  71. package/.claude/skills/backend-dev-guidelines/resources/services-and-repositories.md +789 -0
  72. package/.claude/skills/backend-dev-guidelines/resources/testing-guide.md +235 -0
  73. package/.claude/skills/backend-dev-guidelines/resources/validation-patterns.md +754 -0
  74. package/.claude/skills/budget-and-cost-management/SKILL.md +850 -0
  75. package/.claude/skills/build-engineering/SKILL.md +431 -0
  76. package/.claude/skills/build-engineering/resources/artifact-repositories.md +72 -0
  77. package/.claude/skills/build-engineering/resources/build-caching.md +96 -0
  78. package/.claude/skills/build-engineering/resources/build-pipelines.md +105 -0
  79. package/.claude/skills/build-engineering/resources/build-security.md +95 -0
  80. package/.claude/skills/build-engineering/resources/build-systems.md +389 -0
  81. package/.claude/skills/build-engineering/resources/compilation-optimization.md +201 -0
  82. package/.claude/skills/build-engineering/resources/dependency-management.md +73 -0
  83. package/.claude/skills/build-engineering/resources/monorepo-builds.md +110 -0
  84. package/.claude/skills/build-engineering/resources/performance-optimization.md +113 -0
  85. package/.claude/skills/build-engineering/resources/reproducible-builds.md +82 -0
  86. package/.claude/skills/cloud-engineering/SKILL.md +675 -0
  87. package/.claude/skills/cloud-engineering/resources/aws-patterns.md +742 -0
  88. package/.claude/skills/cloud-engineering/resources/azure-patterns.md +714 -0
  89. package/.claude/skills/cloud-engineering/resources/cleared-cloud-environments.md +987 -0
  90. package/.claude/skills/cloud-engineering/resources/cloud-cost-optimization.md +757 -0
  91. package/.claude/skills/cloud-engineering/resources/cloud-networking.md +1058 -0
  92. package/.claude/skills/cloud-engineering/resources/cloud-security-tools.md +1530 -0
  93. package/.claude/skills/cloud-engineering/resources/cloud-security.md +990 -0
  94. package/.claude/skills/cloud-engineering/resources/gcp-patterns.md +758 -0
  95. package/.claude/skills/cloud-engineering/resources/migration-strategies.md +820 -0
  96. package/.claude/skills/cloud-engineering/resources/multi-cloud-strategies.md +670 -0
  97. package/.claude/skills/cloud-engineering/resources/oci-patterns.md +1198 -0
  98. package/.claude/skills/cloud-engineering/resources/serverless-patterns.md +795 -0
  99. package/.claude/skills/cloud-engineering/resources/well-architected-frameworks.md +966 -0
  100. package/.claude/skills/cybersecurity/SKILL.md +409 -0
  101. package/.claude/skills/cybersecurity/resources/security-architecture.md +266 -0
  102. package/.claude/skills/database-engineering/SKILL.md +61 -0
  103. package/.claude/skills/database-engineering/resources/backup-and-recovery.md +72 -0
  104. package/.claude/skills/database-engineering/resources/database-replication.md +63 -0
  105. package/.claude/skills/database-engineering/resources/postgresql-fundamentals.md +70 -0
  106. package/.claude/skills/database-engineering/resources/query-optimization.md +68 -0
  107. package/.claude/skills/devsecops/SKILL.md +374 -0
  108. package/.claude/skills/devsecops/resources/ci-cd-security.md +204 -0
  109. package/.claude/skills/devsecops/resources/compliance-automation.md +530 -0
  110. package/.claude/skills/devsecops/resources/compliance-frameworks.md +2322 -0
  111. package/.claude/skills/devsecops/resources/container-security.md +915 -0
  112. package/.claude/skills/devsecops/resources/cspm-integration.md +1440 -0
  113. package/.claude/skills/devsecops/resources/policy-enforcement.md +619 -0
  114. package/.claude/skills/devsecops/resources/secrets-management.md +755 -0
  115. package/.claude/skills/devsecops/resources/security-monitoring.md +146 -0
  116. package/.claude/skills/devsecops/resources/security-scanning.md +887 -0
  117. package/.claude/skills/devsecops/resources/security-testing.md +203 -0
  118. package/.claude/skills/devsecops/resources/supply-chain-security.md +518 -0
  119. package/.claude/skills/devsecops/resources/vulnerability-management.md +481 -0
  120. package/.claude/skills/devsecops/resources/zero-trust-architecture.md +177 -0
  121. package/.claude/skills/documentation-as-code/SKILL.md +323 -0
  122. package/.claude/skills/documentation-as-code/resources/api-documentation.md +90 -0
  123. package/.claude/skills/documentation-as-code/resources/changelog-management.md +79 -0
  124. package/.claude/skills/documentation-as-code/resources/diagram-generation.md +44 -0
  125. package/.claude/skills/documentation-as-code/resources/docs-as-code-workflow.md +99 -0
  126. package/.claude/skills/documentation-as-code/resources/documentation-automation.md +68 -0
  127. package/.claude/skills/documentation-as-code/resources/documentation-sites.md +79 -0
  128. package/.claude/skills/documentation-as-code/resources/markdown-best-practices.md +162 -0
  129. package/.claude/skills/documentation-as-code/resources/openapi-specification.md +77 -0
  130. package/.claude/skills/documentation-as-code/resources/readme-engineering.md +60 -0
  131. package/.claude/skills/documentation-as-code/resources/technical-writing-guide.md +202 -0
  132. package/.claude/skills/engineering-management/SKILL.md +356 -0
  133. package/.claude/skills/engineering-management/resources/career-ladders.md +609 -0
  134. package/.claude/skills/engineering-management/resources/hiring-and-assessment.md +555 -0
  135. package/.claude/skills/engineering-management/resources/one-on-one-guides.md +609 -0
  136. package/.claude/skills/engineering-management/resources/resource-planning.md +557 -0
  137. package/.claude/skills/engineering-management/resources/team-organization-patterns.md +491 -0
  138. package/.claude/skills/engineering-management/resources/technical-interviews.md +474 -0
  139. package/.claude/skills/engineering-operations-management/SKILL.md +817 -0
  140. package/.claude/skills/error-tracking/SKILL.md +379 -0
  141. package/.claude/skills/frontend-dev-guidelines/SKILL.md +403 -0
  142. package/.claude/skills/frontend-dev-guidelines/resources/common-patterns.md +331 -0
  143. package/.claude/skills/frontend-dev-guidelines/resources/complete-examples.md +872 -0
  144. package/.claude/skills/frontend-dev-guidelines/resources/component-patterns.md +502 -0
  145. package/.claude/skills/frontend-dev-guidelines/resources/data-fetching.md +767 -0
  146. package/.claude/skills/frontend-dev-guidelines/resources/file-organization.md +502 -0
  147. package/.claude/skills/frontend-dev-guidelines/resources/loading-and-error-states.md +501 -0
  148. package/.claude/skills/frontend-dev-guidelines/resources/performance.md +406 -0
  149. package/.claude/skills/frontend-dev-guidelines/resources/routing-guide.md +364 -0
  150. package/.claude/skills/frontend-dev-guidelines/resources/styling-guide.md +428 -0
  151. package/.claude/skills/frontend-dev-guidelines/resources/typescript-standards.md +418 -0
  152. package/.claude/skills/general-it-engineering/SKILL.md +393 -0
  153. package/.claude/skills/general-it-engineering/resources/asset-management.md +712 -0
  154. package/.claude/skills/general-it-engineering/resources/automation-orchestration.md +817 -0
  155. package/.claude/skills/general-it-engineering/resources/business-continuity.md +786 -0
  156. package/.claude/skills/general-it-engineering/resources/change-management.md +715 -0
  157. package/.claude/skills/general-it-engineering/resources/enterprise-monitoring.md +729 -0
  158. package/.claude/skills/general-it-engineering/resources/help-desk-operations.md +738 -0
  159. package/.claude/skills/general-it-engineering/resources/incident-service-management.md +834 -0
  160. package/.claude/skills/general-it-engineering/resources/it-governance.md +753 -0
  161. package/.claude/skills/general-it-engineering/resources/itil-framework.md +503 -0
  162. package/.claude/skills/general-it-engineering/resources/service-management.md +669 -0
  163. package/.claude/skills/infrastructure-architecture/SKILL.md +328 -0
  164. package/.claude/skills/infrastructure-architecture/resources/architecture-decision-records.md +505 -0
  165. package/.claude/skills/infrastructure-architecture/resources/architecture-patterns.md +528 -0
  166. package/.claude/skills/infrastructure-architecture/resources/capacity-planning.md +453 -0
  167. package/.claude/skills/infrastructure-architecture/resources/cleared-environment-architecture.md +773 -0
  168. package/.claude/skills/infrastructure-architecture/resources/cost-architecture.md +499 -0
  169. package/.claude/skills/infrastructure-architecture/resources/data-architecture.md +501 -0
  170. package/.claude/skills/infrastructure-architecture/resources/disaster-recovery.md +535 -0
  171. package/.claude/skills/infrastructure-architecture/resources/migration-architecture.md +512 -0
  172. package/.claude/skills/infrastructure-architecture/resources/multi-region-design.md +608 -0
  173. package/.claude/skills/infrastructure-architecture/resources/reference-architectures.md +562 -0
  174. package/.claude/skills/infrastructure-architecture/resources/security-architecture.md +538 -0
  175. package/.claude/skills/infrastructure-architecture/resources/system-design-principles.md +489 -0
  176. package/.claude/skills/infrastructure-architecture/resources/workload-classification.md +1000 -0
  177. package/.claude/skills/infrastructure-strategy/SKILL.md +924 -0
  178. package/.claude/skills/network-engineering/SKILL.md +385 -0
  179. package/.claude/skills/network-engineering/resources/dns-management.md +738 -0
  180. package/.claude/skills/network-engineering/resources/load-balancing.md +820 -0
  181. package/.claude/skills/network-engineering/resources/network-architecture.md +546 -0
  182. package/.claude/skills/network-engineering/resources/network-security.md +921 -0
  183. package/.claude/skills/network-engineering/resources/network-troubleshooting.md +749 -0
  184. package/.claude/skills/network-engineering/resources/routing-switching.md +373 -0
  185. package/.claude/skills/network-engineering/resources/sdn-networking.md +695 -0
  186. package/.claude/skills/network-engineering/resources/service-mesh-networking.md +777 -0
  187. package/.claude/skills/network-engineering/resources/tcp-ip-protocols.md +444 -0
  188. package/.claude/skills/network-engineering/resources/vpn-connectivity.md +672 -0
  189. package/.claude/skills/observability-engineering/SKILL.md +101 -0
  190. package/.claude/skills/observability-engineering/resources/apm-tools.md +97 -0
  191. package/.claude/skills/observability-engineering/resources/correlation-strategies.md +87 -0
  192. package/.claude/skills/observability-engineering/resources/distributed-tracing.md +98 -0
  193. package/.claude/skills/observability-engineering/resources/logs-aggregation.md +118 -0
  194. package/.claude/skills/observability-engineering/resources/observability-cost-optimization.md +141 -0
  195. package/.claude/skills/observability-engineering/resources/opentelemetry.md +110 -0
  196. package/.claude/skills/platform-engineering/SKILL.md +555 -0
  197. package/.claude/skills/platform-engineering/resources/architecture-overview.md +600 -0
  198. package/.claude/skills/platform-engineering/resources/container-orchestration.md +916 -0
  199. package/.claude/skills/platform-engineering/resources/cost-optimization.md +634 -0
  200. package/.claude/skills/platform-engineering/resources/developer-platforms.md +670 -0
  201. package/.claude/skills/platform-engineering/resources/gitops-automation.md +650 -0
  202. package/.claude/skills/platform-engineering/resources/infrastructure-as-code.md +778 -0
  203. package/.claude/skills/platform-engineering/resources/infrastructure-standards.md +708 -0
  204. package/.claude/skills/platform-engineering/resources/multi-tenancy.md +602 -0
  205. package/.claude/skills/platform-engineering/resources/platform-security.md +711 -0
  206. package/.claude/skills/platform-engineering/resources/resource-management.md +592 -0
  207. package/.claude/skills/platform-engineering/resources/service-mesh.md +628 -0
  208. package/.claude/skills/release-engineering/SKILL.md +393 -0
  209. package/.claude/skills/release-engineering/resources/artifact-management.md +108 -0
  210. package/.claude/skills/release-engineering/resources/build-optimization.md +84 -0
  211. package/.claude/skills/release-engineering/resources/ci-cd-pipelines.md +411 -0
  212. package/.claude/skills/release-engineering/resources/deployment-strategies.md +197 -0
  213. package/.claude/skills/release-engineering/resources/pipeline-security.md +62 -0
  214. package/.claude/skills/release-engineering/resources/progressive-delivery.md +83 -0
  215. package/.claude/skills/release-engineering/resources/release-automation.md +68 -0
  216. package/.claude/skills/release-engineering/resources/release-orchestration.md +77 -0
  217. package/.claude/skills/release-engineering/resources/rollback-strategies.md +66 -0
  218. package/.claude/skills/release-engineering/resources/versioning-strategies.md +59 -0
  219. package/.claude/skills/route-tester/SKILL.md +392 -0
  220. package/.claude/skills/skill-developer/ADVANCED.md +197 -0
  221. package/.claude/skills/skill-developer/HOOK_MECHANISMS.md +306 -0
  222. package/.claude/skills/skill-developer/PATTERNS_LIBRARY.md +152 -0
  223. package/.claude/skills/skill-developer/SKILL.md +430 -0
  224. package/.claude/skills/skill-developer/SKILL_RULES_REFERENCE.md +315 -0
  225. package/.claude/skills/skill-developer/TRIGGER_TYPES.md +305 -0
  226. package/.claude/skills/skill-developer/TROUBLESHOOTING.md +514 -0
  227. package/.claude/skills/skill-rules.json +2940 -0
  228. package/.claude/skills/sre/SKILL.md +464 -0
  229. package/.claude/skills/sre/resources/alerting-best-practices.md +282 -0
  230. package/.claude/skills/sre/resources/capacity-planning.md +226 -0
  231. package/.claude/skills/sre/resources/chaos-engineering.md +193 -0
  232. package/.claude/skills/sre/resources/disaster-recovery.md +232 -0
  233. package/.claude/skills/sre/resources/incident-management.md +436 -0
  234. package/.claude/skills/sre/resources/observability-stack.md +240 -0
  235. package/.claude/skills/sre/resources/on-call-runbooks.md +167 -0
  236. package/.claude/skills/sre/resources/performance-optimization.md +108 -0
  237. package/.claude/skills/sre/resources/reliability-patterns.md +183 -0
  238. package/.claude/skills/sre/resources/slo-sli-sla.md +464 -0
  239. package/.claude/skills/sre/resources/toil-reduction.md +145 -0
  240. package/.claude/skills/systems-engineering/SKILL.md +648 -0
  241. package/.claude/skills/systems-engineering/resources/automation-patterns.md +771 -0
  242. package/.claude/skills/systems-engineering/resources/configuration-management.md +998 -0
  243. package/.claude/skills/systems-engineering/resources/linux-administration.md +672 -0
  244. package/.claude/skills/systems-engineering/resources/networking-fundamentals.md +982 -0
  245. package/.claude/skills/systems-engineering/resources/performance-tuning.md +871 -0
  246. package/.claude/skills/systems-engineering/resources/powershell-scripting.md +482 -0
  247. package/.claude/skills/systems-engineering/resources/security-hardening.md +739 -0
  248. package/.claude/skills/systems-engineering/resources/shell-scripting.md +915 -0
  249. package/.claude/skills/systems-engineering/resources/storage-management.md +628 -0
  250. package/.claude/skills/systems-engineering/resources/system-monitoring.md +787 -0
  251. package/.claude/skills/systems-engineering/resources/troubleshooting-guide.md +753 -0
  252. package/.claude/skills/systems-engineering/resources/windows-administration.md +738 -0
  253. package/.claude/skills/technical-leadership/SKILL.md +728 -0
  254. package/CHANGELOG.md +102 -42
  255. package/CLAUDE.md +284 -0
  256. package/README.md +315 -71
  257. package/backend/docs/SECRETS_DOCUMENTATION.md +327 -0
  258. package/backend/jest.config.js +59 -0
  259. package/backend/package-lock.json +6801 -0
  260. package/backend/package.json +24 -4
  261. package/backend/prisma/migrations/20251026104609_add_websocket_api/migration.sql +33 -0
  262. package/backend/prisma/migrations/20251116111851_add_execution_trace/migration.sql +22 -0
  263. package/backend/prisma/migrations/20251120154914_add_panel_api_keys/migration.sql +21 -0
  264. package/backend/prisma/migrations/20251121110241_add_proxy_table/migration.sql +45 -0
  265. package/backend/prisma/migrations/migration_lock.toml +2 -2
  266. package/backend/prisma/schema.prisma +103 -1
  267. package/backend/src/__tests__/core/DependencyService.test.js +336 -0
  268. package/backend/src/__tests__/core/UserService.test.js +875 -0
  269. package/backend/src/__tests__/repositories/BaseRepository.test.js +146 -0
  270. package/backend/src/__tests__/repositories/BotRepository.test.js +118 -0
  271. package/backend/src/__tests__/repositories/CommandRepository.test.js +132 -0
  272. package/backend/src/__tests__/repositories/EventGraphRepository.test.js +93 -0
  273. package/backend/src/__tests__/repositories/GroupRepository.test.js +155 -0
  274. package/backend/src/__tests__/repositories/PermissionRepository.test.js +130 -0
  275. package/backend/src/__tests__/repositories/PluginRepository.test.js +107 -0
  276. package/backend/src/__tests__/repositories/ServerRepository.test.js +80 -0
  277. package/backend/src/__tests__/repositories/UserRepository.test.js +128 -0
  278. package/backend/src/__tests__/secretsFilter.test.js +425 -0
  279. package/backend/src/__tests__/services/BotLifecycleService.test.js +416 -0
  280. package/backend/src/__tests__/services/BotProcessManager.test.js +285 -0
  281. package/backend/src/__tests__/services/CacheManager.test.js +125 -0
  282. package/backend/src/__tests__/services/CommandExecutionService.test.js +460 -0
  283. package/backend/src/__tests__/services/ResourceMonitorService.test.js +207 -0
  284. package/backend/src/__tests__/services/TelemetryService.test.js +291 -0
  285. package/backend/src/__tests__/setup.js +25 -0
  286. package/backend/src/ai/plugin-assistant-system-prompt.md +788 -0
  287. package/backend/src/api/middleware/auth.js +27 -0
  288. package/backend/src/api/middleware/botAccess.js +7 -3
  289. package/backend/src/api/middleware/panelApiAuth.js +135 -0
  290. package/backend/src/api/routes/aiAssistant.js +995 -0
  291. package/backend/src/api/routes/apiKeys.js +181 -0
  292. package/backend/src/api/routes/auth.js +669 -633
  293. package/backend/src/api/routes/botCommands.js +107 -0
  294. package/backend/src/api/routes/botGroups.js +165 -0
  295. package/backend/src/api/routes/botHistory.js +108 -0
  296. package/backend/src/api/routes/botPermissions.js +99 -0
  297. package/backend/src/api/routes/botStatus.js +36 -0
  298. package/backend/src/api/routes/botUsers.js +162 -0
  299. package/backend/src/api/routes/bots.js +2451 -2360
  300. package/backend/src/api/routes/eventGraphs.js +4 -1
  301. package/backend/src/api/routes/logs.js +13 -3
  302. package/backend/src/api/routes/panel.js +66 -66
  303. package/backend/src/api/routes/panelApiKeys.js +179 -0
  304. package/backend/src/api/routes/pluginIde.js +1715 -135
  305. package/backend/src/api/routes/plugins.js +376 -218
  306. package/backend/src/api/routes/proxies.js +130 -0
  307. package/backend/src/api/routes/search.js +4 -0
  308. package/backend/src/api/routes/servers.js +20 -3
  309. package/backend/src/api/routes/settings.js +5 -0
  310. package/backend/src/api/routes/system.js +174 -0
  311. package/backend/src/api/routes/traces.js +131 -0
  312. package/backend/src/config/debug.config.js +36 -0
  313. package/backend/src/container.js +82 -0
  314. package/backend/src/core/BotHistoryStore.js +180 -0
  315. package/backend/src/core/BotManager.js +149 -868
  316. package/backend/src/core/BotManager.old.js +1093 -0
  317. package/backend/src/core/BotProcess.js +850 -191
  318. package/backend/src/core/EventGraphManager.js +194 -198
  319. package/backend/src/core/GraphExecutionEngine.js +709 -57
  320. package/backend/src/core/MessageQueue.js +39 -12
  321. package/backend/src/core/NodeRegistry.js +37 -1134
  322. package/backend/src/core/PluginLoader.js +99 -5
  323. package/backend/src/core/PluginManager.js +126 -15
  324. package/backend/src/core/PrismaService.js +32 -0
  325. package/backend/src/core/TaskScheduler.js +1 -1
  326. package/backend/src/core/UserService.js +3 -3
  327. package/backend/src/core/__tests__/PrismaService.test.js +24 -0
  328. package/backend/src/core/commands/README.md +305 -0
  329. package/backend/src/core/commands/dev.js +13 -7
  330. package/backend/src/core/commands/ping.js +10 -4
  331. package/backend/src/core/commands/whois.js +63 -0
  332. package/backend/src/core/config/validation.js +27 -0
  333. package/backend/src/core/constants/graphTypes.js +21 -0
  334. package/backend/src/core/node-registries/actions.js +202 -0
  335. package/backend/src/core/node-registries/arrays.js +155 -0
  336. package/backend/src/core/node-registries/bot.js +23 -0
  337. package/backend/src/core/node-registries/data.js +290 -0
  338. package/backend/src/core/node-registries/debug.js +26 -0
  339. package/backend/src/core/node-registries/events.js +201 -0
  340. package/backend/src/core/node-registries/flow.js +139 -0
  341. package/backend/src/core/node-registries/logic.js +62 -0
  342. package/backend/src/core/node-registries/math.js +42 -0
  343. package/backend/src/core/node-registries/objects.js +98 -0
  344. package/backend/src/core/node-registries/strings.js +187 -0
  345. package/backend/src/core/node-registries/time.js +113 -0
  346. package/backend/src/core/node-registries/type.js +25 -0
  347. package/backend/src/core/node-registries/users.js +79 -0
  348. package/backend/src/core/nodes/{action_bot_look_at.js → actions/bot_look_at.js} +36 -36
  349. package/backend/src/core/nodes/{action_bot_set_variable.js → actions/bot_set_variable.js} +32 -32
  350. package/backend/src/core/nodes/actions/create_command.js +189 -0
  351. package/backend/src/core/nodes/actions/delete_command.js +92 -0
  352. package/backend/src/core/nodes/{action_send_log.js → actions/send_log.js} +28 -23
  353. package/backend/src/core/nodes/{action_send_message.js → actions/send_message.js} +32 -32
  354. package/backend/src/core/nodes/actions/send_websocket_response.js +33 -0
  355. package/backend/src/core/nodes/actions/update_command.js +133 -0
  356. package/backend/src/core/nodes/arrays/get_next.js +35 -0
  357. package/backend/src/core/nodes/arrays/join.js +28 -0
  358. package/backend/src/core/nodes/{data_cast.js → data/cast.js} +10 -1
  359. package/backend/src/core/nodes/data/datetime_literal.js +27 -0
  360. package/backend/src/core/nodes/data/entity_info.js +69 -0
  361. package/backend/src/core/nodes/data/get_nearby_entities.js +32 -0
  362. package/backend/src/core/nodes/data/get_nearby_players.js +64 -0
  363. package/backend/src/core/nodes/{data_get_user_field.js → data/get_user_field.js} +1 -1
  364. package/backend/src/core/nodes/data/type_check.js +53 -0
  365. package/backend/src/core/nodes/{debug_log.js → debug/log.js} +16 -16
  366. package/backend/src/core/nodes/{flow_branch.js → flow/branch.js} +15 -15
  367. package/backend/src/core/nodes/{flow_break.js → flow/break.js} +14 -14
  368. package/backend/src/core/nodes/flow/delay.js +43 -0
  369. package/backend/src/core/nodes/{flow_for_each.js → flow/for_each.js} +39 -39
  370. package/backend/src/core/nodes/{flow_sequence.js → flow/sequence.js} +16 -16
  371. package/backend/src/core/nodes/{flow_switch.js → flow/switch.js} +47 -47
  372. package/backend/src/core/nodes/{flow_while.js → flow/while.js} +1 -1
  373. package/backend/src/core/nodes/logic/__tests__/compare.test.js +83 -0
  374. package/backend/src/core/nodes/logic/not.js +22 -0
  375. package/backend/src/core/nodes/math/__tests__/operation.test.js +65 -0
  376. package/backend/src/core/nodes/strings/__tests__/concat.test.js +89 -0
  377. package/backend/src/core/nodes/{string_starts_with.js → strings/starts_with.js} +1 -1
  378. package/backend/src/core/nodes/strings/to_lower.js +22 -0
  379. package/backend/src/core/nodes/strings/to_upper.js +22 -0
  380. package/backend/src/core/nodes/time/__tests__/now.test.js +24 -0
  381. package/backend/src/core/nodes/time/add.js +33 -0
  382. package/backend/src/core/nodes/time/compare.js +35 -0
  383. package/backend/src/core/nodes/time/diff.js +29 -0
  384. package/backend/src/core/nodes/time/format.js +32 -0
  385. package/backend/src/core/nodes/time/now.js +18 -0
  386. package/backend/src/core/nodes/type/to_string.js +32 -0
  387. package/backend/src/core/nodes/{user_check_blacklist.js → users/check_blacklist.js} +37 -37
  388. package/backend/src/core/nodes/{user_get_groups.js → users/get_groups.js} +36 -36
  389. package/backend/src/core/nodes/{user_get_permissions.js → users/get_permissions.js} +36 -36
  390. package/backend/src/core/nodes/{user_set_blacklist.js → users/set_blacklist.js} +37 -37
  391. package/backend/src/core/services/BotLifecycleService.js +835 -0
  392. package/backend/src/core/services/BotProcessManager.js +163 -0
  393. package/backend/src/core/services/CacheManager.js +111 -0
  394. package/backend/src/core/services/CommandExecutionService.js +430 -0
  395. package/backend/src/core/services/DebugSessionManager.js +347 -0
  396. package/backend/src/core/services/GraphCollaborationManager.js +501 -0
  397. package/backend/src/core/services/MinecraftBotManager.js +259 -0
  398. package/backend/src/core/services/MinecraftViewerService.js +216 -0
  399. package/backend/src/core/services/ResourceMonitorService.js +90 -0
  400. package/backend/src/core/services/TelemetryService.js +124 -0
  401. package/backend/src/core/services/TraceCollectorService.js +545 -0
  402. package/backend/src/core/services/ValidationService.js +132 -0
  403. package/backend/src/core/services/__tests__/ValidationService.test.js +148 -0
  404. package/backend/src/core/services.js +20 -5
  405. package/backend/src/core/system/CommandContext.js +84 -0
  406. package/backend/src/core/system/RuntimeCommandRegistry.js +116 -0
  407. package/backend/src/core/system/Transport.js +74 -0
  408. package/backend/src/core/utils/__tests__/jsonParser.test.js +44 -0
  409. package/backend/src/core/utils/jsonParser.js +18 -0
  410. package/backend/src/core/utils/secretsFilter.js +262 -0
  411. package/backend/src/core/utils/variableParser.js +89 -0
  412. package/backend/src/core/validation/__tests__/nodeSchemas.test.js +175 -0
  413. package/backend/src/core/validation/nodeSchemas.js +112 -0
  414. package/backend/src/lib/prisma.js +2 -4
  415. package/backend/src/real-time/botApi/handlers/commandHandlers.js +28 -0
  416. package/backend/src/real-time/botApi/handlers/graphHandlers.js +99 -0
  417. package/backend/src/real-time/botApi/handlers/graphWebSocketHandlers.js +147 -0
  418. package/backend/src/real-time/botApi/handlers/index.js +43 -0
  419. package/backend/src/real-time/botApi/handlers/messageHandlers.js +66 -0
  420. package/backend/src/real-time/botApi/handlers/statusHandlers.js +17 -0
  421. package/backend/src/real-time/botApi/handlers/userHandlers.js +141 -0
  422. package/backend/src/real-time/botApi/index.js +40 -0
  423. package/backend/src/real-time/botApi/middleware.js +79 -0
  424. package/backend/src/real-time/botApi/utils.js +65 -0
  425. package/backend/src/real-time/panelNamespace.js +387 -0
  426. package/backend/src/real-time/presence.js +7 -2
  427. package/backend/src/real-time/socketHandler.js +400 -5
  428. package/backend/src/repositories/BaseRepository.js +43 -0
  429. package/backend/src/repositories/BotRepository.js +42 -0
  430. package/backend/src/repositories/CommandRepository.js +53 -0
  431. package/backend/src/repositories/EventGraphRepository.js +40 -0
  432. package/backend/src/repositories/GroupRepository.js +69 -0
  433. package/backend/src/repositories/PermissionRepository.js +48 -0
  434. package/backend/src/repositories/PluginRepository.js +42 -0
  435. package/backend/src/repositories/ServerRepository.js +27 -0
  436. package/backend/src/repositories/UserRepository.js +48 -0
  437. package/backend/src/server.js +21 -0
  438. package/backend/src/test-refactor.js +85 -0
  439. package/frontend/dist/assets/index-B1serztM.js +11210 -0
  440. package/frontend/dist/assets/index-t6K1u4OV.css +32 -0
  441. package/frontend/dist/index.html +2 -2
  442. package/frontend/package-lock.json +9437 -0
  443. package/frontend/package.json +8 -5
  444. package/package.json +3 -2
  445. package/screen/console.png +0 -0
  446. package/screen/dashboard.png +0 -0
  447. package/screen/graph_collabe.png +0 -0
  448. package/screen/graph_live_debug.png +0 -0
  449. package/screen/management_command.png +0 -0
  450. package/screen/node_debug_trace.png +0 -0
  451. package/screen/plugin_обзор.png +0 -0
  452. package/screen/websocket.png +0 -0
  453. package/screen/настройки_отдельных_команд_кажду_команлду_можно_настраивать.png +0 -0
  454. package/screen/планировщик_можно_задавать_действия_по_времени.png +0 -0
  455. package/frontend/dist/assets/index-B9GedHEa.js +0 -8352
  456. package/frontend/dist/assets/index-zLiy9MDx.css +0 -1
  457. package/nul +0 -0
  458. /package/backend/src/core/nodes/{action_http_request.js → actions/http_request.js} +0 -0
  459. /package/backend/src/core/nodes/{array_add_element.js → arrays/add_element.js} +0 -0
  460. /package/backend/src/core/nodes/{array_contains.js → arrays/contains.js} +0 -0
  461. /package/backend/src/core/nodes/{array_find_index.js → arrays/find_index.js} +0 -0
  462. /package/backend/src/core/nodes/{array_get_by_index.js → arrays/get_by_index.js} +0 -0
  463. /package/backend/src/core/nodes/{array_get_random_element.js → arrays/get_random_element.js} +0 -0
  464. /package/backend/src/core/nodes/{array_remove_by_index.js → arrays/remove_by_index.js} +0 -0
  465. /package/backend/src/core/nodes/{bot_get_position.js → bot/get_position.js} +0 -0
  466. /package/backend/src/core/nodes/{data_array_literal.js → data/array_literal.js} +0 -0
  467. /package/backend/src/core/nodes/{data_boolean_literal.js → data/boolean_literal.js} +0 -0
  468. /package/backend/src/core/nodes/{data_get_argument.js → data/get_argument.js} +0 -0
  469. /package/backend/src/core/nodes/{data_get_bot_look.js → data/get_bot_look.js} +0 -0
  470. /package/backend/src/core/nodes/{data_get_entity_field.js → data/get_entity_field.js} +0 -0
  471. /package/backend/src/core/nodes/{data_get_server_players.js → data/get_server_players.js} +0 -0
  472. /package/backend/src/core/nodes/{data_get_variable.js → data/get_variable.js} +0 -0
  473. /package/backend/src/core/nodes/{data_length.js → data/length.js} +0 -0
  474. /package/backend/src/core/nodes/{data_make_object.js → data/make_object.js} +0 -0
  475. /package/backend/src/core/nodes/{data_number_literal.js → data/number_literal.js} +0 -0
  476. /package/backend/src/core/nodes/{data_string_literal.js → data/string_literal.js} +0 -0
  477. /package/backend/src/core/nodes/{logic_compare.js → logic/compare.js} +0 -0
  478. /package/backend/src/core/nodes/{logic_operation.js → logic/operation.js} +0 -0
  479. /package/backend/src/core/nodes/{math_operation.js → math/operation.js} +0 -0
  480. /package/backend/src/core/nodes/{math_random_number.js → math/random_number.js} +0 -0
  481. /package/backend/src/core/nodes/{object_create.js → objects/create.js} +0 -0
  482. /package/backend/src/core/nodes/{object_delete.js → objects/delete.js} +0 -0
  483. /package/backend/src/core/nodes/{object_get.js → objects/get.js} +0 -0
  484. /package/backend/src/core/nodes/{object_has_key.js → objects/has_key.js} +0 -0
  485. /package/backend/src/core/nodes/{object_set.js → objects/set.js} +0 -0
  486. /package/backend/src/core/nodes/{string_concat.js → strings/concat.js} +0 -0
  487. /package/backend/src/core/nodes/{string_contains.js → strings/contains.js} +0 -0
  488. /package/backend/src/core/nodes/{string_ends_with.js → strings/ends_with.js} +0 -0
  489. /package/backend/src/core/nodes/{string_equals.js → strings/equals.js} +0 -0
  490. /package/backend/src/core/nodes/{string_length.js → strings/length.js} +0 -0
  491. /package/backend/src/core/nodes/{string_matches.js → strings/matches.js} +0 -0
  492. /package/backend/src/core/nodes/{string_split.js → strings/split.js} +0 -0
@@ -0,0 +1,232 @@
1
+ # Disaster Recovery
2
+
3
+ Backup strategies, RTO/RPO definitions, failover procedures, disaster recovery testing, and multi-region architectures.
4
+
5
+ ## Table of Contents
6
+
7
+ - [RTO and RPO](#rto-and-rpo)
8
+ - [Backup Strategies](#backup-strategies)
9
+ - [Failover Procedures](#failover-procedures)
10
+ - [DR Testing](#dr-testing)
11
+ - [Multi-Region Architecture](#multi-region-architecture)
12
+
13
+ ## RTO and RPO
14
+
15
+ **Definitions:**
16
+ ```
17
+ RTO (Recovery Time Objective):
18
+ Maximum acceptable downtime
19
+ Example: 4 hours
20
+
21
+ RPO (Recovery Point Objective):
22
+ Maximum acceptable data loss
23
+ Example: 1 hour (last backup)
24
+ ```
25
+
26
+ **RTO/RPO Tiers:**
27
+ ```yaml
28
+ tier_1_critical:
29
+ rto: 1 hour
30
+ rpo: 15 minutes
31
+ cost: High
32
+ examples: [payment processing, critical APIs]
33
+
34
+ tier_2_important:
35
+ rto: 4 hours
36
+ rpo: 1 hour
37
+ cost: Medium
38
+ examples: [main application, databases]
39
+
40
+ tier_3_standard:
41
+ rto: 24 hours
42
+ rpo: 24 hours
43
+ cost: Low
44
+ examples: [internal tools, analytics]
45
+ ```
46
+
47
+ ## Backup Strategies
48
+
49
+ **3-2-1 Rule:**
50
+ ```
51
+ 3 copies of data
52
+ 2 different media types
53
+ 1 offsite backup
54
+ ```
55
+
56
+ **Database Backups:**
57
+ ```yaml
58
+ # PostgreSQL backup with WAL archiving
59
+ postgresql_backup:
60
+ full_backup:
61
+ frequency: daily
62
+ retention: 30 days
63
+ command: |
64
+ pg_basebackup -h localhost -D /backup/$(date +%Y%m%d) -Ft -z -Xs
65
+
66
+ wal_archiving:
67
+ enabled: true
68
+ archive_command: |
69
+ aws s3 cp %p s3://backups/wal/%f
70
+ restore_command: |
71
+ aws s3 cp s3://backups/wal/%f %p
72
+
73
+ point_in_time_recovery:
74
+ enabled: true
75
+ max_recovery_window: 7 days
76
+ ```
77
+
78
+ **Automated Backups (Velero for Kubernetes):**
79
+ ```yaml
80
+ apiVersion: velero.io/v1
81
+ kind: Schedule
82
+ metadata:
83
+ name: daily-backup
84
+ spec:
85
+ schedule: "0 1 * * *" # 1 AM daily
86
+ template:
87
+ includedNamespaces:
88
+ - production
89
+ includedResources:
90
+ - "*"
91
+ snapshotVolumes: true
92
+ ttl: 720h # 30 days
93
+ ```
94
+
95
+ ## Failover Procedures
96
+
97
+ **Database Failover:**
98
+ ```yaml
99
+ # Automated failover with Patroni
100
+ patroni:
101
+ name: postgres01
102
+ scope: postgres-cluster
103
+
104
+ bootstrap:
105
+ dcs:
106
+ ttl: 30
107
+ loop_wait: 10
108
+ retry_timeout: 10
109
+ maximum_lag_on_failover: 1048576
110
+
111
+ postgresql:
112
+ parameters:
113
+ max_connections: 100
114
+ shared_buffers: 256MB
115
+
116
+ # Failover process:
117
+ # 1. Patroni detects primary failure
118
+ # 2. Initiates leader election
119
+ # 3. Promotes replica to primary
120
+ # 4. Updates DNS/load balancer
121
+ # 5. Old primary rejoins as replica
122
+ ```
123
+
124
+ **Application Failover:**
125
+ ```yaml
126
+ # Multi-region with Route53 health checks
127
+ route53_failover:
128
+ primary:
129
+ region: us-east-1
130
+ health_check:
131
+ protocol: HTTPS
132
+ path: /health
133
+ interval: 30
134
+ failure_threshold: 3
135
+
136
+ secondary:
137
+ region: us-west-2
138
+ failover_mode: automatic
139
+ activate_when: primary_unhealthy
140
+ ```
141
+
142
+ ## DR Testing
143
+
144
+ **DR Drill Schedule:**
145
+ ```yaml
146
+ quarterly_dr_drill:
147
+ week_1:
148
+ - Review DR plan
149
+ - Update runbooks
150
+ - Verify backup integrity
151
+
152
+ week_2:
153
+ - Tabletop exercise
154
+ - Walk through procedures
155
+ - Identify gaps
156
+
157
+ week_3:
158
+ - Partial failover test
159
+ - Test database recovery
160
+ - Verify monitoring
161
+
162
+ week_4:
163
+ - Full DR drill
164
+ - Complete failover
165
+ - Document lessons learned
166
+ ```
167
+
168
+ **DR Test Checklist:**
169
+ ```markdown
170
+ - [ ] Backup restoration successful
171
+ - [ ] RTO met (< 4 hours)
172
+ - [ ] RPO met (< 1 hour data loss)
173
+ - [ ] All services operational
174
+ - [ ] Monitoring functional
175
+ - [ ] Logs accessible
176
+ - [ ] Team communication effective
177
+ - [ ] Runbooks accurate
178
+ - [ ] Action items documented
179
+ ```
180
+
181
+ ## Multi-Region Architecture
182
+
183
+ **Active-Passive:**
184
+ ```
185
+ Primary Region (Active) ─────┐
186
+ - Handles all traffic        │
187
+ - Database writes            │ Replication
188
+
189
+ Secondary Region (Passive) ◄─┘
190
+ - Standby for failover
191
+ - Read replicas only
192
+ - Activated manually
193
+ ```
194
+
195
+ **Active-Active:**
196
+ ```
197
+ Region 1 (Active) ◄─────► Region 2 (Active)
198
+ - 50% traffic             - 50% traffic
199
+ - Full read/write         - Full read/write
200
+ - Bi-directional sync     - Bi-directional sync
201
+ - Auto-failover           - Auto-failover
202
+ ```
203
+
204
+ **Implementation:**
205
+ ```yaml
206
+ # Kubernetes multi-region with Cilium Cluster Mesh
207
+ clusters:
208
+ us-east-1:
209
+ role: primary
210
+ services:
211
+ - api-service
212
+ - database (primary)
213
+ - cache
214
+
215
+ us-west-2:
216
+ role: secondary
217
+ services:
218
+ - api-service (read-only)
219
+ - database (replica)
220
+ - cache
221
+
222
+ failover:
223
+ automatic: true
224
+ health_check_interval: 30s
225
+ failover_threshold: 3
226
+ ```
227
+
228
+ ---
229
+
230
+ **Related Resources:**
231
+ - [incident-management.md](incident-management.md)
232
+ - [reliability-patterns.md](reliability-patterns.md)
@@ -0,0 +1,436 @@
1
+ # Incident Management
2
+
3
+ Incident response procedures, severity levels, escalation paths, communication protocols, postmortems, and on-call processes.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Incident Lifecycle](#incident-lifecycle)
8
+ - [Severity Levels](#severity-levels)
9
+ - [Roles and Responsibilities](#roles-and-responsibilities)
10
+ - [Response Procedures](#response-procedures)
11
+ - [Communication](#communication)
12
+ - [Postmortems](#postmortems)
13
+ - [Best Practices](#best-practices)
14
+
15
+ ## Incident Lifecycle
16
+
17
+ ```
18
+ Detect  →  Respond  →  Mitigate        →  Resolve  →  Learn
19
+   ↓           ↓           ↓                  ↓          ↓
20
+ Alert      Triage      Fix/Workaround     Root        Postmortem
21
+                                           Cause
22
+ ```
23
+
24
+ ## Severity Levels
25
+
26
+ ```yaml
27
+ SEV1 - Critical:
28
+ impact: Complete service outage or data loss
29
+ response_time: Immediate (< 15 minutes)
30
+ escalation: Page on-call + management
31
+ examples:
32
+ - Production database down
33
+ - Payment processing failed
34
+ - Security breach
35
+ - Data loss
36
+
37
+ SEV2 - High:
38
+ impact: Major functionality impaired
39
+ response_time: 30 minutes
40
+ escalation: Page on-call
41
+ examples:
42
+ - API latency severely degraded
43
+ - Single region outage
44
+ - Critical feature unavailable
45
+
46
+ SEV3 - Medium:
47
+ impact: Minor functionality impaired
48
+ response_time: 2 hours
49
+ escalation: Notify on-call via Slack
50
+ examples:
51
+ - Non-critical feature degraded
52
+ - Elevated error rates
53
+ - Performance slowdown
54
+
55
+ SEV4 - Low:
56
+ impact: Minimal user impact
57
+ response_time: Next business day
58
+ escalation: Create ticket
59
+ examples:
60
+ - UI cosmetic issues
61
+ - Internal tool problems
62
+ - Low-priority bugs
63
+ ```
64
+
65
+ ## Roles and Responsibilities
66
+
67
+ **Incident Commander:**
68
+ - Owns the incident response
69
+ - Makes decisions
70
+ - Coordinates team
71
+ - Manages communication
72
+
73
+ **Technical Lead:**
74
+ - Diagnoses root cause
75
+ - Implements fixes
76
+ - Validates resolution
77
+
78
+ **Communications Lead:**
79
+ - Updates status page
80
+ - Notifies stakeholders
81
+ - Manages customer communication
82
+
83
+ **Scribe:**
84
+ - Documents timeline
85
+ - Records decisions
86
+ - Tracks action items
87
+
88
+ ## Response Procedures
89
+
90
+ ### Detection and Triage
91
+
92
+ ```yaml
93
+ # incident-response.yaml
94
+ 1_detection:
95
+ - Alert triggers
96
+ - Customer report
97
+ - Monitoring system
98
+ - Team member notice
99
+
100
+ 2_initial_triage:
101
+ - Assess severity
102
+ - Determine impact
103
+ - Page appropriate team
104
+ - Create incident channel
105
+
106
+ 3_form_response_team:
107
+ - Incident Commander
108
+ - Technical Lead(s)
109
+ - Communications Lead
110
+ - Subject Matter Experts
111
+ ```
112
+
113
+ ### Incident Command Structure
114
+
115
+ ```bash
116
+ # Create incident Slack channel
117
+ /incident create SEV1 "API Gateway Down"
118
+
119
+ # Auto-creates:
120
+ # - #incident-2024-001
121
+ # - Zoom bridge
122
+ # - Status page placeholder
123
+ # - Timeline doc
124
+ ```
125
+
126
+ ### Response Playbooks
127
+
128
+ **Database Outage:**
129
+ ```yaml
130
+ playbook: database-outage
131
+ severity: SEV1
132
+
133
+ steps:
134
+ 1_immediate:
135
+ - Check database health metrics
136
+ - Verify connectivity
137
+ - Check for locks/blocking queries
138
+ - Review recent changes
139
+
140
+ 2_diagnosis:
141
+ - Check replication lag
142
+ - Review error logs
143
+ - Verify disk space
144
+ - Check connection pool
145
+
146
+ 3_mitigation:
147
+ - Failover to replica
148
+ - Kill blocking queries
149
+ - Restart if necessary
150
+ - Scale resources
151
+
152
+ 4_communication:
153
+ - Update status page
154
+ - Notify customers
155
+ - Inform stakeholders
156
+ ```
157
+
158
+ **API Latency Degradation:**
159
+ ```yaml
160
+ playbook: api-latency
161
+ severity: SEV2
162
+
163
+ steps:
164
+ 1_gather_data:
165
+ - Check p95/p99 latency
166
+ - Review error rates
167
+ - Examine slow query logs
168
+ - Check downstream services
169
+
170
+ 2_common_causes:
171
+ - Database slow queries
172
+ - Increased traffic
173
+ - Downstream dependency
174
+ - Resource exhaustion
175
+ - Code deployment
176
+
177
+ 3_quick_fixes:
178
+ - Scale up instances
179
+ - Enable/adjust caching
180
+ - Rate limit traffic
181
+ - Rollback deployment
182
+ ```
183
+
184
+ ## Communication
185
+
186
+ ### Status Page Updates
187
+
188
+ ```yaml
189
+ # Incident timeline
190
+ 14:05: Investigating - We're investigating reports of API errors
191
+ 14:15: Identified - Database connection issues identified
192
+ 14:30: Monitoring - Failover completed, monitoring recovery
193
+ 15:00: Resolved - All services restored, investigating root cause
194
+ ```
195
+
196
+ ### Customer Communication Template
197
+
198
+ ```markdown
199
+ Subject: [RESOLVED] API Service Disruption - Jan 15, 2024
200
+
201
+ Dear Customers,
202
+
203
+ SUMMARY:
204
+ Between 14:05 and 15:00 UTC today, our API service experienced elevated
205
+ error rates affecting approximately 10% of requests.
206
+
207
+ IMPACT:
208
+ - API errors for 10% of requests
209
+ - Average latency increased from 200ms to 2s
210
+ - No data loss occurred
211
+
212
+ ROOT CAUSE:
213
+ Database connection pool exhaustion due to traffic spike
214
+
215
+ RESOLUTION:
216
+ - Scaled database connection pools
217
+ - Implemented better connection management
218
+ - Added auto-scaling triggers
219
+
220
+ PREVENTION:
221
+ - Enhanced monitoring and alerting
222
+ - Implemented circuit breakers
223
+ - Scheduled capacity review
224
+
225
+ We apologize for the disruption. Please contact support@example.com
226
+ with any questions.
227
+
228
+ Status page: https://status.example.com/incidents/2024-001
229
+ ```
230
+
231
+ ### Internal Communication
232
+
233
+ ```markdown
234
+ # Incident Slack Update Template
235
+ :rotating_light: **SEV1 INCIDENT** :rotating_light:
236
+
237
+ **Status:** Investigating
238
+ **Impact:** API returning 500 errors
239
+ **Started:** 14:05 UTC
240
+ **Incident Commander:** @alice
241
+ **Bridge:** https://zoom.us/j/123456789
242
+ **Channel:** #incident-2024-001
243
+
244
+ **Timeline:**
245
+ 14:05 - Alert triggered for high error rate
246
+ 14:07 - Incident declared SEV1
247
+ 14:10 - Team assembled
248
+ 14:15 - Root cause identified
249
+
250
+ **Next Update:** 14:30 UTC or sooner if status changes
251
+ ```
252
+
253
+ ## Postmortems
254
+
255
+ ### Blameless Postmortem Template
256
+
257
+ ```markdown
258
+ # Postmortem: API Outage - January 15, 2024
259
+
260
+ ## Incident Summary
261
+ **Date:** 2024-01-15
262
+ **Duration:** 55 minutes (14:05 - 15:00 UTC)
263
+ **Severity:** SEV1
264
+ **Impact:** 10% error rate, 500k affected requests
265
+ **Root Cause:** Database connection pool exhaustion
266
+
267
+ ## Timeline (UTC)
268
+ - 14:00: Traffic begins increasing (2x normal)
269
+ - 14:05: Alert: High API error rate
270
+ - 14:07: Incident declared SEV1
271
+ - 14:10: Incident team assembled
272
+ - 14:15: Root cause identified (connection pool exhausted)
273
+ - 14:20: Mitigation started (scale connection pool)
274
+ - 14:30: Error rates declining
275
+ - 14:45: Monitoring recovery
276
+ - 15:00: Incident resolved
277
+
278
+ ## Root Cause Analysis
279
+
280
+ ### What Happened
281
+ A marketing campaign drove 2x normal traffic. Our database connection
282
+ pool had a static size of 100 connections, which was exhausted. API
283
+ servers could not acquire database connections, resulting in errors.
284
+
285
+ ### Why It Happened
286
+ 1. No auto-scaling for database connection pools
287
+ 2. Connection pool not sized for peak traffic
288
+ 3. No circuit breaker to fail fast
289
+ 4. Insufficient load testing
290
+
291
+ ### Contributing Factors
292
+ - Marketing campaign not coordinated with engineering
293
+ - Connection pool metrics not monitored
294
+ - No alerts on connection pool saturation
295
+
296
+ ## Impact
297
+ - 500,000 failed API requests
298
+ - 10% error rate for 55 minutes
299
+ - Estimated revenue impact: $5,000
300
+ - Customer complaints: 50
301
+
302
+ ## What Went Well
303
+ - Fast detection (< 5 minutes)
304
+ - Clear escalation path
305
+ - Good team communication
306
+ - Status page updated regularly
307
+ - Fix deployed quickly
308
+
309
+ ## What Went Wrong
310
+ - No advance warning of traffic spike
311
+ - Connection pool not monitored
312
+ - Manual scaling required
313
+ - Customer notification delayed 10 minutes
314
+
315
+ ## Action Items
316
+
317
+ ### Immediate (This Week)
318
+ - [ ] Implement connection pool auto-scaling (@alice, 2024-01-17)
319
+ - [ ] Add connection pool metrics to dashboards (@bob, 2024-01-18)
320
+ - [ ] Create alerts for pool saturation (@charlie, 2024-01-19)
321
+
322
+ ### Short-term (This Month)
323
+ - [ ] Implement circuit breakers (@alice, 2024-01-25)
324
+ - [ ] Load test at 3x normal traffic (@bob, 2024-01-30)
325
+ - [ ] Create runbook for connection issues (@charlie, 2024-01-30)
326
+
327
+ ### Long-term (This Quarter)
328
+ - [ ] Improve engineering/marketing coordination (@dave, 2024-03-31)
329
+ - [ ] Implement capacity planning process (@eve, 2024-03-31)
330
+ - [ ] Auto-notification system for incidents (@frank, 2024-03-31)
331
+
332
+ ## Lessons Learned
333
+ 1. Static resource limits are a failure point
334
+ 2. Cross-team coordination essential for major campaigns
335
+ 3. Observability gaps can hide brewing problems
336
+ 4. Circuit breakers prevent cascading failures
337
+
338
+ ## Related Incidents
339
+ - INC-2023-089: Similar connection pool issue (resolved)
340
+ - INC-2023-112: Traffic spike from campaign (different cause)
341
+
342
+ ## Appendix
343
+ - [Grafana Dashboard](https://grafana.example.com/d/incident-2024-001)
344
+ - [Logs](https://logs.example.com/incident-2024-001)
345
+ - [Slack Channel](https://slack.com/archives/incident-2024-001)
346
+ ```
347
+
348
+ ### Postmortem Review Process
349
+
350
+ ```yaml
351
+ postmortem_process:
352
+ 1_draft:
353
+ owner: Incident Commander
354
+ deadline: 2 business days
355
+ content:
356
+ - Timeline
357
+ - Root cause
358
+ - Impact
359
+ - Action items
360
+
361
+ 2_review:
362
+ participants:
363
+ - Incident team
364
+ - Engineering leadership
365
+ - Related teams
366
+ format: Meeting (30-60 min)
367
+ goals:
368
+ - Validate accuracy
369
+ - Identify additional learnings
370
+ - Prioritize action items
371
+
372
+ 3_publish:
373
+ distribution:
374
+ - All engineering
375
+ - Product team
376
+ - Customer support
377
+ - Public (if appropriate)
378
+
379
+ 4_followup:
380
+ - Track action items in project management tool
381
+ - Review progress in weekly meetings
382
+ - Update on completion
383
+ ```
384
+
385
+ ## Best Practices
386
+
387
+ ### 1. Blameless Culture
388
+
389
+ ```
390
+ Focus on systems and processes, not individuals
391
+ Ask "how" and "why", not "who"
392
+ Encourage sharing mistakes openly
393
+ ```
394
+
395
+ ### 2. Clear Severity Definitions
396
+
397
+ ```yaml
398
+ # Document and communicate
399
+ # Train team on criteria
400
+ # Review severity in retrospective
401
+ ```
402
+
403
+ ### 3. Designated Roles
404
+
405
+ ```
406
+ Never have incident response without clear roles
407
+ Incident Commander makes all decisions
408
+ Scribe documents everything
409
+ ```
410
+
411
+ ### 4. Practice Incidents
412
+
413
+ ```yaml
414
+ # Run incident simulations quarterly
415
+ chaos_engineering:
416
+ - Simulate database failure
417
+ - Test failover procedures
418
+ - Practice communication
419
+ - Time the response
420
+ ```
421
+
422
+ ### 5. Action Item Follow-Through
423
+
424
+ ```
425
+ Assign owners and deadlines
426
+ Track in project management
427
+ Report progress weekly
428
+ Revisit outstanding items at each postmortem review
429
+ ```
430
+
431
+ ---
432
+
433
+ **Related Resources:**
434
+ - [on-call-runbooks.md](on-call-runbooks.md) - Diagnostic procedures
435
+ - [observability-stack.md](observability-stack.md) - Monitoring and detection
436
+ - [alerting-best-practices.md](alerting-best-practices.md) - Alert configuration