blockmine 1.21.0 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (433)
  1. package/.claude/agents/README.md +469 -0
  2. package/.claude/agents/auth-route-debugger.md +118 -0
  3. package/.claude/agents/auth-route-tester.md +93 -0
  4. package/.claude/agents/auto-error-resolver.md +97 -0
  5. package/.claude/agents/build-optimizer.md +236 -0
  6. package/.claude/agents/code-architecture-reviewer.md +83 -0
  7. package/.claude/agents/code-refactor-master.md +94 -0
  8. package/.claude/agents/cost-optimizer.md +134 -0
  9. package/.claude/agents/deployment-orchestrator.md +113 -0
  10. package/.claude/agents/documentation-architect.md +82 -0
  11. package/.claude/agents/frontend-error-fixer.md +77 -0
  12. package/.claude/agents/iac-code-generator.md +71 -0
  13. package/.claude/agents/incident-responder.md +346 -0
  14. package/.claude/agents/infrastructure-architect.md +31 -0
  15. package/.claude/agents/kubernetes-specialist.md +56 -0
  16. package/.claude/agents/migration-planner.md +181 -0
  17. package/.claude/agents/network-architect.md +196 -0
  18. package/.claude/agents/plan-reviewer.md +52 -0
  19. package/.claude/agents/refactor-planner.md +63 -0
  20. package/.claude/agents/security-scanner.md +102 -0
  21. package/.claude/agents/web-research-specialist.md +78 -0
  22. package/.claude/commands/cost-analysis.md +315 -0
  23. package/.claude/commands/dev-docs-update.md +55 -0
  24. package/.claude/commands/dev-docs.md +51 -0
  25. package/.claude/commands/incident-debug.md +247 -0
  26. package/.claude/commands/infra-plan.md +81 -0
  27. package/.claude/commands/migration-plan.md +478 -0
  28. package/.claude/commands/route-research-for-testing.md +37 -0
  29. package/.claude/commands/security-review.md +66 -0
  30. package/.claude/hooks/CONFIG.md +448 -0
  31. package/.claude/hooks/README.md +163 -0
  32. package/.claude/hooks/SKILL_ACTIVATION_COMPLETE.md +226 -0
  33. package/.claude/hooks/WINDOWS_HOOKS_README.md +151 -0
  34. package/.claude/hooks/add-skill-activation-banners.ts +132 -0
  35. package/.claude/hooks/comprehensive-skill-test.ts +1315 -0
  36. package/.claude/hooks/error-handling-reminder.sh +12 -0
  37. package/.claude/hooks/error-handling-reminder.ts +222 -0
  38. package/.claude/hooks/k8s-manifest-validator.sh +56 -0
  39. package/.claude/hooks/package-lock.json +556 -0
  40. package/.claude/hooks/package.json +16 -0
  41. package/.claude/hooks/post-tool-use-tracker.ps1 +174 -0
  42. package/.claude/hooks/post-tool-use-tracker.sh +183 -0
  43. package/.claude/hooks/security-policy-check.sh +247 -0
  44. package/.claude/hooks/skill-activation-prompt.ps1 +10 -0
  45. package/.claude/hooks/skill-activation-prompt.sh +10 -0
  46. package/.claude/hooks/skill-activation-prompt.ts +141 -0
  47. package/.claude/hooks/stop-build-check-enhanced.sh +130 -0
  48. package/.claude/hooks/terraform-validator.sh +53 -0
  49. package/.claude/hooks/test-input.json +7 -0
  50. package/.claude/hooks/test-skill-activation.ts +427 -0
  51. package/.claude/hooks/trigger-build-resolver.sh +79 -0
  52. package/.claude/hooks/tsc-check.sh +173 -0
  53. package/.claude/hooks/tsconfig.json +19 -0
  54. package/.claude/settings.json +55 -0
  55. package/.claude/settings.local.json +27 -14
  56. package/.claude/skills/README.md +507 -0
  57. package/.claude/skills/api-engineering/SKILL.md +63 -0
  58. package/.claude/skills/api-engineering/resources/api-versioning.md +88 -0
  59. package/.claude/skills/api-engineering/resources/graphql-patterns.md +106 -0
  60. package/.claude/skills/api-engineering/resources/rate-limiting.md +118 -0
  61. package/.claude/skills/api-engineering/resources/rest-api-design.md +105 -0
  62. package/.claude/skills/backend-dev-guidelines/SKILL.md +306 -0
  63. package/.claude/skills/backend-dev-guidelines/resources/architecture-overview.md +451 -0
  64. package/.claude/skills/backend-dev-guidelines/resources/async-and-errors.md +307 -0
  65. package/.claude/skills/backend-dev-guidelines/resources/complete-examples.md +638 -0
  66. package/.claude/skills/backend-dev-guidelines/resources/configuration.md +275 -0
  67. package/.claude/skills/backend-dev-guidelines/resources/database-patterns.md +224 -0
  68. package/.claude/skills/backend-dev-guidelines/resources/middleware-guide.md +213 -0
  69. package/.claude/skills/backend-dev-guidelines/resources/routing-and-controllers.md +756 -0
  70. package/.claude/skills/backend-dev-guidelines/resources/sentry-and-monitoring.md +336 -0
  71. package/.claude/skills/backend-dev-guidelines/resources/services-and-repositories.md +789 -0
  72. package/.claude/skills/backend-dev-guidelines/resources/testing-guide.md +235 -0
  73. package/.claude/skills/backend-dev-guidelines/resources/validation-patterns.md +754 -0
  74. package/.claude/skills/budget-and-cost-management/SKILL.md +850 -0
  75. package/.claude/skills/build-engineering/SKILL.md +431 -0
  76. package/.claude/skills/build-engineering/resources/artifact-repositories.md +72 -0
  77. package/.claude/skills/build-engineering/resources/build-caching.md +96 -0
  78. package/.claude/skills/build-engineering/resources/build-pipelines.md +105 -0
  79. package/.claude/skills/build-engineering/resources/build-security.md +95 -0
  80. package/.claude/skills/build-engineering/resources/build-systems.md +389 -0
  81. package/.claude/skills/build-engineering/resources/compilation-optimization.md +201 -0
  82. package/.claude/skills/build-engineering/resources/dependency-management.md +73 -0
  83. package/.claude/skills/build-engineering/resources/monorepo-builds.md +110 -0
  84. package/.claude/skills/build-engineering/resources/performance-optimization.md +113 -0
  85. package/.claude/skills/build-engineering/resources/reproducible-builds.md +82 -0
  86. package/.claude/skills/cloud-engineering/SKILL.md +675 -0
  87. package/.claude/skills/cloud-engineering/resources/aws-patterns.md +742 -0
  88. package/.claude/skills/cloud-engineering/resources/azure-patterns.md +714 -0
  89. package/.claude/skills/cloud-engineering/resources/cleared-cloud-environments.md +987 -0
  90. package/.claude/skills/cloud-engineering/resources/cloud-cost-optimization.md +757 -0
  91. package/.claude/skills/cloud-engineering/resources/cloud-networking.md +1058 -0
  92. package/.claude/skills/cloud-engineering/resources/cloud-security-tools.md +1530 -0
  93. package/.claude/skills/cloud-engineering/resources/cloud-security.md +990 -0
  94. package/.claude/skills/cloud-engineering/resources/gcp-patterns.md +758 -0
  95. package/.claude/skills/cloud-engineering/resources/migration-strategies.md +820 -0
  96. package/.claude/skills/cloud-engineering/resources/multi-cloud-strategies.md +670 -0
  97. package/.claude/skills/cloud-engineering/resources/oci-patterns.md +1198 -0
  98. package/.claude/skills/cloud-engineering/resources/serverless-patterns.md +795 -0
  99. package/.claude/skills/cloud-engineering/resources/well-architected-frameworks.md +966 -0
  100. package/.claude/skills/cybersecurity/SKILL.md +409 -0
  101. package/.claude/skills/cybersecurity/resources/security-architecture.md +266 -0
  102. package/.claude/skills/database-engineering/SKILL.md +61 -0
  103. package/.claude/skills/database-engineering/resources/backup-and-recovery.md +72 -0
  104. package/.claude/skills/database-engineering/resources/database-replication.md +63 -0
  105. package/.claude/skills/database-engineering/resources/postgresql-fundamentals.md +70 -0
  106. package/.claude/skills/database-engineering/resources/query-optimization.md +68 -0
  107. package/.claude/skills/devsecops/SKILL.md +374 -0
  108. package/.claude/skills/devsecops/resources/ci-cd-security.md +204 -0
  109. package/.claude/skills/devsecops/resources/compliance-automation.md +530 -0
  110. package/.claude/skills/devsecops/resources/compliance-frameworks.md +2322 -0
  111. package/.claude/skills/devsecops/resources/container-security.md +915 -0
  112. package/.claude/skills/devsecops/resources/cspm-integration.md +1440 -0
  113. package/.claude/skills/devsecops/resources/policy-enforcement.md +619 -0
  114. package/.claude/skills/devsecops/resources/secrets-management.md +755 -0
  115. package/.claude/skills/devsecops/resources/security-monitoring.md +146 -0
  116. package/.claude/skills/devsecops/resources/security-scanning.md +887 -0
  117. package/.claude/skills/devsecops/resources/security-testing.md +203 -0
  118. package/.claude/skills/devsecops/resources/supply-chain-security.md +518 -0
  119. package/.claude/skills/devsecops/resources/vulnerability-management.md +481 -0
  120. package/.claude/skills/devsecops/resources/zero-trust-architecture.md +177 -0
  121. package/.claude/skills/documentation-as-code/SKILL.md +323 -0
  122. package/.claude/skills/documentation-as-code/resources/api-documentation.md +90 -0
  123. package/.claude/skills/documentation-as-code/resources/changelog-management.md +79 -0
  124. package/.claude/skills/documentation-as-code/resources/diagram-generation.md +44 -0
  125. package/.claude/skills/documentation-as-code/resources/docs-as-code-workflow.md +99 -0
  126. package/.claude/skills/documentation-as-code/resources/documentation-automation.md +68 -0
  127. package/.claude/skills/documentation-as-code/resources/documentation-sites.md +79 -0
  128. package/.claude/skills/documentation-as-code/resources/markdown-best-practices.md +162 -0
  129. package/.claude/skills/documentation-as-code/resources/openapi-specification.md +77 -0
  130. package/.claude/skills/documentation-as-code/resources/readme-engineering.md +60 -0
  131. package/.claude/skills/documentation-as-code/resources/technical-writing-guide.md +202 -0
  132. package/.claude/skills/engineering-management/SKILL.md +356 -0
  133. package/.claude/skills/engineering-management/resources/career-ladders.md +609 -0
  134. package/.claude/skills/engineering-management/resources/hiring-and-assessment.md +555 -0
  135. package/.claude/skills/engineering-management/resources/one-on-one-guides.md +609 -0
  136. package/.claude/skills/engineering-management/resources/resource-planning.md +557 -0
  137. package/.claude/skills/engineering-management/resources/team-organization-patterns.md +491 -0
  138. package/.claude/skills/engineering-management/resources/technical-interviews.md +474 -0
  139. package/.claude/skills/engineering-operations-management/SKILL.md +817 -0
  140. package/.claude/skills/error-tracking/SKILL.md +379 -0
  141. package/.claude/skills/frontend-dev-guidelines/SKILL.md +403 -0
  142. package/.claude/skills/frontend-dev-guidelines/resources/common-patterns.md +331 -0
  143. package/.claude/skills/frontend-dev-guidelines/resources/complete-examples.md +872 -0
  144. package/.claude/skills/frontend-dev-guidelines/resources/component-patterns.md +502 -0
  145. package/.claude/skills/frontend-dev-guidelines/resources/data-fetching.md +767 -0
  146. package/.claude/skills/frontend-dev-guidelines/resources/file-organization.md +502 -0
  147. package/.claude/skills/frontend-dev-guidelines/resources/loading-and-error-states.md +501 -0
  148. package/.claude/skills/frontend-dev-guidelines/resources/performance.md +406 -0
  149. package/.claude/skills/frontend-dev-guidelines/resources/routing-guide.md +364 -0
  150. package/.claude/skills/frontend-dev-guidelines/resources/styling-guide.md +428 -0
  151. package/.claude/skills/frontend-dev-guidelines/resources/typescript-standards.md +418 -0
  152. package/.claude/skills/general-it-engineering/SKILL.md +393 -0
  153. package/.claude/skills/general-it-engineering/resources/asset-management.md +712 -0
  154. package/.claude/skills/general-it-engineering/resources/automation-orchestration.md +817 -0
  155. package/.claude/skills/general-it-engineering/resources/business-continuity.md +786 -0
  156. package/.claude/skills/general-it-engineering/resources/change-management.md +715 -0
  157. package/.claude/skills/general-it-engineering/resources/enterprise-monitoring.md +729 -0
  158. package/.claude/skills/general-it-engineering/resources/help-desk-operations.md +738 -0
  159. package/.claude/skills/general-it-engineering/resources/incident-service-management.md +834 -0
  160. package/.claude/skills/general-it-engineering/resources/it-governance.md +753 -0
  161. package/.claude/skills/general-it-engineering/resources/itil-framework.md +503 -0
  162. package/.claude/skills/general-it-engineering/resources/service-management.md +669 -0
  163. package/.claude/skills/infrastructure-architecture/SKILL.md +328 -0
  164. package/.claude/skills/infrastructure-architecture/resources/architecture-decision-records.md +505 -0
  165. package/.claude/skills/infrastructure-architecture/resources/architecture-patterns.md +528 -0
  166. package/.claude/skills/infrastructure-architecture/resources/capacity-planning.md +453 -0
  167. package/.claude/skills/infrastructure-architecture/resources/cleared-environment-architecture.md +773 -0
  168. package/.claude/skills/infrastructure-architecture/resources/cost-architecture.md +499 -0
  169. package/.claude/skills/infrastructure-architecture/resources/data-architecture.md +501 -0
  170. package/.claude/skills/infrastructure-architecture/resources/disaster-recovery.md +535 -0
  171. package/.claude/skills/infrastructure-architecture/resources/migration-architecture.md +512 -0
  172. package/.claude/skills/infrastructure-architecture/resources/multi-region-design.md +608 -0
  173. package/.claude/skills/infrastructure-architecture/resources/reference-architectures.md +562 -0
  174. package/.claude/skills/infrastructure-architecture/resources/security-architecture.md +538 -0
  175. package/.claude/skills/infrastructure-architecture/resources/system-design-principles.md +489 -0
  176. package/.claude/skills/infrastructure-architecture/resources/workload-classification.md +1000 -0
  177. package/.claude/skills/infrastructure-strategy/SKILL.md +924 -0
  178. package/.claude/skills/network-engineering/SKILL.md +385 -0
  179. package/.claude/skills/network-engineering/resources/dns-management.md +738 -0
  180. package/.claude/skills/network-engineering/resources/load-balancing.md +820 -0
  181. package/.claude/skills/network-engineering/resources/network-architecture.md +546 -0
  182. package/.claude/skills/network-engineering/resources/network-security.md +921 -0
  183. package/.claude/skills/network-engineering/resources/network-troubleshooting.md +749 -0
  184. package/.claude/skills/network-engineering/resources/routing-switching.md +373 -0
  185. package/.claude/skills/network-engineering/resources/sdn-networking.md +695 -0
  186. package/.claude/skills/network-engineering/resources/service-mesh-networking.md +777 -0
  187. package/.claude/skills/network-engineering/resources/tcp-ip-protocols.md +444 -0
  188. package/.claude/skills/network-engineering/resources/vpn-connectivity.md +672 -0
  189. package/.claude/skills/observability-engineering/SKILL.md +101 -0
  190. package/.claude/skills/observability-engineering/resources/apm-tools.md +97 -0
  191. package/.claude/skills/observability-engineering/resources/correlation-strategies.md +87 -0
  192. package/.claude/skills/observability-engineering/resources/distributed-tracing.md +98 -0
  193. package/.claude/skills/observability-engineering/resources/logs-aggregation.md +118 -0
  194. package/.claude/skills/observability-engineering/resources/observability-cost-optimization.md +141 -0
  195. package/.claude/skills/observability-engineering/resources/opentelemetry.md +110 -0
  196. package/.claude/skills/platform-engineering/SKILL.md +555 -0
  197. package/.claude/skills/platform-engineering/resources/architecture-overview.md +600 -0
  198. package/.claude/skills/platform-engineering/resources/container-orchestration.md +916 -0
  199. package/.claude/skills/platform-engineering/resources/cost-optimization.md +634 -0
  200. package/.claude/skills/platform-engineering/resources/developer-platforms.md +670 -0
  201. package/.claude/skills/platform-engineering/resources/gitops-automation.md +650 -0
  202. package/.claude/skills/platform-engineering/resources/infrastructure-as-code.md +778 -0
  203. package/.claude/skills/platform-engineering/resources/infrastructure-standards.md +708 -0
  204. package/.claude/skills/platform-engineering/resources/multi-tenancy.md +602 -0
  205. package/.claude/skills/platform-engineering/resources/platform-security.md +711 -0
  206. package/.claude/skills/platform-engineering/resources/resource-management.md +592 -0
  207. package/.claude/skills/platform-engineering/resources/service-mesh.md +628 -0
  208. package/.claude/skills/release-engineering/SKILL.md +393 -0
  209. package/.claude/skills/release-engineering/resources/artifact-management.md +108 -0
  210. package/.claude/skills/release-engineering/resources/build-optimization.md +84 -0
  211. package/.claude/skills/release-engineering/resources/ci-cd-pipelines.md +411 -0
  212. package/.claude/skills/release-engineering/resources/deployment-strategies.md +197 -0
  213. package/.claude/skills/release-engineering/resources/pipeline-security.md +62 -0
  214. package/.claude/skills/release-engineering/resources/progressive-delivery.md +83 -0
  215. package/.claude/skills/release-engineering/resources/release-automation.md +68 -0
  216. package/.claude/skills/release-engineering/resources/release-orchestration.md +77 -0
  217. package/.claude/skills/release-engineering/resources/rollback-strategies.md +66 -0
  218. package/.claude/skills/release-engineering/resources/versioning-strategies.md +59 -0
  219. package/.claude/skills/route-tester/SKILL.md +392 -0
  220. package/.claude/skills/skill-developer/ADVANCED.md +197 -0
  221. package/.claude/skills/skill-developer/HOOK_MECHANISMS.md +306 -0
  222. package/.claude/skills/skill-developer/PATTERNS_LIBRARY.md +152 -0
  223. package/.claude/skills/skill-developer/SKILL.md +430 -0
  224. package/.claude/skills/skill-developer/SKILL_RULES_REFERENCE.md +315 -0
  225. package/.claude/skills/skill-developer/TRIGGER_TYPES.md +305 -0
  226. package/.claude/skills/skill-developer/TROUBLESHOOTING.md +514 -0
  227. package/.claude/skills/skill-rules.json +2940 -0
  228. package/.claude/skills/sre/SKILL.md +464 -0
  229. package/.claude/skills/sre/resources/alerting-best-practices.md +282 -0
  230. package/.claude/skills/sre/resources/capacity-planning.md +226 -0
  231. package/.claude/skills/sre/resources/chaos-engineering.md +193 -0
  232. package/.claude/skills/sre/resources/disaster-recovery.md +232 -0
  233. package/.claude/skills/sre/resources/incident-management.md +436 -0
  234. package/.claude/skills/sre/resources/observability-stack.md +240 -0
  235. package/.claude/skills/sre/resources/on-call-runbooks.md +167 -0
  236. package/.claude/skills/sre/resources/performance-optimization.md +108 -0
  237. package/.claude/skills/sre/resources/reliability-patterns.md +183 -0
  238. package/.claude/skills/sre/resources/slo-sli-sla.md +464 -0
  239. package/.claude/skills/sre/resources/toil-reduction.md +145 -0
  240. package/.claude/skills/systems-engineering/SKILL.md +648 -0
  241. package/.claude/skills/systems-engineering/resources/automation-patterns.md +771 -0
  242. package/.claude/skills/systems-engineering/resources/configuration-management.md +998 -0
  243. package/.claude/skills/systems-engineering/resources/linux-administration.md +672 -0
  244. package/.claude/skills/systems-engineering/resources/networking-fundamentals.md +982 -0
  245. package/.claude/skills/systems-engineering/resources/performance-tuning.md +871 -0
  246. package/.claude/skills/systems-engineering/resources/powershell-scripting.md +482 -0
  247. package/.claude/skills/systems-engineering/resources/security-hardening.md +739 -0
  248. package/.claude/skills/systems-engineering/resources/shell-scripting.md +915 -0
  249. package/.claude/skills/systems-engineering/resources/storage-management.md +628 -0
  250. package/.claude/skills/systems-engineering/resources/system-monitoring.md +787 -0
  251. package/.claude/skills/systems-engineering/resources/troubleshooting-guide.md +753 -0
  252. package/.claude/skills/systems-engineering/resources/windows-administration.md +738 -0
  253. package/.claude/skills/technical-leadership/SKILL.md +728 -0
  254. package/CHANGELOG.md +90 -54
  255. package/README.md +94 -0
  256. package/backend/docs/SECRETS_DOCUMENTATION.md +327 -0
  257. package/backend/jest.config.js +59 -0
  258. package/backend/package-lock.json +6129 -0
  259. package/backend/package.json +16 -4
  260. package/backend/prisma/migrations/20251026104609_add_websocket_api/migration.sql +33 -0
  261. package/backend/prisma/schema.prisma +33 -0
  262. package/backend/src/__tests__/core/DependencyService.test.js +336 -0
  263. package/backend/src/__tests__/core/UserService.test.js +875 -0
  264. package/backend/src/__tests__/repositories/BaseRepository.test.js +146 -0
  265. package/backend/src/__tests__/repositories/BotRepository.test.js +118 -0
  266. package/backend/src/__tests__/repositories/CommandRepository.test.js +132 -0
  267. package/backend/src/__tests__/repositories/EventGraphRepository.test.js +93 -0
  268. package/backend/src/__tests__/repositories/GroupRepository.test.js +155 -0
  269. package/backend/src/__tests__/repositories/PermissionRepository.test.js +130 -0
  270. package/backend/src/__tests__/repositories/PluginRepository.test.js +107 -0
  271. package/backend/src/__tests__/repositories/ServerRepository.test.js +80 -0
  272. package/backend/src/__tests__/repositories/UserRepository.test.js +128 -0
  273. package/backend/src/__tests__/secretsFilter.test.js +425 -0
  274. package/backend/src/__tests__/services/BotLifecycleService.test.js +411 -0
  275. package/backend/src/__tests__/services/BotProcessManager.test.js +285 -0
  276. package/backend/src/__tests__/services/CacheManager.test.js +125 -0
  277. package/backend/src/__tests__/services/CommandExecutionService.test.js +460 -0
  278. package/backend/src/__tests__/services/ResourceMonitorService.test.js +207 -0
  279. package/backend/src/__tests__/services/TelemetryService.test.js +291 -0
  280. package/backend/src/__tests__/setup.js +25 -0
  281. package/backend/src/api/routes/apiKeys.js +181 -0
  282. package/backend/src/api/routes/bots.js +49 -7
  283. package/backend/src/api/routes/plugins.js +2 -1
  284. package/backend/src/api/routes/system.js +174 -0
  285. package/backend/src/container.js +82 -0
  286. package/backend/src/core/BotManager.js +142 -871
  287. package/backend/src/core/BotManager.old.js +1093 -0
  288. package/backend/src/core/BotProcess.js +1092 -858
  289. package/backend/src/core/EventGraphManager.js +280 -198
  290. package/backend/src/core/GraphExecutionEngine.js +321 -325
  291. package/backend/src/core/MessageQueue.js +27 -6
  292. package/backend/src/core/NodeRegistry.js +37 -1134
  293. package/backend/src/core/PluginManager.js +62 -12
  294. package/backend/src/core/PrismaService.js +32 -0
  295. package/backend/src/core/UserService.js +3 -3
  296. package/backend/src/core/__tests__/PrismaService.test.js +24 -0
  297. package/backend/src/core/commands/README.md +305 -0
  298. package/backend/src/core/commands/dev.js +13 -7
  299. package/backend/src/core/commands/ping.js +10 -4
  300. package/backend/src/core/commands/whois.js +63 -0
  301. package/backend/src/core/config/validation.js +27 -0
  302. package/backend/src/core/constants/graphTypes.js +21 -0
  303. package/backend/src/core/node-registries/actions.js +132 -0
  304. package/backend/src/core/node-registries/arrays.js +137 -0
  305. package/backend/src/core/node-registries/bot.js +23 -0
  306. package/backend/src/core/node-registries/data.js +290 -0
  307. package/backend/src/core/node-registries/debug.js +26 -0
  308. package/backend/src/core/node-registries/events.js +187 -0
  309. package/backend/src/core/node-registries/flow.js +139 -0
  310. package/backend/src/core/node-registries/logic.js +45 -0
  311. package/backend/src/core/node-registries/math.js +42 -0
  312. package/backend/src/core/node-registries/objects.js +98 -0
  313. package/backend/src/core/node-registries/strings.js +153 -0
  314. package/backend/src/core/node-registries/time.js +113 -0
  315. package/backend/src/core/node-registries/users.js +79 -0
  316. package/backend/src/core/nodes/{action_bot_look_at.js → actions/bot_look_at.js} +36 -36
  317. package/backend/src/core/nodes/{action_bot_set_variable.js → actions/bot_set_variable.js} +32 -32
  318. package/backend/src/core/nodes/{action_send_log.js → actions/send_log.js} +28 -23
  319. package/backend/src/core/nodes/{action_send_message.js → actions/send_message.js} +32 -32
  320. package/backend/src/core/nodes/actions/send_websocket_response.js +33 -0
  321. package/backend/src/core/nodes/arrays/get_next.js +35 -0
  322. package/backend/src/core/nodes/{data_cast.js → data/cast.js} +8 -0
  323. package/backend/src/core/nodes/data/datetime_literal.js +27 -0
  324. package/backend/src/core/nodes/data/entity_info.js +69 -0
  325. package/backend/src/core/nodes/data/get_nearby_entities.js +32 -0
  326. package/backend/src/core/nodes/data/get_nearby_players.js +64 -0
  327. package/backend/src/core/nodes/{data_get_user_field.js → data/get_user_field.js} +1 -1
  328. package/backend/src/core/nodes/data/type_check.js +53 -0
  329. package/backend/src/core/nodes/{debug_log.js → debug/log.js} +16 -16
  330. package/backend/src/core/nodes/{flow_branch.js → flow/branch.js} +15 -15
  331. package/backend/src/core/nodes/{flow_break.js → flow/break.js} +14 -14
  332. package/backend/src/core/nodes/flow/delay.js +43 -0
  333. package/backend/src/core/nodes/{flow_for_each.js → flow/for_each.js} +39 -39
  334. package/backend/src/core/nodes/{flow_sequence.js → flow/sequence.js} +16 -16
  335. package/backend/src/core/nodes/{flow_switch.js → flow/switch.js} +47 -47
  336. package/backend/src/core/nodes/{flow_while.js → flow/while.js} +1 -1
  337. package/backend/src/core/nodes/logic/__tests__/compare.test.js +83 -0
  338. package/backend/src/core/nodes/math/__tests__/operation.test.js +65 -0
  339. package/backend/src/core/nodes/strings/__tests__/concat.test.js +89 -0
  340. package/backend/src/core/nodes/time/__tests__/now.test.js +24 -0
  341. package/backend/src/core/nodes/time/add.js +33 -0
  342. package/backend/src/core/nodes/time/compare.js +35 -0
  343. package/backend/src/core/nodes/time/diff.js +29 -0
  344. package/backend/src/core/nodes/time/format.js +32 -0
  345. package/backend/src/core/nodes/time/now.js +18 -0
  346. package/backend/src/core/nodes/{user_check_blacklist.js → users/check_blacklist.js} +37 -37
  347. package/backend/src/core/nodes/{user_get_groups.js → users/get_groups.js} +36 -36
  348. package/backend/src/core/nodes/{user_get_permissions.js → users/get_permissions.js} +36 -36
  349. package/backend/src/core/nodes/{user_set_blacklist.js → users/set_blacklist.js} +37 -37
  350. package/backend/src/core/services/BotLifecycleService.js +596 -0
  351. package/backend/src/core/services/BotProcessManager.js +163 -0
  352. package/backend/src/core/services/CacheManager.js +111 -0
  353. package/backend/src/core/services/CommandExecutionService.js +351 -0
  354. package/backend/src/core/services/ResourceMonitorService.js +90 -0
  355. package/backend/src/core/services/TelemetryService.js +124 -0
  356. package/backend/src/core/services/ValidationService.js +132 -0
  357. package/backend/src/core/services/__tests__/ValidationService.test.js +148 -0
  358. package/backend/src/core/services.js +20 -5
  359. package/backend/src/core/system/CommandContext.js +84 -0
  360. package/backend/src/core/system/Transport.js +78 -0
  361. package/backend/src/core/utils/__tests__/jsonParser.test.js +44 -0
  362. package/backend/src/core/utils/jsonParser.js +18 -0
  363. package/backend/src/core/utils/secretsFilter.js +262 -0
  364. package/backend/src/core/utils/variableParser.js +89 -0
  365. package/backend/src/core/validation/__tests__/nodeSchemas.test.js +175 -0
  366. package/backend/src/core/validation/nodeSchemas.js +112 -0
  367. package/backend/src/lib/prisma.js +2 -4
  368. package/backend/src/real-time/botApi/handlers/commandHandlers.js +28 -0
  369. package/backend/src/real-time/botApi/handlers/graphHandlers.js +99 -0
  370. package/backend/src/real-time/botApi/handlers/graphWebSocketHandlers.js +147 -0
  371. package/backend/src/real-time/botApi/handlers/index.js +43 -0
  372. package/backend/src/real-time/botApi/handlers/messageHandlers.js +66 -0
  373. package/backend/src/real-time/botApi/handlers/statusHandlers.js +17 -0
  374. package/backend/src/real-time/botApi/handlers/userHandlers.js +141 -0
  375. package/backend/src/real-time/botApi/index.js +40 -0
  376. package/backend/src/real-time/botApi/middleware.js +79 -0
  377. package/backend/src/real-time/botApi/utils.js +54 -0
  378. package/backend/src/real-time/socketHandler.js +6 -2
  379. package/backend/src/repositories/BaseRepository.js +43 -0
  380. package/backend/src/repositories/BotRepository.js +42 -0
  381. package/backend/src/repositories/CommandRepository.js +53 -0
  382. package/backend/src/repositories/EventGraphRepository.js +40 -0
  383. package/backend/src/repositories/GroupRepository.js +69 -0
  384. package/backend/src/repositories/PermissionRepository.js +48 -0
  385. package/backend/src/repositories/PluginRepository.js +42 -0
  386. package/backend/src/repositories/ServerRepository.js +27 -0
  387. package/backend/src/repositories/UserRepository.js +48 -0
  388. package/backend/src/server.js +3 -0
  389. package/backend/src/test-refactor.js +85 -0
  390. package/frontend/dist/assets/index-CfTo92bP.css +1 -0
  391. package/frontend/dist/assets/index-CiFD5X9Z.js +8344 -0
  392. package/frontend/dist/index.html +2 -2
  393. package/frontend/package.json +0 -5
  394. package/package.json +2 -1
  395. package/frontend/dist/assets/index-B9GedHEa.js +0 -8352
  396. package/frontend/dist/assets/index-zLiy9MDx.css +0 -1
  397. package/nul +0 -0
  398. /package/backend/src/core/nodes/{action_http_request.js → actions/http_request.js} +0 -0
  399. /package/backend/src/core/nodes/{array_add_element.js → arrays/add_element.js} +0 -0
  400. /package/backend/src/core/nodes/{array_contains.js → arrays/contains.js} +0 -0
  401. /package/backend/src/core/nodes/{array_find_index.js → arrays/find_index.js} +0 -0
  402. /package/backend/src/core/nodes/{array_get_by_index.js → arrays/get_by_index.js} +0 -0
  403. /package/backend/src/core/nodes/{array_get_random_element.js → arrays/get_random_element.js} +0 -0
  404. /package/backend/src/core/nodes/{array_remove_by_index.js → arrays/remove_by_index.js} +0 -0
  405. /package/backend/src/core/nodes/{bot_get_position.js → bot/get_position.js} +0 -0
  406. /package/backend/src/core/nodes/{data_array_literal.js → data/array_literal.js} +0 -0
  407. /package/backend/src/core/nodes/{data_boolean_literal.js → data/boolean_literal.js} +0 -0
  408. /package/backend/src/core/nodes/{data_get_argument.js → data/get_argument.js} +0 -0
  409. /package/backend/src/core/nodes/{data_get_bot_look.js → data/get_bot_look.js} +0 -0
  410. /package/backend/src/core/nodes/{data_get_entity_field.js → data/get_entity_field.js} +0 -0
  411. /package/backend/src/core/nodes/{data_get_server_players.js → data/get_server_players.js} +0 -0
  412. /package/backend/src/core/nodes/{data_get_variable.js → data/get_variable.js} +0 -0
  413. /package/backend/src/core/nodes/{data_length.js → data/length.js} +0 -0
  414. /package/backend/src/core/nodes/{data_make_object.js → data/make_object.js} +0 -0
  415. /package/backend/src/core/nodes/{data_number_literal.js → data/number_literal.js} +0 -0
  416. /package/backend/src/core/nodes/{data_string_literal.js → data/string_literal.js} +0 -0
  417. /package/backend/src/core/nodes/{logic_compare.js → logic/compare.js} +0 -0
  418. /package/backend/src/core/nodes/{logic_operation.js → logic/operation.js} +0 -0
  419. /package/backend/src/core/nodes/{math_operation.js → math/operation.js} +0 -0
  420. /package/backend/src/core/nodes/{math_random_number.js → math/random_number.js} +0 -0
  421. /package/backend/src/core/nodes/{object_create.js → objects/create.js} +0 -0
  422. /package/backend/src/core/nodes/{object_delete.js → objects/delete.js} +0 -0
  423. /package/backend/src/core/nodes/{object_get.js → objects/get.js} +0 -0
  424. /package/backend/src/core/nodes/{object_has_key.js → objects/has_key.js} +0 -0
  425. /package/backend/src/core/nodes/{object_set.js → objects/set.js} +0 -0
  426. /package/backend/src/core/nodes/{string_concat.js → strings/concat.js} +0 -0
  427. /package/backend/src/core/nodes/{string_contains.js → strings/contains.js} +0 -0
  428. /package/backend/src/core/nodes/{string_ends_with.js → strings/ends_with.js} +0 -0
  429. /package/backend/src/core/nodes/{string_equals.js → strings/equals.js} +0 -0
  430. /package/backend/src/core/nodes/{string_length.js → strings/length.js} +0 -0
  431. /package/backend/src/core/nodes/{string_matches.js → strings/matches.js} +0 -0
  432. /package/backend/src/core/nodes/{string_split.js → strings/split.js} +0 -0
  433. /package/backend/src/core/nodes/{string_starts_with.js → strings/starts_with.js} +0 -0
@@ -0,0 +1,282 @@
1
+ # Alerting Best Practices
2
+
3
+ Alert design principles, notification routing (PagerDuty, OpsGenie), alert fatigue prevention, and effective on-call alerting strategies.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Alert Design Principles](#alert-design-principles)
8
+ - [Alert Rules](#alert-rules)
9
+ - [Notification Routing](#notification-routing)
10
+ - [Alert Fatigue Prevention](#alert-fatigue-prevention)
11
+ - [Best Practices](#best-practices)
12
+
13
+ ## Alert Design Principles
14
+
15
+ **Good Alerts:**
16
+ ```
17
+ ✅ Actionable - Can be fixed immediately
18
+ ✅ Specific - Clear what's wrong
19
+ ✅ User-impacting - Affects customers
20
+ ✅ Urgent - Requires immediate attention
21
+ ✅ Novel - Not duplicate of existing alert
22
+ ```
23
+
24
+ **Bad Alerts:**
25
+ ```
26
+ ❌ Noisy - Frequent false positives
27
+ ❌ Vague - Unclear what to do
28
+ ❌ Premature - Fires before issue impacts users
29
+ ❌ Duplicate - Same as other alerts
30
+ ❌ Low-priority - Can wait until business hours
31
+ ```
32
+
33
+ ## Alert Rules
34
+
35
+ **Prometheus Alerting:**
36
+ ```yaml
37
+ groups:
38
+ - name: slo_alerts
39
+ rules:
40
+ # Good: User-impacting, actionable
41
+ - alert: HighErrorRate
42
+ expr: |
43
+ (
44
+ sum(rate(http_requests_total{status=~"5.."}[5m]))
45
+ /
46
+ sum(rate(http_requests_total[5m]))
47
+ ) > 0.05
48
+ for: 5m
49
+ labels:
50
+ severity: critical
51
+ team: platform
52
+ annotations:
53
+ summary: "Error rate above 5% for 5 minutes"
54
+ description: "{{ $value | humanizePercentage }} of requests failing"
55
+ runbook: "https://runbooks.example.com/high-error-rate"
56
+ dashboard: "https://grafana.example.com/d/service-health"
57
+
58
+ # Good: SLO-based, clear threshold
59
+ - alert: LatencyP95High
60
+ expr: |
61
+ histogram_quantile(0.95,
62
+ rate(http_request_duration_seconds_bucket[5m])
63
+ ) > 0.5
64
+ for: 10m
65
+ labels:
66
+ severity: warning
67
+ team: platform
68
+ annotations:
69
+ summary: "P95 latency above 500ms"
70
+ impact: "Users experiencing slow response times"
71
+ ```
72
+
73
+ **Multi-Window Alerts:**
74
+ ```yaml
75
+ # Fast burn + slow burn
76
+ - alert: ErrorBudgetBurn
77
+ expr: |
78
+ (
79
+ sum(rate(http_requests_total{status=~"5.."}[1h]))
80
+ /
81
+ sum(rate(http_requests_total[1h]))
82
+ > (14.4 * (1 - 0.999))
83
+ )
84
+ and
85
+ (
86
+ sum(rate(http_requests_total{status=~"5.."}[5m]))
87
+ /
88
+ sum(rate(http_requests_total[5m]))
89
+ > (14.4 * (1 - 0.999))
90
+ )
91
+ labels:
92
+ severity: critical
93
+ annotations:
94
+ summary: "Error budget burning at 14.4x rate"
95
+ ```
96
+
97
+ ## Notification Routing
98
+
99
+ **AlertManager Config:**
100
+ ```yaml
101
+ route:
102
+ receiver: default
103
+ group_by: ['alertname', 'cluster']
104
+ group_wait: 30s
105
+ group_interval: 5m
106
+ repeat_interval: 12h
107
+
108
+ routes:
109
+ # Critical: Page immediately
110
+ - match:
111
+ severity: critical
112
+ receiver: pagerduty
113
+ group_wait: 10s
114
+ repeat_interval: 5m
115
+
116
+ # Warning: Slack notification
117
+ - match:
118
+ severity: warning
119
+ receiver: slack
120
+ repeat_interval: 4h
121
+
122
+ # Info: Email only
123
+ - match:
124
+ severity: info
125
+ receiver: email
126
+ repeat_interval: 24h
127
+
128
+ receivers:
129
+ - name: pagerduty
130
+ pagerduty_configs:
131
+ - service_key: $PAGERDUTY_SERVICE_KEY
132
+ description: "{{ .GroupLabels.alertname }}"
133
+
134
+ - name: slack
135
+ slack_configs:
136
+ - api_url: $SLACK_WEBHOOK_URL
137
+ channel: '#alerts'
138
+ title: "{{ .GroupLabels.alertname }}"
139
+ text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
140
+
141
+ - name: email
142
+ email_configs:
143
+ - to: 'team@example.com'
144
+ from: 'alertmanager@example.com'
145
+ ```
146
+
147
+ **PagerDuty Integration:**
148
+ ```yaml
149
+ pagerduty_configs:
150
+ - routing_key: $PAGERDUTY_ROUTING_KEY
151
+ severity: "{{ .CommonLabels.severity }}"
152
+ client: "Alertmanager"
153
+ client_url: "{{ .ExternalURL }}"
154
+ description: "{{ .GroupLabels.alertname }}"
155
+ details:
156
+ firing: "{{ .Alerts.Firing | len }}"
157
+ resolved: "{{ .Alerts.Resolved | len }}"
158
+ summary: "{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}"
159
+ ```
160
+
161
+ ## Alert Fatigue Prevention
162
+
163
+ **Strategies:**
164
+
165
+ 1. **High Signal-to-Noise Ratio**
166
+ ```
167
+ Target: < 5% false positive rate
168
+ If alert fires but no action taken → remove or adjust
169
+ ```
170
+
171
+ 2. **Appropriate Thresholds**
172
+ ```yaml
173
+ # Too sensitive
174
+ expr: cpu_usage > 0.5 # Fires constantly
175
+
176
+ # Better
177
+ expr: cpu_usage > 0.9 # Sustained high usage: set "for: 10m" as a separate field on the rule
178
+ ```
179
+
180
+ 3. **Group Similar Alerts**
181
+ ```yaml
182
+ route:
183
+ group_by: ['alertname', 'cluster', 'service']
184
+ group_wait: 30s # Wait to group
185
+ group_interval: 5m # Send grouped updates
186
+ ```
187
+
188
+ 4. **Escalation Policies**
189
+ ```yaml
190
+ # PagerDuty escalation
191
+ escalation_policy:
192
+ - level: 1
193
+ targets: [on_call_primary]
194
+ escalation_delay: 5m
195
+
196
+ - level: 2
197
+ targets: [on_call_secondary, team_lead]
198
+ escalation_delay: 10m
199
+
200
+ - level: 3
201
+ targets: [engineering_manager]
202
+ escalation_delay: 15m
203
+ ```
204
+
205
+ 5. **Alert Inhibition**
206
+ ```yaml
207
+ inhibit_rules:
208
+ # If service is down, don't alert on high latency
209
+ - source_match:
210
+ severity: critical
211
+ alertname: ServiceDown
212
+ target_match:
213
+ severity: warning
214
+ alertname: HighLatency
215
+ equal: ['service']
216
+ ```
217
+
218
+ ## Best Practices
219
+
220
+ ### 1. Include Runbook Links
221
+
222
+ ```yaml
223
+ annotations:
224
+ runbook: "https://runbooks.example.com/high-error-rate" # hard-code per alert: $labels does not carry alertname when annotations are templated
225
+ ```
226
+
227
+ ### 2. Add Context
228
+
229
+ ```yaml
230
+ annotations:
231
+ description: |
232
+ Service {{ $labels.service }} error rate is {{ $value | humanizePercentage }}
233
+ Dashboard: https://grafana.example.com/d/{{ $labels.service }}
234
+ Logs: https://logs.example.com/?service={{ $labels.service }}
235
+ ```
236
+
237
+ ### 3. Test Alerts
238
+
239
+ ```bash
240
+ # Send test alert
241
+ amtool alert add alertname=TestAlert severity=warning
242
+
243
+ # Check routing
244
+ amtool config routes test --config.file=alertmanager.yml \
245
+ severity=critical team=platform
246
+ ```
247
+
248
+ ### 4. Review Alerts Regularly
249
+
250
+ ```yaml
251
+ # Quarterly alert audit
252
+ review_process:
253
+ - Check false positive rate
254
+ - Verify runbooks are current
255
+ - Update thresholds based on trends
256
+ - Remove unused alerts
257
+ ```
258
+
259
+ ### 5. Time-Based Routing
260
+
261
+ ```yaml
262
+ # Different routing for business hours vs off-hours
263
+ routes:
264
+ - match:
265
+ severity: warning
266
+ receiver: slack
267
+ active_time_intervals:
268
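+ - business_hours
269
+ continue: true # an inactive route still ends matching; continue so the off-hours route below is evaluated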
270
+ - match:
271
+ severity: warning
272
+ receiver: email
273
+ active_time_intervals:
274
+ - off_hours
275
+ ```
276
+
277
+ ---
278
+
279
+ **Related Resources:**
280
+ - [incident-management.md](incident-management.md)
281
+ - [on-call-runbooks.md](on-call-runbooks.md)
282
+ - [observability-stack.md](observability-stack.md)
@@ -0,0 +1,226 @@
1
+ # Capacity Planning
2
+
3
+ Resource forecasting, growth modeling, scalability analysis, load testing, and proactive capacity management.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Capacity Planning Process](#capacity-planning-process)
8
+ - [Resource Forecasting](#resource-forecasting)
9
+ - [Load Testing](#load-testing)
10
+ - [Scalability Analysis](#scalability-analysis)
11
+
12
+ ## Capacity Planning Process
13
+
14
+ ```yaml
15
+ quarterly_process:
16
+ 1_collect_data:
17
+ - Current resource usage trends
18
+ - Traffic growth patterns
19
+ - Business projections
20
+ - Seasonal variations
21
+
22
+ 2_forecast:
23
+ - Project 6-12 months ahead
24
+ - Account for growth initiatives
25
+ - Include safety margin (20-30%)
26
+
27
+ 3_plan_upgrades:
28
+ - Identify bottlenecks
29
+ - Plan infrastructure changes
30
+ - Budget for new resources
31
+
32
+ 4_implement:
33
+ - Gradual rollout
34
+ - Monitor impact
35
+ - Adjust as needed
36
+ ```
37
+
38
+ ## Resource Forecasting
39
+
40
+ **Linear Growth Model:**
41
+ ```python
42
+ import pandas as pd
43
+ import numpy as np
44
+ from sklearn.linear_model import LinearRegression
45
+
46
+ def forecast_capacity(historical_data, months_ahead=6):
47
+ """
48
+ Forecast resource requirements
49
+
50
+ Args:
51
+ historical_data: DataFrame with 'date' and 'usage' columns
52
+ months_ahead: Number of months to forecast
53
+
54
+ Returns:
55
+ Forecasted usage values
56
+ """
57
+ # Prepare data
58
+ X = np.array(range(len(historical_data))).reshape(-1, 1)
59
+ y = historical_data['usage'].values
60
+
61
+ # Train model
62
+ model = LinearRegression()
63
+ model.fit(X, y)
64
+
65
+ # Forecast
66
+ future_X = np.array(range(len(historical_data),
67
+ len(historical_data) + months_ahead)).reshape(-1, 1)
68
+ forecast = model.predict(future_X)
69
+
70
+ # Add 30% safety margin
71
+ return forecast * 1.3
72
+
73
+ # Usage
74
+ import pandas as pd
75
+ data = pd.DataFrame({
76
+ 'date': pd.date_range('2023-01-01', periods=12, freq='MS'),
77
+ 'usage': [100, 110, 115, 125, 130, 140, 145, 155, 160, 170, 175, 185]
78
+ })
79
+
80
+ forecast = forecast_capacity(data, months_ahead=6)
81
+ print(f"Forecasted usage in 6 months: {forecast[-1]:.0f}")
82
+ ```
83
+
84
+ **Capacity Metrics:**
85
+ ```yaml
86
+ cpu:
87
+ current_avg: 45%
88
+ current_p95: 75%
89
+ target_max: 80%
90
+ growth_rate: 5% monthly
91
+ action_needed: Scale in 4 months
92
+
93
+ memory:
94
+ current_avg: 60%
95
+ current_p95: 85%
96
+ target_max: 85%
97
+ growth_rate: 3% monthly
98
+ action_needed: Scale in 6 months
99
+
100
+ storage:
101
+ current_usage: 500GB
102
+ total_capacity: 1TB
103
+ growth_rate: 50GB monthly
104
+ action_needed: Scale in 10 months
105
+ ```
106
+
107
+ ## Load Testing
108
+
109
+ **k6 Load Test:**
110
+ ```javascript
111
+ // load-test.js
112
+ import http from 'k6/http';
113
+ import { check, sleep } from 'k6';
114
+
115
+ export const options = {
116
+ stages: [
117
+ { duration: '5m', target: 100 }, // Ramp up to 100 users
118
+ { duration: '10m', target: 100 }, // Stay at 100 users
119
+ { duration: '5m', target: 500 }, // Ramp to 500 users
120
+ { duration: '10m', target: 500 }, // Stay at 500
121
+ { duration: '5m', target: 1000 }, // Spike to 1000
122
+ { duration: '5m', target: 0 }, // Ramp down
123
+ ],
124
+ thresholds: {
125
+ http_req_duration: ['p(95)<500'], // 95% of requests < 500ms
126
+ http_req_failed: ['rate<0.01'], // Error rate < 1%
127
+ },
128
+ };
129
+
130
+ export default function () {
131
+ const res = http.get('https://api.example.com/');
132
+ check(res, {
133
+ 'status is 200': (r) => r.status === 200,
134
+ 'response time < 500ms': (r) => r.timings.duration < 500,
135
+ });
136
+ sleep(1);
137
+ }
138
+ ```
139
+
140
+ **Run Load Test:**
141
+ ```bash
142
+ # Local test
143
+ k6 run load-test.js
144
+
145
+ # Cloud test (distributed)
146
+ k6 cloud load-test.js
147
+
148
+ # With custom VUs
149
+ k6 run --vus 1000 --duration 30m load-test.js
150
+ ```
151
+
152
+ ## Scalability Analysis
153
+
154
+ **Horizontal vs Vertical Scaling:**
155
+ ```yaml
156
+ horizontal_scaling:
157
+ when: Stateless applications, need high availability
158
+ pros:
159
+ - No downtime
160
+ - Better fault tolerance
161
+ - Linear cost scaling
162
+ cons:
163
+ - More complex
164
+ - Coordination overhead
165
+
166
+ vertical_scaling:
167
+ when: Stateful applications, simpler architecture
168
+ pros:
169
+ - Simpler architecture
170
+ - Less coordination
171
+ cons:
172
+ - Downtime required
173
+ - Upper limits
174
+ - Single point of failure
175
+ ```
176
+
177
+ **Auto-scaling Configuration:**
178
+ ```yaml
179
+ apiVersion: autoscaling/v2
180
+ kind: HorizontalPodAutoscaler
181
+ metadata:
182
+ name: api-hpa
183
+ spec:
184
+ scaleTargetRef:
185
+ apiVersion: apps/v1
186
+ kind: Deployment
187
+ name: api
188
+ minReplicas: 3
189
+ maxReplicas: 100
190
+ metrics:
191
+ - type: Resource
192
+ resource:
193
+ name: cpu
194
+ target:
195
+ type: Utilization
196
+ averageUtilization: 70
197
+ - type: Resource
198
+ resource:
199
+ name: memory
200
+ target:
201
+ type: Utilization
202
+ averageUtilization: 80
203
+ behavior:
204
+ scaleDown:
205
+ stabilizationWindowSeconds: 300
206
+ policies:
207
+ - type: Percent
208
+ value: 50
209
+ periodSeconds: 60
210
+ scaleUp:
211
+ stabilizationWindowSeconds: 0
212
+ policies:
213
+ - type: Percent
214
+ value: 100
215
+ periodSeconds: 30
216
+ - type: Pods
217
+ value: 5
218
+ periodSeconds: 30
219
+ selectPolicy: Max
220
+ ```
221
+
222
+ ---
223
+
224
+ **Related Resources:**
225
+ - [performance-optimization.md](performance-optimization.md)
226
+ - [resource-management.md](../platform-engineering/resources/resource-management.md)
@@ -0,0 +1,193 @@
1
+ # Chaos Engineering
2
+
3
+ Chaos Monkey, fault injection, failure mode testing, Chaos Toolkit, Litmus Chaos, and resilience testing practices.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Principles](#principles)
8
+ - [Tools](#tools)
9
+ - [Experiments](#experiments)
10
+ - [Best Practices](#best-practices)
11
+
12
+ ## Principles
13
+
14
+ **Chaos Engineering Principles:**
15
+ 1. Build a hypothesis around steady state
16
+ 2. Vary real-world events
17
+ 3. Run experiments in production
18
+ 4. Automate experiments
19
+ 5. Minimize blast radius
20
+
21
+ ## Tools
22
+
23
+ **Chaos Mesh (Kubernetes):**
24
+ ```yaml
25
+ apiVersion: chaos-mesh.org/v1alpha1
26
+ kind: PodChaos
27
+ metadata:
28
+ name: pod-failure-example
29
+ spec:
30
+ action: pod-failure
31
+ mode: one
32
+ selector:
33
+ namespaces:
34
+ - production
35
+ labelSelectors:
36
+ app: api-service
37
+ duration: "30s"
38
+ scheduler:
39
+ cron: "@every 2h"
40
+ ```
41
+
42
+ **Network Chaos:**
43
+ ```yaml
44
+ apiVersion: chaos-mesh.org/v1alpha1
45
+ kind: NetworkChaos
46
+ metadata:
47
+ name: network-delay
48
+ spec:
49
+ action: delay
50
+ mode: all
51
+ selector:
52
+ namespaces:
53
+ - production
54
+ labelSelectors:
55
+ app: api-service
56
+ delay:
57
+ latency: "100ms"
58
+ correlation: "25"
59
+ jitter: "10ms"
60
+ duration: "5m"
61
+ ```
62
+
63
+ **Litmus Chaos:**
64
+ ```yaml
65
+ apiVersion: litmuschaos.io/v1alpha1
66
+ kind: ChaosEngine
67
+ metadata:
68
+ name: nginx-chaos
69
+ spec:
70
+ appinfo:
71
+ appns: 'default'
72
+ applabel: 'app=nginx'
73
+ appkind: 'deployment'
74
+ chaosServiceAccount: litmus-admin
75
+ experiments:
76
+ - name: pod-delete
77
+ spec:
78
+ components:
79
+ env:
80
+ - name: TOTAL_CHAOS_DURATION
81
+ value: '30'
82
+ - name: CHAOS_INTERVAL
83
+ value: '10'
84
+ - name: FORCE
85
+ value: 'false'
86
+ ```
87
+
88
+ ## Experiments
89
+
90
+ **Pod Deletion Test:**
91
+ ```bash
92
+ # Verify system handles pod failures
93
+ kubectl delete pod "$(kubectl get pods -l app=api-service -o jsonpath='{.items[0].metadata.name}')" --grace-period=0
94
+
95
+ # Expected outcome:
96
+ # - New pod starts automatically
97
+ # - No service interruption
98
+ # - Requests handled by other pods
99
+ ```
100
+
101
+ **Database Failure Simulation:**
102
+ ```yaml
103
+ # Simulate database connection issues
104
+ apiVersion: chaos-mesh.org/v1alpha1
105
+ kind: NetworkChaos
106
+ metadata:
107
+ name: db-partition
108
+ spec:
109
+ action: partition
110
+ mode: all
111
+ selector:
112
+ namespaces:
113
+ - production
114
+ labelSelectors:
115
+ app: api-service
116
+ direction: to
117
+ target:
118
+ selector:
119
+ namespaces:
120
+ - production
121
+ labelSelectors:
122
+ app: postgres
123
+ duration: "2m"
124
+ ```
125
+
126
+ **CPU Stress Test:**
127
+ ```yaml
128
+ apiVersion: chaos-mesh.org/v1alpha1
129
+ kind: StressChaos
130
+ metadata:
131
+ name: cpu-stress
132
+ spec:
133
+ mode: one
134
+ selector:
135
+ namespaces:
136
+ - production
137
+ labelSelectors:
138
+ app: api-service
139
+ stressors:
140
+ cpu:
141
+ workers: 4
142
+ load: 80
143
+ duration: "5m"
144
+ ```
145
+
146
+ ## Best Practices
147
+
148
+ ### 1. Start Small
149
+
150
+ ```
151
+ Begin in dev/staging
152
+ Small blast radius
153
+ Short duration
154
+ Gradually increase scope
155
+ ```
156
+
157
+ ### 2. Define Success Criteria
158
+
159
+ ```yaml
160
+ experiment:
161
+ hypothesis: "API continues serving traffic during pod failure"
162
+ success_criteria:
163
+ - Error rate < 0.1%
164
+ - P95 latency < 500ms
165
+ - No customer impact
166
+ failure_action: Rollback immediately
167
+ ```
168
+
169
+ ### 3. Automate Chaos
170
+
171
+ ```yaml
172
+ # Regular chaos experiments
173
+ schedule:
174
+ daily: Pod deletion
175
+ weekly: Network latency
176
+ monthly: Region failure simulation
177
+ ```
178
+
179
+ ### 4. Monitor During Experiments
180
+
181
+ ```yaml
182
+ observability:
183
+ - Real-time dashboards
184
+ - Alert on anomalies
185
+ - Correlate with experiment timeline
186
+ - Document unexpected behavior
187
+ ```
188
+
189
+ ---
190
+
191
+ **Related Resources:**
192
+ - [reliability-patterns.md](reliability-patterns.md)
193
+ - [incident-management.md](incident-management.md)