blockmine 1.21.0 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (433) hide show
  1. package/.claude/agents/README.md +469 -0
  2. package/.claude/agents/auth-route-debugger.md +118 -0
  3. package/.claude/agents/auth-route-tester.md +93 -0
  4. package/.claude/agents/auto-error-resolver.md +97 -0
  5. package/.claude/agents/build-optimizer.md +236 -0
  6. package/.claude/agents/code-architecture-reviewer.md +83 -0
  7. package/.claude/agents/code-refactor-master.md +94 -0
  8. package/.claude/agents/cost-optimizer.md +134 -0
  9. package/.claude/agents/deployment-orchestrator.md +113 -0
  10. package/.claude/agents/documentation-architect.md +82 -0
  11. package/.claude/agents/frontend-error-fixer.md +77 -0
  12. package/.claude/agents/iac-code-generator.md +71 -0
  13. package/.claude/agents/incident-responder.md +346 -0
  14. package/.claude/agents/infrastructure-architect.md +31 -0
  15. package/.claude/agents/kubernetes-specialist.md +56 -0
  16. package/.claude/agents/migration-planner.md +181 -0
  17. package/.claude/agents/network-architect.md +196 -0
  18. package/.claude/agents/plan-reviewer.md +52 -0
  19. package/.claude/agents/refactor-planner.md +63 -0
  20. package/.claude/agents/security-scanner.md +102 -0
  21. package/.claude/agents/web-research-specialist.md +78 -0
  22. package/.claude/commands/cost-analysis.md +315 -0
  23. package/.claude/commands/dev-docs-update.md +55 -0
  24. package/.claude/commands/dev-docs.md +51 -0
  25. package/.claude/commands/incident-debug.md +247 -0
  26. package/.claude/commands/infra-plan.md +81 -0
  27. package/.claude/commands/migration-plan.md +478 -0
  28. package/.claude/commands/route-research-for-testing.md +37 -0
  29. package/.claude/commands/security-review.md +66 -0
  30. package/.claude/hooks/CONFIG.md +448 -0
  31. package/.claude/hooks/README.md +163 -0
  32. package/.claude/hooks/SKILL_ACTIVATION_COMPLETE.md +226 -0
  33. package/.claude/hooks/WINDOWS_HOOKS_README.md +151 -0
  34. package/.claude/hooks/add-skill-activation-banners.ts +132 -0
  35. package/.claude/hooks/comprehensive-skill-test.ts +1315 -0
  36. package/.claude/hooks/error-handling-reminder.sh +12 -0
  37. package/.claude/hooks/error-handling-reminder.ts +222 -0
  38. package/.claude/hooks/k8s-manifest-validator.sh +56 -0
  39. package/.claude/hooks/package-lock.json +556 -0
  40. package/.claude/hooks/package.json +16 -0
  41. package/.claude/hooks/post-tool-use-tracker.ps1 +174 -0
  42. package/.claude/hooks/post-tool-use-tracker.sh +183 -0
  43. package/.claude/hooks/security-policy-check.sh +247 -0
  44. package/.claude/hooks/skill-activation-prompt.ps1 +10 -0
  45. package/.claude/hooks/skill-activation-prompt.sh +10 -0
  46. package/.claude/hooks/skill-activation-prompt.ts +141 -0
  47. package/.claude/hooks/stop-build-check-enhanced.sh +130 -0
  48. package/.claude/hooks/terraform-validator.sh +53 -0
  49. package/.claude/hooks/test-input.json +7 -0
  50. package/.claude/hooks/test-skill-activation.ts +427 -0
  51. package/.claude/hooks/trigger-build-resolver.sh +79 -0
  52. package/.claude/hooks/tsc-check.sh +173 -0
  53. package/.claude/hooks/tsconfig.json +19 -0
  54. package/.claude/settings.json +55 -0
  55. package/.claude/settings.local.json +27 -14
  56. package/.claude/skills/README.md +507 -0
  57. package/.claude/skills/api-engineering/SKILL.md +63 -0
  58. package/.claude/skills/api-engineering/resources/api-versioning.md +88 -0
  59. package/.claude/skills/api-engineering/resources/graphql-patterns.md +106 -0
  60. package/.claude/skills/api-engineering/resources/rate-limiting.md +118 -0
  61. package/.claude/skills/api-engineering/resources/rest-api-design.md +105 -0
  62. package/.claude/skills/backend-dev-guidelines/SKILL.md +306 -0
  63. package/.claude/skills/backend-dev-guidelines/resources/architecture-overview.md +451 -0
  64. package/.claude/skills/backend-dev-guidelines/resources/async-and-errors.md +307 -0
  65. package/.claude/skills/backend-dev-guidelines/resources/complete-examples.md +638 -0
  66. package/.claude/skills/backend-dev-guidelines/resources/configuration.md +275 -0
  67. package/.claude/skills/backend-dev-guidelines/resources/database-patterns.md +224 -0
  68. package/.claude/skills/backend-dev-guidelines/resources/middleware-guide.md +213 -0
  69. package/.claude/skills/backend-dev-guidelines/resources/routing-and-controllers.md +756 -0
  70. package/.claude/skills/backend-dev-guidelines/resources/sentry-and-monitoring.md +336 -0
  71. package/.claude/skills/backend-dev-guidelines/resources/services-and-repositories.md +789 -0
  72. package/.claude/skills/backend-dev-guidelines/resources/testing-guide.md +235 -0
  73. package/.claude/skills/backend-dev-guidelines/resources/validation-patterns.md +754 -0
  74. package/.claude/skills/budget-and-cost-management/SKILL.md +850 -0
  75. package/.claude/skills/build-engineering/SKILL.md +431 -0
  76. package/.claude/skills/build-engineering/resources/artifact-repositories.md +72 -0
  77. package/.claude/skills/build-engineering/resources/build-caching.md +96 -0
  78. package/.claude/skills/build-engineering/resources/build-pipelines.md +105 -0
  79. package/.claude/skills/build-engineering/resources/build-security.md +95 -0
  80. package/.claude/skills/build-engineering/resources/build-systems.md +389 -0
  81. package/.claude/skills/build-engineering/resources/compilation-optimization.md +201 -0
  82. package/.claude/skills/build-engineering/resources/dependency-management.md +73 -0
  83. package/.claude/skills/build-engineering/resources/monorepo-builds.md +110 -0
  84. package/.claude/skills/build-engineering/resources/performance-optimization.md +113 -0
  85. package/.claude/skills/build-engineering/resources/reproducible-builds.md +82 -0
  86. package/.claude/skills/cloud-engineering/SKILL.md +675 -0
  87. package/.claude/skills/cloud-engineering/resources/aws-patterns.md +742 -0
  88. package/.claude/skills/cloud-engineering/resources/azure-patterns.md +714 -0
  89. package/.claude/skills/cloud-engineering/resources/cleared-cloud-environments.md +987 -0
  90. package/.claude/skills/cloud-engineering/resources/cloud-cost-optimization.md +757 -0
  91. package/.claude/skills/cloud-engineering/resources/cloud-networking.md +1058 -0
  92. package/.claude/skills/cloud-engineering/resources/cloud-security-tools.md +1530 -0
  93. package/.claude/skills/cloud-engineering/resources/cloud-security.md +990 -0
  94. package/.claude/skills/cloud-engineering/resources/gcp-patterns.md +758 -0
  95. package/.claude/skills/cloud-engineering/resources/migration-strategies.md +820 -0
  96. package/.claude/skills/cloud-engineering/resources/multi-cloud-strategies.md +670 -0
  97. package/.claude/skills/cloud-engineering/resources/oci-patterns.md +1198 -0
  98. package/.claude/skills/cloud-engineering/resources/serverless-patterns.md +795 -0
  99. package/.claude/skills/cloud-engineering/resources/well-architected-frameworks.md +966 -0
  100. package/.claude/skills/cybersecurity/SKILL.md +409 -0
  101. package/.claude/skills/cybersecurity/resources/security-architecture.md +266 -0
  102. package/.claude/skills/database-engineering/SKILL.md +61 -0
  103. package/.claude/skills/database-engineering/resources/backup-and-recovery.md +72 -0
  104. package/.claude/skills/database-engineering/resources/database-replication.md +63 -0
  105. package/.claude/skills/database-engineering/resources/postgresql-fundamentals.md +70 -0
  106. package/.claude/skills/database-engineering/resources/query-optimization.md +68 -0
  107. package/.claude/skills/devsecops/SKILL.md +374 -0
  108. package/.claude/skills/devsecops/resources/ci-cd-security.md +204 -0
  109. package/.claude/skills/devsecops/resources/compliance-automation.md +530 -0
  110. package/.claude/skills/devsecops/resources/compliance-frameworks.md +2322 -0
  111. package/.claude/skills/devsecops/resources/container-security.md +915 -0
  112. package/.claude/skills/devsecops/resources/cspm-integration.md +1440 -0
  113. package/.claude/skills/devsecops/resources/policy-enforcement.md +619 -0
  114. package/.claude/skills/devsecops/resources/secrets-management.md +755 -0
  115. package/.claude/skills/devsecops/resources/security-monitoring.md +146 -0
  116. package/.claude/skills/devsecops/resources/security-scanning.md +887 -0
  117. package/.claude/skills/devsecops/resources/security-testing.md +203 -0
  118. package/.claude/skills/devsecops/resources/supply-chain-security.md +518 -0
  119. package/.claude/skills/devsecops/resources/vulnerability-management.md +481 -0
  120. package/.claude/skills/devsecops/resources/zero-trust-architecture.md +177 -0
  121. package/.claude/skills/documentation-as-code/SKILL.md +323 -0
  122. package/.claude/skills/documentation-as-code/resources/api-documentation.md +90 -0
  123. package/.claude/skills/documentation-as-code/resources/changelog-management.md +79 -0
  124. package/.claude/skills/documentation-as-code/resources/diagram-generation.md +44 -0
  125. package/.claude/skills/documentation-as-code/resources/docs-as-code-workflow.md +99 -0
  126. package/.claude/skills/documentation-as-code/resources/documentation-automation.md +68 -0
  127. package/.claude/skills/documentation-as-code/resources/documentation-sites.md +79 -0
  128. package/.claude/skills/documentation-as-code/resources/markdown-best-practices.md +162 -0
  129. package/.claude/skills/documentation-as-code/resources/openapi-specification.md +77 -0
  130. package/.claude/skills/documentation-as-code/resources/readme-engineering.md +60 -0
  131. package/.claude/skills/documentation-as-code/resources/technical-writing-guide.md +202 -0
  132. package/.claude/skills/engineering-management/SKILL.md +356 -0
  133. package/.claude/skills/engineering-management/resources/career-ladders.md +609 -0
  134. package/.claude/skills/engineering-management/resources/hiring-and-assessment.md +555 -0
  135. package/.claude/skills/engineering-management/resources/one-on-one-guides.md +609 -0
  136. package/.claude/skills/engineering-management/resources/resource-planning.md +557 -0
  137. package/.claude/skills/engineering-management/resources/team-organization-patterns.md +491 -0
  138. package/.claude/skills/engineering-management/resources/technical-interviews.md +474 -0
  139. package/.claude/skills/engineering-operations-management/SKILL.md +817 -0
  140. package/.claude/skills/error-tracking/SKILL.md +379 -0
  141. package/.claude/skills/frontend-dev-guidelines/SKILL.md +403 -0
  142. package/.claude/skills/frontend-dev-guidelines/resources/common-patterns.md +331 -0
  143. package/.claude/skills/frontend-dev-guidelines/resources/complete-examples.md +872 -0
  144. package/.claude/skills/frontend-dev-guidelines/resources/component-patterns.md +502 -0
  145. package/.claude/skills/frontend-dev-guidelines/resources/data-fetching.md +767 -0
  146. package/.claude/skills/frontend-dev-guidelines/resources/file-organization.md +502 -0
  147. package/.claude/skills/frontend-dev-guidelines/resources/loading-and-error-states.md +501 -0
  148. package/.claude/skills/frontend-dev-guidelines/resources/performance.md +406 -0
  149. package/.claude/skills/frontend-dev-guidelines/resources/routing-guide.md +364 -0
  150. package/.claude/skills/frontend-dev-guidelines/resources/styling-guide.md +428 -0
  151. package/.claude/skills/frontend-dev-guidelines/resources/typescript-standards.md +418 -0
  152. package/.claude/skills/general-it-engineering/SKILL.md +393 -0
  153. package/.claude/skills/general-it-engineering/resources/asset-management.md +712 -0
  154. package/.claude/skills/general-it-engineering/resources/automation-orchestration.md +817 -0
  155. package/.claude/skills/general-it-engineering/resources/business-continuity.md +786 -0
  156. package/.claude/skills/general-it-engineering/resources/change-management.md +715 -0
  157. package/.claude/skills/general-it-engineering/resources/enterprise-monitoring.md +729 -0
  158. package/.claude/skills/general-it-engineering/resources/help-desk-operations.md +738 -0
  159. package/.claude/skills/general-it-engineering/resources/incident-service-management.md +834 -0
  160. package/.claude/skills/general-it-engineering/resources/it-governance.md +753 -0
  161. package/.claude/skills/general-it-engineering/resources/itil-framework.md +503 -0
  162. package/.claude/skills/general-it-engineering/resources/service-management.md +669 -0
  163. package/.claude/skills/infrastructure-architecture/SKILL.md +328 -0
  164. package/.claude/skills/infrastructure-architecture/resources/architecture-decision-records.md +505 -0
  165. package/.claude/skills/infrastructure-architecture/resources/architecture-patterns.md +528 -0
  166. package/.claude/skills/infrastructure-architecture/resources/capacity-planning.md +453 -0
  167. package/.claude/skills/infrastructure-architecture/resources/cleared-environment-architecture.md +773 -0
  168. package/.claude/skills/infrastructure-architecture/resources/cost-architecture.md +499 -0
  169. package/.claude/skills/infrastructure-architecture/resources/data-architecture.md +501 -0
  170. package/.claude/skills/infrastructure-architecture/resources/disaster-recovery.md +535 -0
  171. package/.claude/skills/infrastructure-architecture/resources/migration-architecture.md +512 -0
  172. package/.claude/skills/infrastructure-architecture/resources/multi-region-design.md +608 -0
  173. package/.claude/skills/infrastructure-architecture/resources/reference-architectures.md +562 -0
  174. package/.claude/skills/infrastructure-architecture/resources/security-architecture.md +538 -0
  175. package/.claude/skills/infrastructure-architecture/resources/system-design-principles.md +489 -0
  176. package/.claude/skills/infrastructure-architecture/resources/workload-classification.md +1000 -0
  177. package/.claude/skills/infrastructure-strategy/SKILL.md +924 -0
  178. package/.claude/skills/network-engineering/SKILL.md +385 -0
  179. package/.claude/skills/network-engineering/resources/dns-management.md +738 -0
  180. package/.claude/skills/network-engineering/resources/load-balancing.md +820 -0
  181. package/.claude/skills/network-engineering/resources/network-architecture.md +546 -0
  182. package/.claude/skills/network-engineering/resources/network-security.md +921 -0
  183. package/.claude/skills/network-engineering/resources/network-troubleshooting.md +749 -0
  184. package/.claude/skills/network-engineering/resources/routing-switching.md +373 -0
  185. package/.claude/skills/network-engineering/resources/sdn-networking.md +695 -0
  186. package/.claude/skills/network-engineering/resources/service-mesh-networking.md +777 -0
  187. package/.claude/skills/network-engineering/resources/tcp-ip-protocols.md +444 -0
  188. package/.claude/skills/network-engineering/resources/vpn-connectivity.md +672 -0
  189. package/.claude/skills/observability-engineering/SKILL.md +101 -0
  190. package/.claude/skills/observability-engineering/resources/apm-tools.md +97 -0
  191. package/.claude/skills/observability-engineering/resources/correlation-strategies.md +87 -0
  192. package/.claude/skills/observability-engineering/resources/distributed-tracing.md +98 -0
  193. package/.claude/skills/observability-engineering/resources/logs-aggregation.md +118 -0
  194. package/.claude/skills/observability-engineering/resources/observability-cost-optimization.md +141 -0
  195. package/.claude/skills/observability-engineering/resources/opentelemetry.md +110 -0
  196. package/.claude/skills/platform-engineering/SKILL.md +555 -0
  197. package/.claude/skills/platform-engineering/resources/architecture-overview.md +600 -0
  198. package/.claude/skills/platform-engineering/resources/container-orchestration.md +916 -0
  199. package/.claude/skills/platform-engineering/resources/cost-optimization.md +634 -0
  200. package/.claude/skills/platform-engineering/resources/developer-platforms.md +670 -0
  201. package/.claude/skills/platform-engineering/resources/gitops-automation.md +650 -0
  202. package/.claude/skills/platform-engineering/resources/infrastructure-as-code.md +778 -0
  203. package/.claude/skills/platform-engineering/resources/infrastructure-standards.md +708 -0
  204. package/.claude/skills/platform-engineering/resources/multi-tenancy.md +602 -0
  205. package/.claude/skills/platform-engineering/resources/platform-security.md +711 -0
  206. package/.claude/skills/platform-engineering/resources/resource-management.md +592 -0
  207. package/.claude/skills/platform-engineering/resources/service-mesh.md +628 -0
  208. package/.claude/skills/release-engineering/SKILL.md +393 -0
  209. package/.claude/skills/release-engineering/resources/artifact-management.md +108 -0
  210. package/.claude/skills/release-engineering/resources/build-optimization.md +84 -0
  211. package/.claude/skills/release-engineering/resources/ci-cd-pipelines.md +411 -0
  212. package/.claude/skills/release-engineering/resources/deployment-strategies.md +197 -0
  213. package/.claude/skills/release-engineering/resources/pipeline-security.md +62 -0
  214. package/.claude/skills/release-engineering/resources/progressive-delivery.md +83 -0
  215. package/.claude/skills/release-engineering/resources/release-automation.md +68 -0
  216. package/.claude/skills/release-engineering/resources/release-orchestration.md +77 -0
  217. package/.claude/skills/release-engineering/resources/rollback-strategies.md +66 -0
  218. package/.claude/skills/release-engineering/resources/versioning-strategies.md +59 -0
  219. package/.claude/skills/route-tester/SKILL.md +392 -0
  220. package/.claude/skills/skill-developer/ADVANCED.md +197 -0
  221. package/.claude/skills/skill-developer/HOOK_MECHANISMS.md +306 -0
  222. package/.claude/skills/skill-developer/PATTERNS_LIBRARY.md +152 -0
  223. package/.claude/skills/skill-developer/SKILL.md +430 -0
  224. package/.claude/skills/skill-developer/SKILL_RULES_REFERENCE.md +315 -0
  225. package/.claude/skills/skill-developer/TRIGGER_TYPES.md +305 -0
  226. package/.claude/skills/skill-developer/TROUBLESHOOTING.md +514 -0
  227. package/.claude/skills/skill-rules.json +2940 -0
  228. package/.claude/skills/sre/SKILL.md +464 -0
  229. package/.claude/skills/sre/resources/alerting-best-practices.md +282 -0
  230. package/.claude/skills/sre/resources/capacity-planning.md +226 -0
  231. package/.claude/skills/sre/resources/chaos-engineering.md +193 -0
  232. package/.claude/skills/sre/resources/disaster-recovery.md +232 -0
  233. package/.claude/skills/sre/resources/incident-management.md +436 -0
  234. package/.claude/skills/sre/resources/observability-stack.md +240 -0
  235. package/.claude/skills/sre/resources/on-call-runbooks.md +167 -0
  236. package/.claude/skills/sre/resources/performance-optimization.md +108 -0
  237. package/.claude/skills/sre/resources/reliability-patterns.md +183 -0
  238. package/.claude/skills/sre/resources/slo-sli-sla.md +464 -0
  239. package/.claude/skills/sre/resources/toil-reduction.md +145 -0
  240. package/.claude/skills/systems-engineering/SKILL.md +648 -0
  241. package/.claude/skills/systems-engineering/resources/automation-patterns.md +771 -0
  242. package/.claude/skills/systems-engineering/resources/configuration-management.md +998 -0
  243. package/.claude/skills/systems-engineering/resources/linux-administration.md +672 -0
  244. package/.claude/skills/systems-engineering/resources/networking-fundamentals.md +982 -0
  245. package/.claude/skills/systems-engineering/resources/performance-tuning.md +871 -0
  246. package/.claude/skills/systems-engineering/resources/powershell-scripting.md +482 -0
  247. package/.claude/skills/systems-engineering/resources/security-hardening.md +739 -0
  248. package/.claude/skills/systems-engineering/resources/shell-scripting.md +915 -0
  249. package/.claude/skills/systems-engineering/resources/storage-management.md +628 -0
  250. package/.claude/skills/systems-engineering/resources/system-monitoring.md +787 -0
  251. package/.claude/skills/systems-engineering/resources/troubleshooting-guide.md +753 -0
  252. package/.claude/skills/systems-engineering/resources/windows-administration.md +738 -0
  253. package/.claude/skills/technical-leadership/SKILL.md +728 -0
  254. package/CHANGELOG.md +90 -54
  255. package/README.md +94 -0
  256. package/backend/docs/SECRETS_DOCUMENTATION.md +327 -0
  257. package/backend/jest.config.js +59 -0
  258. package/backend/package-lock.json +6129 -0
  259. package/backend/package.json +16 -4
  260. package/backend/prisma/migrations/20251026104609_add_websocket_api/migration.sql +33 -0
  261. package/backend/prisma/schema.prisma +33 -0
  262. package/backend/src/__tests__/core/DependencyService.test.js +336 -0
  263. package/backend/src/__tests__/core/UserService.test.js +875 -0
  264. package/backend/src/__tests__/repositories/BaseRepository.test.js +146 -0
  265. package/backend/src/__tests__/repositories/BotRepository.test.js +118 -0
  266. package/backend/src/__tests__/repositories/CommandRepository.test.js +132 -0
  267. package/backend/src/__tests__/repositories/EventGraphRepository.test.js +93 -0
  268. package/backend/src/__tests__/repositories/GroupRepository.test.js +155 -0
  269. package/backend/src/__tests__/repositories/PermissionRepository.test.js +130 -0
  270. package/backend/src/__tests__/repositories/PluginRepository.test.js +107 -0
  271. package/backend/src/__tests__/repositories/ServerRepository.test.js +80 -0
  272. package/backend/src/__tests__/repositories/UserRepository.test.js +128 -0
  273. package/backend/src/__tests__/secretsFilter.test.js +425 -0
  274. package/backend/src/__tests__/services/BotLifecycleService.test.js +411 -0
  275. package/backend/src/__tests__/services/BotProcessManager.test.js +285 -0
  276. package/backend/src/__tests__/services/CacheManager.test.js +125 -0
  277. package/backend/src/__tests__/services/CommandExecutionService.test.js +460 -0
  278. package/backend/src/__tests__/services/ResourceMonitorService.test.js +207 -0
  279. package/backend/src/__tests__/services/TelemetryService.test.js +291 -0
  280. package/backend/src/__tests__/setup.js +25 -0
  281. package/backend/src/api/routes/apiKeys.js +181 -0
  282. package/backend/src/api/routes/bots.js +49 -7
  283. package/backend/src/api/routes/plugins.js +2 -1
  284. package/backend/src/api/routes/system.js +174 -0
  285. package/backend/src/container.js +82 -0
  286. package/backend/src/core/BotManager.js +142 -871
  287. package/backend/src/core/BotManager.old.js +1093 -0
  288. package/backend/src/core/BotProcess.js +1092 -858
  289. package/backend/src/core/EventGraphManager.js +280 -198
  290. package/backend/src/core/GraphExecutionEngine.js +321 -325
  291. package/backend/src/core/MessageQueue.js +27 -6
  292. package/backend/src/core/NodeRegistry.js +37 -1134
  293. package/backend/src/core/PluginManager.js +62 -12
  294. package/backend/src/core/PrismaService.js +32 -0
  295. package/backend/src/core/UserService.js +3 -3
  296. package/backend/src/core/__tests__/PrismaService.test.js +24 -0
  297. package/backend/src/core/commands/README.md +305 -0
  298. package/backend/src/core/commands/dev.js +13 -7
  299. package/backend/src/core/commands/ping.js +10 -4
  300. package/backend/src/core/commands/whois.js +63 -0
  301. package/backend/src/core/config/validation.js +27 -0
  302. package/backend/src/core/constants/graphTypes.js +21 -0
  303. package/backend/src/core/node-registries/actions.js +132 -0
  304. package/backend/src/core/node-registries/arrays.js +137 -0
  305. package/backend/src/core/node-registries/bot.js +23 -0
  306. package/backend/src/core/node-registries/data.js +290 -0
  307. package/backend/src/core/node-registries/debug.js +26 -0
  308. package/backend/src/core/node-registries/events.js +187 -0
  309. package/backend/src/core/node-registries/flow.js +139 -0
  310. package/backend/src/core/node-registries/logic.js +45 -0
  311. package/backend/src/core/node-registries/math.js +42 -0
  312. package/backend/src/core/node-registries/objects.js +98 -0
  313. package/backend/src/core/node-registries/strings.js +153 -0
  314. package/backend/src/core/node-registries/time.js +113 -0
  315. package/backend/src/core/node-registries/users.js +79 -0
  316. package/backend/src/core/nodes/{action_bot_look_at.js → actions/bot_look_at.js} +36 -36
  317. package/backend/src/core/nodes/{action_bot_set_variable.js → actions/bot_set_variable.js} +32 -32
  318. package/backend/src/core/nodes/{action_send_log.js → actions/send_log.js} +28 -23
  319. package/backend/src/core/nodes/{action_send_message.js → actions/send_message.js} +32 -32
  320. package/backend/src/core/nodes/actions/send_websocket_response.js +33 -0
  321. package/backend/src/core/nodes/arrays/get_next.js +35 -0
  322. package/backend/src/core/nodes/{data_cast.js → data/cast.js} +8 -0
  323. package/backend/src/core/nodes/data/datetime_literal.js +27 -0
  324. package/backend/src/core/nodes/data/entity_info.js +69 -0
  325. package/backend/src/core/nodes/data/get_nearby_entities.js +32 -0
  326. package/backend/src/core/nodes/data/get_nearby_players.js +64 -0
  327. package/backend/src/core/nodes/{data_get_user_field.js → data/get_user_field.js} +1 -1
  328. package/backend/src/core/nodes/data/type_check.js +53 -0
  329. package/backend/src/core/nodes/{debug_log.js → debug/log.js} +16 -16
  330. package/backend/src/core/nodes/{flow_branch.js → flow/branch.js} +15 -15
  331. package/backend/src/core/nodes/{flow_break.js → flow/break.js} +14 -14
  332. package/backend/src/core/nodes/flow/delay.js +43 -0
  333. package/backend/src/core/nodes/{flow_for_each.js → flow/for_each.js} +39 -39
  334. package/backend/src/core/nodes/{flow_sequence.js → flow/sequence.js} +16 -16
  335. package/backend/src/core/nodes/{flow_switch.js → flow/switch.js} +47 -47
  336. package/backend/src/core/nodes/{flow_while.js → flow/while.js} +1 -1
  337. package/backend/src/core/nodes/logic/__tests__/compare.test.js +83 -0
  338. package/backend/src/core/nodes/math/__tests__/operation.test.js +65 -0
  339. package/backend/src/core/nodes/strings/__tests__/concat.test.js +89 -0
  340. package/backend/src/core/nodes/time/__tests__/now.test.js +24 -0
  341. package/backend/src/core/nodes/time/add.js +33 -0
  342. package/backend/src/core/nodes/time/compare.js +35 -0
  343. package/backend/src/core/nodes/time/diff.js +29 -0
  344. package/backend/src/core/nodes/time/format.js +32 -0
  345. package/backend/src/core/nodes/time/now.js +18 -0
  346. package/backend/src/core/nodes/{user_check_blacklist.js → users/check_blacklist.js} +37 -37
  347. package/backend/src/core/nodes/{user_get_groups.js → users/get_groups.js} +36 -36
  348. package/backend/src/core/nodes/{user_get_permissions.js → users/get_permissions.js} +36 -36
  349. package/backend/src/core/nodes/{user_set_blacklist.js → users/set_blacklist.js} +37 -37
  350. package/backend/src/core/services/BotLifecycleService.js +596 -0
  351. package/backend/src/core/services/BotProcessManager.js +163 -0
  352. package/backend/src/core/services/CacheManager.js +111 -0
  353. package/backend/src/core/services/CommandExecutionService.js +351 -0
  354. package/backend/src/core/services/ResourceMonitorService.js +90 -0
  355. package/backend/src/core/services/TelemetryService.js +124 -0
  356. package/backend/src/core/services/ValidationService.js +132 -0
  357. package/backend/src/core/services/__tests__/ValidationService.test.js +148 -0
  358. package/backend/src/core/services.js +20 -5
  359. package/backend/src/core/system/CommandContext.js +84 -0
  360. package/backend/src/core/system/Transport.js +78 -0
  361. package/backend/src/core/utils/__tests__/jsonParser.test.js +44 -0
  362. package/backend/src/core/utils/jsonParser.js +18 -0
  363. package/backend/src/core/utils/secretsFilter.js +262 -0
  364. package/backend/src/core/utils/variableParser.js +89 -0
  365. package/backend/src/core/validation/__tests__/nodeSchemas.test.js +175 -0
  366. package/backend/src/core/validation/nodeSchemas.js +112 -0
  367. package/backend/src/lib/prisma.js +2 -4
  368. package/backend/src/real-time/botApi/handlers/commandHandlers.js +28 -0
  369. package/backend/src/real-time/botApi/handlers/graphHandlers.js +99 -0
  370. package/backend/src/real-time/botApi/handlers/graphWebSocketHandlers.js +147 -0
  371. package/backend/src/real-time/botApi/handlers/index.js +43 -0
  372. package/backend/src/real-time/botApi/handlers/messageHandlers.js +66 -0
  373. package/backend/src/real-time/botApi/handlers/statusHandlers.js +17 -0
  374. package/backend/src/real-time/botApi/handlers/userHandlers.js +141 -0
  375. package/backend/src/real-time/botApi/index.js +40 -0
  376. package/backend/src/real-time/botApi/middleware.js +79 -0
  377. package/backend/src/real-time/botApi/utils.js +54 -0
  378. package/backend/src/real-time/socketHandler.js +6 -2
  379. package/backend/src/repositories/BaseRepository.js +43 -0
  380. package/backend/src/repositories/BotRepository.js +42 -0
  381. package/backend/src/repositories/CommandRepository.js +53 -0
  382. package/backend/src/repositories/EventGraphRepository.js +40 -0
  383. package/backend/src/repositories/GroupRepository.js +69 -0
  384. package/backend/src/repositories/PermissionRepository.js +48 -0
  385. package/backend/src/repositories/PluginRepository.js +42 -0
  386. package/backend/src/repositories/ServerRepository.js +27 -0
  387. package/backend/src/repositories/UserRepository.js +48 -0
  388. package/backend/src/server.js +3 -0
  389. package/backend/src/test-refactor.js +85 -0
  390. package/frontend/dist/assets/index-CfTo92bP.css +1 -0
  391. package/frontend/dist/assets/index-CiFD5X9Z.js +8344 -0
  392. package/frontend/dist/index.html +2 -2
  393. package/frontend/package.json +0 -5
  394. package/package.json +2 -1
  395. package/frontend/dist/assets/index-B9GedHEa.js +0 -8352
  396. package/frontend/dist/assets/index-zLiy9MDx.css +0 -1
  397. package/nul +0 -0
  398. /package/backend/src/core/nodes/{action_http_request.js → actions/http_request.js} +0 -0
  399. /package/backend/src/core/nodes/{array_add_element.js → arrays/add_element.js} +0 -0
  400. /package/backend/src/core/nodes/{array_contains.js → arrays/contains.js} +0 -0
  401. /package/backend/src/core/nodes/{array_find_index.js → arrays/find_index.js} +0 -0
  402. /package/backend/src/core/nodes/{array_get_by_index.js → arrays/get_by_index.js} +0 -0
  403. /package/backend/src/core/nodes/{array_get_random_element.js → arrays/get_random_element.js} +0 -0
  404. /package/backend/src/core/nodes/{array_remove_by_index.js → arrays/remove_by_index.js} +0 -0
  405. /package/backend/src/core/nodes/{bot_get_position.js → bot/get_position.js} +0 -0
  406. /package/backend/src/core/nodes/{data_array_literal.js → data/array_literal.js} +0 -0
  407. /package/backend/src/core/nodes/{data_boolean_literal.js → data/boolean_literal.js} +0 -0
  408. /package/backend/src/core/nodes/{data_get_argument.js → data/get_argument.js} +0 -0
  409. /package/backend/src/core/nodes/{data_get_bot_look.js → data/get_bot_look.js} +0 -0
  410. /package/backend/src/core/nodes/{data_get_entity_field.js → data/get_entity_field.js} +0 -0
  411. /package/backend/src/core/nodes/{data_get_server_players.js → data/get_server_players.js} +0 -0
  412. /package/backend/src/core/nodes/{data_get_variable.js → data/get_variable.js} +0 -0
  413. /package/backend/src/core/nodes/{data_length.js → data/length.js} +0 -0
  414. /package/backend/src/core/nodes/{data_make_object.js → data/make_object.js} +0 -0
  415. /package/backend/src/core/nodes/{data_number_literal.js → data/number_literal.js} +0 -0
  416. /package/backend/src/core/nodes/{data_string_literal.js → data/string_literal.js} +0 -0
  417. /package/backend/src/core/nodes/{logic_compare.js → logic/compare.js} +0 -0
  418. /package/backend/src/core/nodes/{logic_operation.js → logic/operation.js} +0 -0
  419. /package/backend/src/core/nodes/{math_operation.js → math/operation.js} +0 -0
  420. /package/backend/src/core/nodes/{math_random_number.js → math/random_number.js} +0 -0
  421. /package/backend/src/core/nodes/{object_create.js → objects/create.js} +0 -0
  422. /package/backend/src/core/nodes/{object_delete.js → objects/delete.js} +0 -0
  423. /package/backend/src/core/nodes/{object_get.js → objects/get.js} +0 -0
  424. /package/backend/src/core/nodes/{object_has_key.js → objects/has_key.js} +0 -0
  425. /package/backend/src/core/nodes/{object_set.js → objects/set.js} +0 -0
  426. /package/backend/src/core/nodes/{string_concat.js → strings/concat.js} +0 -0
  427. /package/backend/src/core/nodes/{string_contains.js → strings/contains.js} +0 -0
  428. /package/backend/src/core/nodes/{string_ends_with.js → strings/ends_with.js} +0 -0
  429. /package/backend/src/core/nodes/{string_equals.js → strings/equals.js} +0 -0
  430. /package/backend/src/core/nodes/{string_length.js → strings/length.js} +0 -0
  431. /package/backend/src/core/nodes/{string_matches.js → strings/matches.js} +0 -0
  432. /package/backend/src/core/nodes/{string_split.js → strings/split.js} +0 -0
  433. /package/backend/src/core/nodes/{string_starts_with.js → strings/starts_with.js} +0 -0
@@ -0,0 +1,817 @@
1
+ # Engineering Operations Management Skill
2
+
3
+ **For managers running SRE, platform, and infrastructure teams - focusing on operations, on-call, incidents, and engineering metrics.**
4
+
5
+ > This skill helps engineering managers build sustainable operations practices, prevent burnout, run effective incident reviews, and measure what matters. Complements technical SRE skills with people and process management.
6
+
7
+ ---
8
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9
+ 🎯 SKILL ACTIVATED: engineering-operations-management
10
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
11
+
12
+
13
+ ## When to Use This Skill
14
+
15
+ **You're a manager who needs help with:**
16
+ - Structuring on-call rotations and compensation
17
+ - Preventing and addressing engineer burnout
18
+ - Running blameless postmortem reviews
19
+ - Negotiating SLOs with product teams
20
+ - Measuring engineering productivity (not vanity metrics)
21
+ - Managing toil and operational load
22
+ - Balancing feature work vs operational excellence
23
+ - Building sustainable operations culture
24
+
25
+ **This skill does NOT cover:**
26
+ - Hands-on incident response (see **sre** skill)
27
+ - Technical SLO/SLI implementation (see **sre** skill)
28
+ - Monitoring/observability setup (see **platform-engineering** skill)
29
+ - Team hiring and career development (see **engineering-management** skill)
30
+
31
+ ---
32
+
33
+ ## Core Areas
34
+
35
+ ### 1. On-Call Management
36
+
37
+ **Core principle:** On-call is a necessary operational burden that should be **fair, sustainable, and compensated**.
38
+
39
+ #### On-Call Rotation Models
40
+
41
+ **Follow-the-Sun (Recommended for global teams):**
42
+ ```
43
+ Americas Team: 6 AM - 2 PM PST (primary)
44
+ EMEA Team: 10 PM - 6 AM PST (primary)
45
+ APAC Team: 2 PM - 10 PM PST (primary)
46
+
47
+ Pros: No one wakes up at night, balanced load
48
+ Cons: Requires global team, handoff complexity
49
+ Best for: Teams with 15+ engineers across timezones
50
+ ```
51
+
52
+ **Weekly Rotation (Most common):**
53
+ ```
54
+ Week 1: Engineer A (primary), Engineer B (secondary)
55
+ Week 2: Engineer C (primary), Engineer D (secondary)
56
+ Week 3: Engineer E (primary), Engineer A (secondary)
57
+
58
+ Pros: Simple, fair rotation
59
+ Cons: Weekend coverage, potential burnout
60
+ Best for: Teams with 6-10 engineers in same timezone
61
+ ```
62
+
63
+ **Tiered Escalation:**
64
+ ```
65
+ Tier 1: Junior engineers (business hours only)
66
+ Tier 2: Senior engineers (24/7 primary)
67
+ Tier 3: Staff/Principal (escalation only)
68
+
69
+ Pros: Gradual responsibility increase
70
+ Cons: Can create "us vs them" dynamic
71
+ Best for: Large teams (15+) with clear skill levels
72
+ ```
73
+
74
+ #### On-Call Compensation Models
75
+
76
+ **Option 1: On-Call Stipend**
77
+ ```
78
+ Primary on-call: $500-$1,000/week
79
+ Secondary on-call: $250-$500/week
80
+
81
+ Pros: Predictable, simple
82
+ Cons: Doesn't account for actual pages
83
+ ```
84
+
85
+ **Option 2: Pay-per-Page**
86
+ ```
87
+ Business hours page: $50-$100
88
+ After-hours page: $150-$300
89
+ Weekend/holiday page: $300-$500
90
+
91
+ Pros: Fair - pays for actual disruption
92
+ Cons: Can incentivize ignoring issues
93
+ ```
94
+
95
+ **Option 3: Hybrid (Recommended)**
96
+ ```
97
+ Base stipend: $500/week
98
+ + $100 per after-hours page
99
+ + Comp time (1.5x hours worked after-hours)
100
+
101
+ Pros: Covers both availability and interruptions
102
+ Cons: More complex to administer
103
+ ```
104
+
105
+ **Comp time policies:**
106
+ - For every hour worked after-hours, grant 1.5 hours comp time
107
+ - Comp time must be used within 30 days
108
+ - Encourage engineers to take comp time the day after a major incident
109
+
110
+ #### Burnout Prevention
111
+
112
+ **Warning signs:**
113
+ - 🚨 Pages > 5 per week for extended period
114
+ - 🚨 Engineer mentions exhaustion, stress in 1-on-1s
115
+ - 🚨 Quality of work declining
116
+ - 🚨 Working late nights/weekends regularly
117
+ - 🚨 Cynicism, disengagement
118
+
119
+ **Interventions:**
120
+ 1. **Immediate:** Rotate off on-call for 2-4 weeks
121
+ 2. **Short-term:** Reduce project load, pair with senior engineer
122
+ 3. **Long-term:** Fix underlying system issues causing pages
123
+
124
+ **Sustainable on-call rules:**
125
+ - No engineer on-call more than 1 week per month
126
+ - Maximum 2 weeks on-call per quarter
127
+ - Mandatory break after high-page-volume week
128
+ - No on-call during PTO or major life events
129
+
130
+ #### On-Call Scenarios
131
+
132
+ **Scenario: "What's a fair after-hours pay model?"**
133
+ - **Hybrid model (recommended):**
134
+ - Base: $500/week on-call stipend
135
+ - Plus: $100-150 per after-hours page
136
+ - Plus: 1.5x comp time for hours worked
137
+ - **Example calculation:**
138
+ - Week stipend: $500
139
+ - 3 after-hours pages × $125 = $375
140
+ - 4 hours worked × 1.5 = 6 hours comp time
141
+ - **Total value:** $875 + 6 hours off
142
+
143
+ **Scenario: "Team blamed someone in incident review - how to fix?"**
144
+ - **Immediate:** Stop the review, reset the tone
145
+ - **Say:** "We don't blame people, we fix systems. Let's focus on what failed, not who."
146
+ - **Blameless culture principles:**
147
+ - People make reasonable decisions based on information available
148
+ - Systems should prevent single points of failure
149
+ - Focus on "what" not "who"
150
+ - **Follow-up:** Coach manager running review on blameless principles
151
+
152
+ **Scenario: "During incident, what should I do as manager?"**
153
+ - **Monitor:** Watch incident channel, don't interrupt
154
+ - **Support:** "What do you need? More people? Communication handled?"
155
+ - **Shield:** Handle exec questions, keep pressure off team
156
+ - **Don't:** Take over, second-guess, or ask "why" questions mid-incident
157
+ - **After:** Thank team, schedule postmortem, ensure comp time taken
158
+
159
+ **Scenario: "How do we track incident trends?"**
160
+ - **Metrics to track:**
161
+ - Incident frequency (per week/month)
162
+ - MTTR (mean time to recovery)
163
+ - Incidents by service/component
164
+ - Incidents by root cause category
165
+ - **Look for patterns:**
166
+ - Same service failing repeatedly → systemic issue
167
+ - MTTR increasing → lack of familiarity or tooling gaps
168
+ - Spike in incidents → recent deploy or infrastructure change
169
+ - **Action:** Address top 3 incident sources quarterly
170
+
171
+ **Scenario: "What incident communication plan do we need?"**
172
+ - **During incident:**
173
+ - Sev 1: Updates every 30 minutes to execs, status page every 15 min
174
+ - Sev 2: Updates every hour to stakeholders
175
+ - Sev 3: Update when resolved
176
+ - **Channels:**
177
+ - Internal: Dedicated Slack #incidents channel
178
+ - External: Status page (Statuspage.io, etc.)
179
+ - Executives: Email + Slack DM for Sev 1/2
180
+ - **Template:**
181
+ ```
182
+ [SEV 1] API Service Outage
183
+ Impact: All users unable to login
184
+ Status: Investigating
185
+ Next update: 2:30 PM (15 minutes)
186
+ ```
187
+
188
+ **Scenario: "What's the right retrospective format?"**
189
+ - **Timeline:** Within 48 hours of incident (while fresh)
190
+ - **Attendees:** Incident responders + anyone interested (open invitation)
191
+ - **Duration:** 45-60 minutes
192
+ - **Format:**
193
+ 1. Timeline walkthrough (10 min)
194
+ 2. What went well (10 min)
195
+ 3. What went poorly (15 min)
196
+ 4. Action items (15 min) - with owners and due dates
197
+ 5. Q&A (10 min)
198
+ - **Output:** Written postmortem + action items tracked
199
+
200
+ **Scenario: "How do we communicate incidents to executives?"**
201
+ - **During:** Brief, factual updates
202
+ - "API down, 100% of users affected, team investigating"
203
+ - **After:** Business-focused summary
204
+ - Revenue impact: "$50K in lost sales"
205
+ - User impact: "10K users couldn't check out for 2 hours"
206
+ - Prevention: "Adding rate limiting to prevent recurrence"
207
+ - **Avoid:** Deep technical details unless asked
208
+
209
+ ---
210
+
211
+ ### 2. Incident Management for Managers
212
+
213
+ **Your role as a manager during incidents:**
214
+
215
+ #### During the Incident (DO NOT take over unless critical)
216
+
217
+ ```
218
+ ✅ DO:
219
+ - Monitor incident channel, offer support
220
+ - Shield team from external pressure
221
+ - Bring in additional engineers if needed
222
+ - Coordinate with stakeholders (updates to execs)
223
+ - Order food if it's going long
224
+ - Take notes for postmortem
225
+
226
+ ❌ DON'T:
227
+ - Take over incident response (unless you're most qualified)
228
+ - Ask "why didn't you..." questions during incident
229
+ - Pressure for faster resolution
230
+ - Blame individuals
231
+ - Second-guess decisions being made
232
+ ```
233
+
234
+ **Incident Severity Levels (align with team):**
235
+
236
+ ```
237
+ Sev 1 (Critical):
238
+ ├── Complete service outage
239
+ ├── Data loss or security breach
240
+ ├── Revenue impact > $10K/hour
241
+ └── Response: All hands, exec updates every 30 min
242
+
243
+ Sev 2 (High):
244
+ ├── Major feature degraded
245
+ ├── Significant user impact
246
+ ├── Revenue impact > $1K/hour
247
+ └── Response: On-call + expert, updates every hour
248
+
249
+ Sev 3 (Medium):
250
+ ├── Minor feature degraded
251
+ ├── Limited user impact
252
+ └── Response: On-call handles, regular updates
253
+
254
+ Sev 4 (Low):
255
+ ├── Internal tooling issue
256
+ ├── No user impact
257
+ └── Response: Fix during business hours
258
+ ```
259
+
260
+ #### After the Incident: Blameless Postmortem
261
+
262
+ **Blameless postmortem framework:**
263
+
264
+ ```
265
+ Postmortem Template:
266
+
267
+ ## Incident Summary
268
+ - Date/Time: When did it happen?
269
+ - Duration: How long?
270
+ - Impact: Who was affected? How many users?
271
+ - Severity: Sev 1-4
272
+
273
+ ## Timeline
274
+ - 14:32 - First alert fired
275
+ - 14:35 - Engineer A acknowledged, began investigation
276
+ - 14:45 - Root cause identified (database connection pool exhausted)
277
+ - 15:00 - Mitigation applied (increased pool size)
278
+ - 15:15 - Service fully recovered
279
+
280
+ ## Root Cause
281
+ What actually caused this? (Technical, not "Engineer X did...")
282
+
283
+ ## What Went Well
284
+ - Alert fired within 2 minutes
285
+ - Communication was clear
286
+ - Rollback was smooth
287
+
288
+ ## What Went Poorly
289
+ - No automated mitigation
290
+ - Monitoring didn't catch early warning signs
291
+ - On-call engineer not familiar with this service
292
+
293
+ ## Action Items
294
+ 1. [P0] Add automated connection pool scaling (Owner: Alice, Due: 2 weeks)
295
+ 2. [P1] Improve monitoring for connection pool saturation (Owner: Bob, Due: 1 month)
296
+ 3. [P2] Add service to on-call training rotation (Owner: Manager, Due: 2 weeks)
297
+
298
+ ## Lessons Learned
299
+ - Database connection pool defaults are too conservative
300
+ - Need better pre-production load testing
301
+ ```
302
+
303
+ **Blameless postmortem meeting (45-60 min):**
304
+
305
+ ```
306
+ 1. Introduction (5 min)
307
+ └── Remind: This is blameless, focus on systems not people
308
+
309
+ 2. Timeline Review (15 min)
310
+ └── Walk through what happened, when
311
+
312
+ 3. Root Cause Analysis (15 min)
313
+ └── "Why did this happen?" (ask "why" 5 times)
314
+
315
+ 4. What Went Well / What Went Poorly (10 min)
316
+ └── Balanced reflection
317
+
318
+ 5. Action Items (10 min)
319
+ └── Specific, assigned, with due dates
320
+ └── Priority: P0 (this week), P1 (this month), P2 (nice to have)
321
+
322
+ 6. Close (5 min)
323
+ └── Thank the team, emphasize learning
324
+ ```
325
+
326
+ **Red flags in postmortems:**
327
+ - ❌ Blaming individuals ("Alice should have...")
328
+ - ❌ Vague action items ("Improve monitoring")
329
+ - ❌ No follow-up on action items
330
+ - ❌ Defensive posturing
331
+ - ❌ Skipping postmortems for "small" incidents
332
+
333
+ **Manager's job:** Enforce blameless culture, track action items, ensure learning.
334
+
335
+ ---
336
+
337
+ ### 3. SLO Negotiation with Product Teams
338
+
339
+ **The tension:** Product wants features fast. SRE/Platform wants stability. You balance both.
340
+
341
+ #### Understanding SLOs (Simple Version for Managers)
342
+
343
+ ```
344
+ SLI (Service Level Indicator):
345
+ What you measure (e.g., "API latency p99")
346
+
347
+ SLO (Service Level Objective):
348
+ Target for reliability (e.g., "API latency p99 < 500ms, 99.9% of the time")
349
+
350
+ SLA (Service Level Agreement):
351
+ Contractual promise to customers, set looser than the internal SLO (e.g., "99.5% uptime or we issue a refund")
352
+
353
+ Example:
354
+ SLI: Request success rate
355
+ SLO: 99.9% of requests succeed (internal target)
356
+ SLA: 99.5% uptime (customer-facing promise)
357
+ ```
358
+
359
+ **Error budget concept:**
360
+
361
+ ```
362
+ SLO: 99.9% availability = 0.1% allowed downtime
363
+
364
+ Per month (30 days):
365
+ ├── Total time: 43,200 minutes
366
+ ├── Allowed downtime: 43.2 minutes
367
+ └── Error budget: 43.2 minutes
368
+
369
+ If error budget exhausted:
370
+ ├── Freeze feature releases
371
+ ├── Focus on reliability improvements
372
+ └── Pay down tech debt
373
+ ```
374
+
375
+ #### SLO Negotiation Framework
376
+
377
+ **When product pushes for aggressive feature timeline:**
378
+
379
+ ```
380
+ Product: "We need to ship this feature in 2 weeks"
381
+
382
+ You (as manager):
383
+ "Let's check our error budget first. If we have budget, we can move fast.
384
+ If we're out of budget, we need to stabilize first."
385
+
386
+ Scenario 1: Error budget healthy (50% remaining)
387
+ ├── ✅ Green light for feature work
388
+ ├── 70% capacity on features
389
+ └── 30% on reliability
390
+
391
+ Scenario 2: Error budget exhausted (0% remaining)
392
+ ├── 🛑 Feature freeze
393
+ ├── 100% capacity on reliability
394
+ └── Resume features when budget recovers
395
+ ```
396
+
397
+ **How to set SLOs (practical guide):**
398
+
399
+ 1. **Start with current performance:**
400
+ - "Our API latency p99 is currently 300ms"
401
+ - Don't set SLO at 300ms - give yourself buffer
402
+
403
+ 2. **Set realistic target:**
404
+ - "Let's set SLO at p99 < 500ms"
405
+ - This gives 200ms buffer for growth/issues
406
+
407
+ 3. **Align with customer expectation:**
408
+ - "Customers complain if latency > 1s"
409
+ - SLO should prevent customer pain
410
+
411
+ 4. **Review quarterly:**
412
+ - Too easy? (Always meeting SLO) → Tighten SLO or invest in features
413
+ - Too hard? (Always missing SLO) → Loosen SLO or invest in reliability
414
+
415
+ **Common SLOs by service type:**
416
+
417
+ ```
418
+ API Services:
419
+ ├── Availability: 99.9% (43 min downtime/month)
420
+ ├── Latency p50: < 100ms
421
+ ├── Latency p99: < 500ms
422
+ └── Error rate: < 0.1%
423
+
424
+ Batch Processing:
425
+ ├── Job success rate: 99.5%
426
+ ├── Job completion time: < 4 hours
427
+ └── Data accuracy: 99.99%
428
+
429
+ Data Pipeline:
430
+ ├── Data freshness: < 15 min lag
431
+ ├── Pipeline availability: 99.9%
432
+ └── Data quality: 99.95%
433
+ ```
434
+
435
+ ---
436
+
437
+ ### 4. Engineering Metrics That Matter
438
+
439
+ **The problem:** Easy to measure vanity metrics. Hard to measure real productivity.
440
+
441
+ #### Vanity Metrics (Avoid)
442
+
443
+ ```
444
+ ❌ Lines of code written
445
+ ❌ Number of commits
446
+ ❌ Hours worked
447
+ ❌ Number of deploys (without context)
448
+ ❌ Ticket velocity (without quality)
449
+ ❌ Code coverage % (without context)
450
+ ```
451
+
452
+ **Why these are bad:**
453
+ - Lines of code: Good engineers often delete code
454
+ - Number of commits: Encourages small, meaningless commits
455
+ - Hours worked: Encourages burnout, not productivity
456
+ - Deploys without context: Could be hotfixes for bugs you introduced
457
+ - Ticket velocity: Encourages cherry-picking easy tickets
458
+ - Code coverage: Can write useless tests to hit %
459
+
460
+ #### Metrics That Actually Matter
461
+
462
+ **1. DORA Metrics (Use these)**
463
+
464
+ ```
465
+ Deployment Frequency:
466
+ ├── How often do you deploy to production?
467
+ ├── Elite: Multiple times per day
468
+ ├── High: Daily to weekly
469
+ ├── Medium: Weekly to monthly
470
+ └── Low: Monthly to every 6 months
471
+
472
+ Lead Time for Changes:
473
+ ├── How long from commit to production?
474
+ ├── Elite: < 1 hour
475
+ ├── High: 1 day to 1 week
476
+ ├── Medium: 1 week to 1 month
477
+ └── Low: 1 month to 6 months
478
+
479
+ Time to Restore Service:
480
+ ├── How long to recover from incident?
481
+ ├── Elite: < 1 hour
482
+ ├── High: < 1 day
483
+ ├── Medium: 1 day to 1 week
484
+ └── Low: > 1 week
485
+
486
+ Change Failure Rate:
487
+ ├── What % of changes cause incidents?
488
+ ├── Elite: 0-15%
489
+ ├── High: 16-30%
490
+ ├── Medium: 31-45%
491
+ └── Low: > 45%
492
+ ```
493
+
494
+ **How to use DORA metrics:**
495
+ - Track quarterly, not daily (avoid gaming)
496
+ - Trend over time (are we improving?)
497
+ - Compare to benchmarks (elite, high, medium, low)
498
+ - Use to identify improvement areas
499
+
500
+ **2. SRE Metrics**
501
+
502
+ ```
503
+ Toil Percentage:
504
+ ├── What % of engineer time is manual ops work?
505
+ ├── Target: < 30% toil
506
+ ├── Intervention needed: > 50% toil
507
+ └── Measure: Time tracking, surveys
508
+
509
+ On-Call Load:
510
+ ├── Pages per week per engineer
511
+ ├── Target: < 3 pages/week
512
+ ├── Intervention: > 5 pages/week
513
+ └── Measure: PagerDuty analytics
514
+
515
+ SLO Compliance:
516
+ ├── Are we meeting our SLOs?
517
+ ├── Target: 99%+ SLO compliance
518
+ └── Measure: Observability dashboards
519
+ ```
520
+
521
+ **3. Team Health Metrics**
522
+
523
+ ```
524
+ Engineer Satisfaction:
525
+ ├── Quarterly survey (1-10 scale)
526
+ ├── Questions: "Satisfied with work?", "Would recommend team?"
527
+ ├── Target: 8+ average
528
+ └── Red flag: < 6 average or declining trend
529
+
530
+ Retention Rate:
531
+ ├── % of engineers staying > 1 year
532
+ ├── Target: > 85% annual retention
533
+ └── Red flag: < 70% retention
534
+
535
+ Time to Productivity (New hires):
536
+ ├── How long until new hire is productive?
537
+ ├── Target: < 90 days
538
+ └── Measure: Manager assessment + self-assessment
539
+ ```
540
+
541
+ **4. Operational Excellence Metrics**
542
+
543
+ ```
544
+ Incident Trends:
545
+ ├── Number of Sev 1/2 incidents per month
546
+ ├── Target: Declining or stable
547
+ └── Red flag: Increasing trend
548
+
549
+ Postmortem Action Item Completion:
550
+ ├── % of action items completed on time
551
+ ├── Target: > 80% completion
552
+ └── Red flag: < 50% completion
553
+
554
+ Automated Test Coverage:
555
+ ├── % of critical paths covered
556
+ ├── Target: > 70% for critical paths
557
+ └── Not a vanity metric if focused on high-risk areas
558
+ ```
559
+
560
+ #### How to Present Metrics to Leadership
561
+
562
+ **Dashboard structure:**
563
+
564
+ ```
565
+ 1. Health at a Glance (Top metrics)
566
+ ├── 🟢 SLO Compliance: 99.8% (Target: 99%)
567
+ ├── 🟡 Deployment Frequency: 3x/week (Target: Daily)
568
+ ├── 🟢 Incident Rate: 2 Sev2 this month (Last month: 4)
569
+ └── 🟢 Team Satisfaction: 8.2/10 (Target: 8+)
570
+
571
+ 2. DORA Metrics Trend (Quarterly)
572
+ [Chart showing improvement over time]
573
+
574
+ 3. Focus Areas
575
+ ├── ✅ Reduced incident rate by 50% this quarter
576
+ ├── 🚧 Working on deployment frequency (automation initiative)
577
+ └── ⚠️ Toil still high at 40% - hiring 2 more engineers
578
+
579
+ 4. Asks
580
+ ├── Budget for observability tooling ($50K)
581
+ └── Approval to pause feature work next sprint for reliability
582
+ ```
583
+
584
+ ---
585
+
586
+ ### 5. Balancing Feature Work vs Operational Excellence
587
+
588
+ **The eternal tension:** Product wants features. You want stability.
589
+
590
+ #### Resource Allocation Models
591
+
592
+ **70-20-10 Rule (Recommended):**
593
+ ```
594
+ 70% Feature Work:
595
+ ├── New features product wants
596
+ ├── Customer-facing improvements
597
+ └── Revenue-generating projects
598
+
599
+ 20% Operational Excellence:
600
+ ├── Tech debt paydown
601
+ ├── Reliability improvements
602
+ ├── Monitoring enhancements
603
+ └── Automation
604
+
605
+ 10% Innovation/Learning:
606
+ ├── Explore new technologies
607
+ ├── Hackathons
608
+ ├── Learning time
609
+ └── Experimentation
610
+ ```
611
+
612
+ **Adjust based on phase:**
613
+
614
+ ```
615
+ High Growth Phase:
616
+ ├── 80% Features
617
+ ├── 15% Ops Excellence
618
+ └── 5% Innovation
619
+
620
+ Stability Phase:
621
+ ├── 50% Features
622
+ ├── 40% Ops Excellence
623
+ └── 10% Innovation
624
+
625
+ Crisis Phase (Post-Incidents):
626
+ ├── 30% Features
627
+ ├── 60% Ops Excellence
628
+ └── 10% Innovation
629
+ ```
630
+
631
+ #### Negotiating with Product
632
+
633
+ **When product asks for all-feature, no-ops time:**
634
+
635
+ ```
636
+ Scenario: "We need all engineers on Feature X for Q4"
637
+
638
+ Your response framework:
639
+ 1. Acknowledge business need
640
+ "I understand Feature X is critical for revenue"
641
+
642
+ 2. State operational reality
643
+ "Our on-call load is high (8 pages/week) and error budget is 80% exhausted"
644
+
645
+ 3. Present options
646
+ Option A: All-in on features, risk of incidents and burnout
647
+ Option B: 70-30 split, sustainable pace, less feature risk
648
+ Option C: Hire 2 more engineers to do both
649
+
650
+ 4. Recommend
651
+ "I recommend Option B - we'll deliver 70% of Feature X this quarter,
652
+ and ensure we don't have outages that impact customers"
653
+
654
+ 5. Make it their decision
655
+ "What's your preference given these trade-offs?"
656
+ ```
657
+
658
+ **Using error budgets as negotiation tool:**
659
+
660
+ ```
661
+ Error budget = objective metric, not subjective
662
+
663
+ If product wants to move fast:
664
+ ├── Check error budget: 50% remaining?
665
+ ├── ✅ Green light: "We have budget, let's ship!"
666
+ └── 🛑 Budget exhausted: "We need to stabilize first"
667
+
668
+ This removes emotion from discussion. It's data-driven.
669
+ ```
670
+
671
+ ---
672
+
673
+ ### 6. Building Sustainable Operations Culture
674
+
675
+ **Culture eats process for breakfast.**
676
+
677
+ #### Key Cultural Values
678
+
679
+ **1. Blameless Culture**
680
+ ```
681
+ When incidents happen:
682
+ ❌ "Who broke it?" → ✅ "What broke?"
683
+ ❌ "Why didn't you..." → ✅ "What can we learn?"
684
+ ❌ Hide mistakes → ✅ Share failures openly
685
+ ```
686
+
687
+ **2. Automate Toil**
688
+ ```
689
+ Manual work is not a badge of honor.
690
+ ├── Track toil percentage
691
+ ├── Reward automation, not heroics
692
+ └── "If you do it twice, automate it"
693
+ ```
694
+
695
+ **3. Sustainable On-Call**
696
+ ```
697
+ On-call is not punishment.
698
+ ├── Fair rotation
699
+ ├── Compensated fairly
700
+ ├── Protected from burnout
701
+ └── Escalation is encouraged, not weakness
702
+ ```
703
+
704
+ **4. Continuous Improvement**
705
+ ```
706
+ Every incident is a learning opportunity.
707
+ ├── Postmortems are required, not optional
708
+ ├── Action items are tracked and completed
709
+ └── Celebrate fixes, not just features
710
+ ```
711
+
712
+ #### Manager Actions to Reinforce Culture
713
+
714
+ **1. Lead by Example**
715
+ - Participate in on-call rotation (if you're technical)
716
+ - Admit your own mistakes publicly
717
+ - Take postmortem action items yourself
718
+
719
+ **2. Celebrate Operational Wins**
720
+ - Shout out engineers who reduce toil
721
+ - Highlight reliability improvements in team meetings
722
+ - Give "Operational Excellence" awards
723
+
724
+ **3. Protect Your Team**
725
+ - Say no to unrealistic timelines
726
+ - Push back on "just ship it" pressure
727
+ - Shield team from org politics
728
+
729
+ **4. Invest in Automation**
730
+ - Allocate 20% capacity to ops excellence
731
+ - Approve tool/platform budgets
732
+ - Hire for automation skills
733
+
734
+ ---
735
+
736
+ ## Quick Reference for Managers
737
+
738
+ **On-Call:**
739
+ - Rotation: Weekly or follow-the-sun
740
+ - Compensation: $500-$1,000/week + pay-per-page
741
+ - Burnout prevention: Max 1 week/month, comp time after incidents
742
+
743
+ **Incidents:**
744
+ - Your role: Support, don't take over
745
+ - Blameless postmortems: Required for all Sev 1/2
746
+ - Action items: Track and ensure completion
747
+
748
+ **SLOs:**
749
+ - Start with current performance + buffer
750
+ - Use error budgets to negotiate with product
751
+ - Review quarterly
752
+
753
+ **Metrics:**
754
+ - Use DORA metrics (deployment freq, lead time, MTTR, change failure rate)
755
+ - Avoid vanity metrics (lines of code, commits, hours)
756
+ - Track team health (satisfaction, retention)
757
+
758
+ **Resource Allocation:**
759
+ - 70% features, 20% ops excellence, 10% innovation
760
+ - Adjust based on phase (growth vs stability)
761
+
762
+ **Culture:**
763
+ - Blameless, automate toil, sustainable on-call
764
+ - Lead by example, celebrate ops wins
765
+ - Protect team from burnout
766
+
767
+ ### Culture Building Scenario
768
+
769
+ **Scenario: "How do we build a sustainable ops culture?"**
770
+ - **Blameless:**
771
+ - Never "who broke it?" Always "what broke and how do we prevent it?"
772
+ - Share postmortems openly - learn from all incidents
773
+ - Reward transparency (problems surfaced early) over hiding them (problems left to fester)
774
+ - **Automate toil:**
775
+ - Track toil percentage (target < 30%)
776
+ - Dedicate 20% time to automation
777
+ - Celebrate "we automated ourselves out of that problem"
778
+ - **Sustainable on-call:**
779
+ - No hero culture - don't celebrate all-nighters
780
+ - Enforce comp time and breaks
781
+ - Fix systems that cause repeated pages
782
+ - **Recognition:**
783
+ - Highlight ops wins in all-hands: "Automated X, saved 50 hours/month"
784
+ - Incident response recognition: "Great job handling outage calmly"
785
+ - Quality over speed: "Prevented incident with thorough testing"
786
+
787
+ **Scenario: "How do we prevent hero culture?"**
788
+ - **Heroes are a symptom of broken systems**
789
+ - **Signs of hero culture:**
790
+ - Same engineer always saves the day
791
+ - Working nights/weekends is celebrated
792
+ - "We need you" used as motivation
793
+ - **How to fix:**
794
+ - Document hero's knowledge → spread it
795
+ - Automate hero's manual tasks
796
+ - Create runbooks for common issues
797
+ - Rotate responsibilities - don't depend on one person
798
+ - **Say:** "I appreciate your dedication, but this is unsustainable. Let's fix the system so you don't need to be a hero."
799
+
800
+ ---
801
+
802
+ ## Integration with Other Skills
803
+
804
+ **This skill works with:**
805
+ - **engineering-management** - Hiring, career development, 1-on-1s
806
+ - **technical-leadership** - Making technical decisions, risk assessment
807
+ - **infrastructure-strategy** - Long-term planning, platform investment
808
+ - **budget-and-cost-management** - On-call budgets, tooling costs
809
+
810
+ **Technical skills your team uses:**
811
+ - **sre** - Hands-on SLO implementation, incident response
812
+ - **platform-engineering** - Building internal platforms that reduce toil
813
+ - **cybersecurity** - Security incident response, compliance
814
+
815
+ ---
816
+
817
+ **Remember:** Your job is to build sustainable operations practices that enable long-term success, not short-term heroics. Protect your team from burnout. Measure what matters. Learn from every incident.