blockmine 1.21.0 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/README.md +469 -0
- package/.claude/agents/auth-route-debugger.md +118 -0
- package/.claude/agents/auth-route-tester.md +93 -0
- package/.claude/agents/auto-error-resolver.md +97 -0
- package/.claude/agents/build-optimizer.md +236 -0
- package/.claude/agents/code-architecture-reviewer.md +83 -0
- package/.claude/agents/code-refactor-master.md +94 -0
- package/.claude/agents/cost-optimizer.md +134 -0
- package/.claude/agents/deployment-orchestrator.md +113 -0
- package/.claude/agents/documentation-architect.md +82 -0
- package/.claude/agents/frontend-error-fixer.md +77 -0
- package/.claude/agents/iac-code-generator.md +71 -0
- package/.claude/agents/incident-responder.md +346 -0
- package/.claude/agents/infrastructure-architect.md +31 -0
- package/.claude/agents/kubernetes-specialist.md +56 -0
- package/.claude/agents/migration-planner.md +181 -0
- package/.claude/agents/network-architect.md +196 -0
- package/.claude/agents/plan-reviewer.md +52 -0
- package/.claude/agents/refactor-planner.md +63 -0
- package/.claude/agents/security-scanner.md +102 -0
- package/.claude/agents/web-research-specialist.md +78 -0
- package/.claude/commands/cost-analysis.md +315 -0
- package/.claude/commands/dev-docs-update.md +55 -0
- package/.claude/commands/dev-docs.md +51 -0
- package/.claude/commands/incident-debug.md +247 -0
- package/.claude/commands/infra-plan.md +81 -0
- package/.claude/commands/migration-plan.md +478 -0
- package/.claude/commands/route-research-for-testing.md +37 -0
- package/.claude/commands/security-review.md +66 -0
- package/.claude/hooks/CONFIG.md +448 -0
- package/.claude/hooks/README.md +163 -0
- package/.claude/hooks/SKILL_ACTIVATION_COMPLETE.md +226 -0
- package/.claude/hooks/WINDOWS_HOOKS_README.md +151 -0
- package/.claude/hooks/add-skill-activation-banners.ts +132 -0
- package/.claude/hooks/comprehensive-skill-test.ts +1315 -0
- package/.claude/hooks/error-handling-reminder.sh +12 -0
- package/.claude/hooks/error-handling-reminder.ts +222 -0
- package/.claude/hooks/k8s-manifest-validator.sh +56 -0
- package/.claude/hooks/package-lock.json +556 -0
- package/.claude/hooks/package.json +16 -0
- package/.claude/hooks/post-tool-use-tracker.ps1 +174 -0
- package/.claude/hooks/post-tool-use-tracker.sh +183 -0
- package/.claude/hooks/security-policy-check.sh +247 -0
- package/.claude/hooks/skill-activation-prompt.ps1 +10 -0
- package/.claude/hooks/skill-activation-prompt.sh +10 -0
- package/.claude/hooks/skill-activation-prompt.ts +141 -0
- package/.claude/hooks/stop-build-check-enhanced.sh +130 -0
- package/.claude/hooks/terraform-validator.sh +53 -0
- package/.claude/hooks/test-input.json +7 -0
- package/.claude/hooks/test-skill-activation.ts +427 -0
- package/.claude/hooks/trigger-build-resolver.sh +79 -0
- package/.claude/hooks/tsc-check.sh +173 -0
- package/.claude/hooks/tsconfig.json +19 -0
- package/.claude/settings.json +55 -0
- package/.claude/settings.local.json +27 -14
- package/.claude/skills/README.md +507 -0
- package/.claude/skills/api-engineering/SKILL.md +63 -0
- package/.claude/skills/api-engineering/resources/api-versioning.md +88 -0
- package/.claude/skills/api-engineering/resources/graphql-patterns.md +106 -0
- package/.claude/skills/api-engineering/resources/rate-limiting.md +118 -0
- package/.claude/skills/api-engineering/resources/rest-api-design.md +105 -0
- package/.claude/skills/backend-dev-guidelines/SKILL.md +306 -0
- package/.claude/skills/backend-dev-guidelines/resources/architecture-overview.md +451 -0
- package/.claude/skills/backend-dev-guidelines/resources/async-and-errors.md +307 -0
- package/.claude/skills/backend-dev-guidelines/resources/complete-examples.md +638 -0
- package/.claude/skills/backend-dev-guidelines/resources/configuration.md +275 -0
- package/.claude/skills/backend-dev-guidelines/resources/database-patterns.md +224 -0
- package/.claude/skills/backend-dev-guidelines/resources/middleware-guide.md +213 -0
- package/.claude/skills/backend-dev-guidelines/resources/routing-and-controllers.md +756 -0
- package/.claude/skills/backend-dev-guidelines/resources/sentry-and-monitoring.md +336 -0
- package/.claude/skills/backend-dev-guidelines/resources/services-and-repositories.md +789 -0
- package/.claude/skills/backend-dev-guidelines/resources/testing-guide.md +235 -0
- package/.claude/skills/backend-dev-guidelines/resources/validation-patterns.md +754 -0
- package/.claude/skills/budget-and-cost-management/SKILL.md +850 -0
- package/.claude/skills/build-engineering/SKILL.md +431 -0
- package/.claude/skills/build-engineering/resources/artifact-repositories.md +72 -0
- package/.claude/skills/build-engineering/resources/build-caching.md +96 -0
- package/.claude/skills/build-engineering/resources/build-pipelines.md +105 -0
- package/.claude/skills/build-engineering/resources/build-security.md +95 -0
- package/.claude/skills/build-engineering/resources/build-systems.md +389 -0
- package/.claude/skills/build-engineering/resources/compilation-optimization.md +201 -0
- package/.claude/skills/build-engineering/resources/dependency-management.md +73 -0
- package/.claude/skills/build-engineering/resources/monorepo-builds.md +110 -0
- package/.claude/skills/build-engineering/resources/performance-optimization.md +113 -0
- package/.claude/skills/build-engineering/resources/reproducible-builds.md +82 -0
- package/.claude/skills/cloud-engineering/SKILL.md +675 -0
- package/.claude/skills/cloud-engineering/resources/aws-patterns.md +742 -0
- package/.claude/skills/cloud-engineering/resources/azure-patterns.md +714 -0
- package/.claude/skills/cloud-engineering/resources/cleared-cloud-environments.md +987 -0
- package/.claude/skills/cloud-engineering/resources/cloud-cost-optimization.md +757 -0
- package/.claude/skills/cloud-engineering/resources/cloud-networking.md +1058 -0
- package/.claude/skills/cloud-engineering/resources/cloud-security-tools.md +1530 -0
- package/.claude/skills/cloud-engineering/resources/cloud-security.md +990 -0
- package/.claude/skills/cloud-engineering/resources/gcp-patterns.md +758 -0
- package/.claude/skills/cloud-engineering/resources/migration-strategies.md +820 -0
- package/.claude/skills/cloud-engineering/resources/multi-cloud-strategies.md +670 -0
- package/.claude/skills/cloud-engineering/resources/oci-patterns.md +1198 -0
- package/.claude/skills/cloud-engineering/resources/serverless-patterns.md +795 -0
- package/.claude/skills/cloud-engineering/resources/well-architected-frameworks.md +966 -0
- package/.claude/skills/cybersecurity/SKILL.md +409 -0
- package/.claude/skills/cybersecurity/resources/security-architecture.md +266 -0
- package/.claude/skills/database-engineering/SKILL.md +61 -0
- package/.claude/skills/database-engineering/resources/backup-and-recovery.md +72 -0
- package/.claude/skills/database-engineering/resources/database-replication.md +63 -0
- package/.claude/skills/database-engineering/resources/postgresql-fundamentals.md +70 -0
- package/.claude/skills/database-engineering/resources/query-optimization.md +68 -0
- package/.claude/skills/devsecops/SKILL.md +374 -0
- package/.claude/skills/devsecops/resources/ci-cd-security.md +204 -0
- package/.claude/skills/devsecops/resources/compliance-automation.md +530 -0
- package/.claude/skills/devsecops/resources/compliance-frameworks.md +2322 -0
- package/.claude/skills/devsecops/resources/container-security.md +915 -0
- package/.claude/skills/devsecops/resources/cspm-integration.md +1440 -0
- package/.claude/skills/devsecops/resources/policy-enforcement.md +619 -0
- package/.claude/skills/devsecops/resources/secrets-management.md +755 -0
- package/.claude/skills/devsecops/resources/security-monitoring.md +146 -0
- package/.claude/skills/devsecops/resources/security-scanning.md +887 -0
- package/.claude/skills/devsecops/resources/security-testing.md +203 -0
- package/.claude/skills/devsecops/resources/supply-chain-security.md +518 -0
- package/.claude/skills/devsecops/resources/vulnerability-management.md +481 -0
- package/.claude/skills/devsecops/resources/zero-trust-architecture.md +177 -0
- package/.claude/skills/documentation-as-code/SKILL.md +323 -0
- package/.claude/skills/documentation-as-code/resources/api-documentation.md +90 -0
- package/.claude/skills/documentation-as-code/resources/changelog-management.md +79 -0
- package/.claude/skills/documentation-as-code/resources/diagram-generation.md +44 -0
- package/.claude/skills/documentation-as-code/resources/docs-as-code-workflow.md +99 -0
- package/.claude/skills/documentation-as-code/resources/documentation-automation.md +68 -0
- package/.claude/skills/documentation-as-code/resources/documentation-sites.md +79 -0
- package/.claude/skills/documentation-as-code/resources/markdown-best-practices.md +162 -0
- package/.claude/skills/documentation-as-code/resources/openapi-specification.md +77 -0
- package/.claude/skills/documentation-as-code/resources/readme-engineering.md +60 -0
- package/.claude/skills/documentation-as-code/resources/technical-writing-guide.md +202 -0
- package/.claude/skills/engineering-management/SKILL.md +356 -0
- package/.claude/skills/engineering-management/resources/career-ladders.md +609 -0
- package/.claude/skills/engineering-management/resources/hiring-and-assessment.md +555 -0
- package/.claude/skills/engineering-management/resources/one-on-one-guides.md +609 -0
- package/.claude/skills/engineering-management/resources/resource-planning.md +557 -0
- package/.claude/skills/engineering-management/resources/team-organization-patterns.md +491 -0
- package/.claude/skills/engineering-management/resources/technical-interviews.md +474 -0
- package/.claude/skills/engineering-operations-management/SKILL.md +817 -0
- package/.claude/skills/error-tracking/SKILL.md +379 -0
- package/.claude/skills/frontend-dev-guidelines/SKILL.md +403 -0
- package/.claude/skills/frontend-dev-guidelines/resources/common-patterns.md +331 -0
- package/.claude/skills/frontend-dev-guidelines/resources/complete-examples.md +872 -0
- package/.claude/skills/frontend-dev-guidelines/resources/component-patterns.md +502 -0
- package/.claude/skills/frontend-dev-guidelines/resources/data-fetching.md +767 -0
- package/.claude/skills/frontend-dev-guidelines/resources/file-organization.md +502 -0
- package/.claude/skills/frontend-dev-guidelines/resources/loading-and-error-states.md +501 -0
- package/.claude/skills/frontend-dev-guidelines/resources/performance.md +406 -0
- package/.claude/skills/frontend-dev-guidelines/resources/routing-guide.md +364 -0
- package/.claude/skills/frontend-dev-guidelines/resources/styling-guide.md +428 -0
- package/.claude/skills/frontend-dev-guidelines/resources/typescript-standards.md +418 -0
- package/.claude/skills/general-it-engineering/SKILL.md +393 -0
- package/.claude/skills/general-it-engineering/resources/asset-management.md +712 -0
- package/.claude/skills/general-it-engineering/resources/automation-orchestration.md +817 -0
- package/.claude/skills/general-it-engineering/resources/business-continuity.md +786 -0
- package/.claude/skills/general-it-engineering/resources/change-management.md +715 -0
- package/.claude/skills/general-it-engineering/resources/enterprise-monitoring.md +729 -0
- package/.claude/skills/general-it-engineering/resources/help-desk-operations.md +738 -0
- package/.claude/skills/general-it-engineering/resources/incident-service-management.md +834 -0
- package/.claude/skills/general-it-engineering/resources/it-governance.md +753 -0
- package/.claude/skills/general-it-engineering/resources/itil-framework.md +503 -0
- package/.claude/skills/general-it-engineering/resources/service-management.md +669 -0
- package/.claude/skills/infrastructure-architecture/SKILL.md +328 -0
- package/.claude/skills/infrastructure-architecture/resources/architecture-decision-records.md +505 -0
- package/.claude/skills/infrastructure-architecture/resources/architecture-patterns.md +528 -0
- package/.claude/skills/infrastructure-architecture/resources/capacity-planning.md +453 -0
- package/.claude/skills/infrastructure-architecture/resources/cleared-environment-architecture.md +773 -0
- package/.claude/skills/infrastructure-architecture/resources/cost-architecture.md +499 -0
- package/.claude/skills/infrastructure-architecture/resources/data-architecture.md +501 -0
- package/.claude/skills/infrastructure-architecture/resources/disaster-recovery.md +535 -0
- package/.claude/skills/infrastructure-architecture/resources/migration-architecture.md +512 -0
- package/.claude/skills/infrastructure-architecture/resources/multi-region-design.md +608 -0
- package/.claude/skills/infrastructure-architecture/resources/reference-architectures.md +562 -0
- package/.claude/skills/infrastructure-architecture/resources/security-architecture.md +538 -0
- package/.claude/skills/infrastructure-architecture/resources/system-design-principles.md +489 -0
- package/.claude/skills/infrastructure-architecture/resources/workload-classification.md +1000 -0
- package/.claude/skills/infrastructure-strategy/SKILL.md +924 -0
- package/.claude/skills/network-engineering/SKILL.md +385 -0
- package/.claude/skills/network-engineering/resources/dns-management.md +738 -0
- package/.claude/skills/network-engineering/resources/load-balancing.md +820 -0
- package/.claude/skills/network-engineering/resources/network-architecture.md +546 -0
- package/.claude/skills/network-engineering/resources/network-security.md +921 -0
- package/.claude/skills/network-engineering/resources/network-troubleshooting.md +749 -0
- package/.claude/skills/network-engineering/resources/routing-switching.md +373 -0
- package/.claude/skills/network-engineering/resources/sdn-networking.md +695 -0
- package/.claude/skills/network-engineering/resources/service-mesh-networking.md +777 -0
- package/.claude/skills/network-engineering/resources/tcp-ip-protocols.md +444 -0
- package/.claude/skills/network-engineering/resources/vpn-connectivity.md +672 -0
- package/.claude/skills/observability-engineering/SKILL.md +101 -0
- package/.claude/skills/observability-engineering/resources/apm-tools.md +97 -0
- package/.claude/skills/observability-engineering/resources/correlation-strategies.md +87 -0
- package/.claude/skills/observability-engineering/resources/distributed-tracing.md +98 -0
- package/.claude/skills/observability-engineering/resources/logs-aggregation.md +118 -0
- package/.claude/skills/observability-engineering/resources/observability-cost-optimization.md +141 -0
- package/.claude/skills/observability-engineering/resources/opentelemetry.md +110 -0
- package/.claude/skills/platform-engineering/SKILL.md +555 -0
- package/.claude/skills/platform-engineering/resources/architecture-overview.md +600 -0
- package/.claude/skills/platform-engineering/resources/container-orchestration.md +916 -0
- package/.claude/skills/platform-engineering/resources/cost-optimization.md +634 -0
- package/.claude/skills/platform-engineering/resources/developer-platforms.md +670 -0
- package/.claude/skills/platform-engineering/resources/gitops-automation.md +650 -0
- package/.claude/skills/platform-engineering/resources/infrastructure-as-code.md +778 -0
- package/.claude/skills/platform-engineering/resources/infrastructure-standards.md +708 -0
- package/.claude/skills/platform-engineering/resources/multi-tenancy.md +602 -0
- package/.claude/skills/platform-engineering/resources/platform-security.md +711 -0
- package/.claude/skills/platform-engineering/resources/resource-management.md +592 -0
- package/.claude/skills/platform-engineering/resources/service-mesh.md +628 -0
- package/.claude/skills/release-engineering/SKILL.md +393 -0
- package/.claude/skills/release-engineering/resources/artifact-management.md +108 -0
- package/.claude/skills/release-engineering/resources/build-optimization.md +84 -0
- package/.claude/skills/release-engineering/resources/ci-cd-pipelines.md +411 -0
- package/.claude/skills/release-engineering/resources/deployment-strategies.md +197 -0
- package/.claude/skills/release-engineering/resources/pipeline-security.md +62 -0
- package/.claude/skills/release-engineering/resources/progressive-delivery.md +83 -0
- package/.claude/skills/release-engineering/resources/release-automation.md +68 -0
- package/.claude/skills/release-engineering/resources/release-orchestration.md +77 -0
- package/.claude/skills/release-engineering/resources/rollback-strategies.md +66 -0
- package/.claude/skills/release-engineering/resources/versioning-strategies.md +59 -0
- package/.claude/skills/route-tester/SKILL.md +392 -0
- package/.claude/skills/skill-developer/ADVANCED.md +197 -0
- package/.claude/skills/skill-developer/HOOK_MECHANISMS.md +306 -0
- package/.claude/skills/skill-developer/PATTERNS_LIBRARY.md +152 -0
- package/.claude/skills/skill-developer/SKILL.md +430 -0
- package/.claude/skills/skill-developer/SKILL_RULES_REFERENCE.md +315 -0
- package/.claude/skills/skill-developer/TRIGGER_TYPES.md +305 -0
- package/.claude/skills/skill-developer/TROUBLESHOOTING.md +514 -0
- package/.claude/skills/skill-rules.json +2940 -0
- package/.claude/skills/sre/SKILL.md +464 -0
- package/.claude/skills/sre/resources/alerting-best-practices.md +282 -0
- package/.claude/skills/sre/resources/capacity-planning.md +226 -0
- package/.claude/skills/sre/resources/chaos-engineering.md +193 -0
- package/.claude/skills/sre/resources/disaster-recovery.md +232 -0
- package/.claude/skills/sre/resources/incident-management.md +436 -0
- package/.claude/skills/sre/resources/observability-stack.md +240 -0
- package/.claude/skills/sre/resources/on-call-runbooks.md +167 -0
- package/.claude/skills/sre/resources/performance-optimization.md +108 -0
- package/.claude/skills/sre/resources/reliability-patterns.md +183 -0
- package/.claude/skills/sre/resources/slo-sli-sla.md +464 -0
- package/.claude/skills/sre/resources/toil-reduction.md +145 -0
- package/.claude/skills/systems-engineering/SKILL.md +648 -0
- package/.claude/skills/systems-engineering/resources/automation-patterns.md +771 -0
- package/.claude/skills/systems-engineering/resources/configuration-management.md +998 -0
- package/.claude/skills/systems-engineering/resources/linux-administration.md +672 -0
- package/.claude/skills/systems-engineering/resources/networking-fundamentals.md +982 -0
- package/.claude/skills/systems-engineering/resources/performance-tuning.md +871 -0
- package/.claude/skills/systems-engineering/resources/powershell-scripting.md +482 -0
- package/.claude/skills/systems-engineering/resources/security-hardening.md +739 -0
- package/.claude/skills/systems-engineering/resources/shell-scripting.md +915 -0
- package/.claude/skills/systems-engineering/resources/storage-management.md +628 -0
- package/.claude/skills/systems-engineering/resources/system-monitoring.md +787 -0
- package/.claude/skills/systems-engineering/resources/troubleshooting-guide.md +753 -0
- package/.claude/skills/systems-engineering/resources/windows-administration.md +738 -0
- package/.claude/skills/technical-leadership/SKILL.md +728 -0
- package/CHANGELOG.md +90 -54
- package/README.md +94 -0
- package/backend/docs/SECRETS_DOCUMENTATION.md +327 -0
- package/backend/jest.config.js +59 -0
- package/backend/package-lock.json +6129 -0
- package/backend/package.json +16 -4
- package/backend/prisma/migrations/20251026104609_add_websocket_api/migration.sql +33 -0
- package/backend/prisma/schema.prisma +33 -0
- package/backend/src/__tests__/core/DependencyService.test.js +336 -0
- package/backend/src/__tests__/core/UserService.test.js +875 -0
- package/backend/src/__tests__/repositories/BaseRepository.test.js +146 -0
- package/backend/src/__tests__/repositories/BotRepository.test.js +118 -0
- package/backend/src/__tests__/repositories/CommandRepository.test.js +132 -0
- package/backend/src/__tests__/repositories/EventGraphRepository.test.js +93 -0
- package/backend/src/__tests__/repositories/GroupRepository.test.js +155 -0
- package/backend/src/__tests__/repositories/PermissionRepository.test.js +130 -0
- package/backend/src/__tests__/repositories/PluginRepository.test.js +107 -0
- package/backend/src/__tests__/repositories/ServerRepository.test.js +80 -0
- package/backend/src/__tests__/repositories/UserRepository.test.js +128 -0
- package/backend/src/__tests__/secretsFilter.test.js +425 -0
- package/backend/src/__tests__/services/BotLifecycleService.test.js +411 -0
- package/backend/src/__tests__/services/BotProcessManager.test.js +285 -0
- package/backend/src/__tests__/services/CacheManager.test.js +125 -0
- package/backend/src/__tests__/services/CommandExecutionService.test.js +460 -0
- package/backend/src/__tests__/services/ResourceMonitorService.test.js +207 -0
- package/backend/src/__tests__/services/TelemetryService.test.js +291 -0
- package/backend/src/__tests__/setup.js +25 -0
- package/backend/src/api/routes/apiKeys.js +181 -0
- package/backend/src/api/routes/bots.js +49 -7
- package/backend/src/api/routes/plugins.js +2 -1
- package/backend/src/api/routes/system.js +174 -0
- package/backend/src/container.js +82 -0
- package/backend/src/core/BotManager.js +142 -871
- package/backend/src/core/BotManager.old.js +1093 -0
- package/backend/src/core/BotProcess.js +1092 -858
- package/backend/src/core/EventGraphManager.js +280 -198
- package/backend/src/core/GraphExecutionEngine.js +321 -325
- package/backend/src/core/MessageQueue.js +27 -6
- package/backend/src/core/NodeRegistry.js +37 -1134
- package/backend/src/core/PluginManager.js +62 -12
- package/backend/src/core/PrismaService.js +32 -0
- package/backend/src/core/UserService.js +3 -3
- package/backend/src/core/__tests__/PrismaService.test.js +24 -0
- package/backend/src/core/commands/README.md +305 -0
- package/backend/src/core/commands/dev.js +13 -7
- package/backend/src/core/commands/ping.js +10 -4
- package/backend/src/core/commands/whois.js +63 -0
- package/backend/src/core/config/validation.js +27 -0
- package/backend/src/core/constants/graphTypes.js +21 -0
- package/backend/src/core/node-registries/actions.js +132 -0
- package/backend/src/core/node-registries/arrays.js +137 -0
- package/backend/src/core/node-registries/bot.js +23 -0
- package/backend/src/core/node-registries/data.js +290 -0
- package/backend/src/core/node-registries/debug.js +26 -0
- package/backend/src/core/node-registries/events.js +187 -0
- package/backend/src/core/node-registries/flow.js +139 -0
- package/backend/src/core/node-registries/logic.js +45 -0
- package/backend/src/core/node-registries/math.js +42 -0
- package/backend/src/core/node-registries/objects.js +98 -0
- package/backend/src/core/node-registries/strings.js +153 -0
- package/backend/src/core/node-registries/time.js +113 -0
- package/backend/src/core/node-registries/users.js +79 -0
- package/backend/src/core/nodes/{action_bot_look_at.js → actions/bot_look_at.js} +36 -36
- package/backend/src/core/nodes/{action_bot_set_variable.js → actions/bot_set_variable.js} +32 -32
- package/backend/src/core/nodes/{action_send_log.js → actions/send_log.js} +28 -23
- package/backend/src/core/nodes/{action_send_message.js → actions/send_message.js} +32 -32
- package/backend/src/core/nodes/actions/send_websocket_response.js +33 -0
- package/backend/src/core/nodes/arrays/get_next.js +35 -0
- package/backend/src/core/nodes/{data_cast.js → data/cast.js} +8 -0
- package/backend/src/core/nodes/data/datetime_literal.js +27 -0
- package/backend/src/core/nodes/data/entity_info.js +69 -0
- package/backend/src/core/nodes/data/get_nearby_entities.js +32 -0
- package/backend/src/core/nodes/data/get_nearby_players.js +64 -0
- package/backend/src/core/nodes/{data_get_user_field.js → data/get_user_field.js} +1 -1
- package/backend/src/core/nodes/data/type_check.js +53 -0
- package/backend/src/core/nodes/{debug_log.js → debug/log.js} +16 -16
- package/backend/src/core/nodes/{flow_branch.js → flow/branch.js} +15 -15
- package/backend/src/core/nodes/{flow_break.js → flow/break.js} +14 -14
- package/backend/src/core/nodes/flow/delay.js +43 -0
- package/backend/src/core/nodes/{flow_for_each.js → flow/for_each.js} +39 -39
- package/backend/src/core/nodes/{flow_sequence.js → flow/sequence.js} +16 -16
- package/backend/src/core/nodes/{flow_switch.js → flow/switch.js} +47 -47
- package/backend/src/core/nodes/{flow_while.js → flow/while.js} +1 -1
- package/backend/src/core/nodes/logic/__tests__/compare.test.js +83 -0
- package/backend/src/core/nodes/math/__tests__/operation.test.js +65 -0
- package/backend/src/core/nodes/strings/__tests__/concat.test.js +89 -0
- package/backend/src/core/nodes/time/__tests__/now.test.js +24 -0
- package/backend/src/core/nodes/time/add.js +33 -0
- package/backend/src/core/nodes/time/compare.js +35 -0
- package/backend/src/core/nodes/time/diff.js +29 -0
- package/backend/src/core/nodes/time/format.js +32 -0
- package/backend/src/core/nodes/time/now.js +18 -0
- package/backend/src/core/nodes/{user_check_blacklist.js → users/check_blacklist.js} +37 -37
- package/backend/src/core/nodes/{user_get_groups.js → users/get_groups.js} +36 -36
- package/backend/src/core/nodes/{user_get_permissions.js → users/get_permissions.js} +36 -36
- package/backend/src/core/nodes/{user_set_blacklist.js → users/set_blacklist.js} +37 -37
- package/backend/src/core/services/BotLifecycleService.js +596 -0
- package/backend/src/core/services/BotProcessManager.js +163 -0
- package/backend/src/core/services/CacheManager.js +111 -0
- package/backend/src/core/services/CommandExecutionService.js +351 -0
- package/backend/src/core/services/ResourceMonitorService.js +90 -0
- package/backend/src/core/services/TelemetryService.js +124 -0
- package/backend/src/core/services/ValidationService.js +132 -0
- package/backend/src/core/services/__tests__/ValidationService.test.js +148 -0
- package/backend/src/core/services.js +20 -5
- package/backend/src/core/system/CommandContext.js +84 -0
- package/backend/src/core/system/Transport.js +78 -0
- package/backend/src/core/utils/__tests__/jsonParser.test.js +44 -0
- package/backend/src/core/utils/jsonParser.js +18 -0
- package/backend/src/core/utils/secretsFilter.js +262 -0
- package/backend/src/core/utils/variableParser.js +89 -0
- package/backend/src/core/validation/__tests__/nodeSchemas.test.js +175 -0
- package/backend/src/core/validation/nodeSchemas.js +112 -0
- package/backend/src/lib/prisma.js +2 -4
- package/backend/src/real-time/botApi/handlers/commandHandlers.js +28 -0
- package/backend/src/real-time/botApi/handlers/graphHandlers.js +99 -0
- package/backend/src/real-time/botApi/handlers/graphWebSocketHandlers.js +147 -0
- package/backend/src/real-time/botApi/handlers/index.js +43 -0
- package/backend/src/real-time/botApi/handlers/messageHandlers.js +66 -0
- package/backend/src/real-time/botApi/handlers/statusHandlers.js +17 -0
- package/backend/src/real-time/botApi/handlers/userHandlers.js +141 -0
- package/backend/src/real-time/botApi/index.js +40 -0
- package/backend/src/real-time/botApi/middleware.js +79 -0
- package/backend/src/real-time/botApi/utils.js +54 -0
- package/backend/src/real-time/socketHandler.js +6 -2
- package/backend/src/repositories/BaseRepository.js +43 -0
- package/backend/src/repositories/BotRepository.js +42 -0
- package/backend/src/repositories/CommandRepository.js +53 -0
- package/backend/src/repositories/EventGraphRepository.js +40 -0
- package/backend/src/repositories/GroupRepository.js +69 -0
- package/backend/src/repositories/PermissionRepository.js +48 -0
- package/backend/src/repositories/PluginRepository.js +42 -0
- package/backend/src/repositories/ServerRepository.js +27 -0
- package/backend/src/repositories/UserRepository.js +48 -0
- package/backend/src/server.js +3 -0
- package/backend/src/test-refactor.js +85 -0
- package/frontend/dist/assets/index-CfTo92bP.css +1 -0
- package/frontend/dist/assets/index-CiFD5X9Z.js +8344 -0
- package/frontend/dist/index.html +2 -2
- package/frontend/package.json +0 -5
- package/package.json +2 -1
- package/frontend/dist/assets/index-B9GedHEa.js +0 -8352
- package/frontend/dist/assets/index-zLiy9MDx.css +0 -1
- package/nul +0 -0
- /package/backend/src/core/nodes/{action_http_request.js → actions/http_request.js} +0 -0
- /package/backend/src/core/nodes/{array_add_element.js → arrays/add_element.js} +0 -0
- /package/backend/src/core/nodes/{array_contains.js → arrays/contains.js} +0 -0
- /package/backend/src/core/nodes/{array_find_index.js → arrays/find_index.js} +0 -0
- /package/backend/src/core/nodes/{array_get_by_index.js → arrays/get_by_index.js} +0 -0
- /package/backend/src/core/nodes/{array_get_random_element.js → arrays/get_random_element.js} +0 -0
- /package/backend/src/core/nodes/{array_remove_by_index.js → arrays/remove_by_index.js} +0 -0
- /package/backend/src/core/nodes/{bot_get_position.js → bot/get_position.js} +0 -0
- /package/backend/src/core/nodes/{data_array_literal.js → data/array_literal.js} +0 -0
- /package/backend/src/core/nodes/{data_boolean_literal.js → data/boolean_literal.js} +0 -0
- /package/backend/src/core/nodes/{data_get_argument.js → data/get_argument.js} +0 -0
- /package/backend/src/core/nodes/{data_get_bot_look.js → data/get_bot_look.js} +0 -0
- /package/backend/src/core/nodes/{data_get_entity_field.js → data/get_entity_field.js} +0 -0
- /package/backend/src/core/nodes/{data_get_server_players.js → data/get_server_players.js} +0 -0
- /package/backend/src/core/nodes/{data_get_variable.js → data/get_variable.js} +0 -0
- /package/backend/src/core/nodes/{data_length.js → data/length.js} +0 -0
- /package/backend/src/core/nodes/{data_make_object.js → data/make_object.js} +0 -0
- /package/backend/src/core/nodes/{data_number_literal.js → data/number_literal.js} +0 -0
- /package/backend/src/core/nodes/{data_string_literal.js → data/string_literal.js} +0 -0
- /package/backend/src/core/nodes/{logic_compare.js → logic/compare.js} +0 -0
- /package/backend/src/core/nodes/{logic_operation.js → logic/operation.js} +0 -0
- /package/backend/src/core/nodes/{math_operation.js → math/operation.js} +0 -0
- /package/backend/src/core/nodes/{math_random_number.js → math/random_number.js} +0 -0
- /package/backend/src/core/nodes/{object_create.js → objects/create.js} +0 -0
- /package/backend/src/core/nodes/{object_delete.js → objects/delete.js} +0 -0
- /package/backend/src/core/nodes/{object_get.js → objects/get.js} +0 -0
- /package/backend/src/core/nodes/{object_has_key.js → objects/has_key.js} +0 -0
- /package/backend/src/core/nodes/{object_set.js → objects/set.js} +0 -0
- /package/backend/src/core/nodes/{string_concat.js → strings/concat.js} +0 -0
- /package/backend/src/core/nodes/{string_contains.js → strings/contains.js} +0 -0
- /package/backend/src/core/nodes/{string_ends_with.js → strings/ends_with.js} +0 -0
- /package/backend/src/core/nodes/{string_equals.js → strings/equals.js} +0 -0
- /package/backend/src/core/nodes/{string_length.js → strings/length.js} +0 -0
- /package/backend/src/core/nodes/{string_matches.js → strings/matches.js} +0 -0
- /package/backend/src/core/nodes/{string_split.js → strings/split.js} +0 -0
- /package/backend/src/core/nodes/{string_starts_with.js → strings/starts_with.js} +0 -0
|
@@ -0,0 +1,817 @@
|
|
|
1
|
+
# Engineering Operations Management Skill
|
|
2
|
+
|
|
3
|
+
**For managers running SRE, platform, and infrastructure teams - focusing on operations, on-call, incidents, and engineering metrics.**
|
|
4
|
+
|
|
5
|
+
> This skill helps engineering managers build sustainable operations practices, prevent burnout, run effective incident reviews, and measure what matters. Complements technical SRE skills with people and process management.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
9
|
+
🎯 SKILL ACTIVATED: engineering-operations-management
|
|
10
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
**You're a manager who needs help with:**
|
|
16
|
+
- Structuring on-call rotations and compensation
|
|
17
|
+
- Preventing and addressing engineer burnout
|
|
18
|
+
- Running blameless postmortem reviews
|
|
19
|
+
- Negotiating SLOs with product teams
|
|
20
|
+
- Measuring engineering productivity (not vanity metrics)
|
|
21
|
+
- Managing toil and operational load
|
|
22
|
+
- Balancing feature work vs operational excellence
|
|
23
|
+
- Building sustainable operations culture
|
|
24
|
+
|
|
25
|
+
**This skill does NOT cover:**
|
|
26
|
+
- Hands-on incident response (see **sre** skill)
|
|
27
|
+
- Technical SLO/SLI implementation (see **sre** skill)
|
|
28
|
+
- Monitoring/observability setup (see **platform-engineering** skill)
|
|
29
|
+
- Team hiring and career development (see **engineering-management** skill)
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Core Areas
|
|
34
|
+
|
|
35
|
+
### 1. On-Call Management
|
|
36
|
+
|
|
37
|
+
**Core principle:** On-call is a necessary operational burden that should be **fair, sustainable, and compensated**.
|
|
38
|
+
|
|
39
|
+
#### On-Call Rotation Models
|
|
40
|
+
|
|
41
|
+
**Follow-the-Sun (Recommended for global teams):**
|
|
42
|
+
```
|
|
43
|
+
Americas Team: 6 AM - 2 PM PST (primary)
|
|
44
|
+
APAC Team: 2 PM - 10 PM PST (primary)
|
|
45
|
+
EMEA Team: 10 PM - 6 AM PST (primary)
|
|
46
|
+
|
|
47
|
+
Pros: No one wakes up at night, balanced load
|
|
48
|
+
Cons: Requires global team, handoff complexity
|
|
49
|
+
Best for: Teams with 15+ engineers across timezones
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Weekly Rotation (Most common):**
|
|
53
|
+
```
|
|
54
|
+
Week 1: Engineer A (primary), Engineer B (secondary)
|
|
55
|
+
Week 2: Engineer C (primary), Engineer D (secondary)
|
|
56
|
+
Week 3: Engineer E (primary), Engineer A (secondary)
|
|
57
|
+
|
|
58
|
+
Pros: Simple, fair rotation
|
|
59
|
+
Cons: Weekend coverage, potential burnout
|
|
60
|
+
Best for: Teams with 6-10 engineers in same timezone
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Tiered Escalation:**
|
|
64
|
+
```
|
|
65
|
+
Tier 1: Junior engineers (business hours only)
|
|
66
|
+
Tier 2: Senior engineers (24/7 primary)
|
|
67
|
+
Tier 3: Staff/Principal (escalation only)
|
|
68
|
+
|
|
69
|
+
Pros: Gradual responsibility increase
|
|
70
|
+
Cons: Can create "us vs them" dynamic
|
|
71
|
+
Best for: Large teams (15+) with clear skill levels
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
#### On-Call Compensation Models
|
|
75
|
+
|
|
76
|
+
**Option 1: On-Call Stipend**
|
|
77
|
+
```
|
|
78
|
+
Primary on-call: $500-$1,000/week
|
|
79
|
+
Secondary on-call: $250-$500/week
|
|
80
|
+
|
|
81
|
+
Pros: Predictable, simple
|
|
82
|
+
Cons: Doesn't account for actual pages
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**Option 2: Pay-per-Page**
|
|
86
|
+
```
|
|
87
|
+
Business hours page: $50-$100
|
|
88
|
+
After-hours page: $150-$300
|
|
89
|
+
Weekend/holiday page: $300-$500
|
|
90
|
+
|
|
91
|
+
Pros: Fair - pays for actual disruption
|
|
92
|
+
Cons: Can incentivize ignoring issues
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Option 3: Hybrid (Recommended)**
|
|
96
|
+
```
|
|
97
|
+
Base stipend: $500/week
|
|
98
|
+
+ $100 per after-hours page
|
|
99
|
+
+ Comp time (1.5x hours worked after-hours)
|
|
100
|
+
|
|
101
|
+
Pros: Covers both availability and interruptions
|
|
102
|
+
Cons: More complex to administer
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Comp time policies:**
|
|
106
|
+
- For every hour worked after-hours, grant 1.5 hours comp time
|
|
107
|
+
- Comp time must be used within 30 days
|
|
108
|
+
- Encourage taking comp time the day after major incidents
|
|
109
|
+
|
|
110
|
+
#### Burnout Prevention
|
|
111
|
+
|
|
112
|
+
**Warning signs:**
|
|
113
|
+
- 🚨 Pages > 5 per week for an extended period
|
|
114
|
+
- 🚨 Engineer mentions exhaustion, stress in 1-on-1s
|
|
115
|
+
- 🚨 Quality of work declining
|
|
116
|
+
- 🚨 Working late nights/weekends regularly
|
|
117
|
+
- 🚨 Cynicism, disengagement
|
|
118
|
+
|
|
119
|
+
**Interventions:**
|
|
120
|
+
1. **Immediate:** Rotate off on-call for 2-4 weeks
|
|
121
|
+
2. **Short-term:** Reduce project load, pair with senior engineer
|
|
122
|
+
3. **Long-term:** Fix underlying system issues causing pages
|
|
123
|
+
|
|
124
|
+
**Sustainable on-call rules:**
|
|
125
|
+
- No engineer on-call more than 1 week per month
|
|
126
|
+
- Maximum 2 weeks on-call per quarter
|
|
127
|
+
- Mandatory break after high-page-volume week
|
|
128
|
+
- No on-call during PTO or major life events
|
|
129
|
+
|
|
130
|
+
#### On-Call Scenarios
|
|
131
|
+
|
|
132
|
+
**Scenario: "What's a fair after-hours pay model?"**
|
|
133
|
+
- **Hybrid model (recommended):**
|
|
134
|
+
- Base: $500/week on-call stipend
|
|
135
|
+
- Plus: $100-150 per after-hours page
|
|
136
|
+
- Plus: 1.5x comp time for hours worked
|
|
137
|
+
- **Example calculation:**
|
|
138
|
+
- Week stipend: $500
|
|
139
|
+
- 3 after-hours pages × $125 = $375
|
|
140
|
+
- 4 hours worked × 1.5 = 6 hours comp time
|
|
141
|
+
- **Total value:** $875 + 6 hours off
|
|
142
|
+
|
|
143
|
+
**Scenario: "Team blamed someone in incident review - how to fix?"**
|
|
144
|
+
- **Immediate:** Stop the review, reset the tone
|
|
145
|
+
- **Say:** "We don't blame people, we fix systems. Let's focus on what failed, not who."
|
|
146
|
+
- **Blameless culture principles:**
|
|
147
|
+
- People make reasonable decisions based on information available
|
|
148
|
+
- Systems should prevent single points of failure
|
|
149
|
+
- Focus on "what" not "who"
|
|
150
|
+
- **Follow-up:** Coach whoever ran the review on blameless principles
|
|
151
|
+
|
|
152
|
+
**Scenario: "During incident, what should I do as manager?"**
|
|
153
|
+
- **Monitor:** Watch incident channel, don't interrupt
|
|
154
|
+
- **Support:** "What do you need? More people? Communication handled?"
|
|
155
|
+
- **Shield:** Handle exec questions, keep pressure off team
|
|
156
|
+
- **Don't:** Take over, second-guess, or ask "why" questions mid-incident
|
|
157
|
+
- **After:** Thank team, schedule postmortem, ensure comp time taken
|
|
158
|
+
|
|
159
|
+
**Scenario: "How do we track incident trends?"**
|
|
160
|
+
- **Metrics to track:**
|
|
161
|
+
- Incident frequency (per week/month)
|
|
162
|
+
- MTTR (mean time to recovery)
|
|
163
|
+
- Incidents by service/component
|
|
164
|
+
- Incidents by root cause category
|
|
165
|
+
- **Look for patterns:**
|
|
166
|
+
- Same service failing repeatedly → systemic issue
|
|
167
|
+
- MTTR increasing → lack of familiarity or tooling gaps
|
|
168
|
+
- Spike in incidents → recent deploy or infrastructure change
|
|
169
|
+
- **Action:** Address top 3 incident sources quarterly
|
|
170
|
+
|
|
171
|
+
**Scenario: "What incident communication plan do we need?"**
|
|
172
|
+
- **During incident:**
|
|
173
|
+
- Sev 1: Updates every 30 minutes to execs, status page every 15 min
|
|
174
|
+
- Sev 2: Updates every hour to stakeholders
|
|
175
|
+
- Sev 3: Update when resolved
|
|
176
|
+
- **Channels:**
|
|
177
|
+
- Internal: Dedicated Slack #incidents channel
|
|
178
|
+
- External: Status page (Statuspage.io, etc.)
|
|
179
|
+
- Executives: Email + Slack DM for Sev 1/2
|
|
180
|
+
- **Template:**
|
|
181
|
+
```
|
|
182
|
+
[SEV 1] API Service Outage
|
|
183
|
+
Impact: All users unable to login
|
|
184
|
+
Status: Investigating
|
|
185
|
+
Next update: 2:30 PM (15 minutes)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**Scenario: "What's the right retrospective format?"**
|
|
189
|
+
- **Timeline:** Within 48 hours of incident (while fresh)
|
|
190
|
+
- **Attendees:** Incident responders + anyone interested (open invitation)
|
|
191
|
+
- **Duration:** 45-60 minutes
|
|
192
|
+
- **Format:**
|
|
193
|
+
1. Timeline walkthrough (10 min)
|
|
194
|
+
2. What went well (10 min)
|
|
195
|
+
3. What went poorly (15 min)
|
|
196
|
+
4. Action items (15 min) - with owners and due dates
|
|
197
|
+
5. Q&A (10 min)
|
|
198
|
+
- **Output:** Written postmortem + action items tracked
|
|
199
|
+
|
|
200
|
+
**Scenario: "How do we communicate incidents to executives?"**
|
|
201
|
+
- **During:** Brief, factual updates
|
|
202
|
+
- "API down, 100% of users affected, team investigating"
|
|
203
|
+
- **After:** Business-focused summary
|
|
204
|
+
- Revenue impact: "$50K in lost sales"
|
|
205
|
+
- User impact: "10K users couldn't check out for 2 hours"
|
|
206
|
+
- Prevention: "Adding rate limiting to prevent recurrence"
|
|
207
|
+
- **Avoid:** Deep technical details unless asked
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### 2. Incident Management for Managers
|
|
212
|
+
|
|
213
|
+
**Your role as a manager during incidents:**
|
|
214
|
+
|
|
215
|
+
#### During the Incident (DO NOT take over unless critical)
|
|
216
|
+
|
|
217
|
+
```
|
|
218
|
+
✅ DO:
|
|
219
|
+
- Monitor incident channel, offer support
|
|
220
|
+
- Shield team from external pressure
|
|
221
|
+
- Bring in additional engineers if needed
|
|
222
|
+
- Coordinate with stakeholders (updates to execs)
|
|
223
|
+
- Order food if it's going long
|
|
224
|
+
- Take notes for postmortem
|
|
225
|
+
|
|
226
|
+
❌ DON'T:
|
|
227
|
+
- Take over incident response (unless you're most qualified)
|
|
228
|
+
- Ask "why didn't you..." questions during incident
|
|
229
|
+
- Pressure for faster resolution
|
|
230
|
+
- Blame individuals
|
|
231
|
+
- Second-guess decisions being made
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
**Incident Severity Levels (align with team):**
|
|
235
|
+
|
|
236
|
+
```
|
|
237
|
+
Sev 1 (Critical):
|
|
238
|
+
├── Complete service outage
|
|
239
|
+
├── Data loss or security breach
|
|
240
|
+
├── Revenue impact > $10K/hour
|
|
241
|
+
└── Response: All hands, exec updates every 30 min
|
|
242
|
+
|
|
243
|
+
Sev 2 (High):
|
|
244
|
+
├── Major feature degraded
|
|
245
|
+
├── Significant user impact
|
|
246
|
+
├── Revenue impact > $1K/hour
|
|
247
|
+
└── Response: On-call + expert, updates every hour
|
|
248
|
+
|
|
249
|
+
Sev 3 (Medium):
|
|
250
|
+
├── Minor feature degraded
|
|
251
|
+
├── Limited user impact
|
|
252
|
+
└── Response: On-call handles, regular updates
|
|
253
|
+
|
|
254
|
+
Sev 4 (Low):
|
|
255
|
+
├── Internal tooling issue
|
|
256
|
+
├── No user impact
|
|
257
|
+
└── Response: Fix during business hours
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
#### After the Incident: Blameless Postmortem
|
|
261
|
+
|
|
262
|
+
**Blameless postmortem framework:**
|
|
263
|
+
|
|
264
|
+
```
|
|
265
|
+
Postmortem Template:
|
|
266
|
+
|
|
267
|
+
## Incident Summary
|
|
268
|
+
- Date/Time: When did it happen?
|
|
269
|
+
- Duration: How long?
|
|
270
|
+
- Impact: Who was affected? How many users?
|
|
271
|
+
- Severity: Sev 1-4
|
|
272
|
+
|
|
273
|
+
## Timeline
|
|
274
|
+
- 14:32 - First alert fired
|
|
275
|
+
- 14:35 - Engineer A acknowledged, began investigation
|
|
276
|
+
- 14:45 - Root cause identified (database connection pool exhausted)
|
|
277
|
+
- 15:00 - Mitigation applied (increased pool size)
|
|
278
|
+
- 15:15 - Service fully recovered
|
|
279
|
+
|
|
280
|
+
## Root Cause
|
|
281
|
+
What actually caused this? (Technical, not "Engineer X did...")
|
|
282
|
+
|
|
283
|
+
## What Went Well
|
|
284
|
+
- Alert fired within 2 minutes
|
|
285
|
+
- Communication was clear
|
|
286
|
+
- Mitigation was applied smoothly
|
|
287
|
+
|
|
288
|
+
## What Went Poorly
|
|
289
|
+
- No automated mitigation
|
|
290
|
+
- Monitoring didn't catch early warning signs
|
|
291
|
+
- On-call engineer not familiar with this service
|
|
292
|
+
|
|
293
|
+
## Action Items
|
|
294
|
+
1. [P0] Add automated connection pool scaling (Owner: Alice, Due: 1 week)
|
|
295
|
+
2. [P1] Improve monitoring for connection pool saturation (Owner: Bob, Due: 1 month)
|
|
296
|
+
3. [P2] Add service to on-call training rotation (Owner: Manager, Due: 2 weeks)
|
|
297
|
+
|
|
298
|
+
## Lessons Learned
|
|
299
|
+
- Database connection pool defaults are too conservative
|
|
300
|
+
- Need better pre-production load testing
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Blameless postmortem meeting (45-60 min):**
|
|
304
|
+
|
|
305
|
+
```
|
|
306
|
+
1. Introduction (5 min)
|
|
307
|
+
└── Remind: This is blameless, focus on systems not people
|
|
308
|
+
|
|
309
|
+
2. Timeline Review (15 min)
|
|
310
|
+
└── Walk through what happened, when
|
|
311
|
+
|
|
312
|
+
3. Root Cause Analysis (15 min)
|
|
313
|
+
└── "Why did this happen?" (ask "why" 5 times)
|
|
314
|
+
|
|
315
|
+
4. What Went Well / What Went Poorly (10 min)
|
|
316
|
+
└── Balanced reflection
|
|
317
|
+
|
|
318
|
+
5. Action Items (10 min)
|
|
319
|
+
└── Specific, assigned, with due dates
|
|
320
|
+
└── Priority: P0 (this week), P1 (this month), P2 (nice to have)
|
|
321
|
+
|
|
322
|
+
6. Close (5 min)
|
|
323
|
+
└── Thank the team, emphasize learning
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
**Red flags in postmortems:**
|
|
327
|
+
- ❌ Blaming individuals ("Alice should have...")
|
|
328
|
+
- ❌ Vague action items ("Improve monitoring")
|
|
329
|
+
- ❌ No follow-up on action items
|
|
330
|
+
- ❌ Defensive posturing
|
|
331
|
+
- ❌ Skipping postmortems for "small" incidents
|
|
332
|
+
|
|
333
|
+
**Manager's job:** Enforce blameless culture, track action items, ensure learning.
|
|
334
|
+
|
|
335
|
+
---
|
|
336
|
+
|
|
337
|
+
### 3. SLO Negotiation with Product Teams
|
|
338
|
+
|
|
339
|
+
**The tension:** Product wants features fast. SRE/Platform wants stability. You balance both.
|
|
340
|
+
|
|
341
|
+
#### Understanding SLOs (Simple Version for Managers)
|
|
342
|
+
|
|
343
|
+
```
|
|
344
|
+
SLI (Service Level Indicator):
|
|
345
|
+
What you measure (e.g., "API latency p99")
|
|
346
|
+
|
|
347
|
+
SLO (Service Level Objective):
|
|
348
|
+
Target for reliability (e.g., "API latency p99 < 500ms, 99.9% of the time")
|
|
349
|
+
|
|
350
|
+
SLA (Service Level Agreement):
|
|
351
|
+
Contractual promise to customers (e.g., "99.95% uptime or we issue a refund")
|
|
352
|
+
|
|
353
|
+
Example:
|
|
354
|
+
SLI: Request success rate
|
|
355
|
+
SLO: 99.9% of requests succeed (internal target)
|
|
356
|
+
SLA: 99.5% uptime (customer-facing promise)
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
**Error budget concept:**
|
|
360
|
+
|
|
361
|
+
```
|
|
362
|
+
SLO: 99.9% availability = 0.1% allowed downtime
|
|
363
|
+
|
|
364
|
+
Per month (30 days):
|
|
365
|
+
├── Total time: 43,200 minutes
|
|
366
|
+
├── Allowed downtime: 43.2 minutes
|
|
367
|
+
└── Error budget: 43.2 minutes
|
|
368
|
+
|
|
369
|
+
If error budget exhausted:
|
|
370
|
+
├── Freeze feature releases
|
|
371
|
+
├── Focus on reliability improvements
|
|
372
|
+
└── Pay down tech debt
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
#### SLO Negotiation Framework
|
|
376
|
+
|
|
377
|
+
**When product pushes for aggressive feature timeline:**
|
|
378
|
+
|
|
379
|
+
```
|
|
380
|
+
Product: "We need to ship this feature in 2 weeks"
|
|
381
|
+
|
|
382
|
+
You (as manager):
|
|
383
|
+
"Let's check our error budget first. If we have budget, we can move fast.
|
|
384
|
+
If we're out of budget, we need to stabilize first."
|
|
385
|
+
|
|
386
|
+
Scenario 1: Error budget healthy (50% remaining)
|
|
387
|
+
├── ✅ Green light for feature work
|
|
388
|
+
├── 70% capacity on features
|
|
389
|
+
└── 30% on reliability
|
|
390
|
+
|
|
391
|
+
Scenario 2: Error budget exhausted (0% remaining)
|
|
392
|
+
├── 🛑 Feature freeze
|
|
393
|
+
├── 100% capacity on reliability
|
|
394
|
+
└── Resume features when budget recovers
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
**How to set SLOs (practical guide):**
|
|
398
|
+
|
|
399
|
+
1. **Start with current performance:**
|
|
400
|
+
- "Our API latency p99 is currently 300ms"
|
|
401
|
+
- Don't set SLO at 300ms - give yourself buffer
|
|
402
|
+
|
|
403
|
+
2. **Set realistic target:**
|
|
404
|
+
- "Let's set SLO at p99 < 500ms"
|
|
405
|
+
- This gives 200ms buffer for growth/issues
|
|
406
|
+
|
|
407
|
+
3. **Align with customer expectation:**
|
|
408
|
+
- "Customers complain if latency > 1s"
|
|
409
|
+
- SLO should prevent customer pain
|
|
410
|
+
|
|
411
|
+
4. **Review quarterly:**
|
|
412
|
+
- Too easy? (Always meeting SLO) → Tighten SLO or invest in features
|
|
413
|
+
- Too hard? (Always missing SLO) → Loosen SLO or invest in reliability
|
|
414
|
+
|
|
415
|
+
**Common SLOs by service type:**
|
|
416
|
+
|
|
417
|
+
```
|
|
418
|
+
API Services:
|
|
419
|
+
├── Availability: 99.9% (43 min downtime/month)
|
|
420
|
+
├── Latency p50: < 100ms
|
|
421
|
+
├── Latency p99: < 500ms
|
|
422
|
+
└── Error rate: < 0.1%
|
|
423
|
+
|
|
424
|
+
Batch Processing:
|
|
425
|
+
├── Job success rate: 99.5%
|
|
426
|
+
├── Job completion time: < 4 hours
|
|
427
|
+
└── Data accuracy: 99.99%
|
|
428
|
+
|
|
429
|
+
Data Pipeline:
|
|
430
|
+
├── Data freshness: < 15 min lag
|
|
431
|
+
├── Pipeline availability: 99.9%
|
|
432
|
+
└── Data quality: 99.95%
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
---
|
|
436
|
+
|
|
437
|
+
### 4. Engineering Metrics That Matter
|
|
438
|
+
|
|
439
|
+
**The problem:** Easy to measure vanity metrics. Hard to measure real productivity.
|
|
440
|
+
|
|
441
|
+
#### Vanity Metrics (Avoid)
|
|
442
|
+
|
|
443
|
+
```
|
|
444
|
+
❌ Lines of code written
|
|
445
|
+
❌ Number of commits
|
|
446
|
+
❌ Hours worked
|
|
447
|
+
❌ Number of deploys (without context)
|
|
448
|
+
❌ Ticket velocity (without quality)
|
|
449
|
+
❌ Code coverage % (without context)
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
**Why these are bad:**
|
|
453
|
+
- Lines of code: Good engineers often delete code
|
|
454
|
+
- Number of commits: Encourages small, meaningless commits
|
|
455
|
+
- Hours worked: Encourages burnout, not productivity
|
|
456
|
+
- Deploys without context: Could be hotfixes for bugs you introduced
|
|
457
|
+
- Ticket velocity: Encourages cherry-picking easy tickets
|
|
458
|
+
- Code coverage: Can write useless tests to hit %
|
|
459
|
+
|
|
460
|
+
#### Metrics That Actually Matter
|
|
461
|
+
|
|
462
|
+
**1. DORA Metrics (Use these)**
|
|
463
|
+
|
|
464
|
+
```
|
|
465
|
+
Deployment Frequency:
|
|
466
|
+
├── How often do you deploy to production?
|
|
467
|
+
├── Elite: Multiple times per day
|
|
468
|
+
├── High: Daily to weekly
|
|
469
|
+
├── Medium: Weekly to monthly
|
|
470
|
+
└── Low: Monthly to every 6 months
|
|
471
|
+
|
|
472
|
+
Lead Time for Changes:
|
|
473
|
+
├── How long from commit to production?
|
|
474
|
+
├── Elite: < 1 hour
|
|
475
|
+
├── High: 1 day to 1 week
|
|
476
|
+
├── Medium: 1 week to 1 month
|
|
477
|
+
└── Low: 1 month to 6 months
|
|
478
|
+
|
|
479
|
+
Time to Restore Service:
|
|
480
|
+
├── How long to recover from incident?
|
|
481
|
+
├── Elite: < 1 hour
|
|
482
|
+
├── High: < 1 day
|
|
483
|
+
├── Medium: 1 day to 1 week
|
|
484
|
+
└── Low: > 1 week
|
|
485
|
+
|
|
486
|
+
Change Failure Rate:
|
|
487
|
+
├── What % of changes cause incidents?
|
|
488
|
+
├── Elite: 0-15%
|
|
489
|
+
├── High: 16-30%
|
|
490
|
+
├── Medium: 31-45%
|
|
491
|
+
└── Low: > 45%
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
**How to use DORA metrics:**
|
|
495
|
+
- Track quarterly, not daily (avoid gaming)
|
|
496
|
+
- Trend over time (are we improving?)
|
|
497
|
+
- Compare to benchmarks (elite, high, medium, low)
|
|
498
|
+
- Use to identify improvement areas
|
|
499
|
+
|
|
500
|
+
**2. SRE Metrics**
|
|
501
|
+
|
|
502
|
+
```
|
|
503
|
+
Toil Percentage:
|
|
504
|
+
├── What % of engineer time is manual ops work?
|
|
505
|
+
├── Target: < 30% toil
|
|
506
|
+
├── Intervention needed: > 50% toil
|
|
507
|
+
└── Measure: Time tracking, surveys
|
|
508
|
+
|
|
509
|
+
On-Call Load:
|
|
510
|
+
├── Pages per week per engineer
|
|
511
|
+
├── Target: < 3 pages/week
|
|
512
|
+
├── Intervention: > 5 pages/week
|
|
513
|
+
└── Measure: PagerDuty analytics
|
|
514
|
+
|
|
515
|
+
SLO Compliance:
|
|
516
|
+
├── Are we meeting our SLOs?
|
|
517
|
+
├── Target: 99%+ SLO compliance
|
|
518
|
+
└── Measure: Observability dashboards
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
**3. Team Health Metrics**
|
|
522
|
+
|
|
523
|
+
```
|
|
524
|
+
Engineer Satisfaction:
|
|
525
|
+
├── Quarterly survey (1-10 scale)
|
|
526
|
+
├── Questions: "Satisfied with work?", "Would recommend team?"
|
|
527
|
+
├── Target: 8+ average
|
|
528
|
+
└── Red flag: < 6 average or declining trend
|
|
529
|
+
|
|
530
|
+
Retention Rate:
|
|
531
|
+
├── % of engineers staying > 1 year
|
|
532
|
+
├── Target: > 85% annual retention
|
|
533
|
+
└── Red flag: < 70% retention
|
|
534
|
+
|
|
535
|
+
Time to Productivity (New hires):
|
|
536
|
+
├── How long until new hire is productive?
|
|
537
|
+
├── Target: < 90 days
|
|
538
|
+
└── Measure: Manager assessment + self-assessment
|
|
539
|
+
```
|
|
540
|
+
|
|
541
|
+
**4. Operational Excellence Metrics**
|
|
542
|
+
|
|
543
|
+
```
|
|
544
|
+
Incident Trends:
|
|
545
|
+
├── Number of Sev 1/2 incidents per month
|
|
546
|
+
├── Target: Declining or stable
|
|
547
|
+
└── Red flag: Increasing trend
|
|
548
|
+
|
|
549
|
+
Postmortem Action Item Completion:
|
|
550
|
+
├── % of action items completed on time
|
|
551
|
+
├── Target: > 80% completion
|
|
552
|
+
└── Red flag: < 50% completion
|
|
553
|
+
|
|
554
|
+
Automated Test Coverage:
|
|
555
|
+
├── % of critical paths covered
|
|
556
|
+
├── Target: > 70% for critical paths
|
|
557
|
+
└── Not a vanity metric if focused on high-risk areas
|
|
558
|
+
```
|
|
559
|
+
|
|
560
|
+
#### How to Present Metrics to Leadership
|
|
561
|
+
|
|
562
|
+
**Dashboard structure:**
|
|
563
|
+
|
|
564
|
+
```
|
|
565
|
+
1. Health at a Glance (Top metrics)
|
|
566
|
+
├── 🟢 SLO Compliance: 99.8% (Target: 99%)
|
|
567
|
+
├── 🟡 Deployment Frequency: 3x/week (Target: Daily)
|
|
568
|
+
├── 🟢 Incident Rate: 2 Sev2 this month (Last month: 4)
|
|
569
|
+
└── 🟢 Team Satisfaction: 8.2/10 (Target: 8+)
|
|
570
|
+
|
|
571
|
+
2. DORA Metrics Trend (Quarterly)
|
|
572
|
+
[Chart showing improvement over time]
|
|
573
|
+
|
|
574
|
+
3. Focus Areas
|
|
575
|
+
├── ✅ Reduced incident rate by 50% this quarter
|
|
576
|
+
├── 🚧 Working on deployment frequency (automation initiative)
|
|
577
|
+
└── ⚠️ Toil still high at 40% - hiring 2 more engineers
|
|
578
|
+
|
|
579
|
+
4. Asks
|
|
580
|
+
├── Budget for observability tooling ($50K)
|
|
581
|
+
└── Approval to pause feature work next sprint for reliability
|
|
582
|
+
```
|
|
583
|
+
|
|
584
|
+
---
|
|
585
|
+
|
|
586
|
+
### 5. Balancing Feature Work vs Operational Excellence
|
|
587
|
+
|
|
588
|
+
**The eternal tension:** Product wants features. You want stability.
|
|
589
|
+
|
|
590
|
+
#### Resource Allocation Models
|
|
591
|
+
|
|
592
|
+
**70-20-10 Rule (Recommended):**
|
|
593
|
+
```
|
|
594
|
+
70% Feature Work:
|
|
595
|
+
├── New features product wants
|
|
596
|
+
├── Customer-facing improvements
|
|
597
|
+
└── Revenue-generating projects
|
|
598
|
+
|
|
599
|
+
20% Operational Excellence:
|
|
600
|
+
├── Tech debt paydown
|
|
601
|
+
├── Reliability improvements
|
|
602
|
+
├── Monitoring enhancements
|
|
603
|
+
└── Automation
|
|
604
|
+
|
|
605
|
+
10% Innovation/Learning:
|
|
606
|
+
├── Explore new technologies
|
|
607
|
+
├── Hackathons
|
|
608
|
+
├── Learning time
|
|
609
|
+
└── Experimentation
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
**Adjust based on phase:**
|
|
613
|
+
|
|
614
|
+
```
|
|
615
|
+
High Growth Phase:
|
|
616
|
+
├── 80% Features
|
|
617
|
+
├── 15% Ops Excellence
|
|
618
|
+
└── 5% Innovation
|
|
619
|
+
|
|
620
|
+
Stability Phase:
|
|
621
|
+
├── 50% Features
|
|
622
|
+
├── 40% Ops Excellence
|
|
623
|
+
└── 10% Innovation
|
|
624
|
+
|
|
625
|
+
Crisis Phase (Post-Incidents):
|
|
626
|
+
├── 30% Features
|
|
627
|
+
├── 60% Ops Excellence
|
|
628
|
+
└── 10% Innovation
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
#### Negotiating with Product
|
|
632
|
+
|
|
633
|
+
**When product asks for all-feature, no-ops time:**
|
|
634
|
+
|
|
635
|
+
```
|
|
636
|
+
Scenario: "We need all engineers on Feature X for Q4"
|
|
637
|
+
|
|
638
|
+
Your response framework:
|
|
639
|
+
1. Acknowledge business need
|
|
640
|
+
"I understand Feature X is critical for revenue"
|
|
641
|
+
|
|
642
|
+
2. State operational reality
|
|
643
|
+
"Our on-call load is high (8 pages/week) and error budget is 80% exhausted"
|
|
644
|
+
|
|
645
|
+
3. Present options
|
|
646
|
+
Option A: All-in on features, risk of incidents and burnout
|
|
647
|
+
Option B: 70-30 split, sustainable pace, less feature risk
|
|
648
|
+
Option C: Hire 2 more engineers to do both
|
|
649
|
+
|
|
650
|
+
4. Recommend
|
|
651
|
+
"I recommend Option B - we'll deliver 70% of Feature X this quarter,
|
|
652
|
+
and ensure we don't have outages that impact customers"
|
|
653
|
+
|
|
654
|
+
5. Make it their decision
|
|
655
|
+
"What's your preference given these trade-offs?"
|
|
656
|
+
```
|
|
657
|
+
|
|
658
|
+
**Using error budgets as negotiation tool:**
|
|
659
|
+
|
|
660
|
+
```
|
|
661
|
+
Error budget = objective metric, not subjective
|
|
662
|
+
|
|
663
|
+
If product wants to move fast:
|
|
664
|
+
├── Check error budget: 50% remaining?
|
|
665
|
+
├── ✅ Green light: "We have budget, let's ship!"
|
|
666
|
+
└── 🛑 Budget exhausted: "We need to stabilize first"
|
|
667
|
+
|
|
668
|
+
This removes emotion from discussion. It's data-driven.
|
|
669
|
+
```
|
|
670
|
+
|
|
671
|
+
---
|
|
672
|
+
|
|
673
|
+
### 6. Building Sustainable Operations Culture
|
|
674
|
+
|
|
675
|
+
**Culture eats process for breakfast.**
|
|
676
|
+
|
|
677
|
+
#### Key Cultural Values
|
|
678
|
+
|
|
679
|
+
**1. Blameless Culture**
|
|
680
|
+
```
|
|
681
|
+
When incidents happen:
|
|
682
|
+
❌ "Who broke it?" → ✅ "What broke?"
|
|
683
|
+
❌ "Why didn't you..." → ✅ "What can we learn?"
|
|
684
|
+
❌ Hide mistakes → ✅ Share failures openly
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
**2. Automate Toil**
|
|
688
|
+
```
|
|
689
|
+
Manual work is not a badge of honor.
|
|
690
|
+
├── Track toil percentage
|
|
691
|
+
├── Reward automation, not heroics
|
|
692
|
+
└── "If you do it twice, automate it"
|
|
693
|
+
```
|
|
694
|
+
|
|
695
|
+
**3. Sustainable On-Call**
|
|
696
|
+
```
|
|
697
|
+
On-call is not punishment.
|
|
698
|
+
├── Fair rotation
|
|
699
|
+
├── Compensated fairly
|
|
700
|
+
├── Protected from burnout
|
|
701
|
+
└── Escalation is encouraged, not weakness
|
|
702
|
+
```
|
|
703
|
+
|
|
704
|
+
**4. Continuous Improvement**
|
|
705
|
+
```
|
|
706
|
+
Every incident is a learning opportunity.
|
|
707
|
+
├── Postmortems are required, not optional
|
|
708
|
+
├── Action items are tracked and completed
|
|
709
|
+
└── Celebrate fixes, not just features
|
|
710
|
+
```
|
|
711
|
+
|
|
712
|
+
#### Manager Actions to Reinforce Culture
|
|
713
|
+
|
|
714
|
+
**1. Lead by Example**
|
|
715
|
+
- Participate in on-call rotation (if you're technical)
|
|
716
|
+
- Admit your own mistakes publicly
|
|
717
|
+
- Take postmortem action items yourself
|
|
718
|
+
|
|
719
|
+
**2. Celebrate Operational Wins**
|
|
720
|
+
- Shout out engineers who reduce toil
|
|
721
|
+
- Highlight reliability improvements in team meetings
|
|
722
|
+
- Give "Operational Excellence" awards
|
|
723
|
+
|
|
724
|
+
**3. Protect Your Team**
|
|
725
|
+
- Say no to unrealistic timelines
|
|
726
|
+
- Push back on "just ship it" pressure
|
|
727
|
+
- Shield team from org politics
|
|
728
|
+
|
|
729
|
+
**4. Invest in Automation**
|
|
730
|
+
- Allocate 20% capacity to ops excellence
|
|
731
|
+
- Approve tool/platform budgets
|
|
732
|
+
- Hire for automation skills
|
|
733
|
+
|
|
734
|
+
---
|
|
735
|
+
|
|
736
|
+
## Quick Reference for Managers
|
|
737
|
+
|
|
738
|
+
**On-Call:**
|
|
739
|
+
- Rotation: Weekly or follow-the-sun
|
|
740
|
+
- Compensation: $500-$1,000/week stipend + per-page pay
|
|
741
|
+
- Burnout prevention: Max 1 week/month, comp time after incidents
|
|
742
|
+
|
|
743
|
+
**Incidents:**
|
|
744
|
+
- Your role: Support, don't take over
|
|
745
|
+
- Blameless postmortems: Required for all Sev 1/2
|
|
746
|
+
- Action items: Track and ensure completion
|
|
747
|
+
|
|
748
|
+
**SLOs:**
|
|
749
|
+
- Start with current performance + buffer
|
|
750
|
+
- Use error budgets to negotiate with product
|
|
751
|
+
- Review quarterly
|
|
752
|
+
|
|
753
|
+
**Metrics:**
|
|
754
|
+
- Use DORA metrics (deployment freq, lead time, MTTR, change failure rate)
|
|
755
|
+
- Avoid vanity metrics (lines of code, commits, hours)
|
|
756
|
+
- Track team health (satisfaction, retention)
|
|
757
|
+
|
|
758
|
+
**Resource Allocation:**
|
|
759
|
+
- 70% features, 20% ops excellence, 10% innovation
|
|
760
|
+
- Adjust based on phase (growth vs stability)
|
|
761
|
+
|
|
762
|
+
**Culture:**
|
|
763
|
+
- Blameless, automate toil, sustainable on-call
|
|
764
|
+
- Lead by example, celebrate ops wins
|
|
765
|
+
- Protect team from burnout
|
|
766
|
+
|
|
767
|
+
### Culture Building Scenario
|
|
768
|
+
|
|
769
|
+
**Scenario: "How do we build a sustainable ops culture?"**
|
|
770
|
+
- **Blameless:**
|
|
771
|
+
- Never "who broke it?" Always "what broke and how do we prevent it?"
|
|
772
|
+
- Share postmortems openly - learn from all incidents
|
|
773
|
+
- Reward transparency (caught early) over hiding (festered)
|
|
774
|
+
- **Automate toil:**
|
|
775
|
+
- Track toil percentage (target < 30%)
|
|
776
|
+
- Dedicate 20% time to automation
|
|
777
|
+
- Celebrate "we automated ourselves out of that problem"
|
|
778
|
+
- **Sustainable on-call:**
|
|
779
|
+
- No hero culture - don't celebrate all-nighters
|
|
780
|
+
- Enforce comp time and breaks
|
|
781
|
+
- Fix systems that cause repeated pages
|
|
782
|
+
- **Recognition:**
|
|
783
|
+
- Highlight ops wins in all-hands: "Automated X, saved 50 hours/month"
|
|
784
|
+
- Incident response recognition: "Great job handling outage calmly"
|
|
785
|
+
- Quality over speed: "Prevented incident with thorough testing"
|
|
786
|
+
|
|
787
|
+
**Scenario: "How do we prevent hero culture?"**
|
|
788
|
+
- **Heroes are a symptom of broken systems**
|
|
789
|
+
- **Signs of hero culture:**
|
|
790
|
+
- Same engineer always saves the day
|
|
791
|
+
- Working nights/weekends is celebrated
|
|
792
|
+
- "We need you" used as motivation
|
|
793
|
+
- **How to fix:**
|
|
794
|
+
- Document hero's knowledge → spread it
|
|
795
|
+
- Automate hero's manual tasks
|
|
796
|
+
- Create runbooks for common issues
|
|
797
|
+
- Rotate responsibilities - don't depend on one person
|
|
798
|
+
- **Say:** "I appreciate your dedication, but this is unsustainable. Let's fix the system so you don't need to be a hero."
|
|
799
|
+
|
|
800
|
+
---
|
|
801
|
+
|
|
802
|
+
## Integration with Other Skills
|
|
803
|
+
|
|
804
|
+
**This skill works with:**
|
|
805
|
+
- **engineering-management** - Hiring, career development, 1-on-1s
|
|
806
|
+
- **technical-leadership** - Making technical decisions, risk assessment
|
|
807
|
+
- **infrastructure-strategy** - Long-term planning, platform investment
|
|
808
|
+
- **budget-and-cost-management** - On-call budgets, tooling costs
|
|
809
|
+
|
|
810
|
+
**Technical skills your team uses:**
|
|
811
|
+
- **sre** - Hands-on SLO implementation, incident response
|
|
812
|
+
- **platform-engineering** - Building internal platforms that reduce toil
|
|
813
|
+
- **cybersecurity** - Security incident response, compliance
|
|
814
|
+
|
|
815
|
+
---
|
|
816
|
+
|
|
817
|
+
**Remember:** Your job is to build sustainable operations practices that enable long-term success, not short-term heroics. Protect your team from burnout. Measure what matters. Learn from every incident.
|