agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,760 @@
|
|
|
1
|
+
# Runbooks
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for writing and maintaining operational runbooks.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Executable** - Clear, step-by-step instructions that anyone can follow
|
|
8
|
+
2. **Current** - Updated after every incident that uses them or exposes a gap in them
|
|
9
|
+
3. **Tested** - Regularly verified to ensure they work
|
|
10
|
+
4. **Automated** - Automate steps where possible, document where not
|
|
11
|
+
|
|
12
|
+
## Runbook Types
|
|
13
|
+
|
|
14
|
+
### Alert Runbooks
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
alert_runbook:
|
|
18
|
+
purpose: "Guide response to specific alerts"
|
|
19
|
+
trigger: "Alert fires"
|
|
20
|
+
audience: "On-call engineer"
|
|
21
|
+
|
|
22
|
+
structure:
|
|
23
|
+
- "Alert overview and severity"
|
|
24
|
+
- "Symptoms and verification"
|
|
25
|
+
- "Quick diagnosis steps"
|
|
26
|
+
- "Mitigation actions"
|
|
27
|
+
- "Root cause investigation"
|
|
28
|
+
- "Escalation criteria"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Service Runbooks
|
|
32
|
+
|
|
33
|
+
```yaml
|
|
34
|
+
service_runbook:
|
|
35
|
+
purpose: "Comprehensive guide for a service"
|
|
36
|
+
trigger: "Any issue with the service"
|
|
37
|
+
audience: "Service operators"
|
|
38
|
+
|
|
39
|
+
structure:
|
|
40
|
+
- "Service overview and architecture"
|
|
41
|
+
- "Dependencies and data flows"
|
|
42
|
+
- "Common operations"
|
|
43
|
+
- "Troubleshooting guide"
|
|
44
|
+
- "Recovery procedures"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Procedure Runbooks
|
|
48
|
+
|
|
49
|
+
```yaml
|
|
50
|
+
procedure_runbook:
|
|
51
|
+
purpose: "Guide for specific operational tasks"
|
|
52
|
+
trigger: "Need to perform task"
|
|
53
|
+
audience: "Operations team"
|
|
54
|
+
|
|
55
|
+
examples:
|
|
56
|
+
- "Database failover"
|
|
57
|
+
- "Certificate rotation"
|
|
58
|
+
- "Capacity scaling"
|
|
59
|
+
- "Data recovery"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Runbook Template
|
|
63
|
+
|
|
64
|
+
### Alert Runbook Template
|
|
65
|
+
|
|
66
|
+
```markdown
|
|
67
|
+
# Runbook: [Alert Name]
|
|
68
|
+
|
|
69
|
+
## Overview
|
|
70
|
+
|
|
71
|
+
**Alert**: `AlertName`
|
|
72
|
+
**Severity**: Critical | Warning | Info
|
|
73
|
+
**Service**: service-name
|
|
74
|
+
**Team**: team-name
|
|
75
|
+
|
|
76
|
+
### What This Alert Means
|
|
77
|
+
|
|
78
|
+
One paragraph explaining what condition triggers this alert and why it matters.
|
|
79
|
+
|
|
80
|
+
### User Impact
|
|
81
|
+
|
|
82
|
+
- What users experience when this alert fires
|
|
83
|
+
- Which features are affected
|
|
84
|
+
- Business impact
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Quick Reference
|
|
89
|
+
|
|
90
|
+
### Verify the Alert
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# Check if the condition is real
|
|
94
|
+
kubectl get pods -l app=service-name
|
|
95
|
+
curl -s http://service/health | jq .
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Quick Mitigation
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# If recent deployment, rollback
|
|
102
|
+
kubectl rollout undo deployment/service-name
|
|
103
|
+
|
|
104
|
+
# If capacity issue, scale up
|
|
105
|
+
kubectl scale deployment/service-name --replicas=10
|
|
106
|
+
|
|
107
|
+
# If specific feature causing issues
|
|
108
|
+
# Disable feature flag in LaunchDarkly
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Diagnosis
|
|
114
|
+
|
|
115
|
+
### Step 1: Verify the Alert
|
|
116
|
+
|
|
117
|
+
Check if the alert condition is accurate:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Query Prometheus directly
|
|
121
|
+
curl -g 'http://prometheus:9090/api/v1/query?query=rate(http_requests_total{status=~"5.."}[5m])'
|
|
122
|
+
|
|
123
|
+
# Check service logs
|
|
124
|
+
kubectl logs -l app=service-name --tail=100 | grep -i error
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Expected**: [What you expect to see if the alert is valid]
|
|
128
|
+
|
|
129
|
+
### Step 2: Check Recent Changes
|
|
130
|
+
|
|
131
|
+
Look for changes that might have caused this:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# Recent deployments
|
|
135
|
+
kubectl rollout history deployment/service-name
|
|
136
|
+
|
|
137
|
+
# Recent config changes
|
|
138
|
+
kubectl get configmap service-config -o yaml
|
|
139
|
+
|
|
140
|
+
# Git commits in last 24h
|
|
141
|
+
git log --since="24 hours ago" --oneline
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Common causes**:
|
|
145
|
+
- Recent deployment introduced bug
|
|
146
|
+
- Configuration change
|
|
147
|
+
- Dependency failure
|
|
148
|
+
- Traffic spike
|
|
149
|
+
|
|
150
|
+
### Step 3: Check Dependencies
|
|
151
|
+
|
|
152
|
+
Verify downstream services are healthy:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
# Database connectivity
|
|
156
|
+
kubectl exec -it $(kubectl get pod -l app=service-name -o name | head -1) -- \
|
|
157
|
+
pg_isready -h database-host
|
|
158
|
+
|
|
159
|
+
# Redis connectivity
|
|
160
|
+
kubectl exec -it $(kubectl get pod -l app=service-name -o name | head -1) -- \
|
|
161
|
+
redis-cli -h redis-host ping
|
|
162
|
+
|
|
163
|
+
# External API
|
|
164
|
+
curl -s https://api.external-service.com/health
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Step 4: Check Resource Usage
|
|
168
|
+
|
|
169
|
+
Look for resource constraints:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
# Pod resources
|
|
173
|
+
kubectl top pods -l app=service-name
|
|
174
|
+
|
|
175
|
+
# Node resources
|
|
176
|
+
kubectl top nodes
|
|
177
|
+
|
|
178
|
+
# Check for OOM kills
|
|
179
|
+
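kubectl get pods -l app=service-name -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[*].lastState.terminated.reason}{"\n"}{end}' | grep OOMKilled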
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Mitigation
|
|
185
|
+
|
|
186
|
+
### Option 1: Rollback (if recent deployment)
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
# Check deployment history
|
|
190
|
+
kubectl rollout history deployment/service-name
|
|
191
|
+
|
|
192
|
+
# Rollback to previous version
|
|
193
|
+
kubectl rollout undo deployment/service-name
|
|
194
|
+
|
|
195
|
+
# Verify rollback
|
|
196
|
+
kubectl rollout status deployment/service-name
|
|
197
|
+
|
|
198
|
+
# Check error rates returning to normal
|
|
199
|
+
# (monitor dashboard for 5 minutes)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Option 2: Scale Up (if capacity issue)
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
# Current replica count
|
|
206
|
+
kubectl get deployment service-name -o jsonpath='{.spec.replicas}'
|
|
207
|
+
|
|
208
|
+
# Scale up
|
|
209
|
+
kubectl scale deployment/service-name --replicas=10
|
|
210
|
+
|
|
211
|
+
# Verify new pods are healthy
|
|
212
|
+
kubectl get pods -l app=service-name -w
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Option 3: Feature Flag (if specific feature)
|
|
216
|
+
|
|
217
|
+
1. Log into LaunchDarkly/Unleash
|
|
218
|
+
2. Find feature flag: `feature_name`
|
|
219
|
+
3. Disable for production environment
|
|
220
|
+
4. Monitor error rates
|
|
221
|
+
|
|
222
|
+
### Option 4: Failover (if regional issue)
|
|
223
|
+
|
|
224
|
+
See: [Disaster Recovery Runbook](#disaster-recovery)
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Resolution
|
|
229
|
+
|
|
230
|
+
### Verify Recovery
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
# Error rate returned to normal
|
|
234
|
+
# (check Grafana dashboard)
|
|
235
|
+
|
|
236
|
+
# All pods healthy
|
|
237
|
+
kubectl get pods -l app=service-name
|
|
238
|
+
|
|
239
|
+
# Health check passing
|
|
240
|
+
curl -s http://service/health
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Post-Incident
|
|
244
|
+
|
|
245
|
+
1. Update incident timeline
|
|
246
|
+
2. Schedule postmortem (if SEV1/SEV2)
|
|
247
|
+
3. Create follow-up tickets for root cause fix
|
|
248
|
+
4. Update this runbook if needed
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## Escalation
|
|
253
|
+
|
|
254
|
+
### When to Escalate
|
|
255
|
+
|
|
256
|
+
- [ ] Not resolved within 30 minutes
|
|
257
|
+
- [ ] Data loss suspected
|
|
258
|
+
- [ ] Security implications
|
|
259
|
+
- [ ] Need database access
|
|
260
|
+
- [ ] Need infrastructure changes
|
|
261
|
+
|
|
262
|
+
### Escalation Contacts
|
|
263
|
+
|
|
264
|
+
| Role | Contact | When |
|
|
265
|
+
|------|---------|------|
|
|
266
|
+
| Backend Lead | @backend-lead | Technical escalation |
|
|
267
|
+
| DBA | @dba-oncall | Database issues |
|
|
268
|
+
| Security | @security-oncall | Security concerns |
|
|
269
|
+
| Manager | @eng-manager | Business decisions |
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Related
|
|
274
|
+
|
|
275
|
+
- **Dashboard**: [Grafana Link](https://grafana.example.com/d/service)
|
|
276
|
+
- **Logs**: [Loki Query](https://grafana.example.com/explore?query=...)
|
|
277
|
+
- **Traces**: [Jaeger](https://jaeger.example.com/search?service=service-name)
|
|
278
|
+
- **Service Docs**: [Confluence Link](https://wiki.example.com/service-name)
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Revision History
|
|
283
|
+
|
|
284
|
+
| Date | Author | Change |
|
|
285
|
+
|------|--------|--------|
|
|
286
|
+
| 2025-01-15 | @engineer | Added Step 3 for dependency checks |
|
|
287
|
+
| 2025-01-01 | @engineer | Initial version |
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## Writing Effective Runbooks
|
|
291
|
+
|
|
292
|
+
### Clear Instructions
|
|
293
|
+
|
|
294
|
+
```yaml
|
|
295
|
+
good_instruction:
|
|
296
|
+
format: "Verb + Object + Context"
|
|
297
|
+
examples:
|
|
298
|
+
- "Run the following command to check pod status:"
|
|
299
|
+
- "Verify the database connection by executing:"
|
|
300
|
+
- "Scale the deployment to 10 replicas:"
|
|
301
|
+
|
|
302
|
+
bad_instruction:
|
|
303
|
+
examples:
|
|
304
|
+
- "Check the pods" # Missing how
|
|
305
|
+
- "Fix the database" # Too vague
|
|
306
|
+
- "Do the thing" # Meaningless
|
|
307
|
+
|
|
308
|
+
command_blocks:
|
|
309
|
+
always_include:
|
|
310
|
+
- "What the command does"
|
|
311
|
+
- "The actual command"
|
|
312
|
+
- "Expected output"
|
|
313
|
+
- "What to do if output differs"
|
|
314
|
+
|
|
315
|
+
example: |
|
|
316
|
+
Check the current replica count:
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
kubectl get deployment api-server -o jsonpath='{.spec.replicas}'
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
**Expected**: A number (e.g., `3`)
|
|
323
|
+
|
|
324
|
+
If you see `0`, the deployment may have been scaled down.
|
|
325
|
+
Proceed to the "Scale Up" section.
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### Decision Points
|
|
329
|
+
|
|
330
|
+
```yaml
|
|
331
|
+
decision_format:
|
|
332
|
+
if_then_else:
|
|
333
|
+
format: |
|
|
334
|
+
If [condition]:
|
|
335
|
+
→ Do [action A]
|
|
336
|
+
|
|
337
|
+
If [other condition]:
|
|
338
|
+
→ Do [action B]
|
|
339
|
+
|
|
340
|
+
Otherwise:
|
|
341
|
+
→ Escalate to [team]
|
|
342
|
+
|
|
343
|
+
example: |
|
|
344
|
+
Check the error rate:
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
curl -s 'http://prometheus:9090/api/v1/query?query=...' | jq '.data.result[0].value[1]'
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
**If error rate > 10%**:
|
|
351
|
+
→ Immediately rollback (see Option 1)
|
|
352
|
+
|
|
353
|
+
**If error rate 1-10%**:
|
|
354
|
+
→ Scale up first (see Option 2)
|
|
355
|
+
→ If no improvement in 10 min, rollback
|
|
356
|
+
|
|
357
|
+
**If error rate < 1%**:
|
|
358
|
+
→ Alert may be flapping
|
|
359
|
+
→ Monitor for 15 minutes before taking action
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
### Verification Steps
|
|
363
|
+
|
|
364
|
+
```yaml
|
|
365
|
+
verification_importance:
|
|
366
|
+
why: "Confirm each action had the expected effect"
|
|
367
|
+
when: "After every significant action"
|
|
368
|
+
|
|
369
|
+
verification_pattern:
|
|
370
|
+
action: "Do something"
|
|
371
|
+
verify: "Check it worked"
|
|
372
|
+
expected: "What you should see"
|
|
373
|
+
troubleshoot: "What to do if it didn't work"
|
|
374
|
+
|
|
375
|
+
example: |
|
|
376
|
+
**Action**: Rollback the deployment
|
|
377
|
+
|
|
378
|
+
```bash
|
|
379
|
+
kubectl rollout undo deployment/api-server
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
**Verify**: Check rollback status
|
|
383
|
+
|
|
384
|
+
```bash
|
|
385
|
+
kubectl rollout status deployment/api-server
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
**Expected**: `deployment "api-server" successfully rolled out`
|
|
389
|
+
|
|
390
|
+
**If rollback fails**:
|
|
391
|
+
- Check events: `kubectl describe deployment api-server`
|
|
392
|
+
- Check pod status: `kubectl get pods -l app=api-server`
|
|
393
|
+
- Escalate if pods won't start
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
## Automation in Runbooks
|
|
397
|
+
|
|
398
|
+
### Automating Common Steps
|
|
399
|
+
|
|
400
|
+
```yaml
|
|
401
|
+
automation_levels:
|
|
402
|
+
fully_manual:
|
|
403
|
+
description: "Human runs commands, makes decisions"
|
|
404
|
+
when: "Rare events, complex judgment needed"
|
|
405
|
+
|
|
406
|
+
assisted:
|
|
407
|
+
description: "Scripts help, human approves"
|
|
408
|
+
when: "Common events, some judgment needed"
|
|
409
|
+
|
|
410
|
+
automated:
|
|
411
|
+
description: "System handles automatically"
|
|
412
|
+
when: "Well-understood, low-risk responses"
|
|
413
|
+
|
|
414
|
+
example_progression:
|
|
415
|
+
manual: |
|
|
416
|
+
# Human runs this manually
|
|
417
|
+
kubectl scale deployment/api-server --replicas=10
|
|
418
|
+
|
|
419
|
+
assisted: |
|
|
420
|
+
# Script that prompts for confirmation
|
|
421
|
+
./scripts/scale-service.sh api-server 10
|
|
422
|
+
# Output: "Scale api-server to 10 replicas? [y/N]"
|
|
423
|
+
|
|
424
|
+
automated: |
|
|
425
|
+
# HPA handles scaling automatically
|
|
426
|
+
apiVersion: autoscaling/v2
|
|
427
|
+
kind: HorizontalPodAutoscaler
|
|
428
|
+
spec:
|
|
429
|
+
minReplicas: 3
|
|
430
|
+
maxReplicas: 20
|
|
431
|
+
metrics:
|
|
432
|
+
- type: Resource
|
|
433
|
+
resource:
|
|
434
|
+
name: cpu
|
|
435
|
+
target:
|
|
436
|
+
type: Utilization
|
|
437
|
+
averageUtilization: 70
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
### Runbook Automation Scripts
|
|
441
|
+
|
|
442
|
+
```bash
|
|
443
|
+
#!/bin/bash
|
|
444
|
+
# scripts/diagnose-high-error-rate.sh
|
|
445
|
+
# Automated diagnosis for APIHighErrorRate alert
|
|
446
|
+
|
|
447
|
+
set -e
|
|
448
|
+
|
|
449
|
+
SERVICE=${1:-api-server}
|
|
450
|
+
NAMESPACE=${2:-production}
|
|
451
|
+
|
|
452
|
+
echo "=== Diagnosing high error rate for $SERVICE ==="
|
|
453
|
+
|
|
454
|
+
echo ""
|
|
455
|
+
echo "1. Current error rate:"
|
|
456
|
+
ERROR_RATE=$(curl -sG "http://prometheus:9090/api/v1/query" --data-urlencode "query=sum(rate(http_requests_total{job=\"$SERVICE\",status=~\"5..\"}[5m]))/sum(rate(http_requests_total{job=\"$SERVICE\"}[5m]))" | jq -r '.data.result[0].value[1] // "0"')
|
|
457
|
+
echo " Error rate: $(echo "$ERROR_RATE * 100" | bc)%"
|
|
458
|
+
|
|
459
|
+
echo ""
|
|
460
|
+
echo "2. Recent deployments:"
|
|
461
|
+
kubectl rollout history deployment/$SERVICE -n $NAMESPACE | tail -5
|
|
462
|
+
|
|
463
|
+
echo ""
|
|
464
|
+
echo "3. Pod status:"
|
|
465
|
+
kubectl get pods -l app=$SERVICE -n $NAMESPACE
|
|
466
|
+
|
|
467
|
+
echo ""
|
|
468
|
+
echo "4. Recent errors in logs:"
|
|
469
|
+
kubectl logs -l app=$SERVICE -n $NAMESPACE --tail=20 | grep -i error | tail -10
|
|
470
|
+
|
|
471
|
+
echo ""
|
|
472
|
+
echo "5. Resource usage:"
|
|
473
|
+
kubectl top pods -l app=$SERVICE -n $NAMESPACE
|
|
474
|
+
|
|
475
|
+
echo ""
|
|
476
|
+
echo "=== Diagnosis complete ==="
|
|
477
|
+
echo ""
|
|
478
|
+
echo "Recommended actions:"
|
|
479
|
+
if (( $(echo "$ERROR_RATE > 0.10" | bc -l) )); then
|
|
480
|
+
echo " - ERROR RATE CRITICAL (>10%): Recommend immediate rollback"
|
|
481
|
+
echo " Run: kubectl rollout undo deployment/$SERVICE -n $NAMESPACE"
|
|
482
|
+
elif (( $(echo "$ERROR_RATE > 0.01" | bc -l) )); then
|
|
483
|
+
echo " - Error rate elevated (>1%): Check recent deployments"
|
|
484
|
+
echo " If recent deploy, consider rollback"
|
|
485
|
+
else
|
|
486
|
+
echo " - Error rate within normal range"
|
|
487
|
+
echo " Monitor and check for flapping alerts"
|
|
488
|
+
fi
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
## Service Runbook Template
|
|
492
|
+
|
|
493
|
+
```markdown
|
|
494
|
+
# Service Runbook: [Service Name]
|
|
495
|
+
|
|
496
|
+
## Service Overview
|
|
497
|
+
|
|
498
|
+
### Description
|
|
499
|
+
Brief description of what this service does and why it exists.
|
|
500
|
+
|
|
501
|
+
### Architecture
|
|
502
|
+
```
|
|
503
|
+
[User] → [Load Balancer] → [Service] → [Database]
|
|
504
|
+
→ [Cache]
|
|
505
|
+
→ [External API]
|
|
506
|
+
```
|
|
507
|
+
|
|
508
|
+
### Key Metrics
|
|
509
|
+
- **SLO**: 99.9% availability, p99 latency < 500ms
|
|
510
|
+
- **Traffic**: ~10k requests/minute
|
|
511
|
+
- **Error Budget**: 43 minutes/month
|
|
512
|
+
|
|
513
|
+
### Dependencies
|
|
514
|
+
|
|
515
|
+
| Dependency | Type | Impact if Down | Fallback |
|
|
516
|
+
|------------|------|----------------|----------|
|
|
517
|
+
| PostgreSQL | Critical | Service fails | None |
|
|
518
|
+
| Redis | Degraded | Slower responses | Direct DB |
|
|
519
|
+
| Auth Service | Critical | No authentication | None |
|
|
520
|
+
|
|
521
|
+
---
|
|
522
|
+
|
|
523
|
+
## Operations
|
|
524
|
+
|
|
525
|
+
### Deployment
|
|
526
|
+
```bash
|
|
527
|
+
# Deploy new version
|
|
528
|
+
kubectl set image deployment/service-name service=image:tag
|
|
529
|
+
|
|
530
|
+
# Verify deployment
|
|
531
|
+
kubectl rollout status deployment/service-name
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
### Scaling
|
|
535
|
+
```bash
|
|
536
|
+
# Scale manually
|
|
537
|
+
kubectl scale deployment/service-name --replicas=N
|
|
538
|
+
|
|
539
|
+
# Check current scale
|
|
540
|
+
kubectl get hpa service-name
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
### Configuration
|
|
544
|
+
```bash
|
|
545
|
+
# View config
|
|
546
|
+
kubectl get configmap service-config -o yaml
|
|
547
|
+
|
|
548
|
+
# Update config, then restart so pods pick up the change
|
|
549
|
+
kubectl edit configmap service-config
|
|
550
|
+
kubectl rollout restart deployment/service-name
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
### Logs
|
|
554
|
+
```bash
|
|
555
|
+
# Recent logs
|
|
556
|
+
kubectl logs -l app=service-name --tail=100
|
|
557
|
+
|
|
558
|
+
# Stream logs
|
|
559
|
+
kubectl logs -l app=service-name -f
|
|
560
|
+
|
|
561
|
+
# Logs with errors only
|
|
562
|
+
kubectl logs -l app=service-name | grep -i error
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
---
|
|
566
|
+
|
|
567
|
+
## Troubleshooting Guide
|
|
568
|
+
|
|
569
|
+
### Service Not Responding
|
|
570
|
+
|
|
571
|
+
**Symptoms**: Health checks failing, 503 errors
|
|
572
|
+
|
|
573
|
+
**Diagnosis**:
|
|
574
|
+
```bash
|
|
575
|
+
# Check pods
|
|
576
|
+
kubectl get pods -l app=service-name
|
|
577
|
+
|
|
578
|
+
# Check events
|
|
579
|
+
kubectl get events --sort-by='.lastTimestamp' | grep service-name
|
|
580
|
+
|
|
581
|
+
# Check logs
|
|
582
|
+
kubectl logs -l app=service-name --tail=50
|
|
583
|
+
```
|
|
584
|
+
|
|
585
|
+
**Common Causes**:
|
|
586
|
+
1. Pods in CrashLoopBackOff → Check logs for startup errors
|
|
587
|
+
2. Pods Pending → Check resources, node capacity
|
|
588
|
+
3. Readiness probe failing → Check /ready endpoint
|
|
589
|
+
|
|
590
|
+
### High Latency
|
|
591
|
+
|
|
592
|
+
**Symptoms**: p99 latency above SLO
|
|
593
|
+
|
|
594
|
+
**Diagnosis**:
|
|
595
|
+
```bash
|
|
596
|
+
# Check resource usage
|
|
597
|
+
kubectl top pods -l app=service-name
|
|
598
|
+
|
|
599
|
+
# Check database latency
|
|
600
|
+
# (query Prometheus for db_query_duration_seconds)
|
|
601
|
+
|
|
602
|
+
# Check external dependencies
|
|
603
|
+
curl -w "@curl-format.txt" -o /dev/null -s https://external-api.com/health
|
|
604
|
+
```
|
|
605
|
+
|
|
606
|
+
**Common Causes**:
|
|
607
|
+
1. Database slow queries → Check slow query log
|
|
608
|
+
2. Resource constraints → Scale up
|
|
609
|
+
3. External dependency slow → Check dependency health
|
|
610
|
+
|
|
611
|
+
### High Error Rate
|
|
612
|
+
|
|
613
|
+
**Symptoms**: 5xx errors above 1%
|
|
614
|
+
|
|
615
|
+
**Diagnosis**:
|
|
616
|
+
```bash
|
|
617
|
+
# Check error breakdown
|
|
618
|
+
curl -sg 'http://prometheus:9090/api/v1/query?query=sum(rate(http_requests_total{job="service-name",status=~"5.."}[5m]))by(status)'
|
|
619
|
+
|
|
620
|
+
# Check recent errors in logs
|
|
621
|
+
kubectl logs -l app=service-name | grep -E "(error|ERROR|exception)" | tail -20
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
**Common Causes**:
|
|
625
|
+
1. Recent deployment bug → Rollback
|
|
626
|
+
2. Dependency failure → Check dependencies
|
|
627
|
+
3. Data issue → Check for invalid requests
|
|
628
|
+
|
|
629
|
+
---
|
|
630
|
+
|
|
631
|
+
## Recovery Procedures
|
|
632
|
+
|
|
633
|
+
### Database Connection Issues
|
|
634
|
+
|
|
635
|
+
1. Verify database is reachable:
|
|
636
|
+
```bash
|
|
637
|
+
kubectl exec -it pod/service-name-xxx -- pg_isready -h db-host
|
|
638
|
+
```
|
|
639
|
+
|
|
640
|
+
2. Check connection pool:
|
|
641
|
+
```bash
|
|
642
|
+
# Query for active connections
|
|
643
|
+
kubectl exec -it pod/postgres-0 -- psql -c "SELECT count(*) FROM pg_stat_activity WHERE datname = 'service_db'"
|
|
644
|
+
```
|
|
645
|
+
|
|
646
|
+
3. If pool exhausted, restart pods:
|
|
647
|
+
```bash
|
|
648
|
+
kubectl rollout restart deployment/service-name
|
|
649
|
+
```
|
|
650
|
+
|
|
651
|
+
### Complete Service Recovery
|
|
652
|
+
|
|
653
|
+
1. Stop all traffic:
|
|
654
|
+
```bash
|
|
655
|
+
kubectl scale deployment/service-name --replicas=0
|
|
656
|
+
```
|
|
657
|
+
|
|
658
|
+
2. Fix underlying issue (database, config, etc.)
|
|
659
|
+
|
|
660
|
+
3. Restart with single replica:
|
|
661
|
+
```bash
|
|
662
|
+
kubectl scale deployment/service-name --replicas=1
|
|
663
|
+
```
|
|
664
|
+
|
|
665
|
+
4. Verify health:
|
|
666
|
+
```bash
|
|
667
|
+
kubectl logs -l app=service-name -f
|
|
668
|
+
curl http://service-name/health
|
|
669
|
+
```
|
|
670
|
+
|
|
671
|
+
5. Scale back up:
|
|
672
|
+
```bash
|
|
673
|
+
kubectl scale deployment/service-name --replicas=3
|
|
674
|
+
```
|
|
675
|
+
|
|
676
|
+
---
|
|
677
|
+
|
|
678
|
+
## Contacts
|
|
679
|
+
|
|
680
|
+
| Role | Contact | Availability |
|
|
681
|
+
|------|---------|--------------|
|
|
682
|
+
| Primary On-Call | PagerDuty | 24/7 |
|
|
683
|
+
| Service Owner | @owner | Business hours |
|
|
684
|
+
| Team Lead | @lead | Business hours |
|
|
685
|
+
|
|
686
|
+
---
|
|
687
|
+
|
|
688
|
+
## Related Documentation
|
|
689
|
+
|
|
690
|
+
- [Architecture Doc](https://wiki.example.com/service-name/architecture)
|
|
691
|
+
- [API Documentation](https://api-docs.example.com/service-name)
|
|
692
|
+
- [Dashboard](https://grafana.example.com/d/service-name)
|
|
693
|
+
- [Alert Runbooks](#alert-runbooks)
|
|
694
|
+
```
|
|
695
|
+
|
|
696
|
+
## Runbook Maintenance
|
|
697
|
+
|
|
698
|
+
### Review Schedule
|
|
699
|
+
|
|
700
|
+
```yaml
|
|
701
|
+
runbook_review:
|
|
702
|
+
triggers:
|
|
703
|
+
- "After every incident that uses the runbook"
|
|
704
|
+
- "After any production change to the service"
|
|
705
|
+
- "Quarterly scheduled review"
|
|
706
|
+
- "When on-call feedback indicates issues"
|
|
707
|
+
|
|
708
|
+
review_checklist:
|
|
709
|
+
- "Commands still work?"
|
|
710
|
+
- "URLs and links still valid?"
|
|
711
|
+
- "Screenshots still accurate?"
|
|
712
|
+
- "Contact information current?"
|
|
713
|
+
- "Escalation path correct?"
|
|
714
|
+
- "Any new failure modes to document?"
|
|
715
|
+
```
|
|
716
|
+
|
|
717
|
+
### Testing Runbooks
|
|
718
|
+
|
|
719
|
+
```yaml
|
|
720
|
+
runbook_testing:
|
|
721
|
+
methods:
|
|
722
|
+
game_day:
|
|
723
|
+
description: "Simulate incident, follow runbook"
|
|
724
|
+
frequency: "Quarterly"
|
|
725
|
+
outcome: "Identify gaps, update runbook"
|
|
726
|
+
|
|
727
|
+
shadow_run:
|
|
728
|
+
description: "New on-call follows runbook during real incident"
|
|
729
|
+
frequency: "Each new on-call"
|
|
730
|
+
outcome: "Verify clarity for newcomers"
|
|
731
|
+
|
|
732
|
+
automation_test:
|
|
733
|
+
description: "Run automated scripts in staging"
|
|
734
|
+
frequency: "After any update"
|
|
735
|
+
outcome: "Verify scripts work"
|
|
736
|
+
```
|
|
737
|
+
|
|
738
|
+
## Common Pitfalls
|
|
739
|
+
|
|
740
|
+
```yaml
|
|
741
|
+
pitfall_stale_runbooks:
|
|
742
|
+
problem: "Runbooks not updated after changes"
|
|
743
|
+
impact: "On-call follows outdated steps, makes things worse"
|
|
744
|
+
solution: "Include runbook update in change checklist"
|
|
745
|
+
|
|
746
|
+
pitfall_assumed_knowledge:
|
|
747
|
+
problem: "Runbook assumes reader knows the system"
|
|
748
|
+
impact: "New on-call can't follow"
|
|
749
|
+
solution: "Write for someone who's never seen the service"
|
|
750
|
+
|
|
751
|
+
pitfall_no_verification:
|
|
752
|
+
problem: "Steps without verification"
|
|
753
|
+
impact: "Don't know if action worked"
|
|
754
|
+
solution: "Every action needs a verification step"
|
|
755
|
+
|
|
756
|
+
pitfall_wall_of_text:
|
|
757
|
+
problem: "Long paragraphs instead of clear steps"
|
|
758
|
+
impact: "Hard to follow during stress"
|
|
759
|
+
solution: "Use numbered steps, bullet points, clear formatting"
|
|
760
|
+
```
|