agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,760 @@
1
+ # Runbooks
2
+
3
+ Comprehensive guidelines for writing and maintaining operational runbooks.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Executable** - Clear, step-by-step instructions that anyone can follow
8
+ 2. **Current** - Updated after every incident that uses or improves them
9
+ 3. **Tested** - Regularly verified to ensure they work
10
+ 4. **Automated** - Automate steps where possible, document where not
11
+
12
+ ## Runbook Types
13
+
14
+ ### Alert Runbooks
15
+
16
+ ```yaml
17
+ alert_runbook:
18
+ purpose: "Guide response to specific alerts"
19
+ trigger: "Alert fires"
20
+ audience: "On-call engineer"
21
+
22
+ structure:
23
+ - "Alert overview and severity"
24
+ - "Symptoms and verification"
25
+ - "Quick diagnosis steps"
26
+ - "Mitigation actions"
27
+ - "Root cause investigation"
28
+ - "Escalation criteria"
29
+ ```
30
+
31
+ ### Service Runbooks
32
+
33
+ ```yaml
34
+ service_runbook:
35
+ purpose: "Comprehensive guide for a service"
36
+ trigger: "Any issue with the service"
37
+ audience: "Service operators"
38
+
39
+ structure:
40
+ - "Service overview and architecture"
41
+ - "Dependencies and data flows"
42
+ - "Common operations"
43
+ - "Troubleshooting guide"
44
+ - "Recovery procedures"
45
+ ```
46
+
47
+ ### Procedure Runbooks
48
+
49
+ ```yaml
50
+ procedure_runbook:
51
+ purpose: "Guide for specific operational tasks"
52
+ trigger: "Need to perform task"
53
+ audience: "Operations team"
54
+
55
+ examples:
56
+ - "Database failover"
57
+ - "Certificate rotation"
58
+ - "Capacity scaling"
59
+ - "Data recovery"
60
+ ```
61
+
62
+ ## Runbook Template
63
+
64
+ ### Alert Runbook Template
65
+
66
+ ```markdown
67
+ # Runbook: [Alert Name]
68
+
69
+ ## Overview
70
+
71
+ **Alert**: `AlertName`
72
+ **Severity**: Critical | Warning | Info
73
+ **Service**: service-name
74
+ **Team**: team-name
75
+
76
+ ### What This Alert Means
77
+
78
+ One paragraph explaining what condition triggers this alert and why it matters.
79
+
80
+ ### User Impact
81
+
82
+ - What users experience when this alert fires
83
+ - Which features are affected
84
+ - Business impact
85
+
86
+ ---
87
+
88
+ ## Quick Reference
89
+
90
+ ### Verify the Alert
91
+
92
+ ```bash
93
+ # Check if the condition is real
94
+ kubectl get pods -l app=service-name
95
+ curl -s http://service/health | jq .
96
+ ```
97
+
98
+ ### Quick Mitigation
99
+
100
+ ```bash
101
+ # If recent deployment, rollback
102
+ kubectl rollout undo deployment/service-name
103
+
104
+ # If capacity issue, scale up
105
+ kubectl scale deployment/service-name --replicas=10
106
+
107
+ # If specific feature causing issues
108
+ # Disable feature flag in LaunchDarkly
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Diagnosis
114
+
115
+ ### Step 1: Verify the Alert
116
+
117
+ Check if the alert condition is accurate:
118
+
119
+ ```bash
120
+ # Query Prometheus directly
121
+ curl -g 'http://prometheus:9090/api/v1/query?query=rate(http_requests_total{status=~"5.."}[5m])'
122
+
123
+ # Check service logs
124
+ kubectl logs -l app=service-name --tail=100 | grep -i error
125
+ ```
126
+
127
+ **Expected**: [What you expect to see if the alert is valid]
128
+
129
+ ### Step 2: Check Recent Changes
130
+
131
+ Look for changes that might have caused this:
132
+
133
+ ```bash
134
+ # Recent deployments
135
+ kubectl rollout history deployment/service-name
136
+
137
+ # Recent config changes
138
+ kubectl get configmap service-config -o yaml
139
+
140
+ # Git commits in last 24h
141
+ git log --since="24 hours ago" --oneline
142
+ ```
143
+
144
+ **Common causes**:
145
+ - Recent deployment introduced bug
146
+ - Configuration change
147
+ - Dependency failure
148
+ - Traffic spike
149
+
150
+ ### Step 3: Check Dependencies
151
+
152
+ Verify downstream services are healthy:
153
+
154
+ ```bash
155
+ # Database connectivity
156
+ kubectl exec -it $(kubectl get pod -l app=service-name -o name | head -1) -- \
157
+ pg_isready -h database-host
158
+
159
+ # Redis connectivity
160
+ kubectl exec -it $(kubectl get pod -l app=service-name -o name | head -1) -- \
161
+ redis-cli -h redis-host ping
162
+
163
+ # External API
164
+ curl -s https://api.external-service.com/health
165
+ ```
166
+
167
+ ### Step 4: Check Resource Usage
168
+
169
+ Look for resource constraints:
170
+
171
+ ```bash
172
+ # Pod resources
173
+ kubectl top pods -l app=service-name
174
+
175
+ # Node resources
176
+ kubectl top nodes
177
+
178
+ # Check for OOM kills
179
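+ kubectl get pods -l app=service-name -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[*].lastState.terminated.reason}{"\n"}{end}'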
180
+ ```
181
+
182
+ ---
183
+
184
+ ## Mitigation
185
+
186
+ ### Option 1: Rollback (if recent deployment)
187
+
188
+ ```bash
189
+ # Check deployment history
190
+ kubectl rollout history deployment/service-name
191
+
192
+ # Rollback to previous version
193
+ kubectl rollout undo deployment/service-name
194
+
195
+ # Verify rollback
196
+ kubectl rollout status deployment/service-name
197
+
198
+ # Check error rates returning to normal
199
+ # (monitor dashboard for 5 minutes)
200
+ ```
201
+
202
+ ### Option 2: Scale Up (if capacity issue)
203
+
204
+ ```bash
205
+ # Current replica count
206
+ kubectl get deployment service-name -o jsonpath='{.spec.replicas}'
207
+
208
+ # Scale up
209
+ kubectl scale deployment/service-name --replicas=10
210
+
211
+ # Verify new pods are healthy
212
+ kubectl get pods -l app=service-name -w
213
+ ```
214
+
215
+ ### Option 3: Feature Flag (if specific feature)
216
+
217
+ 1. Log into LaunchDarkly/Unleash
218
+ 2. Find feature flag: `feature_name`
219
+ 3. Disable for production environment
220
+ 4. Monitor error rates
221
+
222
+ ### Option 4: Failover (if regional issue)
223
+
224
+ See: [Disaster Recovery Runbook](#disaster-recovery)
225
+
226
+ ---
227
+
228
+ ## Resolution
229
+
230
+ ### Verify Recovery
231
+
232
+ ```bash
233
+ # Error rate returned to normal
234
+ # (check Grafana dashboard)
235
+
236
+ # All pods healthy
237
+ kubectl get pods -l app=service-name
238
+
239
+ # Health check passing
240
+ curl -s http://service/health
241
+ ```
242
+
243
+ ### Post-Incident
244
+
245
+ 1. Update incident timeline
246
+ 2. Schedule postmortem (if SEV1/SEV2)
247
+ 3. Create follow-up tickets for root cause fix
248
+ 4. Update this runbook if needed
249
+
250
+ ---
251
+
252
+ ## Escalation
253
+
254
+ ### When to Escalate
255
+
256
+ - [ ] Not resolved within 30 minutes
257
+ - [ ] Data loss suspected
258
+ - [ ] Security implications
259
+ - [ ] Need database access
260
+ - [ ] Need infrastructure changes
261
+
262
+ ### Escalation Contacts
263
+
264
+ | Role | Contact | When |
265
+ |------|---------|------|
266
+ | Backend Lead | @backend-lead | Technical escalation |
267
+ | DBA | @dba-oncall | Database issues |
268
+ | Security | @security-oncall | Security concerns |
269
+ | Manager | @eng-manager | Business decisions |
270
+
271
+ ---
272
+
273
+ ## Related
274
+
275
+ - **Dashboard**: [Grafana Link](https://grafana.example.com/d/service)
276
+ - **Logs**: [Loki Query](https://grafana.example.com/explore?query=...)
277
+ - **Traces**: [Jaeger](https://jaeger.example.com/search?service=service-name)
278
+ - **Service Docs**: [Confluence Link](https://wiki.example.com/service-name)
279
+
280
+ ---
281
+
282
+ ## Revision History
283
+
284
+ | Date | Author | Change |
285
+ |------|--------|--------|
286
+ | 2025-01-15 | @engineer | Added Step 3 for dependency checks |
287
+ | 2025-01-01 | @engineer | Initial version |
288
+ ```
289
+
290
+ ## Writing Effective Runbooks
291
+
292
+ ### Clear Instructions
293
+
294
+ ```yaml
295
+ good_instruction:
296
+ format: "Verb + Object + Context"
297
+ examples:
298
+ - "Run the following command to check pod status:"
299
+ - "Verify the database connection by executing:"
300
+ - "Scale the deployment to 10 replicas:"
301
+
302
+ bad_instruction:
303
+ examples:
304
+ - "Check the pods" # Missing how
305
+ - "Fix the database" # Too vague
306
+ - "Do the thing" # Meaningless
307
+
308
+ command_blocks:
309
+ always_include:
310
+ - "What the command does"
311
+ - "The actual command"
312
+ - "Expected output"
313
+ - "What to do if output differs"
314
+
315
+ example: |
316
+ Check the current replica count:
317
+
318
+ ```bash
319
+ kubectl get deployment api-server -o jsonpath='{.spec.replicas}'
320
+ ```
321
+
322
+ **Expected**: A number (e.g., `3`)
323
+
324
+ If you see `0`, the deployment may have been scaled down.
325
+ Proceed to the "Scale Up" section.
326
+ ```
327
+
328
+ ### Decision Points
329
+
330
+ ```yaml
331
+ decision_format:
332
+ if_then_else:
333
+ format: |
334
+ If [condition]:
335
+ → Do [action A]
336
+
337
+ If [other condition]:
338
+ → Do [action B]
339
+
340
+ Otherwise:
341
+ → Escalate to [team]
342
+
343
+ example: |
344
+ Check the error rate:
345
+
346
+ ```bash
347
+ curl -s 'http://prometheus:9090/api/v1/query?query=...' | jq '.data.result[0].value[1]'
348
+ ```
349
+
350
+ **If error rate > 10%**:
351
+ → Immediately rollback (see Option 1)
352
+
353
+ **If error rate 1-10%**:
354
+ → Scale up first (see Option 2)
355
+ → If no improvement in 10 min, rollback
356
+
357
+ **If error rate < 1%**:
358
+ → Alert may be flapping
359
+ → Monitor for 15 minutes before taking action
360
+ ```
361
+
362
+ ### Verification Steps
363
+
364
+ ```yaml
365
+ verification_importance:
366
+ why: "Confirm each action had the expected effect"
367
+ when: "After every significant action"
368
+
369
+ verification_pattern:
370
+ action: "Do something"
371
+ verify: "Check it worked"
372
+ expected: "What you should see"
373
+ troubleshoot: "What to do if it didn't work"
374
+
375
+ example: |
376
+ **Action**: Rollback the deployment
377
+
378
+ ```bash
379
+ kubectl rollout undo deployment/api-server
380
+ ```
381
+
382
+ **Verify**: Check rollback status
383
+
384
+ ```bash
385
+ kubectl rollout status deployment/api-server
386
+ ```
387
+
388
+ **Expected**: `deployment "api-server" successfully rolled out`
389
+
390
+ **If rollback fails**:
391
+ - Check events: `kubectl describe deployment api-server`
392
+ - Check pod status: `kubectl get pods -l app=api-server`
393
+ - Escalate if pods won't start
394
+ ```
395
+
396
+ ## Automation in Runbooks
397
+
398
+ ### Automating Common Steps
399
+
400
+ ```yaml
401
+ automation_levels:
402
+ fully_manual:
403
+ description: "Human runs commands, makes decisions"
404
+ when: "Rare events, complex judgment needed"
405
+
406
+ assisted:
407
+ description: "Scripts help, human approves"
408
+ when: "Common events, some judgment needed"
409
+
410
+ automated:
411
+ description: "System handles automatically"
412
+ when: "Well-understood, low-risk responses"
413
+
414
+ example_progression:
415
+ manual: |
416
+ # Human runs this manually
417
+ kubectl scale deployment/api-server --replicas=10
418
+
419
+ assisted: |
420
+ # Script that prompts for confirmation
421
+ ./scripts/scale-service.sh api-server 10
422
+ # Output: "Scale api-server to 10 replicas? [y/N]"
423
+
424
+ automated: |
425
+ # HPA handles scaling automatically
426
+ apiVersion: autoscaling/v2
427
+ kind: HorizontalPodAutoscaler
428
+ spec:
429
+ minReplicas: 3
430
+ maxReplicas: 20
431
+ metrics:
432
+ - type: Resource
433
+ resource:
434
+ name: cpu
435
+ target:
436
+ type: Utilization
437
+ averageUtilization: 70
438
+ ```
439
+
440
+ ### Runbook Automation Scripts
441
+
442
+ ```bash
#!/bin/bash
# scripts/diagnose-high-error-rate.sh
# Automated diagnosis for APIHighErrorRate alert
#
# Usage: diagnose-high-error-rate.sh [SERVICE] [NAMESPACE]
#   SERVICE    value of the Prometheus `job` label and the pod `app` label
#              (default: api-server)
#   NAMESPACE  Kubernetes namespace to inspect (default: production)
#
# Prints the current 5xx error ratio, recent rollouts, pod status, recent
# error log lines and pod resource usage, then a recommended action.
# Requires: curl, jq, bc, kubectl.

set -e

SERVICE=${1:-api-server}
NAMESPACE=${2:-production}

echo "=== Diagnosing high error rate for $SERVICE ==="

echo ""
echo "1. Current error rate:"
# Ratio of 5xx requests to all requests over the last 5 minutes.
QUERY="sum(rate(http_requests_total{job=\"$SERVICE\",status=~\"5..\"}[5m]))/sum(rate(http_requests_total{job=\"$SERVICE\"}[5m]))"
# -G with --data-urlencode sends the PromQL as an encoded query string; placed
# raw in the URL, curl treats {..} and [..] as glob patterns and rejects it.
# `|| true` stops `set -e` from aborting here so the failure is reported below.
ERROR_RATE=$(curl -sfG "http://prometheus:9090/api/v1/query" --data-urlencode "query=$QUERY" \
  | jq -r '.data.result[0].value[1] // empty' || true)

# Only a plain decimal is usable. An empty result, "null", "NaN" or "+Inf"
# would otherwise fall through to the "within normal range" branch below.
NUMERIC_RE='^[0-9]+(\.[0-9]+)?$'
if [[ "$ERROR_RATE" =~ $NUMERIC_RE ]]; then
  echo "   Error rate: $(echo "$ERROR_RATE * 100" | bc)%"
else
  ERROR_RATE=""
  echo "   Error rate: unavailable (no numeric result from Prometheus)"
fi

echo ""
echo "2. Recent deployments:"
kubectl rollout history "deployment/$SERVICE" -n "$NAMESPACE" | tail -5

echo ""
echo "3. Pod status:"
kubectl get pods -l "app=$SERVICE" -n "$NAMESPACE"

echo ""
echo "4. Recent errors in logs:"
kubectl logs -l "app=$SERVICE" -n "$NAMESPACE" --tail=20 | grep -i error | tail -10

echo ""
echo "5. Resource usage:"
kubectl top pods -l "app=$SERVICE" -n "$NAMESPACE"

echo ""
echo "=== Diagnosis complete ==="
echo ""
echo "Recommended actions:"
if [[ -z "$ERROR_RATE" ]]; then
  # Never report "normal" when the measurement itself failed.
  echo "  - Error rate unknown: check Prometheus and the service dashboard manually"
elif (( $(echo "$ERROR_RATE > 0.10" | bc -l) )); then
  echo "  - ERROR RATE CRITICAL (>10%): Recommend immediate rollback"
  echo "    Run: kubectl rollout undo deployment/$SERVICE -n $NAMESPACE"
elif (( $(echo "$ERROR_RATE > 0.01" | bc -l) )); then
  echo "  - Error rate elevated (>1%): Check recent deployments"
  echo "    If recent deploy, consider rollback"
else
  echo "  - Error rate within normal range"
  echo "    Monitor and check for flapping alerts"
fi
489
+ ```
490
+
491
+ ## Service Runbook Template
492
+
493
+ ```markdown
494
+ # Service Runbook: [Service Name]
495
+
496
+ ## Service Overview
497
+
498
+ ### Description
499
+ Brief description of what this service does and why it exists.
500
+
501
+ ### Architecture
502
+ ```
503
+ [User] → [Load Balancer] → [Service] → [Database]
504
+ → [Cache]
505
+ → [External API]
506
+ ```
507
+
508
+ ### Key Metrics
509
+ - **SLO**: 99.9% availability, p99 latency < 500ms
510
+ - **Traffic**: ~10k requests/minute
511
+ - **Error Budget**: 43 minutes/month
512
+
513
+ ### Dependencies
514
+
515
+ | Dependency | Type | Impact if Down | Fallback |
516
+ |------------|------|----------------|----------|
517
+ | PostgreSQL | Critical | Service fails | None |
518
+ | Redis | Degraded | Slower responses | Direct DB |
519
+ | Auth Service | Critical | No authentication | None |
520
+
521
+ ---
522
+
523
+ ## Operations
524
+
525
+ ### Deployment
526
+ ```bash
527
+ # Deploy new version
528
+ kubectl set image deployment/service-name service=image:tag
529
+
530
+ # Verify deployment
531
+ kubectl rollout status deployment/service-name
532
+ ```
533
+
534
+ ### Scaling
535
+ ```bash
536
+ # Scale manually
537
+ kubectl scale deployment/service-name --replicas=N
538
+
539
+ # Check current scale
540
+ kubectl get hpa service-name
541
+ ```
542
+
543
+ ### Configuration
544
+ ```bash
545
+ # View config
546
+ kubectl get configmap service-config -o yaml
547
+
548
+ # Update config, then restart the deployment so pods pick up the change
549
+ kubectl edit configmap service-config
550
+ kubectl rollout restart deployment/service-name
551
+ ```
552
+
553
+ ### Logs
554
+ ```bash
555
+ # Recent logs
556
+ kubectl logs -l app=service-name --tail=100
557
+
558
+ # Stream logs
559
+ kubectl logs -l app=service-name -f
560
+
561
+ # Logs with errors only
562
+ kubectl logs -l app=service-name --tail=1000 | grep -i error
563
+ ```
564
+
565
+ ---
566
+
567
+ ## Troubleshooting Guide
568
+
569
+ ### Service Not Responding
570
+
571
+ **Symptoms**: Health checks failing, 503 errors
572
+
573
+ **Diagnosis**:
574
+ ```bash
575
+ # Check pods
576
+ kubectl get pods -l app=service-name
577
+
578
+ # Check events
579
+ kubectl get events --sort-by='.lastTimestamp' | grep service-name
580
+
581
+ # Check logs
582
+ kubectl logs -l app=service-name --tail=50
583
+ ```
584
+
585
+ **Common Causes**:
586
+ 1. Pods in CrashLoopBackOff → Check logs for startup errors
587
+ 2. Pods Pending → Check resources, node capacity
588
+ 3. Readiness probe failing → Check /ready endpoint
589
+
590
+ ### High Latency
591
+
592
+ **Symptoms**: p99 latency above SLO
593
+
594
+ **Diagnosis**:
595
+ ```bash
596
+ # Check resource usage
597
+ kubectl top pods -l app=service-name
598
+
599
+ # Check database latency
600
+ # (query Prometheus for db_query_duration_seconds)
601
+
602
+ # Check external dependencies
603
+ curl -w "@curl-format.txt" -o /dev/null -s https://external-api.com/health
604
+ ```
605
+
606
+ **Common Causes**:
607
+ 1. Database slow queries → Check slow query log
608
+ 2. Resource constraints → Scale up
609
+ 3. External dependency slow → Check dependency health
610
+
611
+ ### High Error Rate
612
+
613
+ **Symptoms**: 5xx errors above 1%
614
+
615
+ **Diagnosis**:
616
+ ```bash
617
+ # Check error breakdown
618
+ curl -sg 'http://prometheus:9090/api/v1/query?query=sum(rate(http_requests_total{job="service-name",status=~"5.."}[5m]))by(status)'
619
+
620
+ # Check recent errors in logs
621
+ kubectl logs -l app=service-name --tail=1000 | grep -E "(error|ERROR|exception)" | tail -20
622
+ ```
623
+
624
+ **Common Causes**:
625
+ 1. Recent deployment bug → Rollback
626
+ 2. Dependency failure → Check dependencies
627
+ 3. Data issue → Check for invalid requests
628
+
629
+ ---
630
+
631
+ ## Recovery Procedures
632
+
633
+ ### Database Connection Issues
634
+
635
+ 1. Verify database is reachable:
636
+ ```bash
637
+ kubectl exec -it pod/service-name-xxx -- pg_isready -h db-host
638
+ ```
639
+
640
+ 2. Check connection pool:
641
+ ```bash
642
+ # Query for active connections
643
+ kubectl exec -it pod/postgres-0 -- psql -c "SELECT count(*) FROM pg_stat_activity WHERE datname = 'service_db'"
644
+ ```
645
+
646
+ 3. If pool exhausted, restart pods:
647
+ ```bash
648
+ kubectl rollout restart deployment/service-name
649
+ ```
650
+
651
+ ### Complete Service Recovery
652
+
653
+ 1. Stop all traffic:
654
+ ```bash
655
+ kubectl scale deployment/service-name --replicas=0
656
+ ```
657
+
658
+ 2. Fix underlying issue (database, config, etc.)
659
+
660
+ 3. Restart with single replica:
661
+ ```bash
662
+ kubectl scale deployment/service-name --replicas=1
663
+ ```
664
+
665
+ 4. Verify health:
666
+ ```bash
667
+ kubectl logs -l app=service-name -f
668
+ curl http://service-name/health
669
+ ```
670
+
671
+ 5. Scale back up:
672
+ ```bash
673
+ kubectl scale deployment/service-name --replicas=3
674
+ ```
675
+
676
+ ---
677
+
678
+ ## Contacts
679
+
680
+ | Role | Contact | Availability |
681
+ |------|---------|--------------|
682
+ | Primary On-Call | PagerDuty | 24/7 |
683
+ | Service Owner | @owner | Business hours |
684
+ | Team Lead | @lead | Business hours |
685
+
686
+ ---
687
+
688
+ ## Related Documentation
689
+
690
+ - [Architecture Doc](https://wiki.example.com/service-name/architecture)
691
+ - [API Documentation](https://api-docs.example.com/service-name)
692
+ - [Dashboard](https://grafana.example.com/d/service-name)
693
+ - [Alert Runbooks](#alert-runbooks)
694
+ ```
695
+
696
+ ## Runbook Maintenance
697
+
698
+ ### Review Schedule
699
+
700
+ ```yaml
701
+ runbook_review:
702
+ triggers:
703
+ - "After every incident that uses the runbook"
704
+ - "After any production change to the service"
705
+ - "Quarterly scheduled review"
706
+ - "When on-call feedback indicates issues"
707
+
708
+ review_checklist:
709
+ - "Commands still work?"
710
+ - "URLs and links still valid?"
711
+ - "Screenshots still accurate?"
712
+ - "Contact information current?"
713
+ - "Escalation path correct?"
714
+ - "Any new failure modes to document?"
715
+ ```
716
+
717
+ ### Testing Runbooks
718
+
719
+ ```yaml
720
+ runbook_testing:
721
+ methods:
722
+ game_day:
723
+ description: "Simulate incident, follow runbook"
724
+ frequency: "Quarterly"
725
+ outcome: "Identify gaps, update runbook"
726
+
727
+ shadow_run:
728
+ description: "New on-call follows runbook during real incident"
729
+ frequency: "Each new on-call"
730
+ outcome: "Verify clarity for newcomers"
731
+
732
+ automation_test:
733
+ description: "Run automated scripts in staging"
734
+ frequency: "After any update"
735
+ outcome: "Verify scripts work"
736
+ ```
737
+
738
+ ## Common Pitfalls
739
+
740
+ ```yaml
741
+ pitfall_stale_runbooks:
742
+ problem: "Runbooks not updated after changes"
743
+ impact: "On-call follows outdated steps, makes things worse"
744
+ solution: "Include runbook update in change checklist"
745
+
746
+ pitfall_assumed_knowledge:
747
+ problem: "Runbook assumes reader knows the system"
748
+ impact: "New on-call can't follow"
749
+ solution: "Write for someone who's never seen the service"
750
+
751
+ pitfall_no_verification:
752
+ problem: "Steps without verification"
753
+ impact: "Don't know if action worked"
754
+ solution: "Every action needs a verification step"
755
+
756
+ pitfall_wall_of_text:
757
+ problem: "Long paragraphs instead of clear steps"
758
+ impact: "Hard to follow during stress"
759
+ solution: "Use numbered steps, bullet points, clear formatting"
760
+ ```