agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,651 @@
|
|
|
1
|
+
# Chaos Engineering
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for building confidence in system resilience through controlled experiments.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Build Confidence** - Use controlled experiments to gain evidence-based confidence in how the system behaves under failure
|
|
8
|
+
2. **Production Reality** - Test in production or production-like environments
|
|
9
|
+
3. **Minimize Blast Radius** - Start small, expand carefully
|
|
10
|
+
4. **Automate Experiments** - Repeatable, consistent experiments
|
|
11
|
+
|
|
12
|
+
## Chaos Engineering Fundamentals
|
|
13
|
+
|
|
14
|
+
### What is Chaos Engineering?
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
definition: |
|
|
18
|
+
Chaos Engineering is the discipline of experimenting on a system
|
|
19
|
+
to build confidence in the system's capability to withstand
|
|
20
|
+
turbulent conditions in production.
|
|
21
|
+
|
|
22
|
+
goals:
|
|
23
|
+
- "Discover weaknesses before they cause outages"
|
|
24
|
+
- "Build confidence in system resilience"
|
|
25
|
+
- "Improve incident response capabilities"
|
|
26
|
+
- "Validate monitoring and alerting"
|
|
27
|
+
|
|
28
|
+
not_chaos:
|
|
29
|
+
- "Breaking things randomly"
|
|
30
|
+
- "Testing in production without safeguards"
|
|
31
|
+
- "Causing outages for fun"
|
|
32
|
+
- "Skipping analysis and learning"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Principles of Chaos Engineering
|
|
36
|
+
|
|
37
|
+
```yaml
|
|
38
|
+
principles:
|
|
39
|
+
1_hypothesis:
|
|
40
|
+
description: "Start with a hypothesis about steady state"
|
|
41
|
+
example: "If we lose one database replica, the system will continue serving requests with minimal latency impact"
|
|
42
|
+
|
|
43
|
+
2_real_world_events:
|
|
44
|
+
description: "Vary inputs that reflect real-world events"
|
|
45
|
+
examples:
|
|
46
|
+
- "Server crashes"
|
|
47
|
+
- "Network failures"
|
|
48
|
+
- "Disk full"
|
|
49
|
+
- "Clock skew"
|
|
50
|
+
- "Dependency failures"
|
|
51
|
+
|
|
52
|
+
3_production:
|
|
53
|
+
description: "Run experiments in production"
|
|
54
|
+
rationale: "Staging doesn't capture all production behaviors"
|
|
55
|
+
safeguards: "Start with small blast radius, have abort mechanisms"
|
|
56
|
+
|
|
57
|
+
4_automate:
|
|
58
|
+
description: "Automate experiments to run continuously"
|
|
59
|
+
benefit: "Catch regressions, build institutional knowledge"
|
|
60
|
+
|
|
61
|
+
5_minimize_blast_radius:
|
|
62
|
+
description: "Limit scope of experiments"
|
|
63
|
+
techniques:
|
|
64
|
+
- "Start with single instance"
|
|
65
|
+
- "Target non-critical services first"
|
|
66
|
+
- "Have kill switch ready"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Experiment Design
|
|
70
|
+
|
|
71
|
+
### Experiment Structure
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
experiment_template:
|
|
75
|
+
name: "Descriptive name of experiment"
|
|
76
|
+
|
|
77
|
+
hypothesis:
|
|
78
|
+
description: "What we expect to happen"
|
|
79
|
+
steady_state: "Normal system behavior metrics"
|
|
80
|
+
expected_outcome: "Behavior under failure condition"
|
|
81
|
+
|
|
82
|
+
method:
|
|
83
|
+
type: "Type of failure injection"
|
|
84
|
+
target: "What component to affect"
|
|
85
|
+
magnitude: "How severe the failure"
|
|
86
|
+
duration: "How long to run"
|
|
87
|
+
|
|
88
|
+
abort_conditions:
|
|
89
|
+
- "Error rate > 5%"
|
|
90
|
+
- "User reports of issues"
|
|
91
|
+
- "On-call escalation"
|
|
92
|
+
|
|
93
|
+
rollback:
|
|
94
|
+
automatic: "Conditions that trigger auto-rollback"
|
|
95
|
+
manual: "Steps to manually abort"
|
|
96
|
+
|
|
97
|
+
analysis:
|
|
98
|
+
metrics: "What to measure"
|
|
99
|
+
success_criteria: "How to determine if hypothesis held"
|
|
100
|
+
|
|
101
|
+
example:
|
|
102
|
+
name: "Single API Pod Failure"
|
|
103
|
+
|
|
104
|
+
hypothesis:
|
|
105
|
+
description: "System handles single pod loss gracefully"
|
|
106
|
+
steady_state: "Error rate < 0.1%, latency p99 < 200ms"
|
|
107
|
+
expected_outcome: "Minimal impact during pod reschedule"
|
|
108
|
+
|
|
109
|
+
method:
|
|
110
|
+
type: "Pod termination"
|
|
111
|
+
target: "One random api-server pod"
|
|
112
|
+
magnitude: "Kill 1 of 5 pods"
|
|
113
|
+
duration: "Until pod reschedules (typically < 60s)"
|
|
114
|
+
|
|
115
|
+
abort_conditions:
|
|
116
|
+
- "Error rate > 1%"
|
|
117
|
+
- "Latency p99 > 1s"
|
|
118
|
+
- "Multiple pods affected"
|
|
119
|
+
|
|
120
|
+
rollback:
|
|
121
|
+
automatic: "Kubernetes reschedules pod"
|
|
122
|
+
manual: "kubectl scale deployment/api-server --replicas=5"
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Failure Injection Types
|
|
126
|
+
|
|
127
|
+
```yaml
|
|
128
|
+
infrastructure_failures:
|
|
129
|
+
pod_crash:
|
|
130
|
+
description: "Terminate container/pod"
|
|
131
|
+
tools: "kubectl delete, Chaos Mesh, Litmus"
|
|
132
|
+
simulates: "OOM kill, crash, node failure"
|
|
133
|
+
|
|
134
|
+
node_failure:
|
|
135
|
+
description: "Make node unavailable"
|
|
136
|
+
tools: "Cloud provider, kubectl drain"
|
|
137
|
+
simulates: "Hardware failure, kernel panic"
|
|
138
|
+
|
|
139
|
+
disk_full:
|
|
140
|
+
description: "Fill disk to capacity"
|
|
141
|
+
tools: "dd, Chaos Mesh"
|
|
142
|
+
simulates: "Log explosion, data growth"
|
|
143
|
+
|
|
144
|
+
cpu_stress:
|
|
145
|
+
description: "Consume CPU resources"
|
|
146
|
+
tools: "stress-ng, Chaos Mesh"
|
|
147
|
+
simulates: "Noisy neighbor, infinite loop"
|
|
148
|
+
|
|
149
|
+
memory_stress:
|
|
150
|
+
description: "Consume memory resources"
|
|
151
|
+
tools: "stress-ng, Chaos Mesh"
|
|
152
|
+
simulates: "Memory leak, large dataset"
|
|
153
|
+
|
|
154
|
+
network_failures:
|
|
155
|
+
latency:
|
|
156
|
+
description: "Add network delay"
|
|
157
|
+
tools: "tc, Chaos Mesh, Toxiproxy"
|
|
158
|
+
parameters: "Delay (ms), jitter"
|
|
159
|
+
simulates: "Cross-region calls, congestion"
|
|
160
|
+
|
|
161
|
+
packet_loss:
|
|
162
|
+
description: "Drop network packets"
|
|
163
|
+
tools: "tc, Chaos Mesh"
|
|
164
|
+
parameters: "Loss percentage"
|
|
165
|
+
simulates: "Network congestion, bad connection"
|
|
166
|
+
|
|
167
|
+
partition:
|
|
168
|
+
description: "Block network traffic"
|
|
169
|
+
tools: "iptables, Chaos Mesh"
|
|
170
|
+
parameters: "Source/destination, ports"
|
|
171
|
+
simulates: "Network split, firewall issues"
|
|
172
|
+
|
|
173
|
+
dns_failure:
|
|
174
|
+
description: "DNS resolution fails"
|
|
175
|
+
tools: "Chaos Mesh, custom"
|
|
176
|
+
simulates: "DNS outage, TTL issues"
|
|
177
|
+
|
|
178
|
+
application_failures:
|
|
179
|
+
exception_injection:
|
|
180
|
+
description: "Make code throw exceptions"
|
|
181
|
+
tools: "Feature flags, custom middleware"
|
|
182
|
+
simulates: "Bug, unexpected input"
|
|
183
|
+
|
|
184
|
+
slow_response:
|
|
185
|
+
description: "Delay application responses"
|
|
186
|
+
tools: "Toxiproxy, custom middleware"
|
|
187
|
+
simulates: "Slow dependency, GC pause"
|
|
188
|
+
|
|
189
|
+
partial_failure:
|
|
190
|
+
description: "Some requests fail"
|
|
191
|
+
tools: "Feature flags, service mesh"
|
|
192
|
+
simulates: "Intermittent issues"
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Tools and Frameworks
|
|
196
|
+
|
|
197
|
+
### Chaos Mesh (Kubernetes)
|
|
198
|
+
|
|
199
|
+
```yaml
|
|
200
|
+
chaos_mesh:
|
|
201
|
+
description: "Cloud-native chaos engineering platform"
|
|
202
|
+
|
|
203
|
+
installation: |
|
|
204
|
+
helm repo add chaos-mesh https://charts.chaos-mesh.org
|
|
205
|
+
helm install chaos-mesh chaos-mesh/chaos-mesh \
|
|
206
|
+
--namespace=chaos-mesh \
|
|
207
|
+
--create-namespace
|
|
208
|
+
|
|
209
|
+
pod_chaos: |
|
|
210
|
+
apiVersion: chaos-mesh.org/v1alpha1
|
|
211
|
+
kind: PodChaos
|
|
212
|
+
metadata:
|
|
213
|
+
name: pod-kill-example
|
|
214
|
+
namespace: chaos-mesh
|
|
215
|
+
spec:
|
|
216
|
+
action: pod-kill
|
|
217
|
+
mode: one # Kill one pod
|
|
218
|
+
selector:
|
|
219
|
+
namespaces:
|
|
220
|
+
- production
|
|
221
|
+
labelSelectors:
|
|
222
|
+
app: api-server
|
|
223
|
+
scheduler:
|
|
224
|
+
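cron: "@every 2h" # Chaos Mesh 1.x field; 2.x removed `scheduler` - omit it to run once, or wrap the experiment in a Schedule resource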
|
|
225
|
+
|
|
226
|
+
network_chaos: |
|
|
227
|
+
apiVersion: chaos-mesh.org/v1alpha1
|
|
228
|
+
kind: NetworkChaos
|
|
229
|
+
metadata:
|
|
230
|
+
name: network-delay
|
|
231
|
+
spec:
|
|
232
|
+
action: delay
|
|
233
|
+
mode: all
|
|
234
|
+
selector:
|
|
235
|
+
namespaces:
|
|
236
|
+
- production
|
|
237
|
+
labelSelectors:
|
|
238
|
+
app: api-server
|
|
239
|
+
delay:
|
|
240
|
+
latency: "100ms"
|
|
241
|
+
jitter: "20ms"
|
|
242
|
+
duration: "5m"
|
|
243
|
+
|
|
244
|
+
stress_chaos: |
|
|
245
|
+
apiVersion: chaos-mesh.org/v1alpha1
|
|
246
|
+
kind: StressChaos
|
|
247
|
+
metadata:
|
|
248
|
+
name: cpu-stress
|
|
249
|
+
spec:
|
|
250
|
+
mode: one
|
|
251
|
+
selector:
|
|
252
|
+
labelSelectors:
|
|
253
|
+
app: api-server
|
|
254
|
+
stressors:
|
|
255
|
+
cpu:
|
|
256
|
+
workers: 2
|
|
257
|
+
load: 80
|
|
258
|
+
duration: "5m"
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### Litmus Chaos
|
|
262
|
+
|
|
263
|
+
```yaml
|
|
264
|
+
litmus:
|
|
265
|
+
description: "CNCF chaos engineering project"
|
|
266
|
+
|
|
267
|
+
installation: |
|
|
268
|
+
kubectl apply -f https://litmuschaos.github.io/litmus/litmus-operator-v2.14.0.yaml
|
|
269
|
+
|
|
270
|
+
experiment: |
|
|
271
|
+
apiVersion: litmuschaos.io/v1alpha1
|
|
272
|
+
kind: ChaosEngine
|
|
273
|
+
metadata:
|
|
274
|
+
name: api-chaos
|
|
275
|
+
namespace: production
|
|
276
|
+
spec:
|
|
277
|
+
appinfo:
|
|
278
|
+
appns: production
|
|
279
|
+
applabel: app=api-server
|
|
280
|
+
appkind: deployment
|
|
281
|
+
chaosServiceAccount: litmus-admin
|
|
282
|
+
experiments:
|
|
283
|
+
- name: pod-delete
|
|
284
|
+
spec:
|
|
285
|
+
components:
|
|
286
|
+
env:
|
|
287
|
+
- name: TOTAL_CHAOS_DURATION
|
|
288
|
+
value: '30'
|
|
289
|
+
- name: CHAOS_INTERVAL
|
|
290
|
+
value: '10'
|
|
291
|
+
- name: FORCE
|
|
292
|
+
value: 'false'
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Gremlin
|
|
296
|
+
|
|
297
|
+
```yaml
|
|
298
|
+
gremlin:
|
|
299
|
+
description: "Enterprise chaos engineering platform"
|
|
300
|
+
|
|
301
|
+
features:
|
|
302
|
+
- "Web UI for experiment management"
|
|
303
|
+
- "Attack scheduling"
|
|
304
|
+
- "Team collaboration"
|
|
305
|
+
- "Compliance reporting"
|
|
306
|
+
|
|
307
|
+
attack_types:
|
|
308
|
+
resource:
|
|
309
|
+
- "CPU"
|
|
310
|
+
- "Memory"
|
|
311
|
+
- "Disk"
|
|
312
|
+
- "IO"
|
|
313
|
+
network:
|
|
314
|
+
- "Latency"
|
|
315
|
+
- "Packet loss"
|
|
316
|
+
- "DNS"
|
|
317
|
+
- "Blackhole"
|
|
318
|
+
state:
|
|
319
|
+
- "Process killer"
|
|
320
|
+
- "Shutdown"
|
|
321
|
+
- "Time travel"
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
## Game Days
|
|
325
|
+
|
|
326
|
+
### What is a Game Day?
|
|
327
|
+
|
|
328
|
+
```yaml
|
|
329
|
+
game_day:
|
|
330
|
+
definition: |
|
|
331
|
+
A scheduled event where the team intentionally injects
|
|
332
|
+
failures to test system resilience and incident response.
|
|
333
|
+
|
|
334
|
+
goals:
|
|
335
|
+
- "Validate resilience mechanisms"
|
|
336
|
+
- "Practice incident response"
|
|
337
|
+
- "Identify gaps in monitoring"
|
|
338
|
+
- "Build team confidence"
|
|
339
|
+
- "Improve runbooks"
|
|
340
|
+
|
|
341
|
+
frequency: "Quarterly at minimum"
|
|
342
|
+
duration: "2-4 hours"
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Game Day Planning
|
|
346
|
+
|
|
347
|
+
```yaml
|
|
348
|
+
planning_checklist:
|
|
349
|
+
2_weeks_before:
|
|
350
|
+
- "Define objectives"
|
|
351
|
+
- "Choose scenarios"
|
|
352
|
+
- "Identify participants"
|
|
353
|
+
- "Get stakeholder approval"
|
|
354
|
+
- "Schedule maintenance window (if needed)"
|
|
355
|
+
|
|
356
|
+
1_week_before:
|
|
357
|
+
- "Prepare experiment scripts"
|
|
358
|
+
- "Test abort mechanisms"
|
|
359
|
+
- "Brief participants"
|
|
360
|
+
- "Notify support/ops teams"
|
|
361
|
+
- "Prepare monitoring dashboards"
|
|
362
|
+
|
|
363
|
+
day_before:
|
|
364
|
+
- "Final review of scenarios"
|
|
365
|
+
- "Verify rollback procedures"
|
|
366
|
+
- "Confirm participant availability"
|
|
367
|
+
- "Prepare communication channels"
|
|
368
|
+
|
|
369
|
+
day_of:
|
|
370
|
+
- "Pre-game briefing"
|
|
371
|
+
- "Verify baseline metrics"
|
|
372
|
+
- "Execute experiments"
|
|
373
|
+
- "Document observations"
|
|
374
|
+
- "Post-game debrief"
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
### Game Day Scenarios
|
|
378
|
+
|
|
379
|
+
```yaml
|
|
380
|
+
beginner_scenarios:
|
|
381
|
+
single_pod_failure:
|
|
382
|
+
complexity: "Low"
|
|
383
|
+
blast_radius: "Single service"
|
|
384
|
+
learning: "Kubernetes self-healing"
|
|
385
|
+
|
|
386
|
+
dependency_latency:
|
|
387
|
+
complexity: "Low"
|
|
388
|
+
blast_radius: "Single service"
|
|
389
|
+
learning: "Timeout handling"
|
|
390
|
+
|
|
391
|
+
cache_failure:
|
|
392
|
+
complexity: "Low"
|
|
393
|
+
blast_radius: "Performance"
|
|
394
|
+
learning: "Cache fallback behavior"
|
|
395
|
+
|
|
396
|
+
intermediate_scenarios:
|
|
397
|
+
database_replica_failure:
|
|
398
|
+
complexity: "Medium"
|
|
399
|
+
blast_radius: "Data tier"
|
|
400
|
+
learning: "Database failover"
|
|
401
|
+
|
|
402
|
+
az_failure:
|
|
403
|
+
complexity: "Medium"
|
|
404
|
+
blast_radius: "Multiple services"
|
|
405
|
+
learning: "Cross-AZ resilience"
|
|
406
|
+
|
|
407
|
+
certificate_expiration:
|
|
408
|
+
complexity: "Medium"
|
|
409
|
+
blast_radius: "TLS services"
|
|
410
|
+
learning: "Certificate monitoring"
|
|
411
|
+
|
|
412
|
+
advanced_scenarios:
|
|
413
|
+
region_failover:
|
|
414
|
+
complexity: "High"
|
|
415
|
+
blast_radius: "Entire region"
|
|
416
|
+
learning: "DR procedures"
|
|
417
|
+
|
|
418
|
+
cascading_failure:
|
|
419
|
+
complexity: "High"
|
|
420
|
+
blast_radius: "Multiple services"
|
|
421
|
+
learning: "Circuit breakers"
|
|
422
|
+
|
|
423
|
+
data_corruption:
|
|
424
|
+
complexity: "High"
|
|
425
|
+
blast_radius: "Data integrity"
|
|
426
|
+
learning: "Recovery procedures"
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
### Game Day Execution
|
|
430
|
+
|
|
431
|
+
```yaml
|
|
432
|
+
execution_roles:
|
|
433
|
+
game_master:
|
|
434
|
+
responsibilities:
|
|
435
|
+
- "Run the game day"
|
|
436
|
+
- "Control experiment execution"
|
|
437
|
+
- "Make go/no-go decisions"
|
|
438
|
+
- "Call abort if needed"
|
|
439
|
+
|
|
440
|
+
red_team:
|
|
441
|
+
responsibilities:
|
|
442
|
+
- "Execute failure injections"
|
|
443
|
+
- "Escalate if issues found"
|
|
444
|
+
- "Document experiment results"
|
|
445
|
+
|
|
446
|
+
blue_team:
|
|
447
|
+
responsibilities:
|
|
448
|
+
- "Respond to failures"
|
|
449
|
+
- "Use normal incident response"
|
|
450
|
+
- "Pretend they don't know what's coming"
|
|
451
|
+
|
|
452
|
+
observers:
|
|
453
|
+
responsibilities:
|
|
454
|
+
- "Watch and learn"
|
|
455
|
+
- "Take notes"
|
|
456
|
+
- "Don't interfere"
|
|
457
|
+
|
|
458
|
+
execution_flow:
|
|
459
|
+
1_baseline:
|
|
460
|
+
- "Verify system health"
|
|
461
|
+
- "Record baseline metrics"
|
|
462
|
+
- "Confirm abort mechanisms work"
|
|
463
|
+
|
|
464
|
+
2_experiment:
|
|
465
|
+
- "Announce experiment start"
|
|
466
|
+
- "Inject failure"
|
|
467
|
+
- "Observe system response"
|
|
468
|
+
- "Monitor metrics"
|
|
469
|
+
|
|
470
|
+
3_response:
|
|
471
|
+
- "Blue team detects and responds"
|
|
472
|
+
- "Document response actions"
|
|
473
|
+
- "Note time to detection/mitigation"
|
|
474
|
+
|
|
475
|
+
4_recovery:
|
|
476
|
+
- "Remove failure injection"
|
|
477
|
+
- "Verify system recovery"
|
|
478
|
+
- "Record recovery metrics"
|
|
479
|
+
|
|
480
|
+
5_debrief:
|
|
481
|
+
- "Discuss what happened"
|
|
482
|
+
- "Identify improvements"
|
|
483
|
+
- "Create action items"
|
|
484
|
+
```
|
|
485
|
+
|
|
486
|
+
## Continuous Chaos
|
|
487
|
+
|
|
488
|
+
### Automated Chaos
|
|
489
|
+
|
|
490
|
+
```yaml
|
|
491
|
+
continuous_chaos:
|
|
492
|
+
description: "Run chaos experiments automatically"
|
|
493
|
+
|
|
494
|
+
benefits:
|
|
495
|
+
- "Catch regressions"
|
|
496
|
+
- "Build resilience muscle memory"
|
|
497
|
+
- "Validate every deployment"
|
|
498
|
+
|
|
499
|
+
implementation:
|
|
500
|
+
scheduling: |
|
|
501
|
+
# Run chaos experiments on schedule
|
|
502
|
+
apiVersion: chaos-mesh.org/v1alpha1
|
|
503
|
+
kind: Schedule
|
|
504
|
+
metadata:
|
|
505
|
+
name: continuous-pod-chaos
|
|
506
|
+
spec:
|
|
507
|
+
schedule: "*/30 * * * *" # Every 30 minutes
|
|
508
|
+
type: PodChaos
|
|
509
|
+
podChaos:
|
|
510
|
+
action: pod-kill
|
|
511
|
+
mode: one
|
|
512
|
+
selector:
|
|
513
|
+
labelSelectors:
|
|
514
|
+
chaos-enabled: "true"
|
|
515
|
+
|
|
516
|
+
ci_integration: |
|
|
517
|
+
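# Run chaos as part of CI/CD; make the final delete an always-run cleanup step so a failed health check cannot leave the fault injected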
|
|
518
|
+
- name: Chaos Test
|
|
519
|
+
run: |
|
|
520
|
+
kubectl apply -f chaos-experiment.yaml
|
|
521
|
+
sleep 60
|
|
522
|
+
./verify-system-health.sh
|
|
523
|
+
kubectl delete -f chaos-experiment.yaml
|
|
524
|
+
```
|
|
525
|
+
|
|
526
|
+
### Chaos in Production
|
|
527
|
+
|
|
528
|
+
```yaml
|
|
529
|
+
production_chaos:
|
|
530
|
+
prerequisites:
|
|
531
|
+
- "Solid monitoring and alerting"
|
|
532
|
+
- "Automated rollback mechanisms"
|
|
533
|
+
- "Team buy-in and training"
|
|
534
|
+
- "Management approval"
|
|
535
|
+
|
|
536
|
+
safeguards:
|
|
537
|
+
blast_radius:
|
|
538
|
+
- "Start with single instance"
|
|
539
|
+
- "Limit to percentage of traffic"
|
|
540
|
+
- "Scope to non-critical paths first"
|
|
541
|
+
|
|
542
|
+
abort_conditions:
|
|
543
|
+
- "Automatic abort on SLO breach"
|
|
544
|
+
- "Kill switch for manual abort"
|
|
545
|
+
- "Time-limited experiments"
|
|
546
|
+
|
|
547
|
+
timing:
|
|
548
|
+
- "Business hours initially"
|
|
549
|
+
- "Avoid peak traffic"
|
|
550
|
+
- "Skip during change freezes"
|
|
551
|
+
|
|
552
|
+
gradual_expansion:
|
|
553
|
+
phase_1: "Single pod, non-peak hours"
|
|
554
|
+
phase_2: "Multiple pods, business hours"
|
|
555
|
+
phase_3: "Critical paths, scheduled"
|
|
556
|
+
phase_4: "Random chaos, continuous"
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
## Measuring Success
|
|
560
|
+
|
|
561
|
+
### Chaos Metrics
|
|
562
|
+
|
|
563
|
+
```yaml
|
|
564
|
+
experiment_metrics:
|
|
565
|
+
system_behavior:
|
|
566
|
+
- "Error rate during experiment"
|
|
567
|
+
- "Latency impact"
|
|
568
|
+
- "Recovery time"
|
|
569
|
+
- "Blast radius (affected users)"
|
|
570
|
+
|
|
571
|
+
detection:
|
|
572
|
+
- "Time to alert"
|
|
573
|
+
- "Alert accuracy"
|
|
574
|
+
- "Monitoring coverage"
|
|
575
|
+
|
|
576
|
+
response:
|
|
577
|
+
- "Time to acknowledge"
|
|
578
|
+
- "Time to mitigate"
|
|
579
|
+
- "Runbook effectiveness"
|
|
580
|
+
|
|
581
|
+
program_metrics:
|
|
582
|
+
coverage:
|
|
583
|
+
- "Services with chaos tests"
|
|
584
|
+
- "Failure modes tested"
|
|
585
|
+
- "Critical paths validated"
|
|
586
|
+
|
|
587
|
+
improvement:
|
|
588
|
+
- "Issues found vs production incidents"
|
|
589
|
+
- "MTTR improvement"
|
|
590
|
+
- "Confidence score (team survey)"
|
|
591
|
+
|
|
592
|
+
maturity:
|
|
593
|
+
level_1: "Ad-hoc experiments in staging"
|
|
594
|
+
level_2: "Scheduled game days"
|
|
595
|
+
level_3: "Automated chaos in staging"
|
|
596
|
+
level_4: "Continuous chaos in production"
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
### Reporting
|
|
600
|
+
|
|
601
|
+
```yaml
|
|
602
|
+
experiment_report:
|
|
603
|
+
summary:
|
|
604
|
+
- "Hypothesis"
|
|
605
|
+
- "Result (confirmed/denied)"
|
|
606
|
+
- "Impact observed"
|
|
607
|
+
|
|
608
|
+
metrics:
|
|
609
|
+
- "Error rate: baseline vs experiment"
|
|
610
|
+
- "Latency: baseline vs experiment"
|
|
611
|
+
- "Recovery time"
|
|
612
|
+
|
|
613
|
+
findings:
|
|
614
|
+
- "What worked well"
|
|
615
|
+
- "What didn't work"
|
|
616
|
+
- "Unexpected behaviors"
|
|
617
|
+
|
|
618
|
+
action_items:
|
|
619
|
+
- "Improvements needed"
|
|
620
|
+
- "Monitoring gaps"
|
|
621
|
+
- "Runbook updates"
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
## Common Pitfalls
|
|
625
|
+
|
|
626
|
+
```yaml
|
|
627
|
+
pitfall_no_hypothesis:
|
|
628
|
+
problem: "Breaking things without expected outcome"
|
|
629
|
+
impact: "Can't measure success, no learning"
|
|
630
|
+
solution: "Always start with clear hypothesis"
|
|
631
|
+
|
|
632
|
+
pitfall_big_blast_radius:
|
|
633
|
+
problem: "Too much failure at once"
|
|
634
|
+
impact: "Real outage, hard to analyze"
|
|
635
|
+
solution: "Start small, expand gradually"
|
|
636
|
+
|
|
637
|
+
pitfall_no_abort:
|
|
638
|
+
problem: "Can't stop experiment"
|
|
639
|
+
impact: "Extended outage"
|
|
640
|
+
solution: "Always have kill switch ready"
|
|
641
|
+
|
|
642
|
+
pitfall_no_follow_up:
|
|
643
|
+
problem: "Find issues, don't fix them"
|
|
644
|
+
impact: "Wasted effort"
|
|
645
|
+
solution: "Track findings to resolution"
|
|
646
|
+
|
|
647
|
+
pitfall_chaos_theater:
|
|
648
|
+
problem: "Going through motions"
|
|
649
|
+
impact: "False confidence"
|
|
650
|
+
solution: "Meaningful experiments, honest analysis"
|
|
651
|
+
```
|