@wipal/agent-team 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/.claude/rules/common/general-rules.md +141 -0
  2. package/.claude/rules/lessons/lessons.md +91 -0
  3. package/.claude/rules/role-rules/dev-fe-rules.md +146 -0
  4. package/.claude/rules/role-rules/sa-rules.md +226 -0
  5. package/.claude/skills/SKILL-INDEX.md +299 -0
  6. package/.claude/skills/community/security-validator/SKILL.md +392 -0
  7. package/.claude/skills/core/agent-creation/SKILL.md +338 -0
  8. package/.claude/skills/core/code-review/SKILL.md +154 -0
  9. package/.claude/skills/core/git-automation/SKILL.md +93 -0
  10. package/.claude/skills/core/retrospect-work/SKILL.md +172 -0
  11. package/.claude/skills/domain/architecture/adr-writing/SKILL.md +254 -0
  12. package/.claude/skills/domain/architecture/adr-writing/references/adr-best-practices.md +257 -0
  13. package/.claude/skills/domain/architecture/adr-writing/references/adr-examples.md +246 -0
  14. package/.claude/skills/domain/architecture/adr-writing/references/adr-template.md +160 -0
  15. package/.claude/skills/domain/architecture/architecture-patterns/SKILL.md +316 -0
  16. package/.claude/skills/domain/architecture/architecture-patterns/references/event-driven.md +393 -0
  17. package/.claude/skills/domain/architecture/architecture-patterns/references/microservices.md +315 -0
  18. package/.claude/skills/domain/architecture/architecture-patterns/references/monolith.md +321 -0
  19. package/.claude/skills/domain/architecture/architecture-patterns/references/serverless.md +457 -0
  20. package/.claude/skills/domain/architecture/performance-engineering/SKILL.md +227 -0
  21. package/.claude/skills/domain/architecture/performance-engineering/references/benchmarking.md +336 -0
  22. package/.claude/skills/domain/architecture/performance-engineering/references/caching-strategies.md +284 -0
  23. package/.claude/skills/domain/architecture/performance-engineering/references/optimization.md +298 -0
  24. package/.claude/skills/domain/architecture/security-architecture/SKILL.md +206 -0
  25. package/.claude/skills/domain/architecture/security-architecture/references/auth-patterns.md +209 -0
  26. package/.claude/skills/domain/architecture/security-architecture/references/compliance.md +246 -0
  27. package/.claude/skills/domain/architecture/security-architecture/references/threat-modeling.md +219 -0
  28. package/.claude/skills/domain/architecture/system-design/SKILL.md +227 -0
  29. package/.claude/skills/domain/architecture/system-design/references/distributed-systems.md +231 -0
  30. package/.claude/skills/domain/architecture/system-design/references/resilience.md +344 -0
  31. package/.claude/skills/domain/architecture/system-design/references/scalability.md +303 -0
  32. package/.claude/skills/domain/architecture/tech-selection/SKILL.md +192 -0
  33. package/.claude/skills/domain/architecture/tech-selection/references/build-vs-buy.md +258 -0
  34. package/.claude/skills/domain/architecture/tech-selection/references/evaluation-framework.md +203 -0
  35. package/.claude/skills/domain/architecture/tech-selection/references/tech-radar.md +257 -0
  36. package/.claude/skills/domain/backend/api-design/SKILL.md +121 -0
  37. package/.claude/skills/domain/backend/database-design/SKILL.md +156 -0
  38. package/.claude/skills/domain/backend/performance-be/SKILL.md +210 -0
  39. package/.claude/skills/domain/backend/security/SKILL.md +138 -0
  40. package/.claude/skills/domain/backend/testing-be/SKILL.md +203 -0
  41. package/.claude/skills/domain/devops/ci-cd/SKILL.md +188 -0
  42. package/.claude/skills/domain/devops/containerization/SKILL.md +177 -0
  43. package/.claude/skills/domain/devops/deployment/SKILL.md +198 -0
  44. package/.claude/skills/domain/devops/infrastructure-as-code/SKILL.md +178 -0
  45. package/.claude/skills/domain/devops/monitoring/SKILL.md +163 -0
  46. package/.claude/skills/domain/frontend/accessibility/SKILL.md +179 -0
  47. package/.claude/skills/domain/frontend/frontend-design/SKILL.md +138 -0
  48. package/.claude/skills/domain/frontend/performance-fe/SKILL.md +195 -0
  49. package/.claude/skills/domain/frontend/state-management/SKILL.md +190 -0
  50. package/.claude/skills/domain/frontend/testing-fe/SKILL.md +193 -0
  51. package/.claude/skills/domain/product/requirements-gathering/SKILL.md +136 -0
  52. package/.claude/skills/domain/product/roadmap-planning/SKILL.md +169 -0
  53. package/.claude/skills/domain/product/sprint-planning/SKILL.md +151 -0
  54. package/.claude/skills/domain/product/stakeholder-communication/SKILL.md +162 -0
  55. package/.claude/skills/domain/product/user-stories/SKILL.md +141 -0
  56. package/.claude/skills/domain/quality/bug-reporting/SKILL.md +150 -0
  57. package/.claude/skills/domain/quality/regression-testing/SKILL.md +178 -0
  58. package/.claude/skills/domain/quality/test-automation/SKILL.md +185 -0
  59. package/.claude/skills/domain/quality/test-planning/SKILL.md +177 -0
  60. package/.claude/skills/leadership/code-review-advanced/SKILL.md +167 -0
  61. package/.claude/skills/leadership/mentoring/SKILL.md +151 -0
  62. package/.claude/skills/leadership/technical-debt/SKILL.md +166 -0
  63. package/.claude/skills/leadership/technical-decision/SKILL.md +160 -0
  64. package/.claude/skills/security-reports/.gitkeep +0 -0
  65. package/.claude/skills/skills-registry.yaml +441 -0
  66. package/README.md +232 -0
  67. package/bin/agent-team.js +107 -0
  68. package/package.json +51 -0
  69. package/src/commands/add.js +227 -0
  70. package/src/commands/init.js +136 -0
  71. package/src/commands/list.js +66 -0
  72. package/src/commands/remove.js +71 -0
  73. package/src/commands/switch.js +53 -0
  74. package/src/index.js +11 -0
  75. package/src/interactive/prompts.js +153 -0
  76. package/src/server/api/agents.js +150 -0
  77. package/src/server/api/roles.js +97 -0
  78. package/src/server/api/skills.js +79 -0
  79. package/src/server/index.js +78 -0
  80. package/src/ui/agents.html +174 -0
  81. package/src/ui/css/styles.css +470 -0
  82. package/src/ui/index.html +107 -0
  83. package/src/ui/roles.html +371 -0
  84. package/src/ui/skills.html +332 -0
  85. package/src/utils/file-utils.js +193 -0
  86. package/src/utils/skill-resolver.js +594 -0
  87. package/src/utils/skill-scanner.js +154 -0
  88. package/templates/CLAUDE.md.tmpl +42 -0
  89. package/templates/knowledge.md.tmpl +31 -0
@@ -0,0 +1,231 @@
1
+ # Distributed Systems Fundamentals
2
+
3
+ ## Core Concepts
4
+
5
+ ### What is a Distributed System?
6
+ A distributed system is a collection of independent computers that appears to its users as a single coherent system.
7
+
8
+ ### Why Distributed Systems?
9
+ - **Scalability** - Handle more load than a single machine
10
+ - **Fault Tolerance** - Continue operating despite failures
11
+ - **Geographic Distribution** - Serve users from nearby locations
12
+ - **Cost** - Commodity hardware vs specialized machines
13
+
14
+ ## CAP Theorem Deep Dive
15
+
16
+ ### The Trade-off
17
+ ```
18
+ Consistency (C)
19
+
20
+ ┌────┴────┐
21
+ │ │
22
+ Availability Partition
23
+ (A) Tolerance (P)
24
+ │ │
25
+ └────┬────┘
26
+
27
+ ```
28
+
29
+ ### CP Systems (Consistency + Partition Tolerance)
30
+ - Block requests during partition to maintain consistency
31
+ - Examples: MongoDB, Redis, HBase, BigTable
32
+ - Use when: Financial systems, inventory management
33
+
34
+ ### AP Systems (Availability + Partition Tolerance)
35
+ - Continue serving during partition, may return stale data
36
+ - Examples: Cassandra, DynamoDB, CouchDB, Riak
37
+ - Use when: Social media, content delivery, shopping carts
38
+
39
+ ### CA Systems (Consistency + Availability)
40
+ - Only possible without partitions (single datacenter, perfect network)
41
+ - Examples: Traditional RDBMS (PostgreSQL, MySQL in single-node)
42
+ - Reality: Any distributed system must handle partitions
43
+
44
+ ## Consistency Models
45
+
46
+ ### Strong Consistency
47
+ ```
48
+ Client A: Write(X=1) ──▶ All replicas updated ──▶ Ack
49
+
50
+ Client B: ──────────────────────────────────▶│ Read(X=1)
51
+
52
+ (Guaranteed to see X=1)
53
+ ```
54
+ - All reads see the most recent write
55
+ - Implementation: Two-Phase Commit, Paxos, Raft
56
+ - Trade-off: Higher latency, lower availability
57
+
58
+ ### Eventual Consistency
59
+ ```
60
+ Client A: Write(X=1) ──▶ Ack (before replication)
61
+
62
+ Client B: ──────────────────────────────────▶│ Read(X=0 or 1)
63
+
64
+ (May see old or new value)
65
+
66
+ Eventually (after replication):
67
+ Client C: ──────────────────────────────────▶│ Read(X=1)
68
+ ```
69
+ - Replicas converge to same value eventually
70
+ - Implementation: Gossip protocols, anti-entropy
71
+ - Trade-off: Stale reads possible
72
+
73
+ ### Causal Consistency
74
+ ```
75
+ Client A: Write(X=1) ──▶
76
+
77
+ Client A: ──────────────┼──▶ Write(Y=2) [depends on X=1]
78
+ │ │
79
+ Client B: ──────────────┼───────────┼──▶ Read(Y=2)
80
+ │ │
81
+ └───────────┴──▶ Must also see X=1
82
+ ```
83
+ - Preserves causal relationships
84
+ - Uses vector clocks or version vectors
85
+ - Trade-off: Complex implementation
86
+
87
+ ## Distributed Algorithms
88
+
89
+ ### Leader Election
90
+ ```
91
+ Use cases:
92
+ - Single point of coordination
93
+ - Transaction ordering
94
+ - Configuration management
95
+
96
+ Algorithms:
97
+ - Bully Algorithm
98
+ - Raft Leader Election
99
+ - ZooKeeper/ZAB
100
+ ```
101
+
102
+ ### Consensus
103
+ ```
104
+ Use cases:
105
+ - Replicated state machines
106
+ - Configuration changes
107
+ - Membership changes
108
+
109
+ Algorithms:
110
+ - Paxos (Classic, Multi-Paxos)
111
+ - Raft
112
+ - ZAB (ZooKeeper Atomic Broadcast)
113
+ - PBFT (Byzantine fault tolerant)
114
+ ```
115
+
116
+ ### Gossip Protocol
117
+ ```
118
+ Use cases:
119
+ - Membership
120
+ - Data replication
121
+ - Failure detection
122
+
123
+ How it works:
124
+ 1. Each node picks random peers
125
+ 2. Exchanges state periodically
126
+ 3. Information spreads exponentially
127
+ 4. Eventually all nodes converge
128
+ ```
129
+
130
+ ## Time and Ordering
131
+
132
+ ### Logical Clocks (Lamport Timestamps)
133
+ ```
134
+ Rules:
135
+ 1. Before local event: increment counter
136
+ 2. Send message: include counter
137
+ 3. Receive message: counter = max(local, received) + 1
138
+
139
+ Use: Ordering events consistently with causality (cannot detect concurrent events)
140
+ ```
141
+
142
+ ### Vector Clocks
143
+ ```
144
+ Each node maintains vector of counters:
145
+ - V[i] = count of events at node i
146
+ - V[i]++ for local event
147
+ - On receive at node i: V[j] = max(V[j], received[j]) for every j, then V[i]++
148
+
149
+ Use: Detect concurrent vs ordered events
150
+ ```
151
+
152
+ ### TrueTime (Spanner)
153
+ ```
154
+ Google's approach to global ordering:
155
+ - GPS + atomic clocks
156
+ - Returns interval [earliest, latest]
157
+ - Wait out uncertainty for ordering
158
+
159
+ Use: Globally consistent timestamps
160
+ ```
161
+
162
+ ## Failure Detection
163
+
164
+ ### Heartbeat-based
165
+ ```
166
+ ┌───────┐ ping ┌───────┐
167
+ │ Node A│────────▶│ Node B│
168
+ │ │◀────────│ │
169
+ └───────┘ pong └───────┘
170
+
171
+ Failure suspected if:
172
+ - No pong after N heartbeats
173
+ - Adaptive timeout based on network conditions
174
+ ```
175
+
176
+ ### Phi Accrual Failure Detector
177
+ ```
178
+ Instead of binary alive/dead:
179
+ - Outputs suspicion level φ
180
+ - φ = -log10(P_later(t)), where P_later(t) = probability a heartbeat still arrives more than t after the last one
181
+ - Higher φ = more likely failed
182
+
183
+ Advantages:
184
+ - Adapts to network conditions
185
+ - Configurable thresholds
186
+ - Used in Cassandra, Akka
187
+ ```
188
+
189
+ ## Common Patterns
190
+
191
+ ### Two-Phase Commit (2PC)
192
+ ```
193
+ Phase 1 (Prepare):
194
+ Coordinator ──▶ "Can you commit?" ──▶ Participants
195
+ Participants ──▶ "Yes/No" ──▶ Coordinator
196
+
197
+ Phase 2 (Commit/Abort):
198
+ If all Yes:
199
+ Coordinator ──▶ "Commit" ──▶ Participants
200
+ If any No:
201
+ Coordinator ──▶ "Abort" ──▶ Participants
202
+
203
+ Issues:
204
+ - Blocking (if coordinator fails)
205
+ - Slow (two round trips)
206
+ ```
207
+
208
+ ### Saga Pattern
209
+ ```
210
+ Distributed transaction as sequence of local transactions:
211
+
212
+ Order Service Payment Service Inventory Service
213
+ │ │ │
214
+ ├──▶ Create Order │ │
215
+ │ │ │
216
+ ├─────────────────┼──▶ Process Payment │
217
+ │ │ │
218
+ ├─────────────────┼────────────────────┼──▶ Reserve Item
219
+ │ │ │
220
+ └◀────────────────┴────────────────────┴──▶ Complete
221
+
222
+ If any step fails:
223
+ - Execute compensating transactions for the already-completed steps, in reverse order
224
+ ```
225
+
226
+ ## Further Reading
227
+
228
+ - "Designing Data-Intensive Applications" - Martin Kleppmann
229
+ - "Distributed Systems: Principles and Paradigms" - Tanenbaum & van Steen
230
+ - Jepsen.io - Analysis of distributed systems safety
231
+ - Paper: "Time, Clocks, and the Ordering of Events in a Distributed System" - Lamport
@@ -0,0 +1,344 @@
1
+ # Resilience Patterns
2
+
3
+ ## What is Resilience?
4
+ The ability of a system to provide and maintain an acceptable level of service in the face of faults and challenges to normal operation.
5
+
6
+ ## Failure Types
7
+
8
+ ### Hardware Failures
9
+ ```
10
+ - Disk failure (1-5% annual failure rate)
11
+ - Memory errors (ECC can correct single-bit)
12
+ - Network failures (cables, switches)
13
+ - Power failures
14
+
15
+ Mitigation:
16
+ - Redundancy (RAID, multiple NICs)
17
+ - ECC memory
18
+ - UPS/generators
19
+ - Multi-AZ/multi-region
20
+ ```
21
+
22
+ ### Software Failures
23
+ ```
24
+ - Bugs in application code
25
+ - Memory leaks
26
+ - Deadlocks
27
+ - Configuration errors
28
+ - Dependency failures
29
+
30
+ Mitigation:
31
+ - Testing (unit, integration, chaos)
32
+ - Monitoring and alerting
33
+ - Graceful degradation
34
+ - Circuit breakers
35
+ ```
36
+
37
+ ### Network Failures
38
+ ```
39
+ - Packet loss
40
+ - Latency spikes
41
+ - Network partitions
42
+ - DNS failures
43
+
44
+ Mitigation:
45
+ - Retries with backoff
46
+ - Timeouts
47
+ - Circuit breakers
48
+ - Multiple endpoints
49
+ ```
50
+
51
+ ## Core Patterns
52
+
53
+ ### 1. Circuit Breaker
54
+ ```
55
+ States:
56
+ ┌───────────┐
57
+ │ CLOSED │ ← Normal operation, requests pass through
58
+ └─────┬─────┘
59
+ │ failures exceed threshold
60
+
61
+ ┌───────────┐
62
+ │ OPEN │ ← Requests fail fast, no downstream calls
63
+ └─────┬─────┘
64
+ │ timeout expires
65
+
66
+ ┌───────────┐
67
+ │HALF-OPEN │ ← Limited requests to test recovery
68
+ └───────────┘
69
+
70
+ ├─ success → CLOSED
71
+ └─ failure → OPEN
72
+ ```
73
+ Implementation:
74
+ ```python
75
+ class CircuitBreaker:
76
+ def __init__(self, threshold=5, timeout=60):
77
+ self.failures = 0
78
+ self.threshold = threshold
79
+ self.timeout = timeout
80
+ self.state = "CLOSED"
81
+ self.last_failure = None
82
+
83
+ def call(self, func):
84
+ if self.state == "OPEN":
85
+ if time.now() - self.last_failure > self.timeout:
86
+ self.state = "HALF-OPEN"
87
+ else:
88
+ raise CircuitOpenError()
89
+
90
+ try:
91
+ result = func()
92
+ self.on_success()
93
+ return result
94
+ except Exception as e:
95
+ self.on_failure()
96
+ raise
97
+ ```
98
+
99
+
100
+ ### 2. Retry with Exponential Backoff
101
+ ```
102
+ Pattern:
103
+ Attempt 1: ──────▶ Fail
104
+ wait 1s
105
+ Attempt 2: ──────▶ Fail
106
+ wait 2s
107
+ Attempt 3: ──────▶ Fail
108
+ wait 4s
109
+ Attempt 4: ──────▶ Success!
110
+
111
+ Jitter:
112
+ Add randomness to prevent thundering herd:
113
+ wait_time = base_delay * 2^(attempt - 1) + random_jitter (attempt = 1, 2, 3, ...)
114
+ ```
115
+
116
+ ### 3. Timeout
117
+ ```
118
+ Types:
119
+ 1. Connection timeout - Time to establish connection
120
+ 2. Read timeout - Time waiting for response
121
+ 3. Write timeout - Time to send request
122
+
123
+ Guidelines:
124
+ - Set aggressive timeouts (fail fast)
125
+ - Different timeouts for different operations
126
+ - Monitor actual latencies to tune
127
+ - Consider SLAs when setting
128
+ ```
129
+
130
+ ### 4. Bulkhead
131
+ ```
132
+ Isolate failures to prevent cascade:
133
+
134
+ ┌─────────────────────────────────────┐
135
+ │ Application │
136
+ ├───────────┬───────────┬─────────────┤
137
+ │ Bulkhead 1│ Bulkhead 2│ Bulkhead 3 │
138
+ │ (Users) │ (Orders) │ (Payments) │
139
+ │ ┌───────┐ │ ┌───────┐ │ ┌───────┐ │
140
+ │ │Pool:10│ │ │Pool:10│ │ │Pool:5 │ │
141
+ │ └───────┘ │ └───────┘ │ └───────┘ │
142
+ └───────────┴───────────┴─────────────┘
143
+
144
+ If Payments fails, Users and Orders still work.
145
+ ```
146
+
147
+ ### 5. Fallback
148
+ ```
149
+ When primary fails, provide alternative:
150
+
151
+ 1. Cache Fallback:
152
+ try:
153
+ data = api.get_data()
154
+ except:
155
+ data = cache.get("data_key") # Stale but better than nothing
156
+
157
+ 2. Default Value:
158
+ try:
159
+ recommendations = ml_service.get_recommendations()
160
+ except:
161
+ recommendations = get_popular_items() # Simpler fallback
162
+
163
+ 3. Graceful Degradation:
164
+ try:
165
+ full_page = render_with_recommendations()
166
+ except:
167
+ full_page = render_basic_page() # Simpler version
168
+ ```
169
+
170
+ ### 6. Rate Limiting
171
+ ```
172
+ Algorithms:
173
+
174
+ 1. Token Bucket:
175
+ - Bucket has N tokens
176
+ - Each request consumes 1 token
177
+ - Refills at rate R/second
178
+ - Reject if empty
179
+
180
+ 2. Sliding Window:
181
+ - Track requests in time window
182
+ - Count requests in last N seconds
183
+ - Reject if over limit
184
+
185
+ 3. Leaky Bucket:
186
+ - Queue with constant processing rate
187
+ - Requests queue up
188
+ - Reject if queue full
189
+
190
+ Implementation locations:
191
+ - API Gateway (global)
192
+ - Load balancer (per-service)
193
+ - Application (per-user)
194
+ ```
195
+
196
+ ## Chaos Engineering
197
+
198
+ ### Principles
199
+ ```
200
+ 1. Build a hypothesis around steady state
201
+ 2. Vary real-world events
202
+ 3. Run experiments in production
203
+ 4. Automate experiments to run continuously
204
+ 5. Minimize blast radius
205
+ ```
206
+
207
+ ### Common Experiments
208
+ ```
209
+ 1. Terminate instances
210
+ 2. Add latency to network
211
+ 3. Fail dependencies
212
+ 4. Exhaust resources (CPU, memory, disk)
213
+ 5. Corrupt messages
214
+ 6. Manipulate clocks
215
+ ```
216
+
217
+ ### Tools
218
+ ```
219
+ - Chaos Monkey (Netflix) - Random instance termination
220
+ - Gremlin - Comprehensive chaos platform
221
+ - Chaos Mesh - Kubernetes chaos
222
+ - Litmus - Cloud-native chaos engineering
223
+ - AWS FIS - Fault Injection Simulator
224
+ ```
225
+
226
+ ## Health Checks
227
+
228
+ ### Liveness vs Readiness
229
+ ```
230
+ Liveness Probe:
231
+ - "Is this process alive?"
232
+ - If fails, restart container
233
+ - Should be lightweight
234
+
235
+ Readiness Probe:
236
+ - "Is this service ready for traffic?"
237
+ - If fails, remove from load balancer
238
+ - Can check dependencies
239
+
240
+ Example:
241
+ /health/live → {"status": "ok"}
242
+ /health/ready → {"status": "ok", "checks": {"db": "ok", "cache": "ok"}}
243
+ ```
244
+
245
+ ### Health Check Endpoints
246
+ ```
247
+ Shallow Health Check:
248
+ - Only checks if process is running
249
+ - Fast, simple
250
+
251
+ Deep Health Check:
252
+ - Checks all dependencies
253
+ - Database connectivity
254
+ - Cache availability
255
+ - External service reachability
256
+ - Can be slow
257
+
258
+ Best Practice:
259
+ - Have both
260
+ - Use shallow for liveness
261
+ - Use deep for readiness/monitoring
262
+ ```
263
+
264
+ ## Monitoring & Observability
265
+
266
+ ### Three Pillars
267
+ ```
268
+ 1. Metrics (Numbers)
269
+ - Request rate, error rate, latency
270
+ - CPU, memory, disk usage
271
+ - Business metrics
272
+
273
+ 2. Logs (Events)
274
+ - Structured logging
275
+ - Correlation IDs
276
+ - Error context
277
+
278
+ 3. Traces (Journeys)
279
+ - Request flow across services
280
+ - Latency breakdown
281
+ - Dependency mapping
282
+ ```
283
+
284
+ ### RED Method (for services)
285
+ ```
286
+ - Rate: Requests per second
287
+ - Errors: Failed requests per second
288
+ - Duration: Distribution of latencies (p50, p95, p99)
289
+ ```
290
+
291
+ ### USE Method (for resources)
292
+ ```
293
+ - Utilization: % time resource busy
294
+ - Saturation: Queue length, waiting
295
+ - Errors: Error events count
296
+ ```
297
+
298
+ ## Incident Response
299
+
300
+ ### Incident Lifecycle
301
+ ```
302
+ 1. Detection → 2. Triage → 3. Investigation
303
+
304
+ 6. Postmortem ← 5. Resolution ← 4. Mitigation
305
+ ```
306
+
307
+ ### Runbooks
308
+ ```
309
+ For each alert, document:
310
+ 1. What the alert means
311
+ 2. Impact assessment
312
+ 3. Investigation steps
313
+ 4. Mitigation actions
314
+ 5. Escalation path
315
+ 6. Communication templates
316
+ ```
317
+
318
+ ### Postmortem Template
319
+ ```
320
+ # Incident: [Title]
321
+
322
+ ## Summary
323
+ - What happened?
324
+ - Impact (users affected, duration)
325
+
326
+ ## Timeline
327
+ - [Time] Alert triggered
328
+ - [Time] Investigation started
329
+ - [Time] Root cause identified
330
+ - [Time] Fix deployed
331
+
332
+ ## Root Cause
333
+ - Technical cause
334
+ - Process cause
335
+
336
+ ## Action Items
337
+ - [ ] Prevent recurrence
338
+ - [ ] Improve detection
339
+ - [ ] Update runbooks
340
+
341
+ ## Lessons Learned
342
+ - What went well
343
+ - What could be improved
344
+ ```