@sylix/coworker 2.0.10 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/dist/commands/slash/config.d.ts.map +1 -1
  2. package/dist/commands/slash/config.js +23 -5
  3. package/dist/commands/slash/config.js.map +1 -1
  4. package/dist/commands/slash/todo.js +1 -1
  5. package/dist/commands/slash/todo.js.map +1 -1
  6. package/dist/core/CoWorkerAgent.d.ts.map +1 -1
  7. package/dist/core/CoWorkerAgent.js +6 -3
  8. package/dist/core/CoWorkerAgent.js.map +1 -1
  9. package/dist/permissions/PermissionInterceptor.js +1 -1
  10. package/dist/permissions/PermissionInterceptor.js.map +1 -1
  11. package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
  12. package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
  13. package/dist/skills/defaults/ai-ml/rag.md +276 -0
  14. package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
  15. package/dist/skills/defaults/backend-development/api-design.md +285 -0
  16. package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
  17. package/dist/skills/defaults/backend-development/async-python.md +237 -0
  18. package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
  19. package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
  20. package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
  21. package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
  22. package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
  23. package/dist/skills/defaults/backend-development/database-design.md +305 -0
  24. package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
  25. package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
  26. package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
  27. package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
  28. package/dist/skills/defaults/backend-development/fastapi.md +309 -0
  29. package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
  30. package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
  31. package/dist/skills/defaults/backend-development/microservices.md +284 -0
  32. package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
  33. package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
  34. package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
  35. package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
  36. package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
  37. package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
  38. package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
  39. package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
  40. package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
  41. package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
  42. package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
  43. package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
  44. package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
  45. package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
  46. package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
  47. package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
  48. package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
  49. package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
  50. package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
  51. package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
  52. package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
  53. package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
  54. package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
  55. package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
  56. package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
  57. package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
  58. package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
  59. package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
  60. package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
  61. package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
  62. package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
  63. package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
  64. package/dist/skills/defaults/data-engineering/airflow.md +519 -0
  65. package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
  66. package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
  67. package/dist/skills/defaults/data-engineering/dbt.md +556 -0
  68. package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
  69. package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
  70. package/dist/skills/defaults/data-engineering/spark.md +411 -0
  71. package/dist/skills/defaults/database/postgresql.md +202 -0
  72. package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
  73. package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
  74. package/dist/skills/defaults/devops/changelog-automation.md +580 -0
  75. package/dist/skills/defaults/devops/cicd.md +314 -0
  76. package/dist/skills/defaults/devops/cloud.md +263 -0
  77. package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
  78. package/dist/skills/defaults/devops/cost-optimization.md +295 -0
  79. package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
  80. package/dist/skills/defaults/devops/docker.md +281 -0
  81. package/dist/skills/defaults/devops/git-workflows.md +205 -0
  82. package/dist/skills/defaults/devops/github-actions.md +311 -0
  83. package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
  84. package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
  85. package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
  86. package/dist/skills/defaults/devops/kubernetes.md +339 -0
  87. package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
  88. package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
  89. package/dist/skills/defaults/devops/observability.md +243 -0
  90. package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
  91. package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
  92. package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
  93. package/dist/skills/defaults/devops/secrets-management.md +341 -0
  94. package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
  95. package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
  96. package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
  97. package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
  98. package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
  99. package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
  100. package/dist/skills/defaults/frontend/interaction-design.md +327 -0
  101. package/dist/skills/defaults/frontend/javascript.md +311 -0
  102. package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
  103. package/dist/skills/defaults/frontend/react-native-design.md +440 -0
  104. package/dist/skills/defaults/frontend/react.md +345 -0
  105. package/dist/skills/defaults/frontend/responsive-design.md +472 -0
  106. package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
  107. package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
  108. package/dist/skills/defaults/frontend/typescript.md +334 -0
  109. package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
  110. package/dist/skills/defaults/frontend/web-component-design.md +279 -0
  111. package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
  112. package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
  113. package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
  114. package/dist/skills/defaults/kubernetes/gitops.md +280 -0
  115. package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
  116. package/dist/skills/defaults/kubernetes/helm.md +343 -0
  117. package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
  118. package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
  119. package/dist/skills/defaults/kubernetes/manifests.md +330 -0
  120. package/dist/skills/defaults/kubernetes/security.md +337 -0
  121. package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
  122. package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
  123. package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
  124. package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
  125. package/dist/skills/defaults/llm-application/langchain.md +259 -0
  126. package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
  127. package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
  128. package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
  129. package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
  130. package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
  131. package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
  132. package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
  133. package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
  134. package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
  135. package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
  136. package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
  137. package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
  138. package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
  139. package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
  140. package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
  141. package/dist/skills/defaults/security/auditor.md +168 -0
  142. package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
  143. package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
  144. package/dist/skills/defaults/security/mtls-configuration.md +349 -0
  145. package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
  146. package/dist/skills/defaults/security/sast-configuration.md +182 -0
  147. package/dist/skills/defaults/security/security.md +313 -0
  148. package/dist/skills/defaults/security/stride-analysis.md +273 -0
  149. package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
  150. package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
  151. package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
  152. package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
  153. package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
  154. package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
  155. package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
  156. package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
  157. package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
  158. package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
  159. package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
  160. package/dist/skills/defaults/testing/testing.md +332 -0
  161. package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
  162. package/dist/skills/defaults/workflows/track-management.md +592 -0
  163. package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
  164. package/dist/skills/index.d.ts +11 -0
  165. package/dist/skills/index.d.ts.map +1 -0
  166. package/dist/skills/index.js +129 -0
  167. package/dist/skills/index.js.map +1 -0
  168. package/dist/utils/character.js +6 -9
  169. package/dist/utils/character.js.map +1 -1
  170. package/dist/utils/contextManager.js +3 -7
  171. package/dist/utils/contextManager.js.map +1 -1
  172. package/dist/utils/inputbar.d.ts.map +1 -1
  173. package/dist/utils/inputbar.js +8 -1
  174. package/dist/utils/inputbar.js.map +1 -1
  175. package/dist/utils/output.d.ts.map +1 -1
  176. package/dist/utils/output.js +3 -35
  177. package/dist/utils/output.js.map +1 -1
  178. package/package.json +1 -1
@@ -0,0 +1,396 @@
1
+ ---
2
+ name: postmortem-writing
3
+ description: Write effective blameless postmortems with root cause analysis, timelines, and action items. Use when conducting incident reviews, writing postmortem documents, or improving incident response processes.
4
+ ---
5
+
6
+ # Postmortem Writing
7
+
8
+ Comprehensive guide to writing effective, blameless postmortems that drive organizational learning and prevent incident recurrence.
9
+
10
+ ## When to Use This Skill
11
+
12
+ - Conducting post-incident reviews
13
+ - Writing postmortem documents
14
+ - Facilitating blameless postmortem meetings
15
+ - Identifying root causes and contributing factors
16
+ - Creating actionable follow-up items
17
+ - Building organizational learning culture
18
+
19
+ ## Core Concepts
20
+
21
+ ### 1. Blameless Culture
22
+
23
+ | Blame-Focused | Blameless |
24
+ | ------------------------ | --------------------------------- |
25
+ | "Who caused this?" | "What conditions allowed this?" |
26
+ | "Someone made a mistake" | "The system allowed this mistake" |
27
+ | Punish individuals | Improve systems |
28
+ | Hide information | Share learnings |
29
+ | Fear of speaking up | Psychological safety |
30
+
31
+ ### 2. Postmortem Triggers
32
+
33
+ - SEV1 or SEV2 incidents
34
+ - Customer-facing outages > 15 minutes
35
+ - Data loss or security incidents
36
+ - Near-misses that could have been severe
37
+ - Novel failure modes
38
+ - Incidents requiring unusual intervention
39
+
40
+ ## Quick Start
41
+
42
+ ### Postmortem Timeline
43
+
44
+ ```
45
+ Day 0: Incident occurs
46
+ Day 1-2: Draft postmortem document
47
+ Day 3-5: Postmortem meeting
48
+ Day 5-7: Finalize document, create tickets
49
+ Week 2+: Action item completion
50
+ Quarterly: Review patterns across incidents
51
+ ```
52
+
53
+ ## Templates
54
+
55
+ ### Template 1: Standard Postmortem
56
+
57
+ ```markdown
58
+ # Postmortem: [Incident Title]
59
+
60
+ **Date**: 2024-01-15
61
+ **Authors**: @alice, @bob
62
+ **Status**: Draft | In Review | Final
63
+ **Incident Severity**: SEV2
64
+ **Incident Duration**: 47 minutes
65
+
66
+ ## Executive Summary
67
+
68
+ On January 15, 2024, the payment processing service experienced a 47-minute outage affecting approximately 12,000 customers. The root cause was a database connection pool exhaustion triggered by a configuration change in deployment v2.3.4. The incident was resolved by rolling back to v2.3.3 and increasing connection pool limits.
69
+
70
+ **Impact**:
71
+
72
+ - 12,000 customers unable to complete purchases
73
+ - Estimated revenue loss: $45,000
74
+ - 847 support tickets created
75
+ - No data loss or security implications
76
+
77
+ ## Timeline (All times UTC)
78
+
79
+ | Time | Event |
80
+ | ----- | ----------------------------------------------- |
81
+ | 14:23 | Deployment v2.3.4 completed to production |
82
+ | 14:31 | First alert: `payment_error_rate > 5%` |
83
+ | 14:33 | On-call engineer @alice acknowledges alert |
84
+ | 14:35 | Initial investigation begins, error rate at 23% |
85
+ | 14:41 | Incident declared SEV2, @bob joins |
86
+ | 14:45 | Database connection exhaustion identified |
87
+ | 14:52 | Decision to rollback deployment |
88
+ | 14:58 | Rollback to v2.3.3 initiated |
89
+ | 15:10 | Rollback complete, error rate dropping |
90
+ | 15:18 | Service fully recovered, incident resolved |
91
+
92
+ ## Root Cause Analysis
93
+
94
+ ### What Happened
95
+
96
+ The v2.3.4 deployment included a change to the database query pattern that inadvertently removed connection pooling for a frequently-called endpoint. Each request opened a new database connection instead of reusing pooled connections.
97
+
98
+ ### Why It Happened
99
+
100
+ 1. **Proximate Cause**: Code change in `PaymentRepository.java` replaced pooled `DataSource` with direct `DriverManager.getConnection()` calls.
101
+
102
+ 2. **Contributing Factors**:
103
+ - Code review did not catch the connection handling change
104
+ - No integration tests specifically for connection pool behavior
105
+ - Staging environment has lower traffic, masking the issue
106
+ - Database connection metrics alert threshold was too high (90%)
107
+
108
+ 3. **5 Whys Analysis**:
109
+ - Why did the service fail? → Database connections exhausted
110
+ - Why were connections exhausted? → Each request opened new connection
111
+ - Why did each request open new connection? → Code bypassed connection pool
112
+ - Why did code bypass connection pool? → Developer unfamiliar with codebase patterns
113
+ - Why was developer unfamiliar? → No documentation on connection management patterns
114
+
115
+ ### System Diagram
116
+ ~~~
117
+
118
+ [Client] → [Load Balancer] → [Payment Service] → [Database]
119
+ ↓
120
+ Connection Pool (broken)
121
+ ↓
122
+ Direct connections (cause)
123
+
124
+ ~~~
125
+
126
+ ## Detection
127
+
128
+ ### What Worked
129
+ - Error rate alert fired within 8 minutes of deployment
130
+ - Grafana dashboard clearly showed connection spike
131
+ - On-call response was swift (2 minute acknowledgment)
132
+
133
+ ### What Didn't Work
134
+ - Database connection metric alert threshold too high
135
+ - No deployment-correlated alerting
136
+ - Canary deployment would have caught this earlier
137
+
138
+ ### Detection Gap
139
+ The deployment completed at 14:23, but the first alert didn't fire until 14:31 (8 minutes). A deployment-aware alert could have detected the issue faster.
140
+
141
+ ## Response
142
+
143
+ ### What Worked
144
+ - On-call engineer quickly identified database as the issue
145
+ - Rollback decision was made decisively
146
+ - Clear communication in incident channel
147
+
148
+ ### What Could Be Improved
149
+ - Took 10 minutes to correlate issue with recent deployment
150
+ - Had to manually check deployment history
151
+ - Rollback took 12 minutes (could be faster)
152
+
153
+ ## Impact
154
+
155
+ ### Customer Impact
156
+ - 12,000 unique customers affected
157
+ - Average impact duration: 35 minutes
158
+ - 847 support tickets (~7% of affected users)
159
+ - Customer satisfaction score dropped 12 points
160
+
161
+ ### Business Impact
162
+ - Estimated revenue loss: $45,000
163
+ - Support cost: ~$2,500 (agent time)
164
+ - Engineering time: ~8 person-hours
165
+
166
+ ### Technical Impact
167
+ - Database primary experienced elevated load
168
+ - Some replica lag during incident
169
+ - No permanent damage to systems
170
+
171
+ ## Lessons Learned
172
+
173
+ ### What Went Well
174
+ 1. Alerting detected the issue before customer reports
175
+ 2. Team collaborated effectively under pressure
176
+ 3. Rollback procedure worked smoothly
177
+ 4. Communication was clear and timely
178
+
179
+ ### What Went Wrong
180
+ 1. Code review missed critical change
181
+ 2. Test coverage gap for connection pooling
182
+ 3. Staging environment doesn't reflect production traffic
183
+ 4. Alert thresholds were not tuned properly
184
+
185
+ ### Where We Got Lucky
186
+ 1. Incident occurred during business hours with full team available
187
+ 2. Database handled the load without failing completely
188
+ 3. No other incidents occurred simultaneously
189
+
190
+ ## Action Items
191
+
192
+ | Priority | Action | Owner | Due Date | Ticket |
193
+ |----------|--------|-------|----------|--------|
194
+ | P0 | Add integration test for connection pool behavior | @alice | 2024-01-22 | ENG-1234 |
195
+ | P0 | Lower database connection alert threshold to 70% | @bob | 2024-01-17 | OPS-567 |
196
+ | P1 | Document connection management patterns | @alice | 2024-01-29 | DOC-89 |
197
+ | P1 | Implement deployment-correlated alerting | @bob | 2024-02-05 | OPS-568 |
198
+ | P2 | Evaluate canary deployment strategy | @charlie | 2024-02-15 | ENG-1235 |
199
+ | P2 | Load test staging with production-like traffic | @dave | 2024-02-28 | QA-123 |
200
+
201
+ ## Appendix
202
+
203
+ ### Supporting Data
204
+
205
+ #### Error Rate Graph
206
+ [Link to Grafana dashboard snapshot]
207
+
208
+ #### Database Connection Graph
209
+ [Link to metrics]
210
+
211
+ ### Related Incidents
212
+ - 2023-11-02: Similar connection issue in User Service (POSTMORTEM-42)
213
+
214
+ ### References
215
+ - [Connection Pool Best Practices](internal-wiki/connection-pools)
216
+ - [Deployment Runbook](internal-wiki/deployment-runbook)
217
+ ```
218
+
219
+ ### Template 2: 5 Whys Analysis
220
+
221
+ ```markdown
222
+ # 5 Whys Analysis: [Incident]
223
+
224
+ ## Problem Statement
225
+
226
+ Payment service experienced 47-minute outage due to database connection exhaustion.
227
+
228
+ ## Analysis
229
+
230
+ ### Why #1: Why did the service fail?
231
+
232
+ **Answer**: Database connections were exhausted, causing all new requests to fail.
233
+
234
+ **Evidence**: Metrics showed connection count at 100/100 (max), with 500+ pending requests.
235
+
236
+ ---
237
+
238
+ ### Why #2: Why were database connections exhausted?
239
+
240
+ **Answer**: Each incoming request opened a new database connection instead of using the connection pool.
241
+
242
+ **Evidence**: Code diff shows direct `DriverManager.getConnection()` instead of pooled `DataSource`.
243
+
244
+ ---
245
+
246
+ ### Why #3: Why did the code bypass the connection pool?
247
+
248
+ **Answer**: A developer refactored the repository class and inadvertently changed the connection acquisition method.
249
+
250
+ **Evidence**: PR #1234 shows the change, made while fixing a different bug.
251
+
252
+ ---
253
+
254
+ ### Why #4: Why wasn't this caught in code review?
255
+
256
+ **Answer**: The reviewer focused on the functional change (the bug fix) and didn't notice the infrastructure change.
257
+
258
+ **Evidence**: Review comments only discuss business logic.
259
+
260
+ ---
261
+
262
+ ### Why #5: Why isn't there a safety net for this type of change?
263
+
264
+ **Answer**: We lack automated tests that verify connection pool behavior and lack documentation about our connection patterns.
265
+
266
+ **Evidence**: Test suite has no tests for connection handling; wiki has no article on database connections.
267
+
268
+ ## Root Causes Identified
269
+
270
+ 1. **Primary**: Missing automated tests for infrastructure behavior
271
+ 2. **Secondary**: Insufficient documentation of architectural patterns
272
+ 3. **Tertiary**: Code review checklist doesn't include infrastructure considerations
273
+
274
+ ## Systemic Improvements
275
+
276
+ | Root Cause | Improvement | Type |
277
+ | ------------- | --------------------------------- | ---------- |
278
+ | Missing tests | Add infrastructure behavior tests | Prevention |
279
+ | Missing docs | Document connection patterns | Prevention |
280
+ | Review gaps | Update review checklist | Detection |
281
+ | No canary | Implement canary deployments | Mitigation |
282
+ ```
283
+
284
+ ### Template 3: Quick Postmortem (Minor Incidents)
285
+
286
+ ```markdown
287
+ # Quick Postmortem: [Brief Title]
288
+
289
+ **Date**: 2024-01-15 | **Duration**: 12 min | **Severity**: SEV3
290
+
291
+ ## What Happened
292
+
293
+ API latency spiked to 5s due to cache miss storm after cache flush.
294
+
295
+ ## Timeline
296
+
297
+ - 10:00 - Cache flush initiated for config update
298
+ - 10:02 - Latency alerts fire
299
+ - 10:05 - Identified as cache miss storm
300
+ - 10:08 - Enabled cache warming
301
+ - 10:12 - Latency normalized
302
+
303
+ ## Root Cause
304
+
305
+ Full cache flush for minor config update caused thundering herd.
306
+
307
+ ## Fix
308
+
309
+ - Immediate: Enabled cache warming
310
+ - Long-term: Implement partial cache invalidation (ENG-999)
311
+
312
+ ## Lessons
313
+
314
+ Don't full-flush cache in production; use targeted invalidation.
315
+ ```
316
+
317
+ ## Facilitation Guide
318
+
319
+ ### Running a Postmortem Meeting
320
+
321
+ ```markdown
322
+ ## Meeting Structure (60 minutes)
323
+
324
+ ### 1. Opening (5 min)
325
+
326
+ - Remind everyone of blameless culture
327
+ - "We're here to learn, not to blame"
328
+ - Review meeting norms
329
+
330
+ ### 2. Timeline Review (15 min)
331
+
332
+ - Walk through events chronologically
333
+ - Ask clarifying questions
334
+ - Identify gaps in timeline
335
+
336
+ ### 3. Analysis Discussion (20 min)
337
+
338
+ - What failed?
339
+ - Why did it fail?
340
+ - What conditions allowed this?
341
+ - What would have prevented it?
342
+
343
+ ### 4. Action Items (15 min)
344
+
345
+ - Brainstorm improvements
346
+ - Prioritize by impact and effort
347
+ - Assign owners and due dates
348
+
349
+ ### 5. Closing (5 min)
350
+
351
+ - Summarize key learnings
352
+ - Confirm action item owners
353
+ - Schedule follow-up if needed
354
+
355
+ ## Facilitation Tips
356
+
357
+ - Keep discussion on track
358
+ - Redirect blame to systems
359
+ - Encourage quiet participants
360
+ - Document dissenting views
361
+ - Time-box tangents
362
+ ```
363
+
364
+ ## Anti-Patterns to Avoid
365
+
366
+ | Anti-Pattern | Problem | Better Approach |
367
+ | ----------------------- | -------------------------- | ------------------------------- |
368
+ | **Blame game** | Shuts down learning | Focus on systems |
369
+ | **Shallow analysis** | Doesn't prevent recurrence | Ask "why" 5 times |
370
+ | **No action items** | Waste of time | Always have concrete next steps |
371
+ | **Unrealistic actions** | Never completed | Scope to achievable tasks |
372
+ | **No follow-up** | Actions forgotten | Track in ticketing system |
373
+
374
+ ## Best Practices
375
+
376
+ ### Do's
377
+
378
+ - **Start immediately** - Memory fades fast
379
+ - **Be specific** - Exact times, exact errors
380
+ - **Include graphs** - Visual evidence
381
+ - **Assign owners** - No orphan action items
382
+ - **Share widely** - Organizational learning
383
+
384
+ ### Don'ts
385
+
386
+ - **Don't name and shame** - Ever
387
+ - **Don't skip small incidents** - They reveal patterns
388
+ - **Don't make it a blame doc** - That kills learning
389
+ - **Don't create busywork** - Actions should be meaningful
390
+ - **Don't skip follow-up** - Verify actions completed
391
+
392
+ ## Resources
393
+
394
+ - [Google SRE - Postmortem Culture](https://sre.google/sre-book/postmortem-culture/)
395
+ - [Etsy's Blameless Postmortems](https://codeascraft.com/2012/05/22/blameless-postmortems/)
396
+ - [PagerDuty Postmortem Guide](https://postmortems.pagerduty.com/)
@@ -0,0 +1,265 @@
1
+ ---
2
+ name: prometheus-configuration
3
+ description: Set up Prometheus for comprehensive metric collection, storage, and monitoring of infrastructure and applications. Use when implementing metrics collection, setting up monitoring infrastructure, or configuring alerting systems.
4
+ ---
5
+
6
+ # Prometheus Configuration
7
+
8
+ Complete guide to Prometheus setup, metric collection, scrape configuration, and recording rules.
9
+
10
+ ## Purpose
11
+
12
+ Configure Prometheus for comprehensive metric collection, alerting, and monitoring of infrastructure and applications.
13
+
14
+ ## When to Use
15
+
16
+ - Set up Prometheus monitoring
17
+ - Configure metric scraping
18
+ - Create recording rules
19
+ - Design alert rules
20
+ - Implement service discovery
21
+
22
+ ## Prometheus Architecture
23
+
24
+ ```
25
+ ┌──────────────┐
26
+ │ Applications │ ← Instrumented with client libraries
27
+ └──────┬───────┘
28
+ │ /metrics endpoint
29
+ ↓
30
+ ┌──────────────┐
31
+ │ Prometheus │ ← Scrapes metrics periodically
32
+ │ Server │
33
+ └──────┬───────┘
34
+ │
35
+ ├─→ AlertManager (alerts)
36
+ ├─→ Grafana (visualization)
37
+ └─→ Long-term storage (Thanos/Cortex)
38
+ ```
39
+
40
+ ## Installation
41
+
42
+ ### Kubernetes with Helm
43
+
44
+ ```bash
45
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
46
+ helm install prometheus prometheus-community/kube-prometheus-stack \
47
+ --namespace monitoring \
48
+ --create-namespace \
49
+ --set prometheus.prometheusSpec.retention=30d
50
+ ```
51
+
52
+ ### Docker Compose
53
+
54
+ ```yaml
55
+ version: "3.8"
56
+ services:
57
+ prometheus:
58
+ image: prom/prometheus:latest
59
+ ports:
60
+ - "9090:9090"
61
+ volumes:
62
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml
63
+ - prometheus-data:/prometheus
64
+ command:
65
+ - "--config.file=/etc/prometheus/prometheus.yml"
66
+ - "--storage.tsdb.path=/prometheus"
67
+ ```
68
+
69
+ ## Configuration File
70
+
71
+ ```yaml
72
+ global:
73
+ scrape_interval: 15s
74
+ evaluation_interval: 15s
75
+
76
+ alerting:
77
+ alertmanagers:
78
+ - static_configs:
79
+ - targets:
80
+ - alertmanager:9093
81
+
82
+ rule_files:
83
+ - /etc/prometheus/rules/*.yml
84
+
85
+ scrape_configs:
86
+ - job_name: "prometheus"
87
+ static_configs:
88
+ - targets: ["localhost:9090"]
89
+
90
+ - job_name: "node-exporter"
91
+ static_configs:
92
+ - targets:
93
+ - "node1:9100"
94
+ - "node2:9100"
95
+
96
+ - job_name: "kubernetes-pods"
97
+ kubernetes_sd_configs:
98
+ - role: pod
99
+ relabel_configs:
100
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
101
+ action: keep
102
+ regex: true
103
+ - source_labels: [__meta_kubernetes_namespace]
104
+ action: replace
105
+ target_label: namespace
106
+ ```
107
+
108
+ ## Scrape Configurations
109
+
110
+ ### Static Targets
111
+
112
+ ```yaml
113
+ scrape_configs:
114
+ - job_name: "static-targets"
115
+ static_configs:
116
+ - targets: ["host1:9100", "host2:9100"]
117
+ labels:
118
+ env: "production"
119
+ ```
120
+
121
+ ### File-based Service Discovery
122
+
123
+ ```yaml
124
+ scrape_configs:
125
+ - job_name: "file-sd"
126
+ file_sd_configs:
127
+ - files:
128
+ - /etc/prometheus/targets/*.json
129
+ refresh_interval: 5m
130
+ ```
131
+
132
+ **targets/production.json:**
133
+
134
+ ```json
135
+ [
136
+ {
137
+ "targets": ["app1:9090", "app2:9090"],
138
+ "labels": {
139
+ "env": "production"
140
+ }
141
+ }
142
+ ]
143
+ ```
144
+
145
+ ### Kubernetes Service Discovery
146
+
147
+ ```yaml
148
+ scrape_configs:
149
+ - job_name: "kubernetes-services"
150
+ kubernetes_sd_configs:
151
+ - role: service
152
+ relabel_configs:
153
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
154
+ action: keep
155
+ regex: true
156
+ ```
157
+
158
+ ## Recording Rules
159
+
160
+ Create pre-computed metrics for frequently queried expressions:
161
+
162
+ ```yaml
163
+ groups:
164
+ - name: api_metrics
165
+ interval: 15s
166
+ rules:
167
+ # HTTP request rate per service
168
+ - record: job:http_requests:rate5m
169
+ expr: sum by (job) (rate(http_requests_total[5m]))
170
+
171
+ # Error rate percentage
172
+ - record: job:http_requests_error_rate:percentage
173
+ expr: |
174
+ (sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) / job:http_requests:rate5m) * 100
175
+
176
+ # P95 latency
177
+ - record: job:http_request_duration:p95
178
+ expr: |
179
+ histogram_quantile(0.95,
180
+ sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))
181
+ )
182
+
183
+ - name: resource_metrics
184
+ rules:
185
+ # CPU utilization percentage
186
+ - record: instance:node_cpu:utilization
187
+ expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
188
+
189
+ # Memory utilization percentage
190
+ - record: instance:node_memory:utilization
191
+ expr: 100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)
192
+ ```
193
+
194
+ ## Alert Rules
195
+
196
+ ```yaml
197
+ groups:
198
+ - name: availability
199
+ rules:
200
+ - alert: ServiceDown
201
+ expr: up{job="my-app"} == 0
202
+ for: 1m
203
+ labels:
204
+ severity: critical
205
+ annotations:
206
+ summary: "Service {{ $labels.instance }} is down"
207
+ description: "{{ $labels.job }} has been down for more than 1 minute"
208
+
209
+ - alert: HighErrorRate
210
+ expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
211
+ for: 5m
212
+ labels:
213
+ severity: warning
214
+ annotations:
215
+ summary: "High error rate for {{ $labels.job }}"
216
+ description: "5xx error rate is {{ $value }} requests/s"
217
+
218
+ - alert: HighCPUUsage
219
+ expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
220
+ for: 5m
221
+ labels:
222
+ severity: warning
223
+ annotations:
224
+ summary: "High CPU usage on {{ $labels.instance }}"
225
+ ```
226
+
227
+ ## Validation
228
+
229
+ ```bash
230
+ # Validate configuration
231
+ promtool check config prometheus.yml
232
+
233
+ # Validate rules
234
+ promtool check rules /etc/prometheus/rules/*.yml
235
+ ```
236
+
237
+ ## Best Practices
238
+
239
+ 1. **Use consistent naming** for metrics (prefix_name_unit)
240
+ 2. **Set appropriate scrape intervals** (15-60s typical)
241
+ 3. **Use recording rules** for expensive queries
242
+ 4. **Implement high availability** (multiple Prometheus instances)
243
+ 5. **Configure retention** based on storage capacity
244
+ 6. **Use relabeling** for metric cleanup
245
+ 7. **Monitor Prometheus itself**
246
+ 8. **Use Thanos/Cortex** for long-term storage
247
+
248
+ ## Troubleshooting
249
+
250
+ **Check scrape targets:**
251
+
252
+ ```bash
253
+ curl http://localhost:9090/api/v1/targets
254
+ ```
255
+
256
+ **Test query:**
257
+
258
+ ```bash
259
+ curl 'http://localhost:9090/api/v1/query?query=up'
260
+ ```
261
+
262
+ ## Related Skills
263
+
264
+ - `observability` - For general observability
265
+ - `grafana-dashboards` - For visualization