specweave 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/INSTALL.md +848 -0
  2. package/LICENSE +21 -0
  3. package/README.md +675 -0
  4. package/SPECWEAVE.md +665 -0
  5. package/bin/install-agents.sh +57 -0
  6. package/bin/install-all.sh +49 -0
  7. package/bin/install-commands.sh +56 -0
  8. package/bin/install-skills.sh +57 -0
  9. package/bin/specweave.js +81 -0
  10. package/dist/adapters/adapter-base.d.ts +50 -0
  11. package/dist/adapters/adapter-base.d.ts.map +1 -0
  12. package/dist/adapters/adapter-base.js +146 -0
  13. package/dist/adapters/adapter-base.js.map +1 -0
  14. package/dist/adapters/adapter-interface.d.ts +108 -0
  15. package/dist/adapters/adapter-interface.d.ts.map +1 -0
  16. package/dist/adapters/adapter-interface.js +9 -0
  17. package/dist/adapters/adapter-interface.js.map +1 -0
  18. package/dist/adapters/claude/adapter.d.ts +54 -0
  19. package/dist/adapters/claude/adapter.d.ts.map +1 -0
  20. package/dist/adapters/claude/adapter.js +184 -0
  21. package/dist/adapters/claude/adapter.js.map +1 -0
  22. package/dist/adapters/copilot/adapter.d.ts +42 -0
  23. package/dist/adapters/copilot/adapter.d.ts.map +1 -0
  24. package/dist/adapters/copilot/adapter.js +239 -0
  25. package/dist/adapters/copilot/adapter.js.map +1 -0
  26. package/dist/adapters/cursor/adapter.d.ts +42 -0
  27. package/dist/adapters/cursor/adapter.d.ts.map +1 -0
  28. package/dist/adapters/cursor/adapter.js +297 -0
  29. package/dist/adapters/cursor/adapter.js.map +1 -0
  30. package/dist/adapters/generic/adapter.d.ts +40 -0
  31. package/dist/adapters/generic/adapter.d.ts.map +1 -0
  32. package/dist/adapters/generic/adapter.js +155 -0
  33. package/dist/adapters/generic/adapter.js.map +1 -0
  34. package/dist/cli/commands/init.d.ts +6 -0
  35. package/dist/cli/commands/init.d.ts.map +1 -0
  36. package/dist/cli/commands/init.js +247 -0
  37. package/dist/cli/commands/init.js.map +1 -0
  38. package/dist/cli/commands/install.d.ts +7 -0
  39. package/dist/cli/commands/install.d.ts.map +1 -0
  40. package/dist/cli/commands/install.js +160 -0
  41. package/dist/cli/commands/install.js.map +1 -0
  42. package/dist/cli/commands/list.d.ts +6 -0
  43. package/dist/cli/commands/list.d.ts.map +1 -0
  44. package/dist/cli/commands/list.js +154 -0
  45. package/dist/cli/commands/list.js.map +1 -0
  46. package/package.json +90 -0
  47. package/src/adapters/README.md +312 -0
  48. package/src/adapters/adapter-base.ts +146 -0
  49. package/src/adapters/adapter-interface.ts +120 -0
  50. package/src/adapters/claude/README.md +241 -0
  51. package/src/adapters/claude/adapter.ts +157 -0
  52. package/src/adapters/copilot/.github/copilot/instructions.md +376 -0
  53. package/src/adapters/copilot/README.md +200 -0
  54. package/src/adapters/copilot/adapter.ts +210 -0
  55. package/src/adapters/cursor/.cursor/context/docs-context.md +62 -0
  56. package/src/adapters/cursor/.cursor/context/increments-context.md +71 -0
  57. package/src/adapters/cursor/.cursor/context/strategy-context.md +73 -0
  58. package/src/adapters/cursor/.cursor/context/tests-context.md +89 -0
  59. package/src/adapters/cursor/.cursorrules +325 -0
  60. package/src/adapters/cursor/README.md +243 -0
  61. package/src/adapters/cursor/adapter.ts +268 -0
  62. package/src/adapters/generic/README.md +277 -0
  63. package/src/adapters/generic/SPECWEAVE-MANUAL.md +676 -0
  64. package/src/adapters/generic/adapter.ts +159 -0
  65. package/src/adapters/registry.yaml +126 -0
  66. package/src/agents/architect/AGENT.md +416 -0
  67. package/src/agents/devops/AGENT.md +1738 -0
  68. package/src/agents/docs-writer/AGENT.md +239 -0
  69. package/src/agents/performance/AGENT.md +228 -0
  70. package/src/agents/pm/AGENT.md +751 -0
  71. package/src/agents/qa-lead/AGENT.md +150 -0
  72. package/src/agents/security/AGENT.md +179 -0
  73. package/src/agents/sre/AGENT.md +582 -0
  74. package/src/agents/sre/modules/backend-diagnostics.md +481 -0
  75. package/src/agents/sre/modules/database-diagnostics.md +509 -0
  76. package/src/agents/sre/modules/infrastructure.md +561 -0
  77. package/src/agents/sre/modules/monitoring.md +439 -0
  78. package/src/agents/sre/modules/security-incidents.md +421 -0
  79. package/src/agents/sre/modules/ui-diagnostics.md +302 -0
  80. package/src/agents/sre/playbooks/01-high-cpu-usage.md +204 -0
  81. package/src/agents/sre/playbooks/02-database-deadlock.md +241 -0
  82. package/src/agents/sre/playbooks/03-memory-leak.md +252 -0
  83. package/src/agents/sre/playbooks/04-slow-api-response.md +269 -0
  84. package/src/agents/sre/playbooks/05-ddos-attack.md +293 -0
  85. package/src/agents/sre/playbooks/06-disk-full.md +314 -0
  86. package/src/agents/sre/playbooks/07-service-down.md +333 -0
  87. package/src/agents/sre/playbooks/08-data-corruption.md +337 -0
  88. package/src/agents/sre/playbooks/09-cascade-failure.md +430 -0
  89. package/src/agents/sre/playbooks/10-rate-limit-exceeded.md +464 -0
  90. package/src/agents/sre/scripts/health-check.sh +230 -0
  91. package/src/agents/sre/scripts/log-analyzer.py +213 -0
  92. package/src/agents/sre/scripts/metrics-collector.sh +294 -0
  93. package/src/agents/sre/scripts/trace-analyzer.js +257 -0
  94. package/src/agents/sre/templates/incident-report.md +249 -0
  95. package/src/agents/sre/templates/mitigation-plan.md +375 -0
  96. package/src/agents/sre/templates/post-mortem.md +418 -0
  97. package/src/agents/sre/templates/runbook-template.md +412 -0
  98. package/src/agents/tech-lead/AGENT.md +263 -0
  99. package/src/commands/add-tasks.md +176 -0
  100. package/src/commands/close-increment.md +347 -0
  101. package/src/commands/create-increment.md +223 -0
  102. package/src/commands/create-project.md +528 -0
  103. package/src/commands/generate-docs.md +623 -0
  104. package/src/commands/list-increments.md +180 -0
  105. package/src/commands/review-docs.md +331 -0
  106. package/src/commands/start-increment.md +139 -0
  107. package/src/commands/sync-github.md +115 -0
  108. package/src/commands/validate-increment.md +800 -0
  109. package/src/hooks/README.md +252 -0
  110. package/src/hooks/docs-changed.sh +59 -0
  111. package/src/hooks/human-input-required.sh +55 -0
  112. package/src/hooks/post-task-completion.sh +57 -0
  113. package/src/hooks/pre-implementation.sh +47 -0
  114. package/src/skills/ado-sync/README.md +449 -0
  115. package/src/skills/ado-sync/SKILL.md +245 -0
  116. package/src/skills/ado-sync/test-cases/test-1.yaml +9 -0
  117. package/src/skills/ado-sync/test-cases/test-2.yaml +8 -0
  118. package/src/skills/ado-sync/test-cases/test-3.yaml +9 -0
  119. package/src/skills/bmad-method-expert/SKILL.md +628 -0
  120. package/src/skills/bmad-method-expert/scripts/analyze-project.js +318 -0
  121. package/src/skills/bmad-method-expert/scripts/check-setup.js +208 -0
  122. package/src/skills/bmad-method-expert/scripts/generate-template.js +1149 -0
  123. package/src/skills/bmad-method-expert/scripts/validate-documents.js +340 -0
  124. package/src/skills/bmad-method-expert/test-cases/test-1-placeholder.yaml +12 -0
  125. package/src/skills/bmad-method-expert/test-cases/test-2-placeholder.yaml +12 -0
  126. package/src/skills/bmad-method-expert/test-cases/test-3-placeholder.yaml +12 -0
  127. package/src/skills/brownfield-analyzer/SKILL.md +523 -0
  128. package/src/skills/brownfield-analyzer/test-cases/test-1-basic-analysis.yaml +48 -0
  129. package/src/skills/brownfield-analyzer/test-cases/test-2-placeholder.yaml +12 -0
  130. package/src/skills/brownfield-analyzer/test-cases/test-3-placeholder.yaml +12 -0
  131. package/src/skills/brownfield-onboarder/SKILL.md +625 -0
  132. package/src/skills/brownfield-onboarder/test-cases/test-1-placeholder.yaml +12 -0
  133. package/src/skills/brownfield-onboarder/test-cases/test-2-placeholder.yaml +12 -0
  134. package/src/skills/brownfield-onboarder/test-cases/test-3-placeholder.yaml +12 -0
  135. package/src/skills/calendar-system/test-cases/test-1-placeholder.yaml +12 -0
  136. package/src/skills/calendar-system/test-cases/test-2-placeholder.yaml +12 -0
  137. package/src/skills/calendar-system/test-cases/test-3-placeholder.yaml +12 -0
  138. package/src/skills/context-loader/SKILL.md +734 -0
  139. package/src/skills/context-loader/test-cases/test-1-basic-loading.yaml +39 -0
  140. package/src/skills/context-loader/test-cases/test-2-token-budget-exceeded.yaml +44 -0
  141. package/src/skills/context-loader/test-cases/test-3-section-anchors.yaml +45 -0
  142. package/src/skills/context-optimizer/SKILL.md +618 -0
  143. package/src/skills/context-optimizer/test-cases/test-1-bug-fix-narrow.yaml +97 -0
  144. package/src/skills/context-optimizer/test-cases/test-2-feature-focused.yaml +109 -0
  145. package/src/skills/context-optimizer/test-cases/test-3-architecture-broad.yaml +98 -0
  146. package/src/skills/cost-optimizer/SKILL.md +190 -0
  147. package/src/skills/cost-optimizer/test-cases/test-1-basic-comparison.yaml +75 -0
  148. package/src/skills/cost-optimizer/test-cases/test-2-budget-constraint.yaml +52 -0
  149. package/src/skills/cost-optimizer/test-cases/test-3-scale-requirement.yaml +63 -0
  150. package/src/skills/cost-optimizer/test-results/README.md +46 -0
  151. package/src/skills/design-system-architect/SKILL.md +107 -0
  152. package/src/skills/design-system-architect/test-cases/test-1-token-structure.yaml +23 -0
  153. package/src/skills/design-system-architect/test-cases/test-2-component-hierarchy.yaml +24 -0
  154. package/src/skills/design-system-architect/test-cases/test-3-accessibility-checklist.yaml +23 -0
  155. package/src/skills/diagrams-architect/SKILL.md +763 -0
  156. package/src/skills/diagrams-generator/SKILL.md +25 -0
  157. package/src/skills/diagrams-generator/test-cases/test-1.yaml +9 -0
  158. package/src/skills/diagrams-generator/test-cases/test-2.yaml +9 -0
  159. package/src/skills/diagrams-generator/test-cases/test-3.yaml +8 -0
  160. package/src/skills/docs-updater/README.md +48 -0
  161. package/src/skills/docs-updater/test-cases/test-1-placeholder.yaml +12 -0
  162. package/src/skills/docs-updater/test-cases/test-2-placeholder.yaml +12 -0
  163. package/src/skills/docs-updater/test-cases/test-3-placeholder.yaml +12 -0
  164. package/src/skills/dotnet-backend/SKILL.md +250 -0
  165. package/src/skills/e2e-playwright/README.md +506 -0
  166. package/src/skills/e2e-playwright/SKILL.md +457 -0
  167. package/src/skills/e2e-playwright/execute.js +373 -0
  168. package/src/skills/e2e-playwright/lib/utils.js +514 -0
  169. package/src/skills/e2e-playwright/package.json +33 -0
  170. package/src/skills/e2e-playwright/test-cases/TC-001-basic-navigation.yaml +54 -0
  171. package/src/skills/e2e-playwright/test-cases/TC-002-form-interaction.yaml +64 -0
  172. package/src/skills/e2e-playwright/test-cases/TC-003-specweave-integration.yaml +74 -0
  173. package/src/skills/e2e-playwright/test-cases/TC-004-accessibility-check.yaml +98 -0
  174. package/src/skills/figma-designer/SKILL.md +149 -0
  175. package/src/skills/figma-implementer/SKILL.md +148 -0
  176. package/src/skills/figma-mcp-connector/SKILL.md +136 -0
  177. package/src/skills/figma-mcp-connector/test-cases/test-1-read-file-desktop.yaml +22 -0
  178. package/src/skills/figma-mcp-connector/test-cases/test-2-read-file-framelink.yaml +21 -0
  179. package/src/skills/figma-mcp-connector/test-cases/test-3-error-handling.yaml +18 -0
  180. package/src/skills/figma-to-code/SKILL.md +128 -0
  181. package/src/skills/figma-to-code/test-cases/test-1-token-generation.yaml +29 -0
  182. package/src/skills/figma-to-code/test-cases/test-2-component-generation.yaml +27 -0
  183. package/src/skills/figma-to-code/test-cases/test-3-typescript-generation.yaml +28 -0
  184. package/src/skills/frontend/SKILL.md +177 -0
  185. package/src/skills/github-sync/SKILL.md +252 -0
  186. package/src/skills/github-sync/test-cases/test-1-placeholder.yaml +12 -0
  187. package/src/skills/github-sync/test-cases/test-2-placeholder.yaml +12 -0
  188. package/src/skills/github-sync/test-cases/test-3-placeholder.yaml +12 -0
  189. package/src/skills/hetzner-provisioner/README.md +308 -0
  190. package/src/skills/hetzner-provisioner/SKILL.md +251 -0
  191. package/src/skills/hetzner-provisioner/test-cases/test-1-basic-provision.yaml +71 -0
  192. package/src/skills/hetzner-provisioner/test-cases/test-2-postgres-provision.yaml +85 -0
  193. package/src/skills/hetzner-provisioner/test-cases/test-3-ssl-config.yaml +126 -0
  194. package/src/skills/hetzner-provisioner/test-results/README.md +259 -0
  195. package/src/skills/increment-planner/SKILL.md +889 -0
  196. package/src/skills/increment-planner/scripts/feature-utils.js +250 -0
  197. package/src/skills/increment-planner/test-cases/test-1-basic-feature.yaml +27 -0
  198. package/src/skills/increment-planner/test-cases/test-2-complex-feature.yaml +30 -0
  199. package/src/skills/increment-planner/test-cases/test-3-auto-numbering.yaml +24 -0
  200. package/src/skills/increment-quality-judge/SKILL.md +566 -0
  201. package/src/skills/increment-quality-judge/test-cases/test-1-good-spec.yaml +95 -0
  202. package/src/skills/increment-quality-judge/test-cases/test-2-poor-spec.yaml +108 -0
  203. package/src/skills/increment-quality-judge/test-cases/test-3-export-suggestions.yaml +87 -0
  204. package/src/skills/jira-sync/README.md +328 -0
  205. package/src/skills/jira-sync/SKILL.md +209 -0
  206. package/src/skills/jira-sync/test-cases/test-1.yaml +9 -0
  207. package/src/skills/jira-sync/test-cases/test-2.yaml +9 -0
  208. package/src/skills/jira-sync/test-cases/test-3.yaml +10 -0
  209. package/src/skills/nextjs/SKILL.md +176 -0
  210. package/src/skills/nodejs-backend/SKILL.md +181 -0
  211. package/src/skills/notification-system/test-cases/test-1-placeholder.yaml +12 -0
  212. package/src/skills/notification-system/test-cases/test-2-placeholder.yaml +12 -0
  213. package/src/skills/notification-system/test-cases/test-3-placeholder.yaml +12 -0
  214. package/src/skills/python-backend/SKILL.md +226 -0
  215. package/src/skills/role-orchestrator/README.md +197 -0
  216. package/src/skills/role-orchestrator/SKILL.md +1184 -0
  217. package/src/skills/role-orchestrator/test-cases/test-1-simple-product.yaml +98 -0
  218. package/src/skills/role-orchestrator/test-cases/test-2-quality-gate-failure.yaml +73 -0
  219. package/src/skills/role-orchestrator/test-cases/test-3-security-workflow.yaml +121 -0
  220. package/src/skills/role-orchestrator/test-cases/test-4-parallel-execution.yaml +145 -0
  221. package/src/skills/role-orchestrator/test-cases/test-5-feedback-loops.yaml +149 -0
  222. package/src/skills/skill-creator/LICENSE.txt +202 -0
  223. package/src/skills/skill-creator/SKILL.md +209 -0
  224. package/src/skills/skill-creator/scripts/init_skill.py +303 -0
  225. package/src/skills/skill-creator/scripts/package_skill.py +110 -0
  226. package/src/skills/skill-creator/scripts/quick_validate.py +65 -0
  227. package/src/skills/skill-creator/test-cases/test-1-placeholder.yaml +12 -0
  228. package/src/skills/skill-creator/test-cases/test-2-placeholder.yaml +12 -0
  229. package/src/skills/skill-creator/test-cases/test-3-placeholder.yaml +12 -0
  230. package/src/skills/skill-router/SKILL.md +497 -0
  231. package/src/skills/skill-router/test-cases/test-1-basic-routing.yaml +33 -0
  232. package/src/skills/skill-router/test-cases/test-2-ambiguous-request.yaml +42 -0
  233. package/src/skills/skill-router/test-cases/test-3-nested-orchestration.yaml +50 -0
  234. package/src/skills/spec-driven-brainstorming/README.md +264 -0
  235. package/src/skills/spec-driven-brainstorming/SKILL.md +439 -0
  236. package/src/skills/spec-driven-brainstorming/test-cases/TC-001-simple-idea-to-design.yaml +148 -0
  237. package/src/skills/spec-driven-brainstorming/test-cases/TC-002-complex-ultrathink-design.yaml +190 -0
  238. package/src/skills/spec-driven-brainstorming/test-cases/TC-003-unclear-requirements-socratic.yaml +233 -0
  239. package/src/skills/spec-driven-debugging/README.md +479 -0
  240. package/src/skills/spec-driven-debugging/SKILL.md +652 -0
  241. package/src/skills/spec-driven-debugging/test-cases/TC-001-simple-auth-bug.yaml +212 -0
  242. package/src/skills/spec-driven-debugging/test-cases/TC-002-race-condition-ultrathink.yaml +461 -0
  243. package/src/skills/spec-driven-debugging/test-cases/TC-003-brownfield-missing-spec.yaml +366 -0
  244. package/src/skills/spec-kit-expert/SKILL.md +1012 -0
  245. package/src/skills/spec-kit-expert/test-cases/test-1-placeholder.yaml +12 -0
  246. package/src/skills/spec-kit-expert/test-cases/test-2-placeholder.yaml +12 -0
  247. package/src/skills/spec-kit-expert/test-cases/test-3-placeholder.yaml +12 -0
  248. package/src/skills/specweave-ado-mapper/SKILL.md +501 -0
  249. package/src/skills/specweave-detector/SKILL.md +420 -0
  250. package/src/skills/specweave-detector/test-cases/test-1-basic-detection.yaml +37 -0
  251. package/src/skills/specweave-detector/test-cases/test-2-missing-config.yaml +37 -0
  252. package/src/skills/specweave-detector/test-cases/test-3-non-specweave-project.yaml +34 -0
  253. package/src/skills/specweave-jira-mapper/SKILL.md +500 -0
  254. package/src/skills/stripe-integrator/test-cases/test-1-placeholder.yaml +12 -0
  255. package/src/skills/stripe-integrator/test-cases/test-2-placeholder.yaml +12 -0
  256. package/src/skills/stripe-integrator/test-cases/test-3-placeholder.yaml +12 -0
  257. package/src/skills/task-builder/README.md +90 -0
  258. package/src/skills/task-builder/test-cases/test-1-placeholder.yaml +12 -0
  259. package/src/skills/task-builder/test-cases/test-2-placeholder.yaml +12 -0
  260. package/src/skills/task-builder/test-cases/test-3-placeholder.yaml +12 -0
  261. package/src/templates/.env.example +144 -0
  262. package/src/templates/.gitignore.template +81 -0
  263. package/src/templates/CLAUDE.md.template +383 -0
  264. package/src/templates/README.md.template +240 -0
  265. package/src/templates/config.yaml +333 -0
  266. package/src/templates/docs/README.md +124 -0
  267. package/src/templates/docs/adr-template.md +118 -0
  268. package/src/templates/docs/hld-template.md +220 -0
  269. package/src/templates/docs/lld-template.md +580 -0
  270. package/src/templates/docs/prd-template.md +132 -0
  271. package/src/templates/docs/rfc-template.md +229 -0
  272. package/src/templates/docs/runbook-template.md +298 -0
  273. package/src/templates/environments/minimal/.env.production +16 -0
  274. package/src/templates/environments/minimal/README.md +54 -0
  275. package/src/templates/environments/minimal/deploy-production.yml +52 -0
  276. package/src/templates/environments/progressive/.env.qa +28 -0
  277. package/src/templates/environments/progressive/README.md +129 -0
  278. package/src/templates/environments/progressive/deploy-production.yml +93 -0
  279. package/src/templates/environments/progressive/deploy-qa.yml +62 -0
  280. package/src/templates/environments/progressive/deploy-staging.yml +67 -0
  281. package/src/templates/environments/standard/.env.development +20 -0
  282. package/src/templates/environments/standard/.env.production +30 -0
  283. package/src/templates/environments/standard/.env.staging +23 -0
  284. package/src/templates/environments/standard/README.md +97 -0
  285. package/src/templates/environments/standard/deploy-production.yml +68 -0
  286. package/src/templates/environments/standard/deploy-staging.yml +61 -0
  287. package/src/templates/environments/standard/docker-compose.yml +43 -0
  288. package/src/templates/increment-metadata-template.yaml +138 -0
@@ -0,0 +1,418 @@
1
+ # Post-Mortem: [Incident Title]
2
+
3
+ **Date of Incident**: YYYY-MM-DD
4
+ **Date of Post-Mortem**: YYYY-MM-DD
5
+ **Author**: [Name]
6
+ **Reviewers**: [Names]
7
+ **Severity**: SEV1 / SEV2 / SEV3
8
+
9
+ ---
10
+
11
+ ## Executive Summary
12
+
13
+ **What Happened**: [One-paragraph summary of incident]
14
+
15
+ **Impact**: [Brief impact summary - users, duration, business]
16
+
17
+ **Root Cause**: [Root cause in one sentence]
18
+
19
+ **Resolution**: [How it was fixed]
20
+
21
+ **Example**:
22
+ ```
23
+ What Happened: On October 26, 2025, the application became unavailable for 30 minutes due to database connection pool exhaustion.
24
+
25
+ Impact: All users were unable to access the application from 14:00-14:30 UTC. Approximately 10,000 users affected.
26
+
27
+ Root Cause: Payment service had a connection leak (connections not properly closed in error handling path), which exhausted the database connection pool during high traffic.
28
+
29
+ Resolution: The payment service was restarted to release connections (immediate fix), and the connection leak was fixed in code (permanent fix).
30
+ ```
31
+
32
+ ---
33
+
34
+ ## Incident Details
35
+
36
+ ### Timeline
37
+
38
+ | Time (UTC) | Event | Actor |
39
+ |------------|-------|-------|
40
+ | 14:00 | Alert: "Database Connection Pool Exhausted" | Monitoring |
41
+ | 14:02 | On-call engineer paged | PagerDuty |
42
+ | 14:02 | Jane acknowledged alert | SRE (Jane) |
43
+ | 14:05 | Confirmed database connections at max (100/100) | SRE (Jane) |
44
+ | 14:08 | Checked application logs for connection usage | SRE (Jane) |
45
+ | 14:10 | Identified connection leak in payment service | SRE (Jane) |
46
+ | 14:12 | Decision: Restart payment service to free connections | SRE (Jane) |
47
+ | 14:15 | Payment service restarted | SRE (Jane) |
48
+ | 14:17 | Database connections dropped to 20/100 | SRE (Jane) |
49
+ | 14:20 | Health checks passing, traffic restored | SRE (Jane) |
50
+ | 14:25 | Monitoring for stability | SRE (Jane) |
51
+ | 14:30 | Incident declared resolved | SRE (Jane) |
52
+ | 15:00 | Developer identified code fix | Dev (Mike) |
53
+ | 16:00 | Code fix deployed to production | Dev (Mike) |
54
+ | 16:30 | Verified no recurrence after 1 hour | SRE (Jane) |
55
+
56
+ **Total Duration**: 30 minutes (outage); 2.5 hours from first alert to verified full resolution
57
+
58
+ ---
59
+
60
+ ### Impact
61
+
62
+ **Users Affected**:
63
+ - **Scope**: All users (100%)
64
+ - **Count**: ~10,000 active users
65
+ - **Duration**: 30 minutes complete outage
66
+
67
+ **Services Affected**:
68
+ - ✅ Frontend (down - unable to reach backend)
69
+ - ✅ Backend API (degraded - connection pool exhausted)
70
+ - ✅ Database (saturated - all connections in use)
71
+ - ❌ Authentication (not affected - separate service)
72
+ - ❌ Payment processing (not affected - queued transactions)
73
+
74
+ **Business Impact**:
75
+ - **Revenue Lost**: $5,000 (estimated, based on 30 min downtime)
76
+ - **SLA Breach**: No (30 min < 43.2 min monthly budget for 99.9%)
77
+ - **Customer Complaints**: 47 support tickets, 12 social media mentions
78
+ - **Reputation**: Minor (quickly resolved, transparent communication)
79
+
80
+ ---
81
+
82
+ ## Root Cause Analysis
83
+
84
+ ### The Five Whys
85
+
86
+ **1. Why did the application become unavailable?**
87
+ → Database connection pool was exhausted (100/100 connections in use)
88
+
89
+ **2. Why was the connection pool exhausted?**
90
+ → Payment service had a connection leak (connections not being released)
91
+
92
+ **3. Why were connections not being released?**
93
+ → Error handling path in payment service missing `conn.close()` in `finally` block
94
+
95
+ **4. Why was the error path missing `conn.close()`?**
96
+ → Developer oversight when writing the error-handling path, which was not caught in code review
97
+
98
+ **5. Why didn't code review catch this?**
99
+ → No automated test or linter to check connection cleanup
100
+
101
+ **Root Cause**: Connection leak in payment service error handling path, compounded by lack of automated testing for connection cleanup.
102
+
103
+ ---
104
+
105
+ ### Contributing Factors
106
+
107
+ **Technical Factors**:
108
+ 1. Connection pool size too small (100 connections) for Black Friday traffic
109
+ 2. No connection timeout configured (connections held indefinitely)
110
+ 3. No early-warning alert for connection pool usage (only alerted at 100%)
111
+ 4. No circuit breaker to prevent cascade failures
112
+
113
+ **Process Factors**:
114
+ 1. Code review missed connection leak
115
+ 2. No automated test for connection cleanup
116
+ 3. No load testing before high-traffic event (Black Friday)
117
+ 4. No runbook for connection pool exhaustion
118
+
119
+ **Human Factors**:
120
+ 1. Developer unfamiliar with connection pool best practices
121
+ 2. Time pressure during feature development (rushed code review)
122
+
123
+ ---
124
+
125
+ ## Detection and Response
126
+
127
+ ### Detection
128
+
129
+ **How Detected**: Automated monitoring alert
130
+
131
+ **Alert**: "Database Connection Pool Exhausted"
132
+ - **Trigger**: `(SELECT count(*) FROM pg_stat_activity) >= 100`
133
+ - **Alert latency**: <1 minute (excellent)
134
+ - **False positive rate**: 0% (first time this alert fired)
135
+
136
+ **Detection Quality**:
137
+ - ✅ **Good**: Alert fired quickly (<1 min after issue started)
138
+ - ❌ **To Improve**: No early warning (should alert at 80%, not 100%)
139
+
140
+ ---
141
+
142
+ ### Response
143
+
144
+ **Response Timeline**:
145
+ - **Time to acknowledge**: 2 minutes (target: <5 min) ✅
146
+ - **Time to triage**: 5 minutes (target: <10 min) ✅
147
+ - **Time to identify root cause**: 10 minutes (target: <30 min) ✅
148
+ - **Time to mitigate**: 15 minutes (target: <30 min) ✅
149
+ - **Time to resolve**: 30 minutes (target: <60 min) ✅
150
+
151
+ **What Worked Well**:
152
+ - ✅ Monitoring detected issue immediately
153
+ - ✅ Clear escalation path (on-call responded in 2 min)
154
+ - ✅ Good communication (updates every 10 min)
155
+ - ✅ Quick diagnosis (root cause found in 10 min)
156
+
157
+ **What Could Be Improved**:
158
+ - ❌ No runbook for this scenario (had to figure out on the spot)
159
+ - ❌ No early warning alert (only alerted when 100% full)
160
+ - ❌ Connection pool too small (should have been sized for traffic)
161
+
162
+ ---
163
+
164
+ ## Resolution
165
+
166
+ ### Short-term Fix
167
+
168
+ **Immediate** (Restore service):
169
+ 1. Restarted payment service to release connections
170
+ - `systemctl restart payment-service`
171
+ - Impact: Service restored in 2 minutes
172
+
173
+ 2. Monitored connection pool for 30 minutes
174
+ - Verified connections stayed <50%
175
+ - No recurrence
176
+
177
+ **Short-term** (Prevent immediate recurrence):
178
+ 1. Fixed connection leak in payment service code
179
+ - Added `finally` block with `conn.close()`
180
+ - Deployed hotfix at 16:00 UTC
181
+ - Verified no leak with load test
182
+
183
+ 2. Increased connection pool size
184
+ - Changed `max_connections` from 100 to 200
185
+ - Provides headroom for traffic spikes
186
+
187
+ 3. Added connection pool monitoring alert
188
+ - Alert at 80% usage (early warning)
189
+ - Prevents exhaustion
190
+
191
+ ---
192
+
193
+ ### Long-term Prevention
194
+
195
+ **Action Items** (with owners and deadlines):
196
+
197
+ | # | Action | Priority | Owner | Due Date | Status |
198
+ |---|--------|----------|-------|----------|--------|
199
+ | 1 | Add automated test for connection cleanup | P1 | Lisa (QA) | 2025-10-27 | ✅ Done |
200
+ | 2 | Add linter rule to check connection cleanup | P1 | Mike (Dev) | 2025-10-27 | ✅ Done |
201
+ | 3 | Add connection timeout (30s) | P2 | Tom (DBA) | 2025-10-28 | ⏳ In Progress |
202
+ | 4 | Review all DB queries for connection leaks | P2 | Mike (Dev) | 2025-11-02 | 📅 Planned |
203
+ | 5 | Load test before high-traffic events | P3 | John (DevOps) | 2025-11-10 | 📅 Planned |
204
+ | 6 | Create runbook: Connection Pool Issues | P3 | Jane (SRE) | 2025-10-28 | ✅ Done |
205
+ | 7 | Add circuit breaker to prevent cascades | P3 | Mike (Dev) | 2025-11-15 | 📅 Planned |
206
+
207
+ ---
208
+
209
+ ## Lessons Learned
210
+
211
+ ### What Went Well
212
+
213
+ 1. **Monitoring was effective**
214
+ - Alert fired within 1 minute of issue
215
+ - Clear symptoms (connection pool full)
216
+
217
+ 2. **Response was fast**
218
+ - On-call responded in 2 minutes
219
+ - Root cause identified in 10 minutes
220
+ - Service restored in 15 minutes
221
+
222
+ 3. **Communication was clear**
223
+ - Updates every 10 minutes
224
+ - Status page updated promptly
225
+ - Customer support informed
226
+
227
+ 4. **Team collaboration**
228
+ - SRE diagnosed, Developer fixed, DBA scaled
229
+ - Clear roles and responsibilities
230
+
231
+ ---
232
+
233
+ ### What Went Wrong
234
+
235
+ 1. **Connection leak in production**
236
+ - Code review missed the leak
237
+ - No automated test or linter
238
+ - Developer unfamiliar with best practices
239
+
240
+ 2. **No early warning**
241
+ - Alert only fired at 100% (too late)
242
+ - Should alert at 80% for early action
243
+
244
+ 3. **Capacity planning gap**
245
+ - Connection pool too small for Black Friday
246
+ - No load testing before high-traffic event
247
+
248
+ 4. **No runbook**
249
+ - Had to figure out diagnosis on the fly
250
+ - Runbook would have saved 5-10 minutes
251
+
252
+ 5. **No circuit breaker**
253
+ - Could have prevented full outage
254
+ - Should fail gracefully, not cascade
255
+
256
+ ---
257
+
258
+ ### Preventable?
259
+
260
+ **YES** - This incident was preventable.
261
+
262
+ **How it could have been prevented**:
263
+ 1. ✅ Automated test for connection cleanup → Would have caught leak
264
+ 2. ✅ Linter rule for connection cleanup → Would have caught in CI
265
+ 3. ✅ Load testing before Black Friday → Would have found pool too small
266
+ 4. ✅ Connection pool monitoring at 80% → Would have given early warning
267
+ 5. ✅ Code review focus on error paths → Would have caught missing `finally`
268
+
269
+ ---
270
+
271
+ ## Prevention Strategies
272
+
273
+ ### Technical Improvements
274
+
275
+ 1. **Automated Testing**
276
+ - ✅ Add integration test for connection cleanup
277
+ - ✅ Add linter rule: `require-connection-cleanup`
278
+ - ✅ Test error paths (not just happy path)
279
+
280
+ 2. **Monitoring & Alerting**
281
+ - ✅ Alert at 80% connection pool usage (early warning)
282
+ - ✅ Alert on increasing connection count (detect leaks early)
283
+ - ✅ Dashboard for connection pool metrics
284
+
285
+ 3. **Capacity Planning**
286
+ - ⏳ Load test before high-traffic events
287
+ - ✅ Review connection pool size quarterly
288
+ - ⏳ Auto-scaling for application (not just database)
289
+
290
+ 4. **Resilience Patterns**
291
+ - ⏳ Circuit breaker (prevent cascade failures)
292
+ - ⏳ Connection timeout (30s)
293
+ - ⏳ Graceful degradation (fallback data)
294
+
295
+ ---
296
+
297
+ ### Process Improvements
298
+
299
+ 1. **Code Review**
300
+ - ✅ Checklist: Connection cleanup in error paths
301
+ - ✅ Required reviewer: Someone familiar with DB best practices
302
+ - ✅ Automated checks (linter, tests)
303
+
304
+ 2. **Runbooks**
305
+ - ✅ Create runbook: Connection Pool Exhaustion
306
+ - ⏳ Create runbook: Database Performance Issues
307
+ - ⏳ Quarterly runbook review/update
308
+
309
+ 3. **Training**
310
+ - ⏳ Database best practices training for developers
311
+ - ⏳ Connection pool management workshop
312
+ - ⏳ Incident response training
313
+
314
+ 4. **Capacity Planning**
315
+ - ⏳ Load test before high-traffic events (Black Friday, launch days)
316
+ - ⏳ Quarterly capacity review
317
+ - ⏳ Traffic forecasting for events
318
+
319
+ ---
320
+
321
+ ### Cultural Improvements
322
+
323
+ 1. **Blameless Culture**
324
+ - This post-mortem focuses on systems, not individuals
325
+ - Goal: Learn and improve, not blame
326
+
327
+ 2. **Psychological Safety**
328
+ - Encourage raising concerns (e.g., "I'm not sure about error handling")
329
+ - No punishment for mistakes
330
+
331
+ 3. **Continuous Learning**
332
+ - Share post-mortems org-wide
333
+ - Regular incident review meetings
334
+ - Learn from other teams' incidents
335
+
336
+ ---
337
+
338
+ ## Recommendations
339
+
340
+ ### Immediate (This Week)
341
+
342
+ - [x] Fix connection leak in code (DONE)
343
+ - [x] Add connection pool monitoring at 80% (DONE)
344
+ - [x] Create runbook for connection pool issues (DONE)
345
+ - [x] Add automated test for connection cleanup (DONE)
346
+ - [x] Add linter rule for connection cleanup (DONE)
347
+
348
+ ### Short-term (This Month)
349
+
350
+ - [ ] Add connection timeout configuration
351
+ - [ ] Review all database queries for leaks
352
+ - [ ] Load test with 10x traffic
353
+ - [ ] Database best practices training
354
+
355
+ ### Long-term (This Quarter)
356
+
357
+ - [ ] Implement circuit breakers
358
+ - [ ] Quarterly capacity planning process
359
+ - [ ] Add auto-scaling for application tier
360
+ - [ ] Regular runbook review/update process
361
+
362
+ ---
363
+
364
+ ## Supporting Information
365
+
366
+ ### Related Incidents
367
+
368
+ - **2025-09-15**: Database connection pool exhausted (similar issue)
369
+ - Same root cause (connection leak)
370
+ - Lessons from that incident should have prevented this one
371
+
372
+ - **2025-08-10**: Payment service OOM crash
373
+ - Memory leak, different symptom
374
+
375
+ ### Related Documentation
376
+
377
+ - [Database Architecture](https://wiki.example.com/db-arch)
378
+ - [Connection Pool Best Practices](https://wiki.example.com/db-pool)
379
+ - [Incident Response Process](https://wiki.example.com/incident-response)
380
+
381
+ ### Metrics
382
+
383
+ **Availability**:
384
+ - Monthly uptime target: 99.9% (43.2 min downtime allowed)
385
+ - This month actual: 99.93% (30 min downtime)
386
+ - Status: ✅ Within SLA
387
+
388
+ **MTTR** (Mean Time To Resolution):
389
+ - This incident: 30 minutes
390
+ - Team average: 45 minutes
391
+ - Status: ✅ Better than average
392
+
393
+ ---
394
+
395
+ ## Acknowledgments
396
+
397
+ **Thanks to**:
398
+ - Jane (SRE) - Quick diagnosis and mitigation
399
+ - Mike (Developer) - Fast code fix
400
+ - Tom (DBA) - Connection pool scaling
401
+ - Customer Support team - Handling user complaints
402
+
403
+ ---
404
+
405
+ ## Sign-off
406
+
407
+ This post-mortem has been reviewed and approved:
408
+
409
+ - [x] Author: Jane (SRE) - YYYY-MM-DD
410
+ - [x] Engineering Lead: Mike - YYYY-MM-DD
411
+ - [x] Manager: Sarah - YYYY-MM-DD
412
+ - [x] Action items tracked in: [JIRA-1234](link)
413
+
414
+ **Next Review**: [Date] - Check action item progress
415
+
416
+ ---
417
+
418
+ **Remember**: Incidents are learning opportunities. The goal is not to find fault, but to improve our systems and processes.