specweave 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/INSTALL.md +848 -0
  2. package/LICENSE +21 -0
  3. package/README.md +675 -0
  4. package/SPECWEAVE.md +665 -0
  5. package/bin/install-agents.sh +57 -0
  6. package/bin/install-all.sh +49 -0
  7. package/bin/install-commands.sh +56 -0
  8. package/bin/install-skills.sh +57 -0
  9. package/bin/specweave.js +81 -0
  10. package/dist/adapters/adapter-base.d.ts +50 -0
  11. package/dist/adapters/adapter-base.d.ts.map +1 -0
  12. package/dist/adapters/adapter-base.js +146 -0
  13. package/dist/adapters/adapter-base.js.map +1 -0
  14. package/dist/adapters/adapter-interface.d.ts +108 -0
  15. package/dist/adapters/adapter-interface.d.ts.map +1 -0
  16. package/dist/adapters/adapter-interface.js +9 -0
  17. package/dist/adapters/adapter-interface.js.map +1 -0
  18. package/dist/adapters/claude/adapter.d.ts +54 -0
  19. package/dist/adapters/claude/adapter.d.ts.map +1 -0
  20. package/dist/adapters/claude/adapter.js +184 -0
  21. package/dist/adapters/claude/adapter.js.map +1 -0
  22. package/dist/adapters/copilot/adapter.d.ts +42 -0
  23. package/dist/adapters/copilot/adapter.d.ts.map +1 -0
  24. package/dist/adapters/copilot/adapter.js +239 -0
  25. package/dist/adapters/copilot/adapter.js.map +1 -0
  26. package/dist/adapters/cursor/adapter.d.ts +42 -0
  27. package/dist/adapters/cursor/adapter.d.ts.map +1 -0
  28. package/dist/adapters/cursor/adapter.js +297 -0
  29. package/dist/adapters/cursor/adapter.js.map +1 -0
  30. package/dist/adapters/generic/adapter.d.ts +40 -0
  31. package/dist/adapters/generic/adapter.d.ts.map +1 -0
  32. package/dist/adapters/generic/adapter.js +155 -0
  33. package/dist/adapters/generic/adapter.js.map +1 -0
  34. package/dist/cli/commands/init.d.ts +6 -0
  35. package/dist/cli/commands/init.d.ts.map +1 -0
  36. package/dist/cli/commands/init.js +247 -0
  37. package/dist/cli/commands/init.js.map +1 -0
  38. package/dist/cli/commands/install.d.ts +7 -0
  39. package/dist/cli/commands/install.d.ts.map +1 -0
  40. package/dist/cli/commands/install.js +160 -0
  41. package/dist/cli/commands/install.js.map +1 -0
  42. package/dist/cli/commands/list.d.ts +6 -0
  43. package/dist/cli/commands/list.d.ts.map +1 -0
  44. package/dist/cli/commands/list.js +154 -0
  45. package/dist/cli/commands/list.js.map +1 -0
  46. package/package.json +90 -0
  47. package/src/adapters/README.md +312 -0
  48. package/src/adapters/adapter-base.ts +146 -0
  49. package/src/adapters/adapter-interface.ts +120 -0
  50. package/src/adapters/claude/README.md +241 -0
  51. package/src/adapters/claude/adapter.ts +157 -0
  52. package/src/adapters/copilot/.github/copilot/instructions.md +376 -0
  53. package/src/adapters/copilot/README.md +200 -0
  54. package/src/adapters/copilot/adapter.ts +210 -0
  55. package/src/adapters/cursor/.cursor/context/docs-context.md +62 -0
  56. package/src/adapters/cursor/.cursor/context/increments-context.md +71 -0
  57. package/src/adapters/cursor/.cursor/context/strategy-context.md +73 -0
  58. package/src/adapters/cursor/.cursor/context/tests-context.md +89 -0
  59. package/src/adapters/cursor/.cursorrules +325 -0
  60. package/src/adapters/cursor/README.md +243 -0
  61. package/src/adapters/cursor/adapter.ts +268 -0
  62. package/src/adapters/generic/README.md +277 -0
  63. package/src/adapters/generic/SPECWEAVE-MANUAL.md +676 -0
  64. package/src/adapters/generic/adapter.ts +159 -0
  65. package/src/adapters/registry.yaml +126 -0
  66. package/src/agents/architect/AGENT.md +416 -0
  67. package/src/agents/devops/AGENT.md +1738 -0
  68. package/src/agents/docs-writer/AGENT.md +239 -0
  69. package/src/agents/performance/AGENT.md +228 -0
  70. package/src/agents/pm/AGENT.md +751 -0
  71. package/src/agents/qa-lead/AGENT.md +150 -0
  72. package/src/agents/security/AGENT.md +179 -0
  73. package/src/agents/sre/AGENT.md +582 -0
  74. package/src/agents/sre/modules/backend-diagnostics.md +481 -0
  75. package/src/agents/sre/modules/database-diagnostics.md +509 -0
  76. package/src/agents/sre/modules/infrastructure.md +561 -0
  77. package/src/agents/sre/modules/monitoring.md +439 -0
  78. package/src/agents/sre/modules/security-incidents.md +421 -0
  79. package/src/agents/sre/modules/ui-diagnostics.md +302 -0
  80. package/src/agents/sre/playbooks/01-high-cpu-usage.md +204 -0
  81. package/src/agents/sre/playbooks/02-database-deadlock.md +241 -0
  82. package/src/agents/sre/playbooks/03-memory-leak.md +252 -0
  83. package/src/agents/sre/playbooks/04-slow-api-response.md +269 -0
  84. package/src/agents/sre/playbooks/05-ddos-attack.md +293 -0
  85. package/src/agents/sre/playbooks/06-disk-full.md +314 -0
  86. package/src/agents/sre/playbooks/07-service-down.md +333 -0
  87. package/src/agents/sre/playbooks/08-data-corruption.md +337 -0
  88. package/src/agents/sre/playbooks/09-cascade-failure.md +430 -0
  89. package/src/agents/sre/playbooks/10-rate-limit-exceeded.md +464 -0
  90. package/src/agents/sre/scripts/health-check.sh +230 -0
  91. package/src/agents/sre/scripts/log-analyzer.py +213 -0
  92. package/src/agents/sre/scripts/metrics-collector.sh +294 -0
  93. package/src/agents/sre/scripts/trace-analyzer.js +257 -0
  94. package/src/agents/sre/templates/incident-report.md +249 -0
  95. package/src/agents/sre/templates/mitigation-plan.md +375 -0
  96. package/src/agents/sre/templates/post-mortem.md +418 -0
  97. package/src/agents/sre/templates/runbook-template.md +412 -0
  98. package/src/agents/tech-lead/AGENT.md +263 -0
  99. package/src/commands/add-tasks.md +176 -0
  100. package/src/commands/close-increment.md +347 -0
  101. package/src/commands/create-increment.md +223 -0
  102. package/src/commands/create-project.md +528 -0
  103. package/src/commands/generate-docs.md +623 -0
  104. package/src/commands/list-increments.md +180 -0
  105. package/src/commands/review-docs.md +331 -0
  106. package/src/commands/start-increment.md +139 -0
  107. package/src/commands/sync-github.md +115 -0
  108. package/src/commands/validate-increment.md +800 -0
  109. package/src/hooks/README.md +252 -0
  110. package/src/hooks/docs-changed.sh +59 -0
  111. package/src/hooks/human-input-required.sh +55 -0
  112. package/src/hooks/post-task-completion.sh +57 -0
  113. package/src/hooks/pre-implementation.sh +47 -0
  114. package/src/skills/ado-sync/README.md +449 -0
  115. package/src/skills/ado-sync/SKILL.md +245 -0
  116. package/src/skills/ado-sync/test-cases/test-1.yaml +9 -0
  117. package/src/skills/ado-sync/test-cases/test-2.yaml +8 -0
  118. package/src/skills/ado-sync/test-cases/test-3.yaml +9 -0
  119. package/src/skills/bmad-method-expert/SKILL.md +628 -0
  120. package/src/skills/bmad-method-expert/scripts/analyze-project.js +318 -0
  121. package/src/skills/bmad-method-expert/scripts/check-setup.js +208 -0
  122. package/src/skills/bmad-method-expert/scripts/generate-template.js +1149 -0
  123. package/src/skills/bmad-method-expert/scripts/validate-documents.js +340 -0
  124. package/src/skills/bmad-method-expert/test-cases/test-1-placeholder.yaml +12 -0
  125. package/src/skills/bmad-method-expert/test-cases/test-2-placeholder.yaml +12 -0
  126. package/src/skills/bmad-method-expert/test-cases/test-3-placeholder.yaml +12 -0
  127. package/src/skills/brownfield-analyzer/SKILL.md +523 -0
  128. package/src/skills/brownfield-analyzer/test-cases/test-1-basic-analysis.yaml +48 -0
  129. package/src/skills/brownfield-analyzer/test-cases/test-2-placeholder.yaml +12 -0
  130. package/src/skills/brownfield-analyzer/test-cases/test-3-placeholder.yaml +12 -0
  131. package/src/skills/brownfield-onboarder/SKILL.md +625 -0
  132. package/src/skills/brownfield-onboarder/test-cases/test-1-placeholder.yaml +12 -0
  133. package/src/skills/brownfield-onboarder/test-cases/test-2-placeholder.yaml +12 -0
  134. package/src/skills/brownfield-onboarder/test-cases/test-3-placeholder.yaml +12 -0
  135. package/src/skills/calendar-system/test-cases/test-1-placeholder.yaml +12 -0
  136. package/src/skills/calendar-system/test-cases/test-2-placeholder.yaml +12 -0
  137. package/src/skills/calendar-system/test-cases/test-3-placeholder.yaml +12 -0
  138. package/src/skills/context-loader/SKILL.md +734 -0
  139. package/src/skills/context-loader/test-cases/test-1-basic-loading.yaml +39 -0
  140. package/src/skills/context-loader/test-cases/test-2-token-budget-exceeded.yaml +44 -0
  141. package/src/skills/context-loader/test-cases/test-3-section-anchors.yaml +45 -0
  142. package/src/skills/context-optimizer/SKILL.md +618 -0
  143. package/src/skills/context-optimizer/test-cases/test-1-bug-fix-narrow.yaml +97 -0
  144. package/src/skills/context-optimizer/test-cases/test-2-feature-focused.yaml +109 -0
  145. package/src/skills/context-optimizer/test-cases/test-3-architecture-broad.yaml +98 -0
  146. package/src/skills/cost-optimizer/SKILL.md +190 -0
  147. package/src/skills/cost-optimizer/test-cases/test-1-basic-comparison.yaml +75 -0
  148. package/src/skills/cost-optimizer/test-cases/test-2-budget-constraint.yaml +52 -0
  149. package/src/skills/cost-optimizer/test-cases/test-3-scale-requirement.yaml +63 -0
  150. package/src/skills/cost-optimizer/test-results/README.md +46 -0
  151. package/src/skills/design-system-architect/SKILL.md +107 -0
  152. package/src/skills/design-system-architect/test-cases/test-1-token-structure.yaml +23 -0
  153. package/src/skills/design-system-architect/test-cases/test-2-component-hierarchy.yaml +24 -0
  154. package/src/skills/design-system-architect/test-cases/test-3-accessibility-checklist.yaml +23 -0
  155. package/src/skills/diagrams-architect/SKILL.md +763 -0
  156. package/src/skills/diagrams-generator/SKILL.md +25 -0
  157. package/src/skills/diagrams-generator/test-cases/test-1.yaml +9 -0
  158. package/src/skills/diagrams-generator/test-cases/test-2.yaml +9 -0
  159. package/src/skills/diagrams-generator/test-cases/test-3.yaml +8 -0
  160. package/src/skills/docs-updater/README.md +48 -0
  161. package/src/skills/docs-updater/test-cases/test-1-placeholder.yaml +12 -0
  162. package/src/skills/docs-updater/test-cases/test-2-placeholder.yaml +12 -0
  163. package/src/skills/docs-updater/test-cases/test-3-placeholder.yaml +12 -0
  164. package/src/skills/dotnet-backend/SKILL.md +250 -0
  165. package/src/skills/e2e-playwright/README.md +506 -0
  166. package/src/skills/e2e-playwright/SKILL.md +457 -0
  167. package/src/skills/e2e-playwright/execute.js +373 -0
  168. package/src/skills/e2e-playwright/lib/utils.js +514 -0
  169. package/src/skills/e2e-playwright/package.json +33 -0
  170. package/src/skills/e2e-playwright/test-cases/TC-001-basic-navigation.yaml +54 -0
  171. package/src/skills/e2e-playwright/test-cases/TC-002-form-interaction.yaml +64 -0
  172. package/src/skills/e2e-playwright/test-cases/TC-003-specweave-integration.yaml +74 -0
  173. package/src/skills/e2e-playwright/test-cases/TC-004-accessibility-check.yaml +98 -0
  174. package/src/skills/figma-designer/SKILL.md +149 -0
  175. package/src/skills/figma-implementer/SKILL.md +148 -0
  176. package/src/skills/figma-mcp-connector/SKILL.md +136 -0
  177. package/src/skills/figma-mcp-connector/test-cases/test-1-read-file-desktop.yaml +22 -0
  178. package/src/skills/figma-mcp-connector/test-cases/test-2-read-file-framelink.yaml +21 -0
  179. package/src/skills/figma-mcp-connector/test-cases/test-3-error-handling.yaml +18 -0
  180. package/src/skills/figma-to-code/SKILL.md +128 -0
  181. package/src/skills/figma-to-code/test-cases/test-1-token-generation.yaml +29 -0
  182. package/src/skills/figma-to-code/test-cases/test-2-component-generation.yaml +27 -0
  183. package/src/skills/figma-to-code/test-cases/test-3-typescript-generation.yaml +28 -0
  184. package/src/skills/frontend/SKILL.md +177 -0
  185. package/src/skills/github-sync/SKILL.md +252 -0
  186. package/src/skills/github-sync/test-cases/test-1-placeholder.yaml +12 -0
  187. package/src/skills/github-sync/test-cases/test-2-placeholder.yaml +12 -0
  188. package/src/skills/github-sync/test-cases/test-3-placeholder.yaml +12 -0
  189. package/src/skills/hetzner-provisioner/README.md +308 -0
  190. package/src/skills/hetzner-provisioner/SKILL.md +251 -0
  191. package/src/skills/hetzner-provisioner/test-cases/test-1-basic-provision.yaml +71 -0
  192. package/src/skills/hetzner-provisioner/test-cases/test-2-postgres-provision.yaml +85 -0
  193. package/src/skills/hetzner-provisioner/test-cases/test-3-ssl-config.yaml +126 -0
  194. package/src/skills/hetzner-provisioner/test-results/README.md +259 -0
  195. package/src/skills/increment-planner/SKILL.md +889 -0
  196. package/src/skills/increment-planner/scripts/feature-utils.js +250 -0
  197. package/src/skills/increment-planner/test-cases/test-1-basic-feature.yaml +27 -0
  198. package/src/skills/increment-planner/test-cases/test-2-complex-feature.yaml +30 -0
  199. package/src/skills/increment-planner/test-cases/test-3-auto-numbering.yaml +24 -0
  200. package/src/skills/increment-quality-judge/SKILL.md +566 -0
  201. package/src/skills/increment-quality-judge/test-cases/test-1-good-spec.yaml +95 -0
  202. package/src/skills/increment-quality-judge/test-cases/test-2-poor-spec.yaml +108 -0
  203. package/src/skills/increment-quality-judge/test-cases/test-3-export-suggestions.yaml +87 -0
  204. package/src/skills/jira-sync/README.md +328 -0
  205. package/src/skills/jira-sync/SKILL.md +209 -0
  206. package/src/skills/jira-sync/test-cases/test-1.yaml +9 -0
  207. package/src/skills/jira-sync/test-cases/test-2.yaml +9 -0
  208. package/src/skills/jira-sync/test-cases/test-3.yaml +10 -0
  209. package/src/skills/nextjs/SKILL.md +176 -0
  210. package/src/skills/nodejs-backend/SKILL.md +181 -0
  211. package/src/skills/notification-system/test-cases/test-1-placeholder.yaml +12 -0
  212. package/src/skills/notification-system/test-cases/test-2-placeholder.yaml +12 -0
  213. package/src/skills/notification-system/test-cases/test-3-placeholder.yaml +12 -0
  214. package/src/skills/python-backend/SKILL.md +226 -0
  215. package/src/skills/role-orchestrator/README.md +197 -0
  216. package/src/skills/role-orchestrator/SKILL.md +1184 -0
  217. package/src/skills/role-orchestrator/test-cases/test-1-simple-product.yaml +98 -0
  218. package/src/skills/role-orchestrator/test-cases/test-2-quality-gate-failure.yaml +73 -0
  219. package/src/skills/role-orchestrator/test-cases/test-3-security-workflow.yaml +121 -0
  220. package/src/skills/role-orchestrator/test-cases/test-4-parallel-execution.yaml +145 -0
  221. package/src/skills/role-orchestrator/test-cases/test-5-feedback-loops.yaml +149 -0
  222. package/src/skills/skill-creator/LICENSE.txt +202 -0
  223. package/src/skills/skill-creator/SKILL.md +209 -0
  224. package/src/skills/skill-creator/scripts/init_skill.py +303 -0
  225. package/src/skills/skill-creator/scripts/package_skill.py +110 -0
  226. package/src/skills/skill-creator/scripts/quick_validate.py +65 -0
  227. package/src/skills/skill-creator/test-cases/test-1-placeholder.yaml +12 -0
  228. package/src/skills/skill-creator/test-cases/test-2-placeholder.yaml +12 -0
  229. package/src/skills/skill-creator/test-cases/test-3-placeholder.yaml +12 -0
  230. package/src/skills/skill-router/SKILL.md +497 -0
  231. package/src/skills/skill-router/test-cases/test-1-basic-routing.yaml +33 -0
  232. package/src/skills/skill-router/test-cases/test-2-ambiguous-request.yaml +42 -0
  233. package/src/skills/skill-router/test-cases/test-3-nested-orchestration.yaml +50 -0
  234. package/src/skills/spec-driven-brainstorming/README.md +264 -0
  235. package/src/skills/spec-driven-brainstorming/SKILL.md +439 -0
  236. package/src/skills/spec-driven-brainstorming/test-cases/TC-001-simple-idea-to-design.yaml +148 -0
  237. package/src/skills/spec-driven-brainstorming/test-cases/TC-002-complex-ultrathink-design.yaml +190 -0
  238. package/src/skills/spec-driven-brainstorming/test-cases/TC-003-unclear-requirements-socratic.yaml +233 -0
  239. package/src/skills/spec-driven-debugging/README.md +479 -0
  240. package/src/skills/spec-driven-debugging/SKILL.md +652 -0
  241. package/src/skills/spec-driven-debugging/test-cases/TC-001-simple-auth-bug.yaml +212 -0
  242. package/src/skills/spec-driven-debugging/test-cases/TC-002-race-condition-ultrathink.yaml +461 -0
  243. package/src/skills/spec-driven-debugging/test-cases/TC-003-brownfield-missing-spec.yaml +366 -0
  244. package/src/skills/spec-kit-expert/SKILL.md +1012 -0
  245. package/src/skills/spec-kit-expert/test-cases/test-1-placeholder.yaml +12 -0
  246. package/src/skills/spec-kit-expert/test-cases/test-2-placeholder.yaml +12 -0
  247. package/src/skills/spec-kit-expert/test-cases/test-3-placeholder.yaml +12 -0
  248. package/src/skills/specweave-ado-mapper/SKILL.md +501 -0
  249. package/src/skills/specweave-detector/SKILL.md +420 -0
  250. package/src/skills/specweave-detector/test-cases/test-1-basic-detection.yaml +37 -0
  251. package/src/skills/specweave-detector/test-cases/test-2-missing-config.yaml +37 -0
  252. package/src/skills/specweave-detector/test-cases/test-3-non-specweave-project.yaml +34 -0
  253. package/src/skills/specweave-jira-mapper/SKILL.md +500 -0
  254. package/src/skills/stripe-integrator/test-cases/test-1-placeholder.yaml +12 -0
  255. package/src/skills/stripe-integrator/test-cases/test-2-placeholder.yaml +12 -0
  256. package/src/skills/stripe-integrator/test-cases/test-3-placeholder.yaml +12 -0
  257. package/src/skills/task-builder/README.md +90 -0
  258. package/src/skills/task-builder/test-cases/test-1-placeholder.yaml +12 -0
  259. package/src/skills/task-builder/test-cases/test-2-placeholder.yaml +12 -0
  260. package/src/skills/task-builder/test-cases/test-3-placeholder.yaml +12 -0
  261. package/src/templates/.env.example +144 -0
  262. package/src/templates/.gitignore.template +81 -0
  263. package/src/templates/CLAUDE.md.template +383 -0
  264. package/src/templates/README.md.template +240 -0
  265. package/src/templates/config.yaml +333 -0
  266. package/src/templates/docs/README.md +124 -0
  267. package/src/templates/docs/adr-template.md +118 -0
  268. package/src/templates/docs/hld-template.md +220 -0
  269. package/src/templates/docs/lld-template.md +580 -0
  270. package/src/templates/docs/prd-template.md +132 -0
  271. package/src/templates/docs/rfc-template.md +229 -0
  272. package/src/templates/docs/runbook-template.md +298 -0
  273. package/src/templates/environments/minimal/.env.production +16 -0
  274. package/src/templates/environments/minimal/README.md +54 -0
  275. package/src/templates/environments/minimal/deploy-production.yml +52 -0
  276. package/src/templates/environments/progressive/.env.qa +28 -0
  277. package/src/templates/environments/progressive/README.md +129 -0
  278. package/src/templates/environments/progressive/deploy-production.yml +93 -0
  279. package/src/templates/environments/progressive/deploy-qa.yml +62 -0
  280. package/src/templates/environments/progressive/deploy-staging.yml +67 -0
  281. package/src/templates/environments/standard/.env.development +20 -0
  282. package/src/templates/environments/standard/.env.production +30 -0
  283. package/src/templates/environments/standard/.env.staging +23 -0
  284. package/src/templates/environments/standard/README.md +97 -0
  285. package/src/templates/environments/standard/deploy-production.yml +68 -0
  286. package/src/templates/environments/standard/deploy-staging.yml +61 -0
  287. package/src/templates/environments/standard/docker-compose.yml +43 -0
  288. package/src/templates/increment-metadata-template.yaml +138 -0
@@ -0,0 +1,204 @@
1
+ # Playbook: High CPU Usage
2
+
3
+ ## Symptoms
4
+
5
+ - CPU usage at 80-100%
6
+ - Applications slow or unresponsive
7
+ - Server lag, SSH slow
8
+ - Monitoring alert: "CPU usage >80% for 5 minutes"
9
+
10
+ ## Severity
11
+
12
+ - **SEV2** if application degraded but functional
13
+ - **SEV1** if application unresponsive
14
+
15
+ ## Diagnosis
16
+
17
+ ### Step 1: Identify Top CPU Process
18
+
19
+ ```bash
20
+ # Current CPU usage
21
+ top -bn1 | head -20
22
+
23
+ # Top CPU processes
24
+ ps aux | sort -nrk 3,3 | head -10
25
+
26
+ # CPU per thread
27
+ top -H -p <PID>
28
+ ```
29
+
30
+ **What to look for**:
31
+ - Single process using >80% CPU
32
+ - Multiple processes all high (system-wide issue)
33
+ - System CPU vs user CPU (iowait = disk issue)
34
+
35
+ ---
36
+
37
+ ### Step 2: Identify Process Type
38
+
39
+ **Application process** (node, java, python):
40
+ ```bash
41
+ # Check application logs
42
+ tail -100 /var/log/application.log
43
+
44
+ # Check for infinite loops, heavy computation
45
+ # Check APM for slow endpoints
46
+ ```
47
+
48
+ **System process** (kernel, systemd):
49
+ ```bash
50
+ # Check system logs
51
+ dmesg | tail -50
52
+ journalctl -xe
53
+
54
+ # Check for hardware issues
55
+ ```
56
+
57
+ **Unknown/suspicious process**:
58
+ ```bash
59
+ # Check process details
60
+ ps -fp <PID>
61
+ lsof -p <PID>
62
+
63
+ # Could be malware (crypto mining)
64
+ # See security-incidents.md
65
+ ```
66
+
67
+ ---
68
+
69
+ ### Step 3: Check If Disk-Related
70
+
71
+ ```bash
72
+ # Check iowait
73
+ iostat -x 1 5
74
+
75
+ # If iowait >20%, disk is bottleneck
76
+ # See infrastructure.md for disk I/O troubleshooting
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Mitigation
82
+
83
+ ### Immediate (Now - 5 min)
84
+
85
+ **Option A: Lower Process Priority**
86
+ ```bash
87
+ # Reduce CPU priority
88
+ renice +10 <PID>
89
+
90
+ # Impact: Process gets less CPU time
91
+ # Risk: Low (process still runs, just slower)
92
+ ```
93
+
94
+ **Option B: Kill Process** (if application)
95
+ ```bash
96
+ # Graceful shutdown
97
+ kill -TERM <PID>
98
+
99
+ # Force kill (last resort)
100
+ kill -KILL <PID>
101
+
102
+ # Restart service
103
+ systemctl restart <service>
104
+
105
+ # Impact: Process restarts, CPU normalizes
106
+ # Risk: Medium (brief downtime)
107
+ ```
108
+
109
+ **Option C: Scale Horizontally** (cloud)
110
+ ```bash
111
+ # Add more instances to distribute load
112
+ # AWS: Auto Scaling Group
113
+ # Azure: Scale Set
114
+ # Kubernetes: Horizontal Pod Autoscaler
115
+
116
+ # Impact: Load distributed across instances
117
+ # Risk: Low (no downtime)
118
+ ```
119
+
120
+ ---
121
+
122
+ ### Short-term (5 min - 1 hour)
123
+
124
+ **Option A: Optimize Code** (if application bug)
125
+ ```bash
126
+ # Profile application
127
+ # Node.js: node --prof
128
+ # Java: jstack, jvisualvm
129
+ # Python: py-spy
130
+
131
+ # Identify hot path
132
+ # Fix infinite loop, heavy computation
133
+ ```
134
+
135
+ **Option B: Add Caching**
136
+ ```javascript
137
+ // Cache expensive computation
138
+ const cache = new Map();
139
+
140
+ function expensiveOperation(input) {
141
+ if (cache.has(input)) {
142
+ return cache.get(input);
143
+ }
144
+
145
+ const result = computeResult(input); // placeholder for the heavy computation
146
+ cache.set(input, result);
147
+ return result;
148
+ }
149
+ ```
150
+
151
+ **Option C: Scale Vertically** (cloud)
152
+ ```bash
153
+ # Resize to larger instance type
154
+ # AWS: Change instance type (t3.medium → t3.xlarge; t3.large has the same 2 vCPUs)
155
+ # Azure: Resize VM
156
+ # Impact: More CPU capacity
157
+ # Risk: Medium (brief downtime during resize)
158
+ ```
159
+
160
+ ---
161
+
162
+ ### Long-term (1 hour+)
163
+
164
+ - [ ] Add CPU monitoring alert (>70% for 5 min)
165
+ - [ ] Optimize application code (reduce computation)
166
+ - [ ] Use worker threads for heavy tasks (Node.js)
167
+ - [ ] Implement auto-scaling (cloud)
168
+ - [ ] Add APM for performance profiling
169
+ - [ ] Review architecture (async processing, job queues)
170
+
171
+ ---
172
+
173
+ ## Escalation
174
+
175
+ **Escalate to developer if**:
176
+ - Application code causing issue
177
+ - Requires code fix or optimization
178
+
179
+ **Escalate to security-agent if**:
180
+ - Unknown/suspicious process
181
+ - Potential malware or crypto mining
182
+
183
+ **Escalate to infrastructure if**:
184
+ - Hardware issue (kernel errors)
185
+ - Cloud infrastructure problem
186
+
187
+ ---
188
+
189
+ ## Related Runbooks
190
+
191
+ - [03-memory-leak.md](03-memory-leak.md) - If memory also high
192
+ - [04-slow-api-response.md](04-slow-api-response.md) - If API slow due to CPU
193
+ - [../modules/infrastructure.md](../modules/infrastructure.md) - Infrastructure diagnostics
194
+
195
+ ---
196
+
197
+ ## Post-Incident
198
+
199
+ After resolving:
200
+ - [ ] Create post-mortem (if SEV1/SEV2)
201
+ - [ ] Identify root cause
202
+ - [ ] Add monitoring/alerting
203
+ - [ ] Update this runbook if needed
204
+ - [ ] Add regression test (if code bug)
@@ -0,0 +1,241 @@
1
+ # Playbook: Database Deadlock
2
+
3
+ ## Symptoms
4
+
5
+ - "Deadlock detected" errors in application
6
+ - API returning 500 errors
7
+ - Transactions timing out
8
+ - Database connection pool exhausted
9
+ - Monitoring alert: "Deadlock count >0"
10
+
11
+ ## Severity
12
+
13
+ - **SEV2** if isolated to specific endpoint
14
+ - **SEV1** if affecting all database operations
15
+
16
+ ## Diagnosis
17
+
18
+ ### Step 1: Confirm Deadlock (PostgreSQL)
19
+
20
+ ```sql
21
+ -- Check for currently blocked queries and the sessions blocking them
22
+ SELECT
23
+ blocked_locks.pid AS blocked_pid,
24
+ blocked_activity.usename AS blocked_user,
25
+ blocking_locks.pid AS blocking_pid,
26
+ blocking_activity.usename AS blocking_user,
27
+ blocked_activity.query AS blocked_statement,
28
+ blocking_activity.query AS blocking_statement
29
+ FROM pg_catalog.pg_locks blocked_locks
30
+ JOIN pg_catalog.pg_stat_activity blocked_activity
31
+ ON blocked_activity.pid = blocked_locks.pid
32
+ JOIN pg_catalog.pg_locks blocking_locks
33
+ ON blocking_locks.locktype = blocked_locks.locktype
34
+ AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
35
+ AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
36
+ AND blocking_locks.pid != blocked_locks.pid
37
+ JOIN pg_catalog.pg_stat_activity blocking_activity
38
+ ON blocking_activity.pid = blocking_locks.pid
39
+ WHERE NOT blocked_locks.granted;
40
+
41
+ -- Check cumulative deadlock count (see the "deadlocks" column)
42
+ SELECT * FROM pg_stat_database WHERE datname = 'your_database';
43
+ ```
44
+
45
+ ### Step 2: Confirm Deadlock (MySQL)
46
+
47
+ ```sql
48
+ -- Show InnoDB status (includes deadlock info)
49
+ SHOW ENGINE INNODB STATUS\G
50
+
51
+ -- Look for "LATEST DETECTED DEADLOCK" section
52
+ -- Shows which transactions were involved
53
+ ```
54
+
55
+ ---
56
+
57
+ ### Step 3: Identify Deadlock Pattern
58
+
59
+ **Common Pattern 1: Lock Order Mismatch**
60
+ ```
61
+ Transaction A: Locks row 1, then row 2
62
+ Transaction B: Locks row 2, then row 1
63
+ → DEADLOCK
64
+ ```
65
+
66
+ **Common Pattern 2: Gap Locks**
67
+ ```
68
+ Transaction A: SELECT ... FOR UPDATE WHERE id BETWEEN 1 AND 10
69
+ Transaction B: INSERT INTO table (id) VALUES (5)
70
+ → DEADLOCK
71
+ ```
72
+
73
+ **Common Pattern 3: Foreign Key Deadlock**
74
+ ```
75
+ Transaction A: Updates parent table
76
+ Transaction B: Inserts into child table
77
+ → DEADLOCK (foreign key check locks)
78
+ ```
79
+
80
+ ---
81
+
82
+ ## Mitigation
83
+
84
+ ### Immediate (Now - 5 min)
85
+
86
+ **Option A: Kill Blocking Query** (PostgreSQL)
87
+ ```sql
88
+ -- Terminate blocking process
89
+ SELECT pg_terminate_backend(<blocking_pid>);
90
+
91
+ -- Verify deadlock cleared
92
+ SELECT count(*) FROM pg_locks WHERE NOT granted;
93
+ -- Should return 0
94
+ ```
95
+
96
+ **Option B: Kill Blocking Query** (MySQL)
97
+ ```sql
98
+ -- Show process list
99
+ SHOW PROCESSLIST;
100
+
101
+ -- Kill blocking query
102
+ KILL <process_id>;
103
+ ```
104
+
105
+ **Option C: Kill Idle Transactions** (PostgreSQL)
106
+ ```sql
107
+ -- Terminate transactions that have been idle for >5 min
108
+ SELECT pg_terminate_backend(pid)
109
+ FROM pg_stat_activity
110
+ WHERE state = 'idle in transaction'
111
+ AND state_change < NOW() - INTERVAL '5 minutes';
112
+
113
+ -- Impact: Frees up locks
114
+ -- Risk: Low (transactions are idle; any uncommitted work is rolled back)
115
+ ```
116
+
117
+ ---
118
+
119
+ ### Short-term (5 min - 1 hour)
120
+
121
+ **Option A: Add Transaction Timeout** (PostgreSQL)
122
+ ```sql
123
+ -- Set statement timeout (30 seconds)
124
+ ALTER DATABASE your_database SET statement_timeout = '30s';
125
+
126
+ -- Or in application:
127
+ SET statement_timeout = '30s';
128
+
129
+ -- Impact: Aborts long-running statements so they cannot hold locks indefinitely
130
+ -- Risk: Low (transactions should be fast)
131
+ ```
132
+
133
+ **Option B: Add Transaction Timeout** (MySQL)
134
+ ```sql
135
+ -- Set lock wait timeout
136
+ SET GLOBAL innodb_lock_wait_timeout = 30;
137
+
138
+ -- Impact: Transactions fail instead of waiting forever
139
+ -- Risk: Low (application should handle errors)
140
+ ```
141
+
142
+ **Option C: Fix Lock Order in Application**
143
+ ```javascript
144
+ // BAD: Inconsistent lock order
145
+ async function transferMoney(fromId, toId, amount) {
146
+ await db.query('UPDATE accounts SET balance = balance - ? WHERE id = ?', [amount, fromId]);
147
+ await db.query('UPDATE accounts SET balance = balance + ? WHERE id = ?', [amount, toId]);
148
+ }
149
+
150
+ // GOOD: Consistent lock order
151
+ async function transferMoney(fromId, toId, amount) {
152
+ const firstId = Math.min(fromId, toId);
153
+ const secondId = Math.max(fromId, toId);
154
+
155
+ await db.query('UPDATE accounts SET balance = balance + ? WHERE id = ?', [firstId === fromId ? -amount : amount, firstId]);
156
+ await db.query('UPDATE accounts SET balance = balance + ? WHERE id = ?', [secondId === toId ? amount : -amount, secondId]);
157
+ }
158
+ ```
159
+
160
+ ---
161
+
162
+ ### Long-term (1 hour+)
163
+
164
+ **Option A: Reduce Transaction Scope**
165
+ ```javascript
166
+ // BAD: Long transaction
167
+ BEGIN;
168
+ const user = await db.query('SELECT * FROM users WHERE id = ? FOR UPDATE', [userId]);
169
+ await sendEmail(user.email); // External call (slow!)
170
+ await db.query('UPDATE users SET last_email_sent = NOW() WHERE id = ?', [userId]);
171
+ COMMIT;
172
+
173
+ // GOOD: Short transaction
174
+ const user = await db.query('SELECT * FROM users WHERE id = ?', [userId]);
175
+ await sendEmail(user.email); // Outside transaction
176
+ await db.query('UPDATE users SET last_email_sent = NOW() WHERE id = ?', [userId]);
177
+ ```
178
+
179
+ **Option B: Use Optimistic Locking**
180
+ ```sql
181
+ -- Add version column
182
+ ALTER TABLE accounts ADD COLUMN version INT DEFAULT 0;
183
+
184
+ -- Update with version check
185
+ UPDATE accounts
186
+ SET balance = balance - 100, version = version + 1
187
+ WHERE id = 1 AND version = <current_version>;
188
+
189
+ -- If 0 rows updated, retry with new version
190
+ ```
191
+
192
+ **Option C: Review Isolation Level**
193
+ ```sql
194
+ -- PostgreSQL default: READ COMMITTED
195
+ -- Most cases: READ COMMITTED is fine
196
+ -- Rare cases: REPEATABLE READ or SERIALIZABLE
197
+
198
+ -- Lower isolation = less locking = fewer deadlocks
199
+ SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
200
+ ```
201
+
202
+ ---
203
+
204
+ ## Escalation
205
+
206
+ **Escalate to developer if**:
207
+ - Application code causing deadlock
208
+ - Requires code refactoring
209
+
210
+ **Escalate to DBA if**:
211
+ - Database configuration issue
212
+ - Foreign key constraint problem
213
+
214
+ ---
215
+
216
+ ## Prevention
217
+
218
+ - [ ] Always lock in same order
219
+ - [ ] Keep transactions short
220
+ - [ ] Use timeout (statement_timeout, innodb_lock_wait_timeout)
221
+ - [ ] Use optimistic locking when possible
222
+ - [ ] Add deadlock monitoring alert
223
+ - [ ] Review isolation level (lower = fewer deadlocks)
224
+
225
+ ---
226
+
227
+ ## Related Runbooks
228
+
229
+ - [04-slow-api-response.md](04-slow-api-response.md) - If API slow due to deadlock
230
+ - [../modules/database-diagnostics.md](../modules/database-diagnostics.md) - Database troubleshooting
231
+
232
+ ---
233
+
234
+ ## Post-Incident
235
+
236
+ After resolving:
237
+ - [ ] Create post-mortem
238
+ - [ ] Identify which queries deadlocked
239
+ - [ ] Fix lock order in application code
240
+ - [ ] Add regression test
241
+ - [ ] Update this runbook if needed
@@ -0,0 +1,252 @@
1
+ # Playbook: Memory Leak
2
+
3
+ ## Symptoms
4
+
5
+ - Memory usage increasing continuously over time
6
+ - Application crashes with OutOfMemoryError (Java) or "JavaScript heap out of memory" (Node.js)
7
+ - Performance degrades over time
8
+ - High swap usage
9
+ - Monitoring alert: "Memory usage >90%"
10
+
11
+ ## Severity
12
+
13
+ - **SEV2** if memory increasing but not yet critical
14
+ - **SEV1** if application crashed or unresponsive
15
+
16
+ ## Diagnosis
17
+
18
+ ### Step 1: Confirm Memory Leak
19
+
20
+ ```bash
21
+ # Monitor memory over time (5-minute intervals)
22
+ watch -n 300 'ps aux | grep <process> | awk "{print \$4, \$5, \$6}"'
23
+
24
+ # Check if memory continuously increasing
25
+ # Leak: 20% → 30% → 40% → 50% (linear growth)
26
+ # Normal: 30% → 32% → 31% → 30% (stable)
27
+ ```
28
+
29
+ ---
30
+
31
+ ### Step 2: Get Memory Snapshot
32
+
33
+ **Java (Heap Dump)**:
34
+ ```bash
35
+ # Get heap dump
36
+ jmap -dump:format=b,file=heap.bin <PID>
37
+
38
+ # Analyze with jhat (JDK 8 and earlier only; removed in JDK 9) or VisualVM
39
+ jhat heap.bin
40
+ # Open http://localhost:7000
41
+
42
+ # Or use Eclipse Memory Analyzer
43
+ ```
44
+
45
+ **Node.js (Heap Snapshot)**:
46
+ ```bash
47
+ # Start with --inspect
48
+ node --inspect index.js
49
+
50
+ # Chrome DevTools → Memory → Take heap snapshot
51
+
52
+ # Or use the heapdump module (the next two lines are JavaScript: add them to the application code, not the shell)
53
+ const heapdump = require('heapdump');
54
+ heapdump.writeSnapshot('/tmp/heap-' + Date.now() + '.heapsnapshot');
55
+ ```
56
+
57
+ **Python (Memory Profiler)**:
58
+ ```bash
59
+ # Install memory_profiler
60
+ pip install memory_profiler
61
+
62
+ # Profile function
63
+ python -m memory_profiler script.py
64
+ ```
65
+
66
+ ---
67
+
68
+ ### Step 3: Identify Leak Source
69
+
70
+ **Look for**:
71
+ - Large arrays/objects growing over time
72
+ - Detached DOM nodes (if browser/UI)
73
+ - Event listeners not removed
74
+ - Timers/intervals not cleared
75
+ - Closures holding references
76
+ - Cache without eviction policy
77
+
78
+ **Common patterns**:
79
+ ```javascript
80
+ // 1. Global cache growing forever
81
+ global.cache = {}; // Never cleared
82
+
83
+ // 2. Event listeners not removed
84
+ emitter.on('event', handler); // Never removed
85
+
86
+ // 3. Timers not cleared
87
+ setInterval(() => { /* ... */ }, 1000); // Never cleared
88
+
89
+ // 4. Closures
90
+ function createHandler() {
91
+ const largeData = new Array(1000000);
92
+ return () => {
93
+ // Closure keeps largeData in memory
94
+ };
95
+ }
96
+ ```
97
+
98
+ ---
99
+
100
+ ## Mitigation
101
+
102
+ ### Immediate (Now - 5 min)
103
+
104
+ **Option A: Restart Application**
105
+ ```bash
106
+ # Restart to free memory
107
+ systemctl restart application
108
+
109
+ # Impact: Memory usage returns to baseline
110
+ # Risk: Low (brief downtime)
111
+ # NOTE: This is temporary, leak will recur!
112
+ ```
113
+
114
+ **Option B: Increase Memory Limit** (temporary)
115
+ ```bash
116
+ # Java
117
+ java -Xmx4G -jar application.jar # Was 2G
118
+
119
+ # Node.js
120
+ node --max-old-space-size=4096 index.js # Was 2048
121
+
122
+ # Impact: Buys time to find root cause
123
+ # Risk: Low (but doesn't fix leak)
124
+ ```
125
+
126
+ **Option C: Scale Horizontally** (cloud)
127
+ ```bash
128
+ # Add more instances
129
+ # Use load balancer to rotate traffic
130
+ # Restart instances on schedule (e.g., every 6 hours)
131
+
132
+ # Impact: Distributes load, restarts prevent OOM
133
+ # Risk: Low (but doesn't fix root cause)
134
+ ```
135
+
136
+ ---
137
+
138
+ ### Short-term (5 min - 1 hour)
139
+
140
+ **Analyze heap dump** and identify leak source
141
+
142
+ **Common Fixes**:
143
+
144
+ **1. Add LRU Cache**
145
+ ```javascript
146
+ // BAD: Unbounded cache
147
+ const cache = {};
148
+
149
+ // GOOD: LRU cache with size limit
150
+ const LRU = require('lru-cache');
151
+ const cache = new LRU({ max: 1000 });
152
+ ```
153
+
154
+ **2. Remove Event Listeners**
155
+ ```javascript
156
+ // Add listener
157
+ const handler = () => { /* ... */ };
158
+ emitter.on('event', handler);
159
+
160
+ // CRITICAL: Remove later
161
+ emitter.off('event', handler);
162
+
163
+ // React/Vue: cleanup in componentWillUnmount/onUnmounted
164
+ ```
165
+
166
+ **3. Clear Timers**
167
+ ```javascript
168
+ // Set timer
169
+ const intervalId = setInterval(() => { /* ... */ }, 1000);
170
+
171
+ // CRITICAL: Clear later
172
+ clearInterval(intervalId);
173
+
174
+ // React: cleanup in useEffect return
175
+ useEffect(() => {
176
+ const id = setInterval(() => { /* ... */ }, 1000);
177
+ return () => clearInterval(id);
178
+ }, []);
179
+ ```
180
+
181
+ **4. Close Connections**
182
+ ```javascript
183
+ // BAD: Connection leak
184
+ const conn = await db.connect();
185
+ await conn.query(/* ... */);
186
+ // Connection never closed!
187
+
188
+ // GOOD: Always close
189
+ const conn = await db.connect();
190
+ try {
191
+ await conn.query(/* ... */);
192
+ } finally {
193
+ await conn.close(); // CRITICAL
194
+ }
195
+ ```
196
+
197
+ ---
198
+
199
+ ### Long-term (1 hour+)
200
+
201
+ - [ ] Add memory monitoring (alert if >80% and increasing)
202
+ - [ ] Add memory profiling to CI/CD (detect leaks early)
203
+ - [ ] Use WeakMap for caches keyed by objects (an entry becomes garbage-collectable once its key is no longer referenced elsewhere)
204
+ - [ ] Review closure usage (avoid holding large data)
205
+ - [ ] Add automated restart (every N hours, if leak can't be fixed immediately)
206
+ - [ ] Load test to reproduce leak in test environment
207
+ - [ ] Fix root cause in code
208
+
209
+ ---
210
+
211
+ ## Escalation
212
+
213
+ **Escalate to developer if**:
214
+ - Application code causing leak
215
+ - Requires code fix
216
+
217
+ **Escalate to platform team if**:
218
+ - Platform/framework bug
219
+ - Requires upgrade or workaround
220
+
221
+ ---
222
+
223
+ ## Prevention Checklist
224
+
225
+ - [ ] Use LRU cache (not unbounded)
226
+ - [ ] Remove event listeners in cleanup
227
+ - [ ] Clear timers/intervals
228
+ - [ ] Close database connections (use `finally`)
229
+ - [ ] Avoid closures holding large data
230
+ - [ ] Use WeakMap for temporary caches keyed by objects
231
+ - [ ] Profile memory in development
232
+ - [ ] Load test before production
233
+
234
+ ---
235
+
236
+ ## Related Runbooks
237
+
238
+ - [01-high-cpu-usage.md](01-high-cpu-usage.md) - If CPU also high
239
+ - [07-service-down.md](07-service-down.md) - If OOM crashed service
240
+ - [../modules/backend-diagnostics.md](../modules/backend-diagnostics.md) - Backend troubleshooting
241
+
242
+ ---
243
+
244
+ ## Post-Incident
245
+
246
+ After resolving:
247
+ - [ ] Create post-mortem
248
+ - [ ] Identify leak source from heap dump
249
+ - [ ] Fix code
250
+ - [ ] Add regression test (memory profiling)
251
+ - [ ] Add monitoring alert
252
+ - [ ] Update this runbook if needed