specweave 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/INSTALL.md +848 -0
  2. package/LICENSE +21 -0
  3. package/README.md +675 -0
  4. package/SPECWEAVE.md +665 -0
  5. package/bin/install-agents.sh +57 -0
  6. package/bin/install-all.sh +49 -0
  7. package/bin/install-commands.sh +56 -0
  8. package/bin/install-skills.sh +57 -0
  9. package/bin/specweave.js +81 -0
  10. package/dist/adapters/adapter-base.d.ts +50 -0
  11. package/dist/adapters/adapter-base.d.ts.map +1 -0
  12. package/dist/adapters/adapter-base.js +146 -0
  13. package/dist/adapters/adapter-base.js.map +1 -0
  14. package/dist/adapters/adapter-interface.d.ts +108 -0
  15. package/dist/adapters/adapter-interface.d.ts.map +1 -0
  16. package/dist/adapters/adapter-interface.js +9 -0
  17. package/dist/adapters/adapter-interface.js.map +1 -0
  18. package/dist/adapters/claude/adapter.d.ts +54 -0
  19. package/dist/adapters/claude/adapter.d.ts.map +1 -0
  20. package/dist/adapters/claude/adapter.js +184 -0
  21. package/dist/adapters/claude/adapter.js.map +1 -0
  22. package/dist/adapters/copilot/adapter.d.ts +42 -0
  23. package/dist/adapters/copilot/adapter.d.ts.map +1 -0
  24. package/dist/adapters/copilot/adapter.js +239 -0
  25. package/dist/adapters/copilot/adapter.js.map +1 -0
  26. package/dist/adapters/cursor/adapter.d.ts +42 -0
  27. package/dist/adapters/cursor/adapter.d.ts.map +1 -0
  28. package/dist/adapters/cursor/adapter.js +297 -0
  29. package/dist/adapters/cursor/adapter.js.map +1 -0
  30. package/dist/adapters/generic/adapter.d.ts +40 -0
  31. package/dist/adapters/generic/adapter.d.ts.map +1 -0
  32. package/dist/adapters/generic/adapter.js +155 -0
  33. package/dist/adapters/generic/adapter.js.map +1 -0
  34. package/dist/cli/commands/init.d.ts +6 -0
  35. package/dist/cli/commands/init.d.ts.map +1 -0
  36. package/dist/cli/commands/init.js +247 -0
  37. package/dist/cli/commands/init.js.map +1 -0
  38. package/dist/cli/commands/install.d.ts +7 -0
  39. package/dist/cli/commands/install.d.ts.map +1 -0
  40. package/dist/cli/commands/install.js +160 -0
  41. package/dist/cli/commands/install.js.map +1 -0
  42. package/dist/cli/commands/list.d.ts +6 -0
  43. package/dist/cli/commands/list.d.ts.map +1 -0
  44. package/dist/cli/commands/list.js +154 -0
  45. package/dist/cli/commands/list.js.map +1 -0
  46. package/package.json +90 -0
  47. package/src/adapters/README.md +312 -0
  48. package/src/adapters/adapter-base.ts +146 -0
  49. package/src/adapters/adapter-interface.ts +120 -0
  50. package/src/adapters/claude/README.md +241 -0
  51. package/src/adapters/claude/adapter.ts +157 -0
  52. package/src/adapters/copilot/.github/copilot/instructions.md +376 -0
  53. package/src/adapters/copilot/README.md +200 -0
  54. package/src/adapters/copilot/adapter.ts +210 -0
  55. package/src/adapters/cursor/.cursor/context/docs-context.md +62 -0
  56. package/src/adapters/cursor/.cursor/context/increments-context.md +71 -0
  57. package/src/adapters/cursor/.cursor/context/strategy-context.md +73 -0
  58. package/src/adapters/cursor/.cursor/context/tests-context.md +89 -0
  59. package/src/adapters/cursor/.cursorrules +325 -0
  60. package/src/adapters/cursor/README.md +243 -0
  61. package/src/adapters/cursor/adapter.ts +268 -0
  62. package/src/adapters/generic/README.md +277 -0
  63. package/src/adapters/generic/SPECWEAVE-MANUAL.md +676 -0
  64. package/src/adapters/generic/adapter.ts +159 -0
  65. package/src/adapters/registry.yaml +126 -0
  66. package/src/agents/architect/AGENT.md +416 -0
  67. package/src/agents/devops/AGENT.md +1738 -0
  68. package/src/agents/docs-writer/AGENT.md +239 -0
  69. package/src/agents/performance/AGENT.md +228 -0
  70. package/src/agents/pm/AGENT.md +751 -0
  71. package/src/agents/qa-lead/AGENT.md +150 -0
  72. package/src/agents/security/AGENT.md +179 -0
  73. package/src/agents/sre/AGENT.md +582 -0
  74. package/src/agents/sre/modules/backend-diagnostics.md +481 -0
  75. package/src/agents/sre/modules/database-diagnostics.md +509 -0
  76. package/src/agents/sre/modules/infrastructure.md +561 -0
  77. package/src/agents/sre/modules/monitoring.md +439 -0
  78. package/src/agents/sre/modules/security-incidents.md +421 -0
  79. package/src/agents/sre/modules/ui-diagnostics.md +302 -0
  80. package/src/agents/sre/playbooks/01-high-cpu-usage.md +204 -0
  81. package/src/agents/sre/playbooks/02-database-deadlock.md +241 -0
  82. package/src/agents/sre/playbooks/03-memory-leak.md +252 -0
  83. package/src/agents/sre/playbooks/04-slow-api-response.md +269 -0
  84. package/src/agents/sre/playbooks/05-ddos-attack.md +293 -0
  85. package/src/agents/sre/playbooks/06-disk-full.md +314 -0
  86. package/src/agents/sre/playbooks/07-service-down.md +333 -0
  87. package/src/agents/sre/playbooks/08-data-corruption.md +337 -0
  88. package/src/agents/sre/playbooks/09-cascade-failure.md +430 -0
  89. package/src/agents/sre/playbooks/10-rate-limit-exceeded.md +464 -0
  90. package/src/agents/sre/scripts/health-check.sh +230 -0
  91. package/src/agents/sre/scripts/log-analyzer.py +213 -0
  92. package/src/agents/sre/scripts/metrics-collector.sh +294 -0
  93. package/src/agents/sre/scripts/trace-analyzer.js +257 -0
  94. package/src/agents/sre/templates/incident-report.md +249 -0
  95. package/src/agents/sre/templates/mitigation-plan.md +375 -0
  96. package/src/agents/sre/templates/post-mortem.md +418 -0
  97. package/src/agents/sre/templates/runbook-template.md +412 -0
  98. package/src/agents/tech-lead/AGENT.md +263 -0
  99. package/src/commands/add-tasks.md +176 -0
  100. package/src/commands/close-increment.md +347 -0
  101. package/src/commands/create-increment.md +223 -0
  102. package/src/commands/create-project.md +528 -0
  103. package/src/commands/generate-docs.md +623 -0
  104. package/src/commands/list-increments.md +180 -0
  105. package/src/commands/review-docs.md +331 -0
  106. package/src/commands/start-increment.md +139 -0
  107. package/src/commands/sync-github.md +115 -0
  108. package/src/commands/validate-increment.md +800 -0
  109. package/src/hooks/README.md +252 -0
  110. package/src/hooks/docs-changed.sh +59 -0
  111. package/src/hooks/human-input-required.sh +55 -0
  112. package/src/hooks/post-task-completion.sh +57 -0
  113. package/src/hooks/pre-implementation.sh +47 -0
  114. package/src/skills/ado-sync/README.md +449 -0
  115. package/src/skills/ado-sync/SKILL.md +245 -0
  116. package/src/skills/ado-sync/test-cases/test-1.yaml +9 -0
  117. package/src/skills/ado-sync/test-cases/test-2.yaml +8 -0
  118. package/src/skills/ado-sync/test-cases/test-3.yaml +9 -0
  119. package/src/skills/bmad-method-expert/SKILL.md +628 -0
  120. package/src/skills/bmad-method-expert/scripts/analyze-project.js +318 -0
  121. package/src/skills/bmad-method-expert/scripts/check-setup.js +208 -0
  122. package/src/skills/bmad-method-expert/scripts/generate-template.js +1149 -0
  123. package/src/skills/bmad-method-expert/scripts/validate-documents.js +340 -0
  124. package/src/skills/bmad-method-expert/test-cases/test-1-placeholder.yaml +12 -0
  125. package/src/skills/bmad-method-expert/test-cases/test-2-placeholder.yaml +12 -0
  126. package/src/skills/bmad-method-expert/test-cases/test-3-placeholder.yaml +12 -0
  127. package/src/skills/brownfield-analyzer/SKILL.md +523 -0
  128. package/src/skills/brownfield-analyzer/test-cases/test-1-basic-analysis.yaml +48 -0
  129. package/src/skills/brownfield-analyzer/test-cases/test-2-placeholder.yaml +12 -0
  130. package/src/skills/brownfield-analyzer/test-cases/test-3-placeholder.yaml +12 -0
  131. package/src/skills/brownfield-onboarder/SKILL.md +625 -0
  132. package/src/skills/brownfield-onboarder/test-cases/test-1-placeholder.yaml +12 -0
  133. package/src/skills/brownfield-onboarder/test-cases/test-2-placeholder.yaml +12 -0
  134. package/src/skills/brownfield-onboarder/test-cases/test-3-placeholder.yaml +12 -0
  135. package/src/skills/calendar-system/test-cases/test-1-placeholder.yaml +12 -0
  136. package/src/skills/calendar-system/test-cases/test-2-placeholder.yaml +12 -0
  137. package/src/skills/calendar-system/test-cases/test-3-placeholder.yaml +12 -0
  138. package/src/skills/context-loader/SKILL.md +734 -0
  139. package/src/skills/context-loader/test-cases/test-1-basic-loading.yaml +39 -0
  140. package/src/skills/context-loader/test-cases/test-2-token-budget-exceeded.yaml +44 -0
  141. package/src/skills/context-loader/test-cases/test-3-section-anchors.yaml +45 -0
  142. package/src/skills/context-optimizer/SKILL.md +618 -0
  143. package/src/skills/context-optimizer/test-cases/test-1-bug-fix-narrow.yaml +97 -0
  144. package/src/skills/context-optimizer/test-cases/test-2-feature-focused.yaml +109 -0
  145. package/src/skills/context-optimizer/test-cases/test-3-architecture-broad.yaml +98 -0
  146. package/src/skills/cost-optimizer/SKILL.md +190 -0
  147. package/src/skills/cost-optimizer/test-cases/test-1-basic-comparison.yaml +75 -0
  148. package/src/skills/cost-optimizer/test-cases/test-2-budget-constraint.yaml +52 -0
  149. package/src/skills/cost-optimizer/test-cases/test-3-scale-requirement.yaml +63 -0
  150. package/src/skills/cost-optimizer/test-results/README.md +46 -0
  151. package/src/skills/design-system-architect/SKILL.md +107 -0
  152. package/src/skills/design-system-architect/test-cases/test-1-token-structure.yaml +23 -0
  153. package/src/skills/design-system-architect/test-cases/test-2-component-hierarchy.yaml +24 -0
  154. package/src/skills/design-system-architect/test-cases/test-3-accessibility-checklist.yaml +23 -0
  155. package/src/skills/diagrams-architect/SKILL.md +763 -0
  156. package/src/skills/diagrams-generator/SKILL.md +25 -0
  157. package/src/skills/diagrams-generator/test-cases/test-1.yaml +9 -0
  158. package/src/skills/diagrams-generator/test-cases/test-2.yaml +9 -0
  159. package/src/skills/diagrams-generator/test-cases/test-3.yaml +8 -0
  160. package/src/skills/docs-updater/README.md +48 -0
  161. package/src/skills/docs-updater/test-cases/test-1-placeholder.yaml +12 -0
  162. package/src/skills/docs-updater/test-cases/test-2-placeholder.yaml +12 -0
  163. package/src/skills/docs-updater/test-cases/test-3-placeholder.yaml +12 -0
  164. package/src/skills/dotnet-backend/SKILL.md +250 -0
  165. package/src/skills/e2e-playwright/README.md +506 -0
  166. package/src/skills/e2e-playwright/SKILL.md +457 -0
  167. package/src/skills/e2e-playwright/execute.js +373 -0
  168. package/src/skills/e2e-playwright/lib/utils.js +514 -0
  169. package/src/skills/e2e-playwright/package.json +33 -0
  170. package/src/skills/e2e-playwright/test-cases/TC-001-basic-navigation.yaml +54 -0
  171. package/src/skills/e2e-playwright/test-cases/TC-002-form-interaction.yaml +64 -0
  172. package/src/skills/e2e-playwright/test-cases/TC-003-specweave-integration.yaml +74 -0
  173. package/src/skills/e2e-playwright/test-cases/TC-004-accessibility-check.yaml +98 -0
  174. package/src/skills/figma-designer/SKILL.md +149 -0
  175. package/src/skills/figma-implementer/SKILL.md +148 -0
  176. package/src/skills/figma-mcp-connector/SKILL.md +136 -0
  177. package/src/skills/figma-mcp-connector/test-cases/test-1-read-file-desktop.yaml +22 -0
  178. package/src/skills/figma-mcp-connector/test-cases/test-2-read-file-framelink.yaml +21 -0
  179. package/src/skills/figma-mcp-connector/test-cases/test-3-error-handling.yaml +18 -0
  180. package/src/skills/figma-to-code/SKILL.md +128 -0
  181. package/src/skills/figma-to-code/test-cases/test-1-token-generation.yaml +29 -0
  182. package/src/skills/figma-to-code/test-cases/test-2-component-generation.yaml +27 -0
  183. package/src/skills/figma-to-code/test-cases/test-3-typescript-generation.yaml +28 -0
  184. package/src/skills/frontend/SKILL.md +177 -0
  185. package/src/skills/github-sync/SKILL.md +252 -0
  186. package/src/skills/github-sync/test-cases/test-1-placeholder.yaml +12 -0
  187. package/src/skills/github-sync/test-cases/test-2-placeholder.yaml +12 -0
  188. package/src/skills/github-sync/test-cases/test-3-placeholder.yaml +12 -0
  189. package/src/skills/hetzner-provisioner/README.md +308 -0
  190. package/src/skills/hetzner-provisioner/SKILL.md +251 -0
  191. package/src/skills/hetzner-provisioner/test-cases/test-1-basic-provision.yaml +71 -0
  192. package/src/skills/hetzner-provisioner/test-cases/test-2-postgres-provision.yaml +85 -0
  193. package/src/skills/hetzner-provisioner/test-cases/test-3-ssl-config.yaml +126 -0
  194. package/src/skills/hetzner-provisioner/test-results/README.md +259 -0
  195. package/src/skills/increment-planner/SKILL.md +889 -0
  196. package/src/skills/increment-planner/scripts/feature-utils.js +250 -0
  197. package/src/skills/increment-planner/test-cases/test-1-basic-feature.yaml +27 -0
  198. package/src/skills/increment-planner/test-cases/test-2-complex-feature.yaml +30 -0
  199. package/src/skills/increment-planner/test-cases/test-3-auto-numbering.yaml +24 -0
  200. package/src/skills/increment-quality-judge/SKILL.md +566 -0
  201. package/src/skills/increment-quality-judge/test-cases/test-1-good-spec.yaml +95 -0
  202. package/src/skills/increment-quality-judge/test-cases/test-2-poor-spec.yaml +108 -0
  203. package/src/skills/increment-quality-judge/test-cases/test-3-export-suggestions.yaml +87 -0
  204. package/src/skills/jira-sync/README.md +328 -0
  205. package/src/skills/jira-sync/SKILL.md +209 -0
  206. package/src/skills/jira-sync/test-cases/test-1.yaml +9 -0
  207. package/src/skills/jira-sync/test-cases/test-2.yaml +9 -0
  208. package/src/skills/jira-sync/test-cases/test-3.yaml +10 -0
  209. package/src/skills/nextjs/SKILL.md +176 -0
  210. package/src/skills/nodejs-backend/SKILL.md +181 -0
  211. package/src/skills/notification-system/test-cases/test-1-placeholder.yaml +12 -0
  212. package/src/skills/notification-system/test-cases/test-2-placeholder.yaml +12 -0
  213. package/src/skills/notification-system/test-cases/test-3-placeholder.yaml +12 -0
  214. package/src/skills/python-backend/SKILL.md +226 -0
  215. package/src/skills/role-orchestrator/README.md +197 -0
  216. package/src/skills/role-orchestrator/SKILL.md +1184 -0
  217. package/src/skills/role-orchestrator/test-cases/test-1-simple-product.yaml +98 -0
  218. package/src/skills/role-orchestrator/test-cases/test-2-quality-gate-failure.yaml +73 -0
  219. package/src/skills/role-orchestrator/test-cases/test-3-security-workflow.yaml +121 -0
  220. package/src/skills/role-orchestrator/test-cases/test-4-parallel-execution.yaml +145 -0
  221. package/src/skills/role-orchestrator/test-cases/test-5-feedback-loops.yaml +149 -0
  222. package/src/skills/skill-creator/LICENSE.txt +202 -0
  223. package/src/skills/skill-creator/SKILL.md +209 -0
  224. package/src/skills/skill-creator/scripts/init_skill.py +303 -0
  225. package/src/skills/skill-creator/scripts/package_skill.py +110 -0
  226. package/src/skills/skill-creator/scripts/quick_validate.py +65 -0
  227. package/src/skills/skill-creator/test-cases/test-1-placeholder.yaml +12 -0
  228. package/src/skills/skill-creator/test-cases/test-2-placeholder.yaml +12 -0
  229. package/src/skills/skill-creator/test-cases/test-3-placeholder.yaml +12 -0
  230. package/src/skills/skill-router/SKILL.md +497 -0
  231. package/src/skills/skill-router/test-cases/test-1-basic-routing.yaml +33 -0
  232. package/src/skills/skill-router/test-cases/test-2-ambiguous-request.yaml +42 -0
  233. package/src/skills/skill-router/test-cases/test-3-nested-orchestration.yaml +50 -0
  234. package/src/skills/spec-driven-brainstorming/README.md +264 -0
  235. package/src/skills/spec-driven-brainstorming/SKILL.md +439 -0
  236. package/src/skills/spec-driven-brainstorming/test-cases/TC-001-simple-idea-to-design.yaml +148 -0
  237. package/src/skills/spec-driven-brainstorming/test-cases/TC-002-complex-ultrathink-design.yaml +190 -0
  238. package/src/skills/spec-driven-brainstorming/test-cases/TC-003-unclear-requirements-socratic.yaml +233 -0
  239. package/src/skills/spec-driven-debugging/README.md +479 -0
  240. package/src/skills/spec-driven-debugging/SKILL.md +652 -0
  241. package/src/skills/spec-driven-debugging/test-cases/TC-001-simple-auth-bug.yaml +212 -0
  242. package/src/skills/spec-driven-debugging/test-cases/TC-002-race-condition-ultrathink.yaml +461 -0
  243. package/src/skills/spec-driven-debugging/test-cases/TC-003-brownfield-missing-spec.yaml +366 -0
  244. package/src/skills/spec-kit-expert/SKILL.md +1012 -0
  245. package/src/skills/spec-kit-expert/test-cases/test-1-placeholder.yaml +12 -0
  246. package/src/skills/spec-kit-expert/test-cases/test-2-placeholder.yaml +12 -0
  247. package/src/skills/spec-kit-expert/test-cases/test-3-placeholder.yaml +12 -0
  248. package/src/skills/specweave-ado-mapper/SKILL.md +501 -0
  249. package/src/skills/specweave-detector/SKILL.md +420 -0
  250. package/src/skills/specweave-detector/test-cases/test-1-basic-detection.yaml +37 -0
  251. package/src/skills/specweave-detector/test-cases/test-2-missing-config.yaml +37 -0
  252. package/src/skills/specweave-detector/test-cases/test-3-non-specweave-project.yaml +34 -0
  253. package/src/skills/specweave-jira-mapper/SKILL.md +500 -0
  254. package/src/skills/stripe-integrator/test-cases/test-1-placeholder.yaml +12 -0
  255. package/src/skills/stripe-integrator/test-cases/test-2-placeholder.yaml +12 -0
  256. package/src/skills/stripe-integrator/test-cases/test-3-placeholder.yaml +12 -0
  257. package/src/skills/task-builder/README.md +90 -0
  258. package/src/skills/task-builder/test-cases/test-1-placeholder.yaml +12 -0
  259. package/src/skills/task-builder/test-cases/test-2-placeholder.yaml +12 -0
  260. package/src/skills/task-builder/test-cases/test-3-placeholder.yaml +12 -0
  261. package/src/templates/.env.example +144 -0
  262. package/src/templates/.gitignore.template +81 -0
  263. package/src/templates/CLAUDE.md.template +383 -0
  264. package/src/templates/README.md.template +240 -0
  265. package/src/templates/config.yaml +333 -0
  266. package/src/templates/docs/README.md +124 -0
  267. package/src/templates/docs/adr-template.md +118 -0
  268. package/src/templates/docs/hld-template.md +220 -0
  269. package/src/templates/docs/lld-template.md +580 -0
  270. package/src/templates/docs/prd-template.md +132 -0
  271. package/src/templates/docs/rfc-template.md +229 -0
  272. package/src/templates/docs/runbook-template.md +298 -0
  273. package/src/templates/environments/minimal/.env.production +16 -0
  274. package/src/templates/environments/minimal/README.md +54 -0
  275. package/src/templates/environments/minimal/deploy-production.yml +52 -0
  276. package/src/templates/environments/progressive/.env.qa +28 -0
  277. package/src/templates/environments/progressive/README.md +129 -0
  278. package/src/templates/environments/progressive/deploy-production.yml +93 -0
  279. package/src/templates/environments/progressive/deploy-qa.yml +62 -0
  280. package/src/templates/environments/progressive/deploy-staging.yml +67 -0
  281. package/src/templates/environments/standard/.env.development +20 -0
  282. package/src/templates/environments/standard/.env.production +30 -0
  283. package/src/templates/environments/standard/.env.staging +23 -0
  284. package/src/templates/environments/standard/README.md +97 -0
  285. package/src/templates/environments/standard/deploy-production.yml +68 -0
  286. package/src/templates/environments/standard/deploy-staging.yml +61 -0
  287. package/src/templates/environments/standard/docker-compose.yml +43 -0
  288. package/src/templates/increment-metadata-template.yaml +138 -0
@@ -0,0 +1,561 @@
1
+ # Infrastructure Diagnostics
2
+
3
+ **Purpose**: Troubleshoot server, network, disk, and cloud infrastructure issues.
4
+
5
+ ## Common Infrastructure Issues
6
+
7
+ ### 1. High CPU Usage (Server)
8
+
9
+ **Symptoms**:
10
+ - Server CPU at 100%
11
+ - Applications slow
12
+ - SSH lag
13
+
14
+ **Diagnosis**:
15
+
16
+ #### Check CPU Usage
17
+ ```bash
18
+ # Overall CPU usage
19
+ top -bn1 | grep "Cpu(s)"
20
+
21
+ # Top CPU processes
22
+ top -bn1 | head -20
23
+
24
+ # CPU usage per core
25
+ mpstat -P ALL 1 5
26
+
27
+ # Historical CPU (if sar installed)
28
+ sar -u 1 10
29
+ ```
30
+
31
+ **Red flags**:
32
+ - CPU at 100% for >5 minutes
33
+ - Single process using >80% CPU
34
+ - iowait >20% (disk bottleneck)
35
+ - System CPU >30% (kernel overhead)
36
+
37
+ ---
38
+
39
+ #### Identify CPU-heavy Process
40
+ ```bash
41
+ # Top CPU process
42
+ ps aux | sort -nrk 3,3 | head -10
43
+
44
+ # CPU per thread
45
+ top -H
46
+
47
+ # Process tree
48
+ pstree -p
49
+ ```
50
+
51
+ **Common causes**:
52
+ - Application bug (infinite loop)
53
+ - Heavy computation
54
+ - Crypto mining malware
55
+ - Backup/compression running
56
+
57
+ ---
58
+
59
+ #### Immediate Mitigation
60
+ ```bash
61
+ # 1. Limit process CPU (nice)
62
+ renice +10 <PID> # Lower priority
63
+
64
+ # 2. Kill process (last resort)
65
+ kill -TERM <PID> # Graceful
66
+ kill -KILL <PID> # Force kill
67
+
68
+ # 3. Scale horizontally (add servers)
69
+ # Cloud: Auto-scaling group
70
+
71
+ # 4. Scale vertically (bigger instance)
72
+ # Cloud: Resize instance
73
+ ```
74
+
75
+ ---
76
+
77
+ ### 2. Out of Memory (OOM)
78
+
79
+ **Symptoms**:
80
+ - "Out of memory" errors
81
+ - OOM Killer triggered
82
+ - Applications crash
83
+ - Swap usage high
84
+
85
+ **Diagnosis**:
86
+
87
+ #### Check Memory Usage
88
+ ```bash
89
+ # Current memory usage
90
+ free -h
91
+
92
+ # Memory per process
93
+ ps aux | sort -nrk 4,4 | head -10
94
+
95
+ # Check OOM killer logs
96
+ dmesg | grep -i "out of memory\|oom"
97
+ grep "Out of memory" /var/log/syslog
98
+
99
+ # Check swap usage
100
+ swapon -s
101
+ ```
102
+
103
+ **Red flags**:
104
+ - Available memory <10%
105
+ - Swap usage >80%
106
+ - OOM killer active
107
+ - Single process using >50% memory
108
+
109
+ ---
110
+
111
+ #### Immediate Mitigation
112
+ ```bash
113
+ # 1. Drop page cache, dentries and inodes (non-destructive, but run as root and expect a brief I/O spike)
114
+ sync && echo 3 > /proc/sys/vm/drop_caches
115
+
116
+ # 2. Kill memory-heavy process
117
+ kill -9 <PID>
118
+
119
+ # 3. Increase swap (temporary)
120
+ dd if=/dev/zero of=/swapfile bs=1M count=2048
121
+ mkswap /swapfile
122
+ swapon /swapfile
123
+
124
+ # 4. Scale up (more RAM)
125
+ # Cloud: Resize instance
126
+ ```
127
+
128
+ ---
129
+
130
+ ### 3. Disk Full
131
+
132
+ **Symptoms**:
133
+ - "No space left on device" errors
134
+ - Applications can't write files
135
+ - Database refuses writes
136
+ - Logs not being written
137
+
138
+ **Diagnosis**:
139
+
140
+ #### Check Disk Usage
141
+ ```bash
142
+ # Disk usage by partition
143
+ df -h
144
+
145
+ # Disk usage by directory
146
+ du -sh /*
147
+ du -sh /var/*
148
+
149
+ # Find large files
150
+ find / -type f -size +100M -exec ls -lh {} \;
151
+
152
+ # Find files using deleted space
153
+ lsof | grep deleted
154
+ ```
155
+
156
+ **Red flags**:
157
+ - Disk usage >90%
158
+ - /var/log full (runaway logs)
159
+ - /tmp full (temp files not cleaned)
160
+ - Deleted files still holding space (process has handle)
161
+
162
+ ---
163
+
164
+ #### Immediate Mitigation
165
+ ```bash
166
+ # 1. Clean up logs
167
+ find /var/log -name "*.log.*" -mtime +7 -delete
168
+ journalctl --vacuum-time=7d
169
+
170
+ # 2. Clean up temp files
171
+ rm -rf /tmp/*
172
+ rm -rf /var/tmp/*
173
+
174
+ # 3. Free space held by deleted-but-open files (WARNING: force-kills every process holding one; prefer restarting the owning service)
175
+ lsof | grep deleted | awk '{print $2}' | xargs kill -9
176
+
177
+ # 4. Compress logs
178
+ gzip /var/log/*.log
179
+
180
+ # 5. Expand disk (cloud)
181
+ # AWS: Modify EBS volume size
182
+ # Azure: Expand managed disk
183
+ # After expanding (grow the partition first if needed, e.g. growpart /dev/xvda 1), grow the filesystem:
184
+ resize2fs /dev/xvda1 # ext4
185
+ xfs_growfs / # xfs
186
+ ```
187
+
188
+ ---
189
+
190
+ ### 4. Network Issues
191
+
192
+ **Symptoms**:
193
+ - Slow network performance
194
+ - Timeouts
195
+ - Connection refused
196
+ - High latency
197
+
198
+ **Diagnosis**:
199
+
200
+ #### Check Network Connectivity
201
+ ```bash
202
+ # Ping test
203
+ ping -c 5 google.com
204
+
205
+ # DNS resolution
206
+ nslookup example.com
207
+ dig example.com
208
+
209
+ # Traceroute
210
+ traceroute example.com
211
+
212
+ # Check network interfaces
213
+ ip addr show
214
+ ifconfig
215
+
216
+ # Check routing table
217
+ ip route show
218
+ route -n
219
+ ```
220
+
221
+ **Red flags**:
222
+ - Packet loss >1%
223
+ - Latency >100ms (same region)
224
+ - DNS resolution failures
225
+ - Interface down
226
+
227
+ ---
228
+
229
+ #### Check Network Bandwidth
230
+ ```bash
231
+ # Current bandwidth usage
232
+ iftop -i eth0
233
+
234
+ # Network stats
235
+ netstat -i
236
+
237
+ # Historical bandwidth (if vnstat installed)
238
+ vnstat -l
239
+
240
+ # Check for bandwidth limits (cloud)
241
+ # AWS: Check CloudWatch NetworkIn/NetworkOut
242
+ ```
243
+
244
+ ---
245
+
246
+ #### Check Firewall Rules
247
+ ```bash
248
+ # Check iptables rules
249
+ iptables -L -n -v
250
+
251
+ # Check firewalld (RHEL/CentOS)
252
+ firewall-cmd --list-all
253
+
254
+ # Check UFW (Ubuntu)
255
+ ufw status verbose
256
+
257
+ # Check security groups (cloud)
258
+ # AWS: EC2 → Security Groups
259
+ # Azure: Network Security Groups
260
+ ```
261
+
262
+ **Common causes**:
263
+ - Firewall blocking traffic
264
+ - Security group misconfigured
265
+ - MTU mismatch
266
+ - Network congestion
267
+ - DDoS attack
268
+
269
+ ---
270
+
271
+ #### Immediate Mitigation
272
+ ```bash
273
+ # 1. Allow HTTP/HTTPS traffic through the firewall (appends ACCEPT rules; verify order with iptables -L -n -v)
274
+ iptables -A INPUT -p tcp --dport 80 -j ACCEPT
275
+ iptables -A INPUT -p tcp --dport 443 -j ACCEPT
276
+
277
+ # 2. Restart networking
278
+ systemctl restart networking
279
+ systemctl restart NetworkManager
280
+
281
+ # 3. Flush DNS cache
282
+ resolvectl flush-caches # older systemd: systemd-resolve --flush-caches
283
+
284
+ # 4. Check cloud network ACLs
285
+ # Ensure subnet has route to internet gateway
286
+ ```
287
+
288
+ ---
289
+
290
+ ### 5. High Disk I/O (Slow Disk)
291
+
292
+ **Symptoms**:
293
+ - Applications slow
294
+ - High iowait CPU
295
+ - Disk latency high
296
+
297
+ **Diagnosis**:
298
+
299
+ #### Check Disk I/O
300
+ ```bash
301
+ # Disk I/O stats
302
+ iostat -x 1 5
303
+
304
+ # Look for:
305
+ # - %util >80% (disk saturated)
306
+ # - await >100ms (high latency)
307
+
308
+ # Top I/O processes
309
+ iotop -o
310
+
311
+ # Historical I/O (if sar installed)
312
+ sar -d 1 10
313
+ ```
314
+
315
+ **Red flags**:
316
+ - %util at 100%
317
+ - await >100ms
318
+ - iowait CPU >20%
319
+ - Queue size (avgqu-sz) >10
320
+
321
+ ---
322
+
323
+ #### Common Causes
324
+ ```bash
325
+ # 1. Database without indexes (Seq Scan)
326
+ # See database-diagnostics.md
327
+
328
+ # 2. Log rotation running
329
+ # Large logs being compressed
330
+
331
+ # 3. Backup running
332
+ # Database dump, file backup
333
+
334
+ # 4. Disk issue (bad sectors)
335
+ dmesg | grep -i "I/O error"
336
+ smartctl -a /dev/sda # SMART status
337
+ ```
338
+
339
+ ---
340
+
341
+ #### Immediate Mitigation
342
+ ```bash
343
+ # 1. Reduce I/O pressure
344
+ # Stop non-critical processes (backup, log rotation)
345
+
346
+ # 2. Add read cache
347
+ # Enable query caching (database)
348
+ # Add Redis for application cache
349
+
350
+ # 3. Scale disk IOPS (cloud)
351
+ # AWS: Change EBS volume type (gp2 → gp3 → io1)
352
+ # Azure: Change disk tier
353
+
354
+ # 4. Move to SSD (if on HDD)
355
+ ```
356
+
357
+ ---
358
+
359
+ ### 6. Service Down / Process Crashed
360
+
361
+ **Symptoms**:
362
+ - Service not responding
363
+ - Health check failures
364
+ - 502 Bad Gateway
365
+
366
+ **Diagnosis**:
367
+
368
+ #### Check Service Status
369
+ ```bash
370
+ # Systemd services
371
+ systemctl status nginx
372
+ systemctl status postgresql
373
+ systemctl status application
374
+
375
+ # Check if process running
376
+ ps aux | grep nginx
377
+ pidof nginx
378
+
379
+ # Check service logs
380
+ journalctl -u nginx -n 50
381
+ tail -f /var/log/nginx/error.log
382
+ ```
383
+
384
+ **Red flags**:
385
+ - Service: inactive (dead)
386
+ - Process not found
387
+ - Recent crash in logs
388
+
389
+ ---
390
+
391
+ #### Check Why Service Crashed
392
+ ```bash
393
+ # Check system logs
394
+ dmesg | tail -50
395
+ grep "error\|segfault\|killed" /var/log/syslog
396
+
397
+ # Check application logs
398
+ tail -100 /var/log/application.log
399
+
400
+ # Check for OOM killer
401
+ dmesg | grep -i "killed process"
402
+
403
+ # Check core dumps
404
+ ls -l /var/crash/
405
+ ls -l /tmp/core*
406
+ ```
407
+
408
+ **Common causes**:
409
+ - Out of memory (OOM Killer)
410
+ - Segmentation fault (code bug)
411
+ - Unhandled exception
412
+ - Dependency service down
413
+ - Configuration error
414
+
415
+ ---
416
+
417
+ #### Immediate Mitigation
418
+ ```bash
419
+ # 1. Restart service
420
+ systemctl restart nginx
421
+
422
+ # 2. Check if started successfully
423
+ systemctl status nginx
424
+ curl http://localhost
425
+
426
+ # 3. If startup fails, check config
427
+ nginx -t # Test nginx config
428
+ postgres -D /var/lib/postgresql/data -C config_file # Run as the postgres user; parses postgresql.conf and errors out if it is invalid
429
+
430
+ # 4. Enable auto-restart (systemd)
431
+ # Add to service file:
432
+ [Service]
433
+ Restart=always
434
+ RestartSec=10
435
+ ```
436
+
437
+ ---
438
+
439
+ ### 7. Cloud Infrastructure Issues
440
+
441
+ #### AWS-Specific
442
+
443
+ **Instance Issues**:
444
+ ```bash
445
+ # Check instance health
446
+ aws ec2 describe-instance-status --instance-ids i-1234567890abcdef0
447
+
448
+ # Check system logs
449
+ aws ec2 get-console-output --instance-id i-1234567890abcdef0
450
+
451
+ # Check CloudWatch metrics
452
+ aws cloudwatch get-metric-statistics \
453
+ --namespace AWS/EC2 \
454
+ --metric-name CPUUtilization \
455
+ --dimensions Name=InstanceId,Value=i-1234567890abcdef0 --start-time <START_ISO8601> --end-time <END_ISO8601> --period 300 --statistics Average
456
+ ```
457
+
458
+ **EBS Volume Issues**:
459
+ ```bash
460
+ # Check volume status
461
+ aws ec2 describe-volumes --volume-ids vol-1234567890abcdef0
462
+
463
+ # Increase IOPS (gp3)
464
+ aws ec2 modify-volume \
465
+ --volume-id vol-1234567890abcdef0 \
466
+ --iops 3000
467
+
468
+ # Check volume metrics
469
+ aws cloudwatch get-metric-statistics \
470
+ --namespace AWS/EBS \
471
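+ --metric-name VolumeReadOps --dimensions Name=VolumeId,Value=vol-1234567890abcdef0 --start-time <START_ISO8601> --end-time <END_ISO8601> --period 300 --statistics Sum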
472
+ ```
473
+
474
+ **Network Issues**:
475
+ ```bash
476
+ # Check security groups
477
+ aws ec2 describe-security-groups --group-ids sg-1234567890abcdef0
478
+
479
+ # Check network ACLs
480
+ aws ec2 describe-network-acls --network-acl-ids acl-1234567890abcdef0
481
+
482
+ # Check route tables
483
+ aws ec2 describe-route-tables --route-table-ids rtb-1234567890abcdef0
484
+ ```
485
+
486
+ ---
487
+
488
+ #### Azure-Specific
489
+
490
+ **VM Issues**:
491
+ ```bash
492
+ # Check VM status
493
+ az vm get-instance-view --name myVM --resource-group myRG
494
+
495
+ # Restart VM
496
+ az vm restart --name myVM --resource-group myRG
497
+
498
+ # Resize VM
499
+ az vm resize --name myVM --resource-group myRG --size Standard_D4s_v3
500
+ ```
501
+
502
+ **Disk Issues**:
503
+ ```bash
504
+ # Check disk status
505
+ az disk show --name myDisk --resource-group myRG
506
+
507
+ # Expand disk
508
+ az disk update --name myDisk --resource-group myRG --size-gb 256
509
+ ```
510
+
511
+ ---
512
+
513
+ ## Infrastructure Performance Metrics
514
+
515
+ **Server Health**:
516
+ - CPU: <70% average, <90% peak
517
+ - Memory: <80% usage
518
+ - Disk: <80% usage, <80% IOPS
519
+ - Network: <70% bandwidth
520
+
521
+ **Uptime**:
522
+ - Target: 99.9% (8.76 hours downtime/year)
523
+ - Monitoring: Check every 1 minute
524
+
525
+ **Response Time**:
526
+ - Ping latency: <50ms (same region)
527
+ - HTTP response: <200ms
528
+
529
+ ---
530
+
531
+ ## Infrastructure Diagnostic Checklist
532
+
533
+ **When diagnosing infrastructure issues**:
534
+
535
+ - [ ] Check CPU usage (target: <70%)
536
+ - [ ] Check memory usage (target: <80%)
537
+ - [ ] Check disk usage (target: <80%)
538
+ - [ ] Check disk I/O (%util, await)
539
+ - [ ] Check network connectivity (ping, traceroute)
540
+ - [ ] Check firewall rules (iptables, security groups)
541
+ - [ ] Check service status (systemd, ps)
542
+ - [ ] Check system logs (dmesg, /var/log/syslog)
543
+ - [ ] Check cloud metrics (CloudWatch, Azure Monitor)
544
+ - [ ] Check for hardware issues (SMART, dmesg errors)
545
+
546
+ **Tools**:
547
+ - `top`, `htop` - CPU, memory
548
+ - `df`, `du` - Disk usage
549
+ - `iostat` - Disk I/O
550
+ - `iftop`, `netstat` - Network
551
+ - `dmesg`, `journalctl` - System logs
552
+ - Cloud dashboards (AWS, Azure, GCP)
553
+
554
+ ---
555
+
556
+ ## Related Documentation
557
+
558
+ - [AGENT.md](../AGENT.md) - Main SRE agent
559
+ - [backend-diagnostics.md](backend-diagnostics.md) - Application-level troubleshooting
560
+ - [database-diagnostics.md](database-diagnostics.md) - Database performance
561
+ - [security-incidents.md](security-incidents.md) - Security response