specweave 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/INSTALL.md +848 -0
  2. package/LICENSE +21 -0
  3. package/README.md +675 -0
  4. package/SPECWEAVE.md +665 -0
  5. package/bin/install-agents.sh +57 -0
  6. package/bin/install-all.sh +49 -0
  7. package/bin/install-commands.sh +56 -0
  8. package/bin/install-skills.sh +57 -0
  9. package/bin/specweave.js +81 -0
  10. package/dist/adapters/adapter-base.d.ts +50 -0
  11. package/dist/adapters/adapter-base.d.ts.map +1 -0
  12. package/dist/adapters/adapter-base.js +146 -0
  13. package/dist/adapters/adapter-base.js.map +1 -0
  14. package/dist/adapters/adapter-interface.d.ts +108 -0
  15. package/dist/adapters/adapter-interface.d.ts.map +1 -0
  16. package/dist/adapters/adapter-interface.js +9 -0
  17. package/dist/adapters/adapter-interface.js.map +1 -0
  18. package/dist/adapters/claude/adapter.d.ts +54 -0
  19. package/dist/adapters/claude/adapter.d.ts.map +1 -0
  20. package/dist/adapters/claude/adapter.js +184 -0
  21. package/dist/adapters/claude/adapter.js.map +1 -0
  22. package/dist/adapters/copilot/adapter.d.ts +42 -0
  23. package/dist/adapters/copilot/adapter.d.ts.map +1 -0
  24. package/dist/adapters/copilot/adapter.js +239 -0
  25. package/dist/adapters/copilot/adapter.js.map +1 -0
  26. package/dist/adapters/cursor/adapter.d.ts +42 -0
  27. package/dist/adapters/cursor/adapter.d.ts.map +1 -0
  28. package/dist/adapters/cursor/adapter.js +297 -0
  29. package/dist/adapters/cursor/adapter.js.map +1 -0
  30. package/dist/adapters/generic/adapter.d.ts +40 -0
  31. package/dist/adapters/generic/adapter.d.ts.map +1 -0
  32. package/dist/adapters/generic/adapter.js +155 -0
  33. package/dist/adapters/generic/adapter.js.map +1 -0
  34. package/dist/cli/commands/init.d.ts +6 -0
  35. package/dist/cli/commands/init.d.ts.map +1 -0
  36. package/dist/cli/commands/init.js +247 -0
  37. package/dist/cli/commands/init.js.map +1 -0
  38. package/dist/cli/commands/install.d.ts +7 -0
  39. package/dist/cli/commands/install.d.ts.map +1 -0
  40. package/dist/cli/commands/install.js +160 -0
  41. package/dist/cli/commands/install.js.map +1 -0
  42. package/dist/cli/commands/list.d.ts +6 -0
  43. package/dist/cli/commands/list.d.ts.map +1 -0
  44. package/dist/cli/commands/list.js +154 -0
  45. package/dist/cli/commands/list.js.map +1 -0
  46. package/package.json +90 -0
  47. package/src/adapters/README.md +312 -0
  48. package/src/adapters/adapter-base.ts +146 -0
  49. package/src/adapters/adapter-interface.ts +120 -0
  50. package/src/adapters/claude/README.md +241 -0
  51. package/src/adapters/claude/adapter.ts +157 -0
  52. package/src/adapters/copilot/.github/copilot/instructions.md +376 -0
  53. package/src/adapters/copilot/README.md +200 -0
  54. package/src/adapters/copilot/adapter.ts +210 -0
  55. package/src/adapters/cursor/.cursor/context/docs-context.md +62 -0
  56. package/src/adapters/cursor/.cursor/context/increments-context.md +71 -0
  57. package/src/adapters/cursor/.cursor/context/strategy-context.md +73 -0
  58. package/src/adapters/cursor/.cursor/context/tests-context.md +89 -0
  59. package/src/adapters/cursor/.cursorrules +325 -0
  60. package/src/adapters/cursor/README.md +243 -0
  61. package/src/adapters/cursor/adapter.ts +268 -0
  62. package/src/adapters/generic/README.md +277 -0
  63. package/src/adapters/generic/SPECWEAVE-MANUAL.md +676 -0
  64. package/src/adapters/generic/adapter.ts +159 -0
  65. package/src/adapters/registry.yaml +126 -0
  66. package/src/agents/architect/AGENT.md +416 -0
  67. package/src/agents/devops/AGENT.md +1738 -0
  68. package/src/agents/docs-writer/AGENT.md +239 -0
  69. package/src/agents/performance/AGENT.md +228 -0
  70. package/src/agents/pm/AGENT.md +751 -0
  71. package/src/agents/qa-lead/AGENT.md +150 -0
  72. package/src/agents/security/AGENT.md +179 -0
  73. package/src/agents/sre/AGENT.md +582 -0
  74. package/src/agents/sre/modules/backend-diagnostics.md +481 -0
  75. package/src/agents/sre/modules/database-diagnostics.md +509 -0
  76. package/src/agents/sre/modules/infrastructure.md +561 -0
  77. package/src/agents/sre/modules/monitoring.md +439 -0
  78. package/src/agents/sre/modules/security-incidents.md +421 -0
  79. package/src/agents/sre/modules/ui-diagnostics.md +302 -0
  80. package/src/agents/sre/playbooks/01-high-cpu-usage.md +204 -0
  81. package/src/agents/sre/playbooks/02-database-deadlock.md +241 -0
  82. package/src/agents/sre/playbooks/03-memory-leak.md +252 -0
  83. package/src/agents/sre/playbooks/04-slow-api-response.md +269 -0
  84. package/src/agents/sre/playbooks/05-ddos-attack.md +293 -0
  85. package/src/agents/sre/playbooks/06-disk-full.md +314 -0
  86. package/src/agents/sre/playbooks/07-service-down.md +333 -0
  87. package/src/agents/sre/playbooks/08-data-corruption.md +337 -0
  88. package/src/agents/sre/playbooks/09-cascade-failure.md +430 -0
  89. package/src/agents/sre/playbooks/10-rate-limit-exceeded.md +464 -0
  90. package/src/agents/sre/scripts/health-check.sh +230 -0
  91. package/src/agents/sre/scripts/log-analyzer.py +213 -0
  92. package/src/agents/sre/scripts/metrics-collector.sh +294 -0
  93. package/src/agents/sre/scripts/trace-analyzer.js +257 -0
  94. package/src/agents/sre/templates/incident-report.md +249 -0
  95. package/src/agents/sre/templates/mitigation-plan.md +375 -0
  96. package/src/agents/sre/templates/post-mortem.md +418 -0
  97. package/src/agents/sre/templates/runbook-template.md +412 -0
  98. package/src/agents/tech-lead/AGENT.md +263 -0
  99. package/src/commands/add-tasks.md +176 -0
  100. package/src/commands/close-increment.md +347 -0
  101. package/src/commands/create-increment.md +223 -0
  102. package/src/commands/create-project.md +528 -0
  103. package/src/commands/generate-docs.md +623 -0
  104. package/src/commands/list-increments.md +180 -0
  105. package/src/commands/review-docs.md +331 -0
  106. package/src/commands/start-increment.md +139 -0
  107. package/src/commands/sync-github.md +115 -0
  108. package/src/commands/validate-increment.md +800 -0
  109. package/src/hooks/README.md +252 -0
  110. package/src/hooks/docs-changed.sh +59 -0
  111. package/src/hooks/human-input-required.sh +55 -0
  112. package/src/hooks/post-task-completion.sh +57 -0
  113. package/src/hooks/pre-implementation.sh +47 -0
  114. package/src/skills/ado-sync/README.md +449 -0
  115. package/src/skills/ado-sync/SKILL.md +245 -0
  116. package/src/skills/ado-sync/test-cases/test-1.yaml +9 -0
  117. package/src/skills/ado-sync/test-cases/test-2.yaml +8 -0
  118. package/src/skills/ado-sync/test-cases/test-3.yaml +9 -0
  119. package/src/skills/bmad-method-expert/SKILL.md +628 -0
  120. package/src/skills/bmad-method-expert/scripts/analyze-project.js +318 -0
  121. package/src/skills/bmad-method-expert/scripts/check-setup.js +208 -0
  122. package/src/skills/bmad-method-expert/scripts/generate-template.js +1149 -0
  123. package/src/skills/bmad-method-expert/scripts/validate-documents.js +340 -0
  124. package/src/skills/bmad-method-expert/test-cases/test-1-placeholder.yaml +12 -0
  125. package/src/skills/bmad-method-expert/test-cases/test-2-placeholder.yaml +12 -0
  126. package/src/skills/bmad-method-expert/test-cases/test-3-placeholder.yaml +12 -0
  127. package/src/skills/brownfield-analyzer/SKILL.md +523 -0
  128. package/src/skills/brownfield-analyzer/test-cases/test-1-basic-analysis.yaml +48 -0
  129. package/src/skills/brownfield-analyzer/test-cases/test-2-placeholder.yaml +12 -0
  130. package/src/skills/brownfield-analyzer/test-cases/test-3-placeholder.yaml +12 -0
  131. package/src/skills/brownfield-onboarder/SKILL.md +625 -0
  132. package/src/skills/brownfield-onboarder/test-cases/test-1-placeholder.yaml +12 -0
  133. package/src/skills/brownfield-onboarder/test-cases/test-2-placeholder.yaml +12 -0
  134. package/src/skills/brownfield-onboarder/test-cases/test-3-placeholder.yaml +12 -0
  135. package/src/skills/calendar-system/test-cases/test-1-placeholder.yaml +12 -0
  136. package/src/skills/calendar-system/test-cases/test-2-placeholder.yaml +12 -0
  137. package/src/skills/calendar-system/test-cases/test-3-placeholder.yaml +12 -0
  138. package/src/skills/context-loader/SKILL.md +734 -0
  139. package/src/skills/context-loader/test-cases/test-1-basic-loading.yaml +39 -0
  140. package/src/skills/context-loader/test-cases/test-2-token-budget-exceeded.yaml +44 -0
  141. package/src/skills/context-loader/test-cases/test-3-section-anchors.yaml +45 -0
  142. package/src/skills/context-optimizer/SKILL.md +618 -0
  143. package/src/skills/context-optimizer/test-cases/test-1-bug-fix-narrow.yaml +97 -0
  144. package/src/skills/context-optimizer/test-cases/test-2-feature-focused.yaml +109 -0
  145. package/src/skills/context-optimizer/test-cases/test-3-architecture-broad.yaml +98 -0
  146. package/src/skills/cost-optimizer/SKILL.md +190 -0
  147. package/src/skills/cost-optimizer/test-cases/test-1-basic-comparison.yaml +75 -0
  148. package/src/skills/cost-optimizer/test-cases/test-2-budget-constraint.yaml +52 -0
  149. package/src/skills/cost-optimizer/test-cases/test-3-scale-requirement.yaml +63 -0
  150. package/src/skills/cost-optimizer/test-results/README.md +46 -0
  151. package/src/skills/design-system-architect/SKILL.md +107 -0
  152. package/src/skills/design-system-architect/test-cases/test-1-token-structure.yaml +23 -0
  153. package/src/skills/design-system-architect/test-cases/test-2-component-hierarchy.yaml +24 -0
  154. package/src/skills/design-system-architect/test-cases/test-3-accessibility-checklist.yaml +23 -0
  155. package/src/skills/diagrams-architect/SKILL.md +763 -0
  156. package/src/skills/diagrams-generator/SKILL.md +25 -0
  157. package/src/skills/diagrams-generator/test-cases/test-1.yaml +9 -0
  158. package/src/skills/diagrams-generator/test-cases/test-2.yaml +9 -0
  159. package/src/skills/diagrams-generator/test-cases/test-3.yaml +8 -0
  160. package/src/skills/docs-updater/README.md +48 -0
  161. package/src/skills/docs-updater/test-cases/test-1-placeholder.yaml +12 -0
  162. package/src/skills/docs-updater/test-cases/test-2-placeholder.yaml +12 -0
  163. package/src/skills/docs-updater/test-cases/test-3-placeholder.yaml +12 -0
  164. package/src/skills/dotnet-backend/SKILL.md +250 -0
  165. package/src/skills/e2e-playwright/README.md +506 -0
  166. package/src/skills/e2e-playwright/SKILL.md +457 -0
  167. package/src/skills/e2e-playwright/execute.js +373 -0
  168. package/src/skills/e2e-playwright/lib/utils.js +514 -0
  169. package/src/skills/e2e-playwright/package.json +33 -0
  170. package/src/skills/e2e-playwright/test-cases/TC-001-basic-navigation.yaml +54 -0
  171. package/src/skills/e2e-playwright/test-cases/TC-002-form-interaction.yaml +64 -0
  172. package/src/skills/e2e-playwright/test-cases/TC-003-specweave-integration.yaml +74 -0
  173. package/src/skills/e2e-playwright/test-cases/TC-004-accessibility-check.yaml +98 -0
  174. package/src/skills/figma-designer/SKILL.md +149 -0
  175. package/src/skills/figma-implementer/SKILL.md +148 -0
  176. package/src/skills/figma-mcp-connector/SKILL.md +136 -0
  177. package/src/skills/figma-mcp-connector/test-cases/test-1-read-file-desktop.yaml +22 -0
  178. package/src/skills/figma-mcp-connector/test-cases/test-2-read-file-framelink.yaml +21 -0
  179. package/src/skills/figma-mcp-connector/test-cases/test-3-error-handling.yaml +18 -0
  180. package/src/skills/figma-to-code/SKILL.md +128 -0
  181. package/src/skills/figma-to-code/test-cases/test-1-token-generation.yaml +29 -0
  182. package/src/skills/figma-to-code/test-cases/test-2-component-generation.yaml +27 -0
  183. package/src/skills/figma-to-code/test-cases/test-3-typescript-generation.yaml +28 -0
  184. package/src/skills/frontend/SKILL.md +177 -0
  185. package/src/skills/github-sync/SKILL.md +252 -0
  186. package/src/skills/github-sync/test-cases/test-1-placeholder.yaml +12 -0
  187. package/src/skills/github-sync/test-cases/test-2-placeholder.yaml +12 -0
  188. package/src/skills/github-sync/test-cases/test-3-placeholder.yaml +12 -0
  189. package/src/skills/hetzner-provisioner/README.md +308 -0
  190. package/src/skills/hetzner-provisioner/SKILL.md +251 -0
  191. package/src/skills/hetzner-provisioner/test-cases/test-1-basic-provision.yaml +71 -0
  192. package/src/skills/hetzner-provisioner/test-cases/test-2-postgres-provision.yaml +85 -0
  193. package/src/skills/hetzner-provisioner/test-cases/test-3-ssl-config.yaml +126 -0
  194. package/src/skills/hetzner-provisioner/test-results/README.md +259 -0
  195. package/src/skills/increment-planner/SKILL.md +889 -0
  196. package/src/skills/increment-planner/scripts/feature-utils.js +250 -0
  197. package/src/skills/increment-planner/test-cases/test-1-basic-feature.yaml +27 -0
  198. package/src/skills/increment-planner/test-cases/test-2-complex-feature.yaml +30 -0
  199. package/src/skills/increment-planner/test-cases/test-3-auto-numbering.yaml +24 -0
  200. package/src/skills/increment-quality-judge/SKILL.md +566 -0
  201. package/src/skills/increment-quality-judge/test-cases/test-1-good-spec.yaml +95 -0
  202. package/src/skills/increment-quality-judge/test-cases/test-2-poor-spec.yaml +108 -0
  203. package/src/skills/increment-quality-judge/test-cases/test-3-export-suggestions.yaml +87 -0
  204. package/src/skills/jira-sync/README.md +328 -0
  205. package/src/skills/jira-sync/SKILL.md +209 -0
  206. package/src/skills/jira-sync/test-cases/test-1.yaml +9 -0
  207. package/src/skills/jira-sync/test-cases/test-2.yaml +9 -0
  208. package/src/skills/jira-sync/test-cases/test-3.yaml +10 -0
  209. package/src/skills/nextjs/SKILL.md +176 -0
  210. package/src/skills/nodejs-backend/SKILL.md +181 -0
  211. package/src/skills/notification-system/test-cases/test-1-placeholder.yaml +12 -0
  212. package/src/skills/notification-system/test-cases/test-2-placeholder.yaml +12 -0
  213. package/src/skills/notification-system/test-cases/test-3-placeholder.yaml +12 -0
  214. package/src/skills/python-backend/SKILL.md +226 -0
  215. package/src/skills/role-orchestrator/README.md +197 -0
  216. package/src/skills/role-orchestrator/SKILL.md +1184 -0
  217. package/src/skills/role-orchestrator/test-cases/test-1-simple-product.yaml +98 -0
  218. package/src/skills/role-orchestrator/test-cases/test-2-quality-gate-failure.yaml +73 -0
  219. package/src/skills/role-orchestrator/test-cases/test-3-security-workflow.yaml +121 -0
  220. package/src/skills/role-orchestrator/test-cases/test-4-parallel-execution.yaml +145 -0
  221. package/src/skills/role-orchestrator/test-cases/test-5-feedback-loops.yaml +149 -0
  222. package/src/skills/skill-creator/LICENSE.txt +202 -0
  223. package/src/skills/skill-creator/SKILL.md +209 -0
  224. package/src/skills/skill-creator/scripts/init_skill.py +303 -0
  225. package/src/skills/skill-creator/scripts/package_skill.py +110 -0
  226. package/src/skills/skill-creator/scripts/quick_validate.py +65 -0
  227. package/src/skills/skill-creator/test-cases/test-1-placeholder.yaml +12 -0
  228. package/src/skills/skill-creator/test-cases/test-2-placeholder.yaml +12 -0
  229. package/src/skills/skill-creator/test-cases/test-3-placeholder.yaml +12 -0
  230. package/src/skills/skill-router/SKILL.md +497 -0
  231. package/src/skills/skill-router/test-cases/test-1-basic-routing.yaml +33 -0
  232. package/src/skills/skill-router/test-cases/test-2-ambiguous-request.yaml +42 -0
  233. package/src/skills/skill-router/test-cases/test-3-nested-orchestration.yaml +50 -0
  234. package/src/skills/spec-driven-brainstorming/README.md +264 -0
  235. package/src/skills/spec-driven-brainstorming/SKILL.md +439 -0
  236. package/src/skills/spec-driven-brainstorming/test-cases/TC-001-simple-idea-to-design.yaml +148 -0
  237. package/src/skills/spec-driven-brainstorming/test-cases/TC-002-complex-ultrathink-design.yaml +190 -0
  238. package/src/skills/spec-driven-brainstorming/test-cases/TC-003-unclear-requirements-socratic.yaml +233 -0
  239. package/src/skills/spec-driven-debugging/README.md +479 -0
  240. package/src/skills/spec-driven-debugging/SKILL.md +652 -0
  241. package/src/skills/spec-driven-debugging/test-cases/TC-001-simple-auth-bug.yaml +212 -0
  242. package/src/skills/spec-driven-debugging/test-cases/TC-002-race-condition-ultrathink.yaml +461 -0
  243. package/src/skills/spec-driven-debugging/test-cases/TC-003-brownfield-missing-spec.yaml +366 -0
  244. package/src/skills/spec-kit-expert/SKILL.md +1012 -0
  245. package/src/skills/spec-kit-expert/test-cases/test-1-placeholder.yaml +12 -0
  246. package/src/skills/spec-kit-expert/test-cases/test-2-placeholder.yaml +12 -0
  247. package/src/skills/spec-kit-expert/test-cases/test-3-placeholder.yaml +12 -0
  248. package/src/skills/specweave-ado-mapper/SKILL.md +501 -0
  249. package/src/skills/specweave-detector/SKILL.md +420 -0
  250. package/src/skills/specweave-detector/test-cases/test-1-basic-detection.yaml +37 -0
  251. package/src/skills/specweave-detector/test-cases/test-2-missing-config.yaml +37 -0
  252. package/src/skills/specweave-detector/test-cases/test-3-non-specweave-project.yaml +34 -0
  253. package/src/skills/specweave-jira-mapper/SKILL.md +500 -0
  254. package/src/skills/stripe-integrator/test-cases/test-1-placeholder.yaml +12 -0
  255. package/src/skills/stripe-integrator/test-cases/test-2-placeholder.yaml +12 -0
  256. package/src/skills/stripe-integrator/test-cases/test-3-placeholder.yaml +12 -0
  257. package/src/skills/task-builder/README.md +90 -0
  258. package/src/skills/task-builder/test-cases/test-1-placeholder.yaml +12 -0
  259. package/src/skills/task-builder/test-cases/test-2-placeholder.yaml +12 -0
  260. package/src/skills/task-builder/test-cases/test-3-placeholder.yaml +12 -0
  261. package/src/templates/.env.example +144 -0
  262. package/src/templates/.gitignore.template +81 -0
  263. package/src/templates/CLAUDE.md.template +383 -0
  264. package/src/templates/README.md.template +240 -0
  265. package/src/templates/config.yaml +333 -0
  266. package/src/templates/docs/README.md +124 -0
  267. package/src/templates/docs/adr-template.md +118 -0
  268. package/src/templates/docs/hld-template.md +220 -0
  269. package/src/templates/docs/lld-template.md +580 -0
  270. package/src/templates/docs/prd-template.md +132 -0
  271. package/src/templates/docs/rfc-template.md +229 -0
  272. package/src/templates/docs/runbook-template.md +298 -0
  273. package/src/templates/environments/minimal/.env.production +16 -0
  274. package/src/templates/environments/minimal/README.md +54 -0
  275. package/src/templates/environments/minimal/deploy-production.yml +52 -0
  276. package/src/templates/environments/progressive/.env.qa +28 -0
  277. package/src/templates/environments/progressive/README.md +129 -0
  278. package/src/templates/environments/progressive/deploy-production.yml +93 -0
  279. package/src/templates/environments/progressive/deploy-qa.yml +62 -0
  280. package/src/templates/environments/progressive/deploy-staging.yml +67 -0
  281. package/src/templates/environments/standard/.env.development +20 -0
  282. package/src/templates/environments/standard/.env.production +30 -0
  283. package/src/templates/environments/standard/.env.staging +23 -0
  284. package/src/templates/environments/standard/README.md +97 -0
  285. package/src/templates/environments/standard/deploy-production.yml +68 -0
  286. package/src/templates/environments/standard/deploy-staging.yml +61 -0
  287. package/src/templates/environments/standard/docker-compose.yml +43 -0
  288. package/src/templates/increment-metadata-template.yaml +138 -0
@@ -0,0 +1,439 @@
1
+ # Monitoring & Observability
2
+
3
+ **Purpose**: Set up monitoring, alerting, and observability to detect incidents early.
4
+
5
+ ## Observability Pillars
6
+
7
+ ### 1. Metrics
8
+
9
+ **What to Monitor**:
10
+ - **Application**: Response time, error rate, throughput
11
+ - **Infrastructure**: CPU, memory, disk, network
12
+ - **Database**: Query time, connections, deadlocks
13
+ - **Business**: User signups, revenue, conversions
14
+
15
+ **Tools**:
16
+ - Prometheus + Grafana
17
+ - DataDog
18
+ - New Relic
19
+ - CloudWatch (AWS)
20
+ - Azure Monitor
21
+
22
+ ---
23
+
24
+ #### Key Metrics by Layer
25
+
26
+ **Application Metrics**:
27
+ ```
28
+ http_requests_total # Total requests
29
+ http_request_duration_seconds # Response time (histogram)
30
+ http_requests_errors_total # Error count
31
+ http_requests_in_flight # Concurrent requests
32
+ ```
33
+
34
+ **Infrastructure Metrics**:
35
+ ```
36
+ node_cpu_seconds_total # CPU usage
37
+ node_memory_usage_bytes # Memory usage
38
+ node_disk_usage_bytes # Disk usage
39
+ node_network_receive_bytes_total # Network in
40
+ ```
41
+
42
+ **Database Metrics**:
43
+ ```
44
+ pg_stat_database_tup_returned # Rows returned
45
+ pg_stat_database_tup_fetched # Rows fetched
46
+ pg_stat_database_deadlocks # Deadlock count
47
+ pg_stat_activity_connections # Active connections
48
+ ```
49
+
50
+ ---
51
+
52
+ ### 2. Logs
53
+
54
+ **What to Log**:
55
+ - **Application logs**: Errors, warnings, info
56
+ - **Access logs**: HTTP requests (nginx, apache)
57
+ - **System logs**: Kernel, systemd, auth
58
+ - **Audit logs**: Security events, data access
59
+
60
+ **Log Levels**:
61
+ - **ERROR**: Application errors, exceptions
62
+ - **WARN**: Potential issues (deprecated API, high latency)
63
+ - **INFO**: Normal operations (user login, job completed)
64
+ - **DEBUG**: Detailed troubleshooting (only in dev)
65
+
66
+ **Tools**:
67
+ - ELK Stack (Elasticsearch, Logstash, Kibana)
68
+ - Splunk
69
+ - CloudWatch Logs
70
+ - Azure Log Analytics
71
+
72
+ ---
73
+
74
+ #### Structured Logging
75
+
76
+ **BAD** (unstructured):
77
+ ```javascript
78
+ console.log("User logged in: " + userId);
79
+ ```
80
+
81
+ **GOOD** (structured JSON):
82
+ ```javascript
83
+ logger.info("User logged in", {
84
+ userId: 123,
85
+ ip: "192.168.1.1",
86
+ timestamp: "2025-10-26T12:00:00Z",
87
+ userAgent: "Mozilla/5.0...",
88
+ });
89
+
90
+ // Output:
91
+ // {"level":"info","message":"User logged in","userId":123,"ip":"192.168.1.1",...}
92
+ ```
93
+
94
+ **Benefits**:
95
+ - Queryable (filter by userId)
96
+ - Machine-readable
97
+ - Consistent format
98
+
99
+ ---
100
+
101
+ ### 3. Traces
102
+
103
+ **Purpose**: Track request flow through distributed systems
104
+
105
+ **Example**:
106
+ ```
107
+ User Request → API Gateway → Auth Service → Payment Service → Database
108
+     1ms            2ms           50ms            100ms          30ms
109
+                                                    ↑ SLOW SPAN
110
+ ```
111
+
112
+ **Tools**:
113
+ - Jaeger
114
+ - Zipkin
115
+ - AWS X-Ray
116
+ - DataDog APM
117
+ - New Relic
118
+
119
+ **When to Use**:
120
+ - Microservices architecture
121
+ - Slow requests (which service is slow?)
122
+ - Debugging distributed systems
123
+
124
+ ---
125
+
126
+ ## Alerting Best Practices
127
+
128
+ ### Alert on Symptoms, Not Causes
129
+
130
+ **BAD** (cause-based):
131
+ - Alert: "CPU usage >80%"
132
+ - Problem: CPU can be high without user impact
133
+
134
+ **GOOD** (symptom-based):
135
+ - Alert: "API response time >1s"
136
+ - Why: Users actually experiencing slowness
137
+
138
+ ---
139
+
140
+ ### Alert Severity Levels
141
+
142
+ **P1 (SEV1) - Page On-Call**:
143
+ - Service down (availability <99%)
144
+ - Data loss
145
+ - Security breach
146
+ - Response time >5s (unusable)
147
+
148
+ **P2 (SEV2) - Notify During Business Hours**:
149
+ - Degraded performance (response time >1s)
150
+ - Error rate >1%
151
+ - Disk >90% full
152
+
153
+ **P3 (SEV3) - Email/Slack**:
154
+ - Warning signs (disk >80%, memory >80%)
155
+ - Non-critical errors
156
+ - Monitoring gaps
157
+
158
+ ---
159
+
160
+ ### Alert Fatigue Prevention
161
+
162
+ **Rules**:
163
+ 1. **Actionable**: Every alert must have a clear action
164
+ 2. **Meaningful**: Alert only on real problems
165
+ 3. **Context**: Include relevant info (which server, which metric)
166
+ 4. **Deduplicate**: Don't alert 100 times for the same issue
167
+ 5. **Escalate**: Auto-escalate if not acknowledged
168
+
169
+ **Example Bad Alert**:
170
+ ```
171
+ Subject: Alert
172
+ Body: Server is down
173
+ ```
174
+
175
+ **Example Good Alert**:
176
+ ```
177
+ Subject: [P1] API Server Down - Production
178
+ Body:
179
+ - Service: api.example.com
180
+ - Issue: Health check failing for 5 minutes
181
+ - Impact: All users affected (100%)
182
+ - Runbook: https://wiki.example.com/runbook/api-down
183
+ - Dashboard: https://grafana.example.com/d/api
184
+ ```
185
+
186
+ ---
187
+
188
+ ## Monitoring Setup
189
+
190
+ ### Application Monitoring
191
+
192
+ #### Prometheus + Grafana
193
+
194
+ **Install Prometheus Client** (Node.js):
195
+ ```javascript
196
+ const client = require('prom-client');
197
+
198
+ // Enable default metrics (CPU, memory, etc.)
199
+ client.collectDefaultMetrics();
200
+
201
+ // Custom metrics
202
+ const httpRequestDuration = new client.Histogram({
203
+ name: 'http_request_duration_seconds',
204
+ help: 'HTTP request duration in seconds',
205
+ labelNames: ['method', 'route', 'status'],
206
+ });
207
+
208
+ // Instrument code
209
+ app.use((req, res, next) => {
210
+ const end = httpRequestDuration.startTimer();
211
+ res.on('finish', () => {
212
+ end({ method: req.method, route: req.route ? req.route.path : 'unmatched', status: res.statusCode });
213
+ });
214
+ next();
215
+ });
216
+
217
+ // Expose metrics endpoint
218
+ app.get('/metrics', async (req, res) => {
219
+ res.set('Content-Type', client.register.contentType);
220
+ res.end(await client.register.metrics());
221
+ });
222
+ ```
223
+
224
+ **Prometheus Config** (prometheus.yml):
225
+ ```yaml
226
+ scrape_configs:
227
+   - job_name: 'api-server'
228
+     static_configs:
229
+       - targets: ['localhost:3000']
230
+     scrape_interval: 15s
231
+ ```
232
+
233
+ ---
234
+
235
+ ### Log Aggregation
236
+
237
+ #### ELK Stack
238
+
239
+ **Application** (send logs to Logstash):
240
+ ```javascript
241
+ const winston = require('winston');
242
+ const LogstashTransport = require('winston-logstash-transport').LogstashTransport;
243
+
244
+ const logger = winston.createLogger({
245
+ transports: [
246
+ new LogstashTransport({
247
+ host: 'logstash.example.com',
248
+ port: 5000,
249
+ }),
250
+ ],
251
+ });
252
+
253
+ logger.info('User logged in', { userId: 123, ip: '192.168.1.1' });
254
+ ```
255
+
256
+ **Logstash Config**:
257
+ ```
258
+ input {
259
+ tcp {
260
+ port => 5000
261
+ codec => json
262
+ }
263
+ }
264
+
265
+ output {
266
+ elasticsearch {
267
+ hosts => ["elasticsearch:9200"]
268
+ index => "application-logs-%{+YYYY.MM.dd}"
269
+ }
270
+ }
271
+ ```
272
+
273
+ ---
274
+
275
+ ### Health Checks
276
+
277
+ **Purpose**: Check if service is healthy and ready to serve traffic
278
+
279
+ **Types**:
280
+ 1. **Liveness**: Is the service running? (restart if fails)
281
+ 2. **Readiness**: Is the service ready to serve traffic? (remove from load balancer if fails)
282
+
283
+ **Example** (Express.js):
284
+ ```javascript
285
+ // Liveness probe (simple check)
286
+ app.get('/healthz', (req, res) => {
287
+ res.status(200).send('OK');
288
+ });
289
+
290
+ // Readiness probe (check dependencies)
291
+ app.get('/ready', async (req, res) => {
292
+ try {
293
+ // Check database
294
+ await db.query('SELECT 1');
295
+
296
+ // Check Redis
297
+ await redis.ping();
298
+
299
+ // Check external API
300
+ await fetch('https://api.external.com/health');
301
+
302
+ res.status(200).send('Ready');
303
+ } catch (error) {
304
+ res.status(503).send('Not ready');
305
+ }
306
+ });
307
+ ```
308
+
309
+ **Kubernetes**:
310
+ ```yaml
311
+ livenessProbe:
312
+   httpGet:
313
+     path: /healthz
314
+     port: 3000
315
+   initialDelaySeconds: 30
316
+   periodSeconds: 10
317
+
318
+ readinessProbe:
319
+   httpGet:
320
+     path: /ready
321
+     port: 3000
322
+   initialDelaySeconds: 10
323
+   periodSeconds: 5
324
+ ```
325
+
326
+ ---
327
+
328
+ ### SLI, SLO, SLA
329
+
330
+ **SLI** (Service Level Indicator):
331
+ - Metrics that measure service quality
332
+ - Examples: Response time, error rate, availability
333
+
334
+ **SLO** (Service Level Objective):
335
+ - Target for SLI
336
+ - Examples: "99.9% availability", "p95 response time <500ms"
337
+
338
+ **SLA** (Service Level Agreement):
339
+ - Contract with users (with penalties)
340
+ - Examples: "99.9% uptime or refund"
341
+
342
+ **Example**:
343
+ ```
344
+ SLI: Availability = (successful requests / total requests) * 100
345
+ SLO: Availability must be ≥99.9% per month
346
+ SLA: If availability <99.9%, users get 10% refund
347
+ ```
348
+
349
+ ---
350
+
351
+ ## Monitoring Checklist
352
+
353
+ **Application**:
354
+ - [ ] Response time metrics (p50, p95, p99)
355
+ - [ ] Error rate metrics (4xx, 5xx)
356
+ - [ ] Throughput metrics (requests per second)
357
+ - [ ] Health check endpoint (/healthz, /ready)
358
+ - [ ] Structured logging (JSON format)
359
+ - [ ] Distributed tracing (if microservices)
360
+
361
+ **Infrastructure**:
362
+ - [ ] CPU, memory, disk, network metrics
363
+ - [ ] System logs (syslog, journalctl)
364
+ - [ ] Cloud metrics (CloudWatch, Azure Monitor)
365
+ - [ ] Disk I/O metrics (iostat)
366
+
367
+ **Database**:
368
+ - [ ] Query performance metrics
369
+ - [ ] Connection pool metrics
370
+ - [ ] Slow query log enabled
371
+ - [ ] Deadlock monitoring
372
+
373
+ **Alerts**:
374
+ - [ ] P1 alerts for critical issues (page on-call)
375
+ - [ ] P2 alerts for degraded performance
376
+ - [ ] Runbook linked in alerts
377
+ - [ ] Dashboard linked in alerts
378
+ - [ ] Escalation policy configured
379
+
380
+ **Dashboards**:
381
+ - [ ] Overview dashboard (RED metrics: Rate, Errors, Duration)
382
+ - [ ] Infrastructure dashboard (CPU, memory, disk)
383
+ - [ ] Database dashboard (queries, connections)
384
+ - [ ] Business metrics dashboard (signups, revenue)
385
+
386
+ ---
387
+
388
+ ## Common Monitoring Patterns
389
+
390
+ ### RED Method (for services)
391
+
392
+ - **Rate**: Requests per second
393
+ - **Errors**: Error rate (%)
394
+ - **Duration**: Response time (p50, p95, p99)
395
+
396
+ **Dashboard**:
397
+ ```
398
+ +-----------------+ +-----------------+ +-----------------+
399
+ | Rate | | Errors | | Duration |
400
+ | 1000 req/s | | 0.5% | | p95: 250ms |
401
+ +-----------------+ +-----------------+ +-----------------+
402
+ ```
403
+
404
+ ### USE Method (for resources)
405
+
406
+ - **Utilization**: % of resource used (CPU, memory, disk)
407
+ - **Saturation**: Queue depth, backlog
408
+ - **Errors**: Error count
409
+
410
+ **Dashboard**:
411
+ ```
412
+ CPU: 70% utilization, 0.5 load average, 0 errors
413
+ Memory: 80% utilization, 0 swap, 0 OOM kills
414
+ Disk: 60% utilization, 5ms latency, 0 I/O errors
415
+ ```
416
+
417
+ ---
418
+
419
+ ## Tools Comparison
420
+
421
+ | Tool | Type | Best For | Cost |
422
+ |------|------|----------|------|
423
+ | Prometheus + Grafana | Metrics | Self-hosted, cost-effective | Free |
424
+ | DataDog | Metrics, Logs, APM | All-in-one, easy setup | $15/host/month |
425
+ | New Relic | APM | Application performance | $99/user/month |
426
+ | ELK Stack | Logs | Log aggregation | Free (self-hosted) |
427
+ | Splunk | Logs | Enterprise log analysis | $1800/GB/year |
428
+ | Jaeger | Traces | Distributed tracing | Free |
429
+ | CloudWatch | Metrics, Logs | AWS-native | $0.30/metric/month |
430
+ | Azure Monitor | Metrics, Logs | Azure-native | $0.25/metric/month |
431
+
432
+ ---
433
+
434
+ ## Related Documentation
435
+
436
+ - [AGENT.md](../AGENT.md) - Main SRE agent
437
+ - [backend-diagnostics.md](backend-diagnostics.md) - Application troubleshooting
438
+ - [database-diagnostics.md](database-diagnostics.md) - Database monitoring
439
+ - [infrastructure.md](infrastructure.md) - Infrastructure monitoring