musubi-sdd 5.0.0 → 5.6.1

This diff shows the changes between two publicly released versions of the package, as they appear in one of the supported public registries. It is provided for informational purposes only.
Files changed (232)
  1. package/README.ja.md +106 -48
  2. package/README.md +110 -32
  3. package/bin/musubi-analyze.js +74 -67
  4. package/bin/musubi-browser.js +27 -26
  5. package/bin/musubi-change.js +48 -47
  6. package/bin/musubi-checkpoint.js +10 -7
  7. package/bin/musubi-convert.js +25 -25
  8. package/bin/musubi-costs.js +27 -10
  9. package/bin/musubi-gui.js +52 -46
  10. package/bin/musubi-init.js +1952 -10
  11. package/bin/musubi-orchestrate.js +327 -239
  12. package/bin/musubi-remember.js +69 -56
  13. package/bin/musubi-resolve.js +53 -45
  14. package/bin/musubi-trace.js +51 -22
  15. package/bin/musubi-validate.js +39 -30
  16. package/bin/musubi-workflow.js +33 -34
  17. package/bin/musubi.js +39 -2
  18. package/package.json +1 -1
  19. package/src/agents/agent-loop.js +94 -95
  20. package/src/agents/agentic/code-generator.js +119 -109
  21. package/src/agents/agentic/code-reviewer.js +105 -108
  22. package/src/agents/agentic/index.js +4 -4
  23. package/src/agents/browser/action-executor.js +13 -13
  24. package/src/agents/browser/ai-comparator.js +11 -10
  25. package/src/agents/browser/context-manager.js +6 -6
  26. package/src/agents/browser/index.js +5 -5
  27. package/src/agents/browser/nl-parser.js +31 -46
  28. package/src/agents/browser/screenshot.js +2 -2
  29. package/src/agents/browser/test-generator.js +6 -4
  30. package/src/agents/function-tool.js +71 -65
  31. package/src/agents/index.js +7 -7
  32. package/src/agents/schema-generator.js +98 -94
  33. package/src/analyzers/ast-extractor.js +164 -145
  34. package/src/analyzers/codegraph-auto-update.js +858 -0
  35. package/src/analyzers/complexity-analyzer.js +536 -0
  36. package/src/analyzers/context-optimizer.js +247 -125
  37. package/src/analyzers/impact-analyzer.js +1 -1
  38. package/src/analyzers/large-project-analyzer.js +766 -0
  39. package/src/analyzers/repository-map.js +83 -80
  40. package/src/analyzers/security-analyzer.js +19 -11
  41. package/src/analyzers/stuck-detector.js +19 -17
  42. package/src/converters/index.js +78 -57
  43. package/src/converters/ir/types.js +12 -12
  44. package/src/converters/parsers/musubi-parser.js +134 -126
  45. package/src/converters/parsers/openapi-parser.js +70 -53
  46. package/src/converters/parsers/speckit-parser.js +239 -175
  47. package/src/converters/writers/musubi-writer.js +123 -118
  48. package/src/converters/writers/speckit-writer.js +124 -113
  49. package/src/generators/rust-migration-generator.js +512 -0
  50. package/src/gui/public/index.html +1365 -1211
  51. package/src/gui/server.js +41 -40
  52. package/src/gui/services/file-watcher.js +23 -8
  53. package/src/gui/services/project-scanner.js +26 -20
  54. package/src/gui/services/replanning-service.js +27 -23
  55. package/src/gui/services/traceability-service.js +8 -8
  56. package/src/gui/services/workflow-service.js +14 -7
  57. package/src/index.js +151 -0
  58. package/src/integrations/cicd.js +90 -104
  59. package/src/integrations/codegraph-mcp.js +643 -0
  60. package/src/integrations/documentation.js +142 -103
  61. package/src/integrations/examples.js +95 -80
  62. package/src/integrations/github-client.js +17 -17
  63. package/src/integrations/index.js +5 -5
  64. package/src/integrations/mcp/index.js +21 -21
  65. package/src/integrations/mcp/mcp-context-provider.js +76 -78
  66. package/src/integrations/mcp/mcp-discovery.js +74 -72
  67. package/src/integrations/mcp/mcp-tool-registry.js +99 -94
  68. package/src/integrations/mcp-connector.js +70 -66
  69. package/src/integrations/platforms.js +50 -49
  70. package/src/integrations/tool-discovery.js +37 -31
  71. package/src/llm-providers/anthropic-provider.js +11 -11
  72. package/src/llm-providers/base-provider.js +16 -18
  73. package/src/llm-providers/copilot-provider.js +22 -19
  74. package/src/llm-providers/index.js +26 -25
  75. package/src/llm-providers/ollama-provider.js +11 -11
  76. package/src/llm-providers/openai-provider.js +12 -12
  77. package/src/managers/agent-memory.js +36 -24
  78. package/src/managers/checkpoint-manager.js +4 -8
  79. package/src/managers/delta-spec.js +19 -19
  80. package/src/managers/index.js +13 -4
  81. package/src/managers/memory-condenser.js +35 -45
  82. package/src/managers/repo-skill-manager.js +57 -31
  83. package/src/managers/skill-loader.js +25 -22
  84. package/src/managers/skill-tools.js +36 -72
  85. package/src/managers/workflow.js +30 -22
  86. package/src/monitoring/cost-tracker.js +53 -44
  87. package/src/monitoring/incident-manager.js +123 -103
  88. package/src/monitoring/index.js +144 -134
  89. package/src/monitoring/observability.js +82 -59
  90. package/src/monitoring/quality-dashboard.js +51 -39
  91. package/src/monitoring/release-manager.js +70 -50
  92. package/src/orchestration/agent-skill-binding.js +39 -47
  93. package/src/orchestration/error-handler.js +65 -107
  94. package/src/orchestration/guardrails/base-guardrail.js +26 -24
  95. package/src/orchestration/guardrails/guardrail-rules.js +50 -64
  96. package/src/orchestration/guardrails/index.js +5 -5
  97. package/src/orchestration/guardrails/input-guardrail.js +58 -45
  98. package/src/orchestration/guardrails/output-guardrail.js +104 -81
  99. package/src/orchestration/guardrails/safety-check.js +79 -79
  100. package/src/orchestration/index.js +38 -55
  101. package/src/orchestration/mcp-tool-adapters.js +96 -99
  102. package/src/orchestration/orchestration-engine.js +21 -21
  103. package/src/orchestration/pattern-registry.js +60 -45
  104. package/src/orchestration/patterns/auto.js +34 -47
  105. package/src/orchestration/patterns/group-chat.js +59 -65
  106. package/src/orchestration/patterns/handoff.js +67 -65
  107. package/src/orchestration/patterns/human-in-loop.js +51 -72
  108. package/src/orchestration/patterns/nested.js +25 -40
  109. package/src/orchestration/patterns/sequential.js +35 -34
  110. package/src/orchestration/patterns/swarm.js +63 -56
  111. package/src/orchestration/patterns/triage.js +150 -109
  112. package/src/orchestration/reasoning/index.js +9 -9
  113. package/src/orchestration/reasoning/planning-engine.js +143 -140
  114. package/src/orchestration/reasoning/reasoning-engine.js +206 -144
  115. package/src/orchestration/reasoning/self-correction.js +121 -128
  116. package/src/orchestration/replanning/adaptive-goal-modifier.js +107 -112
  117. package/src/orchestration/replanning/alternative-generator.js +37 -42
  118. package/src/orchestration/replanning/config.js +63 -59
  119. package/src/orchestration/replanning/goal-progress-tracker.js +98 -100
  120. package/src/orchestration/replanning/index.js +24 -20
  121. package/src/orchestration/replanning/plan-evaluator.js +49 -50
  122. package/src/orchestration/replanning/plan-monitor.js +32 -28
  123. package/src/orchestration/replanning/proactive-path-optimizer.js +175 -178
  124. package/src/orchestration/replanning/replan-history.js +33 -26
  125. package/src/orchestration/replanning/replanning-engine.js +106 -108
  126. package/src/orchestration/skill-executor.js +107 -109
  127. package/src/orchestration/skill-registry.js +85 -89
  128. package/src/orchestration/workflow-examples.js +228 -231
  129. package/src/orchestration/workflow-executor.js +65 -68
  130. package/src/orchestration/workflow-orchestrator.js +72 -73
  131. package/src/phase4-integration.js +47 -40
  132. package/src/phase5-integration.js +89 -30
  133. package/src/reporters/coverage-report.js +82 -30
  134. package/src/reporters/hierarchical-reporter.js +498 -0
  135. package/src/reporters/traceability-matrix-report.js +29 -20
  136. package/src/resolvers/issue-resolver.js +43 -31
  137. package/src/steering/advanced-validation.js +133 -124
  138. package/src/steering/auto-updater.js +60 -73
  139. package/src/steering/index.js +6 -6
  140. package/src/steering/quality-metrics.js +41 -35
  141. package/src/steering/steering-auto-update.js +83 -86
  142. package/src/steering/steering-validator.js +98 -106
  143. package/src/steering/template-constraints.js +53 -54
  144. package/src/templates/agents/claude-code/CLAUDE.md +32 -32
  145. package/src/templates/agents/claude-code/skills/agent-assistant/SKILL.md +13 -5
  146. package/src/templates/agents/claude-code/skills/ai-ml-engineer/mlops-guide.md +23 -23
  147. package/src/templates/agents/claude-code/skills/ai-ml-engineer/model-card-template.md +60 -41
  148. package/src/templates/agents/claude-code/skills/api-designer/api-patterns.md +27 -19
  149. package/src/templates/agents/claude-code/skills/api-designer/openapi-template.md +11 -7
  150. package/src/templates/agents/claude-code/skills/bug-hunter/SKILL.md +4 -3
  151. package/src/templates/agents/claude-code/skills/bug-hunter/root-cause-analysis.md +37 -15
  152. package/src/templates/agents/claude-code/skills/change-impact-analyzer/dependency-graph-patterns.md +36 -42
  153. package/src/templates/agents/claude-code/skills/change-impact-analyzer/impact-analysis-template.md +69 -60
  154. package/src/templates/agents/claude-code/skills/cloud-architect/aws-patterns.md +31 -38
  155. package/src/templates/agents/claude-code/skills/cloud-architect/azure-patterns.md +28 -23
  156. package/src/templates/agents/claude-code/skills/code-reviewer/SKILL.md +61 -0
  157. package/src/templates/agents/claude-code/skills/code-reviewer/best-practices.md +27 -0
  158. package/src/templates/agents/claude-code/skills/code-reviewer/review-checklist.md +29 -10
  159. package/src/templates/agents/claude-code/skills/code-reviewer/review-standards.md +29 -24
  160. package/src/templates/agents/claude-code/skills/constitution-enforcer/SKILL.md +8 -6
  161. package/src/templates/agents/claude-code/skills/constitution-enforcer/constitutional-articles.md +62 -26
  162. package/src/templates/agents/claude-code/skills/constitution-enforcer/phase-minus-one-gates.md +35 -16
  163. package/src/templates/agents/claude-code/skills/database-administrator/backup-recovery.md +27 -17
  164. package/src/templates/agents/claude-code/skills/database-administrator/tuning-guide.md +25 -20
  165. package/src/templates/agents/claude-code/skills/database-schema-designer/schema-patterns.md +39 -22
  166. package/src/templates/agents/claude-code/skills/devops-engineer/ci-cd-templates.md +25 -22
  167. package/src/templates/agents/claude-code/skills/issue-resolver/SKILL.md +24 -21
  168. package/src/templates/agents/claude-code/skills/orchestrator/SKILL.md +148 -63
  169. package/src/templates/agents/claude-code/skills/orchestrator/patterns.md +35 -16
  170. package/src/templates/agents/claude-code/skills/orchestrator/selection-matrix.md +69 -64
  171. package/src/templates/agents/claude-code/skills/performance-engineer/optimization-playbook.md +47 -47
  172. package/src/templates/agents/claude-code/skills/performance-optimizer/SKILL.md +69 -0
  173. package/src/templates/agents/claude-code/skills/performance-optimizer/benchmark-template.md +63 -45
  174. package/src/templates/agents/claude-code/skills/performance-optimizer/optimization-patterns.md +33 -35
  175. package/src/templates/agents/claude-code/skills/project-manager/SKILL.md +7 -6
  176. package/src/templates/agents/claude-code/skills/project-manager/agile-ceremonies.md +47 -28
  177. package/src/templates/agents/claude-code/skills/project-manager/project-templates.md +94 -78
  178. package/src/templates/agents/claude-code/skills/quality-assurance/SKILL.md +20 -17
  179. package/src/templates/agents/claude-code/skills/quality-assurance/qa-plan-template.md +63 -49
  180. package/src/templates/agents/claude-code/skills/release-coordinator/SKILL.md +5 -5
  181. package/src/templates/agents/claude-code/skills/release-coordinator/feature-flag-guide.md +30 -26
  182. package/src/templates/agents/claude-code/skills/release-coordinator/release-plan-template.md +67 -35
  183. package/src/templates/agents/claude-code/skills/requirements-analyst/ears-format.md +54 -42
  184. package/src/templates/agents/claude-code/skills/requirements-analyst/validation-rules.md +36 -33
  185. package/src/templates/agents/claude-code/skills/security-auditor/SKILL.md +77 -19
  186. package/src/templates/agents/claude-code/skills/security-auditor/audit-checklists.md +24 -24
  187. package/src/templates/agents/claude-code/skills/security-auditor/owasp-top-10.md +61 -20
  188. package/src/templates/agents/claude-code/skills/security-auditor/vulnerability-patterns.md +43 -11
  189. package/src/templates/agents/claude-code/skills/site-reliability-engineer/SKILL.md +1 -0
  190. package/src/templates/agents/claude-code/skills/site-reliability-engineer/incident-response-template.md +55 -25
  191. package/src/templates/agents/claude-code/skills/site-reliability-engineer/observability-patterns.md +78 -68
  192. package/src/templates/agents/claude-code/skills/site-reliability-engineer/slo-sli-guide.md +73 -53
  193. package/src/templates/agents/claude-code/skills/software-developer/solid-principles.md +83 -37
  194. package/src/templates/agents/claude-code/skills/software-developer/test-first-workflow.md +38 -31
  195. package/src/templates/agents/claude-code/skills/steering/SKILL.md +1 -0
  196. package/src/templates/agents/claude-code/skills/steering/auto-update-rules.md +31 -0
  197. package/src/templates/agents/claude-code/skills/system-architect/adr-template.md +25 -7
  198. package/src/templates/agents/claude-code/skills/system-architect/c4-model-guide.md +74 -61
  199. package/src/templates/agents/claude-code/skills/technical-writer/doc-templates/documentation-templates.md +70 -52
  200. package/src/templates/agents/claude-code/skills/test-engineer/SKILL.md +2 -0
  201. package/src/templates/agents/claude-code/skills/test-engineer/ears-test-mapping.md +75 -71
  202. package/src/templates/agents/claude-code/skills/test-engineer/test-types.md +85 -63
  203. package/src/templates/agents/claude-code/skills/traceability-auditor/coverage-matrix-template.md +39 -36
  204. package/src/templates/agents/claude-code/skills/traceability-auditor/gap-detection-rules.md +22 -17
  205. package/src/templates/agents/claude-code/skills/ui-ux-designer/SKILL.md +1 -0
  206. package/src/templates/agents/claude-code/skills/ui-ux-designer/accessibility-guidelines.md +49 -75
  207. package/src/templates/agents/claude-code/skills/ui-ux-designer/design-system-components.md +71 -59
  208. package/src/templates/agents/codex/AGENTS.md +74 -42
  209. package/src/templates/agents/cursor/AGENTS.md +74 -42
  210. package/src/templates/agents/gemini-cli/GEMINI.md +74 -42
  211. package/src/templates/agents/github-copilot/AGENTS.md +83 -51
  212. package/src/templates/agents/qwen-code/QWEN.md +74 -42
  213. package/src/templates/agents/windsurf/AGENTS.md +74 -42
  214. package/src/templates/architectures/README.md +41 -0
  215. package/src/templates/architectures/clean-architecture/README.md +113 -0
  216. package/src/templates/architectures/event-driven/README.md +162 -0
  217. package/src/templates/architectures/hexagonal/README.md +130 -0
  218. package/src/templates/index.js +6 -1
  219. package/src/templates/locale-manager.js +16 -16
  220. package/src/templates/shared/delta-spec-template.md +20 -13
  221. package/src/templates/shared/github-actions/musubi-issue-resolver.yml +5 -5
  222. package/src/templates/shared/github-actions/musubi-security-check.yml +3 -3
  223. package/src/templates/shared/github-actions/musubi-validate.yml +4 -4
  224. package/src/templates/shared/steering/structure.md +95 -0
  225. package/src/templates/skills/browser-agent.md +21 -16
  226. package/src/templates/skills/web-gui.md +8 -0
  227. package/src/templates/template-constraints.js +50 -53
  228. package/src/validators/advanced-validation.js +30 -36
  229. package/src/validators/constitutional-validator.js +77 -73
  230. package/src/validators/critic-system.js +49 -59
  231. package/src/validators/delta-format.js +59 -55
  232. package/src/validators/traceability-validator.js +7 -11
@@ -18,12 +18,12 @@ Template for handling and documenting incidents.
18
18
 
19
19
  ## Incident Severity Levels
20
20
 
21
- | Level | Name | Description | Response Time | Example |
22
- |-------|------|-------------|---------------|---------|
23
- | SEV-1 | Critical | Complete outage | 15 min | Site down |
24
- | SEV-2 | Major | Partial outage | 30 min | Payment failures |
25
- | SEV-3 | Minor | Degraded service | 2 hours | Slow responses |
26
- | SEV-4 | Low | Minimal impact | Next day | Minor UI bug |
21
+ | Level | Name | Description | Response Time | Example |
22
+ | ----- | -------- | ---------------- | ------------- | ---------------- |
23
+ | SEV-1 | Critical | Complete outage | 15 min | Site down |
24
+ | SEV-2 | Major | Partial outage | 30 min | Payment failures |
25
+ | SEV-3 | Minor | Degraded service | 2 hours | Slow responses |
26
+ | SEV-4 | Low | Minimal impact | Next day | Minor UI bug |
27
27
 
28
28
  ---
29
29
 
@@ -33,6 +33,7 @@ Template for handling and documenting incidents.
33
33
  # Incident Report: [INC-XXXX] [Title]
34
34
 
35
35
  ## Summary
36
+
36
37
  **Status**: Active / Mitigated / Resolved
37
38
  **Severity**: SEV-1 / SEV-2 / SEV-3 / SEV-4
38
39
  **Duration**: [Start time] - [End time] (X hours Y minutes)
@@ -42,50 +43,56 @@ Template for handling and documenting incidents.
42
43
 
43
44
  ## Timeline
44
45
 
45
- | Time (UTC) | Event |
46
- |------------|-------|
47
- | HH:MM | Incident detected via [monitoring/user report] |
48
- | HH:MM | Incident commander assigned: [Name] |
49
- | HH:MM | [Action taken] |
50
- | HH:MM | Root cause identified |
51
- | HH:MM | Mitigation deployed |
52
- | HH:MM | Service restored |
53
- | HH:MM | Incident resolved |
46
+ | Time (UTC) | Event |
47
+ | ---------- | ---------------------------------------------- |
48
+ | HH:MM | Incident detected via [monitoring/user report] |
49
+ | HH:MM | Incident commander assigned: [Name] |
50
+ | HH:MM | [Action taken] |
51
+ | HH:MM | Root cause identified |
52
+ | HH:MM | Mitigation deployed |
53
+ | HH:MM | Service restored |
54
+ | HH:MM | Incident resolved |
54
55
 
55
56
  ---
56
57
 
57
58
  ## Impact
58
59
 
59
60
  ### Users Affected
61
+
60
62
  - [Number] users impacted
61
63
  - [Regions/segments] affected
62
64
  - [Features] unavailable
63
65
 
64
66
  ### Business Impact
67
+
65
68
  - [Revenue impact if any]
66
69
  - [SLA breach if any]
67
70
  - [Reputational impact]
68
71
 
69
72
  ### Metrics
70
- | Metric | During Incident | Normal |
71
- |--------|-----------------|--------|
72
- | Error Rate | X% | Y% |
73
- | Latency p99 | Xms | Yms |
74
- | Availability | X% | Y% |
73
+
74
+ | Metric | During Incident | Normal |
75
+ | ------------ | --------------- | ------ |
76
+ | Error Rate | X% | Y% |
77
+ | Latency p99 | Xms | Yms |
78
+ | Availability | X% | Y% |
75
79
 
76
80
  ---
77
81
 
78
82
  ## Root Cause
79
83
 
80
84
  ### What Happened
85
+
81
86
  [Detailed technical explanation of the root cause]
82
87
 
83
88
  ### Why It Happened
89
+
84
90
  - [Contributing factor 1]
85
91
  - [Contributing factor 2]
86
92
  - [Why wasn't this caught earlier?]
87
93
 
88
94
  ### Timeline of Events Leading to Incident
95
+
89
96
  1. [Event 1]
90
97
  2. [Event 2]
91
98
  3. [Event that triggered incident]
@@ -95,14 +102,17 @@ Template for handling and documenting incidents.
95
102
  ## Resolution
96
103
 
97
104
  ### Immediate Actions
105
+
98
106
  - [Action 1]: [Result]
99
107
  - [Action 2]: [Result]
100
108
 
101
109
  ### Mitigation Steps
110
+
102
111
  1. [Step taken to mitigate]
103
112
  2. [Step taken to mitigate]
104
113
 
105
114
  ### Permanent Fix
115
+
106
116
  [Description of permanent fix implemented or planned]
107
117
 
108
118
  ---
@@ -110,27 +120,31 @@ Template for handling and documenting incidents.
110
120
  ## Lessons Learned
111
121
 
112
122
  ### What Went Well
123
+
113
124
  - [Positive 1]
114
125
  - [Positive 2]
115
126
 
116
127
  ### What Went Wrong
128
+
117
129
  - [Issue 1]
118
130
  - [Issue 2]
119
131
 
120
132
  ### Where We Got Lucky
133
+
121
134
  - [Lucky circumstance]
122
135
 
123
136
  ---
124
137
 
125
138
  ## Action Items
126
139
 
127
- | Priority | Action | Owner | Due Date | Status |
128
- |----------|--------|-------|----------|--------|
129
- | P1 | [Action item] | [Name] | [Date] | Open |
130
- | P2 | [Action item] | [Name] | [Date] | Open |
131
- | P2 | [Action item] | [Name] | [Date] | Open |
140
+ | Priority | Action | Owner | Due Date | Status |
141
+ | -------- | ------------- | ------ | -------- | ------ |
142
+ | P1 | [Action item] | [Name] | [Date] | Open |
143
+ | P2 | [Action item] | [Name] | [Date] | Open |
144
+ | P2 | [Action item] | [Name] | [Date] | Open |
132
145
 
133
146
  ### Follow-up Tasks
147
+
134
148
  - [ ] Schedule post-mortem meeting
135
149
  - [ ] Update runbooks
136
150
  - [ ] Improve monitoring/alerting
@@ -142,12 +156,14 @@ Template for handling and documenting incidents.
142
156
  ## Appendix
143
157
 
144
158
  ### Related Links
159
+
145
160
  - [Dashboard during incident]
146
161
  - [Relevant logs]
147
162
  - [Related tickets]
148
163
  - [Communication thread]
149
164
 
150
165
  ### Attendees
166
+
151
167
  - Incident Commander: [Name]
152
168
  - Communications Lead: [Name]
153
169
  - Technical Lead: [Name]
@@ -159,30 +175,35 @@ Template for handling and documenting incidents.
159
175
  ## Incident Commander Checklist
160
176
 
161
177
  ### Detection (0-5 min)
178
+
162
179
  - [ ] Acknowledge alert
163
180
  - [ ] Assess severity
164
181
  - [ ] Declare incident if needed
165
182
  - [ ] Assign yourself as IC
166
183
 
167
184
  ### Triage (5-15 min)
185
+
168
186
  - [ ] Create incident channel
169
187
  - [ ] Page relevant teams
170
188
  - [ ] Start incident doc
171
189
  - [ ] Begin timeline
172
190
 
173
191
  ### Communication (Ongoing)
192
+
174
193
  - [ ] Post initial status update
175
194
  - [ ] Update status every 30 min
176
195
  - [ ] Coordinate with comms team
177
196
  - [ ] Notify stakeholders
178
197
 
179
198
  ### Mitigation (Until resolved)
199
+
180
200
  - [ ] Assign investigation tasks
181
201
  - [ ] Consider rollback
182
202
  - [ ] Implement workarounds
183
203
  - [ ] Monitor progress
184
204
 
185
205
  ### Resolution
206
+
186
207
  - [ ] Confirm service restored
187
208
  - [ ] Post final status
188
209
  - [ ] Schedule post-mortem
@@ -193,6 +214,7 @@ Template for handling and documenting incidents.
193
214
  ## Communication Templates
194
215
 
195
216
  ### Internal Status Update
217
+
196
218
  ```
197
219
  🔴 [SEV-1] [Service] Incident - Update #[N]
198
220
 
@@ -205,6 +227,7 @@ IC: @[name] | Thread: [link]
205
227
  ```
206
228
 
207
229
  ### External Status Page
230
+
208
231
  ```
209
232
  [Service Name] - [Status]
210
233
 
@@ -255,32 +278,39 @@ Last Updated: [Time] UTC
255
278
  # Runbook: [Issue Name]
256
279
 
257
280
  ## Symptoms
281
+
258
282
  - [What does this look like?]
259
283
  - [What alerts fire?]
260
284
 
261
285
  ## Quick Diagnosis
286
+
262
287
  1. Check [X] dashboard
263
288
  2. Run `command`
264
289
  3. Look for [pattern]
265
290
 
266
291
  ## Common Causes
292
+
267
293
  1. [Cause 1]: [How to verify]
268
294
  2. [Cause 2]: [How to verify]
269
295
 
270
296
  ## Resolution Steps
271
297
 
272
298
  ### For Cause 1
299
+
273
300
  1. [Step 1]
274
301
  2. [Step 2]
275
302
 
276
303
  ### For Cause 2
304
+
277
305
  1. [Step 1]
278
306
  2. [Step 2]
279
307
 
280
308
  ## Escalation
309
+
281
310
  If not resolved in 15 min, page [team].
282
311
 
283
312
  ## Related
313
+
284
314
  - [Link to other runbooks]
285
315
  - [Link to documentation]
286
316
  ```
@@ -9,12 +9,15 @@ Patterns for implementing effective observability in distributed systems.
9
9
  ## Three Pillars of Observability
10
10
 
11
11
  ### 1. Metrics (What)
12
+
12
13
  Numeric measurements aggregated over time.
13
14
 
14
15
  ### 2. Logs (Why)
16
+
15
17
  Discrete events with context.
16
18
 
17
19
  ### 3. Traces (Where)
20
+
18
21
  Request paths through distributed systems.
19
22
 
20
23
  ---
@@ -23,10 +26,10 @@ Request paths through distributed systems.
23
26
 
24
27
  ### RED Method (Request-focused)
25
28
 
26
- | Metric | Description |
27
- |--------|-------------|
28
- | **R**ate | Requests per second |
29
- | **E**rrors | Failed requests per second |
29
+ | Metric | Description |
30
+ | ------------ | -------------------------- |
31
+ | **R**ate | Requests per second |
32
+ | **E**rrors | Failed requests per second |
30
33
  | **D**uration | Response time distribution |
31
34
 
32
35
  ```typescript
@@ -36,61 +39,64 @@ import { Counter, Histogram } from 'prom-client';
36
39
  const requestCounter = new Counter({
37
40
  name: 'http_requests_total',
38
41
  help: 'Total HTTP requests',
39
- labelNames: ['method', 'path', 'status']
42
+ labelNames: ['method', 'path', 'status'],
40
43
  });
41
44
 
42
45
  const requestDuration = new Histogram({
43
46
  name: 'http_request_duration_seconds',
44
47
  help: 'HTTP request duration',
45
48
  labelNames: ['method', 'path'],
46
- buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5]
49
+ buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
47
50
  });
48
51
 
49
52
  app.use((req, res, next) => {
50
53
  const start = Date.now();
51
-
54
+
52
55
  res.on('finish', () => {
53
56
  const duration = (Date.now() - start) / 1000;
54
-
57
+
55
58
  requestCounter.inc({
56
59
  method: req.method,
57
60
  path: req.route?.path || req.path,
58
- status: res.statusCode
61
+ status: res.statusCode,
59
62
  });
60
-
61
- requestDuration.observe({
62
- method: req.method,
63
- path: req.route?.path || req.path
64
- }, duration);
63
+
64
+ requestDuration.observe(
65
+ {
66
+ method: req.method,
67
+ path: req.route?.path || req.path,
68
+ },
69
+ duration
70
+ );
65
71
  });
66
-
72
+
67
73
  next();
68
74
  });
69
75
  ```
70
76
 
71
77
  ### USE Method (Resource-focused)
72
78
 
73
- | Metric | Description |
74
- |--------|-------------|
75
- | **U**tilization | % time resource is busy |
76
- | **S**aturation | Queue depth, waiting work |
77
- | **E**rrors | Error events |
79
+ | Metric | Description |
80
+ | --------------- | ------------------------- |
81
+ | **U**tilization | % time resource is busy |
82
+ | **S**aturation | Queue depth, waiting work |
83
+ | **E**rrors | Error events |
78
84
 
79
85
  ```typescript
80
86
  // System metrics
81
87
  const cpuUtilization = new Gauge({
82
88
  name: 'system_cpu_utilization',
83
- help: 'CPU utilization percentage'
89
+ help: 'CPU utilization percentage',
84
90
  });
85
91
 
86
92
  const memoryUtilization = new Gauge({
87
93
  name: 'system_memory_utilization',
88
- help: 'Memory utilization percentage'
94
+ help: 'Memory utilization percentage',
89
95
  });
90
96
 
91
97
  const queueDepth = new Gauge({
92
98
  name: 'job_queue_depth',
93
- help: 'Number of jobs in queue'
99
+ help: 'Number of jobs in queue',
94
100
  });
95
101
  ```
96
102
 
@@ -145,7 +151,7 @@ const storage = new AsyncLocalStorage<{ correlationId: string }>();
145
151
  app.use((req, res, next) => {
146
152
  const correlationId = req.headers['x-correlation-id'] || uuid();
147
153
  res.setHeader('x-correlation-id', correlationId);
148
-
154
+
149
155
  storage.run({ correlationId }, () => {
150
156
  next();
151
157
  });
@@ -154,22 +160,25 @@ app.use((req, res, next) => {
154
160
  // Logger wrapper
155
161
  function log(level: string, message: string, data: object = {}) {
156
162
  const store = storage.getStore();
157
- logger[level]({
158
- correlationId: store?.correlationId,
159
- ...data
160
- }, message);
163
+ logger[level](
164
+ {
165
+ correlationId: store?.correlationId,
166
+ ...data,
167
+ },
168
+ message
169
+ );
161
170
  }
162
171
  ```
163
172
 
164
173
  ### Log Levels
165
174
 
166
- | Level | When to Use |
167
- |-------|-------------|
168
- | ERROR | Operation failed, needs attention |
169
- | WARN | Unexpected but handled condition |
170
- | INFO | Significant events (startup, requests) |
171
- | DEBUG | Detailed debugging information |
172
- | TRACE | Very detailed tracing |
175
+ | Level | When to Use |
176
+ | ----- | -------------------------------------- |
177
+ | ERROR | Operation failed, needs attention |
178
+ | WARN | Unexpected but handled condition |
179
+ | INFO | Significant events (startup, requests) |
180
+ | DEBUG | Detailed debugging information |
181
+ | TRACE | Very detailed tracing |
173
182
 
174
183
  ---
175
184
 
@@ -190,7 +199,7 @@ const provider = new NodeTracerProvider();
190
199
  provider.addSpanProcessor(
191
200
  new SimpleSpanProcessor(
192
201
  new JaegerExporter({
193
- endpoint: 'http://jaeger:14268/api/traces'
202
+ endpoint: 'http://jaeger:14268/api/traces',
194
203
  })
195
204
  )
196
205
  );
@@ -198,10 +207,7 @@ provider.addSpanProcessor(
198
207
  provider.register();
199
208
 
200
209
  registerInstrumentations({
201
- instrumentations: [
202
- new HttpInstrumentation(),
203
- new ExpressInstrumentation()
204
- ]
210
+ instrumentations: [new HttpInstrumentation(), new ExpressInstrumentation()],
205
211
  });
206
212
  ```
207
213
 
@@ -213,23 +219,23 @@ import { trace } from '@opentelemetry/api';
213
219
  const tracer = trace.getTracer('my-service');
214
220
 
215
221
  async function processOrder(order: Order) {
216
- return tracer.startActiveSpan('processOrder', async (span) => {
222
+ return tracer.startActiveSpan('processOrder', async span => {
217
223
  try {
218
224
  span.setAttribute('order.id', order.id);
219
225
  span.setAttribute('order.amount', order.amount);
220
-
226
+
221
227
  // Child span for payment
222
- await tracer.startActiveSpan('processPayment', async (paymentSpan) => {
228
+ await tracer.startActiveSpan('processPayment', async paymentSpan => {
223
229
  await paymentService.charge(order);
224
230
  paymentSpan.end();
225
231
  });
226
-
232
+
227
233
  // Child span for inventory
228
- await tracer.startActiveSpan('updateInventory', async (inventorySpan) => {
234
+ await tracer.startActiveSpan('updateInventory', async inventorySpan => {
229
235
  await inventoryService.reserve(order.items);
230
236
  inventorySpan.end();
231
237
  });
232
-
238
+
233
239
  span.setStatus({ code: SpanStatusCode.OK });
234
240
  } catch (error) {
235
241
  span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
@@ -262,8 +268,8 @@ groups:
262
268
  labels:
263
269
  severity: critical
264
270
  annotations:
265
- summary: "High error budget burn rate"
266
-
271
+ summary: 'High error budget burn rate'
272
+
267
273
  # Slow burn: 5% budget in 6 hours
268
274
  - alert: SlowErrorBudgetBurn
269
275
  expr: |
@@ -283,9 +289,9 @@ rules:
283
289
  # Good: User-facing symptom
284
290
  - alert: HighLatency
285
291
  expr: http_request_duration_seconds:p99 > 0.5
286
-
292
+
287
293
  # Avoid: Cause-based
288
- - alert: HighCPU # May not affect users
294
+ - alert: HighCPU # May not affect users
289
295
  expr: cpu_utilization > 80
290
296
  ```
291
297
 
@@ -298,36 +304,36 @@ rules:
298
304
  ```yaml
299
305
  # Grafana dashboard structure
300
306
  dashboard:
301
- title: "Service Overview"
307
+ title: 'Service Overview'
302
308
  rows:
303
- - title: "Traffic"
309
+ - title: 'Traffic'
304
310
  panels:
305
311
  - type: graph
306
- title: "Requests per Second"
307
- query: "rate(http_requests_total[5m])"
308
-
309
- - title: "Errors"
312
+ title: 'Requests per Second'
313
+ query: 'rate(http_requests_total[5m])'
314
+
315
+ - title: 'Errors'
310
316
  panels:
311
317
  - type: graph
312
- title: "Error Rate"
318
+ title: 'Error Rate'
313
319
  query: "rate(http_requests_total{status=~'5..'}[5m])"
314
-
315
- - title: "Latency"
320
+
321
+ - title: 'Latency'
316
322
  panels:
317
323
  - type: graph
318
- title: "Latency Percentiles"
324
+ title: 'Latency Percentiles'
319
325
  queries:
320
- - "histogram_quantile(0.50, ...)"
321
- - "histogram_quantile(0.95, ...)"
322
- - "histogram_quantile(0.99, ...)"
323
-
324
- - title: "Saturation"
326
+ - 'histogram_quantile(0.50, ...)'
327
+ - 'histogram_quantile(0.95, ...)'
328
+ - 'histogram_quantile(0.99, ...)'
329
+
330
+ - title: 'Saturation'
325
331
  panels:
326
332
  - type: graph
327
- title: "Resource Usage"
333
+ title: 'Resource Usage'
328
334
  queries:
329
- - "cpu_utilization"
330
- - "memory_utilization"
335
+ - 'cpu_utilization'
336
+ - 'memory_utilization'
331
337
  ```
332
338
 
333
339
  ---
@@ -335,24 +341,28 @@ dashboard:
335
341
  ## Observability Checklist
336
342
 
337
343
  ### Metrics
344
+
338
345
  - [ ] RED metrics for all services
339
346
  - [ ] USE metrics for resources
340
347
  - [ ] Business metrics tracked
341
348
  - [ ] SLI metrics defined
342
349
 
343
350
  ### Logging
351
+
344
352
  - [ ] Structured JSON logs
345
353
  - [ ] Correlation IDs propagated
346
354
  - [ ] Appropriate log levels
347
355
  - [ ] Sensitive data masked
348
356
 
349
357
  ### Tracing
358
+
350
359
  - [ ] Distributed tracing enabled
351
360
  - [ ] Span context propagated
352
361
  - [ ] Key operations instrumented
353
362
  - [ ] Sampling configured
354
363
 
355
364
  ### Alerting
365
+
356
366
  - [ ] SLO-based alerts
357
367
  - [ ] Multi-window burn rates
358
368
  - [ ] Clear runbooks linked