musubi-sdd 5.0.0 → 5.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +106 -48
- package/README.md +110 -32
- package/bin/musubi-analyze.js +74 -67
- package/bin/musubi-browser.js +27 -26
- package/bin/musubi-change.js +48 -47
- package/bin/musubi-checkpoint.js +10 -7
- package/bin/musubi-convert.js +25 -25
- package/bin/musubi-costs.js +27 -10
- package/bin/musubi-gui.js +52 -46
- package/bin/musubi-init.js +1952 -10
- package/bin/musubi-orchestrate.js +327 -239
- package/bin/musubi-remember.js +69 -56
- package/bin/musubi-resolve.js +53 -45
- package/bin/musubi-trace.js +51 -22
- package/bin/musubi-validate.js +39 -30
- package/bin/musubi-workflow.js +33 -34
- package/bin/musubi.js +39 -2
- package/package.json +1 -1
- package/src/agents/agent-loop.js +94 -95
- package/src/agents/agentic/code-generator.js +119 -109
- package/src/agents/agentic/code-reviewer.js +105 -108
- package/src/agents/agentic/index.js +4 -4
- package/src/agents/browser/action-executor.js +13 -13
- package/src/agents/browser/ai-comparator.js +11 -10
- package/src/agents/browser/context-manager.js +6 -6
- package/src/agents/browser/index.js +5 -5
- package/src/agents/browser/nl-parser.js +31 -46
- package/src/agents/browser/screenshot.js +2 -2
- package/src/agents/browser/test-generator.js +6 -4
- package/src/agents/function-tool.js +71 -65
- package/src/agents/index.js +7 -7
- package/src/agents/schema-generator.js +98 -94
- package/src/analyzers/ast-extractor.js +164 -145
- package/src/analyzers/codegraph-auto-update.js +858 -0
- package/src/analyzers/complexity-analyzer.js +536 -0
- package/src/analyzers/context-optimizer.js +247 -125
- package/src/analyzers/impact-analyzer.js +1 -1
- package/src/analyzers/large-project-analyzer.js +766 -0
- package/src/analyzers/repository-map.js +83 -80
- package/src/analyzers/security-analyzer.js +19 -11
- package/src/analyzers/stuck-detector.js +19 -17
- package/src/converters/index.js +78 -57
- package/src/converters/ir/types.js +12 -12
- package/src/converters/parsers/musubi-parser.js +134 -126
- package/src/converters/parsers/openapi-parser.js +70 -53
- package/src/converters/parsers/speckit-parser.js +239 -175
- package/src/converters/writers/musubi-writer.js +123 -118
- package/src/converters/writers/speckit-writer.js +124 -113
- package/src/generators/rust-migration-generator.js +512 -0
- package/src/gui/public/index.html +1365 -1211
- package/src/gui/server.js +41 -40
- package/src/gui/services/file-watcher.js +23 -8
- package/src/gui/services/project-scanner.js +26 -20
- package/src/gui/services/replanning-service.js +27 -23
- package/src/gui/services/traceability-service.js +8 -8
- package/src/gui/services/workflow-service.js +14 -7
- package/src/index.js +151 -0
- package/src/integrations/cicd.js +90 -104
- package/src/integrations/codegraph-mcp.js +643 -0
- package/src/integrations/documentation.js +142 -103
- package/src/integrations/examples.js +95 -80
- package/src/integrations/github-client.js +17 -17
- package/src/integrations/index.js +5 -5
- package/src/integrations/mcp/index.js +21 -21
- package/src/integrations/mcp/mcp-context-provider.js +76 -78
- package/src/integrations/mcp/mcp-discovery.js +74 -72
- package/src/integrations/mcp/mcp-tool-registry.js +99 -94
- package/src/integrations/mcp-connector.js +70 -66
- package/src/integrations/platforms.js +50 -49
- package/src/integrations/tool-discovery.js +37 -31
- package/src/llm-providers/anthropic-provider.js +11 -11
- package/src/llm-providers/base-provider.js +16 -18
- package/src/llm-providers/copilot-provider.js +22 -19
- package/src/llm-providers/index.js +26 -25
- package/src/llm-providers/ollama-provider.js +11 -11
- package/src/llm-providers/openai-provider.js +12 -12
- package/src/managers/agent-memory.js +36 -24
- package/src/managers/checkpoint-manager.js +4 -8
- package/src/managers/delta-spec.js +19 -19
- package/src/managers/index.js +13 -4
- package/src/managers/memory-condenser.js +35 -45
- package/src/managers/repo-skill-manager.js +57 -31
- package/src/managers/skill-loader.js +25 -22
- package/src/managers/skill-tools.js +36 -72
- package/src/managers/workflow.js +30 -22
- package/src/monitoring/cost-tracker.js +53 -44
- package/src/monitoring/incident-manager.js +123 -103
- package/src/monitoring/index.js +144 -134
- package/src/monitoring/observability.js +82 -59
- package/src/monitoring/quality-dashboard.js +51 -39
- package/src/monitoring/release-manager.js +70 -50
- package/src/orchestration/agent-skill-binding.js +39 -47
- package/src/orchestration/error-handler.js +65 -107
- package/src/orchestration/guardrails/base-guardrail.js +26 -24
- package/src/orchestration/guardrails/guardrail-rules.js +50 -64
- package/src/orchestration/guardrails/index.js +5 -5
- package/src/orchestration/guardrails/input-guardrail.js +58 -45
- package/src/orchestration/guardrails/output-guardrail.js +104 -81
- package/src/orchestration/guardrails/safety-check.js +79 -79
- package/src/orchestration/index.js +38 -55
- package/src/orchestration/mcp-tool-adapters.js +96 -99
- package/src/orchestration/orchestration-engine.js +21 -21
- package/src/orchestration/pattern-registry.js +60 -45
- package/src/orchestration/patterns/auto.js +34 -47
- package/src/orchestration/patterns/group-chat.js +59 -65
- package/src/orchestration/patterns/handoff.js +67 -65
- package/src/orchestration/patterns/human-in-loop.js +51 -72
- package/src/orchestration/patterns/nested.js +25 -40
- package/src/orchestration/patterns/sequential.js +35 -34
- package/src/orchestration/patterns/swarm.js +63 -56
- package/src/orchestration/patterns/triage.js +150 -109
- package/src/orchestration/reasoning/index.js +9 -9
- package/src/orchestration/reasoning/planning-engine.js +143 -140
- package/src/orchestration/reasoning/reasoning-engine.js +206 -144
- package/src/orchestration/reasoning/self-correction.js +121 -128
- package/src/orchestration/replanning/adaptive-goal-modifier.js +107 -112
- package/src/orchestration/replanning/alternative-generator.js +37 -42
- package/src/orchestration/replanning/config.js +63 -59
- package/src/orchestration/replanning/goal-progress-tracker.js +98 -100
- package/src/orchestration/replanning/index.js +24 -20
- package/src/orchestration/replanning/plan-evaluator.js +49 -50
- package/src/orchestration/replanning/plan-monitor.js +32 -28
- package/src/orchestration/replanning/proactive-path-optimizer.js +175 -178
- package/src/orchestration/replanning/replan-history.js +33 -26
- package/src/orchestration/replanning/replanning-engine.js +106 -108
- package/src/orchestration/skill-executor.js +107 -109
- package/src/orchestration/skill-registry.js +85 -89
- package/src/orchestration/workflow-examples.js +228 -231
- package/src/orchestration/workflow-executor.js +65 -68
- package/src/orchestration/workflow-orchestrator.js +72 -73
- package/src/phase4-integration.js +47 -40
- package/src/phase5-integration.js +89 -30
- package/src/reporters/coverage-report.js +82 -30
- package/src/reporters/hierarchical-reporter.js +498 -0
- package/src/reporters/traceability-matrix-report.js +29 -20
- package/src/resolvers/issue-resolver.js +43 -31
- package/src/steering/advanced-validation.js +133 -124
- package/src/steering/auto-updater.js +60 -73
- package/src/steering/index.js +6 -6
- package/src/steering/quality-metrics.js +41 -35
- package/src/steering/steering-auto-update.js +83 -86
- package/src/steering/steering-validator.js +98 -106
- package/src/steering/template-constraints.js +53 -54
- package/src/templates/agents/claude-code/CLAUDE.md +32 -32
- package/src/templates/agents/claude-code/skills/agent-assistant/SKILL.md +13 -5
- package/src/templates/agents/claude-code/skills/ai-ml-engineer/mlops-guide.md +23 -23
- package/src/templates/agents/claude-code/skills/ai-ml-engineer/model-card-template.md +60 -41
- package/src/templates/agents/claude-code/skills/api-designer/api-patterns.md +27 -19
- package/src/templates/agents/claude-code/skills/api-designer/openapi-template.md +11 -7
- package/src/templates/agents/claude-code/skills/bug-hunter/SKILL.md +4 -3
- package/src/templates/agents/claude-code/skills/bug-hunter/root-cause-analysis.md +37 -15
- package/src/templates/agents/claude-code/skills/change-impact-analyzer/dependency-graph-patterns.md +36 -42
- package/src/templates/agents/claude-code/skills/change-impact-analyzer/impact-analysis-template.md +69 -60
- package/src/templates/agents/claude-code/skills/cloud-architect/aws-patterns.md +31 -38
- package/src/templates/agents/claude-code/skills/cloud-architect/azure-patterns.md +28 -23
- package/src/templates/agents/claude-code/skills/code-reviewer/SKILL.md +61 -0
- package/src/templates/agents/claude-code/skills/code-reviewer/best-practices.md +27 -0
- package/src/templates/agents/claude-code/skills/code-reviewer/review-checklist.md +29 -10
- package/src/templates/agents/claude-code/skills/code-reviewer/review-standards.md +29 -24
- package/src/templates/agents/claude-code/skills/constitution-enforcer/SKILL.md +8 -6
- package/src/templates/agents/claude-code/skills/constitution-enforcer/constitutional-articles.md +62 -26
- package/src/templates/agents/claude-code/skills/constitution-enforcer/phase-minus-one-gates.md +35 -16
- package/src/templates/agents/claude-code/skills/database-administrator/backup-recovery.md +27 -17
- package/src/templates/agents/claude-code/skills/database-administrator/tuning-guide.md +25 -20
- package/src/templates/agents/claude-code/skills/database-schema-designer/schema-patterns.md +39 -22
- package/src/templates/agents/claude-code/skills/devops-engineer/ci-cd-templates.md +25 -22
- package/src/templates/agents/claude-code/skills/issue-resolver/SKILL.md +24 -21
- package/src/templates/agents/claude-code/skills/orchestrator/SKILL.md +148 -63
- package/src/templates/agents/claude-code/skills/orchestrator/patterns.md +35 -16
- package/src/templates/agents/claude-code/skills/orchestrator/selection-matrix.md +69 -64
- package/src/templates/agents/claude-code/skills/performance-engineer/optimization-playbook.md +47 -47
- package/src/templates/agents/claude-code/skills/performance-optimizer/SKILL.md +69 -0
- package/src/templates/agents/claude-code/skills/performance-optimizer/benchmark-template.md +63 -45
- package/src/templates/agents/claude-code/skills/performance-optimizer/optimization-patterns.md +33 -35
- package/src/templates/agents/claude-code/skills/project-manager/SKILL.md +7 -6
- package/src/templates/agents/claude-code/skills/project-manager/agile-ceremonies.md +47 -28
- package/src/templates/agents/claude-code/skills/project-manager/project-templates.md +94 -78
- package/src/templates/agents/claude-code/skills/quality-assurance/SKILL.md +20 -17
- package/src/templates/agents/claude-code/skills/quality-assurance/qa-plan-template.md +63 -49
- package/src/templates/agents/claude-code/skills/release-coordinator/SKILL.md +5 -5
- package/src/templates/agents/claude-code/skills/release-coordinator/feature-flag-guide.md +30 -26
- package/src/templates/agents/claude-code/skills/release-coordinator/release-plan-template.md +67 -35
- package/src/templates/agents/claude-code/skills/requirements-analyst/ears-format.md +54 -42
- package/src/templates/agents/claude-code/skills/requirements-analyst/validation-rules.md +36 -33
- package/src/templates/agents/claude-code/skills/security-auditor/SKILL.md +77 -19
- package/src/templates/agents/claude-code/skills/security-auditor/audit-checklists.md +24 -24
- package/src/templates/agents/claude-code/skills/security-auditor/owasp-top-10.md +61 -20
- package/src/templates/agents/claude-code/skills/security-auditor/vulnerability-patterns.md +43 -11
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/incident-response-template.md +55 -25
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/observability-patterns.md +78 -68
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/slo-sli-guide.md +73 -53
- package/src/templates/agents/claude-code/skills/software-developer/solid-principles.md +83 -37
- package/src/templates/agents/claude-code/skills/software-developer/test-first-workflow.md +38 -31
- package/src/templates/agents/claude-code/skills/steering/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/steering/auto-update-rules.md +31 -0
- package/src/templates/agents/claude-code/skills/system-architect/adr-template.md +25 -7
- package/src/templates/agents/claude-code/skills/system-architect/c4-model-guide.md +74 -61
- package/src/templates/agents/claude-code/skills/technical-writer/doc-templates/documentation-templates.md +70 -52
- package/src/templates/agents/claude-code/skills/test-engineer/SKILL.md +2 -0
- package/src/templates/agents/claude-code/skills/test-engineer/ears-test-mapping.md +75 -71
- package/src/templates/agents/claude-code/skills/test-engineer/test-types.md +85 -63
- package/src/templates/agents/claude-code/skills/traceability-auditor/coverage-matrix-template.md +39 -36
- package/src/templates/agents/claude-code/skills/traceability-auditor/gap-detection-rules.md +22 -17
- package/src/templates/agents/claude-code/skills/ui-ux-designer/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/ui-ux-designer/accessibility-guidelines.md +49 -75
- package/src/templates/agents/claude-code/skills/ui-ux-designer/design-system-components.md +71 -59
- package/src/templates/agents/codex/AGENTS.md +74 -42
- package/src/templates/agents/cursor/AGENTS.md +74 -42
- package/src/templates/agents/gemini-cli/GEMINI.md +74 -42
- package/src/templates/agents/github-copilot/AGENTS.md +83 -51
- package/src/templates/agents/qwen-code/QWEN.md +74 -42
- package/src/templates/agents/windsurf/AGENTS.md +74 -42
- package/src/templates/architectures/README.md +41 -0
- package/src/templates/architectures/clean-architecture/README.md +113 -0
- package/src/templates/architectures/event-driven/README.md +162 -0
- package/src/templates/architectures/hexagonal/README.md +130 -0
- package/src/templates/index.js +6 -1
- package/src/templates/locale-manager.js +16 -16
- package/src/templates/shared/delta-spec-template.md +20 -13
- package/src/templates/shared/github-actions/musubi-issue-resolver.yml +5 -5
- package/src/templates/shared/github-actions/musubi-security-check.yml +3 -3
- package/src/templates/shared/github-actions/musubi-validate.yml +4 -4
- package/src/templates/shared/steering/structure.md +95 -0
- package/src/templates/skills/browser-agent.md +21 -16
- package/src/templates/skills/web-gui.md +8 -0
- package/src/templates/template-constraints.js +50 -53
- package/src/validators/advanced-validation.js +30 -36
- package/src/validators/constitutional-validator.js +77 -73
- package/src/validators/critic-system.js +49 -59
- package/src/validators/delta-format.js +59 -55
- package/src/validators/traceability-validator.js +7 -11
|
@@ -18,12 +18,12 @@ Template for handling and documenting incidents.
|
|
|
18
18
|
|
|
19
19
|
## Incident Severity Levels
|
|
20
20
|
|
|
21
|
-
| Level | Name
|
|
22
|
-
|
|
23
|
-
| SEV-1 | Critical | Complete outage
|
|
24
|
-
| SEV-2 | Major
|
|
25
|
-
| SEV-3 | Minor
|
|
26
|
-
| SEV-4 | Low
|
|
21
|
+
| Level | Name | Description | Response Time | Example |
|
|
22
|
+
| ----- | -------- | ---------------- | ------------- | ---------------- |
|
|
23
|
+
| SEV-1 | Critical | Complete outage | 15 min | Site down |
|
|
24
|
+
| SEV-2 | Major | Partial outage | 30 min | Payment failures |
|
|
25
|
+
| SEV-3 | Minor | Degraded service | 2 hours | Slow responses |
|
|
26
|
+
| SEV-4 | Low | Minimal impact | Next day | Minor UI bug |
|
|
27
27
|
|
|
28
28
|
---
|
|
29
29
|
|
|
@@ -33,6 +33,7 @@ Template for handling and documenting incidents.
|
|
|
33
33
|
# Incident Report: [INC-XXXX] [Title]
|
|
34
34
|
|
|
35
35
|
## Summary
|
|
36
|
+
|
|
36
37
|
**Status**: Active / Mitigated / Resolved
|
|
37
38
|
**Severity**: SEV-1 / SEV-2 / SEV-3 / SEV-4
|
|
38
39
|
**Duration**: [Start time] - [End time] (X hours Y minutes)
|
|
@@ -42,50 +43,56 @@ Template for handling and documenting incidents.
|
|
|
42
43
|
|
|
43
44
|
## Timeline
|
|
44
45
|
|
|
45
|
-
| Time (UTC) | Event
|
|
46
|
-
|
|
47
|
-
| HH:MM
|
|
48
|
-
| HH:MM
|
|
49
|
-
| HH:MM
|
|
50
|
-
| HH:MM
|
|
51
|
-
| HH:MM
|
|
52
|
-
| HH:MM
|
|
53
|
-
| HH:MM
|
|
46
|
+
| Time (UTC) | Event |
|
|
47
|
+
| ---------- | ---------------------------------------------- |
|
|
48
|
+
| HH:MM | Incident detected via [monitoring/user report] |
|
|
49
|
+
| HH:MM | Incident commander assigned: [Name] |
|
|
50
|
+
| HH:MM | [Action taken] |
|
|
51
|
+
| HH:MM | Root cause identified |
|
|
52
|
+
| HH:MM | Mitigation deployed |
|
|
53
|
+
| HH:MM | Service restored |
|
|
54
|
+
| HH:MM | Incident resolved |
|
|
54
55
|
|
|
55
56
|
---
|
|
56
57
|
|
|
57
58
|
## Impact
|
|
58
59
|
|
|
59
60
|
### Users Affected
|
|
61
|
+
|
|
60
62
|
- [Number] users impacted
|
|
61
63
|
- [Regions/segments] affected
|
|
62
64
|
- [Features] unavailable
|
|
63
65
|
|
|
64
66
|
### Business Impact
|
|
67
|
+
|
|
65
68
|
- [Revenue impact if any]
|
|
66
69
|
- [SLA breach if any]
|
|
67
70
|
- [Reputational impact]
|
|
68
71
|
|
|
69
72
|
### Metrics
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
|
73
|
-
|
|
|
74
|
-
|
|
|
73
|
+
|
|
74
|
+
| Metric | During Incident | Normal |
|
|
75
|
+
| ------------ | --------------- | ------ |
|
|
76
|
+
| Error Rate | X% | Y% |
|
|
77
|
+
| Latency p99 | Xms | Yms |
|
|
78
|
+
| Availability | X% | Y% |
|
|
75
79
|
|
|
76
80
|
---
|
|
77
81
|
|
|
78
82
|
## Root Cause
|
|
79
83
|
|
|
80
84
|
### What Happened
|
|
85
|
+
|
|
81
86
|
[Detailed technical explanation of the root cause]
|
|
82
87
|
|
|
83
88
|
### Why It Happened
|
|
89
|
+
|
|
84
90
|
- [Contributing factor 1]
|
|
85
91
|
- [Contributing factor 2]
|
|
86
92
|
- [Why wasn't this caught earlier?]
|
|
87
93
|
|
|
88
94
|
### Timeline of Events Leading to Incident
|
|
95
|
+
|
|
89
96
|
1. [Event 1]
|
|
90
97
|
2. [Event 2]
|
|
91
98
|
3. [Event that triggered incident]
|
|
@@ -95,14 +102,17 @@ Template for handling and documenting incidents.
|
|
|
95
102
|
## Resolution
|
|
96
103
|
|
|
97
104
|
### Immediate Actions
|
|
105
|
+
|
|
98
106
|
- [Action 1]: [Result]
|
|
99
107
|
- [Action 2]: [Result]
|
|
100
108
|
|
|
101
109
|
### Mitigation Steps
|
|
110
|
+
|
|
102
111
|
1. [Step taken to mitigate]
|
|
103
112
|
2. [Step taken to mitigate]
|
|
104
113
|
|
|
105
114
|
### Permanent Fix
|
|
115
|
+
|
|
106
116
|
[Description of permanent fix implemented or planned]
|
|
107
117
|
|
|
108
118
|
---
|
|
@@ -110,27 +120,31 @@ Template for handling and documenting incidents.
|
|
|
110
120
|
## Lessons Learned
|
|
111
121
|
|
|
112
122
|
### What Went Well
|
|
123
|
+
|
|
113
124
|
- [Positive 1]
|
|
114
125
|
- [Positive 2]
|
|
115
126
|
|
|
116
127
|
### What Went Wrong
|
|
128
|
+
|
|
117
129
|
- [Issue 1]
|
|
118
130
|
- [Issue 2]
|
|
119
131
|
|
|
120
132
|
### Where We Got Lucky
|
|
133
|
+
|
|
121
134
|
- [Lucky circumstance]
|
|
122
135
|
|
|
123
136
|
---
|
|
124
137
|
|
|
125
138
|
## Action Items
|
|
126
139
|
|
|
127
|
-
| Priority | Action
|
|
128
|
-
|
|
129
|
-
| P1
|
|
130
|
-
| P2
|
|
131
|
-
| P2
|
|
140
|
+
| Priority | Action | Owner | Due Date | Status |
|
|
141
|
+
| -------- | ------------- | ------ | -------- | ------ |
|
|
142
|
+
| P1 | [Action item] | [Name] | [Date] | Open |
|
|
143
|
+
| P2 | [Action item] | [Name] | [Date] | Open |
|
|
144
|
+
| P2 | [Action item] | [Name] | [Date] | Open |
|
|
132
145
|
|
|
133
146
|
### Follow-up Tasks
|
|
147
|
+
|
|
134
148
|
- [ ] Schedule post-mortem meeting
|
|
135
149
|
- [ ] Update runbooks
|
|
136
150
|
- [ ] Improve monitoring/alerting
|
|
@@ -142,12 +156,14 @@ Template for handling and documenting incidents.
|
|
|
142
156
|
## Appendix
|
|
143
157
|
|
|
144
158
|
### Related Links
|
|
159
|
+
|
|
145
160
|
- [Dashboard during incident]
|
|
146
161
|
- [Relevant logs]
|
|
147
162
|
- [Related tickets]
|
|
148
163
|
- [Communication thread]
|
|
149
164
|
|
|
150
165
|
### Attendees
|
|
166
|
+
|
|
151
167
|
- Incident Commander: [Name]
|
|
152
168
|
- Communications Lead: [Name]
|
|
153
169
|
- Technical Lead: [Name]
|
|
@@ -159,30 +175,35 @@ Template for handling and documenting incidents.
|
|
|
159
175
|
## Incident Commander Checklist
|
|
160
176
|
|
|
161
177
|
### Detection (0-5 min)
|
|
178
|
+
|
|
162
179
|
- [ ] Acknowledge alert
|
|
163
180
|
- [ ] Assess severity
|
|
164
181
|
- [ ] Declare incident if needed
|
|
165
182
|
- [ ] Assign yourself as IC
|
|
166
183
|
|
|
167
184
|
### Triage (5-15 min)
|
|
185
|
+
|
|
168
186
|
- [ ] Create incident channel
|
|
169
187
|
- [ ] Page relevant teams
|
|
170
188
|
- [ ] Start incident doc
|
|
171
189
|
- [ ] Begin timeline
|
|
172
190
|
|
|
173
191
|
### Communication (Ongoing)
|
|
192
|
+
|
|
174
193
|
- [ ] Post initial status update
|
|
175
194
|
- [ ] Update status every 30 min
|
|
176
195
|
- [ ] Coordinate with comms team
|
|
177
196
|
- [ ] Notify stakeholders
|
|
178
197
|
|
|
179
198
|
### Mitigation (Until resolved)
|
|
199
|
+
|
|
180
200
|
- [ ] Assign investigation tasks
|
|
181
201
|
- [ ] Consider rollback
|
|
182
202
|
- [ ] Implement workarounds
|
|
183
203
|
- [ ] Monitor progress
|
|
184
204
|
|
|
185
205
|
### Resolution
|
|
206
|
+
|
|
186
207
|
- [ ] Confirm service restored
|
|
187
208
|
- [ ] Post final status
|
|
188
209
|
- [ ] Schedule post-mortem
|
|
@@ -193,6 +214,7 @@ Template for handling and documenting incidents.
|
|
|
193
214
|
## Communication Templates
|
|
194
215
|
|
|
195
216
|
### Internal Status Update
|
|
217
|
+
|
|
196
218
|
```
|
|
197
219
|
🔴 [SEV-1] [Service] Incident - Update #[N]
|
|
198
220
|
|
|
@@ -205,6 +227,7 @@ IC: @[name] | Thread: [link]
|
|
|
205
227
|
```
|
|
206
228
|
|
|
207
229
|
### External Status Page
|
|
230
|
+
|
|
208
231
|
```
|
|
209
232
|
[Service Name] - [Status]
|
|
210
233
|
|
|
@@ -255,32 +278,39 @@ Last Updated: [Time] UTC
|
|
|
255
278
|
# Runbook: [Issue Name]
|
|
256
279
|
|
|
257
280
|
## Symptoms
|
|
281
|
+
|
|
258
282
|
- [What does this look like?]
|
|
259
283
|
- [What alerts fire?]
|
|
260
284
|
|
|
261
285
|
## Quick Diagnosis
|
|
286
|
+
|
|
262
287
|
1. Check [X] dashboard
|
|
263
288
|
2. Run `command`
|
|
264
289
|
3. Look for [pattern]
|
|
265
290
|
|
|
266
291
|
## Common Causes
|
|
292
|
+
|
|
267
293
|
1. [Cause 1]: [How to verify]
|
|
268
294
|
2. [Cause 2]: [How to verify]
|
|
269
295
|
|
|
270
296
|
## Resolution Steps
|
|
271
297
|
|
|
272
298
|
### For Cause 1
|
|
299
|
+
|
|
273
300
|
1. [Step 1]
|
|
274
301
|
2. [Step 2]
|
|
275
302
|
|
|
276
303
|
### For Cause 2
|
|
304
|
+
|
|
277
305
|
1. [Step 1]
|
|
278
306
|
2. [Step 2]
|
|
279
307
|
|
|
280
308
|
## Escalation
|
|
309
|
+
|
|
281
310
|
If not resolved in 15 min, page [team].
|
|
282
311
|
|
|
283
312
|
## Related
|
|
313
|
+
|
|
284
314
|
- [Link to other runbooks]
|
|
285
315
|
- [Link to documentation]
|
|
286
316
|
```
|
package/src/templates/agents/claude-code/skills/site-reliability-engineer/observability-patterns.md
CHANGED
|
@@ -9,12 +9,15 @@ Patterns for implementing effective observability in distributed systems.
|
|
|
9
9
|
## Three Pillars of Observability
|
|
10
10
|
|
|
11
11
|
### 1. Metrics (What)
|
|
12
|
+
|
|
12
13
|
Numeric measurements aggregated over time.
|
|
13
14
|
|
|
14
15
|
### 2. Logs (Why)
|
|
16
|
+
|
|
15
17
|
Discrete events with context.
|
|
16
18
|
|
|
17
19
|
### 3. Traces (Where)
|
|
20
|
+
|
|
18
21
|
Request paths through distributed systems.
|
|
19
22
|
|
|
20
23
|
---
|
|
@@ -23,10 +26,10 @@ Request paths through distributed systems.
|
|
|
23
26
|
|
|
24
27
|
### RED Method (Request-focused)
|
|
25
28
|
|
|
26
|
-
| Metric
|
|
27
|
-
|
|
28
|
-
| **R**ate
|
|
29
|
-
| **E**rrors
|
|
29
|
+
| Metric | Description |
|
|
30
|
+
| ------------ | -------------------------- |
|
|
31
|
+
| **R**ate | Requests per second |
|
|
32
|
+
| **E**rrors | Failed requests per second |
|
|
30
33
|
| **D**uration | Response time distribution |
|
|
31
34
|
|
|
32
35
|
```typescript
|
|
@@ -36,61 +39,64 @@ import { Counter, Histogram } from 'prom-client';
|
|
|
36
39
|
const requestCounter = new Counter({
|
|
37
40
|
name: 'http_requests_total',
|
|
38
41
|
help: 'Total HTTP requests',
|
|
39
|
-
labelNames: ['method', 'path', 'status']
|
|
42
|
+
labelNames: ['method', 'path', 'status'],
|
|
40
43
|
});
|
|
41
44
|
|
|
42
45
|
const requestDuration = new Histogram({
|
|
43
46
|
name: 'http_request_duration_seconds',
|
|
44
47
|
help: 'HTTP request duration',
|
|
45
48
|
labelNames: ['method', 'path'],
|
|
46
|
-
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5]
|
|
49
|
+
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
|
|
47
50
|
});
|
|
48
51
|
|
|
49
52
|
app.use((req, res, next) => {
|
|
50
53
|
const start = Date.now();
|
|
51
|
-
|
|
54
|
+
|
|
52
55
|
res.on('finish', () => {
|
|
53
56
|
const duration = (Date.now() - start) / 1000;
|
|
54
|
-
|
|
57
|
+
|
|
55
58
|
requestCounter.inc({
|
|
56
59
|
method: req.method,
|
|
57
60
|
path: req.route?.path || req.path,
|
|
58
|
-
status: res.statusCode
|
|
61
|
+
status: res.statusCode,
|
|
59
62
|
});
|
|
60
|
-
|
|
61
|
-
requestDuration.observe(
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
63
|
+
|
|
64
|
+
requestDuration.observe(
|
|
65
|
+
{
|
|
66
|
+
method: req.method,
|
|
67
|
+
path: req.route?.path || req.path,
|
|
68
|
+
},
|
|
69
|
+
duration
|
|
70
|
+
);
|
|
65
71
|
});
|
|
66
|
-
|
|
72
|
+
|
|
67
73
|
next();
|
|
68
74
|
});
|
|
69
75
|
```
|
|
70
76
|
|
|
71
77
|
### USE Method (Resource-focused)
|
|
72
78
|
|
|
73
|
-
| Metric
|
|
74
|
-
|
|
75
|
-
| **U**tilization | % time resource is busy
|
|
76
|
-
| **S**aturation
|
|
77
|
-
| **E**rrors
|
|
79
|
+
| Metric | Description |
|
|
80
|
+
| --------------- | ------------------------- |
|
|
81
|
+
| **U**tilization | % time resource is busy |
|
|
82
|
+
| **S**aturation | Queue depth, waiting work |
|
|
83
|
+
| **E**rrors | Error events |
|
|
78
84
|
|
|
79
85
|
```typescript
|
|
80
86
|
// System metrics
|
|
81
87
|
const cpuUtilization = new Gauge({
|
|
82
88
|
name: 'system_cpu_utilization',
|
|
83
|
-
help: 'CPU utilization percentage'
|
|
89
|
+
help: 'CPU utilization percentage',
|
|
84
90
|
});
|
|
85
91
|
|
|
86
92
|
const memoryUtilization = new Gauge({
|
|
87
93
|
name: 'system_memory_utilization',
|
|
88
|
-
help: 'Memory utilization percentage'
|
|
94
|
+
help: 'Memory utilization percentage',
|
|
89
95
|
});
|
|
90
96
|
|
|
91
97
|
const queueDepth = new Gauge({
|
|
92
98
|
name: 'job_queue_depth',
|
|
93
|
-
help: 'Number of jobs in queue'
|
|
99
|
+
help: 'Number of jobs in queue',
|
|
94
100
|
});
|
|
95
101
|
```
|
|
96
102
|
|
|
@@ -145,7 +151,7 @@ const storage = new AsyncLocalStorage<{ correlationId: string }>();
|
|
|
145
151
|
app.use((req, res, next) => {
|
|
146
152
|
const correlationId = req.headers['x-correlation-id'] || uuid();
|
|
147
153
|
res.setHeader('x-correlation-id', correlationId);
|
|
148
|
-
|
|
154
|
+
|
|
149
155
|
storage.run({ correlationId }, () => {
|
|
150
156
|
next();
|
|
151
157
|
});
|
|
@@ -154,22 +160,25 @@ app.use((req, res, next) => {
|
|
|
154
160
|
// Logger wrapper
|
|
155
161
|
function log(level: string, message: string, data: object = {}) {
|
|
156
162
|
const store = storage.getStore();
|
|
157
|
-
logger[level](
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
163
|
+
logger[level](
|
|
164
|
+
{
|
|
165
|
+
correlationId: store?.correlationId,
|
|
166
|
+
...data,
|
|
167
|
+
},
|
|
168
|
+
message
|
|
169
|
+
);
|
|
161
170
|
}
|
|
162
171
|
```
|
|
163
172
|
|
|
164
173
|
### Log Levels
|
|
165
174
|
|
|
166
|
-
| Level | When to Use
|
|
167
|
-
|
|
168
|
-
| ERROR | Operation failed, needs attention
|
|
169
|
-
| WARN
|
|
170
|
-
| INFO
|
|
171
|
-
| DEBUG | Detailed debugging information
|
|
172
|
-
| TRACE | Very detailed tracing
|
|
175
|
+
| Level | When to Use |
|
|
176
|
+
| ----- | -------------------------------------- |
|
|
177
|
+
| ERROR | Operation failed, needs attention |
|
|
178
|
+
| WARN | Unexpected but handled condition |
|
|
179
|
+
| INFO | Significant events (startup, requests) |
|
|
180
|
+
| DEBUG | Detailed debugging information |
|
|
181
|
+
| TRACE | Very detailed tracing |
|
|
173
182
|
|
|
174
183
|
---
|
|
175
184
|
|
|
@@ -190,7 +199,7 @@ const provider = new NodeTracerProvider();
|
|
|
190
199
|
provider.addSpanProcessor(
|
|
191
200
|
new SimpleSpanProcessor(
|
|
192
201
|
new JaegerExporter({
|
|
193
|
-
endpoint: 'http://jaeger:14268/api/traces'
|
|
202
|
+
endpoint: 'http://jaeger:14268/api/traces',
|
|
194
203
|
})
|
|
195
204
|
)
|
|
196
205
|
);
|
|
@@ -198,10 +207,7 @@ provider.addSpanProcessor(
|
|
|
198
207
|
provider.register();
|
|
199
208
|
|
|
200
209
|
registerInstrumentations({
|
|
201
|
-
instrumentations: [
|
|
202
|
-
new HttpInstrumentation(),
|
|
203
|
-
new ExpressInstrumentation()
|
|
204
|
-
]
|
|
210
|
+
instrumentations: [new HttpInstrumentation(), new ExpressInstrumentation()],
|
|
205
211
|
});
|
|
206
212
|
```
|
|
207
213
|
|
|
@@ -213,23 +219,23 @@ import { trace } from '@opentelemetry/api';
|
|
|
213
219
|
const tracer = trace.getTracer('my-service');
|
|
214
220
|
|
|
215
221
|
async function processOrder(order: Order) {
|
|
216
|
-
return tracer.startActiveSpan('processOrder', async (span) => {
|
|
222
|
+
return tracer.startActiveSpan('processOrder', async span => {
|
|
217
223
|
try {
|
|
218
224
|
span.setAttribute('order.id', order.id);
|
|
219
225
|
span.setAttribute('order.amount', order.amount);
|
|
220
|
-
|
|
226
|
+
|
|
221
227
|
// Child span for payment
|
|
222
|
-
await tracer.startActiveSpan('processPayment', async (paymentSpan) => {
|
|
228
|
+
await tracer.startActiveSpan('processPayment', async paymentSpan => {
|
|
223
229
|
await paymentService.charge(order);
|
|
224
230
|
paymentSpan.end();
|
|
225
231
|
});
|
|
226
|
-
|
|
232
|
+
|
|
227
233
|
// Child span for inventory
|
|
228
|
-
await tracer.startActiveSpan('updateInventory', async (inventorySpan) => {
|
|
234
|
+
await tracer.startActiveSpan('updateInventory', async inventorySpan => {
|
|
229
235
|
await inventoryService.reserve(order.items);
|
|
230
236
|
inventorySpan.end();
|
|
231
237
|
});
|
|
232
|
-
|
|
238
|
+
|
|
233
239
|
span.setStatus({ code: SpanStatusCode.OK });
|
|
234
240
|
} catch (error) {
|
|
235
241
|
span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
|
|
@@ -262,8 +268,8 @@ groups:
|
|
|
262
268
|
labels:
|
|
263
269
|
severity: critical
|
|
264
270
|
annotations:
|
|
265
|
-
summary: "High error budget burn rate"
|
|
266
|
-
|
|
271
|
+
summary: 'High error budget burn rate'
|
|
272
|
+
|
|
267
273
|
# Slow burn: 5% budget in 6 hours
|
|
268
274
|
- alert: SlowErrorBudgetBurn
|
|
269
275
|
expr: |
|
|
@@ -283,9 +289,9 @@ rules:
|
|
|
283
289
|
# Good: User-facing symptom
|
|
284
290
|
- alert: HighLatency
|
|
285
291
|
expr: http_request_duration_seconds:p99 > 0.5
|
|
286
|
-
|
|
292
|
+
|
|
287
293
|
# Avoid: Cause-based
|
|
288
|
-
- alert: HighCPU
|
|
294
|
+
- alert: HighCPU # May not affect users
|
|
289
295
|
expr: cpu_utilization > 80
|
|
290
296
|
```
|
|
291
297
|
|
|
@@ -298,36 +304,36 @@ rules:
|
|
|
298
304
|
```yaml
|
|
299
305
|
# Grafana dashboard structure
|
|
300
306
|
dashboard:
|
|
301
|
-
title: "Service Overview"
|
|
307
|
+
title: 'Service Overview'
|
|
302
308
|
rows:
|
|
303
|
-
- title: "Traffic"
|
|
309
|
+
- title: 'Traffic'
|
|
304
310
|
panels:
|
|
305
311
|
- type: graph
|
|
306
|
-
title: "Requests per Second"
|
|
307
|
-
query: "rate(http_requests_total[5m])"
|
|
308
|
-
|
|
309
|
-
- title: "Errors"
|
|
312
|
+
title: 'Requests per Second'
|
|
313
|
+
query: 'rate(http_requests_total[5m])'
|
|
314
|
+
|
|
315
|
+
- title: 'Errors'
|
|
310
316
|
panels:
|
|
311
317
|
- type: graph
|
|
312
|
-
title: "Error Rate"
|
|
318
|
+
title: 'Error Rate'
|
|
313
319
|
query: "rate(http_requests_total{status=~'5..'}[5m])"
|
|
314
|
-
|
|
315
|
-
- title: "Latency"
|
|
320
|
+
|
|
321
|
+
- title: 'Latency'
|
|
316
322
|
panels:
|
|
317
323
|
- type: graph
|
|
318
|
-
title: "Latency Percentiles"
|
|
324
|
+
title: 'Latency Percentiles'
|
|
319
325
|
queries:
|
|
320
|
-
- "histogram_quantile(0.50, ...)"
|
|
321
|
-
- "histogram_quantile(0.95, ...)"
|
|
322
|
-
- "histogram_quantile(0.99, ...)"
|
|
323
|
-
|
|
324
|
-
- title: "Saturation"
|
|
326
|
+
- 'histogram_quantile(0.50, ...)'
|
|
327
|
+
- 'histogram_quantile(0.95, ...)'
|
|
328
|
+
- 'histogram_quantile(0.99, ...)'
|
|
329
|
+
|
|
330
|
+
- title: 'Saturation'
|
|
325
331
|
panels:
|
|
326
332
|
- type: graph
|
|
327
|
-
title: "Resource Usage"
|
|
333
|
+
title: 'Resource Usage'
|
|
328
334
|
queries:
|
|
329
|
-
- "cpu_utilization"
|
|
330
|
-
- "memory_utilization"
|
|
335
|
+
- 'cpu_utilization'
|
|
336
|
+
- 'memory_utilization'
|
|
331
337
|
```
|
|
332
338
|
|
|
333
339
|
---
|
|
@@ -335,24 +341,28 @@ dashboard:
|
|
|
335
341
|
## Observability Checklist
|
|
336
342
|
|
|
337
343
|
### Metrics
|
|
344
|
+
|
|
338
345
|
- [ ] RED metrics for all services
|
|
339
346
|
- [ ] USE metrics for resources
|
|
340
347
|
- [ ] Business metrics tracked
|
|
341
348
|
- [ ] SLI metrics defined
|
|
342
349
|
|
|
343
350
|
### Logging
|
|
351
|
+
|
|
344
352
|
- [ ] Structured JSON logs
|
|
345
353
|
- [ ] Correlation IDs propagated
|
|
346
354
|
- [ ] Appropriate log levels
|
|
347
355
|
- [ ] Sensitive data masked
|
|
348
356
|
|
|
349
357
|
### Tracing
|
|
358
|
+
|
|
350
359
|
- [ ] Distributed tracing enabled
|
|
351
360
|
- [ ] Span context propagated
|
|
352
361
|
- [ ] Key operations instrumented
|
|
353
362
|
- [ ] Sampling configured
|
|
354
363
|
|
|
355
364
|
### Alerting
|
|
365
|
+
|
|
356
366
|
- [ ] SLO-based alerts
|
|
357
367
|
- [ ] Multi-window burn rates
|
|
358
368
|
- [ ] Clear runbooks linked
|