musubi-sdd 5.1.0 → 5.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +106 -48
- package/README.md +110 -32
- package/bin/musubi-analyze.js +74 -67
- package/bin/musubi-browser.js +27 -26
- package/bin/musubi-change.js +48 -47
- package/bin/musubi-checkpoint.js +10 -7
- package/bin/musubi-convert.js +25 -25
- package/bin/musubi-costs.js +27 -10
- package/bin/musubi-gui.js +52 -46
- package/bin/musubi-init.js +1952 -10
- package/bin/musubi-orchestrate.js +327 -239
- package/bin/musubi-remember.js +69 -56
- package/bin/musubi-resolve.js +53 -45
- package/bin/musubi-trace.js +51 -22
- package/bin/musubi-validate.js +39 -30
- package/bin/musubi-workflow.js +33 -34
- package/bin/musubi.js +39 -2
- package/package.json +1 -1
- package/src/agents/agent-loop.js +94 -95
- package/src/agents/agentic/code-generator.js +119 -109
- package/src/agents/agentic/code-reviewer.js +105 -108
- package/src/agents/agentic/index.js +4 -4
- package/src/agents/browser/action-executor.js +13 -13
- package/src/agents/browser/ai-comparator.js +11 -10
- package/src/agents/browser/context-manager.js +6 -6
- package/src/agents/browser/index.js +5 -5
- package/src/agents/browser/nl-parser.js +31 -46
- package/src/agents/browser/screenshot.js +2 -2
- package/src/agents/browser/test-generator.js +6 -4
- package/src/agents/function-tool.js +71 -65
- package/src/agents/index.js +7 -7
- package/src/agents/schema-generator.js +98 -94
- package/src/analyzers/ast-extractor.js +158 -146
- package/src/analyzers/codegraph-auto-update.js +858 -0
- package/src/analyzers/complexity-analyzer.js +536 -0
- package/src/analyzers/context-optimizer.js +241 -126
- package/src/analyzers/impact-analyzer.js +1 -1
- package/src/analyzers/large-project-analyzer.js +766 -0
- package/src/analyzers/repository-map.js +77 -81
- package/src/analyzers/security-analyzer.js +19 -11
- package/src/analyzers/stuck-detector.js +19 -17
- package/src/converters/index.js +78 -57
- package/src/converters/ir/types.js +12 -12
- package/src/converters/parsers/musubi-parser.js +134 -126
- package/src/converters/parsers/openapi-parser.js +70 -53
- package/src/converters/parsers/speckit-parser.js +239 -175
- package/src/converters/writers/musubi-writer.js +123 -118
- package/src/converters/writers/speckit-writer.js +124 -113
- package/src/generators/rust-migration-generator.js +512 -0
- package/src/gui/public/index.html +1365 -1211
- package/src/gui/server.js +41 -40
- package/src/gui/services/file-watcher.js +23 -8
- package/src/gui/services/project-scanner.js +26 -20
- package/src/gui/services/replanning-service.js +27 -23
- package/src/gui/services/traceability-service.js +8 -8
- package/src/gui/services/workflow-service.js +14 -7
- package/src/index.js +151 -0
- package/src/integrations/cicd.js +90 -104
- package/src/integrations/codegraph-mcp.js +643 -0
- package/src/integrations/documentation.js +142 -103
- package/src/integrations/examples.js +95 -80
- package/src/integrations/github-client.js +17 -17
- package/src/integrations/index.js +5 -5
- package/src/integrations/mcp/index.js +21 -21
- package/src/integrations/mcp/mcp-context-provider.js +76 -78
- package/src/integrations/mcp/mcp-discovery.js +74 -72
- package/src/integrations/mcp/mcp-tool-registry.js +99 -94
- package/src/integrations/mcp-connector.js +70 -66
- package/src/integrations/platforms.js +50 -49
- package/src/integrations/tool-discovery.js +37 -31
- package/src/llm-providers/anthropic-provider.js +11 -11
- package/src/llm-providers/base-provider.js +16 -18
- package/src/llm-providers/copilot-provider.js +22 -19
- package/src/llm-providers/index.js +26 -25
- package/src/llm-providers/ollama-provider.js +11 -11
- package/src/llm-providers/openai-provider.js +12 -12
- package/src/managers/agent-memory.js +36 -24
- package/src/managers/checkpoint-manager.js +4 -8
- package/src/managers/delta-spec.js +19 -19
- package/src/managers/index.js +13 -4
- package/src/managers/memory-condenser.js +35 -45
- package/src/managers/repo-skill-manager.js +57 -31
- package/src/managers/skill-loader.js +25 -22
- package/src/managers/skill-tools.js +36 -72
- package/src/managers/workflow.js +30 -22
- package/src/monitoring/cost-tracker.js +48 -46
- package/src/monitoring/incident-manager.js +116 -106
- package/src/monitoring/index.js +144 -134
- package/src/monitoring/observability.js +75 -62
- package/src/monitoring/quality-dashboard.js +45 -41
- package/src/monitoring/release-manager.js +63 -53
- package/src/orchestration/agent-skill-binding.js +39 -47
- package/src/orchestration/error-handler.js +65 -107
- package/src/orchestration/guardrails/base-guardrail.js +26 -24
- package/src/orchestration/guardrails/guardrail-rules.js +50 -64
- package/src/orchestration/guardrails/index.js +5 -5
- package/src/orchestration/guardrails/input-guardrail.js +58 -45
- package/src/orchestration/guardrails/output-guardrail.js +104 -81
- package/src/orchestration/guardrails/safety-check.js +79 -79
- package/src/orchestration/index.js +38 -55
- package/src/orchestration/mcp-tool-adapters.js +96 -99
- package/src/orchestration/orchestration-engine.js +21 -21
- package/src/orchestration/pattern-registry.js +60 -45
- package/src/orchestration/patterns/auto.js +34 -47
- package/src/orchestration/patterns/group-chat.js +59 -65
- package/src/orchestration/patterns/handoff.js +67 -65
- package/src/orchestration/patterns/human-in-loop.js +51 -72
- package/src/orchestration/patterns/nested.js +25 -40
- package/src/orchestration/patterns/sequential.js +35 -34
- package/src/orchestration/patterns/swarm.js +63 -56
- package/src/orchestration/patterns/triage.js +150 -109
- package/src/orchestration/reasoning/index.js +9 -9
- package/src/orchestration/reasoning/planning-engine.js +143 -140
- package/src/orchestration/reasoning/reasoning-engine.js +206 -144
- package/src/orchestration/reasoning/self-correction.js +121 -128
- package/src/orchestration/replanning/adaptive-goal-modifier.js +107 -112
- package/src/orchestration/replanning/alternative-generator.js +37 -42
- package/src/orchestration/replanning/config.js +63 -59
- package/src/orchestration/replanning/goal-progress-tracker.js +98 -100
- package/src/orchestration/replanning/index.js +24 -20
- package/src/orchestration/replanning/plan-evaluator.js +49 -50
- package/src/orchestration/replanning/plan-monitor.js +32 -28
- package/src/orchestration/replanning/proactive-path-optimizer.js +175 -178
- package/src/orchestration/replanning/replan-history.js +33 -26
- package/src/orchestration/replanning/replanning-engine.js +106 -108
- package/src/orchestration/skill-executor.js +107 -109
- package/src/orchestration/skill-registry.js +85 -89
- package/src/orchestration/workflow-examples.js +228 -231
- package/src/orchestration/workflow-executor.js +65 -68
- package/src/orchestration/workflow-orchestrator.js +72 -73
- package/src/phase4-integration.js +47 -40
- package/src/phase5-integration.js +89 -30
- package/src/reporters/coverage-report.js +82 -30
- package/src/reporters/hierarchical-reporter.js +498 -0
- package/src/reporters/traceability-matrix-report.js +29 -20
- package/src/resolvers/issue-resolver.js +43 -31
- package/src/steering/advanced-validation.js +133 -124
- package/src/steering/auto-updater.js +60 -73
- package/src/steering/index.js +6 -6
- package/src/steering/quality-metrics.js +41 -35
- package/src/steering/steering-auto-update.js +83 -86
- package/src/steering/steering-validator.js +98 -106
- package/src/steering/template-constraints.js +53 -54
- package/src/templates/agents/claude-code/CLAUDE.md +32 -32
- package/src/templates/agents/claude-code/skills/agent-assistant/SKILL.md +13 -5
- package/src/templates/agents/claude-code/skills/ai-ml-engineer/mlops-guide.md +23 -23
- package/src/templates/agents/claude-code/skills/ai-ml-engineer/model-card-template.md +60 -41
- package/src/templates/agents/claude-code/skills/api-designer/api-patterns.md +27 -19
- package/src/templates/agents/claude-code/skills/api-designer/openapi-template.md +11 -7
- package/src/templates/agents/claude-code/skills/bug-hunter/SKILL.md +4 -3
- package/src/templates/agents/claude-code/skills/bug-hunter/root-cause-analysis.md +37 -15
- package/src/templates/agents/claude-code/skills/change-impact-analyzer/dependency-graph-patterns.md +36 -42
- package/src/templates/agents/claude-code/skills/change-impact-analyzer/impact-analysis-template.md +69 -60
- package/src/templates/agents/claude-code/skills/cloud-architect/aws-patterns.md +31 -38
- package/src/templates/agents/claude-code/skills/cloud-architect/azure-patterns.md +28 -23
- package/src/templates/agents/claude-code/skills/code-reviewer/SKILL.md +61 -0
- package/src/templates/agents/claude-code/skills/code-reviewer/best-practices.md +27 -0
- package/src/templates/agents/claude-code/skills/code-reviewer/review-checklist.md +29 -10
- package/src/templates/agents/claude-code/skills/code-reviewer/review-standards.md +29 -24
- package/src/templates/agents/claude-code/skills/constitution-enforcer/SKILL.md +8 -6
- package/src/templates/agents/claude-code/skills/constitution-enforcer/constitutional-articles.md +62 -26
- package/src/templates/agents/claude-code/skills/constitution-enforcer/phase-minus-one-gates.md +35 -16
- package/src/templates/agents/claude-code/skills/database-administrator/backup-recovery.md +27 -17
- package/src/templates/agents/claude-code/skills/database-administrator/tuning-guide.md +25 -20
- package/src/templates/agents/claude-code/skills/database-schema-designer/schema-patterns.md +39 -22
- package/src/templates/agents/claude-code/skills/devops-engineer/ci-cd-templates.md +25 -22
- package/src/templates/agents/claude-code/skills/issue-resolver/SKILL.md +24 -21
- package/src/templates/agents/claude-code/skills/orchestrator/SKILL.md +148 -63
- package/src/templates/agents/claude-code/skills/orchestrator/patterns.md +35 -16
- package/src/templates/agents/claude-code/skills/orchestrator/selection-matrix.md +69 -64
- package/src/templates/agents/claude-code/skills/performance-engineer/optimization-playbook.md +47 -47
- package/src/templates/agents/claude-code/skills/performance-optimizer/SKILL.md +69 -0
- package/src/templates/agents/claude-code/skills/performance-optimizer/benchmark-template.md +63 -45
- package/src/templates/agents/claude-code/skills/performance-optimizer/optimization-patterns.md +33 -35
- package/src/templates/agents/claude-code/skills/project-manager/SKILL.md +7 -6
- package/src/templates/agents/claude-code/skills/project-manager/agile-ceremonies.md +47 -28
- package/src/templates/agents/claude-code/skills/project-manager/project-templates.md +94 -78
- package/src/templates/agents/claude-code/skills/quality-assurance/SKILL.md +20 -17
- package/src/templates/agents/claude-code/skills/quality-assurance/qa-plan-template.md +63 -49
- package/src/templates/agents/claude-code/skills/release-coordinator/SKILL.md +5 -5
- package/src/templates/agents/claude-code/skills/release-coordinator/feature-flag-guide.md +30 -26
- package/src/templates/agents/claude-code/skills/release-coordinator/release-plan-template.md +67 -35
- package/src/templates/agents/claude-code/skills/requirements-analyst/ears-format.md +54 -42
- package/src/templates/agents/claude-code/skills/requirements-analyst/validation-rules.md +36 -33
- package/src/templates/agents/claude-code/skills/security-auditor/SKILL.md +77 -19
- package/src/templates/agents/claude-code/skills/security-auditor/audit-checklists.md +24 -24
- package/src/templates/agents/claude-code/skills/security-auditor/owasp-top-10.md +61 -20
- package/src/templates/agents/claude-code/skills/security-auditor/vulnerability-patterns.md +43 -11
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/incident-response-template.md +55 -25
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/observability-patterns.md +78 -68
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/slo-sli-guide.md +73 -53
- package/src/templates/agents/claude-code/skills/software-developer/solid-principles.md +83 -37
- package/src/templates/agents/claude-code/skills/software-developer/test-first-workflow.md +38 -31
- package/src/templates/agents/claude-code/skills/steering/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/steering/auto-update-rules.md +31 -0
- package/src/templates/agents/claude-code/skills/system-architect/adr-template.md +25 -7
- package/src/templates/agents/claude-code/skills/system-architect/c4-model-guide.md +74 -61
- package/src/templates/agents/claude-code/skills/technical-writer/doc-templates/documentation-templates.md +70 -52
- package/src/templates/agents/claude-code/skills/test-engineer/SKILL.md +2 -0
- package/src/templates/agents/claude-code/skills/test-engineer/ears-test-mapping.md +75 -71
- package/src/templates/agents/claude-code/skills/test-engineer/test-types.md +85 -63
- package/src/templates/agents/claude-code/skills/traceability-auditor/coverage-matrix-template.md +39 -36
- package/src/templates/agents/claude-code/skills/traceability-auditor/gap-detection-rules.md +22 -17
- package/src/templates/agents/claude-code/skills/ui-ux-designer/SKILL.md +1 -0
- package/src/templates/agents/claude-code/skills/ui-ux-designer/accessibility-guidelines.md +49 -75
- package/src/templates/agents/claude-code/skills/ui-ux-designer/design-system-components.md +71 -59
- package/src/templates/agents/codex/AGENTS.md +74 -42
- package/src/templates/agents/cursor/AGENTS.md +74 -42
- package/src/templates/agents/gemini-cli/GEMINI.md +74 -42
- package/src/templates/agents/github-copilot/AGENTS.md +83 -51
- package/src/templates/agents/qwen-code/QWEN.md +74 -42
- package/src/templates/agents/windsurf/AGENTS.md +74 -42
- package/src/templates/architectures/README.md +41 -0
- package/src/templates/architectures/clean-architecture/README.md +113 -0
- package/src/templates/architectures/event-driven/README.md +162 -0
- package/src/templates/architectures/hexagonal/README.md +130 -0
- package/src/templates/index.js +6 -1
- package/src/templates/locale-manager.js +16 -16
- package/src/templates/shared/delta-spec-template.md +20 -13
- package/src/templates/shared/github-actions/musubi-issue-resolver.yml +5 -5
- package/src/templates/shared/github-actions/musubi-security-check.yml +3 -3
- package/src/templates/shared/github-actions/musubi-validate.yml +4 -4
- package/src/templates/shared/steering/structure.md +95 -0
- package/src/templates/skills/browser-agent.md +21 -16
- package/src/templates/skills/web-gui.md +8 -0
- package/src/templates/template-constraints.js +50 -53
- package/src/validators/advanced-validation.js +30 -36
- package/src/validators/constitutional-validator.js +77 -73
- package/src/validators/critic-system.js +49 -59
- package/src/validators/delta-format.js +59 -55
- package/src/validators/traceability-validator.js +7 -11
package/src/monitoring/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Monitoring Module - SRE, Observability, and Release Management
|
|
3
|
-
*
|
|
3
|
+
*
|
|
4
4
|
* Provides monitoring capabilities for MUSUBI-powered applications:
|
|
5
5
|
* - SLI/SLO definition and tracking
|
|
6
6
|
* - Alerting rules generation
|
|
@@ -18,7 +18,7 @@ const SLOType = {
|
|
|
18
18
|
LATENCY: 'latency',
|
|
19
19
|
THROUGHPUT: 'throughput',
|
|
20
20
|
ERROR_RATE: 'error-rate',
|
|
21
|
-
CORRECTNESS: 'correctness'
|
|
21
|
+
CORRECTNESS: 'correctness',
|
|
22
22
|
};
|
|
23
23
|
|
|
24
24
|
/**
|
|
@@ -27,7 +27,7 @@ const SLOType = {
|
|
|
27
27
|
const AlertSeverity = {
|
|
28
28
|
CRITICAL: 'critical',
|
|
29
29
|
WARNING: 'warning',
|
|
30
|
-
INFO: 'info'
|
|
30
|
+
INFO: 'info',
|
|
31
31
|
};
|
|
32
32
|
|
|
33
33
|
/**
|
|
@@ -37,7 +37,7 @@ const MetricType = {
|
|
|
37
37
|
COUNTER: 'counter',
|
|
38
38
|
GAUGE: 'gauge',
|
|
39
39
|
HISTOGRAM: 'histogram',
|
|
40
|
-
SUMMARY: 'summary'
|
|
40
|
+
SUMMARY: 'summary',
|
|
41
41
|
};
|
|
42
42
|
|
|
43
43
|
/**
|
|
@@ -62,16 +62,16 @@ class SLI {
|
|
|
62
62
|
switch (this.type) {
|
|
63
63
|
case SLOType.AVAILABILITY:
|
|
64
64
|
return `sum(rate(${this.metric}_success_total[5m])) / sum(rate(${this.metric}_total[5m]))`;
|
|
65
|
-
|
|
65
|
+
|
|
66
66
|
case SLOType.LATENCY:
|
|
67
67
|
return `histogram_quantile(0.95, sum(rate(${this.metric}_bucket[5m])) by (le))`;
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
case SLOType.ERROR_RATE:
|
|
70
70
|
return `sum(rate(${this.metric}_errors_total[5m])) / sum(rate(${this.metric}_total[5m]))`;
|
|
71
|
-
|
|
71
|
+
|
|
72
72
|
case SLOType.THROUGHPUT:
|
|
73
73
|
return `sum(rate(${this.metric}_total[5m]))`;
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
default:
|
|
76
76
|
return this.goodEventsQuery || this.metric;
|
|
77
77
|
}
|
|
@@ -84,7 +84,7 @@ class SLI {
|
|
|
84
84
|
type: this.type,
|
|
85
85
|
metric: this.metric,
|
|
86
86
|
unit: this.unit,
|
|
87
|
-
prometheusQuery: this.toPrometheusQuery()
|
|
87
|
+
prometheusQuery: this.toPrometheusQuery(),
|
|
88
88
|
};
|
|
89
89
|
}
|
|
90
90
|
}
|
|
@@ -101,7 +101,7 @@ class SLO {
|
|
|
101
101
|
this.window = options.window || '30d'; // Measurement window
|
|
102
102
|
this.burnRateThresholds = options.burnRateThresholds || {
|
|
103
103
|
critical: 14.4, // 1 hour to exhaust error budget
|
|
104
|
-
warning: 6
|
|
104
|
+
warning: 6, // 6 hours to exhaust error budget
|
|
105
105
|
};
|
|
106
106
|
}
|
|
107
107
|
|
|
@@ -112,7 +112,7 @@ class SLO {
|
|
|
112
112
|
return {
|
|
113
113
|
total: 1 - this.target,
|
|
114
114
|
remaining: null, // Calculated at runtime
|
|
115
|
-
consumptionRate: null
|
|
115
|
+
consumptionRate: null,
|
|
116
116
|
};
|
|
117
117
|
}
|
|
118
118
|
|
|
@@ -121,22 +121,22 @@ class SLO {
|
|
|
121
121
|
*/
|
|
122
122
|
toBurnRateAlert() {
|
|
123
123
|
const shortWindow = '5m';
|
|
124
|
-
const
|
|
125
|
-
|
|
124
|
+
const _longWindow = '1h';
|
|
125
|
+
|
|
126
126
|
return {
|
|
127
127
|
name: `${this.name}_high_burn_rate`,
|
|
128
128
|
expr: `(
|
|
129
129
|
${this.sli.toPrometheusQuery()}
|
|
130
|
-
) < ${this.target - (
|
|
130
|
+
) < ${this.target - (1 - this.target) * this.burnRateThresholds.critical}`,
|
|
131
131
|
for: shortWindow,
|
|
132
132
|
labels: {
|
|
133
133
|
severity: AlertSeverity.CRITICAL,
|
|
134
|
-
slo: this.name
|
|
134
|
+
slo: this.name,
|
|
135
135
|
},
|
|
136
136
|
annotations: {
|
|
137
137
|
summary: `High burn rate on SLO: ${this.name}`,
|
|
138
|
-
description: `Error budget will be exhausted within 1 hour at current rate
|
|
139
|
-
}
|
|
138
|
+
description: `Error budget will be exhausted within 1 hour at current rate`,
|
|
139
|
+
},
|
|
140
140
|
};
|
|
141
141
|
}
|
|
142
142
|
|
|
@@ -149,7 +149,7 @@ class SLO {
|
|
|
149
149
|
targetPercentage: `${(this.target * 100).toFixed(2)}%`,
|
|
150
150
|
window: this.window,
|
|
151
151
|
errorBudget: this.calculateErrorBudget(),
|
|
152
|
-
burnRateAlert: this.toBurnRateAlert()
|
|
152
|
+
burnRateAlert: this.toBurnRateAlert(),
|
|
153
153
|
};
|
|
154
154
|
}
|
|
155
155
|
}
|
|
@@ -176,7 +176,9 @@ class AlertRule {
|
|
|
176
176
|
for: ${this.for}
|
|
177
177
|
labels:
|
|
178
178
|
severity: ${this.severity}
|
|
179
|
-
${Object.entries(this.labels)
|
|
179
|
+
${Object.entries(this.labels)
|
|
180
|
+
.map(([k, v]) => ` ${k}: ${v}`)
|
|
181
|
+
.join('\n')}
|
|
180
182
|
annotations:
|
|
181
183
|
summary: "${this.annotations.summary || this.name}"
|
|
182
184
|
description: "${this.annotations.description || ''}"`;
|
|
@@ -189,7 +191,7 @@ ${Object.entries(this.labels).map(([k, v]) => ` ${k}: ${v}`).join('\n')}
|
|
|
189
191
|
for: this.for,
|
|
190
192
|
severity: this.severity,
|
|
191
193
|
labels: this.labels,
|
|
192
|
-
annotations: this.annotations
|
|
194
|
+
annotations: this.annotations,
|
|
193
195
|
};
|
|
194
196
|
}
|
|
195
197
|
}
|
|
@@ -214,7 +216,7 @@ class HealthCheck {
|
|
|
214
216
|
name: check.name,
|
|
215
217
|
type: check.type || 'dependency',
|
|
216
218
|
critical: check.critical !== false,
|
|
217
|
-
check: check.check
|
|
219
|
+
check: check.check,
|
|
218
220
|
});
|
|
219
221
|
return this;
|
|
220
222
|
}
|
|
@@ -231,22 +233,20 @@ class HealthCheck {
|
|
|
231
233
|
const startTime = Date.now();
|
|
232
234
|
const checkResult = await Promise.race([
|
|
233
235
|
check.check(),
|
|
234
|
-
new Promise((_, reject) =>
|
|
235
|
-
setTimeout(() => reject(new Error('Timeout')), this.timeout)
|
|
236
|
-
)
|
|
236
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), this.timeout)),
|
|
237
237
|
]);
|
|
238
|
-
|
|
238
|
+
|
|
239
239
|
results.push({
|
|
240
240
|
name: check.name,
|
|
241
241
|
status: 'healthy',
|
|
242
242
|
latency: Date.now() - startTime,
|
|
243
|
-
details: checkResult
|
|
243
|
+
details: checkResult,
|
|
244
244
|
});
|
|
245
245
|
} catch (error) {
|
|
246
246
|
results.push({
|
|
247
247
|
name: check.name,
|
|
248
248
|
status: 'unhealthy',
|
|
249
|
-
error: error.message
|
|
249
|
+
error: error.message,
|
|
250
250
|
});
|
|
251
251
|
if (check.critical) healthy = false;
|
|
252
252
|
}
|
|
@@ -255,7 +255,7 @@ class HealthCheck {
|
|
|
255
255
|
return {
|
|
256
256
|
status: healthy ? 'healthy' : 'unhealthy',
|
|
257
257
|
timestamp: new Date().toISOString(),
|
|
258
|
-
checks: results
|
|
258
|
+
checks: results,
|
|
259
259
|
};
|
|
260
260
|
}
|
|
261
261
|
|
|
@@ -263,7 +263,7 @@ class HealthCheck {
|
|
|
263
263
|
* Generate Express.js health endpoint handler
|
|
264
264
|
*/
|
|
265
265
|
toExpressHandler() {
|
|
266
|
-
const
|
|
266
|
+
const _check = this;
|
|
267
267
|
return `
|
|
268
268
|
app.get('${this.endpoint}', async (req, res) => {
|
|
269
269
|
const health = await healthCheck.execute();
|
|
@@ -289,8 +289,8 @@ app.get('${this.endpoint}/ready', async (req, res) => {
|
|
|
289
289
|
checks: this.checks.map(c => ({
|
|
290
290
|
name: c.name,
|
|
291
291
|
type: c.type,
|
|
292
|
-
critical: c.critical
|
|
293
|
-
}))
|
|
292
|
+
critical: c.critical,
|
|
293
|
+
})),
|
|
294
294
|
};
|
|
295
295
|
}
|
|
296
296
|
}
|
|
@@ -361,9 +361,8 @@ class MonitoringConfig extends EventEmitter {
|
|
|
361
361
|
* Define a health check
|
|
362
362
|
*/
|
|
363
363
|
defineHealthCheck(healthCheck) {
|
|
364
|
-
const hcInstance =
|
|
365
|
-
? healthCheck
|
|
366
|
-
: new HealthCheck(healthCheck);
|
|
364
|
+
const hcInstance =
|
|
365
|
+
healthCheck instanceof HealthCheck ? healthCheck : new HealthCheck(healthCheck);
|
|
367
366
|
this.healthChecks.set(hcInstance.name, hcInstance);
|
|
368
367
|
this.emit('healthCheckAdded', hcInstance);
|
|
369
368
|
return this;
|
|
@@ -384,7 +383,7 @@ class MonitoringConfig extends EventEmitter {
|
|
|
384
383
|
name: metric.name,
|
|
385
384
|
type: metric.type || MetricType.COUNTER,
|
|
386
385
|
help: metric.help || '',
|
|
387
|
-
labels: metric.labels || []
|
|
386
|
+
labels: metric.labels || [],
|
|
388
387
|
});
|
|
389
388
|
return this;
|
|
390
389
|
}
|
|
@@ -394,22 +393,24 @@ class MonitoringConfig extends EventEmitter {
|
|
|
394
393
|
*/
|
|
395
394
|
toPrometheusConfig() {
|
|
396
395
|
const rules = [];
|
|
397
|
-
|
|
396
|
+
|
|
398
397
|
// Generate SLO-based alerts
|
|
399
398
|
for (const slo of this.slos.values()) {
|
|
400
399
|
rules.push(slo.toBurnRateAlert());
|
|
401
400
|
}
|
|
402
|
-
|
|
401
|
+
|
|
403
402
|
// Add custom alerts
|
|
404
403
|
for (const alert of this.alerts.values()) {
|
|
405
404
|
rules.push(alert.toJSON());
|
|
406
405
|
}
|
|
407
406
|
|
|
408
407
|
return {
|
|
409
|
-
groups: [
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
408
|
+
groups: [
|
|
409
|
+
{
|
|
410
|
+
name: `${this.serviceName}-alerts`,
|
|
411
|
+
rules,
|
|
412
|
+
},
|
|
413
|
+
],
|
|
413
414
|
};
|
|
414
415
|
}
|
|
415
416
|
|
|
@@ -427,10 +428,12 @@ class MonitoringConfig extends EventEmitter {
|
|
|
427
428
|
type: 'gauge',
|
|
428
429
|
title: slo.name,
|
|
429
430
|
gridPos: { x: 0, y, w: 8, h: 6 },
|
|
430
|
-
targets: [
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
431
|
+
targets: [
|
|
432
|
+
{
|
|
433
|
+
expr: slo.sli.toPrometheusQuery(),
|
|
434
|
+
legendFormat: slo.name,
|
|
435
|
+
},
|
|
436
|
+
],
|
|
434
437
|
fieldConfig: {
|
|
435
438
|
defaults: {
|
|
436
439
|
thresholds: {
|
|
@@ -438,14 +441,14 @@ class MonitoringConfig extends EventEmitter {
|
|
|
438
441
|
steps: [
|
|
439
442
|
{ color: 'red', value: null },
|
|
440
443
|
{ color: 'yellow', value: slo.target - 0.01 },
|
|
441
|
-
{ color: 'green', value: slo.target }
|
|
442
|
-
]
|
|
444
|
+
{ color: 'green', value: slo.target },
|
|
445
|
+
],
|
|
443
446
|
},
|
|
444
447
|
min: 0,
|
|
445
448
|
max: 1,
|
|
446
|
-
unit: 'percentunit'
|
|
447
|
-
}
|
|
448
|
-
}
|
|
449
|
+
unit: 'percentunit',
|
|
450
|
+
},
|
|
451
|
+
},
|
|
449
452
|
});
|
|
450
453
|
y += 6;
|
|
451
454
|
}
|
|
@@ -457,7 +460,7 @@ class MonitoringConfig extends EventEmitter {
|
|
|
457
460
|
timezone: 'browser',
|
|
458
461
|
panels,
|
|
459
462
|
refresh: '30s',
|
|
460
|
-
time: { from: 'now-24h', to: 'now' }
|
|
463
|
+
time: { from: 'now-24h', to: 'now' },
|
|
461
464
|
};
|
|
462
465
|
}
|
|
463
466
|
|
|
@@ -473,7 +476,7 @@ class MonitoringConfig extends EventEmitter {
|
|
|
473
476
|
healthChecks: [...this.healthChecks.values()].map(h => h.toJSON()),
|
|
474
477
|
metrics: [...this.metrics.values()],
|
|
475
478
|
prometheus: this.toPrometheusConfig(),
|
|
476
|
-
grafana: this.toGrafanaDashboard()
|
|
479
|
+
grafana: this.toGrafanaDashboard(),
|
|
477
480
|
};
|
|
478
481
|
}
|
|
479
482
|
}
|
|
@@ -485,48 +488,51 @@ const SLOTemplates = {
|
|
|
485
488
|
/**
|
|
486
489
|
* API Availability SLO
|
|
487
490
|
*/
|
|
488
|
-
API_AVAILABILITY: (target = 0.999) =>
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
491
|
+
API_AVAILABILITY: (target = 0.999) =>
|
|
492
|
+
new SLO({
|
|
493
|
+
name: 'api-availability',
|
|
494
|
+
description: 'API endpoint availability',
|
|
495
|
+
sli: {
|
|
496
|
+
name: 'api-success-rate',
|
|
497
|
+
type: SLOType.AVAILABILITY,
|
|
498
|
+
metric: 'http_requests',
|
|
499
|
+
},
|
|
500
|
+
target,
|
|
501
|
+
window: '30d',
|
|
502
|
+
}),
|
|
499
503
|
|
|
500
504
|
/**
|
|
501
505
|
* API Latency SLO
|
|
502
506
|
*/
|
|
503
|
-
API_LATENCY: (target = 0.95, thresholdMs = 200) =>
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
507
|
+
API_LATENCY: (target = 0.95, thresholdMs = 200) =>
|
|
508
|
+
new SLO({
|
|
509
|
+
name: 'api-latency',
|
|
510
|
+
description: `95th percentile latency under ${thresholdMs}ms`,
|
|
511
|
+
sli: {
|
|
512
|
+
name: 'api-response-time',
|
|
513
|
+
type: SLOType.LATENCY,
|
|
514
|
+
metric: 'http_request_duration_seconds',
|
|
515
|
+
threshold: thresholdMs / 1000,
|
|
516
|
+
},
|
|
517
|
+
target,
|
|
518
|
+
window: '30d',
|
|
519
|
+
}),
|
|
515
520
|
|
|
516
521
|
/**
|
|
517
522
|
* Error Rate SLO
|
|
518
523
|
*/
|
|
519
|
-
ERROR_RATE: (target = 0.99) =>
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
524
|
+
ERROR_RATE: (target = 0.99) =>
|
|
525
|
+
new SLO({
|
|
526
|
+
name: 'error-rate',
|
|
527
|
+
description: 'Low error rate objective',
|
|
528
|
+
sli: {
|
|
529
|
+
name: 'error-rate-indicator',
|
|
530
|
+
type: SLOType.ERROR_RATE,
|
|
531
|
+
metric: 'http_requests',
|
|
532
|
+
},
|
|
533
|
+
target,
|
|
534
|
+
window: '7d',
|
|
535
|
+
}),
|
|
530
536
|
};
|
|
531
537
|
|
|
532
538
|
/**
|
|
@@ -536,58 +542,62 @@ const AlertTemplates = {
|
|
|
536
542
|
/**
|
|
537
543
|
* High Error Rate Alert
|
|
538
544
|
*/
|
|
539
|
-
HIGH_ERROR_RATE: (threshold = 0.05) =>
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
545
|
+
HIGH_ERROR_RATE: (threshold = 0.05) =>
|
|
546
|
+
new AlertRule({
|
|
547
|
+
name: 'HighErrorRate',
|
|
548
|
+
expr: `sum(rate(http_requests_errors_total[5m])) / sum(rate(http_requests_total[5m])) > ${threshold}`,
|
|
549
|
+
for: '5m',
|
|
550
|
+
severity: AlertSeverity.CRITICAL,
|
|
551
|
+
annotations: {
|
|
552
|
+
summary: 'High error rate detected',
|
|
553
|
+
description: `Error rate is above ${threshold * 100}%`,
|
|
554
|
+
},
|
|
555
|
+
}),
|
|
549
556
|
|
|
550
557
|
/**
|
|
551
558
|
* High Latency Alert
|
|
552
559
|
*/
|
|
553
|
-
HIGH_LATENCY: (thresholdMs = 500) =>
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
560
|
+
HIGH_LATENCY: (thresholdMs = 500) =>
|
|
561
|
+
new AlertRule({
|
|
562
|
+
name: 'HighLatency',
|
|
563
|
+
expr: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > ${thresholdMs / 1000}`,
|
|
564
|
+
for: '5m',
|
|
565
|
+
severity: AlertSeverity.WARNING,
|
|
566
|
+
annotations: {
|
|
567
|
+
summary: 'High latency detected',
|
|
568
|
+
description: `P95 latency is above ${thresholdMs}ms`,
|
|
569
|
+
},
|
|
570
|
+
}),
|
|
563
571
|
|
|
564
572
|
/**
|
|
565
573
|
* Service Down Alert
|
|
566
574
|
*/
|
|
567
|
-
SERVICE_DOWN: () =>
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
575
|
+
SERVICE_DOWN: () =>
|
|
576
|
+
new AlertRule({
|
|
577
|
+
name: 'ServiceDown',
|
|
578
|
+
expr: 'up == 0',
|
|
579
|
+
for: '1m',
|
|
580
|
+
severity: AlertSeverity.CRITICAL,
|
|
581
|
+
annotations: {
|
|
582
|
+
summary: 'Service is down',
|
|
583
|
+
description: 'Service instance is not responding',
|
|
584
|
+
},
|
|
585
|
+
}),
|
|
577
586
|
|
|
578
587
|
/**
|
|
579
588
|
* High Memory Usage Alert
|
|
580
589
|
*/
|
|
581
|
-
HIGH_MEMORY: (threshold = 0.9) =>
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
590
|
+
HIGH_MEMORY: (threshold = 0.9) =>
|
|
591
|
+
new AlertRule({
|
|
592
|
+
name: 'HighMemoryUsage',
|
|
593
|
+
expr: `process_resident_memory_bytes / node_memory_MemTotal_bytes > ${threshold}`,
|
|
594
|
+
for: '5m',
|
|
595
|
+
severity: AlertSeverity.WARNING,
|
|
596
|
+
annotations: {
|
|
597
|
+
summary: 'High memory usage',
|
|
598
|
+
description: `Memory usage is above ${threshold * 100}%`,
|
|
599
|
+
},
|
|
600
|
+
}),
|
|
591
601
|
};
|
|
592
602
|
|
|
593
603
|
/**
|
|
@@ -610,28 +620,28 @@ module.exports = {
|
|
|
610
620
|
AlertRule,
|
|
611
621
|
HealthCheck,
|
|
612
622
|
MonitoringConfig,
|
|
613
|
-
|
|
623
|
+
|
|
614
624
|
// Constants
|
|
615
625
|
SLOType,
|
|
616
626
|
AlertSeverity,
|
|
617
627
|
MetricType,
|
|
618
|
-
|
|
628
|
+
|
|
619
629
|
// Templates
|
|
620
630
|
SLOTemplates,
|
|
621
631
|
AlertTemplates,
|
|
622
|
-
|
|
632
|
+
|
|
623
633
|
// Factory
|
|
624
634
|
createMonitoringConfig,
|
|
625
|
-
|
|
635
|
+
|
|
626
636
|
// Release Manager
|
|
627
637
|
...releaseManagerModule,
|
|
628
|
-
|
|
638
|
+
|
|
629
639
|
// Incident Manager
|
|
630
640
|
...incidentManagerModule,
|
|
631
|
-
|
|
641
|
+
|
|
632
642
|
// Observability
|
|
633
643
|
...observabilityModule,
|
|
634
|
-
|
|
644
|
+
|
|
635
645
|
// Cost Tracker
|
|
636
|
-
...costTrackerModule
|
|
646
|
+
...costTrackerModule,
|
|
637
647
|
};
|