musubi-sdd 3.0.1 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/musubi-change.js +623 -10
- package/bin/musubi-orchestrate.js +456 -0
- package/bin/musubi-trace.js +393 -0
- package/package.json +3 -2
- package/src/analyzers/impact-analyzer.js +682 -0
- package/src/integrations/cicd.js +782 -0
- package/src/integrations/documentation.js +740 -0
- package/src/integrations/examples.js +789 -0
- package/src/integrations/index.js +23 -0
- package/src/integrations/platforms.js +929 -0
- package/src/managers/delta-spec.js +484 -0
- package/src/monitoring/incident-manager.js +890 -0
- package/src/monitoring/index.js +633 -0
- package/src/monitoring/observability.js +938 -0
- package/src/monitoring/release-manager.js +622 -0
- package/src/orchestration/index.js +168 -0
- package/src/orchestration/orchestration-engine.js +409 -0
- package/src/orchestration/pattern-registry.js +319 -0
- package/src/orchestration/patterns/auto.js +386 -0
- package/src/orchestration/patterns/group-chat.js +395 -0
- package/src/orchestration/patterns/human-in-loop.js +506 -0
- package/src/orchestration/patterns/nested.js +322 -0
- package/src/orchestration/patterns/sequential.js +278 -0
- package/src/orchestration/patterns/swarm.js +395 -0
- package/src/orchestration/workflow-orchestrator.js +738 -0
- package/src/reporters/coverage-report.js +452 -0
- package/src/reporters/traceability-matrix-report.js +684 -0
- package/src/steering/advanced-validation.js +812 -0
- package/src/steering/auto-updater.js +670 -0
- package/src/steering/index.js +119 -0
- package/src/steering/quality-metrics.js +650 -0
- package/src/steering/template-constraints.js +789 -0
- package/src/templates/agents/claude-code/skills/agent-assistant/SKILL.md +22 -0
- package/src/templates/agents/claude-code/skills/issue-resolver/SKILL.md +21 -0
- package/src/templates/agents/claude-code/skills/orchestrator/SKILL.md +90 -28
- package/src/templates/agents/claude-code/skills/project-manager/SKILL.md +32 -0
- package/src/templates/agents/claude-code/skills/site-reliability-engineer/SKILL.md +27 -0
- package/src/templates/agents/claude-code/skills/steering/SKILL.md +30 -0
- package/src/templates/agents/claude-code/skills/test-engineer/SKILL.md +21 -0
- package/src/templates/agents/claude-code/skills/ui-ux-designer/SKILL.md +27 -0
- package/src/templates/agents/codex/AGENTS.md +36 -1
- package/src/templates/agents/cursor/AGENTS.md +36 -1
- package/src/templates/agents/gemini-cli/GEMINI.md +36 -1
- package/src/templates/agents/github-copilot/AGENTS.md +65 -1
- package/src/templates/agents/qwen-code/QWEN.md +36 -1
- package/src/templates/agents/windsurf/AGENTS.md +36 -1
- package/src/templates/shared/delta-spec-template.md +246 -0
- package/src/validators/delta-format.js +474 -0
- package/src/validators/traceability-validator.js +561 -0
|
@@ -0,0 +1,633 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Monitoring Module - SRE, Observability, and Release Management
|
|
3
|
+
*
|
|
4
|
+
* Provides monitoring capabilities for MUSUBI-powered applications:
|
|
5
|
+
* - SLI/SLO definition and tracking
|
|
6
|
+
* - Alerting rules generation
|
|
7
|
+
* - Dashboard templates
|
|
8
|
+
* - Health check patterns
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
const { EventEmitter } = require('events');
|
|
12
|
+
|
|
13
|
+
/**
 * SLO Types.
 *
 * String identifiers for the kinds of service level objective this module
 * knows about. Frozen so the shared enum cannot be altered at runtime.
 */
const SLOType = Object.freeze({
  AVAILABILITY: 'availability',
  LATENCY: 'latency',
  THROUGHPUT: 'throughput',
  ERROR_RATE: 'error-rate',
  CORRECTNESS: 'correctness'
});
|
|
23
|
+
|
|
24
|
+
/**
 * Alert Severity.
 *
 * Severity labels attached to generated alert rules. Frozen so the shared
 * enum cannot be altered at runtime.
 */
const AlertSeverity = Object.freeze({
  CRITICAL: 'critical',
  WARNING: 'warning',
  INFO: 'info'
});
|
|
32
|
+
|
|
33
|
+
/**
 * Metric Type.
 *
 * Identifiers for the metric kinds accepted by
 * MonitoringConfig#defineMetric. Frozen so the shared enum cannot be
 * altered at runtime.
 */
const MetricType = Object.freeze({
  COUNTER: 'counter',
  GAUGE: 'gauge',
  HISTOGRAM: 'histogram',
  SUMMARY: 'summary'
});
|
|
42
|
+
|
|
43
|
+
/**
 * SLI (Service Level Indicator) definition.
 *
 * Holds the description of one measurable signal and renders a PromQL
 * expression for it based on its type.
 */
class SLI {
  /**
   * @param {object} options
   * @param {string} options.name - Indicator name.
   * @param {string} [options.description=''] - Free-form description.
   * @param {string} [options.type] - One of the SLOType values; defaults
   *   to SLOType.AVAILABILITY.
   * @param {string} options.metric - Base metric name; suffixes such as
   *   `_total` or `_bucket` are appended when building queries.
   * @param {string} [options.unit=''] - Unit label.
   * @param {string|null} [options.goodEventsQuery=null] - Query returned
   *   as-is for types that have no built-in query.
   * @param {string|null} [options.totalEventsQuery=null] - Stored only;
   *   not read by this class.
   * @param {number|null} [options.threshold=null] - Stored only; not read
   *   by this class.
   * @throws {TypeError} If no options object is supplied.
   */
  constructor(options) {
    if (options == null) {
      throw new TypeError('SLI requires an options object');
    }

    this.name = options.name;
    this.description = options.description || '';
    this.type = options.type || SLOType.AVAILABILITY;
    this.metric = options.metric;
    this.unit = options.unit || '';
    this.goodEventsQuery = options.goodEventsQuery || null;
    this.totalEventsQuery = options.totalEventsQuery || null;
    // Only a missing threshold becomes null; an explicit 0 is kept.
    this.threshold = options.threshold == null ? null : options.threshold;
  }

  /**
   * Generate Prometheus query for this SLI.
   *
   * Availability and error-rate queries are ratios over `<metric>_total`,
   * latency is the 0.95 histogram quantile of `<metric>_bucket`, and
   * throughput is the rate of `<metric>_total`, all over a 5m range.
   * Any other type falls back to `goodEventsQuery`, then to the bare
   * metric name.
   *
   * @returns {string} PromQL expression.
   */
  toPrometheusQuery() {
    const summedRate = (suffix) => `sum(rate(${this.metric}${suffix}[5m]))`;

    switch (this.type) {
      case SLOType.AVAILABILITY:
        return `${summedRate('_success_total')} / ${summedRate('_total')}`;

      case SLOType.LATENCY:
        return `histogram_quantile(0.95, sum(rate(${this.metric}_bucket[5m])) by (le))`;

      case SLOType.ERROR_RATE:
        return `${summedRate('_errors_total')} / ${summedRate('_total')}`;

      case SLOType.THROUGHPUT:
        return summedRate('_total');

      default:
        return this.goodEventsQuery || this.metric;
    }
  }

  /**
   * @returns {object} Plain representation including the rendered query.
   */
  toJSON() {
    return {
      name: this.name,
      description: this.description,
      type: this.type,
      metric: this.metric,
      unit: this.unit,
      prometheusQuery: this.toPrometheusQuery()
    };
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * SLO (Service Level Objective) definition.
 *
 * Couples an SLI with a target ratio and a measurement window, and can
 * derive an error budget and a burn-rate alert rule from them.
 */
class SLO {
  /**
   * @param {object} options
   * @param {string} options.name - Objective name.
   * @param {string} [options.description=''] - Free-form description.
   * @param {SLI|object} options.sli - An SLI instance, or options from
   *   which one is constructed.
   * @param {number} options.target - Target ratio, e.g. 0.999 for 99.9%.
   * @param {string} [options.window='30d'] - Measurement window.
   * @param {{critical?: number, warning?: number}} [options.burnRateThresholds]
   *   Burn-rate multipliers; keys that are not supplied keep their
   *   defaults (critical 14.4, warning 6).
   */
  constructor(options) {
    this.name = options.name;
    this.description = options.description || '';
    this.sli = options.sli instanceof SLI ? options.sli : new SLI(options.sli);
    this.target = options.target; // e.g., 0.999 for 99.9%
    this.window = options.window || '30d'; // Measurement window
    // Merge over the defaults so a partial override (e.g. only `warning`)
    // cannot leave `critical` undefined and turn the alert threshold into NaN.
    this.burnRateThresholds = {
      critical: 14.4, // fast-burn multiplier
      warning: 6, // slow-burn multiplier
      ...options.burnRateThresholds
    };
  }

  /**
   * Calculate error budget.
   *
   * @returns {{total: number, remaining: null, consumptionRate: null}}
   *   `total` is `1 - target`; the other fields are placeholders that this
   *   class never fills in.
   */
  calculateErrorBudget() {
    return {
      total: 1 - this.target,
      remaining: null, // Calculated at runtime
      consumptionRate: null
    };
  }

  /**
   * Generate burn rate alert rule.
   *
   * The budget is burning at the critical rate once the bad-event ratio
   * exceeds `(1 - target) * critical`. For an error-rate SLI the query is
   * already a bad-event ratio, so the rule is `query > limit`; for every
   * other SLI type the query is treated as a good-event ratio and the
   * rule is `query < 1 - limit`.
   *
   * NOTE(review): latency and throughput SLIs produce queries that are
   * not ratios, so comparing them with a ratio limit is unlikely to be
   * meaningful — confirm the intended rule for those types.
   *
   * @returns {object} Rule with name, expr, for, labels and annotations.
   */
  toBurnRateAlert() {
    // toPrecision(12) strips binary floating-point noise such as
    // 0.014400000000000013 from the rendered expression.
    const tidy = (value) => Number(value.toPrecision(12));

    const badRatioLimit = (1 - this.target) * this.burnRateThresholds.critical;
    const queryIsBadRatio = this.sli.type === SLOType.ERROR_RATE;
    const comparison = queryIsBadRatio
      ? `> ${tidy(badRatioLimit)}`
      : `< ${tidy(1 - badRatioLimit)}`;

    return {
      name: `${this.name}_high_burn_rate`,
      expr: `(${this.sli.toPrometheusQuery()}) ${comparison}`,
      for: '5m',
      labels: {
        severity: AlertSeverity.CRITICAL,
        slo: this.name
      },
      annotations: {
        summary: `High burn rate on SLO: ${this.name}`,
        description: `Error budget will be exhausted within 1 hour at current rate`
      }
    };
  }

  /**
   * @returns {object} Plain representation including the derived error
   *   budget and burn-rate alert.
   */
  toJSON() {
    return {
      name: this.name,
      description: this.description,
      sli: this.sli.toJSON(),
      target: this.target,
      targetPercentage: `${(this.target * 100).toFixed(2)}%`,
      window: this.window,
      errorBudget: this.calculateErrorBudget(),
      burnRateAlert: this.toBurnRateAlert()
    };
  }
}
|
|
156
|
+
|
|
157
|
+
/**
 * Alert Rule definition.
 *
 * A named alerting expression with a hold duration, a severity, and
 * optional extra labels and annotations.
 */
class AlertRule {
  /**
   * @param {object} options
   * @param {string} options.name - Alert name.
   * @param {string} options.expr - Alert expression.
   * @param {string} [options.for='5m'] - Hold duration.
   * @param {string} [options.severity] - Defaults to AlertSeverity.WARNING.
   * @param {object} [options.labels={}] - Extra labels.
   * @param {object} [options.annotations={}] - Annotations; `summary` and
   *   `description` are the keys read by toPrometheusYAML().
   */
  constructor(options) {
    this.name = options.name;
    this.expr = options.expr;
    this.for = options.for || '5m';
    this.severity = options.severity || AlertSeverity.WARNING;
    this.labels = options.labels || {};
    this.annotations = options.annotations || {};
  }

  /**
   * Generate Prometheus alert rule YAML.
   *
   * Emits a single list item with two-space nesting. The `severity` label
   * is written once (an explicit `labels.severity` wins over
   * `this.severity`), no blank line is produced when there are no extra
   * labels, a multi-line expression is written as a literal block scalar,
   * and annotation text is escaped so embedded quotes or backslashes keep
   * the document parseable.
   *
   * @returns {string} YAML fragment for one rule.
   */
  toPrometheusYAML() {
    // A JSON string literal is also a valid YAML double-quoted scalar.
    const quoted = (text) => JSON.stringify(String(text));

    const out = [`- alert: ${this.name}`];

    const expression = String(this.expr);
    if (expression.includes('\n')) {
      out.push('  expr: |');
      for (const piece of expression.split('\n')) {
        const trimmed = piece.trim();
        if (trimmed !== '') out.push(`    ${trimmed}`);
      }
    } else {
      out.push(`  expr: ${expression}`);
    }

    out.push(`  for: ${this.for}`);

    out.push('  labels:');
    const labelSet = { severity: this.severity, ...this.labels };
    for (const [key, value] of Object.entries(labelSet)) {
      out.push(`    ${key}: ${value}`);
    }

    out.push('  annotations:');
    out.push(`    summary: ${quoted(this.annotations.summary || this.name)}`);
    out.push(`    description: ${quoted(this.annotations.description || '')}`);

    return out.join('\n');
  }

  /**
   * @returns {object} Plain representation of the rule's own fields.
   */
  toJSON() {
    return {
      name: this.name,
      expr: this.expr,
      for: this.for,
      severity: this.severity,
      labels: this.labels,
      annotations: this.annotations
    };
  }
}
|
|
196
|
+
|
|
197
|
+
/**
 * Health Check definition.
 *
 * A named collection of check functions that are run one after another,
 * each bounded by a timeout, and summarised into one health report.
 */
class HealthCheck {
  /**
   * @param {object} options
   * @param {string} options.name - Health check name.
   * @param {string} [options.endpoint='/health'] - Base route used by
   *   toExpressHandler().
   * @param {number} [options.interval=30000] - Stored only; not read by
   *   this class.
   * @param {number} [options.timeout=5000] - Per-check time limit in ms.
   * @param {Array<object>} [options.checks=[]] - Initial checks, stored as
   *   given (each needs `name` and `check`).
   *   NOTE(review): unlike addCheck(), entries supplied here get no
   *   `critical` default, so a failing one without `critical: true` does
   *   not mark the report unhealthy — confirm that is intended.
   */
  constructor(options) {
    this.name = options.name;
    this.endpoint = options.endpoint || '/health';
    this.interval = options.interval || 30000; // 30 seconds
    this.timeout = options.timeout || 5000;
    // Copy so addCheck() never mutates an array owned by the caller.
    this.checks = [...(options.checks || [])];
  }

  /**
   * Add a dependency check.
   *
   * @param {object} check
   * @param {string} check.name - Check name reported in results.
   * @param {string} [check.type='dependency'] - Check category.
   * @param {boolean} [check.critical=true] - Anything other than an
   *   explicit `false` makes the check critical.
   * @param {Function} check.check - Function to invoke; may return a
   *   promise.
   * @returns {HealthCheck} This instance, for chaining.
   */
  addCheck(check) {
    this.checks.push({
      name: check.name,
      type: check.type || 'dependency',
      critical: check.critical !== false,
      check: check.check
    });
    return this;
  }

  /**
   * Generate health check response.
   *
   * Runs the checks sequentially. A check that throws, rejects, or does
   * not settle within `this.timeout` ms is reported as unhealthy; the
   * overall status becomes unhealthy only when such a check is critical.
   *
   * @returns {Promise<{status: string, timestamp: string, checks: Array<object>}>}
   */
  async execute() {
    const results = [];
    let healthy = true;

    for (const check of this.checks) {
      let timer;
      const startTime = Date.now();

      try {
        const outcome = check.check();
        const deadline = new Promise((_, reject) => {
          timer = setTimeout(() => reject(new Error('Timeout')), this.timeout);
        });
        const details = await Promise.race([outcome, deadline]);

        results.push({
          name: check.name,
          status: 'healthy',
          latency: Date.now() - startTime,
          details
        });
      } catch (error) {
        // A check may reject with a non-Error value (string, null, ...).
        const message = error != null && error.message !== undefined
          ? error.message
          : String(error);

        results.push({
          name: check.name,
          status: 'unhealthy',
          error: message
        });
        if (check.critical) healthy = false;
      } finally {
        // Without this the timer stays pending for the full timeout after
        // every check and keeps the event loop alive.
        clearTimeout(timer);
      }
    }

    return {
      status: healthy ? 'healthy' : 'unhealthy',
      timestamp: new Date().toISOString(),
      checks: results
    };
  }

  /**
   * Generate Express.js health endpoint handler.
   *
   * Returns source text, not a function. The snippet refers to `app` and
   * `healthCheck` identifiers, which must exist wherever it is pasted.
   *
   * @returns {string} JavaScript source registering three GET routes:
   *   the endpoint itself, `<endpoint>/live` and `<endpoint>/ready`.
   */
  toExpressHandler() {
    return `
app.get('${this.endpoint}', async (req, res) => {
  const health = await healthCheck.execute();
  res.status(health.status === 'healthy' ? 200 : 503).json(health);
});

app.get('${this.endpoint}/live', (req, res) => {
  res.status(200).json({ status: 'alive', timestamp: new Date().toISOString() });
});

app.get('${this.endpoint}/ready', async (req, res) => {
  const health = await healthCheck.execute();
  res.status(health.status === 'healthy' ? 200 : 503).json(health);
});`;
  }

  /**
   * @returns {object} Plain representation; check functions are omitted.
   */
  toJSON() {
    return {
      name: this.name,
      endpoint: this.endpoint,
      interval: this.interval,
      timeout: this.timeout,
      checks: this.checks.map(c => ({
        name: c.name,
        type: c.type,
        critical: c.critical
      }))
    };
  }
}
|
|
297
|
+
|
|
298
|
+
/**
 * Monitoring Configuration.
 *
 * Registry of SLOs, alert rules, health checks and metric definitions for
 * one service, with generators for a Prometheus rule group and a Grafana
 * dashboard. Emits `sloAdded`, `alertAdded` and `healthCheckAdded` when
 * the corresponding item is registered.
 */
class MonitoringConfig extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {string} [options.serviceName='musubi-service'] - Used in rule
   *   group and dashboard names.
   * @param {string} [options.environment='production'] - Environment label.
   */
  constructor(options = {}) {
    super();
    this.serviceName = options.serviceName || 'musubi-service';
    this.environment = options.environment || 'production';
    this.slos = new Map();
    this.alerts = new Map();
    this.healthChecks = new Map();
    this.metrics = new Map();
  }

  /**
   * Define an SLO. An existing SLO with the same name is replaced.
   *
   * @param {SLO|object} slo - SLO instance or options for one.
   * @returns {MonitoringConfig} This instance, for chaining.
   */
  defineSLO(slo) {
    const sloInstance = slo instanceof SLO ? slo : new SLO(slo);
    this.slos.set(sloInstance.name, sloInstance);
    this.emit('sloAdded', sloInstance);
    return this;
  }

  /**
   * Get an SLO.
   *
   * @param {string} name
   * @returns {SLO|undefined}
   */
  getSLO(name) {
    return this.slos.get(name);
  }

  /**
   * List all SLOs.
   *
   * @returns {Array<object>} `toJSON()` of every registered SLO.
   */
  listSLOs() {
    return [...this.slos.values()].map(s => s.toJSON());
  }

  /**
   * Define an alert rule. An existing rule with the same name is replaced.
   *
   * @param {AlertRule|object} alert - AlertRule instance or options for one.
   * @returns {MonitoringConfig} This instance, for chaining.
   */
  defineAlert(alert) {
    const alertInstance = alert instanceof AlertRule ? alert : new AlertRule(alert);
    this.alerts.set(alertInstance.name, alertInstance);
    this.emit('alertAdded', alertInstance);
    return this;
  }

  /**
   * Get an alert.
   *
   * @param {string} name
   * @returns {AlertRule|undefined}
   */
  getAlert(name) {
    return this.alerts.get(name);
  }

  /**
   * List all alerts.
   *
   * @returns {Array<object>} `toJSON()` of every registered alert rule.
   */
  listAlerts() {
    return [...this.alerts.values()].map(a => a.toJSON());
  }

  /**
   * Define a health check. An existing one with the same name is replaced.
   *
   * @param {HealthCheck|object} healthCheck - HealthCheck instance or
   *   options for one.
   * @returns {MonitoringConfig} This instance, for chaining.
   */
  defineHealthCheck(healthCheck) {
    const hcInstance = healthCheck instanceof HealthCheck
      ? healthCheck
      : new HealthCheck(healthCheck);
    this.healthChecks.set(hcInstance.name, hcInstance);
    this.emit('healthCheckAdded', hcInstance);
    return this;
  }

  /**
   * Get a health check.
   *
   * @param {string} name
   * @returns {HealthCheck|undefined}
   */
  getHealthCheck(name) {
    return this.healthChecks.get(name);
  }

  /**
   * Define a metric. No event is emitted for metrics.
   *
   * @param {object} metric
   * @param {string} metric.name - Metric name (registry key).
   * @param {string} [metric.type] - Defaults to MetricType.COUNTER.
   * @param {string} [metric.help=''] - Help text.
   * @param {Array<string>} [metric.labels=[]] - Label names.
   * @returns {MonitoringConfig} This instance, for chaining.
   */
  defineMetric(metric) {
    this.metrics.set(metric.name, {
      name: metric.name,
      type: metric.type || MetricType.COUNTER,
      help: metric.help || '',
      labels: metric.labels || []
    });
    return this;
  }

  /**
   * Generate Prometheus metrics configuration.
   *
   * @returns {{groups: Array<{name: string, rules: Array<object>}>}} One
   *   group named `<serviceName>-alerts` holding the burn-rate rule of
   *   every SLO followed by `toJSON()` of every custom alert.
   */
  toPrometheusConfig() {
    const rules = [];

    // Generate SLO-based alerts
    for (const slo of this.slos.values()) {
      rules.push(slo.toBurnRateAlert());
    }

    // Add custom alerts
    for (const alert of this.alerts.values()) {
      rules.push(alert.toJSON());
    }

    return {
      groups: [{
        name: `${this.serviceName}-alerts`,
        rules
      }]
    };
  }

  /**
   * Generate Grafana dashboard JSON.
   *
   * One gauge panel per SLO, stacked vertically. For an error-rate SLI
   * the query is a bad-event ratio, so the colour steps are mirrored:
   * green up to the error budget (`1 - target`), yellow for the next
   * 0.01, red beyond. Every other SLI type keeps red / yellow / green
   * around `target`.
   *
   * @returns {object} Dashboard definition.
   */
  toGrafanaDashboard() {
    // Strip binary floating-point noise from derived step values.
    const tidy = (value) => Number(value.toPrecision(12));

    const panels = [];
    let y = 0;

    // SLO panels
    for (const slo of this.slos.values()) {
      const queryIsBadRatio = slo.sli.type === SLOType.ERROR_RATE;
      const steps = queryIsBadRatio
        ? [
            { color: 'green', value: null },
            { color: 'yellow', value: tidy(1 - slo.target) },
            { color: 'red', value: tidy(1 - slo.target + 0.01) }
          ]
        : [
            { color: 'red', value: null },
            { color: 'yellow', value: slo.target - 0.01 },
            { color: 'green', value: slo.target }
          ];

      panels.push({
        id: panels.length + 1,
        type: 'gauge',
        title: slo.name,
        gridPos: { x: 0, y, w: 8, h: 6 },
        targets: [{
          expr: slo.sli.toPrometheusQuery(),
          legendFormat: slo.name
        }],
        fieldConfig: {
          defaults: {
            thresholds: {
              mode: 'absolute',
              steps
            },
            min: 0,
            max: 1,
            unit: 'percentunit'
          }
        }
      });
      y += 6;
    }

    return {
      title: `${this.serviceName} SLO Dashboard`,
      uid: `${this.serviceName}-slo`,
      tags: ['slo', 'sre', this.serviceName],
      timezone: 'browser',
      panels,
      refresh: '30s',
      time: { from: 'now-24h', to: 'now' }
    };
  }

  /**
   * Generate complete monitoring configuration.
   *
   * @returns {object} Everything registered plus the generated Prometheus
   *   and Grafana configuration.
   */
  toJSON() {
    return {
      serviceName: this.serviceName,
      environment: this.environment,
      slos: this.listSLOs(),
      alerts: this.listAlerts(),
      healthChecks: [...this.healthChecks.values()].map(h => h.toJSON()),
      metrics: [...this.metrics.values()],
      prometheus: this.toPrometheusConfig(),
      grafana: this.toGrafanaDashboard()
    };
  }
}
|
|
480
|
+
|
|
481
|
+
/**
 * Pre-defined SLO templates.
 *
 * Each entry is a factory returning a fresh SLO.
 */
const SLOTemplates = {
  /**
   * API Availability SLO.
   *
   * @param {number} [target=0.999] - Target ratio.
   * @returns {SLO}
   */
  API_AVAILABILITY(target = 0.999) {
    const indicator = {
      name: 'api-success-rate',
      type: SLOType.AVAILABILITY,
      metric: 'http_requests'
    };

    return new SLO({
      name: 'api-availability',
      description: 'API endpoint availability',
      sli: indicator,
      target,
      window: '30d'
    });
  },

  /**
   * API Latency SLO.
   *
   * @param {number} [target=0.95] - Target ratio.
   * @param {number} [thresholdMs=200] - Latency limit in milliseconds;
   *   stored on the SLI divided by 1000.
   * @returns {SLO}
   */
  API_LATENCY(target = 0.95, thresholdMs = 200) {
    const indicator = {
      name: 'api-response-time',
      type: SLOType.LATENCY,
      metric: 'http_request_duration_seconds',
      threshold: thresholdMs / 1000
    };

    return new SLO({
      name: 'api-latency',
      description: `95th percentile latency under ${thresholdMs}ms`,
      sli: indicator,
      target,
      window: '30d'
    });
  },

  /**
   * Error Rate SLO.
   *
   * @param {number} [target=0.99] - Target ratio.
   * @returns {SLO}
   */
  ERROR_RATE(target = 0.99) {
    const indicator = {
      name: 'error-rate-indicator',
      type: SLOType.ERROR_RATE,
      metric: 'http_requests'
    };

    return new SLO({
      name: 'error-rate',
      description: 'Low error rate objective',
      sli: indicator,
      target,
      window: '7d'
    });
  }
};
|
|
531
|
+
|
|
532
|
+
/**
 * Pre-defined Alert templates.
 *
 * Each entry is a factory returning a fresh AlertRule. Percentages shown
 * in descriptions are rounded to 12 significant digits so binary
 * floating-point noise (e.g. 0.07 * 100 = 7.000000000000001) never
 * reaches the alert text.
 */
const AlertTemplates = {
  /**
   * High Error Rate Alert.
   *
   * @param {number} [threshold=0.05] - Error ratio above which to alert.
   * @returns {AlertRule}
   */
  HIGH_ERROR_RATE: (threshold = 0.05) => new AlertRule({
    name: 'HighErrorRate',
    expr: `sum(rate(http_requests_errors_total[5m])) / sum(rate(http_requests_total[5m])) > ${threshold}`,
    for: '5m',
    severity: AlertSeverity.CRITICAL,
    annotations: {
      summary: 'High error rate detected',
      description: `Error rate is above ${Number((threshold * 100).toPrecision(12))}%`
    }
  }),

  /**
   * High Latency Alert.
   *
   * @param {number} [thresholdMs=500] - Latency limit in milliseconds;
   *   divided by 1000 in the expression.
   * @returns {AlertRule}
   */
  HIGH_LATENCY: (thresholdMs = 500) => new AlertRule({
    name: 'HighLatency',
    expr: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > ${thresholdMs / 1000}`,
    for: '5m',
    severity: AlertSeverity.WARNING,
    annotations: {
      summary: 'High latency detected',
      description: `P95 latency is above ${thresholdMs}ms`
    }
  }),

  /**
   * Service Down Alert.
   *
   * @returns {AlertRule}
   */
  SERVICE_DOWN: () => new AlertRule({
    name: 'ServiceDown',
    expr: 'up == 0',
    for: '1m',
    severity: AlertSeverity.CRITICAL,
    annotations: {
      summary: 'Service is down',
      description: 'Service instance is not responding'
    }
  }),

  /**
   * High Memory Usage Alert.
   *
   * NOTE(review): the expression divides two metrics that look like they
   * come from different exporters; confirm their label sets match,
   * otherwise the division may return no series.
   *
   * @param {number} [threshold=0.9] - Memory ratio above which to alert.
   * @returns {AlertRule}
   */
  HIGH_MEMORY: (threshold = 0.9) => new AlertRule({
    name: 'HighMemoryUsage',
    expr: `process_resident_memory_bytes / node_memory_MemTotal_bytes > ${threshold}`,
    for: '5m',
    severity: AlertSeverity.WARNING,
    annotations: {
      summary: 'High memory usage',
      description: `Memory usage is above ${Number((threshold * 100).toPrecision(12))}%`
    }
  })
};
|
|
592
|
+
|
|
593
|
+
/**
 * Create a monitoring configuration.
 *
 * @param {object} [options={}] - Passed unchanged to the MonitoringConfig
 *   constructor.
 * @returns {MonitoringConfig} A new, empty configuration.
 */
function createMonitoringConfig(options = {}) {
  const config = new MonitoringConfig(options);
  return config;
}
|
|
599
|
+
|
|
600
|
+
// Import sub-modules
|
|
601
|
+
const releaseManagerModule = require('./release-manager');
|
|
602
|
+
const incidentManagerModule = require('./incident-manager');
|
|
603
|
+
const observabilityModule = require('./observability');
|
|
604
|
+
|
|
605
|
+
module.exports = {
|
|
606
|
+
// Classes
|
|
607
|
+
SLI,
|
|
608
|
+
SLO,
|
|
609
|
+
AlertRule,
|
|
610
|
+
HealthCheck,
|
|
611
|
+
MonitoringConfig,
|
|
612
|
+
|
|
613
|
+
// Constants
|
|
614
|
+
SLOType,
|
|
615
|
+
AlertSeverity,
|
|
616
|
+
MetricType,
|
|
617
|
+
|
|
618
|
+
// Templates
|
|
619
|
+
SLOTemplates,
|
|
620
|
+
AlertTemplates,
|
|
621
|
+
|
|
622
|
+
// Factory
|
|
623
|
+
createMonitoringConfig,
|
|
624
|
+
|
|
625
|
+
// Release Manager
|
|
626
|
+
...releaseManagerModule,
|
|
627
|
+
|
|
628
|
+
// Incident Manager
|
|
629
|
+
...incidentManagerModule,
|
|
630
|
+
|
|
631
|
+
// Observability
|
|
632
|
+
...observabilityModule
|
|
633
|
+
};
|