@unrdf/observability 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +10 -0
- package/IMPLEMENTATION-SUMMARY.md +478 -0
- package/LICENSE +21 -0
- package/README.md +482 -0
- package/capability-map.md +90 -0
- package/config/alert-rules.yml +269 -0
- package/config/prometheus.yml +136 -0
- package/dashboards/grafana-unrdf.json +798 -0
- package/dashboards/unrdf-workflow-dashboard.json +295 -0
- package/docs/OBSERVABILITY-PATTERNS.md +681 -0
- package/docs/OBSERVABILITY-RUNBOOK.md +554 -0
- package/examples/observability-demo.mjs +334 -0
- package/package.json +46 -0
- package/src/advanced-metrics.mjs +413 -0
- package/src/alerts/alert-manager.mjs +436 -0
- package/src/custom-events.mjs +558 -0
- package/src/distributed-tracing.mjs +352 -0
- package/src/exporters/grafana-exporter.mjs +415 -0
- package/src/index.mjs +61 -0
- package/src/metrics/workflow-metrics.mjs +346 -0
- package/src/receipts/anchor.mjs +155 -0
- package/src/receipts/index.mjs +62 -0
- package/src/receipts/merkle-tree.mjs +188 -0
- package/src/receipts/receipt-chain.mjs +209 -0
- package/src/receipts/receipt-schema.mjs +128 -0
- package/src/receipts/tamper-detection.mjs +219 -0
- package/test/advanced-metrics.test.mjs +302 -0
- package/test/custom-events.test.mjs +387 -0
- package/test/distributed-tracing.test.mjs +314 -0
- package/validation/observability-validation.mjs +366 -0
- package/vitest.config.mjs +25 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @unrdf/observability - Alert Manager
|
|
3
|
+
*
|
|
4
|
+
* Threshold-based alerting and anomaly detection for workflow metrics.
|
|
5
|
+
* Supports webhook notifications, alert rules, and smart anomaly detection
|
|
6
|
+
* using statistical analysis.
|
|
7
|
+
*
|
|
8
|
+
* @module @unrdf/observability/alerts
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { EventEmitter } from 'node:events';
|
|
12
|
+
import { z } from 'zod';
|
|
13
|
+
|
|
14
|
+
/**
 * Alert severity levels.
 *
 * The values match the strings accepted by the `severity` enum of the alert
 * rule schema ('info' | 'warning' | 'critical'). The object is frozen because
 * it is a shared, exported constant: a consumer mutating it would silently
 * change the severities seen by every other importer.
 *
 * @type {Readonly<{INFO: 'info', WARNING: 'warning', CRITICAL: 'critical'}>}
 */
export const AlertSeverity = Object.freeze({
  INFO: 'info',
  WARNING: 'warning',
  CRITICAL: 'critical',
});
|
|
22
|
+
|
|
23
|
+
/**
 * Alert rule schema.
 *
 * Fields:
 * - `id`, `name`, `metric`: required strings; `metric` is the metric name the
 *   rule is matched against.
 * - `threshold` / `operator`: the comparison applied to an incoming value
 *   (`gt`, `lt`, `gte`, `lte`, `eq`).
 * - `severity`: 'info' | 'warning' | 'critical', defaults to 'warning'.
 * - `duration`: non-negative number, defaults to 60000 (1 minute).
 *   NOTE(review): nothing in this file reads `duration` when evaluating a
 *   rule — confirm whether another module consumes it.
 * - `labels` / `annotations`: optional string-to-string maps.
 * - `enabled`: defaults to true.
 *
 * `z.record` is given explicit key and value schemas: the single-argument
 * form is only accepted by Zod 3, while the two-argument form means the same
 * thing on Zod 3 and keeps working on Zod 4.
 *
 * @type {z.ZodObject}
 */
const AlertRuleSchema = z.object({
  id: z.string(),
  name: z.string(),
  metric: z.string(),
  threshold: z.number(),
  operator: z.enum(['gt', 'lt', 'gte', 'lte', 'eq']),
  severity: z.enum(['info', 'warning', 'critical']).default('warning'),
  duration: z.number().min(0).default(60000), // 1 minute default
  labels: z.record(z.string(), z.string()).optional(),
  annotations: z.record(z.string(), z.string()).optional(),
  enabled: z.boolean().default(true),
});
|
|
39
|
+
|
|
40
|
+
/**
 * Webhook configuration schema.
 *
 * Fields:
 * - `url`: required, must be a valid URL string.
 * - `method`: 'POST' | 'PUT' | 'PATCH', defaults to 'POST'.
 * - `headers`: optional string-to-string map of extra request headers.
 * - `timeout`: number, at least 1000, defaults to 5000. It is passed to
 *   `setTimeout` by the sender, so it is a duration in milliseconds.
 *
 * `z.record` is given explicit key and value schemas: the single-argument
 * form is only accepted by Zod 3, while the two-argument form means the same
 * thing on Zod 3 and keeps working on Zod 4.
 *
 * @type {z.ZodObject}
 */
const WebhookConfigSchema = z.object({
  url: z.string().url(),
  method: z.enum(['POST', 'PUT', 'PATCH']).default('POST'),
  headers: z.record(z.string(), z.string()).optional(),
  timeout: z.number().min(1000).default(5000),
});
|
|
50
|
+
|
|
51
|
+
/**
 * AlertManager - threshold alerting, z-score anomaly detection and webhook
 * notification for metric values pushed in through `evaluateMetric()`.
 *
 * What the class does:
 * - Threshold rules: a rule fires once when its comparison becomes true and
 *   is not fired again until it has resolved (comparison false again).
 * - Anomaly detection: z-score of the incoming value against up to the last
 *   1000 recorded samples of the same metric (at least 30 samples needed).
 * - Webhook notifications for fired and resolved alerts.
 * - In-memory alert history with simple filtering and statistics.
 *
 * Events emitted:
 * - `alert`          - an alert (threshold or anomaly) fired
 * - `alert:resolved` - a threshold alert resolved
 * - `webhook:error`  - a webhook delivery failed: `{ webhook, alert, error }`
 *
 * NOTE(review): `rule.duration` and `config.checkInterval` are stored but not
 * read anywhere in this class; rules are evaluated only when a caller invokes
 * `evaluateMetric()`.
 * NOTE(review): anomaly alerts are added to the active-alert map under a
 * timestamp-based id and nothing here removes them except `clearHistory()`.
 *
 * @class
 * @extends EventEmitter
 */
export class AlertManager extends EventEmitter {
  /**
   * @param {object} config - Alert manager configuration
   * @param {object[]} [config.rules=[]] - Initial alert rules
   * @param {object[]} [config.webhooks=[]] - Webhook endpoints
   * @param {number} [config.checkInterval=10000] - Rule check interval in ms
   * @param {boolean} [config.enableAnomalyDetection=true] - Enable anomaly detection
   * @throws If any initial rule or webhook fails schema validation
   */
  constructor(config = {}) {
    super();
    this.rules = new Map();
    this.webhooks = [];
    this.alertHistory = [];
    this.metricHistory = new Map();
    this.activeAlerts = new Map();
    this.config = {
      checkInterval: 10000,
      enableAnomalyDetection: true,
      ...config,
    };

    // Initialize rules and webhooks (each entry goes through schema validation)
    if (config.rules) {
      config.rules.forEach(rule => this.addRule(rule));
    }
    if (config.webhooks) {
      config.webhooks.forEach(webhook => this.addWebhook(webhook));
    }
  }

  /**
   * Add alert rule. A rule with an id that already exists replaces the
   * previous rule and starts with fresh (untriggered) state.
   * @param {object} rule - Alert rule configuration
   * @returns {string} Rule ID
   * @throws If the rule fails schema validation
   */
  addRule(rule) {
    const validated = AlertRuleSchema.parse(rule);
    this.rules.set(validated.id, {
      ...validated,
      state: {
        triggered: false,
        triggeredAt: null,
        count: 0,
      },
    });
    return validated.id;
  }

  /**
   * Remove alert rule
   * @param {string} ruleId - Rule ID to remove
   * @returns {boolean} True if a rule with that id existed and was removed
   */
  removeRule(ruleId) {
    return this.rules.delete(ruleId);
  }

  /**
   * Enable/disable alert rule. Unknown rule ids are ignored.
   * @param {string} ruleId - Rule ID
   * @param {boolean} enabled - Enable state
   */
  setRuleEnabled(ruleId, enabled) {
    const rule = this.rules.get(ruleId);
    if (rule) {
      rule.enabled = enabled;
    }
  }

  /**
   * Add webhook endpoint
   * @param {object} webhook - Webhook configuration
   * @throws If the webhook fails schema validation
   */
  addWebhook(webhook) {
    const validated = WebhookConfigSchema.parse(webhook);
    this.webhooks.push(validated);
  }

  /**
   * Evaluate metric value against alert rules, then (if enabled) check the
   * value for statistical anomalies. Resolves after all resulting webhook
   * deliveries have settled.
   * @param {string} metricName - Metric name
   * @param {number} value - Current metric value
   * @param {Record<string, string>} labels - Metric labels
   * @returns {Promise<void>}
   */
  async evaluateMetric(metricName, value, labels = {}) {
    // Store metric history for anomaly detection
    this._recordMetricValue(metricName, value);

    // Check threshold-based rules
    for (const [_ruleId, rule] of this.rules.entries()) {
      if (!rule.enabled || rule.metric !== metricName) {
        continue;
      }

      const triggered = this._evaluateRule(rule, value);

      if (triggered && !rule.state.triggered) {
        // New alert triggered; state is updated before the alert object is
        // built because the alert id and startsAt come from triggeredAt.
        rule.state.triggered = true;
        rule.state.triggeredAt = Date.now();
        rule.state.count++;

        const alert = this._createAlert(rule, value, labels);
        await this._fireAlert(alert);
      } else if (!triggered && rule.state.triggered) {
        // Alert resolved; triggeredAt is left untouched so the resolved alert
        // carries the same id as the one that fired.
        rule.state.triggered = false;
        const alert = this._createAlert(rule, value, labels, true);
        await this._resolveAlert(alert);
      }
    }

    // Check for anomalies
    if (this.config.enableAnomalyDetection) {
      const anomaly = this._detectAnomaly(metricName, value);
      if (anomaly) {
        const alert = {
          id: `anomaly-${metricName}-${Date.now()}`,
          type: 'anomaly',
          metric: metricName,
          value,
          zscore: anomaly.zscore,
          severity: anomaly.severity,
          timestamp: Date.now(),
          labels,
        };
        await this._fireAlert(alert);
      }
    }
  }

  /**
   * Evaluate rule against value
   * @private
   * @param {object} rule - Alert rule
   * @param {number} value - Metric value
   * @returns {boolean} True if rule triggered; false for an unknown operator
   */
  _evaluateRule(rule, value) {
    const { threshold, operator } = rule;

    switch (operator) {
      case 'gt':
        return value > threshold;
      case 'lt':
        return value < threshold;
      case 'gte':
        return value >= threshold;
      case 'lte':
        return value <= threshold;
      case 'eq':
        return value === threshold;
      default:
        return false;
    }
  }

  /**
   * Create alert object for a rule.
   * @private
   * @param {object} rule - Rule (with state) the alert belongs to
   * @param {number} value - Metric value that caused the transition
   * @param {Record<string, string>} labels - Metric labels; rule labels win on key clashes
   * @param {boolean} [resolved=false] - Build a 'resolved' alert instead of a 'firing' one
   * @returns {object} Alert object
   */
  _createAlert(rule, value, labels, resolved = false) {
    return {
      id: `${rule.id}-${rule.state.triggeredAt}`,
      ruleId: rule.id,
      name: rule.name,
      metric: rule.metric,
      value,
      threshold: rule.threshold,
      operator: rule.operator,
      severity: rule.severity,
      labels: { ...labels, ...rule.labels },
      annotations: rule.annotations || {},
      status: resolved ? 'resolved' : 'firing',
      startsAt: rule.state.triggeredAt,
      endsAt: resolved ? Date.now() : null,
    };
  }

  /**
   * Fire alert: register it as active, append it to history, emit `alert`
   * and notify webhooks.
   * @private
   * @param {object} alert - Alert to fire
   * @returns {Promise<void>}
   */
  async _fireAlert(alert) {
    this.activeAlerts.set(alert.id, alert);
    this.alertHistory.push(alert);

    // Emit alert event
    this.emit('alert', alert);

    // Send webhooks
    await this._sendWebhooks(alert);
  }

  /**
   * Resolve alert: drop it from the active set, append the resolved alert to
   * history, emit `alert:resolved` and notify webhooks.
   * @private
   * @param {object} alert - Resolved alert (same id as the firing alert)
   * @returns {Promise<void>}
   */
  async _resolveAlert(alert) {
    this.activeAlerts.delete(alert.id);
    this.alertHistory.push(alert);

    // Emit resolution event
    this.emit('alert:resolved', alert);

    // Send webhooks
    await this._sendWebhooks(alert);
  }

  /**
   * Send webhook notifications to every configured endpoint in parallel.
   * Delivery is best effort: a failure (network error, timeout abort or
   * non-2xx response) is reported through the `webhook:error` event and never
   * rejects the returned promise.
   * @private
   * @param {object} alert - Alert serialized as the JSON request body
   * @returns {Promise<void>}
   */
  async _sendWebhooks(alert) {
    const deliveries = this.webhooks.map(async webhook => {
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), webhook.timeout);

      try {
        const response = await fetch(webhook.url, {
          method: webhook.method,
          headers: {
            'Content-Type': 'application/json',
            ...webhook.headers,
          },
          body: JSON.stringify(alert),
          signal: controller.signal,
        });

        if (!response.ok) {
          throw new Error(`Webhook failed: ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        this.emit('webhook:error', { webhook, alert, error });
      } finally {
        // Always release the abort timer. Clearing it only on the success
        // path left a live timer behind whenever fetch rejected, keeping the
        // event loop busy for up to `webhook.timeout` ms per failed delivery.
        clearTimeout(timer);
      }
    });

    await Promise.allSettled(deliveries);
  }

  /**
   * Record metric value for history (at most the last 1000 samples per
   * metric are kept).
   *
   * Non-finite values (NaN, +/-Infinity, non-numbers) are not recorded: a
   * single such sample would turn the mean and standard deviation into NaN
   * and silently disable anomaly detection for that metric until the sample
   * aged out of the window.
   * @private
   * @param {string} metricName - Metric name
   * @param {number} value - Sample to record
   */
  _recordMetricValue(metricName, value) {
    if (!Number.isFinite(value)) {
      return;
    }

    if (!this.metricHistory.has(metricName)) {
      this.metricHistory.set(metricName, []);
    }

    const history = this.metricHistory.get(metricName);
    history.push({ value, timestamp: Date.now() });

    // Keep last 1000 values
    if (history.length > 1000) {
      history.shift();
    }
  }

  /**
   * Detect anomalies using z-score analysis over the recorded history of the
   * metric (population standard deviation). When called from
   * `evaluateMetric()` the current value has already been recorded, so it is
   * part of the baseline it is compared against.
   * @private
   * @param {string} metricName - Metric name
   * @param {number} value - Current value
   * @returns {object|null} `{ zscore, severity }` (critical above 3, warning
   *   above 2) or null when there is no anomaly or not enough data
   */
  _detectAnomaly(metricName, value) {
    if (!Number.isFinite(value)) {
      // A z-score is meaningless for NaN/Infinity
      return null;
    }

    const history = this.metricHistory.get(metricName);

    if (!history || history.length < 30) {
      // Need at least 30 samples for statistical analysis
      return null;
    }

    const values = history.map(h => h.value);
    const mean = values.reduce((a, b) => a + b, 0) / values.length;
    const variance = values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
    const stdDev = Math.sqrt(variance);

    if (stdDev === 0) {
      return null; // No variance, can't detect anomalies
    }

    const zscore = Math.abs((value - mean) / stdDev);

    // Anomaly thresholds
    if (zscore > 3) {
      return { zscore, severity: AlertSeverity.CRITICAL };
    } else if (zscore > 2) {
      return { zscore, severity: AlertSeverity.WARNING };
    }

    return null;
  }

  /**
   * Get active alerts
   * @returns {object[]} Active alerts (a new array on every call)
   */
  getActiveAlerts() {
    return Array.from(this.activeAlerts.values());
  }

  /**
   * Get alert history, newest first.
   * @param {object} filters - Filter options
   * @param {number} [filters.limit=100] - Maximum number of alerts; zero or a
   *   negative number yields an empty array
   * @param {string} [filters.severity] - Filter by severity
   * @param {string} [filters.metric] - Filter by metric
   * @returns {object[]} Alert history
   */
  getAlertHistory(filters = {}) {
    let history = [...this.alertHistory];

    if (filters.severity) {
      history = history.filter(a => a.severity === filters.severity);
    }

    if (filters.metric) {
      history = history.filter(a => a.metric === filters.metric);
    }

    // Only a missing (or NaN) limit falls back to the default; an explicit 0
    // must not be treated as "unset".
    const requested = filters.limit;
    const limit = requested == null || Number.isNaN(requested) ? 100 : requested;
    if (limit <= 0) {
      // slice(-0) is slice(0) and would return the whole history
      return [];
    }
    return history.slice(-limit).reverse();
  }

  /**
   * Get alert statistics. `total` counts history entries, so a threshold
   * alert that fired and resolved contributes two entries.
   * @returns {object} `{ total, active, bySeverity, rules, webhooks }`
   */
  getStatistics() {
    const total = this.alertHistory.length;
    const active = this.activeAlerts.size;
    const bySeverity = {
      info: 0,
      warning: 0,
      critical: 0,
    };

    this.alertHistory.forEach(alert => {
      // `?? 0` keeps the counter numeric even for an unexpected severity key
      bySeverity[alert.severity] = (bySeverity[alert.severity] ?? 0) + 1;
    });

    return {
      total,
      active,
      bySeverity,
      rules: this.rules.size,
      webhooks: this.webhooks.length,
    };
  }

  /**
   * Clear alert history, recorded metric samples and active alerts
   * (for testing). Rules and webhooks are kept.
   */
  clearHistory() {
    this.alertHistory = [];
    this.metricHistory.clear();
    this.activeAlerts.clear();
  }
}
|
|
426
|
+
|
|
427
|
+
/**
 * Factory for AlertManager: builds a new instance from the given
 * configuration object.
 * @param {object} config - Configuration forwarded unchanged to the AlertManager constructor
 * @returns {AlertManager} Newly constructed alert manager
 */
export function createAlertManager(config = {}) {
  const manager = new AlertManager(config);
  return manager;
}
|
|
435
|
+
|
|
436
|
+
// Default export of this module: the AlertManager class.
export default AlertManager;
|