@unrdf/observability 26.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,436 @@
1
+ /**
2
+ * @unrdf/observability - Alert Manager
3
+ *
4
+ * Threshold-based alerting and anomaly detection for workflow metrics.
5
+ * Supports webhook notifications, alert rules, and smart anomaly detection
6
+ * using statistical analysis.
7
+ *
8
+ * @module @unrdf/observability/alerts
9
+ */
10
+
11
+ import { EventEmitter } from 'node:events';
12
+ import { z } from 'zod';
13
+
14
/**
 * Alert severity levels.
 *
 * The values are the same strings accepted by the `severity` field of an
 * alert rule. The object is frozen so that one consumer cannot accidentally
 * change a severity value for every other user of this module.
 *
 * @readonly
 * @enum {string}
 */
export const AlertSeverity = Object.freeze({
  INFO: 'info',
  WARNING: 'warning',
  CRITICAL: 'critical',
});
22
+
23
/**
 * Validation schema for a threshold alert rule.
 *
 * Required fields: `id`, `name`, `metric`, `threshold`, `operator`.
 * Filled in when omitted: `severity` ('warning'), `duration` (60000) and
 * `enabled` (true). `labels` and `annotations` are optional string maps.
 *
 * @type {z.ZodObject}
 */
const AlertRuleSchema = z.object({
  // Identity of the rule
  id: z.string(),
  name: z.string(),

  // Condition: the named metric is compared to `threshold` with `operator`
  metric: z.string(),
  threshold: z.number(),
  operator: z.enum(['gt', 'lt', 'gte', 'lte', 'eq']),

  // Classification and timing; duration falls back to one minute
  severity: z.enum(['info', 'warning', 'critical']).default('warning'),
  duration: z.number().min(0).default(60 * 1000),

  // Free-form string metadata attached to the rule
  labels: z.record(z.string()).optional(),
  annotations: z.record(z.string()).optional(),

  // Rules are active unless explicitly disabled
  enabled: z.boolean().default(true),
});
39
+
40
/**
 * Validation schema for a webhook endpoint.
 *
 * Only `url` is required. `method` falls back to 'POST' and `timeout`
 * (at least 1000) falls back to 5000; `headers` is an optional string map.
 *
 * @type {z.ZodObject}
 */
const WebhookConfigSchema = z.object({
  // Destination; must parse as a URL
  url: z.string().url(),

  // HTTP verb used for the notification request
  method: z.enum(['POST', 'PUT', 'PATCH']).default('POST'),

  // Extra request headers supplied by the caller
  headers: z.record(z.string()).optional(),

  // Request timeout; five seconds when omitted
  timeout: z.number().min(1000).default(5 * 1000),
});
50
+
51
/**
 * AlertManager - threshold alerting and z-score anomaly detection for metrics.
 *
 * Behavior implemented by this class:
 * - Each metric sample is compared against the enabled rules registered for
 *   that metric. An `alert` event is emitted when a rule starts matching and
 *   an `alert:resolved` event when it stops matching.
 * - Optionally, a sample is flagged as an anomaly when its z-score against
 *   the recorded history of the same metric is above 2 (warning) or above 3
 *   (critical).
 * - Every fired or resolved alert is sent as JSON to each configured webhook.
 *   Delivery failures are reported through the `webhook:error` event and
 *   never reject the caller.
 * - A bounded alert history and a map of currently active alerts are kept.
 *
 * Note: the rule field `duration` and the option `checkInterval` are stored
 * but are not consulted by any method of this class.
 *
 * @class
 * @extends EventEmitter
 */
export class AlertManager extends EventEmitter {
  /**
   * @param {object} config - Alert manager configuration
   * @param {object[]} [config.rules=[]] - Initial alert rules
   * @param {object[]} [config.webhooks=[]] - Webhook endpoints
   * @param {number} [config.checkInterval=10000] - Rule check interval in ms
   * @param {boolean} [config.enableAnomalyDetection=true] - Enable anomaly detection
   * @param {number} [config.maxAlertHistory=10000] - Maximum number of history
   *   entries retained; the oldest entries are dropped first. Pass `Infinity`
   *   to keep everything.
   */
  constructor(config = {}) {
    super();
    this.rules = new Map();
    this.webhooks = [];
    this.alertHistory = [];
    this.metricHistory = new Map();
    this.activeAlerts = new Map();
    // metric name -> id of the anomaly alert currently held in activeAlerts.
    // Anomaly alerts have no resolve path of their own, so this index is what
    // lets a newer sample replace or clear the previous one.
    this._activeAnomalyIds = new Map();
    this.config = {
      checkInterval: 10000,
      enableAnomalyDetection: true,
      maxAlertHistory: 10000,
      ...config,
    };

    // Initialize rules and webhooks
    if (config.rules) {
      config.rules.forEach(rule => this.addRule(rule));
    }
    if (config.webhooks) {
      config.webhooks.forEach(webhook => this.addWebhook(webhook));
    }
  }

  /**
   * Add alert rule. A rule with an id that is already registered replaces
   * the existing rule and starts with a fresh, untriggered state.
   * @param {object} rule - Alert rule configuration
   * @returns {string} Rule ID
   * @throws If the rule does not satisfy AlertRuleSchema
   */
  addRule(rule) {
    const validated = AlertRuleSchema.parse(rule);
    this.rules.set(validated.id, {
      ...validated,
      state: {
        triggered: false,
        triggeredAt: null,
        count: 0,
      },
    });
    return validated.id;
  }

  /**
   * Remove alert rule
   * @param {string} ruleId - Rule ID to remove
   * @returns {boolean} True if a rule with that id existed and was removed
   */
  removeRule(ruleId) {
    return this.rules.delete(ruleId);
  }

  /**
   * Enable/disable alert rule. Unknown rule ids are ignored.
   * @param {string} ruleId - Rule ID
   * @param {boolean} enabled - Enable state
   */
  setRuleEnabled(ruleId, enabled) {
    const rule = this.rules.get(ruleId);
    if (rule) {
      rule.enabled = enabled;
    }
  }

  /**
   * Add webhook endpoint
   * @param {object} webhook - Webhook configuration
   * @throws If the webhook does not satisfy WebhookConfigSchema
   */
  addWebhook(webhook) {
    const validated = WebhookConfigSchema.parse(webhook);
    this.webhooks.push(validated);
  }

  /**
   * Evaluate metric value against alert rules, then run anomaly detection
   * when it is enabled.
   * @param {string} metricName - Metric name
   * @param {number} value - Current metric value
   * @param {Record<string, string>} labels - Metric labels
   * @returns {Promise<void>} Settles after all resulting webhooks were attempted
   */
  async evaluateMetric(metricName, value, labels = {}) {
    // Store metric history for anomaly detection
    this._recordMetricValue(metricName, value);

    // Check threshold-based rules
    for (const rule of this.rules.values()) {
      if (!rule.enabled || rule.metric !== metricName) {
        continue;
      }

      const triggered = this._evaluateRule(rule, value);

      if (triggered && !rule.state.triggered) {
        // New alert triggered
        rule.state.triggered = true;
        rule.state.triggeredAt = Date.now();
        rule.state.count++;

        const alert = this._createAlert(rule, value, labels);
        await this._fireAlert(alert);
      } else if (!triggered && rule.state.triggered) {
        // Alert resolved. triggeredAt is left untouched so the resolved alert
        // gets the same id as the firing one and removes it from activeAlerts.
        rule.state.triggered = false;
        const alert = this._createAlert(rule, value, labels, true);
        await this._resolveAlert(alert);
      }
    }

    // Check for anomalies
    if (this.config.enableAnomalyDetection) {
      // The newest sample decides the anomaly state of the metric: drop the
      // previous anomaly alert so activeAlerts holds at most one per metric
      // instead of growing with every anomalous sample.
      const previousId = this._activeAnomalyIds.get(metricName);
      if (previousId !== undefined) {
        this.activeAlerts.delete(previousId);
        this._activeAnomalyIds.delete(metricName);
      }

      const anomaly = this._detectAnomaly(metricName, value);
      if (anomaly) {
        const alert = {
          id: `anomaly-${metricName}-${Date.now()}`,
          type: 'anomaly',
          metric: metricName,
          value,
          zscore: anomaly.zscore,
          severity: anomaly.severity,
          timestamp: Date.now(),
          labels,
        };
        this._activeAnomalyIds.set(metricName, alert.id);
        await this._fireAlert(alert);
      }
    }
  }

  /**
   * Evaluate rule against value
   * @private
   * @param {object} rule - Alert rule
   * @param {number} value - Metric value
   * @returns {boolean} True if rule triggered; false for an unknown operator
   */
  _evaluateRule(rule, value) {
    const { threshold, operator } = rule;

    switch (operator) {
      case 'gt':
        return value > threshold;
      case 'lt':
        return value < threshold;
      case 'gte':
        return value >= threshold;
      case 'lte':
        return value <= threshold;
      case 'eq':
        return value === threshold;
      default:
        return false;
    }
  }

  /**
   * Create alert object for a rule.
   * @private
   * @param {object} rule - Rule (with its `state`) the alert belongs to
   * @param {number} value - Metric value that caused the transition
   * @param {Record<string, string>} labels - Metric labels; rule labels win on conflict
   * @param {boolean} [resolved=false] - Build a 'resolved' alert instead of a 'firing' one
   * @returns {object} Alert payload
   */
  _createAlert(rule, value, labels, resolved = false) {
    return {
      id: `${rule.id}-${rule.state.triggeredAt}`,
      ruleId: rule.id,
      name: rule.name,
      metric: rule.metric,
      value,
      threshold: rule.threshold,
      operator: rule.operator,
      severity: rule.severity,
      labels: { ...labels, ...rule.labels },
      annotations: rule.annotations || {},
      status: resolved ? 'resolved' : 'firing',
      startsAt: rule.state.triggeredAt,
      endsAt: resolved ? Date.now() : null,
    };
  }

  /**
   * Append an alert to the history, dropping the oldest entries once the
   * configured `maxAlertHistory` is exceeded.
   * @private
   * @param {object} alert - Alert payload
   */
  _pushHistory(alert) {
    this.alertHistory.push(alert);

    const overflow = this.alertHistory.length - this.config.maxAlertHistory;
    if (overflow > 0) {
      this.alertHistory.splice(0, overflow);
    }
  }

  /**
   * Fire alert: mark it active, record it, emit `alert`, notify webhooks.
   * @private
   * @param {object} alert - Alert payload
   */
  async _fireAlert(alert) {
    this.activeAlerts.set(alert.id, alert);
    this._pushHistory(alert);

    // Emit alert event
    this.emit('alert', alert);

    // Send webhooks
    await this._sendWebhooks(alert);
  }

  /**
   * Resolve alert: drop it from the active set, record it, emit
   * `alert:resolved`, notify webhooks.
   * @private
   * @param {object} alert - Alert payload
   */
  async _resolveAlert(alert) {
    this.activeAlerts.delete(alert.id);
    this._pushHistory(alert);

    // Emit resolution event
    this.emit('alert:resolved', alert);

    // Send webhooks
    await this._sendWebhooks(alert);
  }

  /**
   * Send webhook notifications to all configured endpoints in parallel.
   * A failed or timed-out delivery emits `webhook:error` with
   * `{ webhook, alert, error }`; this method itself never rejects.
   * @private
   * @param {object} alert - Alert payload, serialized as the JSON request body
   */
  async _sendWebhooks(alert) {
    const deliveries = this.webhooks.map(async webhook => {
      let timer;
      try {
        const controller = new AbortController();
        timer = setTimeout(() => controller.abort(), webhook.timeout);

        const response = await fetch(webhook.url, {
          method: webhook.method,
          headers: {
            'Content-Type': 'application/json',
            ...webhook.headers,
          },
          body: JSON.stringify(alert),
          signal: controller.signal,
        });

        if (!response.ok) {
          throw new Error(`Webhook failed: ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        this.emit('webhook:error', { webhook, alert, error });
      } finally {
        // Always cancel the abort timer. Clearing it only on success would
        // leave a pending timer after a failed request, keeping the event
        // loop alive until the timeout elapses.
        clearTimeout(timer);
      }
    });

    await Promise.allSettled(deliveries);
  }

  /**
   * Record metric value for history. Only the most recent 1000 samples per
   * metric are kept. Non-finite values (NaN, Infinity, non-numbers) are not
   * recorded because a single one would make the mean and standard deviation
   * unusable for as long as it stays in the window.
   * @private
   * @param {string} metricName - Metric name
   * @param {number} value - Sample value
   */
  _recordMetricValue(metricName, value) {
    if (!Number.isFinite(value)) {
      return;
    }

    if (!this.metricHistory.has(metricName)) {
      this.metricHistory.set(metricName, []);
    }

    const history = this.metricHistory.get(metricName);
    history.push({ value, timestamp: Date.now() });

    // Keep last 1000 values
    if (history.length > 1000) {
      history.shift();
    }
  }

  /**
   * Detect anomalies using z-score analysis over the recorded history of the
   * metric (population standard deviation). When called from evaluateMetric
   * the current sample has already been recorded, so it contributes to the
   * mean and standard deviation it is compared against.
   * @private
   * @param {string} metricName - Metric name
   * @param {number} value - Current value
   * @returns {object|null} `{ zscore, severity }` or null when there are
   *   fewer than 30 samples, no variance, or the z-score is at most 2
   */
  _detectAnomaly(metricName, value) {
    const history = this.metricHistory.get(metricName);

    if (!history || history.length < 30) {
      // Need at least 30 samples for statistical analysis
      return null;
    }

    const values = history.map(h => h.value);
    const mean = values.reduce((a, b) => a + b, 0) / values.length;
    const variance = values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
    const stdDev = Math.sqrt(variance);

    if (stdDev === 0) {
      return null; // No variance, can't detect anomalies
    }

    const zscore = Math.abs((value - mean) / stdDev);

    // Anomaly thresholds
    if (zscore > 3) {
      return { zscore, severity: AlertSeverity.CRITICAL };
    } else if (zscore > 2) {
      return { zscore, severity: AlertSeverity.WARNING };
    }

    return null;
  }

  /**
   * Get active alerts
   * @returns {object[]} Active alerts
   */
  getActiveAlerts() {
    return Array.from(this.activeAlerts.values());
  }

  /**
   * Get alert history, newest first.
   * @param {object} filters - Filter options
   * @param {number} [filters.limit=100] - Maximum number of alerts; a limit
   *   that is not greater than zero yields an empty array
   * @param {string} [filters.severity] - Filter by severity
   * @param {string} [filters.metric] - Filter by metric
   * @returns {object[]} Alert history
   */
  getAlertHistory(filters = {}) {
    const { severity, metric } = filters;

    // `??` keeps an explicit 0 instead of silently turning it into 100.
    const limit = filters.limit ?? 100;
    if (!(limit > 0)) {
      // slice(-0) would return the whole array and a negative limit would
      // slice from the wrong end, so handle these cases up front.
      return [];
    }

    const matches = this.alertHistory.filter(
      alert =>
        (!severity || alert.severity === severity) &&
        (!metric || alert.metric === metric),
    );

    return matches.slice(-limit).reverse();
  }

  /**
   * Get alert statistics, computed over the retained alert history.
   * @returns {object} `{ total, active, bySeverity, rules, webhooks }`
   */
  getStatistics() {
    const bySeverity = {
      info: 0,
      warning: 0,
      critical: 0,
    };

    for (const { severity } of this.alertHistory) {
      // Count severities outside the preset keys under their own key rather
      // than producing NaN from `undefined++`.
      const current = Object.hasOwn(bySeverity, severity) ? bySeverity[severity] : 0;
      bySeverity[severity] = current + 1;
    }

    return {
      total: this.alertHistory.length,
      active: this.activeAlerts.size,
      bySeverity,
      rules: this.rules.size,
      webhooks: this.webhooks.length,
    };
  }

  /**
   * Clear alert history, metric history and active alerts (for testing).
   * Registered rules and webhooks are kept.
   */
  clearHistory() {
    this.alertHistory = [];
    this.metricHistory.clear();
    this.activeAlerts.clear();
    this._activeAnomalyIds.clear();
  }
}
426
+
427
/**
 * Factory helper: builds an AlertManager from the given configuration.
 * @param {object} config - Configuration passed unchanged to the constructor
 * @returns {AlertManager} Alert manager instance
 */
export function createAlertManager(config = {}) {
  const manager = new AlertManager(config);
  return manager;
}
435
+
436
+ export default AlertManager;