@unrdf/observability 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +10 -0
- package/IMPLEMENTATION-SUMMARY.md +478 -0
- package/LICENSE +21 -0
- package/README.md +482 -0
- package/capability-map.md +90 -0
- package/config/alert-rules.yml +269 -0
- package/config/prometheus.yml +136 -0
- package/dashboards/grafana-unrdf.json +798 -0
- package/dashboards/unrdf-workflow-dashboard.json +295 -0
- package/docs/OBSERVABILITY-PATTERNS.md +681 -0
- package/docs/OBSERVABILITY-RUNBOOK.md +554 -0
- package/examples/observability-demo.mjs +334 -0
- package/package.json +46 -0
- package/src/advanced-metrics.mjs +413 -0
- package/src/alerts/alert-manager.mjs +436 -0
- package/src/custom-events.mjs +558 -0
- package/src/distributed-tracing.mjs +352 -0
- package/src/exporters/grafana-exporter.mjs +415 -0
- package/src/index.mjs +61 -0
- package/src/metrics/workflow-metrics.mjs +346 -0
- package/src/receipts/anchor.mjs +155 -0
- package/src/receipts/index.mjs +62 -0
- package/src/receipts/merkle-tree.mjs +188 -0
- package/src/receipts/receipt-chain.mjs +209 -0
- package/src/receipts/receipt-schema.mjs +128 -0
- package/src/receipts/tamper-detection.mjs +219 -0
- package/test/advanced-metrics.test.mjs +302 -0
- package/test/custom-events.test.mjs +387 -0
- package/test/distributed-tracing.test.mjs +314 -0
- package/validation/observability-validation.mjs +366 -0
- package/vitest.config.mjs +25 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file Advanced OpenTelemetry Metrics
|
|
3
|
+
* @module observability/advanced-metrics
|
|
4
|
+
*
|
|
5
|
+
* @description
|
|
6
|
+
* Advanced business metrics, latency histograms, throughput tracking,
|
|
7
|
+
* and resource utilization monitoring with zero performance impact.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { metrics } from '@opentelemetry/api';
|
|
11
|
+
import { z } from 'zod';
|
|
12
|
+
|
|
13
|
+
/**
 * Default histogram bucket boundaries, named once so the per-field defaults
 * and the whole-object default below cannot drift apart.
 */
const DEFAULT_LATENCY_BUCKETS = [1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000];
const DEFAULT_THROUGHPUT_BUCKETS = [1, 10, 50, 100, 500, 1000, 5000];
const DEFAULT_MEMORY_BUCKETS = [1e6, 10e6, 50e6, 100e6, 500e6, 1e9, 5e9];

/**
 * Schema for a list of numeric bucket boundaries with a default value.
 *
 * @param {number[]} defaults - Boundaries used when the field is omitted
 * @returns {import('zod').ZodTypeAny} Array-of-number schema with the default applied
 */
const bucketList = (defaults) => z.array(z.number()).default([...defaults]);

/**
 * Advanced metrics configuration schema
 *
 * Every field is optional; omitted fields fall back to the defaults below.
 * - serviceName / serviceVersion: strings handed to the meter lookup
 * - enabled: boolean switch for all recording
 * - samplingRate: number between 0 and 1 (inclusive)
 * - buckets: histogram boundaries for latency, throughput, and memory
 */
export const AdvancedMetricsConfigSchema = z.object({
  serviceName: z.string().default('unrdf'),
  serviceVersion: z.string().default('6.0.0'),
  enabled: z.boolean().default(true),
  samplingRate: z.number().min(0).max(1).default(0.01),
  buckets: z
    .object({
      latency: bucketList(DEFAULT_LATENCY_BUCKETS),
      throughput: bucketList(DEFAULT_THROUGHPUT_BUCKETS),
      memory: bucketList(DEFAULT_MEMORY_BUCKETS),
    })
    .default({
      latency: [...DEFAULT_LATENCY_BUCKETS],
      throughput: [...DEFAULT_THROUGHPUT_BUCKETS],
      memory: [...DEFAULT_MEMORY_BUCKETS],
    }),
});
|
|
33
|
+
|
|
34
|
+
/**
 * Advanced metrics manager
 *
 * Wraps an OpenTelemetry meter and exposes helpers for recording:
 * - Business metrics (operation counts, failures by type, SLA violations)
 * - Latency (histogram plus caller-supplied P50/P90/P95/P99/max values)
 * - Throughput (ops/sec per operation and the peak value seen so far)
 * - Resource utilization (memory, event loop lag, derived CPU load estimate)
 *
 * Gauge-like values (success rate, percentiles, throughput, memory, CPU load)
 * are backed by UpDownCounters. Because `add()` accumulates, every update is
 * written as the delta from the previously reported value, so the
 * instrument's running total always equals the most recent reading.
 *
 * `recordOperation` honours `config.samplingRate`: operations that are not
 * sampled are not counted at all, which also applies to the throughput
 * figures derived from those counts.
 */
export class AdvancedMetrics {
  /**
   * Create advanced metrics manager
   *
   * @param {Object} [config] - Configuration options, validated against
   *   AdvancedMetricsConfigSchema (throws if validation fails)
   */
  constructor(config = {}) {
    this.config = AdvancedMetricsConfigSchema.parse(config);
    this.meter = metrics.getMeter(this.config.serviceName, this.config.serviceVersion);

    // Instrument groups; they stay null when metrics are disabled
    this.businessMetrics = null;
    this.latencyMetrics = null;
    this.throughputMetrics = null;
    this.resourceMetrics = null;

    // Internal state for throughput calculation
    this.operationCounts = new Map();
    this.lastThroughputCalculation = Date.now();

    // Last value reported per gauge-like instrument: instrument -> (attribute key -> value)
    this.gaugeValues = new Map();
    // Highest ops/sec reported so far, keyed by operation
    this.peakThroughput = new Map();

    if (this.config.enabled) {
      this._initializeMetrics();
    }
  }

  /**
   * Initialize all metrics
   * @private
   */
  _initializeMetrics() {
    this._initializeBusinessMetrics();
    this._initializeLatencyMetrics();
    this._initializeThroughputMetrics();
    this._initializeResourceMetrics();
  }

  /**
   * Initialize business metrics
   * @private
   */
  _initializeBusinessMetrics() {
    this.businessMetrics = {
      // Operation counters by type and result
      operations: this.meter.createCounter('business.operations.total', {
        description: 'Total business operations by type and result',
      }),

      // Success rate gauge
      successRate: this.meter.createUpDownCounter('business.success_rate', {
        description: 'Success rate for operations (0-1)',
      }),

      // Failure rate by error type
      failuresByType: this.meter.createCounter('business.failures.by_type', {
        description: 'Failures categorized by error type',
      }),

      // SLA violations
      slaViolations: this.meter.createCounter('business.sla_violations', {
        description: 'SLA violations by operation and threshold',
      }),
    };
  }

  /**
   * Initialize latency metrics with percentiles
   * @private
   */
  _initializeLatencyMetrics() {
    this.latencyMetrics = {
      // Histogram with explicit buckets for percentiles
      histogram: this.meter.createHistogram('latency.operation_duration_ms', {
        description: 'Operation latency in milliseconds',
        unit: 'ms',
        advice: {
          explicitBucketBoundaries: this.config.buckets.latency,
        },
      }),

      // P50 latency gauge
      p50: this.meter.createUpDownCounter('latency.p50_ms', {
        description: 'P50 (median) latency in milliseconds',
        unit: 'ms',
      }),

      // P90 latency gauge
      p90: this.meter.createUpDownCounter('latency.p90_ms', {
        description: 'P90 latency in milliseconds',
        unit: 'ms',
      }),

      // P95 latency gauge
      p95: this.meter.createUpDownCounter('latency.p95_ms', {
        description: 'P95 latency in milliseconds',
        unit: 'ms',
      }),

      // P99 latency gauge
      p99: this.meter.createUpDownCounter('latency.p99_ms', {
        description: 'P99 latency in milliseconds',
        unit: 'ms',
      }),

      // Max latency
      max: this.meter.createUpDownCounter('latency.max_ms', {
        description: 'Maximum latency in milliseconds',
        unit: 'ms',
      }),
    };
  }

  /**
   * Initialize throughput metrics
   * @private
   */
  _initializeThroughputMetrics() {
    this.throughputMetrics = {
      // Operations per second
      opsPerSecond: this.meter.createUpDownCounter('throughput.ops_per_second', {
        description: 'Operations per second',
        unit: '1/s',
      }),

      // Throughput histogram
      histogram: this.meter.createHistogram('throughput.rate', {
        description: 'Throughput rate histogram',
        unit: '1/s',
        advice: {
          explicitBucketBoundaries: this.config.buckets.throughput,
        },
      }),

      // Peak throughput
      peak: this.meter.createUpDownCounter('throughput.peak_ops_per_second', {
        description: 'Peak operations per second',
        unit: '1/s',
      }),
    };
  }

  /**
   * Initialize resource utilization metrics
   * @private
   */
  _initializeResourceMetrics() {
    this.resourceMetrics = {
      // Memory usage histogram
      memoryHistogram: this.meter.createHistogram('resource.memory_bytes', {
        description: 'Memory usage in bytes',
        unit: 'By',
        advice: {
          explicitBucketBoundaries: this.config.buckets.memory,
        },
      }),

      // Heap used
      heapUsed: this.meter.createUpDownCounter('resource.heap_used_bytes', {
        description: 'Heap memory used in bytes',
        unit: 'By',
      }),

      // Heap total
      heapTotal: this.meter.createUpDownCounter('resource.heap_total_bytes', {
        description: 'Total heap memory in bytes',
        unit: 'By',
      }),

      // External memory
      external: this.meter.createUpDownCounter('resource.external_bytes', {
        description: 'External memory used in bytes',
        unit: 'By',
      }),

      // CPU usage (approximated from event loop lag)
      cpuLoad: this.meter.createUpDownCounter('resource.cpu_load', {
        description: 'CPU load estimate (0-1)',
      }),

      // Event loop lag
      eventLoopLag: this.meter.createHistogram('resource.event_loop_lag_ms', {
        description: 'Event loop lag in milliseconds',
        unit: 'ms',
      }),
    };
  }

  /**
   * Report the current value of a gauge-like quantity through an
   * UpDownCounter.
   *
   * The counter only supports additive updates, so the difference between
   * the new reading and the last one reported for the same instrument and
   * attribute set is added. Non-finite readings are ignored so that one bad
   * value cannot corrupt the running total.
   *
   * @param {Object} instrument - UpDownCounter-like object exposing `add`
   * @param {number} value - Current reading
   * @param {Object} [attributes] - Attributes identifying the series
   * @private
   */
  _setGauge(instrument, value, attributes) {
    if (!Number.isFinite(value)) return;

    let series = this.gaugeValues.get(instrument);
    if (!series) {
      series = new Map();
      this.gaugeValues.set(instrument, series);
    }

    const seriesKey = JSON.stringify(attributes ?? {});
    const previous = series.get(seriesKey) ?? 0;

    instrument.add(value - previous, attributes);
    series.set(seriesKey, value);
  }

  /**
   * Record a business operation
   *
   * Does nothing when metrics are disabled or when the call is not selected
   * by sampling.
   *
   * @param {Object} options - Operation options
   * @param {string} options.operation - Operation type
   * @param {boolean} options.success - Whether operation succeeded
   * @param {number} options.duration - Operation duration in ms
   * @param {string} [options.errorType] - Error type if failed
   * @param {number} [options.slaThreshold] - SLA threshold in ms
   */
  recordOperation({ operation, success, duration, errorType, slaThreshold }) {
    if (!this.config.enabled || !this._shouldSample()) return;

    // Record operation count
    this.businessMetrics.operations.add(1, {
      operation,
      result: success ? 'success' : 'failure',
    });

    // Record failure by type
    if (!success && errorType) {
      this.businessMetrics.failuresByType.add(1, {
        operation,
        error_type: errorType,
      });
    }

    // Record SLA violation
    if (slaThreshold && duration > slaThreshold) {
      this.businessMetrics.slaViolations.add(1, {
        operation,
        threshold: slaThreshold.toString(),
      });
    }

    // Record latency; a missing or non-numeric duration is skipped
    if (Number.isFinite(duration)) {
      this.latencyMetrics.histogram.record(duration, { operation });
    }

    // Update throughput tracking
    this._updateThroughput(operation);
  }

  /**
   * Record success rate
   *
   * @param {string} operation - Operation type
   * @param {number} rate - Success rate (0-1)
   */
  recordSuccessRate(operation, rate) {
    if (!this.config.enabled) return;

    this._setGauge(this.businessMetrics.successRate, rate, { operation });
  }

  /**
   * Record latency percentiles
   *
   * Percentiles that are missing or not finite numbers are skipped.
   *
   * @param {string} operation - Operation type
   * @param {Object} percentiles - Percentile values
   * @param {number} percentiles.p50 - P50 latency
   * @param {number} percentiles.p90 - P90 latency
   * @param {number} percentiles.p95 - P95 latency
   * @param {number} percentiles.p99 - P99 latency
   * @param {number} percentiles.max - Max latency
   */
  recordLatencyPercentiles(operation, { p50, p90, p95, p99, max } = {}) {
    if (!this.config.enabled) return;

    const attrs = { operation };

    this._setGauge(this.latencyMetrics.p50, p50, attrs);
    this._setGauge(this.latencyMetrics.p90, p90, attrs);
    this._setGauge(this.latencyMetrics.p95, p95, attrs);
    this._setGauge(this.latencyMetrics.p99, p99, attrs);
    this._setGauge(this.latencyMetrics.max, max, attrs);
  }

  /**
   * Record throughput
   *
   * Updates the current ops/sec value, the throughput histogram, and the
   * peak value when the new reading exceeds every earlier one for the
   * operation.
   *
   * @param {string} operation - Operation type
   * @param {number} opsPerSecond - Operations per second
   */
  recordThroughput(operation, opsPerSecond) {
    if (!this.config.enabled) return;
    if (!Number.isFinite(opsPerSecond)) return;

    const attrs = { operation };

    this._setGauge(this.throughputMetrics.opsPerSecond, opsPerSecond, attrs);
    this.throughputMetrics.histogram.record(opsPerSecond, attrs);

    const previousPeak = this.peakThroughput.get(operation) ?? 0;
    if (opsPerSecond > previousPeak) {
      this.peakThroughput.set(operation, opsPerSecond);
      this._setGauge(this.throughputMetrics.peak, opsPerSecond, attrs);
    }
  }

  /**
   * Record resource utilization
   *
   * Reads `process.memoryUsage()` and reports heap used, heap total, and
   * external memory; heap used is also recorded in the memory histogram.
   */
  recordResourceUtilization() {
    if (!this.config.enabled) return;

    const memUsage = process.memoryUsage();

    this.resourceMetrics.memoryHistogram.record(memUsage.heapUsed);
    this._setGauge(this.resourceMetrics.heapUsed, memUsage.heapUsed);
    this._setGauge(this.resourceMetrics.heapTotal, memUsage.heapTotal);
    this._setGauge(this.resourceMetrics.external, memUsage.external);
  }

  /**
   * Record event loop lag
   *
   * @param {number} lag - Lag in milliseconds
   */
  recordEventLoopLag(lag) {
    if (!this.config.enabled) return;

    this.resourceMetrics.eventLoopLag.record(lag);

    // Estimate CPU load from lag (simplified model): 100 ms of lag or more
    // maps to 1, and the estimate is clamped to the documented 0-1 range.
    const cpuLoad = Math.max(0, Math.min(1, lag / 100));
    this._setGauge(this.resourceMetrics.cpuLoad, cpuLoad);
  }

  /**
   * Update throughput tracking
   *
   * Counts the operation and, once at least one second has passed since the
   * previous calculation, reports ops/sec for every operation counted in
   * that window and starts a new window. The calculation only runs when an
   * operation is recorded; there is no timer.
   *
   * @param {string} operation - Operation type
   * @private
   */
  _updateThroughput(operation) {
    const count = (this.operationCounts.get(operation) || 0) + 1;
    this.operationCounts.set(operation, count);

    // Calculate throughput every second
    const now = Date.now();
    const elapsed = now - this.lastThroughputCalculation;

    if (elapsed >= 1000) {
      for (const [op, opCount] of this.operationCounts.entries()) {
        const opsPerSecond = (opCount / elapsed) * 1000;
        this.recordThroughput(op, opsPerSecond);
      }

      this.operationCounts.clear();
      this.lastThroughputCalculation = now;
    }
  }

  /**
   * Determine if this measurement should be sampled
   *
   * @returns {boolean} True if should sample
   * @private
   */
  _shouldSample() {
    return Math.random() < this.config.samplingRate;
  }

  /**
   * Get current metrics summary
   *
   * @returns {Object} Summary with `enabled`, `samplingRate`, the operation
   *   types counted in the current throughput window, and the timestamp of
   *   the last throughput calculation
   */
  getSummary() {
    return {
      enabled: this.config.enabled,
      samplingRate: this.config.samplingRate,
      operationTypes: Array.from(this.operationCounts.keys()),
      lastThroughputCalculation: this.lastThroughputCalculation,
    };
  }
}
|
|
399
|
+
|
|
400
|
+
/**
 * Factory for an AdvancedMetrics manager.
 *
 * @param {Object} [config] - Configuration forwarded unchanged to the
 *   AdvancedMetrics constructor
 * @returns {AdvancedMetrics} Newly constructed metrics instance
 */
export function createAdvancedMetrics(config = {}) {
  const instance = new AdvancedMetrics(config);
  return instance;
}
|
|
409
|
+
|
|
410
|
+
/**
 * Shared advanced metrics instance built with an empty configuration.
 * It is constructed when this module is first evaluated.
 */
export const defaultAdvancedMetrics = createAdvancedMetrics({});
|