@unrdf/observability 26.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,413 @@
1
+ /**
2
+ * @file Advanced OpenTelemetry Metrics
3
+ * @module observability/advanced-metrics
4
+ *
5
+ * @description
6
+ * Advanced business metrics, latency histograms, throughput tracking,
7
+ * and resource utilization monitoring with zero performance impact.
8
+ */
9
+
10
+ import { metrics } from '@opentelemetry/api';
11
+ import { z } from 'zod';
12
+
13
/**
 * Default histogram bucket boundaries.
 *
 * Kept as module-private constants so the per-field defaults and the
 * whole-object default below are guaranteed to stay in sync. Each use spreads
 * a fresh copy so no two defaults share an array instance.
 */
const DEFAULT_LATENCY_BUCKETS = [1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000];
const DEFAULT_THROUGHPUT_BUCKETS = [1, 10, 50, 100, 500, 1000, 5000];
const DEFAULT_MEMORY_BUCKETS = [1e6, 10e6, 50e6, 100e6, 500e6, 1e9, 5e9];

/**
 * Zod schema describing the options accepted by `AdvancedMetrics`.
 *
 * Every field is optional and falls back to a default:
 * - `serviceName`    – string, defaults to `'unrdf'`
 * - `serviceVersion` – string, defaults to `'6.0.0'`
 * - `enabled`        – boolean, defaults to `true`
 * - `samplingRate`   – number in the inclusive range 0..1, defaults to `0.01`
 * - `buckets`        – histogram bucket boundaries for `latency`,
 *                      `throughput` and `memory`; each list has its own
 *                      default, and the whole object has a default too
 */
export const AdvancedMetricsConfigSchema = z.object({
  serviceName: z.string().default('unrdf'),
  serviceVersion: z.string().default('6.0.0'),
  enabled: z.boolean().default(true),
  samplingRate: z.number().min(0).max(1).default(0.01),
  buckets: z
    .object({
      latency: z.array(z.number()).default([...DEFAULT_LATENCY_BUCKETS]),
      throughput: z.array(z.number()).default([...DEFAULT_THROUGHPUT_BUCKETS]),
      memory: z.array(z.number()).default([...DEFAULT_MEMORY_BUCKETS]),
    })
    .default({
      latency: [...DEFAULT_LATENCY_BUCKETS],
      throughput: [...DEFAULT_THROUGHPUT_BUCKETS],
      memory: [...DEFAULT_MEMORY_BUCKETS],
    }),
});
33
+
34
/**
 * Advanced metrics manager
 *
 * Wraps an OpenTelemetry meter and exposes helpers for recording:
 * - Business metrics (operation counts, failures by type, SLA violations)
 * - Latency (histogram plus caller-computed P50/P90/P95/P99/max)
 * - Throughput (ops/sec, peak ops/sec)
 * - Resource utilization (process memory, event loop lag, estimated CPU load)
 *
 * Only `recordOperation` is subject to `config.samplingRate`; the other
 * `record*` methods record every call while the manager is enabled.
 *
 * "Last value" metrics (success rate, percentiles, ops/sec, heap sizes, CPU
 * load) are backed by UpDownCounters, which are additive. To make them report
 * the most recent value instead of the sum of every value ever reported, the
 * manager remembers the last value per instrument and attribute set and only
 * adds the difference (see `_setGauge`).
 */
export class AdvancedMetrics {
  /**
   * Create advanced metrics manager
   *
   * @param {Object} [config] - Configuration options, validated against
   *   `AdvancedMetricsConfigSchema`
   * @throws If `config` does not satisfy the schema (propagated from `parse`)
   */
  constructor(config = {}) {
    this.config = AdvancedMetricsConfigSchema.parse(config);
    this.meter = metrics.getMeter(this.config.serviceName, this.config.serviceVersion);

    // Instrument groups; they stay null when the manager is disabled
    this.businessMetrics = null;
    this.latencyMetrics = null;
    this.throughputMetrics = null;
    this.resourceMetrics = null;

    // Internal state for throughput calculation
    this.operationCounts = new Map();
    this.lastThroughputCalculation = Date.now();

    // instrument -> (series key -> last value reported through _setGauge)
    this._gaugeValues = new Map();
    // operation -> highest ops/sec seen by recordThroughput
    this._peakThroughput = new Map();

    if (this.config.enabled) {
      this._initializeMetrics();
    }
  }

  /**
   * Initialize all metrics
   * @private
   */
  _initializeMetrics() {
    this._initializeBusinessMetrics();
    this._initializeLatencyMetrics();
    this._initializeThroughputMetrics();
    this._initializeResourceMetrics();
  }

  /**
   * Initialize business metrics
   * @private
   */
  _initializeBusinessMetrics() {
    this.businessMetrics = {
      // Operation counters by type and result
      operations: this.meter.createCounter('business.operations.total', {
        description: 'Total business operations by type and result',
      }),

      // Success rate (last reported value, see _setGauge)
      successRate: this.meter.createUpDownCounter('business.success_rate', {
        description: 'Success rate for operations (0-1)',
      }),

      // Failure rate by error type
      failuresByType: this.meter.createCounter('business.failures.by_type', {
        description: 'Failures categorized by error type',
      }),

      // SLA violations
      slaViolations: this.meter.createCounter('business.sla_violations', {
        description: 'SLA violations by operation and threshold',
      }),
    };
  }

  /**
   * Initialize latency metrics with percentiles
   * @private
   */
  _initializeLatencyMetrics() {
    this.latencyMetrics = {
      // Histogram with explicit buckets for percentiles
      histogram: this.meter.createHistogram('latency.operation_duration_ms', {
        description: 'Operation latency in milliseconds',
        unit: 'ms',
        advice: {
          explicitBucketBoundaries: this.config.buckets.latency,
        },
      }),

      // P50 latency (last reported value)
      p50: this.meter.createUpDownCounter('latency.p50_ms', {
        description: 'P50 (median) latency in milliseconds',
        unit: 'ms',
      }),

      // P90 latency (last reported value)
      p90: this.meter.createUpDownCounter('latency.p90_ms', {
        description: 'P90 latency in milliseconds',
        unit: 'ms',
      }),

      // P95 latency (last reported value)
      p95: this.meter.createUpDownCounter('latency.p95_ms', {
        description: 'P95 latency in milliseconds',
        unit: 'ms',
      }),

      // P99 latency (last reported value)
      p99: this.meter.createUpDownCounter('latency.p99_ms', {
        description: 'P99 latency in milliseconds',
        unit: 'ms',
      }),

      // Max latency (last reported value)
      max: this.meter.createUpDownCounter('latency.max_ms', {
        description: 'Maximum latency in milliseconds',
        unit: 'ms',
      }),
    };
  }

  /**
   * Initialize throughput metrics
   * @private
   */
  _initializeThroughputMetrics() {
    this.throughputMetrics = {
      // Operations per second (last reported value)
      opsPerSecond: this.meter.createUpDownCounter('throughput.ops_per_second', {
        description: 'Operations per second',
        unit: '1/s',
      }),

      // Throughput histogram
      histogram: this.meter.createHistogram('throughput.rate', {
        description: 'Throughput rate histogram',
        unit: '1/s',
        advice: {
          explicitBucketBoundaries: this.config.buckets.throughput,
        },
      }),

      // Peak throughput (highest value passed to recordThroughput)
      peak: this.meter.createUpDownCounter('throughput.peak_ops_per_second', {
        description: 'Peak operations per second',
        unit: '1/s',
      }),
    };
  }

  /**
   * Initialize resource utilization metrics
   * @private
   */
  _initializeResourceMetrics() {
    this.resourceMetrics = {
      // Memory usage histogram
      memoryHistogram: this.meter.createHistogram('resource.memory_bytes', {
        description: 'Memory usage in bytes',
        unit: 'By',
        advice: {
          explicitBucketBoundaries: this.config.buckets.memory,
        },
      }),

      // Heap used (last reported value)
      heapUsed: this.meter.createUpDownCounter('resource.heap_used_bytes', {
        description: 'Heap memory used in bytes',
        unit: 'By',
      }),

      // Heap total (last reported value)
      heapTotal: this.meter.createUpDownCounter('resource.heap_total_bytes', {
        description: 'Total heap memory in bytes',
        unit: 'By',
      }),

      // External memory (last reported value)
      external: this.meter.createUpDownCounter('resource.external_bytes', {
        description: 'External memory used in bytes',
        unit: 'By',
      }),

      // CPU usage (approximated from event loop lag)
      cpuLoad: this.meter.createUpDownCounter('resource.cpu_load', {
        description: 'CPU load estimate (0-1)',
      }),

      // Event loop lag
      eventLoopLag: this.meter.createHistogram('resource.event_loop_lag_ms', {
        description: 'Event loop lag in milliseconds',
        unit: 'ms',
      }),
    };
  }

  /**
   * Build a stable key identifying one attribute set of an instrument.
   *
   * Keys are sorted so that `{a, b}` and `{b, a}` map to the same series.
   *
   * @param {Object} [attributes] - Metric attributes
   * @returns {string} Series key ('' when there are no attributes)
   * @private
   */
  _seriesKey(attributes) {
    if (!attributes) return '';

    const sortedEntries = Object.keys(attributes)
      .sort()
      .map((key) => [key, attributes[key]]);
    return JSON.stringify(sortedEntries);
  }

  /**
   * Report an absolute value through an additive UpDownCounter.
   *
   * UpDownCounter#add accumulates, so passing the absolute value on every
   * call would export the running sum of all reports. Instead the previous
   * value for the same instrument/attribute set is remembered and only the
   * difference is added, which makes the cumulative total equal to `value`.
   * Non-finite values (undefined, NaN, Infinity) are ignored so that one bad
   * report cannot poison the series.
   *
   * @param {Object} instrument - UpDownCounter-like object exposing `add`
   * @param {number} value - Absolute value the series should report
   * @param {Object} [attributes] - Metric attributes
   * @private
   */
  _setGauge(instrument, value, attributes) {
    if (!Number.isFinite(value)) return;

    let series = this._gaugeValues.get(instrument);
    if (series === undefined) {
      series = new Map();
      this._gaugeValues.set(instrument, series);
    }

    const key = this._seriesKey(attributes);
    const previous = series.get(key) || 0;
    series.set(key, value);
    instrument.add(value - previous, attributes);
  }

  /**
   * Record a business operation
   *
   * Subject to sampling: the call is dropped unless `_shouldSample()` returns
   * true. A sampled call increments the operation counter, optionally the
   * failure and SLA-violation counters, records the latency histogram and
   * feeds the throughput window.
   *
   * @param {Object} options - Operation options
   * @param {string} options.operation - Operation type
   * @param {boolean} options.success - Whether operation succeeded
   * @param {number} options.duration - Operation duration in ms
   * @param {string} [options.errorType] - Error type if failed
   * @param {number} [options.slaThreshold] - SLA threshold in ms; a falsy
   *   value (undefined, 0) disables the SLA check
   */
  recordOperation({ operation, success, duration, errorType, slaThreshold }) {
    if (!this.config.enabled || !this._shouldSample()) return;

    // Record operation count
    this.businessMetrics.operations.add(1, {
      operation,
      result: success ? 'success' : 'failure',
    });

    // Record failure by type
    if (!success && errorType) {
      this.businessMetrics.failuresByType.add(1, {
        operation,
        error_type: errorType,
      });
    }

    // Record SLA violation
    if (slaThreshold && duration > slaThreshold) {
      this.businessMetrics.slaViolations.add(1, {
        operation,
        threshold: slaThreshold.toString(),
      });
    }

    // Record latency
    this.latencyMetrics.histogram.record(duration, { operation });

    // Update throughput tracking
    this._updateThroughput(operation);
  }

  /**
   * Record success rate
   *
   * The most recent rate per operation replaces the previous one.
   *
   * @param {string} operation - Operation type
   * @param {number} rate - Success rate (0-1)
   */
  recordSuccessRate(operation, rate) {
    if (!this.config.enabled) return;

    this._setGauge(this.businessMetrics.successRate, rate, { operation });
  }

  /**
   * Record latency percentiles
   *
   * Each percentile replaces the previously reported value for the same
   * operation. Missing or non-finite percentiles are skipped.
   *
   * @param {string} operation - Operation type
   * @param {Object} percentiles - Percentile values
   * @param {number} percentiles.p50 - P50 latency
   * @param {number} percentiles.p90 - P90 latency
   * @param {number} percentiles.p95 - P95 latency
   * @param {number} percentiles.p99 - P99 latency
   * @param {number} percentiles.max - Max latency
   */
  recordLatencyPercentiles(operation, { p50, p90, p95, p99, max } = {}) {
    if (!this.config.enabled) return;

    const attrs = { operation };

    this._setGauge(this.latencyMetrics.p50, p50, attrs);
    this._setGauge(this.latencyMetrics.p90, p90, attrs);
    this._setGauge(this.latencyMetrics.p95, p95, attrs);
    this._setGauge(this.latencyMetrics.p99, p99, attrs);
    this._setGauge(this.latencyMetrics.max, max, attrs);
  }

  /**
   * Record throughput
   *
   * Updates the current ops/sec value, records it in the throughput
   * histogram, and raises the peak value when this sample exceeds every
   * earlier sample for the same operation.
   *
   * @param {string} operation - Operation type
   * @param {number} opsPerSecond - Operations per second
   */
  recordThroughput(operation, opsPerSecond) {
    if (!this.config.enabled) return;

    const attrs = { operation };

    this._setGauge(this.throughputMetrics.opsPerSecond, opsPerSecond, attrs);
    this.throughputMetrics.histogram.record(opsPerSecond, attrs);

    const knownPeak = this._peakThroughput.get(operation) || 0;
    if (Number.isFinite(opsPerSecond) && opsPerSecond > knownPeak) {
      this._peakThroughput.set(operation, opsPerSecond);
      this._setGauge(this.throughputMetrics.peak, opsPerSecond, attrs);
    }
  }

  /**
   * Record resource utilization
   *
   * Takes one `process.memoryUsage()` snapshot, records the used heap in the
   * memory histogram and updates the heap-used, heap-total and external
   * memory values.
   */
  recordResourceUtilization() {
    if (!this.config.enabled) return;

    const memUsage = process.memoryUsage();

    this.resourceMetrics.memoryHistogram.record(memUsage.heapUsed);
    this._setGauge(this.resourceMetrics.heapUsed, memUsage.heapUsed);
    this._setGauge(this.resourceMetrics.heapTotal, memUsage.heapTotal);
    this._setGauge(this.resourceMetrics.external, memUsage.external);
  }

  /**
   * Record event loop lag
   *
   * Also derives a CPU load estimate from the lag.
   *
   * @param {number} lag - Lag in milliseconds
   */
  recordEventLoopLag(lag) {
    if (!this.config.enabled) return;

    this.resourceMetrics.eventLoopLag.record(lag);

    // Estimate CPU load from lag (simplified model): 100 ms of lag or more
    // counts as fully loaded; clamped so the value stays within 0-1.
    const cpuLoad = Math.max(0, Math.min(1, lag / 100));
    this._setGauge(this.resourceMetrics.cpuLoad, cpuLoad);
  }

  /**
   * Update throughput tracking
   *
   * Counts the operation in the current window. Once at least one second has
   * passed since the last flush, converts every per-operation count into
   * ops/sec, reports it through `recordThroughput` and starts a new window.
   *
   * NOTE: this is only called from `recordOperation`, after sampling, so the
   * counts (and the derived ops/sec) cover sampled operations only, and a
   * window is only flushed when a sampled operation arrives.
   *
   * @param {string} operation - Operation type
   * @private
   */
  _updateThroughput(operation) {
    const count = (this.operationCounts.get(operation) || 0) + 1;
    this.operationCounts.set(operation, count);

    // Calculate throughput every second
    const now = Date.now();
    const elapsed = now - this.lastThroughputCalculation;

    if (elapsed >= 1000) {
      for (const [op, opCount] of this.operationCounts.entries()) {
        // elapsed is in milliseconds; scale to operations per second
        const opsPerSecond = (opCount / elapsed) * 1000;
        this.recordThroughput(op, opsPerSecond);
      }

      this.operationCounts.clear();
      this.lastThroughputCalculation = now;
    }
  }

  /**
   * Determine if this measurement should be sampled
   *
   * @returns {boolean} True if should sample
   * @private
   */
  _shouldSample() {
    return Math.random() < this.config.samplingRate;
  }

  /**
   * Get current metrics summary
   *
   * @returns {Object} Metrics summary: the `enabled` flag, the sampling rate,
   *   the operation types counted in the current throughput window, and the
   *   timestamp (ms since epoch) of the last throughput flush
   */
  getSummary() {
    return {
      enabled: this.config.enabled,
      samplingRate: this.config.samplingRate,
      operationTypes: Array.from(this.operationCounts.keys()),
      lastThroughputCalculation: this.lastThroughputCalculation,
    };
  }
}
399
+
400
/**
 * Factory for `AdvancedMetrics`.
 *
 * Equivalent to calling `new AdvancedMetrics(config)`; when `config` is
 * omitted an empty object is passed to the constructor.
 *
 * @param {Object} [config] - Configuration forwarded to the constructor
 * @returns {AdvancedMetrics} Newly constructed metrics manager
 */
export function createAdvancedMetrics(config = {}) {
  const instance = new AdvancedMetrics(config);
  return instance;
}
409
+
410
/**
 * Shared metrics instance built with an empty configuration.
 *
 * Created once, when this module is first evaluated.
 * NOTE(review): the meter is obtained at import time; confirm the meter
 * provider is registered before this module is loaded, otherwise this
 * instance's instruments may not export anything.
 */
export const defaultAdvancedMetrics = createAdvancedMetrics({});