@unrdf/observability 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +10 -0
- package/IMPLEMENTATION-SUMMARY.md +478 -0
- package/LICENSE +21 -0
- package/README.md +482 -0
- package/capability-map.md +90 -0
- package/config/alert-rules.yml +269 -0
- package/config/prometheus.yml +136 -0
- package/dashboards/grafana-unrdf.json +798 -0
- package/dashboards/unrdf-workflow-dashboard.json +295 -0
- package/docs/OBSERVABILITY-PATTERNS.md +681 -0
- package/docs/OBSERVABILITY-RUNBOOK.md +554 -0
- package/examples/observability-demo.mjs +334 -0
- package/package.json +46 -0
- package/src/advanced-metrics.mjs +413 -0
- package/src/alerts/alert-manager.mjs +436 -0
- package/src/custom-events.mjs +558 -0
- package/src/distributed-tracing.mjs +352 -0
- package/src/exporters/grafana-exporter.mjs +415 -0
- package/src/index.mjs +61 -0
- package/src/metrics/workflow-metrics.mjs +346 -0
- package/src/receipts/anchor.mjs +155 -0
- package/src/receipts/index.mjs +62 -0
- package/src/receipts/merkle-tree.mjs +188 -0
- package/src/receipts/receipt-chain.mjs +209 -0
- package/src/receipts/receipt-schema.mjs +128 -0
- package/src/receipts/tamper-detection.mjs +219 -0
- package/test/advanced-metrics.test.mjs +302 -0
- package/test/custom-events.test.mjs +387 -0
- package/test/distributed-tracing.test.mjs +314 -0
- package/validation/observability-validation.mjs +366 -0
- package/vitest.config.mjs +25 -0
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* @unrdf/observability - Live Demo
|
|
4
|
+
*
|
|
5
|
+
* Demonstrates real-time workflow metrics collection, alerting,
|
|
6
|
+
* and Prometheus metrics endpoint exposure.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* node examples/observability-demo.mjs
|
|
10
|
+
* curl http://localhost:9090/metrics
|
|
11
|
+
*
|
|
12
|
+
* @module @unrdf/observability/examples
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import express from 'express';
|
|
16
|
+
import { createWorkflowMetrics } from '../src/metrics/workflow-metrics.mjs';
|
|
17
|
+
import { createGrafanaExporter } from '../src/exporters/grafana-exporter.mjs';
|
|
18
|
+
import { createAlertManager } from '../src/alerts/alert-manager.mjs';
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Simulated workflow execution
|
|
22
|
+
*/
|
|
23
|
+
class WorkflowSimulator {
  /**
   * @param {object} metrics - Metrics recorder; the simulator only calls its
   *   `record...` and `update...` methods.
   * @param {object} alerts - Alert manager exposing
   *   `evaluateMetric(metric, value, labels)`.
   */
  constructor(metrics, alerts) {
    this.metrics = metrics;
    this.alerts = alerts;
    // Monotonic counter used to derive workflow ids ("wf-1", "wf-2", ...).
    this.workflowCount = 0;
  }

  /**
   * Simulate one workflow run: 1-10 sequential tasks followed by a random
   * outcome (roughly 90% "completed"), recording metrics along the way and
   * evaluating the measured duration against the alert rules.
   *
   * @returns {Promise<{workflowId: string, status: string, duration: number}>}
   *   Summary of the run; `duration` is wall-clock seconds.
   */
  async simulateWorkflow() {
    const workflowId = `wf-${++this.workflowCount}`;
    const pattern = this._randomPattern();
    const taskCount = Math.floor(Math.random() * 10) + 1;

    console.log(
      `[${new Date().toISOString()}] Starting workflow ${workflowId} (pattern: ${pattern}, tasks: ${taskCount})`
    );

    // Record workflow start
    this.metrics.recordWorkflowStart(workflowId, pattern);

    const startTime = Date.now();

    // Tasks run one after another so the workflow duration is their sum.
    for (let i = 0; i < taskCount; i++) {
      await this._simulateTask(workflowId, i);
    }

    // Random workflow outcome
    const success = Math.random() > 0.1; // 90% success rate
    const duration = (Date.now() - startTime) / 1000;
    const status = success ? 'completed' : 'failed';

    // Record workflow completion
    this.metrics.recordWorkflowComplete(workflowId, status, duration, pattern);

    // Evaluate against alert rules
    await this.alerts.evaluateMetric('workflow_duration', duration, {
      workflow_id: workflowId,
      status,
      pattern,
    });

    if (!success) {
      this.metrics.recordError('execution_failed', workflowId, 'high');
    }

    // Record event sourcing
    this.metrics.recordEventAppended('workflow_completed', workflowId);
    this.metrics.updateEventStoreSize(workflowId, Math.floor(Math.random() * 1000000));

    // Simulate crypto receipt (only successful workflows get one)
    if (success) {
      this.metrics.recordCryptoReceipt(workflowId, 'BLAKE3');
    }

    console.log(
      `[${new Date().toISOString()}] Workflow ${workflowId} ${status} in ${duration.toFixed(2)}s`
    );

    return { workflowId, status, duration };
  }

  /**
   * Simulate a single task: sleep 50-550ms, pick a random outcome (roughly
   * 95% "completed") and record the task, latency and queue-depth metrics.
   *
   * @private
   * @param {string} workflowId - Id of the owning workflow.
   * @param {number} taskIndex - Zero-based position of the task in the workflow.
   * @returns {Promise<{taskId: string, status: string, duration: number}>}
   *   Summary of the task; `duration` is wall-clock seconds.
   */
  async _simulateTask(workflowId, taskIndex) {
    const taskId = `task-${taskIndex}`;
    const taskType = this._randomTaskType();
    const startTime = Date.now();

    // Simulate task execution time
    const executionTime = Math.random() * 500 + 50; // 50-550ms
    await new Promise(resolve => setTimeout(resolve, executionTime));

    const success = Math.random() > 0.05; // 95% success rate
    const duration = (Date.now() - startTime) / 1000;
    const status = success ? 'completed' : 'failed';

    // Record task execution
    this.metrics.recordTaskExecution(workflowId, taskId, taskType, status, duration);

    // Record latency.
    // NOTE(review): executionTime is in milliseconds while `duration` above is
    // in seconds - confirm which unit recordLatency expects.
    this.metrics.recordLatency(taskType, executionTime);

    // Update queue depth (random)
    const queueDepth = Math.floor(Math.random() * 20);
    this.metrics.updateTaskQueueDepth(workflowId, 'default', queueDepth);

    if (!success) {
      this.metrics.recordError('task_execution_failed', workflowId, 'medium');
    }

    return { taskId, status, duration };
  }

  /**
   * Simulate resource monitoring for a single node ("node-0"): random CPU,
   * memory and disk utilization in [0, 100), plus an occasional resource
   * allocation event. CPU and memory values are also evaluated against the
   * alert rules; those evaluations run in the background and never reject
   * out of this method.
   *
   * @returns {void}
   */
  simulateResourceMetrics() {
    // CPU utilization
    const cpuUtil = Math.random() * 100;
    this.metrics.recordResourceUtilization('cpu', 'node-0', cpuUtil);
    void this._evaluateAlert('cpu_utilization', cpuUtil, { resource_id: 'node-0' });

    // Memory utilization
    const memUtil = Math.random() * 100;
    this.metrics.recordResourceUtilization('memory', 'node-0', memUtil);
    void this._evaluateAlert('memory_utilization', memUtil, { resource_id: 'node-0' });

    // Disk utilization (recorded only; no alert rule is evaluated for it here)
    const diskUtil = Math.random() * 100;
    this.metrics.recordResourceUtilization('disk', 'node-0', diskUtil);

    // Random resource allocations
    if (Math.random() > 0.7) {
      this.metrics.recordResourceAllocation(
        'compute',
        Math.random() > 0.9 ? 'failed' : 'allocated'
      );
    }
  }

  /**
   * Evaluate one metric against the alert rules without letting a failure
   * escape. This method is called fire-and-forget from timer callbacks, where
   * a rejected promise would otherwise become an unhandled rejection.
   *
   * @private
   * @param {string} metric - Metric name the alert rules are keyed on.
   * @param {number} value - Current metric value.
   * @param {object} labels - Labels forwarded to the alert manager.
   * @returns {Promise<void>} Always resolves; failures are logged.
   */
  async _evaluateAlert(metric, value, labels) {
    try {
      await this.alerts.evaluateMetric(metric, value, labels);
    } catch (error) {
      console.error(`Alert evaluation failed for ${metric}:`, error);
    }
  }

  /**
   * Simulate one policy evaluation: a random policy name with a random
   * result (roughly 90% "allow").
   *
   * @returns {void}
   */
  simulatePolicyEvaluations() {
    const policies = ['resource-allocation', 'task-enablement', 'workflow-completion'];
    const policy = policies[Math.floor(Math.random() * policies.length)];
    const result = Math.random() > 0.1 ? 'allow' : 'deny';

    this.metrics.recordPolicyEvaluation(policy, result);
  }

  /**
   * Pick a workflow pattern name uniformly at random.
   *
   * @private
   * @returns {string}
   */
  _randomPattern() {
    const patterns = ['SEQUENCE', 'PARALLEL_SPLIT', 'EXCLUSIVE_CHOICE', 'MULTI_CHOICE'];
    return patterns[Math.floor(Math.random() * patterns.length)];
  }

  /**
   * Pick a task type uniformly at random.
   *
   * @private
   * @returns {string}
   */
  _randomTaskType() {
    const types = ['atomic', 'composite', 'service', 'user'];
    return types[Math.floor(Math.random() * types.length)];
  }
}
|
|
169
|
+
|
|
170
|
+
/**
|
|
171
|
+
* Main demo application
|
|
172
|
+
*/
|
|
173
|
+
/**
 * Wire up the demo: build the metrics/dashboard/alert stack, expose it over
 * HTTP, start the periodic simulators and install a SIGINT handler that
 * stops the timers and closes the server before exiting.
 *
 * @returns {Promise<void>} Resolves once everything has been started; the
 *   process then stays alive through the HTTP server and interval timers.
 */
async function main() {
  console.log('=== UNRDF Observability Dashboard Demo ===\n');

  // Create observability stack
  const metrics = createWorkflowMetrics({
    enableDefaultMetrics: true,
    prefix: 'unrdf_workflow_',
    labels: { environment: 'demo', version: '1.0.0' },
  });

  const grafana = createGrafanaExporter({
    title: 'UNRDF Workflow Dashboard - Demo',
    datasource: 'Prometheus',
  });

  const alerts = createAlertManager({
    rules: [
      {
        id: 'high-workflow-duration',
        name: 'High Workflow Duration',
        metric: 'workflow_duration',
        threshold: 5,
        operator: 'gt',
        severity: 'warning',
      },
      {
        id: 'high-cpu-utilization',
        name: 'High CPU Utilization',
        metric: 'cpu_utilization',
        threshold: 80,
        operator: 'gt',
        severity: 'critical',
      },
      {
        id: 'high-memory-utilization',
        name: 'High Memory Utilization',
        metric: 'memory_utilization',
        threshold: 85,
        operator: 'gt',
        severity: 'critical',
      },
    ],
    enableAnomalyDetection: true,
  });

  // Alert event handlers
  alerts.on('alert', alert => {
    console.log(`\n🚨 ALERT FIRED: ${alert.name} (${alert.severity})`);
    console.log(` Metric: ${alert.metric} = ${alert.value} (threshold: ${alert.threshold})`);
  });

  alerts.on('alert:resolved', alert => {
    console.log(`\n✅ ALERT RESOLVED: ${alert.name}`);
  });

  // Create Express app for metrics endpoint
  const app = express();
  const PORT = process.env.PORT || 9090;

  // Metrics endpoint (Prometheus text exposition format)
  app.get('/metrics', async (req, res) => {
    try {
      const metricsText = await metrics.getMetrics();
      res.set('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
      res.send(metricsText);
    } catch (error) {
      res.status(500).send(`Error generating metrics: ${error.message}`);
    }
  });

  // Metrics JSON endpoint
  app.get('/metrics/json', async (req, res) => {
    try {
      const metricsJSON = await metrics.getMetricsJSON();
      res.json(metricsJSON);
    } catch (error) {
      res.status(500).json({ error: error.message });
    }
  });

  // Dashboard endpoint
  app.get('/dashboard', (req, res) => {
    const dashboard = grafana.generateDashboard();
    res.json(dashboard);
  });

  // Dashboard JSON export (served as a file download)
  app.get('/dashboard/export', (req, res) => {
    const dashboardJSON = grafana.exportJSON(true);
    res.set('Content-Type', 'application/json');
    res.set('Content-Disposition', 'attachment; filename="unrdf-dashboard.json"');
    res.send(dashboardJSON);
  });

  // Alerts endpoint
  app.get('/alerts', (req, res) => {
    const activeAlerts = alerts.getActiveAlerts();
    res.json({ alerts: activeAlerts, count: activeAlerts.length });
  });

  // Alert history endpoint
  app.get('/alerts/history', (req, res) => {
    const history = alerts.getAlertHistory({ limit: 50 });
    res.json({ history, count: history.length });
  });

  // Statistics endpoint
  app.get('/stats', (req, res) => {
    const stats = alerts.getStatistics();
    res.json(stats);
  });

  // Health check
  app.get('/health', (req, res) => {
    res.json({ status: 'healthy', timestamp: Date.now() });
  });

  // Start server; keep the handle so it can be closed on shutdown.
  const server = app.listen(PORT, () => {
    console.log(`✅ Metrics server running on http://localhost:${PORT}`);
    console.log(` Prometheus metrics: http://localhost:${PORT}/metrics`);
    console.log(` Metrics JSON: http://localhost:${PORT}/metrics/json`);
    console.log(` Grafana dashboard: http://localhost:${PORT}/dashboard`);
    console.log(` Download dashboard: http://localhost:${PORT}/dashboard/export`);
    console.log(` Active alerts: http://localhost:${PORT}/alerts`);
    console.log(` Alert history: http://localhost:${PORT}/alerts/history`);
    console.log(` Statistics: http://localhost:${PORT}/stats`);
    console.log('\nSimulating workflows...\n');
  });

  // Bind failures (e.g. EADDRINUSE) are emitted as 'error' events on the
  // server, not thrown from listen(), so main()'s caller cannot catch them.
  server.on('error', error => {
    console.error(`Metrics server error on port ${PORT}:`, error);
    process.exit(1);
  });

  // Create simulator
  const simulator = new WorkflowSimulator(metrics, alerts);

  const timers = [
    // Simulate workflows periodically
    setInterval(() => {
      simulator.simulateWorkflow().catch(console.error);
    }, 3000), // Every 3 seconds

    // Simulate resource metrics
    setInterval(() => {
      simulator.simulateResourceMetrics();
    }, 5000), // Every 5 seconds

    // Simulate policy evaluations
    setInterval(() => {
      simulator.simulatePolicyEvaluations();
    }, 2000), // Every 2 seconds
  ];

  // Graceful shutdown. Registered with once() so a second Ctrl+C falls back
  // to Node's default behavior and terminates immediately.
  process.once('SIGINT', () => {
    console.log('\n\nShutting down gracefully...');
    timers.forEach(timer => clearInterval(timer));

    console.log('\nFinal Statistics:');
    console.log(JSON.stringify(alerts.getStatistics(), null, 2));

    // Stop accepting connections and exit once in-flight requests finish.
    server.close(() => process.exit(0));

    // Keep-alive sockets can hold close() open; do not hang indefinitely.
    setTimeout(() => process.exit(0), 2000).unref();
  });
}
|
|
329
|
+
|
|
330
|
+
// Run demo
|
|
331
|
+
/**
 * Last-resort handler: report a startup failure and exit non-zero.
 * @param {unknown} fatal - The rejection reason from main().
 */
function exitOnFatalError(fatal) {
  console.error('Fatal error:', fatal);
  process.exit(1);
}

main().catch(exitOnFatalError);
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@unrdf/observability",
|
|
3
|
+
"version": "26.4.2",
|
|
4
|
+
"publishConfig": {
|
|
5
|
+
"access": "public"
|
|
6
|
+
},
|
|
7
|
+
"description": "Innovative Prometheus/Grafana observability dashboard for UNRDF distributed workflows",
|
|
8
|
+
"type": "module",
|
|
9
|
+
"main": "./src/index.mjs",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": "./src/index.mjs",
|
|
12
|
+
"./metrics": "./src/metrics/workflow-metrics.mjs",
|
|
13
|
+
"./exporters": "./src/exporters/grafana-exporter.mjs",
|
|
14
|
+
"./alerts": "./src/alerts/alert-manager.mjs"
|
|
15
|
+
},
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"prom-client": "^15.1.0",
|
|
18
|
+
"@opentelemetry/api": "^1.9.0",
|
|
19
|
+
"@opentelemetry/exporter-prometheus": "^0.49.0",
|
|
20
|
+
"@opentelemetry/sdk-metrics": "^1.21.0",
|
|
21
|
+
"express": "^4.18.2",
|
|
22
|
+
"zod": "^4.1.13"
|
|
23
|
+
},
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"vitest": "^4.0.15"
|
|
26
|
+
},
|
|
27
|
+
"keywords": [
|
|
28
|
+
"prometheus",
|
|
29
|
+
"grafana",
|
|
30
|
+
"metrics",
|
|
31
|
+
"observability",
|
|
32
|
+
"monitoring",
|
|
33
|
+
"alerting",
|
|
34
|
+
"workflow",
|
|
35
|
+
"distributed-systems"
|
|
36
|
+
],
|
|
37
|
+
"author": "UNRDF Team",
|
|
38
|
+
"license": "MIT",
|
|
39
|
+
"scripts": {
|
|
40
|
+
"demo": "node examples/observability-demo.mjs",
|
|
41
|
+
"test": "vitest run",
|
|
42
|
+
"test:watch": "vitest",
|
|
43
|
+
"lint": "eslint .",
|
|
44
|
+
"format": "prettier --write ."
|
|
45
|
+
}
|
|
46
|
+
}
|