@unrdf/observability 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +10 -0
- package/IMPLEMENTATION-SUMMARY.md +478 -0
- package/LICENSE +21 -0
- package/README.md +482 -0
- package/capability-map.md +90 -0
- package/config/alert-rules.yml +269 -0
- package/config/prometheus.yml +136 -0
- package/dashboards/grafana-unrdf.json +798 -0
- package/dashboards/unrdf-workflow-dashboard.json +295 -0
- package/docs/OBSERVABILITY-PATTERNS.md +681 -0
- package/docs/OBSERVABILITY-RUNBOOK.md +554 -0
- package/examples/observability-demo.mjs +334 -0
- package/package.json +46 -0
- package/src/advanced-metrics.mjs +413 -0
- package/src/alerts/alert-manager.mjs +436 -0
- package/src/custom-events.mjs +558 -0
- package/src/distributed-tracing.mjs +352 -0
- package/src/exporters/grafana-exporter.mjs +415 -0
- package/src/index.mjs +61 -0
- package/src/metrics/workflow-metrics.mjs +346 -0
- package/src/receipts/anchor.mjs +155 -0
- package/src/receipts/index.mjs +62 -0
- package/src/receipts/merkle-tree.mjs +188 -0
- package/src/receipts/receipt-chain.mjs +209 -0
- package/src/receipts/receipt-schema.mjs +128 -0
- package/src/receipts/tamper-detection.mjs +219 -0
- package/test/advanced-metrics.test.mjs +302 -0
- package/test/custom-events.test.mjs +387 -0
- package/test/distributed-tracing.test.mjs +314 -0
- package/validation/observability-validation.mjs +366 -0
- package/vitest.config.mjs +25 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
{
|
|
2
|
+
"dashboard": {
|
|
3
|
+
"id": null,
|
|
4
|
+
"uid": "unrdf-workflow-dashboard",
|
|
5
|
+
"title": "UNRDF Workflow Observability Dashboard",
|
|
6
|
+
"description": "Comprehensive monitoring for UNRDF distributed workflows with Prometheus metrics",
|
|
7
|
+
"tags": ["unrdf", "workflow", "prometheus", "observability"],
|
|
8
|
+
"timezone": "browser",
|
|
9
|
+
"schemaVersion": 38,
|
|
10
|
+
"version": 1,
|
|
11
|
+
"refresh": "5s",
|
|
12
|
+
"time": {
|
|
13
|
+
"from": "now-1h",
|
|
14
|
+
"to": "now"
|
|
15
|
+
},
|
|
16
|
+
"timepicker": {
|
|
17
|
+
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"],
|
|
18
|
+
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
|
|
19
|
+
},
|
|
20
|
+
"panels": [
|
|
21
|
+
{
|
|
22
|
+
"id": 1,
|
|
23
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
|
24
|
+
"type": "graph",
|
|
25
|
+
"title": "Workflow Executions by Status",
|
|
26
|
+
"datasource": "Prometheus",
|
|
27
|
+
"targets": [
|
|
28
|
+
{
|
|
29
|
+
"expr": "rate(unrdf_workflow_executions_total[5m])",
|
|
30
|
+
"legendFormat": "{{status}} - {{pattern}}",
|
|
31
|
+
"refId": "A"
|
|
32
|
+
}
|
|
33
|
+
],
|
|
34
|
+
"yaxes": [{ "format": "ops", "label": "Executions/sec" }, { "format": "short" }],
|
|
35
|
+
"legend": {
|
|
36
|
+
"show": true,
|
|
37
|
+
"alignAsTable": true,
|
|
38
|
+
"values": true,
|
|
39
|
+
"current": true,
|
|
40
|
+
"max": true
|
|
41
|
+
},
|
|
42
|
+
"tooltip": { "shared": true, "sort": 2 },
|
|
43
|
+
"fill": 1,
|
|
44
|
+
"linewidth": 2
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": 2,
|
|
48
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
|
49
|
+
"type": "stat",
|
|
50
|
+
"title": "Active Workflows",
|
|
51
|
+
"datasource": "Prometheus",
|
|
52
|
+
"targets": [
|
|
53
|
+
{
|
|
54
|
+
"expr": "sum(unrdf_workflow_active_workflows)",
|
|
55
|
+
"refId": "A"
|
|
56
|
+
}
|
|
57
|
+
],
|
|
58
|
+
"options": {
|
|
59
|
+
"orientation": "auto",
|
|
60
|
+
"textMode": "value_and_name",
|
|
61
|
+
"colorMode": "value",
|
|
62
|
+
"graphMode": "area"
|
|
63
|
+
},
|
|
64
|
+
"fieldConfig": {
|
|
65
|
+
"defaults": {
|
|
66
|
+
"thresholds": {
|
|
67
|
+
"mode": "absolute",
|
|
68
|
+
"steps": [
|
|
69
|
+
{ "value": 0, "color": "green" },
|
|
70
|
+
{ "value": 50, "color": "yellow" },
|
|
71
|
+
{ "value": 100, "color": "red" }
|
|
72
|
+
]
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
"id": 3,
|
|
79
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
|
|
80
|
+
"type": "heatmap",
|
|
81
|
+
"title": "Workflow Execution Duration Distribution",
|
|
82
|
+
"datasource": "Prometheus",
|
|
83
|
+
"targets": [
|
|
84
|
+
{
|
|
85
|
+
"expr": "rate(unrdf_workflow_execution_duration_seconds_bucket[5m])",
|
|
86
|
+
"legendFormat": "{{le}}",
|
|
87
|
+
"refId": "A"
|
|
88
|
+
}
|
|
89
|
+
],
|
|
90
|
+
"heatmap": {
|
|
91
|
+
"colorScheme": "interpolateSpectral"
|
|
92
|
+
},
|
|
93
|
+
"dataFormat": "tsbuckets"
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"id": 4,
|
|
97
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
|
98
|
+
"type": "graph",
|
|
99
|
+
"title": "Task Executions by Type",
|
|
100
|
+
"datasource": "Prometheus",
|
|
101
|
+
"targets": [
|
|
102
|
+
{
|
|
103
|
+
"expr": "rate(unrdf_workflow_task_executions_total[5m])",
|
|
104
|
+
"legendFormat": "{{task_type}} - {{status}}",
|
|
105
|
+
"refId": "A"
|
|
106
|
+
}
|
|
107
|
+
],
|
|
108
|
+
"yaxes": [{ "format": "ops", "label": "Tasks/sec" }, { "format": "short" }],
|
|
109
|
+
"legend": { "show": true, "alignAsTable": true, "values": true },
|
|
110
|
+
"stack": true
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"id": 5,
|
|
114
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
|
115
|
+
"type": "graph",
|
|
116
|
+
"title": "Error Rate by Severity",
|
|
117
|
+
"datasource": "Prometheus",
|
|
118
|
+
"targets": [
|
|
119
|
+
{
|
|
120
|
+
"expr": "rate(unrdf_workflow_errors_total[5m])",
|
|
121
|
+
"legendFormat": "{{severity}} - {{error_type}}",
|
|
122
|
+
"refId": "A"
|
|
123
|
+
}
|
|
124
|
+
],
|
|
125
|
+
"yaxes": [{ "format": "ops", "label": "Errors/sec" }, { "format": "short" }],
|
|
126
|
+
"alert": {
|
|
127
|
+
"conditions": [
|
|
128
|
+
{
|
|
129
|
+
"evaluator": { "params": [1], "type": "gt" },
|
|
130
|
+
"operator": { "type": "and" },
|
|
131
|
+
"query": { "params": ["A", "5m", "now"] },
|
|
132
|
+
"reducer": { "params": [], "type": "avg" },
|
|
133
|
+
"type": "query"
|
|
134
|
+
}
|
|
135
|
+
],
|
|
136
|
+
"executionErrorState": "alerting",
|
|
137
|
+
"frequency": "60s",
|
|
138
|
+
"handler": 1,
|
|
139
|
+
"name": "High Error Rate",
|
|
140
|
+
"noDataState": "no_data"
|
|
141
|
+
},
|
|
142
|
+
"thresholds": [
|
|
143
|
+
{
|
|
144
|
+
"value": 1,
|
|
145
|
+
"colorMode": "critical",
|
|
146
|
+
"op": "gt",
|
|
147
|
+
"fill": true,
|
|
148
|
+
"line": true
|
|
149
|
+
}
|
|
150
|
+
]
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"id": 6,
|
|
154
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
|
155
|
+
"type": "graph",
|
|
156
|
+
"title": "Resource Utilization",
|
|
157
|
+
"datasource": "Prometheus",
|
|
158
|
+
"targets": [
|
|
159
|
+
{
|
|
160
|
+
"expr": "unrdf_workflow_resource_utilization",
|
|
161
|
+
"legendFormat": "{{resource_type}} - {{resource_id}}",
|
|
162
|
+
"refId": "A"
|
|
163
|
+
}
|
|
164
|
+
],
|
|
165
|
+
"yaxes": [
|
|
166
|
+
{ "format": "percent", "label": "Utilization %", "max": 100, "min": 0 },
|
|
167
|
+
{ "format": "short" }
|
|
168
|
+
],
|
|
169
|
+
"legend": { "show": true, "alignAsTable": true, "values": true, "current": true }
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"id": 7,
|
|
173
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
|
|
174
|
+
"type": "graph",
|
|
175
|
+
"title": "Event Store Metrics",
|
|
176
|
+
"datasource": "Prometheus",
|
|
177
|
+
"targets": [
|
|
178
|
+
{
|
|
179
|
+
"expr": "rate(unrdf_workflow_events_appended_total[5m])",
|
|
180
|
+
"legendFormat": "{{event_type}} events/sec",
|
|
181
|
+
"refId": "A"
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
"expr": "unrdf_workflow_event_store_size_bytes / 1024 / 1024",
|
|
185
|
+
"legendFormat": "Store size MiB ({{workflow_id}})",
|
|
186
|
+
"refId": "B"
|
|
187
|
+
}
|
|
188
|
+
],
|
|
189
|
+
"yaxes": [{ "format": "short" }, { "format": "mbytes" }]
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"id": 8,
|
|
193
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 },
|
|
194
|
+
"type": "graph",
|
|
195
|
+
"title": "Operation Latency Percentiles",
|
|
196
|
+
"datasource": "Prometheus",
|
|
197
|
+
"targets": [
|
|
198
|
+
{
|
|
199
|
+
"expr": "unrdf_workflow_latency_percentiles{quantile=\"0.5\"}",
|
|
200
|
+
"legendFormat": "p50 - {{operation}}",
|
|
201
|
+
"refId": "A"
|
|
202
|
+
},
|
|
203
|
+
{
|
|
204
|
+
"expr": "unrdf_workflow_latency_percentiles{quantile=\"0.9\"}",
|
|
205
|
+
"legendFormat": "p90 - {{operation}}",
|
|
206
|
+
"refId": "B"
|
|
207
|
+
},
|
|
208
|
+
{
|
|
209
|
+
"expr": "unrdf_workflow_latency_percentiles{quantile=\"0.99\"}",
|
|
210
|
+
"legendFormat": "p99 - {{operation}}",
|
|
211
|
+
"refId": "C"
|
|
212
|
+
}
|
|
213
|
+
],
|
|
214
|
+
"yaxes": [{ "format": "s", "label": "Latency" }, { "format": "short" }],
|
|
215
|
+
"legend": {
|
|
216
|
+
"show": true,
|
|
217
|
+
"alignAsTable": true,
|
|
218
|
+
"values": true,
|
|
219
|
+
"current": true,
|
|
220
|
+
"max": true
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
{
|
|
224
|
+
"id": 9,
|
|
225
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 },
|
|
226
|
+
"type": "graph",
|
|
227
|
+
"title": "Task Queue Depth",
|
|
228
|
+
"datasource": "Prometheus",
|
|
229
|
+
"targets": [
|
|
230
|
+
{
|
|
231
|
+
"expr": "unrdf_workflow_task_queue_depth",
|
|
232
|
+
"legendFormat": "{{queue_name}} - {{workflow_id}}",
|
|
233
|
+
"refId": "A"
|
|
234
|
+
}
|
|
235
|
+
],
|
|
236
|
+
"yaxes": [{ "format": "short", "label": "Queue Depth" }, { "format": "short" }]
|
|
237
|
+
},
|
|
238
|
+
{
|
|
239
|
+
"id": 10,
|
|
240
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 },
|
|
241
|
+
"type": "graph",
|
|
242
|
+
"title": "Policy Evaluations",
|
|
243
|
+
"datasource": "Prometheus",
|
|
244
|
+
"targets": [
|
|
245
|
+
{
|
|
246
|
+
"expr": "rate(unrdf_workflow_policy_evaluations_total[5m])",
|
|
247
|
+
"legendFormat": "{{policy_name}} - {{result}}",
|
|
248
|
+
"refId": "A"
|
|
249
|
+
}
|
|
250
|
+
],
|
|
251
|
+
"yaxes": [{ "format": "ops", "label": "Evaluations/sec" }, { "format": "short" }],
|
|
252
|
+
"stack": true
|
|
253
|
+
}
|
|
254
|
+
],
|
|
255
|
+
"templating": {
|
|
256
|
+
"list": [
|
|
257
|
+
{
|
|
258
|
+
"name": "workflow_id",
|
|
259
|
+
"type": "query",
|
|
260
|
+
"datasource": "Prometheus",
|
|
261
|
+
"query": "label_values(unrdf_workflow_executions_total, workflow_id)",
|
|
262
|
+
"multi": true,
|
|
263
|
+
"includeAll": true,
|
|
264
|
+
"refresh": 1,
|
|
265
|
+
"sort": 1
|
|
266
|
+
},
|
|
267
|
+
{
|
|
268
|
+
"name": "pattern",
|
|
269
|
+
"type": "query",
|
|
270
|
+
"datasource": "Prometheus",
|
|
271
|
+
"query": "label_values(unrdf_workflow_executions_total, pattern)",
|
|
272
|
+
"multi": true,
|
|
273
|
+
"includeAll": true,
|
|
274
|
+
"refresh": 1,
|
|
275
|
+
"sort": 1
|
|
276
|
+
}
|
|
277
|
+
]
|
|
278
|
+
},
|
|
279
|
+
"annotations": {
|
|
280
|
+
"list": [
|
|
281
|
+
{
|
|
282
|
+
"datasource": "Prometheus",
|
|
283
|
+
"enable": true,
|
|
284
|
+
"hide": false,
|
|
285
|
+
"iconColor": "rgba(255, 96, 96, 1)",
|
|
286
|
+
"name": "Workflow Events",
|
|
287
|
+
"target": {
|
|
288
|
+
"expr": "unrdf_workflow_executions_total"
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
]
|
|
292
|
+
}
|
|
293
|
+
},
|
|
294
|
+
"overwrite": true
|
|
295
|
+
}
|