@unrdf/observability 26.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ {
2
+ "dashboard": {
3
+ "id": null,
4
+ "uid": "unrdf-workflow-dashboard",
5
+ "title": "UNRDF Workflow Observability Dashboard",
6
+ "description": "Comprehensive monitoring for UNRDF distributed workflows with Prometheus metrics",
7
+ "tags": ["unrdf", "workflow", "prometheus", "observability"],
8
+ "timezone": "browser",
9
+ "schemaVersion": 38,
10
+ "version": 1,
11
+ "refresh": "5s",
12
+ "time": {
13
+ "from": "now-1h",
14
+ "to": "now"
15
+ },
16
+ "timepicker": {
17
+ "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"],
18
+ "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
19
+ },
20
+ "panels": [
21
+ {
22
+ "id": 1,
23
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
24
+ "type": "graph",
25
+ "title": "Workflow Executions by Status",
26
+ "datasource": "Prometheus",
27
+ "targets": [
28
+ {
29
+ "expr": "rate(unrdf_workflow_executions_total[5m])",
30
+ "legendFormat": "{{status}} - {{pattern}}",
31
+ "refId": "A"
32
+ }
33
+ ],
34
+ "yaxes": [{ "format": "ops", "label": "Executions/sec" }, { "format": "short" }],
35
+ "legend": {
36
+ "show": true,
37
+ "alignAsTable": true,
38
+ "values": true,
39
+ "current": true,
40
+ "max": true
41
+ },
42
+ "tooltip": { "shared": true, "sort": 2 },
43
+ "fill": 1,
44
+ "linewidth": 2
45
+ },
46
+ {
47
+ "id": 2,
48
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
49
+ "type": "stat",
50
+ "title": "Active Workflows",
51
+ "datasource": "Prometheus",
52
+ "targets": [
53
+ {
54
+ "expr": "sum(unrdf_workflow_active_workflows)",
55
+ "refId": "A"
56
+ }
57
+ ],
58
+ "options": {
59
+ "orientation": "auto",
60
+ "textMode": "value_and_name",
61
+ "colorMode": "value",
62
+ "graphMode": "area"
63
+ },
64
+ "fieldConfig": {
65
+ "defaults": {
66
+ "thresholds": {
67
+ "mode": "absolute",
68
+ "steps": [
69
+ { "value": 0, "color": "green" },
70
+ { "value": 50, "color": "yellow" },
71
+ { "value": 100, "color": "red" }
72
+ ]
73
+ }
74
+ }
75
+ }
76
+ },
77
+ {
78
+ "id": 3,
79
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
80
+ "type": "heatmap",
81
+ "title": "Workflow Execution Duration Distribution",
82
+ "datasource": "Prometheus",
83
+ "targets": [
84
+ {
85
+ "expr": "rate(unrdf_workflow_execution_duration_seconds_bucket[5m])",
86
+ "legendFormat": "{{le}}",
87
+ "refId": "A"
88
+ }
89
+ ],
90
+ "heatmap": {
91
+ "colorScheme": "interpolateSpectral"
92
+ },
93
+ "dataFormat": "tsbuckets"
94
+ },
95
+ {
96
+ "id": 4,
97
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
98
+ "type": "graph",
99
+ "title": "Task Executions by Type",
100
+ "datasource": "Prometheus",
101
+ "targets": [
102
+ {
103
+ "expr": "rate(unrdf_workflow_task_executions_total[5m])",
104
+ "legendFormat": "{{task_type}} - {{status}}",
105
+ "refId": "A"
106
+ }
107
+ ],
108
+ "yaxes": [{ "format": "ops", "label": "Tasks/sec" }, { "format": "short" }],
109
+ "legend": { "show": true, "alignAsTable": true, "values": true },
110
+ "stack": true
111
+ },
112
+ {
113
+ "id": 5,
114
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
115
+ "type": "graph",
116
+ "title": "Error Rate by Severity",
117
+ "datasource": "Prometheus",
118
+ "targets": [
119
+ {
120
+ "expr": "rate(unrdf_workflow_errors_total[5m])",
121
+ "legendFormat": "{{severity}} - {{error_type}}",
122
+ "refId": "A"
123
+ }
124
+ ],
125
+ "yaxes": [{ "format": "ops", "label": "Errors/sec" }, { "format": "short" }],
126
+ "alert": {
127
+ "conditions": [
128
+ {
129
+ "evaluator": { "params": [1], "type": "gt" },
130
+ "operator": { "type": "and" },
131
+ "query": { "params": ["A", "5m", "now"] },
132
+ "reducer": { "params": [], "type": "avg" },
133
+ "type": "query"
134
+ }
135
+ ],
136
+ "executionErrorState": "alerting",
137
+ "frequency": "60s",
138
+ "handler": 1,
139
+ "name": "High Error Rate",
140
+ "noDataState": "no_data"
141
+ },
142
+ "thresholds": [
143
+ {
144
+ "value": 1,
145
+ "colorMode": "critical",
146
+ "op": "gt",
147
+ "fill": true,
148
+ "line": true
149
+ }
150
+ ]
151
+ },
152
+ {
153
+ "id": 6,
154
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
155
+ "type": "graph",
156
+ "title": "Resource Utilization",
157
+ "datasource": "Prometheus",
158
+ "targets": [
159
+ {
160
+ "expr": "unrdf_workflow_resource_utilization",
161
+ "legendFormat": "{{resource_type}} - {{resource_id}}",
162
+ "refId": "A"
163
+ }
164
+ ],
165
+ "yaxes": [
166
+ { "format": "percent", "label": "Utilization %", "max": 100, "min": 0 },
167
+ { "format": "short" }
168
+ ],
169
+ "legend": { "show": true, "alignAsTable": true, "values": true, "current": true }
170
+ },
171
+ {
172
+ "id": 7,
173
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
174
+ "type": "graph",
175
+ "title": "Event Store Metrics",
176
+ "datasource": "Prometheus",
177
+ "targets": [
178
+ {
179
+ "expr": "rate(unrdf_workflow_events_appended_total[5m])",
180
+ "legendFormat": "{{event_type}} events/sec",
181
+ "refId": "A"
182
+ },
183
+ {
184
+ "expr": "unrdf_workflow_event_store_size_bytes / 1024 / 1024",
185
+ "legendFormat": "Store size MiB ({{workflow_id}})",
186
+ "refId": "B"
187
+ }
188
+ ],
189
+ "yaxes": [{ "format": "short" }, { "format": "decmbytes" }]
190
+ },
191
+ {
192
+ "id": 8,
193
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 },
194
+ "type": "graph",
195
+ "title": "Operation Latency Percentiles",
196
+ "datasource": "Prometheus",
197
+ "targets": [
198
+ {
199
+ "expr": "unrdf_workflow_latency_percentiles{quantile=\"0.5\"}",
200
+ "legendFormat": "p50 - {{operation}}",
201
+ "refId": "A"
202
+ },
203
+ {
204
+ "expr": "unrdf_workflow_latency_percentiles{quantile=\"0.9\"}",
205
+ "legendFormat": "p90 - {{operation}}",
206
+ "refId": "B"
207
+ },
208
+ {
209
+ "expr": "unrdf_workflow_latency_percentiles{quantile=\"0.99\"}",
210
+ "legendFormat": "p99 - {{operation}}",
211
+ "refId": "C"
212
+ }
213
+ ],
214
+ "yaxes": [{ "format": "s", "label": "Latency" }, { "format": "short" }],
215
+ "legend": {
216
+ "show": true,
217
+ "alignAsTable": true,
218
+ "values": true,
219
+ "current": true,
220
+ "max": true
221
+ }
222
+ },
223
+ {
224
+ "id": 9,
225
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 },
226
+ "type": "graph",
227
+ "title": "Task Queue Depth",
228
+ "datasource": "Prometheus",
229
+ "targets": [
230
+ {
231
+ "expr": "unrdf_workflow_task_queue_depth",
232
+ "legendFormat": "{{queue_name}} - {{workflow_id}}",
233
+ "refId": "A"
234
+ }
235
+ ],
236
+ "yaxes": [{ "format": "short", "label": "Queue Depth" }, { "format": "short" }]
237
+ },
238
+ {
239
+ "id": 10,
240
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 },
241
+ "type": "graph",
242
+ "title": "Policy Evaluations",
243
+ "datasource": "Prometheus",
244
+ "targets": [
245
+ {
246
+ "expr": "rate(unrdf_workflow_policy_evaluations_total[5m])",
247
+ "legendFormat": "{{policy_name}} - {{result}}",
248
+ "refId": "A"
249
+ }
250
+ ],
251
+ "yaxes": [{ "format": "ops", "label": "Evaluations/sec" }, { "format": "short" }],
252
+ "stack": true
253
+ }
254
+ ],
255
+ "templating": {
256
+ "list": [
257
+ {
258
+ "name": "workflow_id",
259
+ "type": "query",
260
+ "datasource": "Prometheus",
261
+ "query": "label_values(unrdf_workflow_executions_total, workflow_id)",
262
+ "multi": true,
263
+ "includeAll": true,
264
+ "refresh": 1,
265
+ "sort": 1
266
+ },
267
+ {
268
+ "name": "pattern",
269
+ "type": "query",
270
+ "datasource": "Prometheus",
271
+ "query": "label_values(unrdf_workflow_executions_total, pattern)",
272
+ "multi": true,
273
+ "includeAll": true,
274
+ "refresh": 1,
275
+ "sort": 1
276
+ }
277
+ ]
278
+ },
279
+ "annotations": {
280
+ "list": [
281
+ {
282
+ "datasource": "Prometheus",
283
+ "enable": true,
284
+ "hide": false,
285
+ "iconColor": "rgba(255, 96, 96, 1)",
286
+ "name": "Workflow Events",
287
+ "target": {
288
+ "expr": "unrdf_workflow_executions_total"
289
+ }
290
+ }
291
+ ]
292
+ }
293
+ },
294
+ "overwrite": true
295
+ }