@emeryld/obs-stack 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -115,10 +115,9 @@ Each datasource now exposes a stable `uid` (`loki` and `tempo`) so the shipping
115
115
 
116
116
  We also preload three starter dashboards via `grafana/provisioning/dashboards/`. Every JSON file targets the Loki datasource (uid `loki`), so the dashboards work out of the box:
117
117
 
118
- - **Logs overview**: table-based log stream plus a severity breakout to spot noisy services that are emitting within the selected time range; the bottom table also shows the modules that currently produce the most log volume.
119
- - **Error spotlight**: graph of error rate per service, a recent error table, and a quick stat showing how many error events arrived over the past five minutes.
120
- - **Service telemetry**: log rate by logger namespace, the top namespaces by volume, and a traced-log ratio that highlights how many records still carry `trace_id`.
121
- The dashboard also now surfaces request throughput per route and a cache hit ratio stat so you can compare HTTP behavior and cache effectiveness without building ad-hoc panels.
118
+ - **Logs overview**: log volume by log type, application log level breakdown, top application event names, and a recent log stream.
119
+ - **Error spotlight**: app error counts, request 5xx counts, schedule failures, an error-rate trend, a recent error stream, and a table of the top failing routes.
120
+ - **Service telemetry**: request throughput and latency percentiles, top request paths, cache hit ratio and latency, schedule status counts, and socket event volume.
122
121
 
123
122
  Edit the JSON in `grafana/provisioning/dashboards/files/` (or export updates from Grafana), then restart the stack or reload the dashboards to pick up your changes.
124
123
 
@@ -1,209 +1,302 @@
1
1
  {
2
+ "id": null,
3
+ "uid": "errors-spotlight",
4
+ "title": "Errors Spotlight",
5
+ "tags": ["errors", "logs"],
6
+ "timezone": "browser",
7
+ "schemaVersion": 38,
8
+ "version": 1,
9
+ "refresh": "10s",
10
+ "time": {
11
+ "from": "now-6h",
12
+ "to": "now"
13
+ },
2
14
  "annotations": {
3
15
  "list": [
4
16
  {
5
17
  "builtIn": 1,
6
- "datasource": {
7
- "type": "grafana",
8
- "uid": "-- Grafana --"
9
- },
18
+ "datasource": "-- Grafana --",
10
19
  "enable": true,
11
20
  "hide": true,
12
- "iconColor": "rgba(0, 211, 255, 1)",
13
21
  "name": "Annotations & Alerts",
14
22
  "type": "dashboard"
15
23
  }
16
24
  ]
17
25
  },
18
- "editable": true,
19
- "gnetId": null,
20
- "graphTooltip": 0,
21
- "id": null,
22
- "links": [],
23
- "liveNow": false,
26
+ "templating": {
27
+ "list": [
28
+ {
29
+ "name": "service",
30
+ "type": "query",
31
+ "datasource": {
32
+ "type": "loki",
33
+ "uid": "loki"
34
+ },
35
+ "definition": "label_values({service_name!=\"\"}, service_name)",
36
+ "query": "label_values({service_name!=\"\"}, service_name)",
37
+ "refresh": 1,
38
+ "includeAll": true,
39
+ "multi": true,
40
+ "allValue": ".*",
41
+ "current": {
42
+ "text": "All",
43
+ "value": "$__all"
44
+ }
45
+ }
46
+ ]
47
+ },
24
48
  "panels": [
25
49
  {
50
+ "id": 1,
51
+ "type": "stat",
52
+ "title": "App Errors (5m)",
26
53
  "datasource": {
27
54
  "type": "loki",
28
55
  "uid": "loki"
29
56
  },
57
+ "gridPos": {
58
+ "h": 5,
59
+ "w": 8,
60
+ "x": 0,
61
+ "y": 0
62
+ },
30
63
  "fieldConfig": {
31
64
  "defaults": {
32
- "custom": {},
33
- "mappings": [],
34
- "thresholds": {
35
- "mode": "absolute",
36
- "steps": [
37
- {
38
- "color": "green",
39
- "value": null
40
- }
41
- ]
42
- },
43
65
  "unit": "short"
44
66
  },
45
67
  "overrides": []
46
68
  },
69
+ "options": {
70
+ "reduceOptions": {
71
+ "calcs": ["lastNotNull"],
72
+ "fields": "",
73
+ "values": false
74
+ },
75
+ "orientation": "auto",
76
+ "colorMode": "value",
77
+ "graphMode": "none",
78
+ "justifyMode": "auto"
79
+ },
80
+ "targets": [
81
+ {
82
+ "refId": "A",
83
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"application\", level=\"error\"}[5m]))",
84
+ "datasource": {
85
+ "type": "loki",
86
+ "uid": "loki"
87
+ }
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "id": 2,
93
+ "type": "stat",
94
+ "title": "Request 5xx (5m)",
95
+ "datasource": {
96
+ "type": "loki",
97
+ "uid": "loki"
98
+ },
47
99
  "gridPos": {
48
- "h": 8,
49
- "w": 12,
50
- "x": 0,
100
+ "h": 5,
101
+ "w": 8,
102
+ "x": 8,
51
103
  "y": 0
52
104
  },
53
- "id": 1,
105
+ "fieldConfig": {
106
+ "defaults": {
107
+ "unit": "short"
108
+ },
109
+ "overrides": []
110
+ },
54
111
  "options": {
55
- "legend": {
56
- "displayMode": "list",
57
- "placement": "bottom"
58
- }
112
+ "reduceOptions": {
113
+ "calcs": ["lastNotNull"],
114
+ "fields": "",
115
+ "values": false
116
+ },
117
+ "orientation": "auto",
118
+ "colorMode": "value",
119
+ "graphMode": "none",
120
+ "justifyMode": "auto"
59
121
  },
60
122
  "targets": [
61
123
  {
124
+ "refId": "A",
125
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"request\", status=~\"5..\"}[5m]))",
62
126
  "datasource": {
63
127
  "type": "loki",
64
128
  "uid": "loki"
65
- },
66
- "expr": "sum(rate({severityText=\"ERROR\", service_name=~\"$service_name\"}[1m])) by (service_name)",
67
- "queryType": "range",
68
- "refId": "A"
129
+ }
69
130
  }
70
- ],
71
- "title": "Error rate per service",
72
- "type": "timeseries"
131
+ ]
73
132
  },
74
133
  {
134
+ "id": 3,
135
+ "type": "stat",
136
+ "title": "Schedule Failures (15m)",
75
137
  "datasource": {
76
138
  "type": "loki",
77
139
  "uid": "loki"
78
140
  },
141
+ "gridPos": {
142
+ "h": 5,
143
+ "w": 8,
144
+ "x": 16,
145
+ "y": 0
146
+ },
79
147
  "fieldConfig": {
80
148
  "defaults": {
81
- "custom": {},
82
- "mappings": [],
83
- "thresholds": {
84
- "mode": "absolute",
85
- "steps": [
86
- {
87
- "color": "green",
88
- "value": null
89
- }
90
- ]
91
- }
149
+ "unit": "short"
92
150
  },
93
151
  "overrides": []
94
152
  },
95
- "gridPos": {
96
- "h": 8,
97
- "w": 12,
98
- "x": 0,
99
- "y": 8
100
- },
101
- "id": 2,
102
153
  "options": {
103
- "showHeader": true
154
+ "reduceOptions": {
155
+ "calcs": ["lastNotNull"],
156
+ "fields": "",
157
+ "values": false
158
+ },
159
+ "orientation": "auto",
160
+ "colorMode": "value",
161
+ "graphMode": "none",
162
+ "justifyMode": "auto"
104
163
  },
105
164
  "targets": [
106
165
  {
166
+ "refId": "A",
167
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"schedule\", status=~\"(fail|error|timeout).*\"}[15m]))",
107
168
  "datasource": {
108
169
  "type": "loki",
109
170
  "uid": "loki"
110
- },
111
- "expr": "{severityText=\"ERROR\", service_name=~\"$service_name\"}",
112
- "queryType": "range",
113
- "refId": "A"
171
+ }
114
172
  }
115
- ],
116
- "title": "Recent errors",
117
- "type": "table"
173
+ ]
118
174
  },
119
175
  {
176
+ "id": 4,
177
+ "type": "timeseries",
178
+ "title": "Error Rate by Type",
120
179
  "datasource": {
121
180
  "type": "loki",
122
181
  "uid": "loki"
123
182
  },
183
+ "gridPos": {
184
+ "h": 8,
185
+ "w": 24,
186
+ "x": 0,
187
+ "y": 5
188
+ },
124
189
  "fieldConfig": {
125
190
  "defaults": {
126
- "custom": {},
127
- "mappings": [],
128
191
  "unit": "short"
129
192
  },
130
193
  "overrides": []
131
194
  },
195
+ "options": {
196
+ "legend": {
197
+ "displayMode": "list",
198
+ "placement": "bottom"
199
+ },
200
+ "tooltip": {
201
+ "mode": "single"
202
+ }
203
+ },
204
+ "targets": [
205
+ {
206
+ "refId": "A",
207
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"application\", level=\"error\"}[5m]))",
208
+ "legendFormat": "application",
209
+ "datasource": {
210
+ "type": "loki",
211
+ "uid": "loki"
212
+ }
213
+ },
214
+ {
215
+ "refId": "B",
216
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"request\", status=~\"5..\"}[5m]))",
217
+ "legendFormat": "request",
218
+ "datasource": {
219
+ "type": "loki",
220
+ "uid": "loki"
221
+ }
222
+ },
223
+ {
224
+ "refId": "C",
225
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"schedule\", status=~\"(fail|error|timeout).*\"}[5m]))",
226
+ "legendFormat": "schedule",
227
+ "datasource": {
228
+ "type": "loki",
229
+ "uid": "loki"
230
+ }
231
+ }
232
+ ]
233
+ },
234
+ {
235
+ "id": 5,
236
+ "type": "logs",
237
+ "title": "Recent Errors",
238
+ "datasource": {
239
+ "type": "loki",
240
+ "uid": "loki"
241
+ },
132
242
  "gridPos": {
133
- "h": 4,
134
- "w": 12,
243
+ "h": 10,
244
+ "w": 24,
135
245
  "x": 0,
136
- "y": 16
246
+ "y": 13
137
247
  },
138
- "id": 3,
139
248
  "options": {
140
- "orientation": "horizontal",
141
- "reduceOptions": {
142
- "calcs": [
143
- "sum"
144
- ],
145
- "fields": "",
146
- "values": false
147
- },
148
- "textMode": "auto"
249
+ "dedupStrategy": "none",
250
+ "showLabels": true,
251
+ "showTime": true,
252
+ "sortOrder": "Descending",
253
+ "wrapLogMessage": true
149
254
  },
150
255
  "targets": [
151
256
  {
257
+ "refId": "A",
258
+ "expr": "{service_name=~\"$service\", logType=\"application\", level=\"error\"} or {service_name=~\"$service\", logType=\"request\", status=~\"5..\"} or {service_name=~\"$service\", logType=\"schedule\", status=~\"(fail|error|timeout).*\"}",
152
259
  "datasource": {
153
260
  "type": "loki",
154
261
  "uid": "loki"
155
- },
156
- "expr": "sum(count_over_time({severityText=\"ERROR\", service_name=~\"$service_name\"}[5m]))",
157
- "queryType": "range",
158
- "refId": "A"
262
+ }
159
263
  }
160
- ],
161
- "title": "Errors in last 5 minutes",
162
- "type": "stat"
163
- }
164
- ],
165
- "refresh": "5s",
166
- "schemaVersion": 38,
167
- "style": "dark",
168
- "tags": [
169
- "errors",
170
- "alerts",
171
- "logs"
172
- ],
173
- "templating": {
174
- "list": [
175
- {
176
- "allValue": ".*",
177
- "datasource": {
178
- "type": "loki",
179
- "uid": "loki"
264
+ ]
265
+ },
266
+ {
267
+ "id": 6,
268
+ "type": "table",
269
+ "title": "Top Error Routes (1h)",
270
+ "datasource": {
271
+ "type": "loki",
272
+ "uid": "loki"
273
+ },
274
+ "gridPos": {
275
+ "h": 7,
276
+ "w": 24,
277
+ "x": 0,
278
+ "y": 23
279
+ },
280
+ "fieldConfig": {
281
+ "defaults": {
282
+ "unit": "short"
180
283
  },
181
- "definition": "label_values({service_name!=\"\"}, service_name)",
182
- "includeAll": true,
183
- "label": "Service",
184
- "multi": false,
185
- "name": "service_name",
186
- "options": [],
187
- "query": "label_values({service_name!=\"\"}, service_name)",
188
- "refresh": 2,
189
- "skipUrlSync": false,
190
- "type": "query",
191
- "useTags": false,
192
- "current": {
193
- "text": "All services",
194
- "value": ".*"
284
+ "overrides": []
285
+ },
286
+ "options": {
287
+ "showHeader": true
288
+ },
289
+ "targets": [
290
+ {
291
+ "refId": "A",
292
+ "expr": "topk(10, sum by (path, method) (count_over_time({service_name=~\"$service\", logType=\"request\", status=~\"5..\"}[1h])))",
293
+ "legendFormat": "{{method}} {{path}}",
294
+ "datasource": {
295
+ "type": "loki",
296
+ "uid": "loki"
297
+ }
195
298
  }
196
- }
197
- ]
198
- },
199
- "time": {
200
- "from": "now-1h",
201
- "to": "now"
202
- },
203
- "timepicker": {},
204
- "timezone": "browser",
205
- "title": "Error spotlight",
206
- "uid": "error-spotlight",
207
- "version": 1,
208
- "weekStart": ""
299
+ ]
300
+ }
301
+ ]
209
302
  }
@@ -1,221 +1,213 @@
1
1
  {
2
+ "id": null,
3
+ "uid": "logs-overview",
4
+ "title": "Logs Overview",
5
+ "tags": ["logs"],
6
+ "timezone": "browser",
7
+ "schemaVersion": 38,
8
+ "version": 1,
9
+ "refresh": "10s",
10
+ "time": {
11
+ "from": "now-6h",
12
+ "to": "now"
13
+ },
2
14
  "annotations": {
3
15
  "list": [
4
16
  {
5
17
  "builtIn": 1,
6
- "datasource": {
7
- "type": "grafana",
8
- "uid": "-- Grafana --"
9
- },
18
+ "datasource": "-- Grafana --",
10
19
  "enable": true,
11
20
  "hide": true,
12
- "iconColor": "rgba(0, 211, 255, 1)",
13
21
  "name": "Annotations & Alerts",
14
22
  "type": "dashboard"
15
23
  }
16
24
  ]
17
25
  },
18
- "editable": true,
19
- "gnetId": null,
20
- "graphTooltip": 0,
21
- "id": null,
22
- "links": [],
23
- "liveNow": false,
26
+ "templating": {
27
+ "list": [
28
+ {
29
+ "name": "service",
30
+ "type": "query",
31
+ "datasource": {
32
+ "type": "loki",
33
+ "uid": "loki"
34
+ },
35
+ "definition": "label_values({service_name!=\"\"}, service_name)",
36
+ "query": "label_values({service_name!=\"\"}, service_name)",
37
+ "refresh": 1,
38
+ "includeAll": true,
39
+ "multi": true,
40
+ "allValue": ".*",
41
+ "current": {
42
+ "text": "All",
43
+ "value": "$__all"
44
+ }
45
+ },
46
+ {
47
+ "name": "logType",
48
+ "type": "query",
49
+ "datasource": {
50
+ "type": "loki",
51
+ "uid": "loki"
52
+ },
53
+ "definition": "label_values({logType!=\"\"}, logType)",
54
+ "query": "label_values({logType!=\"\"}, logType)",
55
+ "refresh": 1,
56
+ "includeAll": true,
57
+ "multi": true,
58
+ "allValue": ".*",
59
+ "current": {
60
+ "text": "All",
61
+ "value": "$__all"
62
+ }
63
+ }
64
+ ]
65
+ },
24
66
  "panels": [
25
67
  {
68
+ "id": 1,
69
+ "type": "timeseries",
70
+ "title": "Log Volume by Type",
26
71
  "datasource": {
27
72
  "type": "loki",
28
73
  "uid": "loki"
29
74
  },
75
+ "gridPos": {
76
+ "h": 8,
77
+ "w": 24,
78
+ "x": 0,
79
+ "y": 0
80
+ },
30
81
  "fieldConfig": {
31
82
  "defaults": {
32
- "custom": {},
33
- "mappings": [],
34
- "thresholds": {
35
- "mode": "absolute",
36
- "steps": [
37
- {
38
- "color": "green",
39
- "value": null
40
- }
41
- ]
42
- }
83
+ "unit": "short"
43
84
  },
44
85
  "overrides": []
45
86
  },
46
- "gridPos": {
47
- "h": 12,
48
- "w": 12,
49
- "x": 0,
50
- "y": 0
51
- },
52
- "id": 1,
53
87
  "options": {
54
- "showLabels": true,
55
- "showTime": true,
56
- "wrapLogMessage": true
88
+ "legend": {
89
+ "displayMode": "list",
90
+ "placement": "bottom"
91
+ },
92
+ "tooltip": {
93
+ "mode": "single"
94
+ }
57
95
  },
58
96
  "targets": [
59
97
  {
98
+ "refId": "A",
99
+ "expr": "sum by (logType) (count_over_time({service_name=~\"$service\", logType=~\"$logType\"}[5m]))",
100
+ "legendFormat": "{{logType}}",
60
101
  "datasource": {
61
102
  "type": "loki",
62
103
  "uid": "loki"
63
- },
64
- "editorMode": "builder",
65
- "expr": "{service_name=~\"$service_name\"}",
66
- "queryType": "range",
67
- "refId": "A"
104
+ }
68
105
  }
69
- ],
70
- "title": "Log stream",
71
- "type": "logs"
106
+ ]
72
107
  },
73
108
  {
109
+ "id": 2,
110
+ "type": "bargauge",
111
+ "title": "App Log Levels (15m)",
74
112
  "datasource": {
75
113
  "type": "loki",
76
114
  "uid": "loki"
77
115
  },
116
+ "gridPos": {
117
+ "h": 6,
118
+ "w": 12,
119
+ "x": 0,
120
+ "y": 8
121
+ },
78
122
  "fieldConfig": {
79
123
  "defaults": {
80
- "custom": {},
81
- "mappings": [],
82
- "thresholds": {
83
- "mode": "absolute",
84
- "steps": [
85
- {
86
- "color": "green",
87
- "value": null
88
- }
89
- ]
90
- },
91
124
  "unit": "short"
92
125
  },
93
126
  "overrides": []
94
127
  },
95
- "gridPos": {
96
- "h": 8,
97
- "w": 12,
98
- "x": 0,
99
- "y": 12
100
- },
101
- "id": 2,
102
128
  "options": {
103
- "legend": {
104
- "displayMode": "list",
105
- "placement": "bottom"
106
- },
107
- "tooltip": {
108
- "mode": "single"
109
- }
129
+ "displayMode": "basic",
130
+ "orientation": "horizontal",
131
+ "showUnfilled": true
110
132
  },
111
133
  "targets": [
112
134
  {
135
+ "refId": "A",
136
+ "expr": "sum by (level) (count_over_time({service_name=~\"$service\", logType=\"application\"}[15m]))",
137
+ "legendFormat": "{{level}}",
113
138
  "datasource": {
114
139
  "type": "loki",
115
140
  "uid": "loki"
116
- },
117
- "expr": "sum(rate({service_name=~\"$service_name\"}[5m])) by (severityText)",
118
- "queryType": "range",
119
- "refId": "A"
141
+ }
120
142
  }
121
- ],
122
- "title": "Log rate by severity",
123
- "type": "timeseries"
124
- }
125
- ,
143
+ ]
144
+ },
126
145
  {
146
+ "id": 3,
147
+ "type": "table",
148
+ "title": "Top Application Events (1h)",
127
149
  "datasource": {
128
150
  "type": "loki",
129
151
  "uid": "loki"
130
152
  },
153
+ "gridPos": {
154
+ "h": 6,
155
+ "w": 12,
156
+ "x": 12,
157
+ "y": 8
158
+ },
131
159
  "fieldConfig": {
132
160
  "defaults": {
133
- "custom": {},
134
- "mappings": [],
135
- "thresholds": {
136
- "mode": "absolute",
137
- "steps": [
138
- {
139
- "color": "green",
140
- "value": null
141
- }
142
- ]
143
- }
161
+ "unit": "short"
144
162
  },
145
163
  "overrides": []
146
164
  },
165
+ "options": {
166
+ "showHeader": true
167
+ },
168
+ "targets": [
169
+ {
170
+ "refId": "A",
171
+ "expr": "topk(10, sum by (name) (count_over_time({service_name=~\"$service\", logType=\"application\"}[1h])))",
172
+ "legendFormat": "{{name}}",
173
+ "datasource": {
174
+ "type": "loki",
175
+ "uid": "loki"
176
+ }
177
+ }
178
+ ]
179
+ },
180
+ {
181
+ "id": 4,
182
+ "type": "logs",
183
+ "title": "Recent Application Logs",
184
+ "datasource": {
185
+ "type": "loki",
186
+ "uid": "loki"
187
+ },
147
188
  "gridPos": {
148
- "h": 8,
149
- "w": 12,
189
+ "h": 10,
190
+ "w": 24,
150
191
  "x": 0,
151
- "y": 20
192
+ "y": 14
152
193
  },
153
- "id": 3,
154
194
  "options": {
155
- "showHeader": true,
156
- "sortBy": [
157
- {
158
- "desc": true,
159
- "displayName": "Value"
160
- }
161
- ]
195
+ "dedupStrategy": "none",
196
+ "showLabels": true,
197
+ "showTime": true,
198
+ "sortOrder": "Descending",
199
+ "wrapLogMessage": true
162
200
  },
163
201
  "targets": [
164
202
  {
203
+ "refId": "A",
204
+ "expr": "{service_name=~\"$service\", logType=\"application\"}",
165
205
  "datasource": {
166
206
  "type": "loki",
167
207
  "uid": "loki"
168
- },
169
- "expr": "topk(5, sum by (module) (count_over_time({module!=\"\"}[5m])))",
170
- "queryType": "range",
171
- "refId": "A"
208
+ }
172
209
  }
173
- ],
174
- "title": "Top modules by log volume",
175
- "type": "table"
210
+ ]
176
211
  }
177
- ],
178
- "refresh": "15s",
179
- "schemaVersion": 38,
180
- "style": "dark",
181
- "tags": [
182
- "logs",
183
- "observability"
184
- ],
185
- "templating": {
186
- "list": [
187
- {
188
- "allValue": ".*",
189
- "datasource": {
190
- "type": "loki",
191
- "uid": "loki"
192
- },
193
- "definition": "label_values({service_name!=\"\"}, service_name)",
194
- "includeAll": true,
195
- "label": "Service",
196
- "multi": false,
197
- "name": "service_name",
198
- "options": [],
199
- "query": "label_values({service_name!=\"\"}, service_name)",
200
- "refresh": 2,
201
- "skipUrlSync": false,
202
- "type": "query",
203
- "useTags": false,
204
- "current": {
205
- "text": "All services",
206
- "value": ".*"
207
- }
208
- }
209
- ]
210
- },
211
- "time": {
212
- "from": "now-6h",
213
- "to": "now"
214
- },
215
- "timepicker": {},
216
- "timezone": "browser",
217
- "title": "Logs overview",
218
- "uid": "logs-overview",
219
- "version": 1,
220
- "weekStart": ""
212
+ ]
221
213
  }
@@ -1,56 +1,71 @@
1
1
  {
2
+ "id": null,
3
+ "uid": "service-telemetry",
4
+ "title": "Service Telemetry",
5
+ "tags": ["requests", "cache", "schedules", "sockets"],
6
+ "timezone": "browser",
7
+ "schemaVersion": 38,
8
+ "version": 1,
9
+ "refresh": "10s",
10
+ "time": {
11
+ "from": "now-6h",
12
+ "to": "now"
13
+ },
2
14
  "annotations": {
3
15
  "list": [
4
16
  {
5
17
  "builtIn": 1,
6
- "datasource": {
7
- "type": "grafana",
8
- "uid": "-- Grafana --"
9
- },
18
+ "datasource": "-- Grafana --",
10
19
  "enable": true,
11
20
  "hide": true,
12
- "iconColor": "rgba(0, 211, 255, 1)",
13
21
  "name": "Annotations & Alerts",
14
22
  "type": "dashboard"
15
23
  }
16
24
  ]
17
25
  },
18
- "editable": true,
19
- "gnetId": null,
20
- "graphTooltip": 0,
21
- "id": null,
22
- "links": [],
23
- "liveNow": false,
26
+ "templating": {
27
+ "list": [
28
+ {
29
+ "name": "service",
30
+ "type": "query",
31
+ "datasource": {
32
+ "type": "loki",
33
+ "uid": "loki"
34
+ },
35
+ "definition": "label_values({service_name!=\"\"}, service_name)",
36
+ "query": "label_values({service_name!=\"\"}, service_name)",
37
+ "refresh": 1,
38
+ "includeAll": true,
39
+ "multi": true,
40
+ "allValue": ".*",
41
+ "current": {
42
+ "text": "All",
43
+ "value": "$__all"
44
+ }
45
+ }
46
+ ]
47
+ },
24
48
  "panels": [
25
49
  {
50
+ "id": 1,
51
+ "type": "timeseries",
52
+ "title": "Request Throughput by Method",
26
53
  "datasource": {
27
54
  "type": "loki",
28
55
  "uid": "loki"
29
56
  },
57
+ "gridPos": {
58
+ "h": 7,
59
+ "w": 12,
60
+ "x": 0,
61
+ "y": 0
62
+ },
30
63
  "fieldConfig": {
31
64
  "defaults": {
32
- "custom": {},
33
- "mappings": [],
34
- "thresholds": {
35
- "mode": "absolute",
36
- "steps": [
37
- {
38
- "color": "green",
39
- "value": null
40
- }
41
- ]
42
- },
43
65
  "unit": "short"
44
66
  },
45
67
  "overrides": []
46
68
  },
47
- "gridPos": {
48
- "h": 8,
49
- "w": 12,
50
- "x": 0,
51
- "y": 0
52
- },
53
- "id": 1,
54
69
  "options": {
55
70
  "legend": {
56
71
  "displayMode": "list",
@@ -62,129 +77,165 @@
62
77
  },
63
78
  "targets": [
64
79
  {
80
+ "refId": "A",
81
+ "expr": "sum by (method) (count_over_time({service_name=~\"$service\", logType=\"request\"}[5m]))",
82
+ "legendFormat": "{{method}}",
65
83
  "datasource": {
66
84
  "type": "loki",
67
85
  "uid": "loki"
68
- },
69
- "expr": "sum(rate({service_name=~\"$service_name\"}[5m])) by (logger_name)",
70
- "queryType": "range",
71
- "refId": "A"
86
+ }
72
87
  }
73
- ],
74
- "title": "Log rate by logger",
75
- "type": "timeseries"
88
+ ]
76
89
  },
77
90
  {
91
+ "id": 2,
92
+ "type": "timeseries",
93
+ "title": "Request Latency (p50 / p95)",
78
94
  "datasource": {
79
95
  "type": "loki",
80
96
  "uid": "loki"
81
97
  },
98
+ "gridPos": {
99
+ "h": 7,
100
+ "w": 12,
101
+ "x": 12,
102
+ "y": 0
103
+ },
82
104
  "fieldConfig": {
83
105
  "defaults": {
84
- "custom": {},
85
- "mappings": [],
86
- "thresholds": {
87
- "mode": "absolute",
88
- "steps": [
89
- {
90
- "color": "green",
91
- "value": null
92
- }
93
- ]
94
- }
106
+ "unit": "ms"
95
107
  },
96
108
  "overrides": []
97
109
  },
110
+ "options": {
111
+ "legend": {
112
+ "displayMode": "list",
113
+ "placement": "bottom"
114
+ },
115
+ "tooltip": {
116
+ "mode": "single"
117
+ }
118
+ },
119
+ "targets": [
120
+ {
121
+ "refId": "A",
122
+ "expr": "quantile_over_time(0.5, {service_name=~\"$service\", logType=\"request\"} | unwrap durationMs [5m])",
123
+ "legendFormat": "p50",
124
+ "datasource": {
125
+ "type": "loki",
126
+ "uid": "loki"
127
+ }
128
+ },
129
+ {
130
+ "refId": "B",
131
+ "expr": "quantile_over_time(0.95, {service_name=~\"$service\", logType=\"request\"} | unwrap durationMs [5m])",
132
+ "legendFormat": "p95",
133
+ "datasource": {
134
+ "type": "loki",
135
+ "uid": "loki"
136
+ }
137
+ }
138
+ ]
139
+ },
140
+ {
141
+ "id": 3,
142
+ "type": "table",
143
+ "title": "Top Request Paths (1h)",
144
+ "datasource": {
145
+ "type": "loki",
146
+ "uid": "loki"
147
+ },
98
148
  "gridPos": {
99
- "h": 8,
149
+ "h": 7,
100
150
  "w": 12,
101
151
  "x": 0,
102
- "y": 8
152
+ "y": 7
153
+ },
154
+ "fieldConfig": {
155
+ "defaults": {
156
+ "unit": "short"
157
+ },
158
+ "overrides": []
103
159
  },
104
- "id": 2,
105
160
  "options": {
106
161
  "showHeader": true
107
162
  },
108
163
  "targets": [
109
164
  {
165
+ "refId": "A",
166
+ "expr": "topk(10, sum by (path) (count_over_time({service_name=~\"$service\", logType=\"request\"}[1h])))",
167
+ "legendFormat": "{{path}}",
110
168
  "datasource": {
111
169
  "type": "loki",
112
170
  "uid": "loki"
113
- },
114
- "expr": "topk(10, sum by (logger_name) (count_over_time({service_name=~\"$service_name\"}[5m])))",
115
- "queryType": "range",
116
- "refId": "A"
171
+ }
117
172
  }
118
- ],
119
- "title": "Top log namespaces",
120
- "type": "table"
173
+ ]
121
174
  },
122
175
  {
176
+ "id": 4,
177
+ "type": "stat",
178
+ "title": "Cache Hit Ratio (5m)",
123
179
  "datasource": {
124
180
  "type": "loki",
125
181
  "uid": "loki"
126
182
  },
183
+ "gridPos": {
184
+ "h": 5,
185
+ "w": 6,
186
+ "x": 12,
187
+ "y": 7
188
+ },
127
189
  "fieldConfig": {
128
190
  "defaults": {
129
- "custom": {},
130
- "mappings": [],
131
- "unit": "percent"
191
+ "unit": "percentunit",
192
+ "min": 0,
193
+ "max": 1
132
194
  },
133
195
  "overrides": []
134
196
  },
135
- "gridPos": {
136
- "h": 4,
137
- "w": 12,
138
- "x": 0,
139
- "y": 16
140
- },
141
- "id": 3,
142
197
  "options": {
143
- "orientation": "horizontal",
144
198
  "reduceOptions": {
145
- "calcs": [
146
- "mean"
147
- ],
199
+ "calcs": ["lastNotNull"],
148
200
  "fields": "",
149
201
  "values": false
150
202
  },
151
- "textMode": "auto"
203
+ "orientation": "auto",
204
+ "colorMode": "value",
205
+ "graphMode": "none",
206
+ "justifyMode": "auto"
152
207
  },
153
208
  "targets": [
154
209
  {
210
+ "refId": "A",
211
+ "expr": "sum(count_over_time({service_name=~\"$service\", logType=\"cache_trace\", operation=\"hit\"}[5m])) / sum(count_over_time({service_name=~\"$service\", logType=\"cache_trace\", operation=~\"hit|miss\"}[5m]))",
155
212
  "datasource": {
156
213
  "type": "loki",
157
214
  "uid": "loki"
158
- },
159
- "expr": "sum(count_over_time({trace_id!=\"\", service_name=~\"$service_name\"}[5m])) / sum(count_over_time({service_name=~\"$service_name\"}[5m])) * 100",
160
- "queryType": "range",
161
- "refId": "A"
215
+ }
162
216
  }
163
- ],
164
- "title": "Traced logs",
165
- "type": "stat"
166
- }
167
- ,
217
+ ]
218
+ },
168
219
  {
220
+ "id": 5,
221
+ "type": "timeseries",
222
+ "title": "Cache Latency by Operation",
169
223
  "datasource": {
170
224
  "type": "loki",
171
225
  "uid": "loki"
172
226
  },
227
+ "gridPos": {
228
+ "h": 5,
229
+ "w": 6,
230
+ "x": 18,
231
+ "y": 7
232
+ },
173
233
  "fieldConfig": {
174
234
  "defaults": {
175
- "custom": {},
176
- "mappings": [],
177
- "unit": "reqps"
235
+ "unit": "ms"
178
236
  },
179
237
  "overrides": []
180
238
  },
181
- "gridPos": {
182
- "h": 8,
183
- "w": 12,
184
- "x": 0,
185
- "y": 20
186
- },
187
- "id": 4,
188
239
  "options": {
189
240
  "legend": {
190
241
  "displayMode": "list",
@@ -196,106 +247,93 @@
196
247
  },
197
248
  "targets": [
198
249
  {
250
+ "refId": "A",
251
+ "expr": "avg_over_time({service_name=~\"$service\", logType=\"cache_trace\"} | unwrap durationMs [5m])",
252
+ "legendFormat": "{{operation}}",
199
253
  "datasource": {
200
254
  "type": "loki",
201
255
  "uid": "loki"
202
- },
203
- "expr": "sum(rate({http.route!=\"\", service_name=~\"$service_name\"}[1m])) by (http.route)",
204
- "queryType": "range",
205
- "refId": "A"
256
+ }
206
257
  }
207
- ],
208
- "title": "Request rate by route",
209
- "type": "timeseries"
210
- }
211
- ,
258
+ ]
259
+ },
212
260
  {
261
+ "id": 6,
262
+ "type": "bargauge",
263
+ "title": "Schedule Status (15m)",
213
264
  "datasource": {
214
265
  "type": "loki",
215
266
  "uid": "loki"
216
267
  },
268
+ "gridPos": {
269
+ "h": 6,
270
+ "w": 12,
271
+ "x": 0,
272
+ "y": 14
273
+ },
217
274
  "fieldConfig": {
218
275
  "defaults": {
219
- "custom": {},
220
- "mappings": [],
221
- "unit": "percent"
276
+ "unit": "short"
222
277
  },
223
278
  "overrides": []
224
279
  },
280
+ "options": {
281
+ "displayMode": "basic",
282
+ "orientation": "horizontal",
283
+ "showUnfilled": true
284
+ },
285
+ "targets": [
286
+ {
287
+ "refId": "A",
288
+ "expr": "sum by (status) (count_over_time({service_name=~\"$service\", logType=\"schedule\"}[15m]))",
289
+ "legendFormat": "{{status}}",
290
+ "datasource": {
291
+ "type": "loki",
292
+ "uid": "loki"
293
+ }
294
+ }
295
+ ]
296
+ },
297
+ {
298
+ "id": 7,
299
+ "type": "timeseries",
300
+ "title": "Socket Events by Direction",
301
+ "datasource": {
302
+ "type": "loki",
303
+ "uid": "loki"
304
+ },
225
305
  "gridPos": {
226
- "h": 4,
306
+ "h": 6,
227
307
  "w": 12,
228
- "x": 0,
229
- "y": 28
308
+ "x": 12,
309
+ "y": 14
310
+ },
311
+ "fieldConfig": {
312
+ "defaults": {
313
+ "unit": "short"
314
+ },
315
+ "overrides": []
230
316
  },
231
- "id": 5,
232
317
  "options": {
233
- "orientation": "horizontal",
234
- "reduceOptions": {
235
- "calcs": [
236
- "mean"
237
- ],
238
- "fields": "",
239
- "values": false
318
+ "legend": {
319
+ "displayMode": "list",
320
+ "placement": "bottom"
240
321
  },
241
- "textMode": "auto"
322
+ "tooltip": {
323
+ "mode": "single"
324
+ }
242
325
  },
243
326
  "targets": [
244
327
  {
328
+ "refId": "A",
329
+ "expr": "sum by (direction) (count_over_time({service_name=~\"$service\", logType=\"socket\"}[5m]))",
330
+ "legendFormat": "{{direction}}",
245
331
  "datasource": {
246
332
  "type": "loki",
247
333
  "uid": "loki"
248
- },
249
- "expr": "sum(count_over_time({cache.system!=\"\", cache.hit=\"true\", service_name=~\"$service_name\"}[5m])) / sum(count_over_time({cache.system!=\"\", service_name=~\"$service_name\"}[5m])) * 100",
250
- "queryType": "range",
251
- "refId": "A"
334
+ }
252
335
  }
253
- ],
254
- "title": "Cache hit ratio",
255
- "type": "stat"
336
+ ]
256
337
  }
257
- ],
258
- "refresh": "10s",
259
- "schemaVersion": 38,
260
- "style": "dark",
261
- "tags": [
262
- "service",
263
- "telemetry"
264
- ],
265
- "templating": {
266
- "list": [
267
- {
268
- "allValue": ".*",
269
- "datasource": {
270
- "type": "loki",
271
- "uid": "loki"
272
- },
273
- "definition": "label_values({service_name!=\"\"}, service_name)",
274
- "includeAll": true,
275
- "label": "Service",
276
- "multi": false,
277
- "name": "service_name",
278
- "options": [],
279
- "query": "label_values({service_name!=\"\"}, service_name)",
280
- "refresh": 2,
281
- "skipUrlSync": false,
282
- "type": "query",
283
- "useTags": false,
284
- "current": {
285
- "text": "All services",
286
- "value": ".*"
287
- }
288
- }
289
- ]
290
- },
291
- "time": {
292
- "from": "now-1h",
293
- "to": "now"
294
- },
295
- "timepicker": {},
296
- "timezone": "browser",
297
- "title": "Service telemetry",
298
- "uid": "service-telemetry",
299
- "version": 1,
300
- "weekStart": ""
338
+ ]
301
339
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@emeryld/obs-stack",
3
- "version": "0.1.13",
3
+ "version": "0.1.14",
4
4
  "description": "Docker Compose-based Grafana + Tempo + Loki + OpenTelemetry Collector stack",
5
5
  "type": "commonjs",
6
6
  "bin": {