@smithers-orchestrator/observability 0.23.0 → 0.24.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ version: "3.9"
2
+
3
+ services:
4
+ otel-collector:
5
+ image: otel/opentelemetry-collector-contrib:0.148.0
6
+ restart: unless-stopped
7
+ command: ["--config", "/etc/otelcol/config.yml"]
8
+ volumes:
9
+ - ./otel-collector-config.yml:/etc/otelcol/config.yml:ro
10
+ ports:
11
+ - "4317:4317" # gRPC
12
+ - "4318:4318" # HTTP (Smithers sends here)
13
+ - "8889:8889" # Prometheus exporter scrape endpoint
14
+ depends_on:
15
+ tempo:
16
+ condition: service_healthy
17
+ loki:
18
+ condition: service_healthy
19
+ healthcheck:
20
+ test: ["CMD", "/otelcol-contrib", "--version"]
21
+ interval: 10s
22
+ timeout: 5s
23
+ retries: 12
24
+ start_period: 10s
25
+
26
+ loki:
27
+ image: grafana/loki:3.3.2
28
+ restart: unless-stopped
29
+ command: ["-config.file=/etc/loki/local-config.yaml"]
30
+ volumes:
31
+ - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
32
+ - ./data/loki:/loki
33
+ ports:
34
+ - "3100:3100"
35
+ healthcheck:
36
+ test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3100/ready"]
37
+ interval: 10s
38
+ timeout: 5s
39
+ retries: 18
40
+ start_period: 10s
41
+
42
+ prometheus:
43
+ image: prom/prometheus:v3.2.0
44
+ restart: unless-stopped
45
+ volumes:
46
+ - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
47
+ - ./data/prometheus:/prometheus
48
+ ports:
49
+ - "9090:9090"
50
+ depends_on:
51
+ otel-collector:
52
+ condition: service_healthy
53
+ healthcheck:
54
+ test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:9090/-/ready"]
55
+ interval: 10s
56
+ timeout: 5s
57
+ retries: 18
58
+ start_period: 10s
59
+
60
+ tempo:
61
+ image: grafana/tempo:2.7.0
62
+ restart: unless-stopped
63
+ command: ["-config.file=/etc/tempo/tempo.yml"]
64
+ volumes:
65
+ - ./tempo/tempo.yml:/etc/tempo/tempo.yml:ro
66
+ - ./data/tempo:/tmp/tempo
67
+ ports:
68
+ - "3200:3200" # Tempo query API
69
+ - "4317" # OTLP gRPC (only the collector needs it, via tempo:4317; this form still publishes it on a random host port)
70
+ healthcheck:
71
+ test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3200/api/search"]
72
+ interval: 10s
73
+ timeout: 5s
74
+ retries: 24
75
+ start_period: 20s
76
+
77
+ grafana:
78
+ image: grafana/grafana:11.5.0
79
+ restart: unless-stopped
80
+ environment:
81
+ # Local dev default is "admin"; override via env for anything shared.
82
+ - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
83
+ # Anonymous browsing stays on for frictionless local dev, but read-only.
84
+ - GF_AUTH_ANONYMOUS_ENABLED=${GF_AUTH_ANONYMOUS_ENABLED:-true}
85
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=${GF_AUTH_ANONYMOUS_ORG_ROLE:-Viewer}
86
+ volumes:
87
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
88
+ - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
89
+ - ./data/grafana:/var/lib/grafana
90
+ ports:
91
+ - "3001:3000"
92
+ depends_on:
93
+ prometheus:
94
+ condition: service_healthy
95
+ tempo:
96
+ condition: service_healthy
97
+ loki:
98
+ condition: service_healthy
99
+ healthcheck:
100
+ test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3000/api/health"]
101
+ interval: 10s
102
+ timeout: 5s
103
+ retries: 24
104
+ start_period: 20s
@@ -0,0 +1,453 @@
1
+ {
2
+ "uid": "smithers-overview",
3
+ "title": "Smithers Overview",
4
+ "tags": ["smithers", "ai", "orchestration"],
5
+ "timezone": "browser",
6
+ "schemaVersion": 39,
7
+ "version": 1,
8
+ "refresh": "10s",
9
+ "time": { "from": "now-1h", "to": "now" },
10
+ "editable": true,
11
+ "panels": [
12
+ {
13
+ "type": "row",
14
+ "title": "Overview",
15
+ "collapsed": false,
16
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
17
+ "id": 1
18
+ },
19
+ {
20
+ "type": "stat",
21
+ "title": "Active Runs",
22
+ "id": 2,
23
+ "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
24
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
25
+ "targets": [{ "expr": "smithers_smithers_runs_active", "refId": "A" }],
26
+ "fieldConfig": {
27
+ "defaults": {
28
+ "thresholds": {
29
+ "mode": "absolute",
30
+ "steps": [
31
+ { "color": "green", "value": null },
32
+ { "color": "yellow", "value": 5 },
33
+ { "color": "red", "value": 10 }
34
+ ]
35
+ },
36
+ "color": { "mode": "thresholds" }
37
+ }
38
+ },
39
+ "options": {
40
+ "reduceOptions": { "calcs": ["lastNotNull"] },
41
+ "colorMode": "background",
42
+ "graphMode": "area"
43
+ }
44
+ },
45
+ {
46
+ "type": "stat",
47
+ "title": "Total Runs",
48
+ "id": 3,
49
+ "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
50
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
51
+ "targets": [{ "expr": "smithers_smithers_runs_total", "refId": "A" }],
52
+ "fieldConfig": { "defaults": { "color": { "mode": "fixed", "fixedColor": "blue" } } },
53
+ "options": {
54
+ "reduceOptions": { "calcs": ["lastNotNull"] },
55
+ "colorMode": "value",
56
+ "graphMode": "area"
57
+ }
58
+ },
59
+ {
60
+ "type": "stat",
61
+ "title": "Active Nodes",
62
+ "id": 4,
63
+ "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
64
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
65
+ "targets": [{ "expr": "smithers_smithers_nodes_active", "refId": "A" }],
66
+ "fieldConfig": {
67
+ "defaults": {
68
+ "thresholds": {
69
+ "mode": "absolute",
70
+ "steps": [
71
+ { "color": "green", "value": null },
72
+ { "color": "yellow", "value": 10 },
73
+ { "color": "red", "value": 20 }
74
+ ]
75
+ },
76
+ "color": { "mode": "thresholds" }
77
+ }
78
+ },
79
+ "options": {
80
+ "reduceOptions": { "calcs": ["lastNotNull"] },
81
+ "colorMode": "background",
82
+ "graphMode": "area"
83
+ }
84
+ },
85
+ {
86
+ "type": "stat",
87
+ "title": "Node Success Rate",
88
+ "id": 5,
89
+ "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
90
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
91
+ "targets": [{
92
+ "expr": "(smithers_smithers_nodes_finished / (smithers_smithers_nodes_finished + smithers_smithers_nodes_failed > 0)) * 100 or vector(100)",
93
+ "refId": "A"
94
+ }],
95
+ "fieldConfig": {
96
+ "defaults": {
97
+ "unit": "percent",
98
+ "thresholds": {
99
+ "mode": "absolute",
100
+ "steps": [
101
+ { "color": "red", "value": null },
102
+ { "color": "yellow", "value": 90 },
103
+ { "color": "green", "value": 99 }
104
+ ]
105
+ },
106
+ "color": { "mode": "thresholds" },
107
+ "min": 0,
108
+ "max": 100
109
+ }
110
+ },
111
+ "options": {
112
+ "reduceOptions": { "calcs": ["lastNotNull"] },
113
+ "colorMode": "background",
114
+ "graphMode": "none"
115
+ }
116
+ },
117
+ {
118
+ "type": "stat",
119
+ "title": "Cache Hit Rate",
120
+ "id": 6,
121
+ "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
122
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
123
+ "targets": [{
124
+ "expr": "(smithers_smithers_cache_hits / (smithers_smithers_cache_hits + smithers_smithers_cache_misses > 0)) * 100 or vector(0)",
125
+ "refId": "A"
126
+ }],
127
+ "fieldConfig": {
128
+ "defaults": {
129
+ "unit": "percent",
130
+ "thresholds": {
131
+ "mode": "absolute",
132
+ "steps": [
133
+ { "color": "red", "value": null },
134
+ { "color": "yellow", "value": 30 },
135
+ { "color": "green", "value": 70 }
136
+ ]
137
+ },
138
+ "color": { "mode": "thresholds" },
139
+ "min": 0,
140
+ "max": 100
141
+ }
142
+ },
143
+ "options": {
144
+ "reduceOptions": { "calcs": ["lastNotNull"] },
145
+ "colorMode": "background",
146
+ "graphMode": "none"
147
+ }
148
+ },
149
+ {
150
+ "type": "stat",
151
+ "title": "DB Retries",
152
+ "id": 7,
153
+ "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
154
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
155
+ "targets": [{ "expr": "smithers_smithers_db_retries", "refId": "A" }],
156
+ "fieldConfig": {
157
+ "defaults": {
158
+ "thresholds": {
159
+ "mode": "absolute",
160
+ "steps": [
161
+ { "color": "green", "value": null },
162
+ { "color": "yellow", "value": 5 },
163
+ { "color": "red", "value": 20 }
164
+ ]
165
+ },
166
+ "color": { "mode": "thresholds" }
167
+ }
168
+ },
169
+ "options": {
170
+ "reduceOptions": { "calcs": ["lastNotNull"] },
171
+ "colorMode": "value",
172
+ "graphMode": "area"
173
+ }
174
+ },
175
+
176
+ {
177
+ "type": "row",
178
+ "title": "Runs & Nodes",
179
+ "collapsed": false,
180
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
181
+ "id": 10
182
+ },
183
+ {
184
+ "type": "timeseries",
185
+ "title": "Node Throughput",
186
+ "id": 11,
187
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
188
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
189
+ "targets": [
190
+ { "expr": "rate(smithers_smithers_nodes_started[5m])", "legendFormat": "Started", "refId": "A" },
191
+ { "expr": "rate(smithers_smithers_nodes_finished[5m])", "legendFormat": "Finished", "refId": "B" },
192
+ { "expr": "rate(smithers_smithers_nodes_failed[5m])", "legendFormat": "Failed", "refId": "C" }
193
+ ],
194
+ "fieldConfig": {
195
+ "defaults": {
196
+ "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 10, "gradientMode": "scheme" },
197
+ "color": { "mode": "palette-classic" },
198
+ "unit": "ops"
199
+ }
200
+ },
201
+ "options": {
202
+ "tooltip": { "mode": "multi", "sort": "desc" },
203
+ "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
204
+ }
205
+ },
206
+ {
207
+ "type": "heatmap",
208
+ "title": "Node Duration",
209
+ "id": 12,
210
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
211
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
212
+ "targets": [{
213
+ "expr": "sum(increase(smithers_smithers_node_duration_ms_bucket[5m])) by (le)",
214
+ "legendFormat": "{{le}}",
215
+ "refId": "A",
216
+ "format": "heatmap"
217
+ }],
218
+ "options": {
219
+ "calculate": false,
220
+ "yAxis": { "unit": "ms" },
221
+ "color": { "mode": "scheme", "scheme": "Oranges", "steps": 64 },
222
+ "cellGap": 1
223
+ }
224
+ },
225
+
226
+ {
227
+ "type": "row",
228
+ "title": "Tools",
229
+ "collapsed": false,
230
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
231
+ "id": 20
232
+ },
233
+ {
234
+ "type": "timeseries",
235
+ "title": "Tool Call Rate",
236
+ "id": 21,
237
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
238
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
239
+ "targets": [{
240
+ "expr": "rate(smithers_smithers_tool_calls_total[5m])",
241
+ "legendFormat": "Tool Calls / sec",
242
+ "refId": "A"
243
+ }],
244
+ "fieldConfig": {
245
+ "defaults": {
246
+ "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 10 },
247
+ "color": { "mode": "fixed", "fixedColor": "purple" },
248
+ "unit": "ops"
249
+ }
250
+ },
251
+ "options": {
252
+ "tooltip": { "mode": "single" },
253
+ "legend": { "displayMode": "list", "placement": "bottom" }
254
+ }
255
+ },
256
+ {
257
+ "type": "heatmap",
258
+ "title": "Tool Duration",
259
+ "id": 22,
260
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
261
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
262
+ "targets": [{
263
+ "expr": "sum(increase(smithers_smithers_tool_duration_ms_bucket[5m])) by (le)",
264
+ "legendFormat": "{{le}}",
265
+ "refId": "A",
266
+ "format": "heatmap"
267
+ }],
268
+ "options": {
269
+ "calculate": false,
270
+ "yAxis": { "unit": "ms" },
271
+ "color": { "mode": "scheme", "scheme": "Blues", "steps": 64 },
272
+ "cellGap": 1
273
+ }
274
+ },
275
+
276
+ {
277
+ "type": "row",
278
+ "title": "Infrastructure",
279
+ "collapsed": false,
280
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
281
+ "id": 30
282
+ },
283
+ {
284
+ "type": "timeseries",
285
+ "title": "HTTP Request Latency",
286
+ "id": 31,
287
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 24 },
288
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
289
+ "targets": [
290
+ {
291
+ "expr": "histogram_quantile(0.50, sum(rate(smithers_smithers_http_request_duration_ms_bucket[5m])) by (le))",
292
+ "legendFormat": "p50",
293
+ "refId": "A"
294
+ },
295
+ {
296
+ "expr": "histogram_quantile(0.95, sum(rate(smithers_smithers_http_request_duration_ms_bucket[5m])) by (le))",
297
+ "legendFormat": "p95",
298
+ "refId": "B"
299
+ },
300
+ {
301
+ "expr": "histogram_quantile(0.99, sum(rate(smithers_smithers_http_request_duration_ms_bucket[5m])) by (le))",
302
+ "legendFormat": "p99",
303
+ "refId": "C"
304
+ }
305
+ ],
306
+ "fieldConfig": {
307
+ "defaults": {
308
+ "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 5 },
309
+ "color": { "mode": "palette-classic" },
310
+ "unit": "ms"
311
+ }
312
+ },
313
+ "options": {
314
+ "tooltip": { "mode": "multi", "sort": "desc" },
315
+ "legend": { "displayMode": "list", "placement": "bottom" }
316
+ }
317
+ },
318
+ {
319
+ "type": "heatmap",
320
+ "title": "DB Query Duration",
321
+ "id": 32,
322
+ "gridPos": { "h": 8, "w": 8, "x": 8, "y": 24 },
323
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
324
+ "targets": [{
325
+ "expr": "sum(increase(smithers_smithers_db_query_ms_bucket[5m])) by (le)",
326
+ "legendFormat": "{{le}}",
327
+ "refId": "A",
328
+ "format": "heatmap"
329
+ }],
330
+ "options": {
331
+ "calculate": false,
332
+ "yAxis": { "unit": "ms" },
333
+ "color": { "mode": "scheme", "scheme": "Greens", "steps": 64 },
334
+ "cellGap": 1
335
+ }
336
+ },
337
+ {
338
+ "type": "timeseries",
339
+ "title": "VCS & Hot Reload",
340
+ "id": 33,
341
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 24 },
342
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
343
+ "targets": [
344
+ {
345
+ "expr": "rate(smithers_smithers_hot_reloads[5m])",
346
+ "legendFormat": "Hot Reloads / sec",
347
+ "refId": "A"
348
+ },
349
+ {
350
+ "expr": "rate(smithers_smithers_hot_reload_failures[5m])",
351
+ "legendFormat": "Reload Failures / sec",
352
+ "refId": "B"
353
+ },
354
+ {
355
+ "expr": "histogram_quantile(0.95, sum(rate(smithers_smithers_vcs_duration_ms_bucket[5m])) by (le))",
356
+ "legendFormat": "VCS p95 (ms)",
357
+ "refId": "C"
358
+ }
359
+ ],
360
+ "fieldConfig": {
361
+ "defaults": {
362
+ "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 10 },
363
+ "color": { "mode": "palette-classic" }
364
+ }
365
+ },
366
+ "options": {
367
+ "tooltip": { "mode": "multi", "sort": "desc" },
368
+ "legend": { "displayMode": "list", "placement": "bottom" }
369
+ }
370
+ },
371
+
372
+ {
373
+ "type": "row",
374
+ "title": "Approvals",
375
+ "collapsed": false,
376
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
377
+ "id": 40
378
+ },
379
+ {
380
+ "type": "timeseries",
381
+ "title": "Approval Flow",
382
+ "id": 41,
383
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
384
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
385
+ "targets": [
386
+ { "expr": "rate(smithers_smithers_approvals_requested[5m])", "legendFormat": "Requested", "refId": "A" },
387
+ { "expr": "rate(smithers_smithers_approvals_granted[5m])", "legendFormat": "Granted", "refId": "B" },
388
+ { "expr": "rate(smithers_smithers_approvals_denied[5m])", "legendFormat": "Denied", "refId": "C" }
389
+ ],
390
+ "fieldConfig": {
391
+ "defaults": {
392
+ "custom": { "drawStyle": "bars", "lineWidth": 1, "fillOpacity": 50 },
393
+ "color": { "mode": "palette-classic" },
394
+ "unit": "ops"
395
+ }
396
+ },
397
+ "options": {
398
+ "tooltip": { "mode": "multi" },
399
+ "legend": { "displayMode": "list", "placement": "bottom" }
400
+ }
401
+ },
402
+ {
403
+ "type": "timeseries",
404
+ "title": "HTTP Request Rate",
405
+ "id": 42,
406
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
407
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
408
+ "targets": [{
409
+ "expr": "rate(smithers_smithers_http_requests[5m])",
410
+ "legendFormat": "Requests / sec",
411
+ "refId": "A"
412
+ }],
413
+ "fieldConfig": {
414
+ "defaults": {
415
+ "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 15, "gradientMode": "scheme" },
416
+ "color": { "mode": "fixed", "fixedColor": "green" },
417
+ "unit": "reqps"
418
+ }
419
+ },
420
+ "options": {
421
+ "tooltip": { "mode": "single" },
422
+ "legend": { "displayMode": "list", "placement": "bottom" }
423
+ }
424
+ },
425
+
426
+ {
427
+ "type": "row",
428
+ "title": "Traces",
429
+ "collapsed": true,
430
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 },
431
+ "id": 50,
432
+ "panels": [
433
+ {
434
+ "type": "traces",
435
+ "title": "Recent Traces",
436
+ "id": 51,
437
+ "gridPos": { "h": 12, "w": 24, "x": 0, "y": 42 },
438
+ "datasource": { "type": "tempo", "uid": "tempo" },
439
+ "targets": [{
440
+ "datasource": { "type": "tempo", "uid": "tempo" },
441
+ "queryType": "traceqlSearch",
442
+ "limit": 20,
443
+ "tableType": "traces",
444
+ "filters": [
445
+ { "id": "service-name", "tag": "service.name", "operator": "=", "value": ["smithers"], "scope": "resource" }
446
+ ],
447
+ "refId": "A"
448
+ }]
449
+ }
450
+ ]
451
+ }
452
+ ]
453
+ }
@@ -0,0 +1,12 @@
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: default
5
+ orgId: 1
6
+ folder: ""
7
+ type: file
8
+ disableDeletion: false
9
+ updateIntervalSeconds: 30
10
+ allowUiUpdates: true
11
+ options:
12
+ path: /var/lib/grafana/dashboards
@@ -0,0 +1,49 @@
1
+ apiVersion: 1
2
+
3
+ datasources:
4
+ - name: Prometheus
5
+ type: prometheus
6
+ uid: prometheus
7
+ access: proxy
8
+ url: http://prometheus:9090
9
+ isDefault: true
10
+ editable: false
11
+ jsonData:
12
+ httpMethod: POST
13
+ exemplarTraceIdDestinations:
14
+ - name: traceID
15
+ datasourceUid: tempo
16
+
17
+ - name: Loki
18
+ type: loki
19
+ uid: loki
20
+ access: proxy
21
+ url: http://loki:3100
22
+ editable: false
23
+ jsonData:
24
+ maxLines: 1000
25
+
26
+ - name: Tempo
27
+ type: tempo
28
+ uid: tempo
29
+ access: proxy
30
+ url: http://tempo:3200
31
+ editable: false
32
+ jsonData:
33
+ tracesToLogs:
34
+ datasourceUid: loki
35
+ tags: ["service.name"]
36
+ mappedTags:
37
+ - key: service.name
38
+ value: service.name
39
+ mapTagNamesEnabled: false
40
+ spanStartTimeShift: "-1h"
41
+ spanEndTimeShift: "1h"
42
+ filterByTraceID: false
43
+ filterBySpanID: false
44
+ tracesToMetrics:
45
+ datasourceUid: prometheus
46
+ spanStartTimeShift: "-1h"
47
+ spanEndTimeShift: "1h"
48
+ nodeGraph:
49
+ enabled: true
@@ -0,0 +1,35 @@
1
+ auth_enabled: false
2
+
3
+ server:
4
+ http_listen_port: 3100
5
+
6
+ common:
7
+ instance_addr: 127.0.0.1
8
+ path_prefix: /loki
9
+ storage:
10
+ filesystem:
11
+ chunks_directory: /loki/chunks
12
+ rules_directory: /loki/rules
13
+ replication_factor: 1
14
+ ring:
15
+ kvstore:
16
+ store: inmemory
17
+
18
+ schema_config:
19
+ configs:
20
+ - from: 2020-10-24
21
+ store: tsdb
22
+ object_store: filesystem
23
+ schema: v13
24
+ index:
25
+ prefix: index_
26
+ period: 24h
27
+
28
+ limits_config:
29
+ allow_structured_metadata: true
30
+
31
+ ruler:
32
+ alertmanager_url: http://localhost:9093
33
+
34
+ analytics:
35
+ reporting_enabled: false
@@ -0,0 +1,38 @@
1
+ receivers:
2
+ otlp:
3
+ protocols:
4
+ grpc:
5
+ endpoint: "0.0.0.0:4317"
6
+ http:
7
+ endpoint: "0.0.0.0:4318"
8
+
9
+ exporters:
10
+ prometheus:
11
+ endpoint: "0.0.0.0:8889"
12
+ namespace: smithers
13
+ otlp_grpc/tempo:
14
+ endpoint: "tempo:4317"
15
+ tls:
16
+ insecure: true
17
+ otlp_http/loki:
18
+ endpoint: "http://loki:3100/otlp"
19
+
20
+ processors:
21
+ batch:
22
+ timeout: 5s
23
+ send_batch_size: 1024
24
+
25
+ service:
26
+ pipelines:
27
+ metrics:
28
+ receivers: [otlp]
29
+ processors: [batch]
30
+ exporters: [prometheus]
31
+ traces:
32
+ receivers: [otlp]
33
+ processors: [batch]
34
+ exporters: [otlp_grpc/tempo]
35
+ logs:
36
+ receivers: [otlp]
37
+ processors: [batch]
38
+ exporters: [otlp_http/loki]
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smithers-orchestrator/observability",
3
- "version": "0.23.0",
3
+ "version": "0.24.2",
4
4
  "description": "Concrete Smithers metrics, logging, tracing, and observability integrations",
5
5
  "type": "module",
6
6
  "sideEffects": false,
@@ -22,14 +22,20 @@
22
22
  }
23
23
  },
24
24
  "files": [
25
- "src/"
25
+ "src/",
26
+ "docker-compose.otel.yml",
27
+ "otel-collector-config.yml",
28
+ "prometheus/",
29
+ "tempo/",
30
+ "loki/",
31
+ "grafana/"
26
32
  ],
27
33
  "dependencies": {
28
34
  "@effect/opentelemetry": "^0.63.0",
29
35
  "@effect/platform": "^0.96.0",
30
36
  "@effect/platform-bun": "^0.89.0",
31
37
  "effect": "^3.21.1",
32
- "@smithers-orchestrator/agents": "0.23.0"
38
+ "@smithers-orchestrator/agents": "0.24.2"
33
39
  },
34
40
  "devDependencies": {
35
41
  "@types/bun": "latest",
@@ -0,0 +1,8 @@
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+
5
+ scrape_configs:
6
+ - job_name: "otel-collector"
7
+ static_configs:
8
+ - targets: ["otel-collector:8889"]
@@ -58,11 +58,17 @@ export function buildOtelLogRecord(body, attributes, severity) {
58
58
  * @returns {OtelLogSeverity}
59
59
  */
60
60
  export function inferCanonicalSeverity(event) {
61
- return event.event.kind === "capture.error"
62
- ? "ERROR"
63
- : event.event.kind === "capture.warning" || event.event.kind === "stderr"
64
- ? "WARN"
65
- : "INFO";
61
+ if (event.event.kind === "capture.error") {
62
+ // Truncated output is a non-fatal capture issue, not an error
63
+ if (event.payload?.reason === "truncated-json-stream") {
64
+ return "WARN";
65
+ }
66
+ return "ERROR";
67
+ }
68
+ if (event.event.kind === "capture.warning" || event.event.kind === "stderr") {
69
+ return "WARN";
70
+ }
71
+ return "INFO";
66
72
  }
67
73
 
68
74
  /**
package/src/index.d.ts CHANGED
@@ -838,6 +838,12 @@ declare const toolOutputTruncatedTotal: Metric.Metric.Counter<number>;
838
838
 
839
839
  declare const eventsEmittedTotal: Metric.Metric.Counter<number>;
840
840
 
841
+ declare const snapshotsCaptured: Metric.Metric.Counter<number>;
842
+
843
+ declare const runForksCreated: Metric.Metric.Counter<number>;
844
+
845
+ declare const replaysStarted: Metric.Metric.Counter<number>;
846
+
841
847
  declare const activeRuns: Metric.Metric.Gauge<number>;
842
848
 
843
849
  declare const activeNodes: Metric.Metric.Gauge<number>;
@@ -906,6 +912,8 @@ declare const sandboxTransportDurationMs: Metric.Metric<effect_MetricKeyType.Met
906
912
 
907
913
  declare const sandboxPatchCount: Metric.Metric<effect_MetricKeyType.MetricKeyType.Histogram, number, effect_MetricState.MetricState.Histogram>;
908
914
 
915
+ declare const snapshotDuration: Metric.Metric<effect_MetricKeyType.MetricKeyType.Histogram, number, effect_MetricState.MetricState.Histogram>;
916
+
909
917
  declare const rewindTotal: Metric.Metric.Counter<number>;
910
918
 
911
919
  declare const rewindRollbackTotal: Metric.Metric.Counter<number>;
@@ -1031,4 +1039,4 @@ type SmithersMetricDefinition = SmithersMetricDefinition$2;
1031
1039
  type SmithersObservabilityOptions = SmithersObservabilityOptions$4;
1032
1040
  type SmithersObservabilityService = SmithersObservabilityService$1;
1033
1041
 
1034
- export { type CorrelationContext, CorrelationContextLive, type CorrelationContextPatch, CorrelationContextService, type CorrelationPatch, type MetricLabels, MetricsService, MetricsServiceLive, type MetricsServiceShape, type MetricsSnapshot, type ResolvedSmithersObservabilityOptions, type SmithersEvent, type SmithersLogFormat, type SmithersMetricDefinition, SmithersObservability, type SmithersObservabilityOptions, type SmithersObservabilityService, TracingService, TracingServiceLive, activeNodes, activeRuns, annotateSmithersTrace, approvalPending, approvalWaitDuration, approvalsDenied, approvalsGranted, approvalsRequested, attemptDuration, cacheHits, cacheMisses, correlationContextFiberRef, correlationContextToLogAnnotations, createSmithersObservabilityLayer, createSmithersOtelLayer, createSmithersRuntimeLayer, dbQueryDuration, dbRetries, dbTransactionDuration, dbTransactionRetries, dbTransactionRollbacks, errorsTotal, eventsEmittedTotal, externalWaitAsyncPending, getCurrentCorrelationContext, getCurrentCorrelationContextEffect, getCurrentSmithersTraceAnnotations, getCurrentSmithersTraceSpan, hotReloadDuration, hotReloadFailures, hotReloads, httpRequestDuration, httpRequests, logDebug, logError, logInfo, logWarning, makeSmithersSpanAttributes, mergeCorrelationContext, metricsServiceAdapter, nodeDuration, nodeRetriesTotal, nodesFailed, nodesFinished, nodesStarted, processHeapUsedBytes, processMemoryRssBytes, processUptimeSeconds, prometheusContentType, promptSizeBytes, renderPrometheusMetrics, resolveSmithersObservabilityOptions, responseSizeBytes, rewindDurationMs, rewindFramesDeleted, rewindRollbackTotal, rewindSandboxesReverted, rewindTotal, runDuration, runWithCorrelationContext, runsAncestryDepth, runsCancelledTotal, runsCarriedStateBytes, runsContinuedTotal, runsFailedTotal, runsFinishedTotal, runsResumedTotal, runsTotal, sandboxActive, sandboxBundleSizeBytes, sandboxCompletedTotal, sandboxCreatedTotal, sandboxDurationMs, sandboxPatchCount, 
sandboxTransportDurationMs, schedulerConcurrencyUtilization, schedulerQueueDepth, schedulerWaitDuration, scorerEventsFailed, scorerEventsFinished, scorerEventsStarted, smithersMetricCatalog, smithersMetrics, smithersSpanNames, timerDelayDuration, timersCancelled, timersCreated, timersFired, timersPending, toPrometheusMetricName, tokensCacheReadTotal, tokensCacheWriteTotal, tokensContextWindowBucketTotal, tokensContextWindowPerCall, tokensInputPerCall, tokensInputTotal, tokensOutputPerCall, tokensOutputTotal, tokensReasoningTotal, toolCallErrorsTotal, toolCallsTotal, toolDuration, toolOutputTruncatedTotal, trackEvent as trackSmithersEvent, updateCurrentCorrelationContext, updateProcessMetrics, vcsDuration, withCorrelationContext, withCurrentCorrelationContext, withSmithersSpan };
1042
+ export { type CorrelationContext, CorrelationContextLive, type CorrelationContextPatch, CorrelationContextService, type CorrelationPatch, type MetricLabels, MetricsService, MetricsServiceLive, type MetricsServiceShape, type MetricsSnapshot, type ResolvedSmithersObservabilityOptions, type SmithersEvent, type SmithersLogFormat, type SmithersMetricDefinition, SmithersObservability, type SmithersObservabilityOptions, type SmithersObservabilityService, TracingService, TracingServiceLive, activeNodes, activeRuns, annotateSmithersTrace, approvalPending, approvalWaitDuration, approvalsDenied, approvalsGranted, approvalsRequested, attemptDuration, cacheHits, cacheMisses, correlationContextFiberRef, correlationContextToLogAnnotations, createSmithersObservabilityLayer, createSmithersOtelLayer, createSmithersRuntimeLayer, dbQueryDuration, dbRetries, dbTransactionDuration, dbTransactionRetries, dbTransactionRollbacks, errorsTotal, eventsEmittedTotal, externalWaitAsyncPending, getCurrentCorrelationContext, getCurrentCorrelationContextEffect, getCurrentSmithersTraceAnnotations, getCurrentSmithersTraceSpan, hotReloadDuration, hotReloadFailures, hotReloads, httpRequestDuration, httpRequests, logDebug, logError, logInfo, logWarning, makeSmithersSpanAttributes, mergeCorrelationContext, metricsServiceAdapter, nodeDuration, nodeRetriesTotal, nodesFailed, nodesFinished, nodesStarted, processHeapUsedBytes, processMemoryRssBytes, processUptimeSeconds, prometheusContentType, promptSizeBytes, renderPrometheusMetrics, replaysStarted, resolveSmithersObservabilityOptions, responseSizeBytes, rewindDurationMs, rewindFramesDeleted, rewindRollbackTotal, rewindSandboxesReverted, rewindTotal, runDuration, runForksCreated, runWithCorrelationContext, runsAncestryDepth, runsCancelledTotal, runsCarriedStateBytes, runsContinuedTotal, runsFailedTotal, runsFinishedTotal, runsResumedTotal, runsTotal, sandboxActive, sandboxBundleSizeBytes, sandboxCompletedTotal, sandboxCreatedTotal, sandboxDurationMs, 
sandboxPatchCount, sandboxTransportDurationMs, schedulerConcurrencyUtilization, schedulerQueueDepth, schedulerWaitDuration, scorerEventsFailed, scorerEventsFinished, scorerEventsStarted, smithersMetricCatalog, smithersMetrics, smithersSpanNames, snapshotDuration, snapshotsCaptured, timerDelayDuration, timersCancelled, timersCreated, timersFired, timersPending, toPrometheusMetricName, tokensCacheReadTotal, tokensCacheWriteTotal, tokensContextWindowBucketTotal, tokensContextWindowPerCall, tokensInputPerCall, tokensInputTotal, tokensOutputPerCall, tokensOutputTotal, tokensReasoningTotal, toolCallErrorsTotal, toolCallsTotal, toolDuration, toolOutputTruncatedTotal, trackEvent as trackSmithersEvent, updateCurrentCorrelationContext, updateProcessMetrics, vcsDuration, withCorrelationContext, withCurrentCorrelationContext, withSmithersSpan };
package/src/index.js CHANGED
@@ -29,7 +29,7 @@ export { createSmithersOtelLayer } from "./createSmithersOtelLayer.js";
29
29
  export { createSmithersObservabilityLayer } from "./createSmithersObservabilityLayer.js";
30
30
  export { createSmithersRuntimeLayer } from "./createSmithersRuntimeLayer.js";
31
31
  export { rewindTotal, rewindRollbackTotal, rewindDurationMs, rewindFramesDeleted, rewindSandboxesReverted, } from "./metrics/index.js";
32
- export { activeNodes, activeRuns, approvalPending, externalWaitAsyncPending, approvalsDenied, approvalsGranted, approvalsRequested, approvalWaitDuration, timerDelayDuration, timersCancelled, timersCreated, timersFired, timersPending, attemptDuration, cacheHits, cacheMisses, dbQueryDuration, dbRetries, dbTransactionDuration, dbTransactionRetries, dbTransactionRollbacks, errorsTotal, eventsEmittedTotal, hotReloadDuration, hotReloadFailures, hotReloads, httpRequestDuration, httpRequests, nodeDuration, nodeRetriesTotal, nodesFailed, nodesFinished, nodesStarted, processHeapUsedBytes, processMemoryRssBytes, processUptimeSeconds, promptSizeBytes, responseSizeBytes, runDuration, runsCancelledTotal, runsContinuedTotal, runsFailedTotal, runsFinishedTotal, runsResumedTotal, runsAncestryDepth, runsCarriedStateBytes, sandboxActive, sandboxBundleSizeBytes, sandboxCompletedTotal, sandboxCreatedTotal, sandboxDurationMs, sandboxPatchCount, sandboxTransportDurationMs, runsTotal, schedulerConcurrencyUtilization, schedulerQueueDepth, schedulerWaitDuration, tokensCacheReadTotal, tokensCacheWriteTotal, tokensContextWindowBucketTotal, tokensContextWindowPerCall, tokensInputPerCall, tokensInputTotal, tokensOutputPerCall, tokensOutputTotal, tokensReasoningTotal, toolCallErrorsTotal, toolCallsTotal, toolDuration, toolOutputTruncatedTotal, scorerEventsStarted, scorerEventsFinished, scorerEventsFailed, trackEvent as trackSmithersEvent, updateProcessMetrics, vcsDuration, toPrometheusMetricName, smithersMetricCatalog, metricsServiceAdapter, } from "./metrics/index.js";
32
+ export { activeNodes, activeRuns, approvalPending, externalWaitAsyncPending, approvalsDenied, approvalsGranted, approvalsRequested, approvalWaitDuration, timerDelayDuration, timersCancelled, timersCreated, timersFired, timersPending, attemptDuration, cacheHits, cacheMisses, dbQueryDuration, dbRetries, dbTransactionDuration, dbTransactionRetries, dbTransactionRollbacks, errorsTotal, eventsEmittedTotal, hotReloadDuration, hotReloadFailures, hotReloads, httpRequestDuration, httpRequests, nodeDuration, nodeRetriesTotal, nodesFailed, nodesFinished, nodesStarted, processHeapUsedBytes, processMemoryRssBytes, processUptimeSeconds, promptSizeBytes, responseSizeBytes, runDuration, runsCancelledTotal, runsContinuedTotal, runsFailedTotal, runsFinishedTotal, runsResumedTotal, runsAncestryDepth, runsCarriedStateBytes, sandboxActive, sandboxBundleSizeBytes, sandboxCompletedTotal, sandboxCreatedTotal, sandboxDurationMs, sandboxPatchCount, sandboxTransportDurationMs, runsTotal, schedulerConcurrencyUtilization, schedulerQueueDepth, schedulerWaitDuration, tokensCacheReadTotal, tokensCacheWriteTotal, tokensContextWindowBucketTotal, tokensContextWindowPerCall, tokensInputPerCall, tokensInputTotal, tokensOutputPerCall, tokensOutputTotal, tokensReasoningTotal, toolCallErrorsTotal, toolCallsTotal, toolDuration, toolOutputTruncatedTotal, scorerEventsStarted, scorerEventsFinished, scorerEventsFailed, snapshotsCaptured, runForksCreated, replaysStarted, snapshotDuration, trackEvent as trackSmithersEvent, updateProcessMetrics, vcsDuration, toPrometheusMetricName, smithersMetricCatalog, metricsServiceAdapter, } from "./metrics/index.js";
33
33
  export { correlationContextFiberRef, correlationContextToLogAnnotations, CorrelationContextLive, CorrelationContextService, getCurrentCorrelationContext, getCurrentCorrelationContextEffect, mergeCorrelationContext, runWithCorrelationContext, withCorrelationContext, withCurrentCorrelationContext, } from "./correlation.js";
34
34
  export { updateCurrentCorrelationContext } from "./correlation.js";
35
35
  export { logDebug, logInfo, logWarning, logError, } from "./logging.js";
@@ -0,0 +1,17 @@
1
+ server:
2
+ http_listen_port: 3200
3
+
4
+ distributor:
5
+ receivers:
6
+ otlp:
7
+ protocols:
8
+ grpc:
9
+ endpoint: "0.0.0.0:4317"
10
+
11
+ storage:
12
+ trace:
13
+ backend: local
14
+ local:
15
+ path: /tmp/tempo/blocks
16
+ wal:
17
+ path: /tmp/tempo/wal