@pleri/olam-cli 0.1.161 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +4 -4
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts.map +1 -1
  4. package/dist/commands/bootstrap.js +35 -11
  5. package/dist/commands/bootstrap.js.map +1 -1
  6. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  7. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  8. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  9. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  10. package/dist/commands/skills-source.d.ts.map +1 -1
  11. package/dist/commands/skills-source.js +57 -2
  12. package/dist/commands/skills-source.js.map +1 -1
  13. package/dist/commands/skills.d.ts.map +1 -1
  14. package/dist/commands/skills.js +14 -0
  15. package/dist/commands/skills.js.map +1 -1
  16. package/dist/image-digests.json +7 -7
  17. package/dist/index.js +996 -618
  18. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  19. package/dist/lib/bootstrap-kubernetes.js +93 -13
  20. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  21. package/dist/mcp-server.js +568 -368
  22. package/hermes-bundle/version.json +1 -1
  23. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  24. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  25. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  26. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  27. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  28. package/host-cp/observability/grafana-port-forward.sh +12 -2
  29. package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
  30. package/host-cp/observability/loki-ingest.sh +12 -2
  31. package/host-cp/observability/prom-no-double-grafana.sh +15 -5
  32. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  33. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  34. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  35. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  36. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  37. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  38. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  39. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  40. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  41. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  42. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  43. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  44. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  45. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  46. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  47. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  48. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  49. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  50. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  51. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  52. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  53. package/host-cp/src/plan-chat-service.mjs +147 -1
  54. package/package.json +1 -1
@@ -0,0 +1,1349 @@
1
+ # ----------------------------------------------------------------------------
2
+ # GENERATED FILE — DO NOT EDIT DIRECTLY
3
+ #
4
+ # Source: packages/peripheral-services/grafana-dashboards/*.json
5
+ # Regenerate: packages/peripheral-services/scripts/sync-grafana-dashboards.sh
6
+ #
7
+ # This ConfigMap is consumed by the grafana/grafana Helm chart via
8
+ # dashboardsConfigMaps.olam-default: olam-dashboards
9
+ # as wired in packages/peripheral-services/helm-values/grafana-values.yaml.
10
+ #
11
+ # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B3
12
+ # ----------------------------------------------------------------------------
13
+ apiVersion: v1
14
+ kind: ConfigMap
15
+ metadata:
16
+ name: olam-dashboards
17
+ namespace: monitoring
18
+ labels:
19
+ app.kubernetes.io/name: grafana
20
+ app.kubernetes.io/managed-by: olam
21
+ grafana_dashboard: "1"
22
+ data:
23
+ host-cp.json: |
24
+ {
25
+ "uid": "host-cp",
26
+ "title": "Host-CP — Service Drill-in",
27
+ "description": "Per-route SLIs for host-cp. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. Use the route dropdown to scope a single route or view all. The world_id variable is forwarded from olam-home for context.",
28
+ "tags": ["olam", "drill-in", "phase-c", "host-cp"],
29
+ "timezone": "browser",
30
+ "refresh": "30s",
31
+ "schemaVersion": 39,
32
+ "version": 1,
33
+ "time": {
34
+ "from": "now-1h",
35
+ "to": "now"
36
+ },
37
+ "timepicker": {},
38
+ "templating": {
39
+ "list": [
40
+ {
41
+ "name": "world_id",
42
+ "label": "World",
43
+ "type": "query",
44
+ "datasource": { "type": "loki", "uid": "loki" },
45
+ "query": {
46
+ "qryType": 2,
47
+ "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
48
+ "step": ""
49
+ },
50
+ "refresh": 2,
51
+ "sort": 1,
52
+ "multi": false,
53
+ "includeAll": true,
54
+ "allValue": ".+",
55
+ "current": { "selected": false, "text": "All", "value": "$__all" }
56
+ },
57
+ {
58
+ "name": "route",
59
+ "label": "Route",
60
+ "type": "query",
61
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
62
+ "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"host-cp\"}, route)",
63
+ "refresh": 2,
64
+ "sort": 1,
65
+ "multi": true,
66
+ "includeAll": true,
67
+ "allValue": ".+",
68
+ "current": { "selected": false, "text": "All", "value": "$__all" }
69
+ }
70
+ ]
71
+ },
72
+ "annotations": {
73
+ "list": []
74
+ },
75
+ "panels": [
76
+ {
77
+ "id": 1,
78
+ "type": "timeseries",
79
+ "title": "Request rate by route",
80
+ "description": "Requests per second for each host-cp route over the last 5 minutes (pre-computed by C4 recording rule). Spikes indicate traffic surges; a route going to zero indicates it stopped receiving traffic.",
81
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
82
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
83
+ "fieldConfig": {
84
+ "defaults": {
85
+ "unit": "reqps",
86
+ "color": { "mode": "palette-classic" },
87
+ "custom": {
88
+ "lineWidth": 2,
89
+ "fillOpacity": 10,
90
+ "showPoints": "never"
91
+ }
92
+ }
93
+ },
94
+ "options": {
95
+ "tooltip": { "mode": "multi", "sort": "desc" },
96
+ "legend": { "displayMode": "list", "placement": "bottom" }
97
+ },
98
+ "targets": [
99
+ {
100
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
101
+ "expr": "olam:http_requests:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
102
+ "legendFormat": "{{route}}",
103
+ "instant": false,
104
+ "range": true
105
+ }
106
+ ]
107
+ },
108
+ {
109
+ "id": 2,
110
+ "type": "timeseries",
111
+ "title": "5xx error rate by route",
112
+ "description": "5xx responses per second per host-cp route (C4 recording rule). A non-zero value on a route warrants investigation. Correlate with the error ratio panel below to understand severity relative to total traffic.",
113
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
114
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
115
+ "fieldConfig": {
116
+ "defaults": {
117
+ "unit": "reqps",
118
+ "color": { "mode": "palette-classic" },
119
+ "custom": {
120
+ "lineWidth": 2,
121
+ "fillOpacity": 10,
122
+ "showPoints": "never"
123
+ }
124
+ }
125
+ },
126
+ "options": {
127
+ "tooltip": { "mode": "multi", "sort": "desc" },
128
+ "legend": { "displayMode": "list", "placement": "bottom" }
129
+ },
130
+ "targets": [
131
+ {
132
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
133
+ "expr": "olam:http_errors:rate5m_by_service_route{service=\"host-cp\",route=~\"$route\"}",
134
+ "legendFormat": "{{route}}",
135
+ "instant": false,
136
+ "range": true
137
+ }
138
+ ]
139
+ },
140
+ {
141
+ "id": 3,
142
+ "type": "timeseries",
143
+ "title": "Latency p50 by route",
144
+ "description": "Median (p50) request duration per host-cp route in seconds (C4 recording rule). Represents typical user-perceived latency. Sustained increases above baseline indicate a regression or upstream dependency slowdown.",
145
+ "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
146
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
147
+ "fieldConfig": {
148
+ "defaults": {
149
+ "unit": "s",
150
+ "color": { "mode": "palette-classic" },
151
+ "custom": {
152
+ "lineWidth": 2,
153
+ "fillOpacity": 10,
154
+ "showPoints": "never"
155
+ }
156
+ }
157
+ },
158
+ "options": {
159
+ "tooltip": { "mode": "multi", "sort": "desc" },
160
+ "legend": { "displayMode": "list", "placement": "bottom" }
161
+ },
162
+ "targets": [
163
+ {
164
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
165
+ "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"host-cp\",route=~\"$route\"}",
166
+ "legendFormat": "{{route}}",
167
+ "instant": false,
168
+ "range": true
169
+ }
170
+ ]
171
+ },
172
+ {
173
+ "id": 4,
174
+ "type": "timeseries",
175
+ "title": "Latency p95 by route",
176
+ "description": "95th-percentile request duration per host-cp route in seconds (C4 recording rule). Captures the tail latency experienced by the slowest 5% of requests. The primary SLI for detecting latency regressions before they affect most users.",
177
+ "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
178
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
179
+ "fieldConfig": {
180
+ "defaults": {
181
+ "unit": "s",
182
+ "color": { "mode": "palette-classic" },
183
+ "custom": {
184
+ "lineWidth": 2,
185
+ "fillOpacity": 10,
186
+ "showPoints": "never"
187
+ }
188
+ }
189
+ },
190
+ "options": {
191
+ "tooltip": { "mode": "multi", "sort": "desc" },
192
+ "legend": { "displayMode": "list", "placement": "bottom" }
193
+ },
194
+ "targets": [
195
+ {
196
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
197
+ "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"host-cp\",route=~\"$route\"}",
198
+ "legendFormat": "{{route}}",
199
+ "instant": false,
200
+ "range": true
201
+ }
202
+ ]
203
+ },
204
+ {
205
+ "id": 5,
206
+ "type": "timeseries",
207
+ "title": "Latency p99 by route",
208
+ "description": "99th-percentile request duration per host-cp route in seconds (C4 recording rule). Worst-case latency tail. High p99 with stable p50/p95 often indicates a specific slow code path or resource contention under load.",
209
+ "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
210
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
211
+ "fieldConfig": {
212
+ "defaults": {
213
+ "unit": "s",
214
+ "color": { "mode": "palette-classic" },
215
+ "custom": {
216
+ "lineWidth": 2,
217
+ "fillOpacity": 10,
218
+ "showPoints": "never"
219
+ }
220
+ }
221
+ },
222
+ "options": {
223
+ "tooltip": { "mode": "multi", "sort": "desc" },
224
+ "legend": { "displayMode": "list", "placement": "bottom" }
225
+ },
226
+ "targets": [
227
+ {
228
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
229
+ "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"host-cp\",route=~\"$route\"}",
230
+ "legendFormat": "{{route}}",
231
+ "instant": false,
232
+ "range": true
233
+ }
234
+ ]
235
+ },
236
+ {
237
+ "id": 6,
238
+ "type": "stat",
239
+ "title": "Error ratio (5xx / total) by route",
240
+ "description": "Fraction of requests returning 5xx per host-cp route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. A route showing red means roughly 1-in-20 (or more) requests are failing — investigate immediately.",
241
+ "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
242
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
243
+ "fieldConfig": {
244
+ "defaults": {
245
+ "unit": "percentunit",
246
+ "thresholds": {
247
+ "mode": "absolute",
248
+ "steps": [
249
+ { "color": "green", "value": null },
250
+ { "color": "yellow", "value": 0.01 },
251
+ { "color": "red", "value": 0.05 }
252
+ ]
253
+ },
254
+ "color": { "mode": "thresholds" }
255
+ }
256
+ },
257
+ "options": {
258
+ "reduceOptions": { "calcs": ["lastNotNull"] },
259
+ "orientation": "auto",
260
+ "textMode": "auto",
261
+ "colorMode": "background",
262
+ "graphMode": "none",
263
+ "justifyMode": "center"
264
+ },
265
+ "targets": [
266
+ {
267
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
268
+ "expr": "olam:http_errors:ratio_by_service_route{service=\"host-cp\",route=~\"$route\"}",
269
+ "legendFormat": "{{route}}",
270
+ "instant": true,
271
+ "range": false
272
+ }
273
+ ]
274
+ }
275
+ ]
276
+ }
277
+
278
+ kg-service.json: |
279
+ {
280
+ "uid": "kg-service",
281
+ "title": "KG-Service — Service Drill-in",
282
+ "description": "Per-route SLIs for kg-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. kg-service exposes 4 routes: /health, /classify, /build, /status. Use the route dropdown to scope a single route. The world_id variable is forwarded from olam-home for context.",
283
+ "tags": ["olam", "drill-in", "phase-c", "kg-service"],
284
+ "timezone": "browser",
285
+ "refresh": "30s",
286
+ "schemaVersion": 39,
287
+ "version": 1,
288
+ "time": {
289
+ "from": "now-1h",
290
+ "to": "now"
291
+ },
292
+ "timepicker": {},
293
+ "templating": {
294
+ "list": [
295
+ {
296
+ "name": "world_id",
297
+ "label": "World",
298
+ "type": "query",
299
+ "datasource": { "type": "loki", "uid": "loki" },
300
+ "query": {
301
+ "qryType": 2,
302
+ "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
303
+ "step": ""
304
+ },
305
+ "refresh": 2,
306
+ "sort": 1,
307
+ "multi": false,
308
+ "includeAll": true,
309
+ "allValue": ".+",
310
+ "current": { "selected": false, "text": "All", "value": "$__all" }
311
+ },
312
+ {
313
+ "name": "route",
314
+ "label": "Route",
315
+ "type": "query",
316
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
317
+ "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"kg-service\"}, route)",
318
+ "refresh": 2,
319
+ "sort": 1,
320
+ "multi": true,
321
+ "includeAll": true,
322
+ "allValue": ".+",
323
+ "current": { "selected": false, "text": "All", "value": "$__all" }
324
+ }
325
+ ]
326
+ },
327
+ "annotations": {
328
+ "list": []
329
+ },
330
+ "panels": [
331
+ {
332
+ "id": 1,
333
+ "type": "timeseries",
334
+ "title": "Request rate by route",
335
+ "description": "Requests per second for each kg-service route over the last 5 minutes (pre-computed by C4 recording rule). /classify is the hot path; /build is infrequent; /health should be near-constant. A drop in /classify with stable /health suggests the classifier is being bypassed or the caller is down.",
336
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
337
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
338
+ "fieldConfig": {
339
+ "defaults": {
340
+ "unit": "reqps",
341
+ "color": { "mode": "palette-classic" },
342
+ "custom": {
343
+ "lineWidth": 2,
344
+ "fillOpacity": 10,
345
+ "showPoints": "never"
346
+ }
347
+ }
348
+ },
349
+ "options": {
350
+ "tooltip": { "mode": "multi", "sort": "desc" },
351
+ "legend": { "displayMode": "list", "placement": "bottom" }
352
+ },
353
+ "targets": [
354
+ {
355
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
356
+ "expr": "olam:http_requests:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
357
+ "legendFormat": "{{route}}",
358
+ "instant": false,
359
+ "range": true
360
+ }
361
+ ]
362
+ },
363
+ {
364
+ "id": 2,
365
+ "type": "timeseries",
366
+ "title": "5xx error rate by route",
367
+ "description": "5xx responses per second per kg-service route (C4 recording rule). Errors on /classify indicate the graph classifier is failing; errors on /build indicate a KG rebuild failure. Either warrants immediate investigation as they affect agent search quality.",
368
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
369
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
370
+ "fieldConfig": {
371
+ "defaults": {
372
+ "unit": "reqps",
373
+ "color": { "mode": "palette-classic" },
374
+ "custom": {
375
+ "lineWidth": 2,
376
+ "fillOpacity": 10,
377
+ "showPoints": "never"
378
+ }
379
+ }
380
+ },
381
+ "options": {
382
+ "tooltip": { "mode": "multi", "sort": "desc" },
383
+ "legend": { "displayMode": "list", "placement": "bottom" }
384
+ },
385
+ "targets": [
386
+ {
387
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
388
+ "expr": "olam:http_errors:rate5m_by_service_route{service=\"kg-service\",route=~\"$route\"}",
389
+ "legendFormat": "{{route}}",
390
+ "instant": false,
391
+ "range": true
392
+ }
393
+ ]
394
+ },
395
+ {
396
+ "id": 3,
397
+ "type": "timeseries",
398
+ "title": "Latency p50 by route",
399
+ "description": "Median (p50) request duration per kg-service route in seconds (C4 recording rule). /classify latency drives agent dispatch latency directly; a rising p50 on /classify means agents wait longer for graph routing decisions.",
400
+ "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
401
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
402
+ "fieldConfig": {
403
+ "defaults": {
404
+ "unit": "s",
405
+ "color": { "mode": "palette-classic" },
406
+ "custom": {
407
+ "lineWidth": 2,
408
+ "fillOpacity": 10,
409
+ "showPoints": "never"
410
+ }
411
+ }
412
+ },
413
+ "options": {
414
+ "tooltip": { "mode": "multi", "sort": "desc" },
415
+ "legend": { "displayMode": "list", "placement": "bottom" }
416
+ },
417
+ "targets": [
418
+ {
419
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
420
+ "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"kg-service\",route=~\"$route\"}",
421
+ "legendFormat": "{{route}}",
422
+ "instant": false,
423
+ "range": true
424
+ }
425
+ ]
426
+ },
427
+ {
428
+ "id": 4,
429
+ "type": "timeseries",
430
+ "title": "Latency p95 by route",
431
+ "description": "95th-percentile request duration per kg-service route in seconds (C4 recording rule). kg-service is a synchronous dependency for in-world search; a high p95 on /classify directly contributes to the >6min diagnosis-time problem this observability stack is solving.",
432
+ "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
433
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
434
+ "fieldConfig": {
435
+ "defaults": {
436
+ "unit": "s",
437
+ "color": { "mode": "palette-classic" },
438
+ "custom": {
439
+ "lineWidth": 2,
440
+ "fillOpacity": 10,
441
+ "showPoints": "never"
442
+ }
443
+ }
444
+ },
445
+ "options": {
446
+ "tooltip": { "mode": "multi", "sort": "desc" },
447
+ "legend": { "displayMode": "list", "placement": "bottom" }
448
+ },
449
+ "targets": [
450
+ {
451
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
452
+ "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"kg-service\",route=~\"$route\"}",
453
+ "legendFormat": "{{route}}",
454
+ "instant": false,
455
+ "range": true
456
+ }
457
+ ]
458
+ },
459
+ {
460
+ "id": 5,
461
+ "type": "timeseries",
462
+ "title": "Latency p99 by route",
463
+ "description": "99th-percentile request duration per kg-service route in seconds (C4 recording rule). Worst-case latency tail. A high p99 on /build (graph rebuild) with stable /classify p99 is expected; the inverse (stable /build, high /classify p99) indicates classifier graph complexity growth.",
464
+ "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
465
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
466
+ "fieldConfig": {
467
+ "defaults": {
468
+ "unit": "s",
469
+ "color": { "mode": "palette-classic" },
470
+ "custom": {
471
+ "lineWidth": 2,
472
+ "fillOpacity": 10,
473
+ "showPoints": "never"
474
+ }
475
+ }
476
+ },
477
+ "options": {
478
+ "tooltip": { "mode": "multi", "sort": "desc" },
479
+ "legend": { "displayMode": "list", "placement": "bottom" }
480
+ },
481
+ "targets": [
482
+ {
483
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
484
+ "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"kg-service\",route=~\"$route\"}",
485
+ "legendFormat": "{{route}}",
486
+ "instant": false,
487
+ "range": true
488
+ }
489
+ ]
490
+ },
491
+ {
492
+ "id": 6,
493
+ "type": "stat",
494
+ "title": "Error ratio (5xx / total) by route",
495
+ "description": "Fraction of requests returning 5xx per kg-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. kg-service is fail-open for /classify (returns empty result on error); a high error ratio here means callers are silently getting degraded graph routing with no local error signal.",
496
+ "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
497
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
498
+ "fieldConfig": {
499
+ "defaults": {
500
+ "unit": "percentunit",
501
+ "thresholds": {
502
+ "mode": "absolute",
503
+ "steps": [
504
+ { "color": "green", "value": null },
505
+ { "color": "yellow", "value": 0.01 },
506
+ { "color": "red", "value": 0.05 }
507
+ ]
508
+ },
509
+ "color": { "mode": "thresholds" }
510
+ }
511
+ },
512
+ "options": {
513
+ "reduceOptions": { "calcs": ["lastNotNull"] },
514
+ "orientation": "auto",
515
+ "textMode": "auto",
516
+ "colorMode": "background",
517
+ "graphMode": "none",
518
+ "justifyMode": "center"
519
+ },
520
+ "targets": [
521
+ {
522
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
523
+ "expr": "olam:http_errors:ratio_by_service_route{service=\"kg-service\",route=~\"$route\"}",
524
+ "legendFormat": "{{route}}",
525
+ "instant": true,
526
+ "range": false
527
+ }
528
+ ]
529
+ }
530
+ ]
531
+ }
532
+
533
+ memory-service.json: |
534
+ {
535
+ "uid": "memory-service",
536
+ "title": "Memory-Service — Service Drill-in",
537
+ "description": "Per-route SLIs for memory-service. All panels consume C4 recording rules (olam:* prefix) pre-computed at 30s; no raw histogram expressions. memory-service's traffic flows through the in-container Node front-door (packages/memory-service/src/metrics-proxy.mjs) which short-circuits /metrics and instruments every agentmemory engine route ({service,route,method,status_code} taxonomy). Use the route dropdown to scope a single agentmemory endpoint. The world_id variable is forwarded from olam-home for context.",
538
+ "tags": ["olam", "drill-in", "phase-c", "memory-service"],
539
+ "timezone": "browser",
540
+ "refresh": "30s",
541
+ "schemaVersion": 39,
542
+ "version": 1,
543
+ "time": {
544
+ "from": "now-1h",
545
+ "to": "now"
546
+ },
547
+ "timepicker": {},
548
+ "templating": {
549
+ "list": [
550
+ {
551
+ "name": "world_id",
552
+ "label": "World",
553
+ "type": "query",
554
+ "datasource": { "type": "loki", "uid": "loki" },
555
+ "query": {
556
+ "qryType": 2,
557
+ "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
558
+ "step": ""
559
+ },
560
+ "refresh": 2,
561
+ "sort": 1,
562
+ "multi": false,
563
+ "includeAll": true,
564
+ "allValue": ".+",
565
+ "current": { "selected": false, "text": "All", "value": "$__all" }
566
+ },
567
+ {
568
+ "name": "route",
569
+ "label": "Route",
570
+ "type": "query",
571
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
572
+ "query": "label_values(olam:http_requests:rate5m_by_service_route{service=\"memory-service\"}, route)",
573
+ "refresh": 2,
574
+ "sort": 1,
575
+ "multi": true,
576
+ "includeAll": true,
577
+ "allValue": ".+",
578
+ "current": { "selected": false, "text": "All", "value": "$__all" }
579
+ }
580
+ ]
581
+ },
582
+ "annotations": {
583
+ "list": []
584
+ },
585
+ "panels": [
586
+ {
587
+ "id": 1,
588
+ "type": "timeseries",
589
+ "title": "Request rate by route",
590
+ "description": "Requests per second for each memory-service route over the last 5 minutes (pre-computed by C4 recording rule). /agentmemory/mcp/call is the hot path that agents drive — every memory_save / memory_recall lands there. /agentmemory/livez is the readiness probe (near-constant ~0.2 rps from k8s). /agentmemory/export is bridge-debounced (~1 per ~10s burst). A drop in mcp/call with stable livez indicates the agentmemory engine is up but receiving no traffic — caller-side issue.",
591
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
592
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
593
+ "fieldConfig": {
594
+ "defaults": {
595
+ "unit": "reqps",
596
+ "color": { "mode": "palette-classic" },
597
+ "custom": {
598
+ "lineWidth": 2,
599
+ "fillOpacity": 10,
600
+ "showPoints": "never"
601
+ }
602
+ }
603
+ },
604
+ "options": {
605
+ "tooltip": { "mode": "multi", "sort": "desc" },
606
+ "legend": { "displayMode": "list", "placement": "bottom" }
607
+ },
608
+ "targets": [
609
+ {
610
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
611
+ "expr": "olam:http_requests:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
612
+ "legendFormat": "{{route}}",
613
+ "instant": false,
614
+ "range": true
615
+ }
616
+ ]
617
+ },
618
+ {
619
+ "id": 2,
620
+ "type": "timeseries",
621
+ "title": "5xx error rate by route",
622
+ "description": "5xx responses per second per memory-service route (C4 recording rule). Errors on /agentmemory/mcp/call indicate the iii engine is rejecting MCP tool calls — typical causes are bearer-auth failures or the engine entering a degraded state. Errors on /agentmemory/import indicate restore failures; the bridge's snapshot will retry on the next mutator-write.",
623
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
624
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
625
+ "fieldConfig": {
626
+ "defaults": {
627
+ "unit": "reqps",
628
+ "color": { "mode": "palette-classic" },
629
+ "custom": {
630
+ "lineWidth": 2,
631
+ "fillOpacity": 10,
632
+ "showPoints": "never"
633
+ }
634
+ }
635
+ },
636
+ "options": {
637
+ "tooltip": { "mode": "multi", "sort": "desc" },
638
+ "legend": { "displayMode": "list", "placement": "bottom" }
639
+ },
640
+ "targets": [
641
+ {
642
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
643
+ "expr": "olam:http_errors:rate5m_by_service_route{service=\"memory-service\",route=~\"$route\"}",
644
+ "legendFormat": "{{route}}",
645
+ "instant": false,
646
+ "range": true
647
+ }
648
+ ]
649
+ },
650
+ {
651
+ "id": 3,
652
+ "type": "timeseries",
653
+ "title": "Latency p50 by route",
654
+ "description": "Median (p50) request duration per memory-service route in seconds (C4 recording rule). /agentmemory/mcp/call p50 is a direct driver of agent-memory recall+save latency in the agent loop. Sustained rise on mcp/call p50 points to engine index size growth or iii-config tuning regressions.",
655
+ "gridPos": { "x": 0, "y": 8, "w": 8, "h": 8 },
656
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
657
+ "fieldConfig": {
658
+ "defaults": {
659
+ "unit": "s",
660
+ "color": { "mode": "palette-classic" },
661
+ "custom": {
662
+ "lineWidth": 2,
663
+ "fillOpacity": 10,
664
+ "showPoints": "never"
665
+ }
666
+ }
667
+ },
668
+ "options": {
669
+ "tooltip": { "mode": "multi", "sort": "desc" },
670
+ "legend": { "displayMode": "list", "placement": "bottom" }
671
+ },
672
+ "targets": [
673
+ {
674
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
675
+ "expr": "olam:http_request_duration_seconds:p50_by_service_route{service=\"memory-service\",route=~\"$route\"}",
676
+ "legendFormat": "{{route}}",
677
+ "instant": false,
678
+ "range": true
679
+ }
680
+ ]
681
+ },
682
+ {
683
+ "id": 4,
684
+ "type": "timeseries",
685
+ "title": "Latency p95 by route",
686
+ "description": "95th-percentile request duration per memory-service route in seconds (C4 recording rule). memory-service is a synchronous dependency for agent recall paths — high p95 on /agentmemory/mcp/call directly contributes to the >6min diagnosis-time problem this observability stack is solving. /agentmemory/export p95 spikes are expected at snapshot boundaries but should fall back inside 1s.",
687
+ "gridPos": { "x": 8, "y": 8, "w": 8, "h": 8 },
688
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
689
+ "fieldConfig": {
690
+ "defaults": {
691
+ "unit": "s",
692
+ "color": { "mode": "palette-classic" },
693
+ "custom": {
694
+ "lineWidth": 2,
695
+ "fillOpacity": 10,
696
+ "showPoints": "never"
697
+ }
698
+ }
699
+ },
700
+ "options": {
701
+ "tooltip": { "mode": "multi", "sort": "desc" },
702
+ "legend": { "displayMode": "list", "placement": "bottom" }
703
+ },
704
+ "targets": [
705
+ {
706
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
707
+ "expr": "olam:http_request_duration_seconds:p95_by_service_route{service=\"memory-service\",route=~\"$route\"}",
708
+ "legendFormat": "{{route}}",
709
+ "instant": false,
710
+ "range": true
711
+ }
712
+ ]
713
+ },
714
+ {
715
+ "id": 5,
716
+ "type": "timeseries",
717
+ "title": "Latency p99 by route",
718
+ "description": "99th-percentile request duration per memory-service route in seconds (C4 recording rule). Worst-case tail. /agentmemory/import is intentionally heavy (~1s+ for a full corpus restore on cold-start) so a high p99 there with stable mcp/call p99 is expected. The inverse — stable import, rising mcp/call p99 — is the leading indicator for engine-side index degradation.",
719
+ "gridPos": { "x": 16, "y": 8, "w": 8, "h": 8 },
720
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
721
+ "fieldConfig": {
722
+ "defaults": {
723
+ "unit": "s",
724
+ "color": { "mode": "palette-classic" },
725
+ "custom": {
726
+ "lineWidth": 2,
727
+ "fillOpacity": 10,
728
+ "showPoints": "never"
729
+ }
730
+ }
731
+ },
732
+ "options": {
733
+ "tooltip": { "mode": "multi", "sort": "desc" },
734
+ "legend": { "displayMode": "list", "placement": "bottom" }
735
+ },
736
+ "targets": [
737
+ {
738
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
739
+ "expr": "olam:http_request_duration_seconds:p99_by_service_route{service=\"memory-service\",route=~\"$route\"}",
740
+ "legendFormat": "{{route}}",
741
+ "instant": false,
742
+ "range": true
743
+ }
744
+ ]
745
+ },
746
+ {
747
+ "id": 6,
748
+ "type": "stat",
749
+ "title": "Error ratio (5xx / total) by route",
750
+ "description": "Fraction of requests returning 5xx per memory-service route (C4 recording rule). Green < 1%; yellow 1–5%; red ≥ 5%. /agentmemory/mcp/call errors silently degrade agent memory recall quality (callers fall through to no-context paths). /agentmemory/livez errors here indicate the proxy is healthy but the engine is unreachable — check container logs.",
751
+ "gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
752
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
753
+ "fieldConfig": {
754
+ "defaults": {
755
+ "unit": "percentunit",
756
+ "thresholds": {
757
+ "mode": "absolute",
758
+ "steps": [
759
+ { "color": "green", "value": null },
760
+ { "color": "yellow", "value": 0.01 },
761
+ { "color": "red", "value": 0.05 }
762
+ ]
763
+ },
764
+ "color": { "mode": "thresholds" }
765
+ }
766
+ },
767
+ "options": {
768
+ "reduceOptions": { "calcs": ["lastNotNull"] },
769
+ "orientation": "auto",
770
+ "textMode": "auto",
771
+ "colorMode": "background",
772
+ "graphMode": "none",
773
+ "justifyMode": "center"
774
+ },
775
+ "targets": [
776
+ {
777
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
778
+ "expr": "olam:http_errors:ratio_by_service_route{service=\"memory-service\",route=~\"$route\"}",
779
+ "legendFormat": "{{route}}",
780
+ "instant": true,
781
+ "range": false
782
+ }
783
+ ]
784
+ }
785
+ ]
786
+ }
787
+
788
+ olam-home.json: |
789
+ {
790
+ "uid": "olam-home",
791
+ "title": "Olam Home",
792
+ "description": "Operator's at-a-glance view. Top row: are the 5 olam peripheral services up? Middle row: how loaded are they? Bottom row: which worlds are doing dispatch work right now? Use the world_id dropdown to scope the bottom row to a specific world; the top-row health panels and the middle-row aggregate panels are not filtered by it. Pinned 3-row IA per Phase B acceptance criterion #8. Click the host-cp, kg-service, or agent-memory health panel to drill into the corresponding per-service dashboard.",
793
+ "tags": ["olam", "home", "phase-b"],
794
+ "timezone": "browser",
795
+ "refresh": "30s",
796
+ "schemaVersion": 39,
797
+ "version": 2,
798
+ "time": {
799
+ "from": "now-1h",
800
+ "to": "now"
801
+ },
802
+ "timepicker": {},
803
+ "templating": {
804
+ "list": [
805
+ {
806
+ "name": "world_id",
807
+ "label": "World",
808
+ "type": "query",
809
+ "datasource": { "type": "loki", "uid": "loki" },
810
+ "query": {
811
+ "qryType": 2,
812
+ "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
813
+ "step": ""
814
+ },
815
+ "refresh": 2,
816
+ "sort": 1,
817
+ "multi": false,
818
+ "includeAll": true,
819
+ "allValue": ".+",
820
+ "current": { "selected": false, "text": "All", "value": "$__all" }
821
+ }
822
+ ]
823
+ },
824
+ "annotations": {
825
+ "list": []
826
+ },
827
+ "panels": [
828
+ {
829
+ "id": 1,
830
+ "type": "stat",
831
+ "title": "host-cp",
832
+ "description": "Green if host-cp logged at least 1 line in the last 60s; red = silent / crashed.",
833
+ "gridPos": { "x": 0, "y": 0, "w": 5, "h": 4 },
834
+ "links": [
835
+ {
836
+ "title": "Drill into host-cp",
837
+ "url": "/d/host-cp/host-cp-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
838
+ "targetBlank": false
839
+ }
840
+ ],
841
+ "datasource": { "type": "loki", "uid": "loki" },
842
+ "fieldConfig": {
843
+ "defaults": {
844
+ "thresholds": {
845
+ "mode": "absolute",
846
+ "steps": [
847
+ { "color": "red", "value": null },
848
+ { "color": "green", "value": 1 }
849
+ ]
850
+ },
851
+ "mappings": [
852
+ { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
853
+ { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
854
+ ],
855
+ "unit": "short",
856
+ "color": { "mode": "thresholds" }
857
+ }
858
+ },
859
+ "options": {
860
+ "reduceOptions": { "calcs": ["lastNotNull"] },
861
+ "orientation": "auto",
862
+ "textMode": "auto",
863
+ "colorMode": "background",
864
+ "graphMode": "none",
865
+ "justifyMode": "center"
866
+ },
867
+ "targets": [
868
+ {
869
+ "datasource": { "type": "loki", "uid": "loki" },
870
+ "expr": "sum(count_over_time({service=\"host-cp\"}[1m]))",
871
+ "legendFormat": "host-cp",
872
+ "instant": true,
873
+ "range": false
874
+ }
875
+ ]
876
+ },
877
+ {
878
+ "id": 2,
879
+ "type": "stat",
880
+ "title": "kg-service",
881
+ "description": "Green if kg-service logged at least 1 line in the last 60s; red = silent / crashed.",
882
+ "gridPos": { "x": 5, "y": 0, "w": 5, "h": 4 },
883
+ "links": [
884
+ {
885
+ "title": "Drill into kg-service",
886
+ "url": "/d/kg-service/kg-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
887
+ "targetBlank": false
888
+ }
889
+ ],
890
+ "datasource": { "type": "loki", "uid": "loki" },
891
+ "fieldConfig": {
892
+ "defaults": {
893
+ "thresholds": {
894
+ "mode": "absolute",
895
+ "steps": [
896
+ { "color": "red", "value": null },
897
+ { "color": "green", "value": 1 }
898
+ ]
899
+ },
900
+ "mappings": [
901
+ { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
902
+ { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
903
+ ],
904
+ "unit": "short",
905
+ "color": { "mode": "thresholds" }
906
+ }
907
+ },
908
+ "options": {
909
+ "reduceOptions": { "calcs": ["lastNotNull"] },
910
+ "orientation": "auto",
911
+ "textMode": "auto",
912
+ "colorMode": "background",
913
+ "graphMode": "none",
914
+ "justifyMode": "center"
915
+ },
916
+ "targets": [
917
+ {
918
+ "datasource": { "type": "loki", "uid": "loki" },
919
+ "expr": "sum(count_over_time({service=\"kg-service\"}[1m]))",
920
+ "legendFormat": "kg-service",
921
+ "instant": true,
922
+ "range": false
923
+ }
924
+ ]
925
+ },
926
+ {
927
+ "id": 3,
928
+ "type": "stat",
929
+ "title": "agent-memory",
930
+ "description": "Green if agent-memory logged at least 1 line in the last 60s; red = silent / crashed.",
931
+ "gridPos": { "x": 10, "y": 0, "w": 4, "h": 4 },
932
+ "links": [
933
+ {
934
+ "title": "Drill into memory-service",
935
+ "url": "/d/memory-service/memory-service-service-drill-in?${world_id:queryparam}&from=${__from}&to=${__to}",
936
+ "targetBlank": false
937
+ }
938
+ ],
939
+ "datasource": { "type": "loki", "uid": "loki" },
940
+ "fieldConfig": {
941
+ "defaults": {
942
+ "thresholds": {
943
+ "mode": "absolute",
944
+ "steps": [
945
+ { "color": "red", "value": null },
946
+ { "color": "green", "value": 1 }
947
+ ]
948
+ },
949
+ "mappings": [
950
+ { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
951
+ { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
952
+ ],
953
+ "unit": "short",
954
+ "color": { "mode": "thresholds" }
955
+ }
956
+ },
957
+ "options": {
958
+ "reduceOptions": { "calcs": ["lastNotNull"] },
959
+ "orientation": "auto",
960
+ "textMode": "auto",
961
+ "colorMode": "background",
962
+ "graphMode": "none",
963
+ "justifyMode": "center"
964
+ },
965
+ "targets": [
966
+ {
967
+ "datasource": { "type": "loki", "uid": "loki" },
968
+ "expr": "sum(count_over_time({service=\"agent-memory\"}[1m]))",
969
+ "legendFormat": "agent-memory",
970
+ "instant": true,
971
+ "range": false
972
+ }
973
+ ]
974
+ },
975
+ {
976
+ "id": 4,
977
+ "type": "stat",
978
+ "title": "traefik",
979
+ "description": "Green if traefik logged at least 1 line in the last 60s; red = silent / crashed.",
980
+ "gridPos": { "x": 14, "y": 0, "w": 5, "h": 4 },
981
+ "datasource": { "type": "loki", "uid": "loki" },
982
+ "fieldConfig": {
983
+ "defaults": {
984
+ "thresholds": {
985
+ "mode": "absolute",
986
+ "steps": [
987
+ { "color": "red", "value": null },
988
+ { "color": "green", "value": 1 }
989
+ ]
990
+ },
991
+ "mappings": [
992
+ { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
993
+ { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
994
+ ],
995
+ "unit": "short",
996
+ "color": { "mode": "thresholds" }
997
+ }
998
+ },
999
+ "options": {
1000
+ "reduceOptions": { "calcs": ["lastNotNull"] },
1001
+ "orientation": "auto",
1002
+ "textMode": "auto",
1003
+ "colorMode": "background",
1004
+ "graphMode": "none",
1005
+ "justifyMode": "center"
1006
+ },
1007
+ "targets": [
1008
+ {
1009
+ "datasource": { "type": "loki", "uid": "loki" },
1010
+ "expr": "sum(count_over_time({service=\"traefik\"}[1m]))",
1011
+ "legendFormat": "traefik",
1012
+ "instant": true,
1013
+ "range": false
1014
+ }
1015
+ ]
1016
+ },
1017
+ {
1018
+ "id": 5,
1019
+ "type": "stat",
1020
+ "title": "world-cp",
1021
+ "description": "Green if any world-cp instance logged at least 1 line in the last 60s. Aggregated across world_id labels per Promtail drop-rules.",
1022
+ "gridPos": { "x": 19, "y": 0, "w": 5, "h": 4 },
1023
+ "datasource": { "type": "loki", "uid": "loki" },
1024
+ "fieldConfig": {
1025
+ "defaults": {
1026
+ "thresholds": {
1027
+ "mode": "absolute",
1028
+ "steps": [
1029
+ { "color": "red", "value": null },
1030
+ { "color": "green", "value": 1 }
1031
+ ]
1032
+ },
1033
+ "mappings": [
1034
+ { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
1035
+ { "type": "range", "options": { "from": 1, "to": 1000000, "result": { "text": "UP", "color": "green" } } }
1036
+ ],
1037
+ "unit": "short",
1038
+ "color": { "mode": "thresholds" }
1039
+ }
1040
+ },
1041
+ "options": {
1042
+ "reduceOptions": { "calcs": ["lastNotNull"] },
1043
+ "orientation": "auto",
1044
+ "textMode": "auto",
1045
+ "colorMode": "background",
1046
+ "graphMode": "none",
1047
+ "justifyMode": "center"
1048
+ },
1049
+ "targets": [
1050
+ {
1051
+ "datasource": { "type": "loki", "uid": "loki" },
1052
+ "expr": "sum(count_over_time({service=\"world-cp\"}[1m]))",
1053
+ "legendFormat": "world-cp",
1054
+ "instant": true,
1055
+ "range": false
1056
+ }
1057
+ ]
1058
+ },
1059
+ {
1060
+ "id": 6,
1061
+ "type": "timeseries",
1062
+ "title": "Aggregate success rate",
1063
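+ "description": "Log lines per second, across all services, that contain 200, 201, 204, 301 or 302. This is an unanchored substring match, so lines containing those digits in other contexts are counted too, and other 2xx/3xx codes are not. Rough proxy for overall throughput.",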
1064
+ "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
1065
+ "datasource": { "type": "loki", "uid": "loki" },
1066
+ "fieldConfig": {
1067
+ "defaults": {
1068
+ "unit": "reqps",
1069
+ "color": { "mode": "palette-classic" },
1070
+ "custom": {
1071
+ "lineWidth": 2,
1072
+ "fillOpacity": 10,
1073
+ "showPoints": "never"
1074
+ }
1075
+ }
1076
+ },
1077
+ "options": {
1078
+ "tooltip": { "mode": "multi", "sort": "none" },
1079
+ "legend": { "displayMode": "list", "placement": "bottom" }
1080
+ },
1081
+ "targets": [
1082
+ {
1083
+ "datasource": { "type": "loki", "uid": "loki" },
1084
+ "expr": "sum(rate({job=~\".+\"} |~ \"(?:200|201|204|301|302)\" [1m]))",
1085
+ "legendFormat": "2xx/3xx rate",
1086
+ "instant": false,
1087
+ "range": true
1088
+ }
1089
+ ]
1090
+ },
1091
+ {
1092
+ "id": 7,
1093
+ "type": "timeseries",
1094
+ "title": "Aggregate error rate",
1095
+ "description": "Total error/panic/fatal log lines per second across all services. Spikes indicate incidents.",
1096
+ "gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
1097
+ "datasource": { "type": "loki", "uid": "loki" },
1098
+ "fieldConfig": {
1099
+ "defaults": {
1100
+ "unit": "reqps",
1101
+ "color": {
1102
+ "mode": "fixed",
1103
+ "fixedColor": "red"
1104
+ },
1105
+ "custom": {
1106
+ "lineWidth": 2,
1107
+ "fillOpacity": 10,
1108
+ "showPoints": "never"
1109
+ }
1110
+ }
1111
+ },
1112
+ "options": {
1113
+ "tooltip": { "mode": "multi", "sort": "none" },
1114
+ "legend": { "displayMode": "list", "placement": "bottom" }
1115
+ },
1116
+ "targets": [
1117
+ {
1118
+ "datasource": { "type": "loki", "uid": "loki" },
1119
+ "expr": "sum(rate({job=~\".+\"} |~ \"(?i)error|panic|fatal\" [1m]))",
1120
+ "legendFormat": "error/panic/fatal rate",
1121
+ "instant": false,
1122
+ "range": true
1123
+ }
1124
+ ]
1125
+ },
1126
+ {
1127
+ "id": 8,
1128
+ "type": "timeseries",
1129
+ "title": "World-dispatch activity (top 10 worlds)",
1130
+ "description": "Dispatch log lines per 5m per world, filtered by the world_id dropdown. world_id is a JSON field (not a Loki label), so it is extracted with the json parser. Select 'All' to see the 10 busiest worlds at each step; select a specific world_id to drill down.",
1131
+ "gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
1132
+ "datasource": { "type": "loki", "uid": "loki" },
1133
+ "fieldConfig": {
1134
+ "defaults": {
1135
+ "unit": "short",
1136
+ "color": { "mode": "palette-classic" },
1137
+ "custom": {
1138
+ "lineWidth": 2,
1139
+ "fillOpacity": 10,
1140
+ "showPoints": "never"
1141
+ }
1142
+ }
1143
+ },
1144
+ "options": {
1145
+ "tooltip": { "mode": "multi", "sort": "desc" },
1146
+ "legend": { "displayMode": "list", "placement": "bottom" }
1147
+ },
1148
+ "targets": [
1149
+ {
1150
+ "datasource": { "type": "loki", "uid": "loki" },
1151
+ "expr": "topk(10, sum by (world_id) (\n count_over_time(\n {service=\"host-cp\"}\n |~ \"dispatch\"\n | json\n | __error__ = \"\"\n | world_id =~ \"${world_id}\"\n [5m]\n )\n))",
1152
+ "legendFormat": "world {{world_id}}",
1153
+ "instant": false,
1154
+ "range": true
1155
+ }
1156
+ ]
1157
+ }
1158
+ ]
1159
+ }
1160
+
1161
+ request-rate.json: |
1162
+ {
1163
+ "uid": "request-rate",
1164
+ "title": "Request Rate / Error Rate (Log-Derived)",
1165
+ "description": "Per-service request rate + error rate derived from Loki logs. Phase B-only — kube-prometheus-stack will replace these with native HTTP metrics in Phase C.",
1166
+ "tags": ["olam", "rate", "phase-b"],
1167
+ "timezone": "browser",
1168
+ "refresh": "30s",
1169
+ "schemaVersion": 39,
1170
+ "version": 1,
1171
+ "time": {
1172
+ "from": "now-1h",
1173
+ "to": "now"
1174
+ },
1175
+ "timepicker": {},
1176
+ "templating": {
1177
+ "list": [
1178
+ {
1179
+ "name": "world_id",
1180
+ "label": "World",
1181
+ "type": "query",
1182
+ "datasource": { "type": "loki", "uid": "loki" },
1183
+ "query": {
1184
+ "qryType": 2,
1185
+ "expr": "sum by (world_id) (count_over_time({service=\"host-cp\"} | json | __error__ = \"\" | world_id != \"\" [5m]))",
1186
+ "step": ""
1187
+ },
1188
+ "refresh": 2,
1189
+ "sort": 1,
1190
+ "multi": false,
1191
+ "includeAll": true,
1192
+ "allValue": ".+",
1193
+ "current": { "selected": false, "text": "All", "value": "$__all" }
1194
+ },
1195
+ {
1196
+ "name": "service",
1197
+ "label": "Service",
1198
+ "type": "query",
1199
+ "datasource": { "type": "loki", "uid": "loki" },
1200
+ "query": { "qryType": 1, "label": "service", "stream": "{job=~\".+\"}" },
1201
+ "refresh": 2,
1202
+ "sort": 1,
1203
+ "multi": true,
1204
+ "includeAll": true,
1205
+ "allValue": ".+",
1206
+ "current": { "selected": false, "text": "All", "value": "$__all" }
1207
+ }
1208
+ ]
1209
+ },
1210
+ "annotations": {
1211
+ "list": []
1212
+ },
1213
+ "panels": [
1214
+ {
1215
+ "id": 1,
1216
+ "type": "timeseries",
1217
+ "title": "Request rate by service",
1218
+ "description": "Log line rate per second per service. Uses log volume as a proxy for request rate — appropriate for Phase B before Prometheus HTTP metrics land in Phase C.",
1219
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
1220
+ "datasource": { "type": "loki", "uid": "loki" },
1221
+ "fieldConfig": {
1222
+ "defaults": {
1223
+ "unit": "reqps",
1224
+ "color": { "mode": "palette-classic" },
1225
+ "custom": {
1226
+ "lineWidth": 2,
1227
+ "fillOpacity": 10,
1228
+ "showPoints": "never"
1229
+ }
1230
+ }
1231
+ },
1232
+ "options": {
1233
+ "tooltip": { "mode": "multi", "sort": "desc" },
1234
+ "legend": { "displayMode": "list", "placement": "bottom" }
1235
+ },
1236
+ "targets": [
1237
+ {
1238
+ "datasource": { "type": "loki", "uid": "loki" },
1239
+ "expr": "sum by (service) (rate({service=~\"${service:regex}\"}[1m]))",
1240
+ "legendFormat": "{{service}}",
1241
+ "instant": false,
1242
+ "range": true
1243
+ }
1244
+ ]
1245
+ },
1246
+ {
1247
+ "id": 2,
1248
+ "type": "timeseries",
1249
+ "title": "Error rate by service",
1250
+ "description": "Log lines matching error|panic|fatal per second per service. Spikes here warrant drill-down in the Ad-hoc LogQL panel below.",
1251
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
1252
+ "datasource": { "type": "loki", "uid": "loki" },
1253
+ "fieldConfig": {
1254
+ "defaults": {
1255
+ "unit": "reqps",
1256
+ "color": { "mode": "palette-classic" },
1257
+ "custom": {
1258
+ "lineWidth": 2,
1259
+ "fillOpacity": 10,
1260
+ "showPoints": "never"
1261
+ }
1262
+ }
1263
+ },
1264
+ "options": {
1265
+ "tooltip": { "mode": "multi", "sort": "desc" },
1266
+ "legend": { "displayMode": "list", "placement": "bottom" }
1267
+ },
1268
+ "targets": [
1269
+ {
1270
+ "datasource": { "type": "loki", "uid": "loki" },
1271
+ "expr": "sum by (service) (rate({service=~\"${service:regex}\"} |~ \"(?i)error|panic|fatal\" [1m]))",
1272
+ "legendFormat": "{{service}}",
1273
+ "instant": false,
1274
+ "range": true
1275
+ }
1276
+ ]
1277
+ },
1278
+ {
1279
+ "id": 3,
1280
+ "type": "table",
1281
+ "title": "Top-5 endpoints (last 5m)",
1282
+ "description": "Top 5 request paths by volume, derived from Traefik JSON access logs. Only Traefik has access-log-grade request_path (per B1 Promtail JSON stage); other services don't extract this field.",
1283
+ "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
1284
+ "datasource": { "type": "loki", "uid": "loki" },
1285
+ "fieldConfig": {
1286
+ "defaults": {
1287
+ "unit": "short",
1288
+ "color": { "mode": "palette-classic" }
1289
+ },
1290
+ "overrides": [
1291
+ {
1292
+ "matcher": { "id": "byName", "options": "Value" },
1293
+ "properties": [
1294
+ { "id": "displayName", "value": "requests" }
1295
+ ]
1296
+ }
1297
+ ]
1298
+ },
1299
+ "options": {
1300
+ "showHeader": true,
1301
+ "footer": { "show": false }
1302
+ },
1303
+ "targets": [
1304
+ {
1305
+ "datasource": { "type": "loki", "uid": "loki" },
1306
+ "expr": "topk(5, sum by (request_path) (count_over_time({service=\"traefik\"} | json | __error__ = \"\" | request_path != \"\" [5m])))",
1307
+ "legendFormat": "",
1308
+ "instant": true,
1309
+ "range": false
1310
+ }
1311
+ ],
1312
+ "transformations": [
1313
+ { "id": "reduce", "options": { "reducers": ["sum"] } }
1314
+ ]
1315
+ },
1316
+ {
1317
+ "id": 4,
1318
+ "type": "logs",
1319
+ "title": "Ad-hoc LogQL (edit me)",
1320
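+ "description": "Operator escape hatch. Edit the query inline using LogQL syntax. The world_id filter is applied in the JSON pipeline because world_id is not promoted to a Loki stream label. With World set to All the filter becomes world_id =~ .+, so lines that carry no world_id field are not shown.",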
1321
+ "gridPos": { "x": 0, "y": 16, "w": 24, "h": 10 },
1322
+ "datasource": { "type": "loki", "uid": "loki" },
1323
+ "fieldConfig": {
1324
+ "defaults": {},
1325
+ "overrides": []
1326
+ },
1327
+ "options": {
1328
+ "showTime": true,
1329
+ "wrapLogMessage": false,
1330
+ "dedupStrategy": "exact",
1331
+ "showLabels": false,
1332
+ "showCommonLabels": false,
1333
+ "sortOrder": "Descending",
1334
+ "prettifyLogMessage": false,
1335
+ "enableLogDetails": true
1336
+ },
1337
+ "targets": [
1338
+ {
1339
+ "datasource": { "type": "loki", "uid": "loki" },
1340
+ "expr": "{service=~\"${service:regex}\"} | json | __error__ = \"\" | world_id =~ \"${world_id}\"",
1341
+ "legendFormat": "",
1342
+ "instant": false,
1343
+ "range": true
1344
+ }
1345
+ ]
1346
+ }
1347
+ ]
1348
+ }
1349
+