@pleri/olam-cli 0.1.160 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +11 -0
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts +15 -0
  4. package/dist/commands/bootstrap.d.ts.map +1 -1
  5. package/dist/commands/bootstrap.js +58 -5
  6. package/dist/commands/bootstrap.js.map +1 -1
  7. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  8. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  9. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  10. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  11. package/dist/commands/skills-source.d.ts.map +1 -1
  12. package/dist/commands/skills-source.js +57 -2
  13. package/dist/commands/skills-source.js.map +1 -1
  14. package/dist/commands/skills.d.ts.map +1 -1
  15. package/dist/commands/skills.js +14 -0
  16. package/dist/commands/skills.js.map +1 -1
  17. package/dist/image-digests.json +7 -7
  18. package/dist/index.js +2424 -1781
  19. package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
  20. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
  21. package/dist/lib/bootstrap-kubernetes.js +367 -0
  22. package/dist/lib/bootstrap-kubernetes.js.map +1 -0
  23. package/dist/lib/config.d.ts.map +1 -1
  24. package/dist/lib/config.js +6 -1
  25. package/dist/lib/config.js.map +1 -1
  26. package/dist/mcp-server.js +568 -368
  27. package/hermes-bundle/version.json +1 -1
  28. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  29. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  30. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  31. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  32. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  33. package/host-cp/observability/grafana-port-forward.sh +283 -0
  34. package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
  35. package/host-cp/observability/loki-ingest.sh +253 -0
  36. package/host-cp/observability/prom-no-double-grafana.sh +311 -0
  37. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  38. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  39. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  40. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  41. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  42. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  43. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  44. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  45. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  46. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  47. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  48. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  49. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  50. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  51. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  52. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  53. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  54. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  55. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  56. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  57. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  58. package/host-cp/src/plan-chat-service.mjs +147 -1
  59. package/package.json +1 -1
@@ -0,0 +1,50 @@
1
+ # 90-prom-alert-cardinality.yaml — Phase C Task C2 cardinality alert rule.
2
+ #
3
+ # PrometheusRule CR: fires OlamActiveSeriesHigh when prometheus_tsdb_head_series
4
+ # exceeds 80k (80% of the 100k active-series cap defined by P4).
5
+ #
6
+ # ruleSelector match: the Prometheus CR rendered by kube-prom-stack 85.2.0 uses
7
+ # ruleSelector: matchLabels: release: "olam-prom"
8
+ # (verified via `helm template ... | grep -A3 ruleSelector`).
9
+ # The label below MUST match or this rule is silently ignored by Prometheus.
10
+ #
11
+ # Alertmanager: enabled in kube-prom-stack-values.yaml from C2 onwards.
12
+ # Receivers: not yet configured (C2 scope = rule landing; receiver config is C4+).
13
+ # Alertmanager will route the alert to its default null receiver until receivers
14
+ # are wired — this is intentional. The alert is visible in the Prometheus UI
15
+ # at /alerts regardless of receiver config.
16
+ #
17
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C2
18
+ # T1 (cardinality bomb) + P4 (<100k active series)
19
+ ---
20
+ apiVersion: monitoring.coreos.com/v1
21
+ kind: PrometheusRule
22
+ metadata:
23
+ name: olam-cardinality
24
+ namespace: monitoring
25
+ labels:
26
+ app.kubernetes.io/name: olam-prometheus-rules
27
+ app.kubernetes.io/managed-by: olam
28
+ # REQUIRED: matches Prometheus CR's ruleSelector (release: "olam-prom").
29
+ # Verified via helm template output, 2026-05-21.
30
+ release: olam-prom
31
+ spec:
32
+ groups:
33
+ - name: olam-cardinality
34
+ interval: 30s
35
+ rules:
36
+ - alert: OlamActiveSeriesHigh
37
+ expr: |
38
+ prometheus_tsdb_head_series > 80000
39
+ for: 5m
40
+ labels:
41
+ severity: warning
42
+ scope: cardinality
43
+ annotations:
44
+ summary: "Active series above 80k threshold (80% of 100k cap)"
45
+ description: |
46
+ prometheus_tsdb_head_series is {{ $value | humanize }} — within 20%
47
+ of the 100k cardinality budget (P4). Investigate which service is
48
+ emitting a new high-cardinality label, OR add a DROP rule to
49
+ kube-prom-stack-values.yaml metricRelabelings for that ServiceMonitor.
50
+ Runbook: docs/architecture/observability-cardinality.md (TBD — C4+)
@@ -0,0 +1,70 @@
1
+ # 91-servicemonitor-host-cp.yaml — Phase C Task C3 ServiceMonitor for host-cp.
2
+ #
3
+ # Registers host-cp's /metrics endpoint with Prometheus for scraping.
4
+ #
5
+ # NOTE: This manifest requires the ServiceMonitor CRD installed by
6
+ # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
7
+ # apply-manifests.sh (which targets the Phase A ingress harness) and is
8
+ # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
9
+ #
10
+ # Namespace placement (CRITICAL — C2 dogfood lesson):
11
+ # ServiceMonitors MUST live in the `monitoring` namespace to be discovered
12
+ # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
13
+ # in any other namespace is silently ignored by default RBAC.
14
+ #
15
+ # Label compliance:
16
+ # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
17
+ # (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
18
+ #
19
+ # Target selector:
20
+ # Matches the host-cp Service by its `app: olam-host-cp` label. Adjust if
21
+ # the Service label differs in the target cluster (check
22
+ # `kubectl get svc -n olam -l app=olam-host-cp`).
23
+ #
24
+ # metricRelabelings (layer-2 cardinality enforcement):
25
+ # Mirrors the `*cardinality-labeldrop` YAML anchor from
26
+ # kube-prom-stack-values.yaml. host-cp's /metrics is taxonomy-compliant
27
+ # (only {service,route,method,status_code} labels), but the labeldrop rule
28
+ # is present as defense-in-depth: if a future code change accidentally
29
+ # emits a banned label (world_id etc.), this ServiceMonitor drops it before
30
+ # ingest so the cardinality cap is never breached.
31
+ #
32
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
33
+ # T1 (cardinality bomb) + P4 (<100k active series)
34
+ ---
35
+ apiVersion: monitoring.coreos.com/v1
36
+ kind: ServiceMonitor
37
+ metadata:
38
+ name: olam-host-cp
39
+ namespace: monitoring
40
+ labels:
41
+ app.kubernetes.io/name: olam-host-cp-monitor
42
+ app.kubernetes.io/managed-by: olam
43
+ # REQUIRED: matches Prometheus CR's serviceMonitorSelector.
44
+ release: olam-prom
45
+ spec:
46
+ # Discover the host-cp Service in the olam namespace.
47
+ namespaceSelector:
48
+ matchNames:
49
+ - olam
50
+ selector:
51
+ matchLabels:
52
+ app: olam-host-cp
53
+ endpoints:
54
+ - port: http
55
+ path: /metrics
56
+ interval: 15s
57
+ # Preserve the application-emitted `service` label. Without honorLabels,
58
+ # Prometheus's target-label injection (where `service` = the k8s Service
59
+ # name `olam-host-cp`) overrides the application's own `service=host-cp`
60
+ # value, moving the app's value into `exported_service`. The C5 drill-in
61
+ # dashboards filter on `service=host-cp`, so without honorLabels their
62
+ # panels show empty data. Surfaced during 2026-05-21 operator dogfood —
63
+ # see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
64
+ honorLabels: true
65
+ # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
66
+ # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
67
+ # even if the service accidentally emits them.
68
+ metricRelabelings:
69
+ - action: labeldrop
70
+ regex: 'world_id|trace_id|user_id|request_id|operator_id'
@@ -0,0 +1,70 @@
1
+ # 92-servicemonitor-kg-service.yaml — Phase C Task C3 ServiceMonitor for kg-service.
2
+ #
3
+ # Registers kg-service's /metrics endpoint with Prometheus for scraping.
4
+ #
5
+ # NOTE: This manifest requires the ServiceMonitor CRD installed by
6
+ # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
7
+ # apply-manifests.sh (which targets the Phase A ingress harness) and is
8
+ # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
9
+ #
10
+ # Namespace placement (CRITICAL — C2 dogfood lesson):
11
+ # ServiceMonitors MUST live in the `monitoring` namespace to be discovered
12
+ # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
13
+ # in any other namespace is silently ignored by default RBAC.
14
+ #
15
+ # Label compliance:
16
+ # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
17
+ # (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
18
+ #
19
+ # Target selector:
20
+ # Matches the kg-service Service by its `app: olam-kg-service` label. Adjust
21
+ # if the Service label differs in the target cluster (check
22
+ # `kubectl get svc -n olam -l app=olam-kg-service`).
23
+ #
24
+ # metricRelabelings (layer-2 cardinality enforcement):
25
+ # Mirrors the `*cardinality-labeldrop` YAML anchor from
26
+ # kube-prom-stack-values.yaml. kg-service's /metrics is taxonomy-compliant
27
+ # (only {service,route,method,status_code} labels), but the labeldrop rule
28
+ # is present as defense-in-depth: if a future code change accidentally
29
+ # emits a banned label (world_id etc.), this ServiceMonitor drops it before
30
+ # ingest so the cardinality cap is never breached.
31
+ #
32
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
33
+ # T1 (cardinality bomb) + P4 (<100k active series)
34
+ ---
35
+ apiVersion: monitoring.coreos.com/v1
36
+ kind: ServiceMonitor
37
+ metadata:
38
+ name: olam-kg-service
39
+ namespace: monitoring
40
+ labels:
41
+ app.kubernetes.io/name: olam-kg-service-monitor
42
+ app.kubernetes.io/managed-by: olam
43
+ # REQUIRED: matches Prometheus CR's serviceMonitorSelector.
44
+ release: olam-prom
45
+ spec:
46
+ # Discover the kg-service Service in the olam namespace.
47
+ namespaceSelector:
48
+ matchNames:
49
+ - olam
50
+ selector:
51
+ matchLabels:
52
+ app: olam-kg-service
53
+ endpoints:
54
+ - port: http
55
+ path: /metrics
56
+ interval: 15s
57
+ # Preserve the application-emitted `service` label. Without honorLabels,
58
+ # Prometheus's target-label injection (where `service` = the k8s Service
59
+ # name `olam-kg-service`) overrides the application's own `service=kg-service`
60
+ # value, moving the app's value into `exported_service`. The C5 drill-in
61
+ # dashboards filter on `service=kg-service`, so without honorLabels their
62
+ # panels show empty data. Surfaced during 2026-05-21 operator dogfood —
63
+ # see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
64
+ honorLabels: true
65
+ # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
66
+ # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
67
+ # even if the service accidentally emits them.
68
+ metricRelabelings:
69
+ - action: labeldrop
70
+ regex: 'world_id|trace_id|user_id|request_id|operator_id'
@@ -0,0 +1,87 @@
1
+ # 93-servicemonitor-memory-service.yaml — Phase C Task C3 closure ServiceMonitor.
2
+ #
3
+ # Registers memory-service's /metrics endpoint with Prometheus for scraping.
4
+ # C3 originally shipped instrumentation for host-cp + kg-service (PR #787) but
5
+ # DEFERRED memory-service because the third-party `agentmemory` Node CLI that
6
+ # runs in k3s exposes no /metrics endpoint. This PR closes that deferral by
7
+ # shipping a small Node HTTP front-door (packages/memory-service/src/metrics-proxy.mjs)
8
+ # inside the container image: external traffic hits the proxy on :3111, the
9
+ # proxy short-circuits /metrics + forwards everything else to agentmemory on
10
+ # loopback :3110. End-state matches the host-cp/kg-service shape so the ServiceMonitor
11
+ # pattern below is a near-clone of 91-servicemonitor-host-cp.yaml.
12
+ #
13
+ # NOTE: This manifest requires the ServiceMonitor CRD installed by
14
+ # kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
15
+ # apply-manifests.sh (which targets the Phase A ingress harness) and is
16
+ # applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
17
+ #
18
+ # Namespace placement (CRITICAL — C2 dogfood lesson):
19
+ # ServiceMonitors MUST live in the `monitoring` namespace to be discovered
20
+ # by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
21
+ # in any other namespace is silently ignored by default RBAC.
22
+ #
23
+ # Label compliance:
24
+ # `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector.
25
+ #
26
+ # Target selector:
27
+ # Matches the memory-service Service by its `app: olam-memory-service` label.
28
+ # The Service is defined in packages/host-cp/k8s/manifests/memory-service/60-service.yaml
29
+ # (port `http` -> targetPort 3111). The 50-traefik-ingressroute-agent-memory.yaml
30
+ # IngressRoute references the same Service for /api/agent-memory/* traffic.
31
+ #
32
+ # Image rollout dependency:
33
+ # The proxy lives inside the container image. Until the next release pipeline
34
+ # refreshes ghcr.io/pleri/olam-memory-service with the post-C3-closure
35
+ # Dockerfile (npm run refresh:manifest-digests), this ServiceMonitor will scrape
36
+ # a target that responds 404 to /metrics. Prometheus treats that as a failed
37
+ # scrape (target shown DOWN, up == 0, no samples). When the new image lands, scraping
38
+ # begins producing real samples without any cluster-side change.
39
+ #
40
+ # metricRelabelings (layer-2 cardinality enforcement):
41
+ # Mirrors the `*cardinality-labeldrop` YAML anchor from
42
+ # kube-prom-stack-values.yaml. memory-service's /metrics is taxonomy-compliant
43
+ # (only {service,route,method,status_code} labels), but the labeldrop rule
44
+ # is present as defense-in-depth: if a future code change accidentally
45
+ # emits a banned label (world_id etc.), this ServiceMonitor drops it before
46
+ # ingest so the cardinality cap is never breached.
47
+ #
48
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
49
+ # T1 (cardinality bomb) + P4 (<100k active series).
50
+ ---
51
+ apiVersion: monitoring.coreos.com/v1
52
+ kind: ServiceMonitor
53
+ metadata:
54
+ name: olam-memory-service
55
+ namespace: monitoring
56
+ labels:
57
+ app.kubernetes.io/name: olam-memory-service-monitor
58
+ app.kubernetes.io/managed-by: olam
59
+ # REQUIRED: matches Prometheus CR's serviceMonitorSelector.
60
+ release: olam-prom
61
+ spec:
62
+ # Discover the memory-service Service in the olam namespace.
63
+ namespaceSelector:
64
+ matchNames:
65
+ - olam
66
+ selector:
67
+ matchLabels:
68
+ app: olam-memory-service
69
+ endpoints:
70
+ - port: http
71
+ path: /metrics
72
+ interval: 15s
73
+ # Preserve the application-emitted `service` label. Without honorLabels,
74
+ # Prometheus's target-label injection (where `service` = the k8s Service
75
+ # name `olam-memory-service`) overrides the application's own
76
+ # `service=memory-service` value, moving the app's value into
77
+ # `exported_service`. The C5 drill-in dashboards filter on
78
+ # `service=memory-service`, so without honorLabels their panels show
79
+ # empty data. Same lesson as the host-cp/kg-service ServiceMonitors —
80
+ # see docs/incidents/2026-05-21-phase-c-dogfood.md finding #3.
81
+ honorLabels: true
82
+ # Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
83
+ # in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
84
+ # even if the service accidentally emits them.
85
+ metricRelabelings:
86
+ - action: labeldrop
87
+ regex: 'world_id|trace_id|user_id|request_id|operator_id'
@@ -0,0 +1,108 @@
1
+ # 95-prom-recording-rules.yaml — Phase C Task C4
2
+ #
3
+ # Naming convention: olam:<metric>:<aggregation>
4
+ #
5
+ # olam — project namespace prefix (all project recording rules share this)
6
+ # <metric> — the base Prometheus metric being aggregated (without _bucket/_total suffix
7
+ # when the aggregation already implies the source type)
8
+ # <aggregation> — describes what was computed + the grouping dimensions, e.g.
9
+ # p95_by_service_route, rate5m_by_service, ratio_by_service_route
10
+ #
11
+ # Modeled on the community convention from
12
+ # https://prometheus.io/docs/practices/rules/#naming — <level>:<metric>:<ops>.
13
+ # The <aggregation> suffix encodes BOTH the operation (p95, rate5m, ratio) and
14
+ # the grouping dimensions (_by_service, _by_service_route) so dashboard panels
15
+ # can select the pre-computed series without further aggregation.
16
+ #
17
+ # Source metrics (provided by C3 — host-cp + kg-service ServiceMonitors):
18
+ # http_request_duration_seconds_bucket{service, route, method, status_code, le}
19
+ # http_requests_total{service, route, method, status_code}
20
+ #
21
+ # rule group interval: 30s — twice the scrape interval (15s × 2). Balances
22
+ # freshness vs evaluation CPU; at 30s each window is re-evaluated twice per
23
+ # minute, keeping percentiles and rates responsive without hammering the TSDB.
24
+ #
25
+ # NOTE: recording rules intentionally reference NO banned labels
26
+ # (world_id, trace_id, user_id, request_id, operator_id). C2's labeldrop at
27
+ # scrape time strips them before ingest; even if a metric slipped through,
28
+ # referencing them here would suppress results. Defense-in-depth: don't type
29
+ # them at all.
30
+ #
31
+ # Applied by: scripts/e2e/prom-no-double-grafana.sh (C4 assertion block)
32
+ # Skipped by: scripts/test-ingress-integration/apply-manifests.sh
33
+ # (9[0-9]-prom-* glob) — requires kube-prom-stack CRDs to be present.
34
+ apiVersion: monitoring.coreos.com/v1
35
+ kind: PrometheusRule
36
+ metadata:
37
+ name: olam-recording-rules
38
+ namespace: monitoring
39
+ labels:
40
+ app.kubernetes.io/name: olam-prometheus-rules
41
+ app.kubernetes.io/managed-by: olam
42
+ release: olam-prom # must match kube-prom-stack ruleSelector (verified C2)
43
+ spec:
44
+ groups:
45
+ - name: olam-http-aggregations
46
+ interval: 30s
47
+ rules:
48
+ # ============================================================
49
+ # Latency percentiles per service+route — Phase C Task C4
50
+ # Source: http_request_duration_seconds_bucket (C3)
51
+ # ============================================================
52
+ - record: olam:http_request_duration_seconds:p50_by_service_route
53
+ expr: |
54
+ histogram_quantile(0.50, sum by (service, route, le) (
55
+ rate(http_request_duration_seconds_bucket[5m])
56
+ ))
57
+
58
+ - record: olam:http_request_duration_seconds:p95_by_service_route
59
+ expr: |
60
+ histogram_quantile(0.95, sum by (service, route, le) (
61
+ rate(http_request_duration_seconds_bucket[5m])
62
+ ))
63
+
64
+ - record: olam:http_request_duration_seconds:p99_by_service_route
65
+ expr: |
66
+ histogram_quantile(0.99, sum by (service, route, le) (
67
+ rate(http_request_duration_seconds_bucket[5m])
68
+ ))
69
+
70
+ # Aggregate p95 across all routes (per-service summary)
71
+ - record: olam:http_request_duration_seconds:p95_by_service
72
+ expr: |
73
+ histogram_quantile(0.95, sum by (service, le) (
74
+ rate(http_request_duration_seconds_bucket[5m])
75
+ ))
76
+
77
+ # ============================================================
78
+ # Request rate per service+route
79
+ # Source: http_requests_total (C3)
80
+ # ============================================================
81
+ - record: olam:http_requests:rate5m_by_service_route
82
+ expr: |
83
+ sum by (service, route) (rate(http_requests_total[5m]))
84
+
85
+ # Aggregate request rate per service
86
+ - record: olam:http_requests:rate5m_by_service
87
+ expr: |
88
+ sum by (service) (rate(http_requests_total[5m]))
89
+
90
+ # ============================================================
91
+ # Error rate (status_code >= 500) per service+route
92
+ # 4xx are client errors and are intentionally excluded from
93
+ # the error ratio — only server-side failures count.
94
+ # ============================================================
95
+ - record: olam:http_errors:rate5m_by_service_route
96
+ expr: |
97
+ sum by (service, route) (
98
+ rate(http_requests_total{status_code=~"5.."}[5m])
99
+ )
100
+
101
+ # Error ratio (errors / total) per service+route.
102
+ # Returns NaN when total rate is 0 (no traffic) — dashboards
103
+ # should handle NaN as "no data" rather than "0% error rate".
104
+ - record: olam:http_errors:ratio_by_service_route
105
+ expr: |
106
+ sum by (service, route) (rate(http_requests_total{status_code=~"5.."}[5m]))
107
+ /
108
+ sum by (service, route) (rate(http_requests_total[5m]))
@@ -0,0 +1,195 @@
1
+ # 96-kyverno-cardinality-mutate.yaml — Phase C C8 follow-up.
2
+ #
3
+ # Closes codex's C2 concern: per-ServiceMonitor metricRelabelings is
4
+ # "policy by convention". A third-party ServiceMonitor or PodMonitor that
5
+ # olam doesn't author can bypass the labeldrop and reintroduce the
6
+ # cardinality bomb (T1). YAML anchors in kube-prom-stack-values.yaml keep
7
+ # Olam-owned manifests DRY but don't make the cluster safe.
8
+ #
9
+ # This ClusterPolicy mutates EVERY incoming ServiceMonitor and PodMonitor
10
+ # at admission time — regardless of who created it (chart, kubectl, operator,
11
+ # CI, GitOps) — to ensure the cardinality labeldrop rule is present on
12
+ # every endpoint. Once persisted, the prometheus-operator renders the
13
+ # relabel into Prometheus's scrape config.
14
+ #
15
+ # Why mutate-only (not validate):
16
+ # Validate would block a chart install or operator action mid-stride
17
+ # if a third-party ServiceMonitor lacks the rule. Mutate is the better
18
+ # posture: silently ensure the rule is present without breaking
19
+ # legitimate installs. Defense-in-depth still lives in TWO layers:
20
+ # (a) admission-time mutation (this policy)
21
+ # (b) per-ServiceMonitor metricRelabelings in
22
+ # kube-prom-stack-values.yaml + 9x-servicemonitor-*.yaml.
23
+ #
24
+ # Idempotency contract:
25
+ # Mutation must NOT add a duplicate labeldrop entry. Achieved by
26
+ # two-rule split per kind, each with a precondition that the labeldrop
27
+ # is currently ABSENT. Once present, neither rule fires:
28
+ # - Rule A (handle absent/empty case): preconditions:
29
+ # metricRelabelings is null/missing OR empty array.
30
+ # JSON patch: `add /spec/endpoints/{i}/metricRelabelings` with
31
+ # a single-element array containing our rule.
32
+ # - Rule B (handle existing-but-no-labeldrop case): preconditions:
33
+ # metricRelabelings is a non-empty array AND no entry has
34
+ # `action: labeldrop` with `regex` mentioning `world_id`.
35
+ # JSON patch: `add /spec/endpoints/{i}/metricRelabelings/-`
36
+ # appending our rule.
37
+ #
38
+ # Verified behavior (kyverno-cardinality-mutate.sh asserts):
39
+ # - Bare ServiceMonitor (no metricRelabelings) → Rule A injects
40
+ # - ServiceMonitor with metricRelabelings: [] → Rule A injects (replaces empty)
41
+ # - ServiceMonitor with unrelated metricRelabelings entries → Rule B appends
42
+ # - ServiceMonitor with matching labeldrop already present → NEITHER rule fires (idempotent)
43
+ # - Mixed: some endpoints lack it, others have it → only the lacking endpoints are mutated
44
+ #
45
+ # Background scan: OFF (background: false). Existing ServiceMonitors at
46
+ # install time are NOT auto-mutated. Re-apply them to trigger admission,
47
+ # or rely on the C2 per-ServiceMonitor metricRelabelings as the failsafe.
48
+ #
49
+ # failurePolicy: Ignore. Kyverno webhook timeout / pod outage MUST NOT
50
+ # block ServiceMonitor admission — the C2 layer-2 rules still protect
51
+ # Olam-owned monitors. Trade-off accepted: during Kyverno downtime, a
52
+ # brand-new third-party ServiceMonitor could land without the labeldrop.
53
+ # The 80k active-series PrometheusRule alert (Phase C C2,
54
+ # 90-prom-alert-cardinality.yaml) is the runtime detector that fires
55
+ # if this gap is exploited.
56
+ #
57
+ # Refs:
58
+ # - docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
59
+ # - codex review on PR #783 ("policy by convention" finding)
60
+ # - https://kyverno.io/docs/writing-policies/mutate/
61
+ # - https://kyverno.io/docs/writing-policies/mutate/#foreach
62
+ ---
63
+ apiVersion: kyverno.io/v1
64
+ kind: ClusterPolicy
65
+ metadata:
66
+ name: enforce-cardinality-labeldrop
67
+ labels:
68
+ app.kubernetes.io/part-of: olam
69
+ olam.io/phase: c-followup
70
+ annotations:
71
+ policies.kyverno.io/title: "Cluster-wide cardinality labeldrop enforcement"
72
+ policies.kyverno.io/category: "Observability"
73
+ policies.kyverno.io/severity: high
74
+ policies.kyverno.io/subject: "ServiceMonitor, PodMonitor"
75
+ policies.kyverno.io/description: >-
76
+ Ensures every ServiceMonitor and PodMonitor carries a metricRelabelings
77
+ labeldrop rule for high-cardinality labels (world_id, trace_id, user_id,
78
+ request_id, operator_id) on every endpoint. Closes the "third-party chart
79
+ bypasses C2 labeldrop" gap surfaced during PR #783 review.
80
+ spec:
81
+ background: false
82
+ failurePolicy: Ignore
83
+ mutateExistingOnPolicyUpdate: false
84
+
85
+ rules:
86
+ # ---------------------------------------------------------------------
87
+ # ServiceMonitor — Rule A: metricRelabelings absent or empty
88
+ # ---------------------------------------------------------------------
89
+ - name: inject-labeldrop-sm-absent
90
+ match:
91
+ any:
92
+ - resources:
93
+ kinds:
94
+ - monitoring.coreos.com/v1/ServiceMonitor
95
+ mutate:
96
+ foreach:
97
+ - list: "request.object.spec.endpoints"
98
+ preconditions:
99
+ all:
100
+ # not_null() maps a null/missing field to `[]`, and length([]) is 0. So
101
+ # this fires when the field is absent OR an empty array.
102
+ - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
103
+ operator: Equals
104
+ value: 0
105
+ patchesJson6902: |-
106
+ - op: add
107
+ path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings"
108
+ value:
109
+ - action: labeldrop
110
+ regex: "world_id|trace_id|user_id|request_id|operator_id"
111
+
112
+ # ---------------------------------------------------------------------
113
+ # ServiceMonitor — Rule B: metricRelabelings has entries, but no
114
+ # matching labeldrop for our banned-label regex.
115
+ #
116
+ # We test `contains(regex, 'world_id')` rather than equality so that
117
+ # operators who include additional banned labels in their own regex
118
+ # don't trigger duplicate injection. This is the idempotency hinge.
119
+ # ---------------------------------------------------------------------
120
+ - name: inject-labeldrop-sm-append
121
+ match:
122
+ any:
123
+ - resources:
124
+ kinds:
125
+ - monitoring.coreos.com/v1/ServiceMonitor
126
+ mutate:
127
+ foreach:
128
+ - list: "request.object.spec.endpoints"
129
+ preconditions:
130
+ all:
131
+ - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
132
+ operator: GreaterThan
133
+ value: 0
134
+ - key: >-
135
+ {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
136
+ operator: Equals
137
+ value: 0
138
+ patchesJson6902: |-
139
+ - op: add
140
+ path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings/-"
141
+ value:
142
+ action: labeldrop
143
+ regex: "world_id|trace_id|user_id|request_id|operator_id"
144
+
145
+ # ---------------------------------------------------------------------
146
+ # PodMonitor — Rule A: podMetricsEndpoints[*].metricRelabelings absent or empty
147
+ # ---------------------------------------------------------------------
148
+ - name: inject-labeldrop-pm-absent
149
+ match:
150
+ any:
151
+ - resources:
152
+ kinds:
153
+ - monitoring.coreos.com/v1/PodMonitor
154
+ mutate:
155
+ foreach:
156
+ - list: "request.object.spec.podMetricsEndpoints"
157
+ preconditions:
158
+ all:
159
+ - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
160
+ operator: Equals
161
+ value: 0
162
+ patchesJson6902: |-
163
+ - op: add
164
+ path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings"
165
+ value:
166
+ - action: labeldrop
167
+ regex: "world_id|trace_id|user_id|request_id|operator_id"
168
+
169
+ # ---------------------------------------------------------------------
170
+ # PodMonitor — Rule B: metricRelabelings exists, no labeldrop
171
+ # ---------------------------------------------------------------------
172
+ - name: inject-labeldrop-pm-append
173
+ match:
174
+ any:
175
+ - resources:
176
+ kinds:
177
+ - monitoring.coreos.com/v1/PodMonitor
178
+ mutate:
179
+ foreach:
180
+ - list: "request.object.spec.podMetricsEndpoints"
181
+ preconditions:
182
+ all:
183
+ - key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
184
+ operator: GreaterThan
185
+ value: 0
186
+ - key: >-
187
+ {{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
188
+ operator: Equals
189
+ value: 0
190
+ patchesJson6902: |-
191
+ - op: add
192
+ path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings/-"
193
+ value:
194
+ action: labeldrop
195
+ regex: "world_id|trace_id|user_id|request_id|operator_id"