@pleri/olam-cli 0.1.161 → 0.1.166
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +35 -11
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +996 -618
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +93 -13
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +12 -2
- package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
- package/host-cp/observability/loki-ingest.sh +12 -2
- package/host-cp/observability/prom-no-double-grafana.sh +15 -5
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# 90-prom-alert-cardinality.yaml — Phase C Task C2 cardinality alert rule.
|
|
2
|
+
#
|
|
3
|
+
# PrometheusRule CR: fires OlamActiveSeriesHigh when prometheus_tsdb_head_series
|
|
4
|
+
# exceeds 80k (80% of the 100k active-series cap defined by P4).
|
|
5
|
+
#
|
|
6
|
+
# ruleSelector match: the Prometheus CR rendered by kube-prom-stack 85.2.0 uses
|
|
7
|
+
# ruleSelector: matchLabels: release: "olam-prom"
|
|
8
|
+
# (verified via `helm template ... | grep -A3 ruleSelector`).
|
|
9
|
+
# The label below MUST match or this rule is silently ignored by Prometheus.
|
|
10
|
+
#
|
|
11
|
+
# Alertmanager: enabled in kube-prom-stack-values.yaml from C2 onwards.
|
|
12
|
+
# Receivers: not yet configured (C2 scope = rule landing; receiver config is C4+).
|
|
13
|
+
# Alertmanager will route the alert to its default null receiver until receivers
|
|
14
|
+
# are wired — this is intentional. The alert is visible in the Prometheus UI
|
|
15
|
+
# at /alerts regardless of receiver config.
|
|
16
|
+
#
|
|
17
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C2
|
|
18
|
+
# T1 (cardinality bomb) + P4 (<100k active series)
|
|
19
|
+
---
|
|
20
|
+
apiVersion: monitoring.coreos.com/v1
|
|
21
|
+
kind: PrometheusRule
|
|
22
|
+
metadata:
|
|
23
|
+
name: olam-cardinality
|
|
24
|
+
namespace: monitoring
|
|
25
|
+
labels:
|
|
26
|
+
app.kubernetes.io/name: olam-prometheus-rules
|
|
27
|
+
app.kubernetes.io/managed-by: olam
|
|
28
|
+
# REQUIRED: matches Prometheus CR's ruleSelector (release: "olam-prom").
|
|
29
|
+
# Verified via helm template output, 2026-05-21.
|
|
30
|
+
release: olam-prom
|
|
31
|
+
spec:
|
|
32
|
+
groups:
|
|
33
|
+
- name: olam-cardinality
|
|
34
|
+
interval: 30s
|
|
35
|
+
rules:
|
|
36
|
+
- alert: OlamActiveSeriesHigh
|
|
37
|
+
expr: |
|
|
38
|
+
prometheus_tsdb_head_series > 80000
|
|
39
|
+
for: 5m
|
|
40
|
+
labels:
|
|
41
|
+
severity: warning
|
|
42
|
+
scope: cardinality
|
|
43
|
+
annotations:
|
|
44
|
+
summary: "Active series above 80k threshold (80% of 100k cap)"
|
|
45
|
+
description: |
|
|
46
|
+
prometheus_tsdb_head_series is {{ $value | humanize }} — within 20%
|
|
47
|
+
of the 100k cardinality budget (P4). Investigate which service is
|
|
48
|
+
emitting a new high-cardinality label, OR add a DROP rule to
|
|
49
|
+
kube-prom-stack-values.yaml metricRelabelings for that ServiceMonitor.
|
|
50
|
+
Runbook: docs/architecture/observability-cardinality.md (TBD — C4+)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# 91-servicemonitor-host-cp.yaml — Phase C Task C3 ServiceMonitor for host-cp.
|
|
2
|
+
#
|
|
3
|
+
# Registers host-cp's /metrics endpoint with Prometheus for scraping.
|
|
4
|
+
#
|
|
5
|
+
# NOTE: This manifest requires the ServiceMonitor CRD installed by
|
|
6
|
+
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
|
|
7
|
+
# apply-manifests.sh (which targets the Phase A ingress harness) and is
|
|
8
|
+
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
|
|
9
|
+
#
|
|
10
|
+
# Namespace placement (CRITICAL — C2 dogfood lesson):
|
|
11
|
+
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
|
|
12
|
+
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
|
|
13
|
+
# in any other namespace is silently ignored by default RBAC.
|
|
14
|
+
#
|
|
15
|
+
# Label compliance:
|
|
16
|
+
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
|
|
17
|
+
# (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
|
|
18
|
+
#
|
|
19
|
+
# Target selector:
|
|
20
|
+
# Matches the host-cp Service by its `app: olam-host-cp` label. Adjust if
|
|
21
|
+
# the Service label differs in the target cluster (check
|
|
22
|
+
# `kubectl get svc -n olam -l app=olam-host-cp`).
|
|
23
|
+
#
|
|
24
|
+
# metricRelabelings (layer-2 cardinality enforcement):
|
|
25
|
+
# Mirrors the `*cardinality-labeldrop` YAML anchor from
|
|
26
|
+
# kube-prom-stack-values.yaml. host-cp's /metrics is taxonomy-compliant
|
|
27
|
+
# (only {service,route,method,status_code} labels), but the labeldrop rule
|
|
28
|
+
# is present as defense-in-depth: if a future code change accidentally
|
|
29
|
+
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
|
|
30
|
+
# ingest so the cardinality cap is never breached.
|
|
31
|
+
#
|
|
32
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
|
|
33
|
+
# T1 (cardinality bomb) + P4 (<100k active series)
|
|
34
|
+
---
|
|
35
|
+
apiVersion: monitoring.coreos.com/v1
|
|
36
|
+
kind: ServiceMonitor
|
|
37
|
+
metadata:
|
|
38
|
+
name: olam-host-cp
|
|
39
|
+
namespace: monitoring
|
|
40
|
+
labels:
|
|
41
|
+
app.kubernetes.io/name: olam-host-cp-monitor
|
|
42
|
+
app.kubernetes.io/managed-by: olam
|
|
43
|
+
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
|
|
44
|
+
release: olam-prom
|
|
45
|
+
spec:
|
|
46
|
+
# Discover the host-cp Service in the olam namespace.
|
|
47
|
+
namespaceSelector:
|
|
48
|
+
matchNames:
|
|
49
|
+
- olam
|
|
50
|
+
selector:
|
|
51
|
+
matchLabels:
|
|
52
|
+
app: olam-host-cp
|
|
53
|
+
endpoints:
|
|
54
|
+
- port: http
|
|
55
|
+
path: /metrics
|
|
56
|
+
interval: 15s
|
|
57
|
+
# Preserve the application-emitted `service` label. Without honorLabels,
|
|
58
|
+
# Prometheus's target-label injection (where `service` = the k8s Service
|
|
59
|
+
# name `olam-host-cp`) overrides the application's own `service=host-cp`
|
|
60
|
+
# value, moving the app's value into `exported_service`. The C5 drill-in
|
|
61
|
+
# dashboards filter on `service=host-cp`, so without honorLabels their
|
|
62
|
+
# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
|
|
63
|
+
# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
|
|
64
|
+
honorLabels: true
|
|
65
|
+
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
|
|
66
|
+
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
|
|
67
|
+
# even if the service accidentally emits them.
|
|
68
|
+
metricRelabelings:
|
|
69
|
+
- action: labeldrop
|
|
70
|
+
regex: 'world_id|trace_id|user_id|request_id|operator_id'
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# 92-servicemonitor-kg-service.yaml — Phase C Task C3 ServiceMonitor for kg-service.
|
|
2
|
+
#
|
|
3
|
+
# Registers kg-service's /metrics endpoint with Prometheus for scraping.
|
|
4
|
+
#
|
|
5
|
+
# NOTE: This manifest requires the ServiceMonitor CRD installed by
|
|
6
|
+
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
|
|
7
|
+
# apply-manifests.sh (which targets the Phase A ingress harness) and is
|
|
8
|
+
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
|
|
9
|
+
#
|
|
10
|
+
# Namespace placement (CRITICAL — C2 dogfood lesson):
|
|
11
|
+
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
|
|
12
|
+
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
|
|
13
|
+
# in any other namespace is silently ignored by default RBAC.
|
|
14
|
+
#
|
|
15
|
+
# Label compliance:
|
|
16
|
+
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector
|
|
17
|
+
# (verified via `helm template ... | grep -A3 serviceMonitorSelector`).
|
|
18
|
+
#
|
|
19
|
+
# Target selector:
|
|
20
|
+
# Matches the kg-service Service by its `app: olam-kg-service` label. Adjust
|
|
21
|
+
# if the Service label differs in the target cluster (check
|
|
22
|
+
# `kubectl get svc -n olam -l app=olam-kg-service`).
|
|
23
|
+
#
|
|
24
|
+
# metricRelabelings (layer-2 cardinality enforcement):
|
|
25
|
+
# Mirrors the `*cardinality-labeldrop` YAML anchor from
|
|
26
|
+
# kube-prom-stack-values.yaml. kg-service's /metrics is taxonomy-compliant
|
|
27
|
+
# (only {service,route,method,status_code} labels), but the labeldrop rule
|
|
28
|
+
# is present as defense-in-depth: if a future code change accidentally
|
|
29
|
+
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
|
|
30
|
+
# ingest so the cardinality cap is never breached.
|
|
31
|
+
#
|
|
32
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
|
|
33
|
+
# T1 (cardinality bomb) + P4 (<100k active series)
|
|
34
|
+
---
|
|
35
|
+
apiVersion: monitoring.coreos.com/v1
|
|
36
|
+
kind: ServiceMonitor
|
|
37
|
+
metadata:
|
|
38
|
+
name: olam-kg-service
|
|
39
|
+
namespace: monitoring
|
|
40
|
+
labels:
|
|
41
|
+
app.kubernetes.io/name: olam-kg-service-monitor
|
|
42
|
+
app.kubernetes.io/managed-by: olam
|
|
43
|
+
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
|
|
44
|
+
release: olam-prom
|
|
45
|
+
spec:
|
|
46
|
+
# Discover the kg-service Service in the olam namespace.
|
|
47
|
+
namespaceSelector:
|
|
48
|
+
matchNames:
|
|
49
|
+
- olam
|
|
50
|
+
selector:
|
|
51
|
+
matchLabels:
|
|
52
|
+
app: olam-kg-service
|
|
53
|
+
endpoints:
|
|
54
|
+
- port: http
|
|
55
|
+
path: /metrics
|
|
56
|
+
interval: 15s
|
|
57
|
+
# Preserve the application-emitted `service` label. Without honorLabels,
|
|
58
|
+
# Prometheus's target-label injection (where `service` = the k8s Service
|
|
59
|
+
# name `olam-kg-service`) overrides the application's own `service=kg-service`
|
|
60
|
+
# value, moving the app's value into `exported_service`. The C5 drill-in
|
|
61
|
+
# dashboards filter on `service=kg-service`, so without honorLabels their
|
|
62
|
+
# panels show empty data. Surfaced during 2026-05-21 operator dogfood —
|
|
63
|
+
# see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #3.
|
|
64
|
+
honorLabels: true
|
|
65
|
+
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
|
|
66
|
+
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
|
|
67
|
+
# even if the service accidentally emits them.
|
|
68
|
+
metricRelabelings:
|
|
69
|
+
- action: labeldrop
|
|
70
|
+
regex: 'world_id|trace_id|user_id|request_id|operator_id'
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# 93-servicemonitor-memory-service.yaml — Phase C Task C3 closure ServiceMonitor.
|
|
2
|
+
#
|
|
3
|
+
# Registers memory-service's /metrics endpoint with Prometheus for scraping.
|
|
4
|
+
# C3 originally shipped instrumentation for host-cp + kg-service (PR #787) but
|
|
5
|
+
# DEFERRED memory-service because the third-party `agentmemory` Node CLI that
|
|
6
|
+
# runs in k3s exposes no /metrics endpoint. This PR closes that deferral by
|
|
7
|
+
# shipping a small Node HTTP front-door (packages/memory-service/src/metrics-proxy.mjs)
|
|
8
|
+
# inside the container image: external traffic hits the proxy on :3111, the
|
|
9
|
+
# proxy short-circuits /metrics + forwards everything else to agentmemory on
|
|
10
|
+
# loopback :3110. End-state matches the host-cp/kg-service shape so the ServiceMonitor
|
|
11
|
+
# pattern below is a near-clone of 91-servicemonitor-host-cp.yaml.
|
|
12
|
+
#
|
|
13
|
+
# NOTE: This manifest requires the ServiceMonitor CRD installed by
|
|
14
|
+
# kube-prometheus-stack (Phase C Task C1). It is SKIPPED by
|
|
15
|
+
# apply-manifests.sh (which targets the Phase A ingress harness) and is
|
|
16
|
+
# applied by the phase-c-e2e harness after kube-prom-stack ships CRDs.
|
|
17
|
+
#
|
|
18
|
+
# Namespace placement (CRITICAL — C2 dogfood lesson):
|
|
19
|
+
# ServiceMonitors MUST live in the `monitoring` namespace to be discovered
|
|
20
|
+
# by the Prometheus CR's serviceMonitorNamespaceSelector. A ServiceMonitor
|
|
21
|
+
# in any other namespace is silently ignored by default RBAC.
|
|
22
|
+
#
|
|
23
|
+
# Label compliance:
|
|
24
|
+
# `release: olam-prom` matches the Prometheus CR's serviceMonitorSelector.
|
|
25
|
+
#
|
|
26
|
+
# Target selector:
|
|
27
|
+
# Matches the memory-service Service by its `app: olam-memory-service` label.
|
|
28
|
+
# The Service is defined in packages/host-cp/k8s/manifests/memory-service/60-service.yaml
|
|
29
|
+
# (port `http` -> targetPort 3111). The 50-traefik-ingressroute-agent-memory.yaml
|
|
30
|
+
# IngressRoute references the same Service for /api/agent-memory/* traffic.
|
|
31
|
+
#
|
|
32
|
+
# Image rollout dependency:
|
|
33
|
+
# The proxy lives inside the container image. Until the next release pipeline
|
|
34
|
+
# refreshes ghcr.io/pleri/olam-memory-service with the post-C3-closure
|
|
35
|
+
# Dockerfile (npm run refresh:manifest-digests), this ServiceMonitor will scrape
|
|
36
|
+
# a target that responds 404 to /metrics. Prometheus tolerates that (the target
|
|
37
|
+
# is reported DOWN (up=0) and no samples are ingested). When the new image lands, scraping
|
|
38
|
+
# begins producing real samples without any cluster-side change.
|
|
39
|
+
#
|
|
40
|
+
# metricRelabelings (layer-2 cardinality enforcement):
|
|
41
|
+
# Mirrors the `*cardinality-labeldrop` YAML anchor from
|
|
42
|
+
# kube-prom-stack-values.yaml. memory-service's /metrics is taxonomy-compliant
|
|
43
|
+
# (only {service,route,method,status_code} labels), but the labeldrop rule
|
|
44
|
+
# is present as defense-in-depth: if a future code change accidentally
|
|
45
|
+
# emits a banned label (world_id etc.), this ServiceMonitor drops it before
|
|
46
|
+
# ingest so the cardinality cap is never breached.
|
|
47
|
+
#
|
|
48
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C3
|
|
49
|
+
# T1 (cardinality bomb) + P4 (<100k active series).
|
|
50
|
+
---
|
|
51
|
+
apiVersion: monitoring.coreos.com/v1
|
|
52
|
+
kind: ServiceMonitor
|
|
53
|
+
metadata:
|
|
54
|
+
name: olam-memory-service
|
|
55
|
+
namespace: monitoring
|
|
56
|
+
labels:
|
|
57
|
+
app.kubernetes.io/name: olam-memory-service-monitor
|
|
58
|
+
app.kubernetes.io/managed-by: olam
|
|
59
|
+
# REQUIRED: matches Prometheus CR's serviceMonitorSelector.
|
|
60
|
+
release: olam-prom
|
|
61
|
+
spec:
|
|
62
|
+
# Discover the memory-service Service in the olam namespace.
|
|
63
|
+
namespaceSelector:
|
|
64
|
+
matchNames:
|
|
65
|
+
- olam
|
|
66
|
+
selector:
|
|
67
|
+
matchLabels:
|
|
68
|
+
app: olam-memory-service
|
|
69
|
+
endpoints:
|
|
70
|
+
- port: http
|
|
71
|
+
path: /metrics
|
|
72
|
+
interval: 15s
|
|
73
|
+
# Preserve the application-emitted `service` label. Without honorLabels,
|
|
74
|
+
# Prometheus's target-label injection (where `service` = the k8s Service
|
|
75
|
+
# name `olam-memory-service`) overrides the application's own
|
|
76
|
+
# `service=memory-service` value, moving the app's value into
|
|
77
|
+
# `exported_service`. The C5 drill-in dashboards filter on
|
|
78
|
+
# `service=memory-service`, so without honorLabels their panels show
|
|
79
|
+
# empty data. Same lesson as the host-cp/kg-service ServiceMonitors —
|
|
80
|
+
# see docs/incidents/2026-05-21-phase-c-dogfood.md finding #3.
|
|
81
|
+
honorLabels: true
|
|
82
|
+
# Layer-2 cardinality enforcement — same regex as *cardinality-labeldrop
|
|
83
|
+
# in kube-prom-stack-values.yaml. Defense-in-depth: drops banned labels
|
|
84
|
+
# even if the service accidentally emits them.
|
|
85
|
+
metricRelabelings:
|
|
86
|
+
- action: labeldrop
|
|
87
|
+
regex: 'world_id|trace_id|user_id|request_id|operator_id'
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# 95-prom-recording-rules.yaml — Phase C Task C4
|
|
2
|
+
#
|
|
3
|
+
# Naming convention: olam:<metric>:<aggregation>
|
|
4
|
+
#
|
|
5
|
+
# olam — project namespace prefix (all project recording rules share this)
|
|
6
|
+
# <metric> — the base Prometheus metric being aggregated (without _bucket/_total suffix
|
|
7
|
+
# when the aggregation already implies the source type)
|
|
8
|
+
# <aggregation> — describes what was computed + the grouping dimensions, e.g.
|
|
9
|
+
# p95_by_service_route, rate5m_by_service, ratio_by_service_route
|
|
10
|
+
#
|
|
11
|
+
# Modeled on the community convention from
|
|
12
|
+
# https://prometheus.io/docs/practices/rules/#naming — <level>:<metric>:<ops>.
|
|
13
|
+
# The <aggregation> suffix encodes BOTH the operation (p95, rate5m, ratio) and
|
|
14
|
+
# the grouping dimensions (_by_service, _by_service_route) so dashboard panels
|
|
15
|
+
# can select the pre-computed series without further aggregation.
|
|
16
|
+
#
|
|
17
|
+
# Source metrics (provided by C3 — host-cp + kg-service ServiceMonitors):
|
|
18
|
+
# http_request_duration_seconds_bucket{service, route, method, status_code, le}
|
|
19
|
+
# http_requests_total{service, route, method, status_code}
|
|
20
|
+
#
|
|
21
|
+
# rule group interval: 30s — twice the scrape interval (15s × 2). Balances
|
|
22
|
+
# freshness vs evaluation CPU; at 30s each window is re-evaluated twice per
|
|
23
|
+
# minute, keeping percentiles and rates responsive without hammering the TSDB.
|
|
24
|
+
#
|
|
25
|
+
# NOTE: recording rules intentionally reference NO banned labels
|
|
26
|
+
# (world_id, trace_id, user_id, request_id, operator_id). C2's labeldrop at
|
|
27
|
+
# scrape time strips them before ingest; even if a metric slipped through,
|
|
28
|
+
# referencing them here would suppress results. Defense-in-depth: don't type
|
|
29
|
+
# them at all.
|
|
30
|
+
#
|
|
31
|
+
# Applied by: scripts/e2e/prom-no-double-grafana.sh (C4 assertion block)
|
|
32
|
+
# Skipped by: scripts/test-ingress-integration/apply-manifests.sh
|
|
33
|
+
# (9[0-9]-prom-* glob) — requires kube-prom-stack CRDs to be present.
|
|
34
|
+
apiVersion: monitoring.coreos.com/v1
|
|
35
|
+
kind: PrometheusRule
|
|
36
|
+
metadata:
|
|
37
|
+
name: olam-recording-rules
|
|
38
|
+
namespace: monitoring
|
|
39
|
+
labels:
|
|
40
|
+
app.kubernetes.io/name: olam-prometheus-rules
|
|
41
|
+
app.kubernetes.io/managed-by: olam
|
|
42
|
+
release: olam-prom # must match kube-prom-stack ruleSelector (verified C2)
|
|
43
|
+
spec:
|
|
44
|
+
groups:
|
|
45
|
+
- name: olam-http-aggregations
|
|
46
|
+
interval: 30s
|
|
47
|
+
rules:
|
|
48
|
+
# ============================================================
|
|
49
|
+
# Latency percentiles per service+route — Phase C Task C4
|
|
50
|
+
# Source: http_request_duration_seconds_bucket (C3)
|
|
51
|
+
# ============================================================
|
|
52
|
+
- record: olam:http_request_duration_seconds:p50_by_service_route
|
|
53
|
+
expr: |
|
|
54
|
+
histogram_quantile(0.50, sum by (service, route, le) (
|
|
55
|
+
rate(http_request_duration_seconds_bucket[5m])
|
|
56
|
+
))
|
|
57
|
+
|
|
58
|
+
- record: olam:http_request_duration_seconds:p95_by_service_route
|
|
59
|
+
expr: |
|
|
60
|
+
histogram_quantile(0.95, sum by (service, route, le) (
|
|
61
|
+
rate(http_request_duration_seconds_bucket[5m])
|
|
62
|
+
))
|
|
63
|
+
|
|
64
|
+
- record: olam:http_request_duration_seconds:p99_by_service_route
|
|
65
|
+
expr: |
|
|
66
|
+
histogram_quantile(0.99, sum by (service, route, le) (
|
|
67
|
+
rate(http_request_duration_seconds_bucket[5m])
|
|
68
|
+
))
|
|
69
|
+
|
|
70
|
+
# Aggregate p95 across all routes (per-service summary)
|
|
71
|
+
- record: olam:http_request_duration_seconds:p95_by_service
|
|
72
|
+
expr: |
|
|
73
|
+
histogram_quantile(0.95, sum by (service, le) (
|
|
74
|
+
rate(http_request_duration_seconds_bucket[5m])
|
|
75
|
+
))
|
|
76
|
+
|
|
77
|
+
# ============================================================
|
|
78
|
+
# Request rate per service+route
|
|
79
|
+
# Source: http_requests_total (C3)
|
|
80
|
+
# ============================================================
|
|
81
|
+
- record: olam:http_requests:rate5m_by_service_route
|
|
82
|
+
expr: |
|
|
83
|
+
sum by (service, route) (rate(http_requests_total[5m]))
|
|
84
|
+
|
|
85
|
+
# Aggregate request rate per service
|
|
86
|
+
- record: olam:http_requests:rate5m_by_service
|
|
87
|
+
expr: |
|
|
88
|
+
sum by (service) (rate(http_requests_total[5m]))
|
|
89
|
+
|
|
90
|
+
# ============================================================
|
|
91
|
+
# Error rate (status_code >= 500) per service+route
|
|
92
|
+
# 4xx are client errors and are intentionally excluded from
|
|
93
|
+
# the error ratio — only server-side failures count.
|
|
94
|
+
# ============================================================
|
|
95
|
+
- record: olam:http_errors:rate5m_by_service_route
|
|
96
|
+
expr: |
|
|
97
|
+
sum by (service, route) (
|
|
98
|
+
rate(http_requests_total{status_code=~"5.."}[5m])
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
# Error ratio (errors / total) per service+route.
|
|
102
|
+
# Returns NaN when total rate is 0 (no traffic) — dashboards
|
|
103
|
+
# should handle NaN as "no data" rather than "0% error rate".
|
|
104
|
+
- record: olam:http_errors:ratio_by_service_route
|
|
105
|
+
expr: |
|
|
106
|
+
sum by (service, route) (rate(http_requests_total{status_code=~"5.."}[5m]))
|
|
107
|
+
/
|
|
108
|
+
sum by (service, route) (rate(http_requests_total[5m]))
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# 96-kyverno-cardinality-mutate.yaml — Phase C C8 follow-up.
|
|
2
|
+
#
|
|
3
|
+
# Closes codex's C2 concern: per-ServiceMonitor metricRelabelings is
|
|
4
|
+
# "policy by convention". A third-party ServiceMonitor or PodMonitor that
|
|
5
|
+
# olam doesn't author can bypass the labeldrop and reintroduce the
|
|
6
|
+
# cardinality bomb (T1). YAML anchors in kube-prom-stack-values.yaml keep
|
|
7
|
+
# Olam-owned manifests DRY but don't make the cluster safe.
|
|
8
|
+
#
|
|
9
|
+
# This ClusterPolicy mutates EVERY incoming ServiceMonitor and PodMonitor
|
|
10
|
+
# at admission time — regardless of who created it (chart, kubectl, operator,
|
|
11
|
+
# CI, GitOps) — to ensure the cardinality labeldrop rule is present on
|
|
12
|
+
# every endpoint. Once persisted, the prometheus-operator renders the
|
|
13
|
+
# relabel into Prometheus's scrape config.
|
|
14
|
+
#
|
|
15
|
+
# Why mutate-only (not validate):
|
|
16
|
+
# Validate would block a chart install or operator action mid-stride
|
|
17
|
+
# if a third-party ServiceMonitor lacks the rule. Mutate is the better
|
|
18
|
+
# posture: silently ensure the rule is present without breaking
|
|
19
|
+
# legitimate installs. Defense-in-depth still lives in TWO layers:
|
|
20
|
+
# (a) admission-time mutation (this policy)
|
|
21
|
+
# (b) per-ServiceMonitor metricRelabelings in
|
|
22
|
+
# kube-prom-stack-values.yaml + 9x-servicemonitor-*.yaml.
|
|
23
|
+
#
|
|
24
|
+
# Idempotency contract:
|
|
25
|
+
# Mutation must NOT add a duplicate labeldrop entry. Achieved by
|
|
26
|
+
# two-rule split per kind, each with a precondition that the labeldrop
|
|
27
|
+
# is currently ABSENT. Once present, neither rule fires:
|
|
28
|
+
# - Rule A (handle absent/empty case): preconditions:
|
|
29
|
+
# metricRelabelings is null/missing OR empty array.
|
|
30
|
+
# JSON patch: `add /spec/endpoints/{i}/metricRelabelings` with
|
|
31
|
+
# a single-element array containing our rule.
|
|
32
|
+
# - Rule B (handle existing-but-no-labeldrop case): preconditions:
|
|
33
|
+
# metricRelabelings is a non-empty array AND no entry has
|
|
34
|
+
# `action: labeldrop` with `regex` mentioning `world_id`.
|
|
35
|
+
# JSON patch: `add /spec/endpoints/{i}/metricRelabelings/-`
|
|
36
|
+
# appending our rule.
|
|
37
|
+
#
|
|
38
|
+
# Verified behavior (kyverno-cardinality-mutate.sh asserts):
|
|
39
|
+
# - Bare ServiceMonitor (no metricRelabelings) → Rule A injects
|
|
40
|
+
# - ServiceMonitor with metricRelabelings: [] → Rule A injects (replaces empty)
|
|
41
|
+
# - ServiceMonitor with unrelated metricRelabelings entries → Rule B appends
|
|
42
|
+
# - ServiceMonitor with matching labeldrop already present → NEITHER rule fires (idempotent)
|
|
43
|
+
# - Mixed: some endpoints lack it, others have it → only the lacking endpoints are mutated
|
|
44
|
+
#
|
|
45
|
+
# Background scan: OFF (background: false). Existing ServiceMonitors at
|
|
46
|
+
# install time are NOT auto-mutated. Re-apply them to trigger admission,
|
|
47
|
+
# or rely on the C2 per-ServiceMonitor metricRelabelings as the failsafe.
|
|
48
|
+
#
|
|
49
|
+
# failurePolicy: Ignore. Kyverno webhook timeout / pod outage MUST NOT
|
|
50
|
+
# block ServiceMonitor admission — the C2 layer-2 rules still protect
|
|
51
|
+
# Olam-owned monitors. Trade-off accepted: during Kyverno downtime, a
|
|
52
|
+
# brand-new third-party ServiceMonitor could land without the labeldrop.
|
|
53
|
+
# The 80k active-series PrometheusRule alert (Phase C C2,
|
|
54
|
+
# 90-prom-alert-cardinality.yaml) is the runtime detector that fires
|
|
55
|
+
# if this gap is exploited.
|
|
56
|
+
#
|
|
57
|
+
# Refs:
|
|
58
|
+
# - docs/plans/k3s-ingress-observability/phase-c-tasks.md — C8
|
|
59
|
+
# - codex review on PR #783 ("policy by convention" finding)
|
|
60
|
+
# - https://kyverno.io/docs/writing-policies/mutate/
|
|
61
|
+
# - https://kyverno.io/docs/writing-policies/mutate/#foreach
|
|
62
|
+
---
|
|
63
|
+
apiVersion: kyverno.io/v1
|
|
64
|
+
kind: ClusterPolicy
|
|
65
|
+
metadata:
|
|
66
|
+
name: enforce-cardinality-labeldrop
|
|
67
|
+
labels:
|
|
68
|
+
app.kubernetes.io/part-of: olam
|
|
69
|
+
olam.io/phase: c-followup
|
|
70
|
+
annotations:
|
|
71
|
+
policies.kyverno.io/title: "Cluster-wide cardinality labeldrop enforcement"
|
|
72
|
+
policies.kyverno.io/category: "Observability"
|
|
73
|
+
policies.kyverno.io/severity: high
|
|
74
|
+
policies.kyverno.io/subject: "ServiceMonitor, PodMonitor"
|
|
75
|
+
policies.kyverno.io/description: >-
|
|
76
|
+
Ensures every ServiceMonitor and PodMonitor carries a metricRelabelings
|
|
77
|
+
labeldrop rule for high-cardinality labels (world_id, trace_id, user_id,
|
|
78
|
+
request_id, operator_id) on every endpoint. Closes the "third-party chart
|
|
79
|
+
bypasses C2 labeldrop" gap surfaced during PR #783 review.
|
|
80
|
+
spec:
|
|
81
|
+
background: false
|
|
82
|
+
failurePolicy: Ignore
|
|
83
|
+
mutateExistingOnPolicyUpdate: false
|
|
84
|
+
|
|
85
|
+
rules:
|
|
86
|
+
# ---------------------------------------------------------------------
|
|
87
|
+
# ServiceMonitor — Rule A: metricRelabelings absent or empty
|
|
88
|
+
# ---------------------------------------------------------------------
|
|
89
|
+
- name: inject-labeldrop-sm-absent
|
|
90
|
+
match:
|
|
91
|
+
any:
|
|
92
|
+
- resources:
|
|
93
|
+
kinds:
|
|
94
|
+
- monitoring.coreos.com/v1/ServiceMonitor
|
|
95
|
+
mutate:
|
|
96
|
+
foreach:
|
|
97
|
+
- list: "request.object.spec.endpoints"
|
|
98
|
+
preconditions:
|
|
99
|
+
all:
|
|
100
|
+
# not_null() maps a null/missing field to `[]`, and length([]) is 0. So
|
|
101
|
+
# this fires when the field is absent OR an empty array.
|
|
102
|
+
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
|
|
103
|
+
operator: Equals
|
|
104
|
+
value: 0
|
|
105
|
+
patchesJson6902: |-
|
|
106
|
+
- op: add
|
|
107
|
+
path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings"
|
|
108
|
+
value:
|
|
109
|
+
- action: labeldrop
|
|
110
|
+
regex: "world_id|trace_id|user_id|request_id|operator_id"
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------
|
|
113
|
+
# ServiceMonitor — Rule B: metricRelabelings has entries, but no
|
|
114
|
+
# matching labeldrop for our banned-label regex.
|
|
115
|
+
#
|
|
116
|
+
# We test `contains(regex, 'world_id')` rather than equality so that
|
|
117
|
+
# operators who include additional banned labels in their own regex
|
|
118
|
+
# don't trigger duplicate injection. This is the idempotency hinge.
|
|
119
|
+
# ---------------------------------------------------------------------
|
|
120
|
+
- name: inject-labeldrop-sm-append
|
|
121
|
+
match:
|
|
122
|
+
any:
|
|
123
|
+
- resources:
|
|
124
|
+
kinds:
|
|
125
|
+
- monitoring.coreos.com/v1/ServiceMonitor
|
|
126
|
+
mutate:
|
|
127
|
+
foreach:
|
|
128
|
+
- list: "request.object.spec.endpoints"
|
|
129
|
+
preconditions:
|
|
130
|
+
all:
|
|
131
|
+
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
|
|
132
|
+
operator: GreaterThan
|
|
133
|
+
value: 0
|
|
134
|
+
- key: >-
|
|
135
|
+
{{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
|
|
136
|
+
operator: Equals
|
|
137
|
+
value: 0
|
|
138
|
+
patchesJson6902: |-
|
|
139
|
+
- op: add
|
|
140
|
+
path: "/spec/endpoints/{{ elementIndex }}/metricRelabelings/-"
|
|
141
|
+
value:
|
|
142
|
+
action: labeldrop
|
|
143
|
+
regex: "world_id|trace_id|user_id|request_id|operator_id"
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------
|
|
146
|
+
# PodMonitor — Rule A: podMetricsEndpoints[*].metricRelabelings absent
|
|
147
|
+
# ---------------------------------------------------------------------
|
|
148
|
+
- name: inject-labeldrop-pm-absent
|
|
149
|
+
match:
|
|
150
|
+
any:
|
|
151
|
+
- resources:
|
|
152
|
+
kinds:
|
|
153
|
+
- monitoring.coreos.com/v1/PodMonitor
|
|
154
|
+
mutate:
|
|
155
|
+
foreach:
|
|
156
|
+
- list: "request.object.spec.podMetricsEndpoints"
|
|
157
|
+
preconditions:
|
|
158
|
+
all:
|
|
159
|
+
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
|
|
160
|
+
operator: Equals
|
|
161
|
+
value: 0
|
|
162
|
+
patchesJson6902: |-
|
|
163
|
+
- op: add
|
|
164
|
+
path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings"
|
|
165
|
+
value:
|
|
166
|
+
- action: labeldrop
|
|
167
|
+
regex: "world_id|trace_id|user_id|request_id|operator_id"
|
|
168
|
+
|
|
169
|
+
# ---------------------------------------------------------------------
|
|
170
|
+
# PodMonitor — Rule B: metricRelabelings exists, no labeldrop
|
|
171
|
+
# ---------------------------------------------------------------------
|
|
172
|
+
- name: inject-labeldrop-pm-append
|
|
173
|
+
match:
|
|
174
|
+
any:
|
|
175
|
+
- resources:
|
|
176
|
+
kinds:
|
|
177
|
+
- monitoring.coreos.com/v1/PodMonitor
|
|
178
|
+
mutate:
|
|
179
|
+
foreach:
|
|
180
|
+
- list: "request.object.spec.podMetricsEndpoints"
|
|
181
|
+
preconditions:
|
|
182
|
+
all:
|
|
183
|
+
- key: "{{ length(not_null(element.metricRelabelings, `[]`)) }}"
|
|
184
|
+
operator: GreaterThan
|
|
185
|
+
value: 0
|
|
186
|
+
- key: >-
|
|
187
|
+
{{ length(element.metricRelabelings[?action == 'labeldrop' && contains(not_null(regex, ''), 'world_id')]) }}
|
|
188
|
+
operator: Equals
|
|
189
|
+
value: 0
|
|
190
|
+
patchesJson6902: |-
|
|
191
|
+
- op: add
|
|
192
|
+
path: "/spec/podMetricsEndpoints/{{ elementIndex }}/metricRelabelings/-"
|
|
193
|
+
value:
|
|
194
|
+
action: labeldrop
|
|
195
|
+
regex: "world_id|trace_id|user_id|request_id|operator_id"
|