@pleri/olam-cli 0.1.161 → 0.1.166
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +35 -11
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +996 -618
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +93 -13
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +12 -2
- package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
- package/host-cp/observability/loki-ingest.sh +12 -2
- package/host-cp/observability/prom-no-double-grafana.sh +15 -5
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -111,7 +111,7 @@ spec:
|
|
|
111
111
|
# k3d), started by `olam upgrade` Step 0.7 — not inside this Pod.
|
|
112
112
|
containers:
|
|
113
113
|
- name: olam-host-cp
|
|
114
|
-
image: ghcr.io/pleri/olam-host-cp@sha256:
|
|
114
|
+
image: ghcr.io/pleri/olam-host-cp@sha256:7a49b44546d9b69c5a7448613130a43319e90e06a2999d688101657d7d851dda
|
|
115
115
|
imagePullPolicy: IfNotPresent
|
|
116
116
|
securityContext:
|
|
117
117
|
runAsNonRoot: true
|
|
@@ -70,7 +70,7 @@ spec:
|
|
|
70
70
|
mountPath: /data
|
|
71
71
|
containers:
|
|
72
72
|
- name: olam-auth-service
|
|
73
|
-
image: ghcr.io/pleri/olam-auth@sha256:
|
|
73
|
+
image: ghcr.io/pleri/olam-auth@sha256:d41a940bc9eb7016aeecc1c653e057d63d32d33c1e694d298b5340711d3d0bd8
|
|
74
74
|
imagePullPolicy: IfNotPresent
|
|
75
75
|
securityContext:
|
|
76
76
|
runAsNonRoot: true
|
|
@@ -61,7 +61,7 @@ spec:
|
|
|
61
61
|
mountPath: /data
|
|
62
62
|
containers:
|
|
63
63
|
- name: olam-kg-service
|
|
64
|
-
image: ghcr.io/pleri/olam-kg-service@sha256:
|
|
64
|
+
image: ghcr.io/pleri/olam-kg-service@sha256:b9a96be3cad11f298286d011a88309ac2e495074970bf4d860c032709a5ab72f
|
|
65
65
|
imagePullPolicy: IfNotPresent
|
|
66
66
|
securityContext:
|
|
67
67
|
runAsNonRoot: true
|
|
@@ -68,7 +68,7 @@ spec:
|
|
|
68
68
|
mountPath: /data
|
|
69
69
|
containers:
|
|
70
70
|
- name: olam-mcp-auth-service
|
|
71
|
-
image: ghcr.io/pleri/olam-mcp-auth@sha256:
|
|
71
|
+
image: ghcr.io/pleri/olam-mcp-auth@sha256:0322f65701dfda84a2d0672071914fd7276927772ccccf6c5f55c4c3617cd8fe
|
|
72
72
|
imagePullPolicy: IfNotPresent
|
|
73
73
|
securityContext:
|
|
74
74
|
runAsNonRoot: true
|
|
@@ -70,7 +70,7 @@ spec:
|
|
|
70
70
|
# bootstrap-placeholder comment + run `npm run refresh:manifest-digests`
|
|
71
71
|
# once ghcr.io/pleri/olam-memory-service has a real published digest.
|
|
72
72
|
# bootstrap-placeholder: pre-publish; refresh after first release
|
|
73
|
-
image: ghcr.io/pleri/olam-memory-service@sha256:
|
|
73
|
+
image: ghcr.io/pleri/olam-memory-service@sha256:70ae9c81efe07d8105c109aea970105709fc4daa50b0d688aa5d299a39a8b24a
|
|
74
74
|
imagePullPolicy: IfNotPresent
|
|
75
75
|
securityContext:
|
|
76
76
|
runAsNonRoot: true
|
|
@@ -115,7 +115,17 @@ log "Secret applied"
|
|
|
115
115
|
# packages/peripheral-services/scripts/sync-grafana-dashboards.sh.
|
|
116
116
|
# -------------------------------------------------------------------------
|
|
117
117
|
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
|
|
118
|
-
|
|
118
|
+
# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
|
|
119
|
+
# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
|
|
120
|
+
# peripheral-services/{helm-values,manifests} directory is reachable.
|
|
121
|
+
# Monorepo callers leave it unset; the script falls back to the source dir
|
|
122
|
+
# under packages/peripheral-services/.
|
|
123
|
+
if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
|
|
124
|
+
PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
|
|
125
|
+
else
|
|
126
|
+
PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
|
|
127
|
+
fi
|
|
128
|
+
CONFIGMAP_MANIFEST="$PERIPHERAL_SERVICES_DIR/manifests/80-grafana-dashboard-configmap.yaml"
|
|
119
129
|
|
|
120
130
|
if [[ -f "$CONFIGMAP_MANIFEST" ]]; then
|
|
121
131
|
log "applying olam-dashboards ConfigMap from $CONFIGMAP_MANIFEST"
|
|
@@ -133,7 +143,7 @@ helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
|
|
|
133
143
|
--version "$GRAFANA_CHART_VERSION" \
|
|
134
144
|
--namespace "$NAMESPACE" \
|
|
135
145
|
--create-namespace \
|
|
136
|
-
-f "$
|
|
146
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
|
|
137
147
|
--wait \
|
|
138
148
|
--timeout 300s
|
|
139
149
|
|
|
@@ -56,6 +56,16 @@ log() { printf '[kyverno-mutate] %s\n' "$*" >&2; }
|
|
|
56
56
|
fail() { printf '[kyverno-mutate] FAIL: %s\n' "$*" >&2; exit 1; }
|
|
57
57
|
|
|
58
58
|
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
|
|
59
|
+
# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
|
|
60
|
+
# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
|
|
61
|
+
# peripheral-services/{helm-values,manifests} directory is reachable.
|
|
62
|
+
# Monorepo callers leave it unset; the script falls back to the source dir
|
|
63
|
+
# under packages/peripheral-services/.
|
|
64
|
+
if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
|
|
65
|
+
PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
|
|
66
|
+
else
|
|
67
|
+
PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
|
|
68
|
+
fi
|
|
59
69
|
|
|
60
70
|
# -------------------------------------------------------------------------
|
|
61
71
|
# Cleanup trap — kill port-forwards; remove synthetic resources on exit.
|
|
@@ -111,7 +121,7 @@ helm upgrade --install olam-kyverno kyverno/kyverno \
|
|
|
111
121
|
--version "$KYVERNO_VERSION" \
|
|
112
122
|
--namespace "$KYVERNO_NAMESPACE" \
|
|
113
123
|
--create-namespace \
|
|
114
|
-
-f "$
|
|
124
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
|
|
115
125
|
--wait --timeout 300s 2>&1 | tail -8
|
|
116
126
|
|
|
117
127
|
# Sanity: kyverno-admission-controller Deployment Ready.
|
|
@@ -140,7 +150,7 @@ fi
|
|
|
140
150
|
# Step 2: Apply the ClusterPolicy
|
|
141
151
|
# -------------------------------------------------------------------------
|
|
142
152
|
log "applying ClusterPolicy enforce-cardinality-labeldrop"
|
|
143
|
-
kubectl apply -f "$
|
|
153
|
+
kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/96-kyverno-cardinality-mutate.yaml"
|
|
144
154
|
|
|
145
155
|
# Wait for policy to be Ready (Kyverno controller picks it up and reports
|
|
146
156
|
# readiness in status.ready / .conditions).
|
|
@@ -66,6 +66,16 @@ log "pre-flight checks passed"
|
|
|
66
66
|
# Resolve repo root so helm -f paths work regardless of invocation cwd
|
|
67
67
|
# -------------------------------------------------------------------------
|
|
68
68
|
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
|
|
69
|
+
# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
|
|
70
|
+
# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
|
|
71
|
+
# peripheral-services/{helm-values,manifests} directory is reachable.
|
|
72
|
+
# Monorepo callers leave it unset; the script resolves the source dir under
|
|
73
|
+
# packages/peripheral-services/.
|
|
74
|
+
if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
|
|
75
|
+
PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
|
|
76
|
+
else
|
|
77
|
+
PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
|
|
78
|
+
fi
|
|
69
79
|
|
|
70
80
|
# -------------------------------------------------------------------------
|
|
71
81
|
# Ensure grafana Helm repo is present (idempotent — safe to re-run)
|
|
@@ -81,7 +91,7 @@ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
|
|
|
81
91
|
--version 6.7.4 \
|
|
82
92
|
--namespace "$NAMESPACE" \
|
|
83
93
|
--create-namespace \
|
|
84
|
-
-f "$
|
|
94
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
|
|
85
95
|
--wait \
|
|
86
96
|
--timeout 300s
|
|
87
97
|
|
|
@@ -94,7 +104,7 @@ log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
|
|
|
94
104
|
helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
|
|
95
105
|
--version 6.16.6 \
|
|
96
106
|
--namespace "$NAMESPACE" \
|
|
97
|
-
-f "$
|
|
107
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
|
|
98
108
|
--wait \
|
|
99
109
|
--timeout 120s
|
|
100
110
|
|
|
@@ -43,6 +43,16 @@ fail() { printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2; exit 1; }
|
|
|
43
43
|
# Resolve repo root so helm -f paths work regardless of invocation cwd
|
|
44
44
|
# -------------------------------------------------------------------------
|
|
45
45
|
REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
|
|
46
|
+
# When invoked from a published @pleri/olam-cli install (no monorepo), `olam
|
|
47
|
+
# setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
|
|
48
|
+
# peripheral-services/{helm-values,manifests} directory is reachable.
|
|
49
|
+
# Monorepo callers leave it unset; the script falls back to the source dir
|
|
50
|
+
# under packages/peripheral-services/.
|
|
51
|
+
if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
|
|
52
|
+
PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
|
|
53
|
+
else
|
|
54
|
+
PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
|
|
55
|
+
fi
|
|
46
56
|
|
|
47
57
|
# -------------------------------------------------------------------------
|
|
48
58
|
# Cleanup trap — kill port-forwards on exit; leave Helm releases in place
|
|
@@ -84,7 +94,7 @@ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stac
|
|
|
84
94
|
--version "$PROM_CHART_VERSION" \
|
|
85
95
|
--namespace "$NAMESPACE" \
|
|
86
96
|
--create-namespace \
|
|
87
|
-
-f "$
|
|
97
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
|
|
88
98
|
--wait \
|
|
89
99
|
--timeout 600s
|
|
90
100
|
|
|
@@ -131,7 +141,7 @@ log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pi
|
|
|
131
141
|
helm upgrade olam-loki grafana/loki \
|
|
132
142
|
--version "$LOKI_CHART_VERSION" \
|
|
133
143
|
--namespace "$NAMESPACE" \
|
|
134
|
-
-f "$
|
|
144
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
|
|
135
145
|
--wait \
|
|
136
146
|
--timeout 300s \
|
|
137
147
|
--reuse-values \
|
|
@@ -142,7 +152,7 @@ log "olam-loki upgraded (ServiceMonitor enabled)"
|
|
|
142
152
|
helm upgrade olam-promtail grafana/promtail \
|
|
143
153
|
--version "$PROMTAIL_CHART_VERSION" \
|
|
144
154
|
--namespace "$NAMESPACE" \
|
|
145
|
-
-f "$
|
|
155
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
|
|
146
156
|
--wait \
|
|
147
157
|
--timeout 300s \
|
|
148
158
|
--reuse-values \
|
|
@@ -153,7 +163,7 @@ log "olam-promtail upgraded (ServiceMonitor enabled)"
|
|
|
153
163
|
helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
|
|
154
164
|
--version "$GRAFANA_CHART_VERSION" \
|
|
155
165
|
--namespace "$NAMESPACE" \
|
|
156
|
-
-f "$
|
|
166
|
+
-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
|
|
157
167
|
--wait \
|
|
158
168
|
--timeout 300s \
|
|
159
169
|
--reuse-values \
|
|
@@ -272,7 +282,7 @@ log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
|
|
|
272
282
|
PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
|
|
273
283
|
|
|
274
284
|
log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
|
|
275
|
-
kubectl apply -f "$
|
|
285
|
+
kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml"
|
|
276
286
|
|
|
277
287
|
# Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson).
|
|
278
288
|
# Poll /api/v1/rules until our group appears (up to 180s).
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Grafana Helm values — k3s-ingress-observability Phase B Task B2
|
|
2
|
+
#
|
|
3
|
+
# STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
|
|
4
|
+
# - This is NOT the Grafana bundled with kube-prometheus-stack.
|
|
5
|
+
# - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
|
|
6
|
+
# explicitly to prevent a second Grafana Deployment from landing.
|
|
7
|
+
# - Port-forward only — NEVER expose via Traefik IngressRoute.
|
|
8
|
+
# See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
|
|
9
|
+
#
|
|
10
|
+
# Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
|
|
11
|
+
# Upgrade discipline: chart version is embedded in the e2e script comment.
|
|
12
|
+
|
|
13
|
+
# -------------------------------------------------------------------------
|
|
14
|
+
# Admin credentials — loaded from a pre-existing Secret, NOT from chart
|
|
15
|
+
# values. Secret is created by scripts/e2e/grafana-port-forward.sh before
|
|
16
|
+
# helm install, or by the operator following the procedure in
|
|
17
|
+
# packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
|
|
18
|
+
# The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
|
|
19
|
+
# (dogfood finding #4) because `kubectl apply` would overwrite the operator's
|
|
20
|
+
# pre-created Secret with the placeholder value.
|
|
21
|
+
# -------------------------------------------------------------------------
|
|
22
|
+
admin:
|
|
23
|
+
existingSecret: olam-grafana-admin
|
|
24
|
+
userKey: admin-user
|
|
25
|
+
passwordKey: admin-password
|
|
26
|
+
|
|
27
|
+
# -------------------------------------------------------------------------
|
|
28
|
+
# Service: ClusterIP only.
|
|
29
|
+
# Decision 16: port-forward only; never ingress-routed.
|
|
30
|
+
# Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
|
|
31
|
+
# -------------------------------------------------------------------------
|
|
32
|
+
service:
|
|
33
|
+
type: ClusterIP
|
|
34
|
+
port: 80
|
|
35
|
+
|
|
36
|
+
# -------------------------------------------------------------------------
|
|
37
|
+
# Ingress: disabled.
|
|
38
|
+
# Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
|
|
39
|
+
# Port-forward is the sole operator access path. Enabling ingress here would
|
|
40
|
+
# silently violate the access-control intent even if no IngressRoute manifest
|
|
41
|
+
# is committed.
|
|
42
|
+
# -------------------------------------------------------------------------
|
|
43
|
+
ingress:
|
|
44
|
+
enabled: false # Decision 16: port-forward only; never ingress-routed
|
|
45
|
+
|
|
46
|
+
# -------------------------------------------------------------------------
|
|
47
|
+
# Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
|
|
48
|
+
#
|
|
49
|
+
# Dual-chart pattern:
|
|
50
|
+
# - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
|
|
51
|
+
# sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
|
|
52
|
+
# - This standalone grafana/grafana chart (Phase B) is the only Grafana.
|
|
53
|
+
# - The Prometheus datasource URL points at `prometheus-operated`, which is
|
|
54
|
+
# the in-cluster Service that kube-prometheus-stack's Prometheus Operator
|
|
55
|
+
# creates for the managed Prometheus StatefulSet.
|
|
56
|
+
# - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
|
|
57
|
+
# so Grafana's step calculation aligns with actual data granularity.
|
|
58
|
+
# - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
|
|
59
|
+
# adds Tempo; Grafana silently ignores unknown datasource UIDs.
|
|
60
|
+
#
|
|
61
|
+
# editable: false prevents accidental operator drift across sessions.
|
|
62
|
+
# -------------------------------------------------------------------------
|
|
63
|
+
datasources:
|
|
64
|
+
datasources.yaml:
|
|
65
|
+
apiVersion: 1
|
|
66
|
+
datasources:
|
|
67
|
+
- name: Loki
|
|
68
|
+
type: loki
|
|
69
|
+
access: proxy
|
|
70
|
+
url: http://olam-loki.monitoring.svc.cluster.local:3100
|
|
71
|
+
isDefault: true
|
|
72
|
+
editable: false
|
|
73
|
+
- name: Prometheus
|
|
74
|
+
type: prometheus
|
|
75
|
+
access: proxy
|
|
76
|
+
url: http://prometheus-operated.monitoring.svc.cluster.local:9090
|
|
77
|
+
isDefault: false
|
|
78
|
+
editable: false
|
|
79
|
+
jsonData:
|
|
80
|
+
timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
|
|
81
|
+
exemplarTraceIdDestinations:
|
|
82
|
+
- name: trace_id
|
|
83
|
+
datasourceUid: tempo # Phase D may add Tempo; harmless until then
|
|
84
|
+
|
|
85
|
+
# -------------------------------------------------------------------------
|
|
86
|
+
# Dashboard provisioner: file-based ConfigMap mount.
|
|
87
|
+
# B3 lands the olam-dashboards ConfigMap and the actual JSON files.
|
|
88
|
+
# B2 wires the loader so B3's ConfigMap is picked up automatically.
|
|
89
|
+
# -------------------------------------------------------------------------
|
|
90
|
+
dashboardProviders:
|
|
91
|
+
dashboardproviders.yaml:
|
|
92
|
+
apiVersion: 1
|
|
93
|
+
providers:
|
|
94
|
+
- name: olam-default
|
|
95
|
+
orgId: 1
|
|
96
|
+
folder: 'Olam'
|
|
97
|
+
type: file
|
|
98
|
+
disableDeletion: true
|
|
99
|
+
updateIntervalSeconds: 30
|
|
100
|
+
allowUiUpdates: false
|
|
101
|
+
options:
|
|
102
|
+
path: /var/lib/grafana/dashboards/olam-default
|
|
103
|
+
|
|
104
|
+
# Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
|
|
105
|
+
# Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
|
|
106
|
+
# this is benign and does not block Grafana startup.
|
|
107
|
+
dashboardsConfigMaps:
|
|
108
|
+
olam-default: olam-dashboards # B3 creates this ConfigMap
|
|
109
|
+
|
|
110
|
+
# -------------------------------------------------------------------------
|
|
111
|
+
# Resources: tuned for single-operator k3s (<256Mi idle typical).
|
|
112
|
+
# P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
|
|
113
|
+
# -------------------------------------------------------------------------
|
|
114
|
+
resources:
|
|
115
|
+
requests:
|
|
116
|
+
cpu: 50m
|
|
117
|
+
memory: 128Mi
|
|
118
|
+
limits:
|
|
119
|
+
cpu: 200m
|
|
120
|
+
memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget
|
|
121
|
+
|
|
122
|
+
# -------------------------------------------------------------------------
|
|
123
|
+
# Persistence: disabled for Phase B.
|
|
124
|
+
# Grafana state (dashboards, users) lives in ConfigMaps / values files.
|
|
125
|
+
# Phase C may enable a PV if fine-grained alert state or annotations
|
|
126
|
+
# accumulate. For now, stateless Grafana is simpler and matches S2.
|
|
127
|
+
# -------------------------------------------------------------------------
|
|
128
|
+
persistence:
|
|
129
|
+
enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B
|
|
130
|
+
|
|
131
|
+
# -------------------------------------------------------------------------
|
|
132
|
+
# ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
|
|
133
|
+
# Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
|
|
134
|
+
# shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
|
|
135
|
+
# avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
|
|
136
|
+
# helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
|
|
137
|
+
# pre-CRD breaks the install on chart versions that hard-validate.
|
|
138
|
+
# -------------------------------------------------------------------------
|
|
139
|
+
serviceMonitor:
|
|
140
|
+
# Disabled in the source-of-truth values file so a standalone Phase B install
|
|
141
|
+
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
|
|
142
|
+
# The C1 e2e script flips this on at RUNTIME via
|
|
143
|
+
# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
|
|
144
|
+
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
|
|
145
|
+
enabled: false
|
|
146
|
+
|
|
147
|
+
# -------------------------------------------------------------------------
|
|
148
|
+
# Grafana.ini overrides: anonymous access disabled (default). Sets the
|
|
149
|
+
# server root_url so port-forward URLs render correctly in email / share
|
|
150
|
+
# links (cosmetic; not a security seam), and turns off telemetry, update checks, and embedding.
|
|
151
|
+
# -------------------------------------------------------------------------
|
|
152
|
+
grafana.ini:
|
|
153
|
+
server:
|
|
154
|
+
root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
|
|
155
|
+
analytics:
|
|
156
|
+
reporting_enabled: false # no telemetry to grafana.com
|
|
157
|
+
check_for_updates: false
|
|
158
|
+
security:
|
|
159
|
+
allow_embedding: false
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
|
|
2
|
+
#
|
|
3
|
+
# Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
|
|
4
|
+
# (latest stable as of 2026-05-21).
|
|
5
|
+
# Upgrade discipline: pin in this file + e2e script comment must stay in sync.
|
|
6
|
+
#
|
|
7
|
+
# CRITICAL: grafana.enabled MUST stay false.
|
|
8
|
+
# Phase B ships a standalone grafana/grafana chart (olam-grafana release).
|
|
9
|
+
# kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
|
|
10
|
+
# a second Grafana Deployment from landing in the cluster.
|
|
11
|
+
# Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
|
|
12
|
+
# Enabling the sub-chart here would violate that decision and create two
|
|
13
|
+
# Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
|
|
14
|
+
# assertion.
|
|
15
|
+
#
|
|
16
|
+
# Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
|
|
17
|
+
# prometheus-operator: 128Mi req / 512Mi limit
|
|
18
|
+
# prometheus: 512Mi req / 2Gi limit
|
|
19
|
+
# node-exporter: 64Mi req / 128Mi limit
|
|
20
|
+
# kube-state-metrics: 128Mi req / 256Mi limit
|
|
21
|
+
# Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
|
|
22
|
+
#
|
|
23
|
+
# Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
|
|
24
|
+
# The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
|
|
25
|
+
# is advisory — the size cap enforces first.
|
|
26
|
+
#
|
|
27
|
+
# Alertmanager: disabled for C1. C2 lands the first alert rule (cardinality 80k).
|
|
28
|
+
# C2 has shipped: alertmanager.enabled is set to true below (no receivers are configured in this file).
|
|
29
|
+
# Comment: "C1 ships without alertmanager; C2 enables when first alert rule lands."
|
|
30
|
+
|
|
31
|
+
# -------------------------------------------------------------------------
|
|
32
|
+
# CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
|
|
33
|
+
#
|
|
34
|
+
# Goal: strip high-cardinality labels (world_id, trace_id, user_id,
|
|
35
|
+
# request_id, operator_id) from every scraped series BEFORE TSDB ingest.
|
|
36
|
+
#
|
|
37
|
+
# Architecture finding (helm template verified, 2026-05-21):
|
|
38
|
+
# The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
|
|
39
|
+
# field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
|
|
40
|
+
# metricRelabelings. There is no chart-level "apply to all scrapes" slot.
|
|
41
|
+
#
|
|
42
|
+
# Enforcement strategy (two-layer):
|
|
43
|
+
# Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
|
|
44
|
+
# ServiceMonitor the chart controls (alertmanager, coreDns, prometheusOperator,
|
|
45
|
+
# prometheus self-scrape, node-exporter). Belt-and-suspenders; these
|
|
46
|
+
# services don't emit world_id etc. in practice, but the rule is free.
|
|
47
|
+
# Note: kube-state-metrics sub-chart has no metricRelabelings slot in
|
|
48
|
+
# its prometheus.monitor section at chart version 85.2.0 — omitted.
|
|
49
|
+
# Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
|
|
50
|
+
# script's synthetic violator ServiceMonitor carries the same labeldrop
|
|
51
|
+
# rule (release: olam-prom label + metricRelabelings). New services
|
|
52
|
+
# MUST include the same block — enforced by docs + code review.
|
|
53
|
+
#
|
|
54
|
+
# Why labeldrop is the right action:
|
|
55
|
+
# action: labeldrop removes the matched labels from ALL series that carry
|
|
56
|
+
# them, regardless of metric name. This is the same semantic as Promtail's
|
|
57
|
+
# pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
|
|
58
|
+
# world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
|
|
59
|
+
#
|
|
60
|
+
# Regex covers all five taxonomy labels from observability-label-taxonomy:
|
|
61
|
+
# world_id, trace_id, user_id, request_id, operator_id
|
|
62
|
+
# -------------------------------------------------------------------------
|
|
63
|
+
_cardinalityLabeldrop: &cardinality-labeldrop
|
|
64
|
+
- action: labeldrop
|
|
65
|
+
regex: 'world_id|trace_id|user_id|request_id|operator_id'
|
|
66
|
+
|
|
67
|
+
# -------------------------------------------------------------------------
|
|
68
|
+
# HARD REQUIREMENT: grafana sub-chart is off.
|
|
69
|
+
# See top-of-file comment for rationale.
|
|
70
|
+
# -------------------------------------------------------------------------
|
|
71
|
+
grafana:
|
|
72
|
+
enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical
|
|
73
|
+
|
|
74
|
+
# -------------------------------------------------------------------------
|
|
75
|
+
# Alertmanager: enabled as of C2, which landed the first alert rule.
|
|
76
|
+
# C2 comment: "C1 ships without alertmanager; C2 enables when first alert rule lands."
|
|
77
|
+
# -------------------------------------------------------------------------
|
|
78
|
+
alertmanager:
|
|
79
|
+
enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
|
|
80
|
+
serviceMonitor:
|
|
81
|
+
metricRelabelings: *cardinality-labeldrop
|
|
82
|
+
|
|
83
|
+
# -------------------------------------------------------------------------
|
|
84
|
+
# Default kube-controller-manager / scheduler / proxy / etcd monitors.
|
|
85
|
+
# These ServiceMonitors don't work on k3d/k3s because the endpoints are not
|
|
86
|
+
# exposed via the usual ports. Disabling avoids noisy "endpoint not found"
|
|
87
|
+
# warnings and scrape failures on every Prometheus eval cycle.
|
|
88
|
+
# -------------------------------------------------------------------------
|
|
89
|
+
kubeControllerManager:
|
|
90
|
+
enabled: false
|
|
91
|
+
|
|
92
|
+
kubeScheduler:
|
|
93
|
+
enabled: false
|
|
94
|
+
|
|
95
|
+
kubeProxy:
|
|
96
|
+
enabled: false
|
|
97
|
+
|
|
98
|
+
kubeEtcd:
|
|
99
|
+
enabled: false
|
|
100
|
+
|
|
101
|
+
# kube-apiserver and kubelet DO work on k3d but generate high-cardinality
|
|
102
|
+
# label combinations. Disable for now; re-evaluate when per-service /metrics
|
|
103
|
+
# (C3) and cardinality enforcement (C2) are in place.
|
|
104
|
+
kubeApiServer:
|
|
105
|
+
enabled: false
|
|
106
|
+
|
|
107
|
+
kubelet:
|
|
108
|
+
enabled: false
|
|
109
|
+
|
|
110
|
+
# -------------------------------------------------------------------------
|
|
111
|
+
# Default alerting rules: off.
|
|
112
|
+
# The bundled default rules generate Alertmanager receivers and PrometheusRule
|
|
113
|
+
# objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
|
|
114
|
+
# and add noise before C2's focused cardinality rule lands.
|
|
115
|
+
# C2 will add targeted PrometheusRule objects separately.
|
|
116
|
+
# -------------------------------------------------------------------------
|
|
117
|
+
defaultRules:
|
|
118
|
+
create: false
|
|
119
|
+
|
|
120
|
+
# -------------------------------------------------------------------------
|
|
121
|
+
# coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
|
|
122
|
+
# -------------------------------------------------------------------------
|
|
123
|
+
coreDns:
|
|
124
|
+
serviceMonitor:
|
|
125
|
+
metricRelabelings: *cardinality-labeldrop
|
|
126
|
+
|
|
127
|
+
# -------------------------------------------------------------------------
|
|
128
|
+
# CRDs: install via chart (default: true, explicit for clarity).
|
|
129
|
+
# These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
|
|
130
|
+
# before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
|
|
131
|
+
# Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
|
|
132
|
+
# Established before helm-upgrading the Phase B charts.
|
|
133
|
+
# -------------------------------------------------------------------------
|
|
134
|
+
crds:
|
|
135
|
+
enabled: true
|
|
136
|
+
|
|
137
|
+
# -------------------------------------------------------------------------
|
|
138
|
+
# Prometheus Operator
|
|
139
|
+
# -------------------------------------------------------------------------
|
|
140
|
+
prometheusOperator:
|
|
141
|
+
enabled: true
|
|
142
|
+
serviceMonitor:
|
|
143
|
+
metricRelabelings: *cardinality-labeldrop
|
|
144
|
+
resources:
|
|
145
|
+
requests:
|
|
146
|
+
cpu: 100m
|
|
147
|
+
memory: 128Mi
|
|
148
|
+
limits:
|
|
149
|
+
cpu: 500m
|
|
150
|
+
memory: 512Mi
|
|
151
|
+
|
|
152
|
+
# -------------------------------------------------------------------------
|
|
153
|
+
# Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
|
|
154
|
+
# -------------------------------------------------------------------------
|
|
155
|
+
prometheus:
|
|
156
|
+
serviceMonitor:
|
|
157
|
+
metricRelabelings: *cardinality-labeldrop
|
|
158
|
+
prometheusSpec:
|
|
159
|
+
scrapeInterval: 15s # Decision 14
|
|
160
|
+
evaluationInterval: 15s
|
|
161
|
+
retention: 15d # Decision 14 — advisory; size cap enforces first
|
|
162
|
+
retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
|
|
163
|
+
walCompression: true
|
|
164
|
+
enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
|
|
165
|
+
enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
|
|
166
|
+
logLevel: warn # info is noisy at 15s scrape cycle
|
|
167
|
+
|
|
168
|
+
resources:
|
|
169
|
+
requests:
|
|
170
|
+
cpu: 200m
|
|
171
|
+
memory: 512Mi
|
|
172
|
+
limits:
|
|
173
|
+
cpu: 1000m
|
|
174
|
+
memory: 2Gi
|
|
175
|
+
|
|
176
|
+
# PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
|
|
177
|
+
# local-path provisioner is used on k3d; cloud providers use their default SC.
|
|
178
|
+
storageSpec:
|
|
179
|
+
volumeClaimTemplate:
|
|
180
|
+
spec:
|
|
181
|
+
accessModes:
|
|
182
|
+
- ReadWriteOnce
|
|
183
|
+
resources:
|
|
184
|
+
requests:
|
|
185
|
+
storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments
|
|
186
|
+
|
|
187
|
+
# -------------------------------------------------------------------------
|
|
188
|
+
# Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
|
|
189
|
+
# -------------------------------------------------------------------------
|
|
190
|
+
nodeExporter:
|
|
191
|
+
enabled: true
|
|
192
|
+
|
|
193
|
+
prometheus-node-exporter:
|
|
194
|
+
prometheus:
|
|
195
|
+
monitor:
|
|
196
|
+
metricRelabelings: *cardinality-labeldrop
|
|
197
|
+
resources:
|
|
198
|
+
requests:
|
|
199
|
+
cpu: 30m
|
|
200
|
+
memory: 64Mi
|
|
201
|
+
limits:
|
|
202
|
+
cpu: 100m
|
|
203
|
+
memory: 128Mi
|
|
204
|
+
|
|
205
|
+
# -------------------------------------------------------------------------
|
|
206
|
+
# kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
|
|
207
|
+
# -------------------------------------------------------------------------
|
|
208
|
+
kubeStateMetrics:
|
|
209
|
+
enabled: true
|
|
210
|
+
|
|
211
|
+
kube-state-metrics:
|
|
212
|
+
resources:
|
|
213
|
+
requests:
|
|
214
|
+
cpu: 50m
|
|
215
|
+
memory: 128Mi
|
|
216
|
+
limits:
|
|
217
|
+
cpu: 200m
|
|
218
|
+
memory: 256Mi
|
|
219
|
+
|
|
220
|
+
# -------------------------------------------------------------------------
|
|
221
|
+
# Datasource auto-discovery note:
|
|
222
|
+
# kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
|
|
223
|
+
# is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
|
|
224
|
+
# in this same C1 PR to include a Prometheus datasource entry pointing at:
|
|
225
|
+
# http://prometheus-operated.monitoring.svc.cluster.local:9090
|
|
226
|
+
# This is the in-cluster Service that kube-prometheus-stack creates for the
|
|
227
|
+
# Prometheus StatefulSet (created by the Prometheus Operator from the
|
|
228
|
+
# Prometheus CR above).
|
|
229
|
+
# -------------------------------------------------------------------------
|