@pleri/olam-cli 0.1.161 → 0.1.166

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +4 -4
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts.map +1 -1
  4. package/dist/commands/bootstrap.js +35 -11
  5. package/dist/commands/bootstrap.js.map +1 -1
  6. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  7. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  8. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  9. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  10. package/dist/commands/skills-source.d.ts.map +1 -1
  11. package/dist/commands/skills-source.js +57 -2
  12. package/dist/commands/skills-source.js.map +1 -1
  13. package/dist/commands/skills.d.ts.map +1 -1
  14. package/dist/commands/skills.js +14 -0
  15. package/dist/commands/skills.js.map +1 -1
  16. package/dist/image-digests.json +7 -7
  17. package/dist/index.js +996 -618
  18. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  19. package/dist/lib/bootstrap-kubernetes.js +93 -13
  20. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  21. package/dist/mcp-server.js +568 -368
  22. package/hermes-bundle/version.json +1 -1
  23. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  24. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  25. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  26. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  27. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  28. package/host-cp/observability/grafana-port-forward.sh +12 -2
  29. package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
  30. package/host-cp/observability/loki-ingest.sh +12 -2
  31. package/host-cp/observability/prom-no-double-grafana.sh +15 -5
  32. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  33. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  34. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  35. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  36. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  37. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  38. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  39. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  40. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  41. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  42. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  43. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  44. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  45. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  46. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  47. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  48. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  49. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  50. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  51. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  52. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  53. package/host-cp/src/plan-chat-service.mjs +147 -1
  54. package/package.json +1 -1
@@ -1,4 +1,4 @@
1
1
  {
2
- "bundledAt": "2026-05-22T07:50:42.070Z",
2
+ "bundledAt": "2026-05-22T11:04:18.339Z",
3
3
  "kgFirstSha": "29a9ccce1b115d049e375c4a90eb5cf7c123e610e2d0590270a4db2cdbc64a28"
4
4
  }
@@ -111,7 +111,7 @@ spec:
111
111
  # k3d), started by `olam upgrade` Step 0.7 — not inside this Pod.
112
112
  containers:
113
113
  - name: olam-host-cp
114
- image: ghcr.io/pleri/olam-host-cp@sha256:a71a02ad25f03c1481d8b5a4f3cf50614eb1f9b02376935e4df5c65b9bd4fa8f
114
+ image: ghcr.io/pleri/olam-host-cp@sha256:7a49b44546d9b69c5a7448613130a43319e90e06a2999d688101657d7d851dda
115
115
  imagePullPolicy: IfNotPresent
116
116
  securityContext:
117
117
  runAsNonRoot: true
@@ -70,7 +70,7 @@ spec:
70
70
  mountPath: /data
71
71
  containers:
72
72
  - name: olam-auth-service
73
- image: ghcr.io/pleri/olam-auth@sha256:7ad7f92e5feafff3921f3219886a2aec312d83e00c66eaa568e53aac03b19b16
73
+ image: ghcr.io/pleri/olam-auth@sha256:d41a940bc9eb7016aeecc1c653e057d63d32d33c1e694d298b5340711d3d0bd8
74
74
  imagePullPolicy: IfNotPresent
75
75
  securityContext:
76
76
  runAsNonRoot: true
@@ -61,7 +61,7 @@ spec:
61
61
  mountPath: /data
62
62
  containers:
63
63
  - name: olam-kg-service
64
- image: ghcr.io/pleri/olam-kg-service@sha256:9c01fd288e136116abfd0e34c7230417a30c5036e411838b88a553c44a802f13
64
+ image: ghcr.io/pleri/olam-kg-service@sha256:b9a96be3cad11f298286d011a88309ac2e495074970bf4d860c032709a5ab72f
65
65
  imagePullPolicy: IfNotPresent
66
66
  securityContext:
67
67
  runAsNonRoot: true
@@ -68,7 +68,7 @@ spec:
68
68
  mountPath: /data
69
69
  containers:
70
70
  - name: olam-mcp-auth-service
71
- image: ghcr.io/pleri/olam-mcp-auth@sha256:ddd15d5ee0b18ed36a8916c4d8d985182f6e32b57fc2625295f5db19a14b37a0
71
+ image: ghcr.io/pleri/olam-mcp-auth@sha256:0322f65701dfda84a2d0672071914fd7276927772ccccf6c5f55c4c3617cd8fe
72
72
  imagePullPolicy: IfNotPresent
73
73
  securityContext:
74
74
  runAsNonRoot: true
@@ -70,7 +70,7 @@ spec:
70
70
  # bootstrap-placeholder comment + run `npm run refresh:manifest-digests`
71
71
  # once ghcr.io/pleri/olam-memory-service has a real published digest.
72
72
  # bootstrap-placeholder: pre-publish; refresh after first release
73
- image: ghcr.io/pleri/olam-memory-service@sha256:8dd1593af37b345a9b9b741803355254f1f719d9bf23e56339d9baed8dea9ac1
73
+ image: ghcr.io/pleri/olam-memory-service@sha256:70ae9c81efe07d8105c109aea970105709fc4daa50b0d688aa5d299a39a8b24a
74
74
  imagePullPolicy: IfNotPresent
75
75
  securityContext:
76
76
  runAsNonRoot: true
@@ -115,7 +115,17 @@ log "Secret applied"
115
115
  # packages/peripheral-services/scripts/sync-grafana-dashboards.sh.
116
116
  # -------------------------------------------------------------------------
117
117
  REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
118
- CONFIGMAP_MANIFEST="$REPO_ROOT/packages/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml"
118
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
119
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
120
+ # peripheral-services/{helm-values,manifests} directory is reachable.
121
+ # Monorepo callers leave it unset; the script falls back to the source dir
122
+ # under packages/peripheral-services/.
123
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
124
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
125
+ else
126
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
127
+ fi
128
+ CONFIGMAP_MANIFEST="$PERIPHERAL_SERVICES_DIR/manifests/80-grafana-dashboard-configmap.yaml"
119
129
 
120
130
  if [[ -f "$CONFIGMAP_MANIFEST" ]]; then
121
131
  log "applying olam-dashboards ConfigMap from $CONFIGMAP_MANIFEST"
@@ -133,7 +143,7 @@ helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
133
143
  --version "$GRAFANA_CHART_VERSION" \
134
144
  --namespace "$NAMESPACE" \
135
145
  --create-namespace \
136
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/grafana-values.yaml" \
146
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
137
147
  --wait \
138
148
  --timeout 300s
139
149
 
@@ -56,6 +56,16 @@ log() { printf '[kyverno-mutate] %s\n' "$*" >&2; }
56
56
  fail() { printf '[kyverno-mutate] FAIL: %s\n' "$*" >&2; exit 1; }
57
57
 
58
58
  REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
59
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
60
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
61
+ # peripheral-services/{helm-values,manifests} directory is reachable.
62
+ # Monorepo callers leave it unset; the script falls back to the source dir
63
+ # under packages/peripheral-services/.
64
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
65
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
66
+ else
67
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
68
+ fi
59
69
 
60
70
  # -------------------------------------------------------------------------
61
71
  # Cleanup trap — kill port-forwards; remove synthetic resources on exit.
@@ -111,7 +121,7 @@ helm upgrade --install olam-kyverno kyverno/kyverno \
111
121
  --version "$KYVERNO_VERSION" \
112
122
  --namespace "$KYVERNO_NAMESPACE" \
113
123
  --create-namespace \
114
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/kyverno-values.yaml" \
124
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
115
125
  --wait --timeout 300s 2>&1 | tail -8
116
126
 
117
127
  # Sanity: kyverno-admission-controller Deployment Ready.
@@ -140,7 +150,7 @@ fi
140
150
  # Step 2: Apply the ClusterPolicy
141
151
  # -------------------------------------------------------------------------
142
152
  log "applying ClusterPolicy enforce-cardinality-labeldrop"
143
- kubectl apply -f "$REPO_ROOT/packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml"
153
+ kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/96-kyverno-cardinality-mutate.yaml"
144
154
 
145
155
  # Wait for policy to be Ready (Kyverno controller picks it up and reports
146
156
  # readiness in status.ready / .conditions).
@@ -66,6 +66,16 @@ log "pre-flight checks passed"
66
66
  # Resolve repo root so helm -f paths work regardless of invocation cwd
67
67
  # -------------------------------------------------------------------------
68
68
  REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
69
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
70
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
71
+ # peripheral-services/{helm-values,manifests} directory is reachable.
72
+ # Monorepo callers leave it unset; the script resolves the source dir under
73
+ # packages/peripheral-services/.
74
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
75
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
76
+ else
77
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
78
+ fi
69
79
 
70
80
  # -------------------------------------------------------------------------
71
81
  # Ensure grafana Helm repo is present (idempotent — safe to re-run)
@@ -81,7 +91,7 @@ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
81
91
  --version 6.7.4 \
82
92
  --namespace "$NAMESPACE" \
83
93
  --create-namespace \
84
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/loki-values.yaml" \
94
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
85
95
  --wait \
86
96
  --timeout 300s
87
97
 
@@ -94,7 +104,7 @@ log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
94
104
  helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
95
105
  --version 6.16.6 \
96
106
  --namespace "$NAMESPACE" \
97
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/promtail-values.yaml" \
107
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
98
108
  --wait \
99
109
  --timeout 120s
100
110
 
@@ -43,6 +43,16 @@ fail() { printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2; exit 1; }
43
43
  # Resolve repo root so helm -f paths work regardless of invocation cwd
44
44
  # -------------------------------------------------------------------------
45
45
  REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
46
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
47
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
48
+ # peripheral-services/{helm-values,manifests} directory is reachable.
49
+ # Monorepo callers leave it unset; the script falls back to the source dir
50
+ # under packages/peripheral-services/.
51
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
52
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
53
+ else
54
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
55
+ fi
46
56
 
47
57
  # -------------------------------------------------------------------------
48
58
  # Cleanup trap — kill port-forwards on exit; leave Helm releases in place
@@ -84,7 +94,7 @@ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stac
84
94
  --version "$PROM_CHART_VERSION" \
85
95
  --namespace "$NAMESPACE" \
86
96
  --create-namespace \
87
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/kube-prom-stack-values.yaml" \
97
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
88
98
  --wait \
89
99
  --timeout 600s
90
100
 
@@ -131,7 +141,7 @@ log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pi
131
141
  helm upgrade olam-loki grafana/loki \
132
142
  --version "$LOKI_CHART_VERSION" \
133
143
  --namespace "$NAMESPACE" \
134
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/loki-values.yaml" \
144
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
135
145
  --wait \
136
146
  --timeout 300s \
137
147
  --reuse-values \
@@ -142,7 +152,7 @@ log "olam-loki upgraded (ServiceMonitor enabled)"
142
152
  helm upgrade olam-promtail grafana/promtail \
143
153
  --version "$PROMTAIL_CHART_VERSION" \
144
154
  --namespace "$NAMESPACE" \
145
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/promtail-values.yaml" \
155
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
146
156
  --wait \
147
157
  --timeout 300s \
148
158
  --reuse-values \
@@ -153,7 +163,7 @@ log "olam-promtail upgraded (ServiceMonitor enabled)"
153
163
  helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
154
164
  --version "$GRAFANA_CHART_VERSION" \
155
165
  --namespace "$NAMESPACE" \
156
- -f "$REPO_ROOT/packages/peripheral-services/helm-values/grafana-values.yaml" \
166
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
157
167
  --wait \
158
168
  --timeout 300s \
159
169
  --reuse-values \
@@ -272,7 +282,7 @@ log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
272
282
  PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
273
283
 
274
284
  log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
275
- kubectl apply -f "$REPO_ROOT/packages/peripheral-services/manifests/95-prom-recording-rules.yaml"
285
+ kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml"
276
286
 
277
287
  # Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson).
278
288
  # Poll /api/v1/rules until our group appears (up to 180s).
@@ -0,0 +1,159 @@
1
+ # Grafana Helm values — k3s-ingress-observability Phase B Task B2
2
+ #
3
+ # STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
4
+ # - This is NOT the Grafana bundled with kube-prometheus-stack.
5
+ # - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
6
+ # explicitly to prevent a second Grafana Deployment from landing.
7
+ # - Port-forward only — NEVER expose via Traefik IngressRoute.
8
+ # See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
9
+ #
10
+ # Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
11
+ # Upgrade discipline: chart version is embedded in the e2e script comment.
12
+
13
+ # -------------------------------------------------------------------------
14
+ # Admin credentials — loaded from a pre-existing Secret, NOT from chart
15
+ # values. Secret is created by scripts/e2e/grafana-port-forward.sh before
16
+ # helm install, or by the operator following the procedure in
17
+ # packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
18
+ # The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
19
+ # (dogfood finding #4) because `kubectl apply` would overwrite the operator's
20
+ # pre-created Secret with the placeholder value.
21
+ # -------------------------------------------------------------------------
22
+ admin:
23
+ existingSecret: olam-grafana-admin
24
+ userKey: admin-user
25
+ passwordKey: admin-password
26
+
27
+ # -------------------------------------------------------------------------
28
+ # Service: ClusterIP only.
29
+ # Decision 16: port-forward only; never ingress-routed.
30
+ # Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
31
+ # -------------------------------------------------------------------------
32
+ service:
33
+ type: ClusterIP
34
+ port: 80
35
+
36
+ # -------------------------------------------------------------------------
37
+ # Ingress: disabled.
38
+ # Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
39
+ # Port-forward is the sole operator access path. Enabling ingress here would
40
+ # silently violate the access-control intent even if no IngressRoute manifest
41
+ # is committed.
42
+ # -------------------------------------------------------------------------
43
+ ingress:
44
+ enabled: false # Decision 16: port-forward only; never ingress-routed
45
+
46
+ # -------------------------------------------------------------------------
47
+ # Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
48
+ #
49
+ # Dual-chart pattern:
50
+ # - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
51
+ # sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
52
+ # - This standalone grafana/grafana chart (Phase B) is the only Grafana.
53
+ # - The Prometheus datasource URL points at `prometheus-operated`, which is
54
+ # the in-cluster Service that kube-prometheus-stack's Prometheus Operator
55
+ # creates for the managed Prometheus StatefulSet.
56
+ # - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
57
+ # so Grafana's step calculation aligns with actual data granularity.
58
+ # - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
59
+ # adds Tempo; Grafana silently ignores unknown datasource UIDs.
60
+ #
61
+ # editable: false prevents accidental operator drift across sessions.
62
+ # -------------------------------------------------------------------------
63
+ datasources:
64
+ datasources.yaml:
65
+ apiVersion: 1
66
+ datasources:
67
+ - name: Loki
68
+ type: loki
69
+ access: proxy
70
+ url: http://olam-loki.monitoring.svc.cluster.local:3100
71
+ isDefault: true
72
+ editable: false
73
+ - name: Prometheus
74
+ type: prometheus
75
+ access: proxy
76
+ url: http://prometheus-operated.monitoring.svc.cluster.local:9090
77
+ isDefault: false
78
+ editable: false
79
+ jsonData:
80
+ timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
81
+ exemplarTraceIdDestinations:
82
+ - name: trace_id
83
+ datasourceUid: tempo # Phase D may add Tempo; harmless until then
84
+
85
+ # -------------------------------------------------------------------------
86
+ # Dashboard provisioner: file-based ConfigMap mount.
87
+ # B3 lands the olam-dashboards ConfigMap and the actual JSON files.
88
+ # B2 wires the loader so B3's ConfigMap is picked up automatically.
89
+ # -------------------------------------------------------------------------
90
+ dashboardProviders:
91
+ dashboardproviders.yaml:
92
+ apiVersion: 1
93
+ providers:
94
+ - name: olam-default
95
+ orgId: 1
96
+ folder: 'Olam'
97
+ type: file
98
+ disableDeletion: true
99
+ updateIntervalSeconds: 30
100
+ allowUiUpdates: false
101
+ options:
102
+ path: /var/lib/grafana/dashboards/olam-default
103
+
104
+ # Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
105
+ # Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
106
+ # this is benign and does not block Grafana startup.
107
+ dashboardsConfigMaps:
108
+ olam-default: olam-dashboards # B3 creates this ConfigMap
109
+
110
+ # -------------------------------------------------------------------------
111
+ # Resources: tuned for single-operator k3s (<256Mi idle typical).
112
+ # P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
113
+ # -------------------------------------------------------------------------
114
+ resources:
115
+ requests:
116
+ cpu: 50m
117
+ memory: 128Mi
118
+ limits:
119
+ cpu: 200m
120
+ memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget
121
+
122
+ # -------------------------------------------------------------------------
123
+ # Persistence: disabled for Phase B.
124
+ # Grafana state (dashboards, users) lives in ConfigMaps / values files.
125
+ # Phase C may enable a PV if fine-grained alert state or annotations
126
+ # accumulate. For now, stateless Grafana is simpler and matches S2.
127
+ # -------------------------------------------------------------------------
128
+ persistence:
129
+ enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B
130
+
131
+ # -------------------------------------------------------------------------
132
+ # ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
133
+ # Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
134
+ # shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
135
+ # avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
136
+ # helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
137
+ # pre-CRD breaks the install on chart versions that hard-validate.
138
+ # -------------------------------------------------------------------------
139
+ serviceMonitor:
140
+ # Disabled in the source-of-truth values file so a standalone Phase B install
141
+ # (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
142
+ # The C1 e2e script flips this on at RUNTIME via
143
+ # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
144
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD.
145
+ enabled: false
146
+
147
+ # -------------------------------------------------------------------------
148
+ # Grafana.ini overrides: anonymous access stays disabled (default). Sets the
149
+ # server root_url so port-forward URLs render correctly in email / share links
150
+ # (cosmetic; not a security seam), and turns off telemetry, update checks and embedding.
151
+ # -------------------------------------------------------------------------
152
+ grafana.ini:
153
+ server:
154
+ root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
155
+ analytics:
156
+ reporting_enabled: false # no telemetry to grafana.com
157
+ check_for_updates: false
158
+ security:
159
+ allow_embedding: false
@@ -0,0 +1,229 @@
1
+ # kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
2
+ #
3
+ # Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
4
+ # (latest stable as of 2026-05-21).
5
+ # Upgrade discipline: pin in this file + e2e script comment must stay in sync.
6
+ #
7
+ # CRITICAL: grafana.enabled MUST stay false.
8
+ # Phase B ships a standalone grafana/grafana chart (olam-grafana release).
9
+ # kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
10
+ # a second Grafana Deployment from landing in the cluster.
11
+ # Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
12
+ # Enabling the sub-chart here would violate that decision and create two
13
+ # Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
14
+ # assertion.
15
+ #
16
+ # Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
17
+ # prometheus-operator: 128Mi req / 512Mi limit
18
+ # prometheus: 512Mi req / 2Gi limit
19
+ # node-exporter: 64Mi req / 128Mi limit
20
+ # kube-state-metrics: 128Mi req / 256Mi limit
21
+ # Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
22
+ #
23
+ # Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
24
+ # The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
25
+ # is advisory — the size cap enforces first.
26
+ #
27
+ # Alertmanager: C1 shipped with it disabled; C2 enabled it (alertmanager.enabled: true
28
+ # below) when the first alert rule (cardinality 80k) landed. Receivers are not yet
29
+ # configured in this file.
30
+
31
+ # -------------------------------------------------------------------------
32
+ # CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
33
+ #
34
+ # Goal: strip high-cardinality labels (world_id, trace_id, user_id,
35
+ # request_id, operator_id) from every scraped series BEFORE TSDB ingest.
36
+ #
37
+ # Architecture finding (helm template verified, 2026-05-21):
38
+ # The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
39
+ # field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
40
+ # metricRelabelings. There is no chart-level "apply to all scrapes" slot.
41
+ #
42
+ # Enforcement strategy (two-layer):
43
+ # Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
44
+ # ServiceMonitor the chart controls (alertmanager, coreDns, prometheusOperator,
45
+ # prometheus self-scrape, node-exporter). Belt-and-suspenders; these
46
+ # services don't emit world_id etc. in practice, but the rule is free.
47
+ # Note: kube-state-metrics sub-chart has no metricRelabelings slot in
48
+ # its prometheus.monitor section at chart version 85.2.0 — omitted.
49
+ # Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
50
+ # script's synthetic violator ServiceMonitor carries the same labeldrop
51
+ # rule (release: olam-prom label + metricRelabelings). New services
52
+ # MUST include the same block — enforced by docs + code review.
53
+ #
54
+ # Why labeldrop is the right action:
55
+ # action: labeldrop removes the matched labels from ALL series that carry
56
+ # them, regardless of metric name. This is the same semantic as Promtail's
57
+ # pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
58
+ # world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
59
+ #
60
+ # Regex covers all five taxonomy labels from observability-label-taxonomy:
61
+ # world_id, trace_id, user_id, request_id, operator_id
62
+ # -------------------------------------------------------------------------
63
+ _cardinalityLabeldrop: &cardinality-labeldrop
64
+ - action: labeldrop
65
+ regex: 'world_id|trace_id|user_id|request_id|operator_id'
66
+
67
+ # -------------------------------------------------------------------------
68
+ # HARD REQUIREMENT: grafana sub-chart is off.
69
+ # See top-of-file comment for rationale.
70
+ # -------------------------------------------------------------------------
71
+ grafana:
72
+ enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical
73
+
74
+ # -------------------------------------------------------------------------
75
+ # Alertmanager: enabled since C2 landed the first alert rule (OlamActiveSeriesHigh).
76
+ # History: C1 shipped without alertmanager; C2 enabled it when the first alert rule landed.
77
+ # -------------------------------------------------------------------------
78
+ alertmanager:
79
+ enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
80
+ serviceMonitor:
81
+ metricRelabelings: *cardinality-labeldrop
82
+
83
+ # -------------------------------------------------------------------------
84
+ # Default kube-controller-manager / scheduler / proxy / etcd monitors.
85
+ # These ServiceMonitors don't work on k3d/k3s because the endpoints are not
86
+ # exposed via the usual ports. Disabling avoids noisy "endpoint not found"
87
+ # warnings and scrape failures on every Prometheus eval cycle.
88
+ # -------------------------------------------------------------------------
89
+ kubeControllerManager:
90
+ enabled: false
91
+
92
+ kubeScheduler:
93
+ enabled: false
94
+
95
+ kubeProxy:
96
+ enabled: false
97
+
98
+ kubeEtcd:
99
+ enabled: false
100
+
101
+ # kube-apiserver and kubelet DO work on k3d but generate high-cardinality
102
+ # label combinations. Disable for now; re-evaluate when per-service /metrics
103
+ # (C3) and cardinality enforcement (C2) are in place.
104
+ kubeApiServer:
105
+ enabled: false
106
+
107
+ kubelet:
108
+ enabled: false
109
+
110
+ # -------------------------------------------------------------------------
111
+ # Default alerting rules: off.
112
+ # The bundled default rules generate Alertmanager receivers and PrometheusRule
113
+ # objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
114
+ # and add noise before C2's focused cardinality rule lands.
115
+ # C2 will add targeted PrometheusRule objects separately.
116
+ # -------------------------------------------------------------------------
117
+ defaultRules:
118
+ create: false
119
+
120
+ # -------------------------------------------------------------------------
121
+ # coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
122
+ # -------------------------------------------------------------------------
123
+ coreDns:
124
+ serviceMonitor:
125
+ metricRelabelings: *cardinality-labeldrop
126
+
127
+ # -------------------------------------------------------------------------
128
+ # CRDs: install via chart (default: true, explicit for clarity).
129
+ # These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
130
+ # before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
131
+ # Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
132
+ # Established before helm-upgrading the Phase B charts.
133
+ # -------------------------------------------------------------------------
134
+ crds:
135
+ enabled: true
136
+
137
+ # -------------------------------------------------------------------------
138
+ # Prometheus Operator
139
+ # -------------------------------------------------------------------------
140
+ prometheusOperator:
141
+ enabled: true
142
+ serviceMonitor:
143
+ metricRelabelings: *cardinality-labeldrop
144
+ resources:
145
+ requests:
146
+ cpu: 100m
147
+ memory: 128Mi
148
+ limits:
149
+ cpu: 500m
150
+ memory: 512Mi
151
+
152
+ # -------------------------------------------------------------------------
153
+ # Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
154
+ # -------------------------------------------------------------------------
155
+ prometheus:
156
+ serviceMonitor:
157
+ metricRelabelings: *cardinality-labeldrop
158
+ prometheusSpec:
159
+ scrapeInterval: 15s # Decision 14
160
+ evaluationInterval: 15s
161
+ retention: 15d # Decision 14 — advisory; size cap enforces first
162
+ retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
163
+ walCompression: true
164
+ enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
165
+ enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
166
+ logLevel: warn # info is noisy at 15s scrape cycle
167
+
168
+ resources:
169
+ requests:
170
+ cpu: 200m
171
+ memory: 512Mi
172
+ limits:
173
+ cpu: 1000m
174
+ memory: 2Gi
175
+
176
+ # PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
177
+ # local-path provisioner is used on k3d; cloud providers use their default SC.
178
+ storageSpec:
179
+ volumeClaimTemplate:
180
+ spec:
181
+ accessModes:
182
+ - ReadWriteOnce
183
+ resources:
184
+ requests:
185
+ storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments
186
+
187
+ # -------------------------------------------------------------------------
188
+ # Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
189
+ # -------------------------------------------------------------------------
190
+ nodeExporter:
191
+ enabled: true
192
+
193
+ prometheus-node-exporter:
194
+ prometheus:
195
+ monitor:
196
+ metricRelabelings: *cardinality-labeldrop
197
+ resources:
198
+ requests:
199
+ cpu: 30m
200
+ memory: 64Mi
201
+ limits:
202
+ cpu: 100m
203
+ memory: 128Mi
204
+
205
+ # -------------------------------------------------------------------------
206
+ # kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
207
+ # -------------------------------------------------------------------------
208
+ kubeStateMetrics:
209
+ enabled: true
210
+
211
+ kube-state-metrics:
212
+ resources:
213
+ requests:
214
+ cpu: 50m
215
+ memory: 128Mi
216
+ limits:
217
+ cpu: 200m
218
+ memory: 256Mi
219
+
220
+ # -------------------------------------------------------------------------
221
+ # Datasource auto-discovery note:
222
+ # kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
223
+ # is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
224
+ # in this same C1 PR to include a Prometheus datasource entry pointing at:
225
+ # http://prometheus-operated.monitoring.svc.cluster.local:9090
226
+ # This is the in-cluster Service that kube-prometheus-stack creates for the
227
+ # Prometheus StatefulSet (created by the Prometheus Operator from the
228
+ # Prometheus CR above).
229
+ # -------------------------------------------------------------------------