@pleri/olam-cli 0.1.160 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +11 -0
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts +15 -0
  4. package/dist/commands/bootstrap.d.ts.map +1 -1
  5. package/dist/commands/bootstrap.js +58 -5
  6. package/dist/commands/bootstrap.js.map +1 -1
  7. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  8. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  9. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  10. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  11. package/dist/commands/skills-source.d.ts.map +1 -1
  12. package/dist/commands/skills-source.js +57 -2
  13. package/dist/commands/skills-source.js.map +1 -1
  14. package/dist/commands/skills.d.ts.map +1 -1
  15. package/dist/commands/skills.js +14 -0
  16. package/dist/commands/skills.js.map +1 -1
  17. package/dist/image-digests.json +7 -7
  18. package/dist/index.js +2424 -1781
  19. package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
  20. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
  21. package/dist/lib/bootstrap-kubernetes.js +367 -0
  22. package/dist/lib/bootstrap-kubernetes.js.map +1 -0
  23. package/dist/lib/config.d.ts.map +1 -1
  24. package/dist/lib/config.js +6 -1
  25. package/dist/lib/config.js.map +1 -1
  26. package/dist/mcp-server.js +568 -368
  27. package/hermes-bundle/version.json +1 -1
  28. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  29. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  30. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  31. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  32. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  33. package/host-cp/observability/grafana-port-forward.sh +283 -0
  34. package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
  35. package/host-cp/observability/loki-ingest.sh +253 -0
  36. package/host-cp/observability/prom-no-double-grafana.sh +311 -0
  37. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  38. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  39. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  40. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  41. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  42. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  43. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  44. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  45. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  46. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  47. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  48. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  49. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  50. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  51. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  52. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  53. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  54. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  55. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  56. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  57. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  58. package/host-cp/src/plan-chat-service.mjs +147 -1
  59. package/package.json +1 -1
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env bash
2
+ # loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails,
3
+ # OAuth query-param scrubbing verified (code=REDACTED, no raw token).
4
+ #
5
+ # Usage: scripts/e2e/loki-ingest.sh
6
+ #
7
+ # Pre-conditions:
8
+ # - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
9
+ # - helm binary available
10
+ # - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
11
+ #
12
+ # This script is invoked by the A12 harness (scripts/test-ingress-integration/)
13
+ # after cluster-up.sh. It can also be run manually against any live cluster.
14
+ #
15
+ # Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
16
+ # existing cluster. The synthetic pod is cleaned up regardless of
17
+ # pass/fail via a trap.
18
+ #
19
+ # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1
20
+ # Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20)
21
+ # Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20)
22
+
23
+ set -euo pipefail
24
+
25
+ NAMESPACE="monitoring"
26
+ LOKI_RELEASE="olam-loki"
27
+ PROMTAIL_RELEASE="olam-promtail"
28
+ SYNTHETIC_POD="loki-e2e-synthetic"
29
+ LOKI_PORT="3100"
30
+ LOCAL_PORT="13100" # avoid conflict with any host-level Loki
31
+
32
+ # Magic-number commentary: Promtail's tail → ingest cycle involves:
33
+ # - inotify event (near-instant)
34
+ # - Promtail pipeline processing (~1s)
35
+ # - Loki write path (ingester chunk idle period: default 30m, but flush on
36
+ # query pressure; typically <5s in practice)
37
+ # 10s is conservative for a single log line in a lightly loaded cluster.
38
+ INGEST_LAG_SECONDS=10
39
+
40
# log MESSAGE... — write one progress line, tagged "[loki-ingest]", to stderr.
# All arguments are joined with single spaces (via "$*") to form the message.
log() {
  local message="$*"
  printf '%s %s\n' '[loki-ingest]' "$message" >&2
}
41
# fail MESSAGE... — write a "[loki-ingest] FAIL:" line to stderr and terminate
# the calling shell with exit status 1.
fail() {
  local reason="$*"
  printf '%s FAIL: %s\n' '[loki-ingest]' "$reason" >&2
  exit 1
}
42
+
43
+ # -------------------------------------------------------------------------
44
+ # Cleanup trap — remove synthetic pod and port-forward on exit
45
+ # -------------------------------------------------------------------------
46
+ PF_PID=""
47
# cleanup — EXIT-trap handler.
# Globals read: PF_PID (PID of the background port-forward, or empty),
#               SYNTHETIC_POD (name of the throw-away pod in namespace default).
# Every step is best-effort: errors are silenced so the handler itself never
# changes the outcome of the script.
cleanup() {
  # Only signal the port-forward if one was started and it is still running.
  if [[ -n "$PF_PID" ]]; then
    if kill -0 "$PF_PID" 2>/dev/null; then
      kill "$PF_PID" 2>/dev/null || true
    fi
  fi

  # Remove the synthetic pod whether or not it was ever created.
  kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
}
53
+ trap cleanup EXIT
54
+
55
+ # -------------------------------------------------------------------------
56
+ # Pre-flight
57
+ # -------------------------------------------------------------------------
58
+ command -v helm >/dev/null 2>&1 || fail "helm not installed"
59
+ command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed"
60
+ command -v curl >/dev/null 2>&1 || fail "curl not installed"
61
+ kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
62
+
63
+ log "pre-flight checks passed"
64
+
65
+ # -------------------------------------------------------------------------
66
+ # Resolve repo root so helm -f paths work regardless of invocation cwd
67
+ # -------------------------------------------------------------------------
68
+ REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
69
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
70
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
71
+ # peripheral-services/{helm-values,manifests} directory is reachable.
72
+ # Monorepo callers leave it unset; the script resolves the source dir under
73
+ # packages/peripheral-services/.
74
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
75
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
76
+ else
77
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
78
+ fi
79
+
80
+ # -------------------------------------------------------------------------
81
+ # Ensure grafana Helm repo is present (idempotent — safe to re-run)
82
+ # -------------------------------------------------------------------------
83
+ helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
84
+ helm repo update grafana
85
+
86
+ # -------------------------------------------------------------------------
87
+ # Step 1: Install / upgrade Loki (single-binary mode)
88
+ # -------------------------------------------------------------------------
89
+ log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"
90
+ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
91
+ --version 6.7.4 \
92
+ --namespace "$NAMESPACE" \
93
+ --create-namespace \
94
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
95
+ --wait \
96
+ --timeout 300s
97
+
98
+ log "loki helm install complete"
99
+
100
+ # -------------------------------------------------------------------------
101
+ # Step 2: Install / upgrade Promtail
102
+ # -------------------------------------------------------------------------
103
+ log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
104
+ helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
105
+ --version 6.16.6 \
106
+ --namespace "$NAMESPACE" \
107
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
108
+ --wait \
109
+ --timeout 120s
110
+
111
+ log "promtail helm install complete"
112
+
113
+ # -------------------------------------------------------------------------
114
+ # Step 3: Wait for Loki pod Ready
115
+ # -------------------------------------------------------------------------
116
+ log "waiting for Loki pod Ready (120s)"
117
+ kubectl wait \
118
+ --for=condition=ready pod \
119
+ -l app.kubernetes.io/name=loki \
120
+ -n "$NAMESPACE" \
121
+ --timeout=120s
122
+
123
+ log "loki pod Ready"
124
+
125
+ # -------------------------------------------------------------------------
126
+ # Step 4: Generate synthetic log line with raw OAuth tokens in URL and headers.
127
+ #
128
+ # The pod prints a single log line containing all 4 scrub patterns:
129
+ # ?code=SECRETTOKEN123 → code=REDACTED
130
+ # &access_token=SECRETTOKEN456 → access_token=REDACTED
131
+ # &state=SESSION789 → state=REDACTED
132
+ # Authorization: Bearer SECRETBEARER000 → Authorization: Bearer REDACTED
133
+ #
134
+ # Promtail tails it, runs the scrubbing pipeline, and pushes to Loki with all
135
+ # 4 raw tokens absent and all 4 REDACTED markers present.
136
+ # -------------------------------------------------------------------------
137
+ log "launching synthetic pod (prints all 4 raw token patterns)"
138
+ kubectl run "$SYNTHETIC_POD" \
139
+ --image=busybox \
140
+ --restart=Never \
141
+ -n default \
142
+ -- sh -c 'echo "GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000"'
143
+
144
+ # -------------------------------------------------------------------------
145
+ # Step 5: Wait for Promtail tail + ingest lag
146
+ # -------------------------------------------------------------------------
147
+ log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
148
+ sleep "$INGEST_LAG_SECONDS"
149
+
150
+ # -------------------------------------------------------------------------
151
+ # Step 6: Port-forward Loki and query
152
+ # -------------------------------------------------------------------------
153
log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
kubectl port-forward \
  "svc/${LOKI_RELEASE}" \
  "${LOCAL_PORT}:${LOKI_PORT}" \
  -n "$NAMESPACE" &
PF_PID=$!

# Wait until the tunnel actually answers instead of sleeping a fixed interval:
# poll Loki's /ready endpoint once per second, bail out immediately if the
# port-forward process has died, and give up after a bounded time.
PF_READY_TIMEOUT_SECONDS=30
pf_waited=0
until curl -sf -o /dev/null "http://localhost:${LOCAL_PORT}/ready"; do
  kill -0 "$PF_PID" 2>/dev/null \
    || fail "Loki port-forward process exited prematurely"
  if (( pf_waited >= PF_READY_TIMEOUT_SECONDS )); then
    fail "Loki not reachable on localhost:${LOCAL_PORT} after ${PF_READY_TIMEOUT_SECONDS}s"
  fi
  sleep 1
  pf_waited=$((pf_waited + 1))
done

# Query Loki for log lines from the default namespace within the last 5 minutes.
# The LogQL line filter keeps only lines containing "REDACTED"; the assertions
# that follow check that response for the four scrubbed markers and for any
# raw token that survived on those same lines.
log "querying Loki for scrubbed entries"

# Window bounds are epoch seconds; nine zeros are appended to express them in
# nanoseconds. Plain arithmetic avoids the BSD (-v) vs GNU (-d) date split.
query_end=$(date -u +%s)
query_start=$((query_end - 300))

# -S keeps curl's own error text visible, and the explicit check turns a
# transport failure into a labelled FAIL instead of a silent `set -e` abort.
# -f is deliberately not used so an HTTP error body still reaches the
# assertions' diagnostics.
if ! QUERY_RESPONSE=$(
  curl -sS -G \
    "http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
    --data-urlencode 'query={namespace="default"} |= "REDACTED"' \
    --data-urlencode "start=${query_start}000000000" \
    --data-urlencode "end=${query_end}000000000" \
    --data-urlencode 'limit=50'
); then
  fail "Loki query_range request failed on localhost:${LOCAL_PORT}"
fi
175
+
176
+ # -------------------------------------------------------------------------
177
+ # Step 7: Assertions — verify all 4 scrub patterns
178
+ #
179
+ # Contract (matches Phase B spec + promtail-values.yaml):
180
+ # ?code=SECRETTOKEN123 → code=REDACTED (absent: SECRETTOKEN123)
181
+ # &access_token=SECRETTOKEN456 → access_token=REDACTED (absent: SECRETTOKEN456)
182
+ # &state=SESSION789 → state=REDACTED (absent: SESSION789)
183
+ # Authorization: Bearer SECRETBEARER000 → Bearer REDACTED (absent: SECRETBEARER000)
184
+ # -------------------------------------------------------------------------
185
+ log "asserting scrubbing correctness (all 4 patterns)"
186
+
187
# diag — dump debugging context to stderr before a failed assertion exits.
# Globals read: QUERY_RESPONSE (raw body returned by the Loki query),
#               NAMESPACE (namespace the Promtail pods run in).
# Output: stderr only, so stdout stays clean for any caller capturing it.
# Returns: 0 even when the kubectl call fails (diagnostics are best-effort).
diag() {
  log "DIAGNOSTIC: Loki query response:"
  printf '%s\n' "$QUERY_RESPONSE" >&2
  log "DIAGNOSTIC: last 50 lines of Promtail logs:"
  # Only stdout needs redirecting; kubectl's stderr already goes to stderr.
  # (The previous `2>&1 >&2` order sent BOTH streams to stdout, because
  # redirections are applied left to right.)
  kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
}
193
+
194
# -------------------------------------------------------------------------
# Step 7 assertions. Every check reads the global QUERY_RESPONSE captured by
# the Loki query and, on failure, calls diag and then fail (which exits 1).
# -------------------------------------------------------------------------

# response_contains NEEDLE — succeed when the fixed string NEEDLE occurs in
# QUERY_RESPONSE. A here-string is used rather than `echo ... | grep -q` so
# that grep exiting on its first match cannot turn into a SIGPIPE pipeline
# failure under `set -o pipefail` when the response is large.
response_contains() {
  grep -qF -- "$1" <<<"$QUERY_RESPONSE"
}

# assert_scrubbed MARKER STAGE — fail unless the redaction MARKER is present.
# STAGE names the scrub stage in the failure message.
assert_scrubbed() {
  local marker=$1 stage=$2
  if ! response_contains "$marker"; then
    diag
    fail "'${marker}' not found in Loki response — ${stage} scrub stage not working"
  fi
}

# assert_token_absent TOKEN ORIGIN — fail if the raw TOKEN is still present.
# ORIGIN names where the token appeared in the synthetic log line.
assert_token_absent() {
  local token=$1 origin=$2
  if response_contains "$token"; then
    diag
    fail "raw token '${token}' (${origin}) found in Loki response — scrubbing pipeline is NOT working"
  fi
}

# Assertion 1: Loki returned a result block AND it is not the empty list.
# Checking only for the "result" key is not enough: a successful query with
# no matching lines still carries `"result":[]`, which would otherwise be
# misreported below as a scrub-stage failure.
if ! response_contains '"result"' \
  || grep -Eq '"result"[[:space:]]*:[[:space:]]*\[[[:space:]]*\]' <<<"$QUERY_RESPONSE"; then
  diag
  fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
fi

# --- Scrubbed markers present (assertions 2a-2d) ---
assert_scrubbed 'code=REDACTED' 'code='
assert_scrubbed 'access_token=REDACTED' 'access_token='
assert_scrubbed 'state=REDACTED' 'state='
assert_scrubbed 'Bearer REDACTED' 'Authorization Bearer'

# --- Raw tokens absent (assertions 3a-3d) ---
assert_token_absent 'SECRETTOKEN123' 'code='
assert_token_absent 'SECRETTOKEN456' 'access_token='
assert_token_absent 'SESSION789' 'state='
assert_token_absent 'SECRETBEARER000' 'Authorization Bearer'

log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
exit 0
@@ -0,0 +1,311 @@
1
+ #!/usr/bin/env bash
2
+ # prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test.
3
+ #
4
+ # Verifies:
5
+ # 1. kube-prometheus-stack installs (Prometheus pod becomes Ready).
6
+ # 2. ServiceMonitor CRD is Established before Phase B charts are upgraded.
7
+ # 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up
8
+ # serviceMonitor.enabled: true now that the CRD exists.
9
+ # 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana).
10
+ # 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource
11
+ # provisioned (from grafana-values.yaml datasources block added in C1).
12
+ # 6. Prometheus is scraping at least one active target.
13
+ #
14
+ # Pre-conditions:
15
+ # - kubectl context is set to a live k8s cluster.
16
+ # - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh)
17
+ # has already run: olam-loki, olam-promtail, and olam-grafana releases are installed.
18
+ # - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh).
19
+ # - helm, kubectl, curl, jq binaries available.
20
+ #
21
+ # Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21).
22
+ #
23
+ # Idempotency: helm upgrade --install is idempotent; re-runs on an existing
24
+ # cluster succeed. Port-forwards are killed on exit via trap.
25
+ #
26
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1
27
+
28
+ set -euo pipefail
29
+
30
+ NAMESPACE="monitoring"
31
+ PROM_RELEASE="olam-prom"
32
+ PROM_CHART_VERSION="85.2.0"
33
+ GRAFANA_RELEASE="olam-grafana"
34
+ GRAFANA_LOCAL_PORT="3001" # avoid collision if phase-b-e2e left a port-forward on 3000
35
+ GRAFANA_SVC_PORT="80"
36
+ PROM_LOCAL_PORT="9090"
37
+ PF_BIND_SECONDS=5
38
+
39
# log MESSAGE... — write one progress line, tagged "[prom-no-double-grafana]",
# to stderr. All arguments are joined with single spaces (via "$*").
log() {
  local message="$*"
  printf '%s %s\n' '[prom-no-double-grafana]' "$message" >&2
}
40
# fail MESSAGE... — write a "[prom-no-double-grafana] FAIL:" line to stderr
# and terminate the calling shell with exit status 1.
fail() {
  local reason="$*"
  printf '%s FAIL: %s\n' '[prom-no-double-grafana]' "$reason" >&2
  exit 1
}
41
+
42
+ # -------------------------------------------------------------------------
43
+ # Resolve repo root so helm -f paths work regardless of invocation cwd
44
+ # -------------------------------------------------------------------------
45
+ REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
46
+ # When invoked from a published @pleri/olam-cli install (no monorepo), `olam
47
+ # setup` exports OLAM_BUNDLE_ROOT=<install>/host-cp so the bundled
48
+ # peripheral-services/{helm-values,manifests} directory is reachable.
49
+ # Monorepo callers leave it unset; the script falls back to the source dir
50
+ # under packages/peripheral-services/.
51
+ if [[ -n "${OLAM_BUNDLE_ROOT:-}" ]]; then
52
+ PERIPHERAL_SERVICES_DIR="$OLAM_BUNDLE_ROOT/peripheral-services"
53
+ else
54
+ PERIPHERAL_SERVICES_DIR="$REPO_ROOT/packages/peripheral-services"
55
+ fi
56
+
57
+ # -------------------------------------------------------------------------
58
+ # Cleanup trap — kill port-forwards on exit; leave Helm releases in place
59
+ # -------------------------------------------------------------------------
60
+ GRAFANA_PF_PID=""
61
+ PROM_PF_PID=""
62
# cleanup — EXIT-trap handler: signal whichever background port-forwards were
# started. Helm releases are intentionally left in place.
# Globals read: GRAFANA_PF_PID, PROM_PF_PID (each a PID, or empty if that
# port-forward was never launched).
# Returns: always 0; a kill that fails (process already gone) is ignored.
cleanup() {
  local pid
  for pid in "$GRAFANA_PF_PID" "$PROM_PF_PID"; do
    if [[ -n "$pid" ]]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
}
66
+ trap cleanup EXIT
67
+
68
+ # -------------------------------------------------------------------------
69
+ # Pre-flight
70
+ # -------------------------------------------------------------------------
71
+ command -v helm >/dev/null 2>&1 || fail "helm not installed"
72
+ command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed"
73
+ command -v curl >/dev/null 2>&1 || fail "curl not installed"
74
+ command -v jq >/dev/null 2>&1 || fail "jq not installed"
75
+ kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
76
+
77
+ log "pre-flight checks passed"
78
+
79
+ # Verify Phase B pre-conditions
80
+ for release in olam-loki olam-promtail "$GRAFANA_RELEASE"; do
81
+ helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1 \
82
+ || fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first"
83
+ done
84
+ log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)"
85
+
86
+ # -------------------------------------------------------------------------
87
+ # Step 1: Add prometheus-community repo and install kube-prometheus-stack
88
+ # -------------------------------------------------------------------------
89
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
90
+ helm repo update prometheus-community
91
+
92
+ log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION"
93
+ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack \
94
+ --version "$PROM_CHART_VERSION" \
95
+ --namespace "$NAMESPACE" \
96
+ --create-namespace \
97
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
98
+ --wait \
99
+ --timeout 600s
100
+
101
+ log "kube-prometheus-stack helm install complete"
102
+
103
+ # -------------------------------------------------------------------------
104
+ # Step 2: Wait for ServiceMonitor CRD to be Established
105
+ # This is the gate before upgrading Phase B charts — the CRD must exist
106
+ # for serviceMonitor.enabled: true to produce a valid ServiceMonitor object.
107
+ # -------------------------------------------------------------------------
108
+ log "waiting for ServiceMonitor CRD to be Established (60s)"
109
+ kubectl wait \
110
+ --for=condition=established \
111
+ crd/servicemonitors.monitoring.coreos.com \
112
+ --timeout=60s
113
+
114
+ log "ServiceMonitor CRD Established"
115
+
116
+ # -------------------------------------------------------------------------
117
+ # Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME
118
+ #
119
+ # The source-of-truth values files keep serviceMonitor.enabled: false so a
120
+ # standalone Phase B install (without kube-prometheus-stack) does not
121
+ # hard-fail with "no matches for kind ServiceMonitor". We flip the toggle
122
+ # at runtime here, AFTER the CRD is Established, via --set overrides. This
123
+ # preserves Phase B's standalone-installability invariant while wiring
124
+ # Prometheus discovery when kube-prom-stack is present.
125
+ #
126
+ # NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor)
127
+ # — chart-version-specific path.
128
+ # -------------------------------------------------------------------------
129
+ # Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh +
130
+ # grafana-port-forward.sh. Without --version, helm pulls latest from the repo;
131
+ # the latest charts may reference new template values not present in our
132
+ # values files (e.g., Loki 6.8.x references .Values.loki.ui.enabled which is
133
+ # nil in our 6.7.4-shaped values, producing a nil-pointer template error
134
+ # during upgrade).
135
+ LOKI_CHART_VERSION="6.7.4"
136
+ PROMTAIL_CHART_VERSION="6.16.6"
137
+ GRAFANA_CHART_VERSION="8.5.2"
138
+
139
+ log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)"
140
+
141
+ helm upgrade olam-loki grafana/loki \
142
+ --version "$LOKI_CHART_VERSION" \
143
+ --namespace "$NAMESPACE" \
144
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
145
+ --wait \
146
+ --timeout 300s \
147
+ --reuse-values \
148
+ --set monitoring.serviceMonitor.enabled=true
149
+
150
+ log "olam-loki upgraded (ServiceMonitor enabled)"
151
+
152
+ helm upgrade olam-promtail grafana/promtail \
153
+ --version "$PROMTAIL_CHART_VERSION" \
154
+ --namespace "$NAMESPACE" \
155
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
156
+ --wait \
157
+ --timeout 300s \
158
+ --reuse-values \
159
+ --set serviceMonitor.enabled=true
160
+
161
+ log "olam-promtail upgraded (ServiceMonitor enabled)"
162
+
163
+ helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
164
+ --version "$GRAFANA_CHART_VERSION" \
165
+ --namespace "$NAMESPACE" \
166
+ -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
167
+ --wait \
168
+ --timeout 300s \
169
+ --reuse-values \
170
+ --set serviceMonitor.enabled=true
171
+
172
+ log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)"
173
+
174
+ # -------------------------------------------------------------------------
175
+ # Step 4: Wait for Prometheus pod Ready
176
+ # -------------------------------------------------------------------------
177
+ log "waiting for Prometheus pod Ready (300s)"
178
+ kubectl wait \
179
+ --for=condition=ready pod \
180
+ -l "app.kubernetes.io/name=prometheus" \
181
+ -n "$NAMESPACE" \
182
+ --timeout=300s
183
+
184
+ log "Prometheus pod Ready"
185
+
186
+ # -------------------------------------------------------------------------
187
+ # Step 5: Assertion — exactly one Grafana Deployment in the cluster
188
+ # This catches any regression where kube-prometheus-stack's bundled Grafana
189
+ # sub-chart accidentally gets enabled.
190
+ # -------------------------------------------------------------------------
191
+ log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE"
192
+ GRAFANA_DEPS=$(kubectl get deployment \
193
+ -n "$NAMESPACE" \
194
+ -l "app.kubernetes.io/name=grafana" \
195
+ -o name \
196
+ | wc -l \
197
+ | tr -d ' ')
198
+
199
+ if [ "$GRAFANA_DEPS" != "1" ]; then
200
+ log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS"
201
+ kubectl get deployment -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" >&2
202
+ fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false"
203
+ fi
204
+
205
+ log "PASS: exactly 1 Grafana Deployment found"
206
+
207
+ # -------------------------------------------------------------------------
208
+ # Step 6: Assertion — Grafana has exactly one Prometheus datasource
209
+ # Re-read the admin password from the Secret (grafana-port-forward.sh created it).
210
+ # Use port 3001 to avoid colliding with any live phase-b-e2e port-forward on 3000.
211
+ # -------------------------------------------------------------------------
212
+ log "reading admin password from Secret olam-grafana-admin"
213
+ GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin \
214
+ -n "$NAMESPACE" \
215
+ -o jsonpath='{.data.admin-password}' \
216
+ | base64 -d)
217
+
218
+ log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT"
219
+ kubectl port-forward \
220
+ -n "$NAMESPACE" \
221
+ "svc/$GRAFANA_RELEASE" \
222
+ "${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
223
+ GRAFANA_PF_PID=$!
224
+
225
+ log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind"
226
+ sleep "$PF_BIND_SECONDS"
227
+ kill -0 "$GRAFANA_PF_PID" 2>/dev/null \
228
+ || fail "Grafana port-forward process exited prematurely"
229
+
230
+ log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)"
231
+ DATASOURCES=$(curl -sf \
232
+ -u "admin:${GRAFANA_ADMIN_PW}" \
233
+ "http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources" \
234
+ || { kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true
235
+ fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"; })
236
+
237
+ if ! echo "$DATASOURCES" | jq -e 'map(select(.type == "prometheus")) | length == 1' >/dev/null 2>&1; then
238
+ log "FAIL: Grafana does not have exactly 1 Prometheus datasource"
239
+ echo "$DATASOURCES" | jq . >&2
240
+ fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml"
241
+ fi
242
+
243
+ PROM_URL=$(echo "$DATASOURCES" | jq -r 'map(select(.type == "prometheus")) | .[0].url')
244
+ log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)"
245
+
246
+ # -------------------------------------------------------------------------
247
+ # Step 7: Assertion — Prometheus is scraping at least one active target
248
+ # -------------------------------------------------------------------------
249
+ log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
250
+ kubectl port-forward \
251
+ -n "$NAMESPACE" \
252
+ "svc/prometheus-operated" \
253
+ "${PROM_LOCAL_PORT}:9090" &
254
+ PROM_PF_PID=$!
255
+
256
+ log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind"
257
+ sleep "$PF_BIND_SECONDS"
258
+ kill -0 "$PROM_PF_PID" 2>/dev/null \
259
+ || fail "Prometheus port-forward process exited prematurely"
260
+
261
+ log "querying Prometheus /api/v1/targets for active targets"
262
+ TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets" \
263
+ || fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT")
264
+
265
+ ACTIVE=$(echo "$TARGETS" | jq '.data.activeTargets | length')
266
+ if [ "$ACTIVE" -lt 1 ]; then
267
+ log "FAIL: Prometheus has 0 active scrape targets"
268
+ echo "$TARGETS" | jq '.data.activeTargets' >&2
269
+ fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig"
270
+ fi
271
+
272
+ log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
273
+
274
+ # -------------------------------------------------------------------------
275
+ # Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded
276
+ #
277
+ # The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (requires
278
+ # kube-prom-stack CRDs to exist). We kubectl apply it here, then poll
279
+ # /api/v1/rules until the olam-http-aggregations group appears.
280
+ # The port-forward on PROM_LOCAL_PORT is already open from Step 7 above.
281
+ # -------------------------------------------------------------------------
282
+ PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
283
+
284
+ log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
285
+ kubectl apply -f "$PERIPHERAL_SERVICES_DIR/manifests/95-prom-recording-rules.yaml"
286
+
287
# The operator has to reconcile the PrometheusRule and Prometheus has to
# reload its config before the group is visible (~60-90s per the C2 lesson),
# so poll /api/v1/rules every 10s for up to RECORDING_RULES_TIMEOUT seconds.
RECORDING_RULES_TIMEOUT=180
log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)"
elapsed=0
rules_loaded=false
until [[ "$rules_loaded" == true ]] || (( elapsed >= RECORDING_RULES_TIMEOUT )); do
  # jq -e exits non-zero when the select chain yields nothing, i.e. when the
  # group or the specific recording rule is not loaded yet.
  if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \
    | jq -e '.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")' >/dev/null 2>&1; then
    rules_loaded=true
    log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s"
  else
    sleep 10
    elapsed=$((elapsed + 10))
  fi
done

if [[ "$rules_loaded" != true ]]; then
  log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s"
  # Best-effort listing of the group names Prometheus did load.
  curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true
  fail "PrometheusRule not loaded by operator"
fi
306
+
307
+ # -------------------------------------------------------------------------
308
+ # Final
309
+ # -------------------------------------------------------------------------
310
+ log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified"
311
+ exit 0