@pleri/olam-cli 0.1.159 → 0.1.161

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/README.md +11 -0
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +3 -0
  3. package/dist/agent-stream/driver-runner.js +9 -4
  4. package/dist/agent-stream/host-driver-launch.js +48 -0
  5. package/dist/commands/bootstrap.d.ts +15 -0
  6. package/dist/commands/bootstrap.d.ts.map +1 -1
  7. package/dist/commands/bootstrap.js +30 -1
  8. package/dist/commands/bootstrap.js.map +1 -1
  9. package/dist/commands/flywheel/check-persona-skeleton.d.ts +30 -2
  10. package/dist/commands/flywheel/check-persona-skeleton.d.ts.map +1 -1
  11. package/dist/commands/flywheel/check-persona-skeleton.js +143 -6
  12. package/dist/commands/flywheel/check-persona-skeleton.js.map +1 -1
  13. package/dist/commands/flywheel/diversity-check.d.ts +12 -2
  14. package/dist/commands/flywheel/diversity-check.d.ts.map +1 -1
  15. package/dist/commands/flywheel/diversity-check.js +56 -6
  16. package/dist/commands/flywheel/diversity-check.js.map +1 -1
  17. package/dist/commands/flywheel/index.d.ts.map +1 -1
  18. package/dist/commands/flywheel/index.js +2 -0
  19. package/dist/commands/flywheel/index.js.map +1 -1
  20. package/dist/commands/flywheel/install-shims.d.ts +36 -3
  21. package/dist/commands/flywheel/install-shims.d.ts.map +1 -1
  22. package/dist/commands/flywheel/install-shims.js +118 -7
  23. package/dist/commands/flywheel/install-shims.js.map +1 -1
  24. package/dist/commands/flywheel/k10-measure.d.ts +12 -2
  25. package/dist/commands/flywheel/k10-measure.d.ts.map +1 -1
  26. package/dist/commands/flywheel/k10-measure.js +55 -6
  27. package/dist/commands/flywheel/k10-measure.js.map +1 -1
  28. package/dist/commands/flywheel/migrate-overlays.d.ts +115 -0
  29. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -0
  30. package/dist/commands/flywheel/migrate-overlays.js +766 -0
  31. package/dist/commands/flywheel/migrate-overlays.js.map +1 -0
  32. package/dist/commands/flywheel/sanitize-persona-output.d.ts +33 -2
  33. package/dist/commands/flywheel/sanitize-persona-output.d.ts.map +1 -1
  34. package/dist/commands/flywheel/sanitize-persona-output.js +94 -6
  35. package/dist/commands/flywheel/sanitize-persona-output.js.map +1 -1
  36. package/dist/commands/memory/index.d.ts.map +1 -1
  37. package/dist/commands/memory/index.js +2 -0
  38. package/dist/commands/memory/index.js.map +1 -1
  39. package/dist/commands/memory/install-hooks.d.ts +22 -0
  40. package/dist/commands/memory/install-hooks.d.ts.map +1 -0
  41. package/dist/commands/memory/install-hooks.js +156 -0
  42. package/dist/commands/memory/install-hooks.js.map +1 -0
  43. package/dist/commands/skills-doctor.js +2 -2
  44. package/dist/commands/skills-doctor.js.map +1 -1
  45. package/dist/commands/skills-source.d.ts.map +1 -1
  46. package/dist/commands/skills-source.js +10 -0
  47. package/dist/commands/skills-source.js.map +1 -1
  48. package/dist/commands/skills.d.ts.map +1 -1
  49. package/dist/commands/skills.js +169 -1
  50. package/dist/commands/skills.js.map +1 -1
  51. package/dist/image-digests.json +7 -7
  52. package/dist/index.js +4361 -1768
  53. package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
  54. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
  55. package/dist/lib/bootstrap-kubernetes.js +287 -0
  56. package/dist/lib/bootstrap-kubernetes.js.map +1 -0
  57. package/dist/lib/config.d.ts.map +1 -1
  58. package/dist/lib/config.js +6 -1
  59. package/dist/lib/config.js.map +1 -1
  60. package/dist/lib/flywheel-probes.d.ts +58 -0
  61. package/dist/lib/flywheel-probes.d.ts.map +1 -0
  62. package/dist/lib/flywheel-probes.js +163 -0
  63. package/dist/lib/flywheel-probes.js.map +1 -0
  64. package/dist/lib/shim-generator.d.ts +51 -0
  65. package/dist/lib/shim-generator.d.ts.map +1 -0
  66. package/dist/lib/shim-generator.js +88 -0
  67. package/dist/lib/shim-generator.js.map +1 -0
  68. package/dist/lib/skills-apply-overlays.d.ts +35 -0
  69. package/dist/lib/skills-apply-overlays.d.ts.map +1 -0
  70. package/dist/lib/skills-apply-overlays.js +243 -0
  71. package/dist/lib/skills-apply-overlays.js.map +1 -0
  72. package/dist/mcp-server.js +1106 -453
  73. package/hermes-bundle/version.json +1 -1
  74. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  75. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  76. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  77. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  78. package/host-cp/k8s/manifests/memory-service/30-configmap.yaml +11 -0
  79. package/host-cp/k8s/manifests/memory-service/35-configmap-iii-config.yaml +76 -0
  80. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +11 -1
  81. package/host-cp/observability/grafana-port-forward.sh +273 -0
  82. package/host-cp/observability/kyverno-cardinality-mutate.sh +452 -0
  83. package/host-cp/observability/loki-ingest.sh +243 -0
  84. package/host-cp/observability/prom-no-double-grafana.sh +301 -0
  85. package/host-cp/src/crystallize-planning.mjs +261 -0
  86. package/host-cp/src/plan-chat-service.mjs +84 -2
  87. package/host-cp/src/planning-sessions.mjs +270 -0
  88. package/package.json +1 -1
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env bash
2
+ # loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails,
3
+ # OAuth query-param scrubbing verified (code=REDACTED, no raw token).
4
+ #
5
+ # Usage: scripts/e2e/loki-ingest.sh
6
+ #
7
+ # Pre-conditions:
8
+ # - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
9
+ # - helm binary available
10
+ # - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
11
+ #
12
+ # This script is invoked by the A12 harness (scripts/test-ingress-integration/)
13
+ # after cluster-up.sh. It can also be run manually against any live cluster.
14
+ #
15
+ # Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
16
+ # existing cluster. The synthetic pod is cleaned up regardless of
17
+ # pass/fail via a trap.
18
+ #
19
+ # Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1
20
+ # Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20)
21
+ # Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20)
22
+
23
+ set -euo pipefail
24
+
25
+ NAMESPACE="monitoring"
26
+ LOKI_RELEASE="olam-loki"
27
+ PROMTAIL_RELEASE="olam-promtail"
28
+ SYNTHETIC_POD="loki-e2e-synthetic"
29
+ LOKI_PORT="3100"
30
+ LOCAL_PORT="13100" # avoid conflict with any host-level Loki
31
+
32
+ # Magic-number commentary: Promtail's tail → ingest cycle involves:
33
+ # - inotify event (near-instant)
34
+ # - Promtail pipeline processing (~1s)
35
+ # - Loki write path (ingester chunk idle period: default 30m, but flush on
36
+ # query pressure; typically <5s in practice)
37
+ # 10s is conservative for a single log line in a lightly loaded cluster.
38
+ INGEST_LAG_SECONDS=10
39
+
40
+ # log — print all arguments as a single line on stderr, prefixed with
+ # "[loki-ingest] ". stdout is left untouched.
+ log() { printf '[loki-ingest] %s\n' "$*" >&2; }
41
+ # fail — print all arguments on stderr as a "[loki-ingest] FAIL: ..." line,
+ # then terminate the current shell with exit status 1.
+ fail() { printf '[loki-ingest] FAIL: %s\n' "$*" >&2; exit 1; }
42
+
43
+ # -------------------------------------------------------------------------
44
+ # Cleanup trap — remove synthetic pod and port-forward on exit
45
+ # -------------------------------------------------------------------------
46
+ PF_PID=""
47
+ # cleanup — handler registered on EXIT. Stops the background port-forward
+ # when PF_PID is set and that process is still alive, then deletes the
+ # synthetic pod from the "default" namespace. Every step swallows its own
+ # errors so cleanup never changes the outcome of the run.
+ cleanup() {
48
+ # kill -0 sends no signal; it only tests whether the PID can be signalled.
+ if [[ -n "$PF_PID" ]] && kill -0 "$PF_PID" 2>/dev/null; then
49
+ kill "$PF_PID" 2>/dev/null || true
50
+ fi
51
+ kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
52
+ }
53
+ trap cleanup EXIT
54
+
55
+ # -------------------------------------------------------------------------
56
+ # Pre-flight
57
+ # -------------------------------------------------------------------------
58
+ command -v helm >/dev/null 2>&1 || fail "helm not installed"
59
+ command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed"
60
+ command -v curl >/dev/null 2>&1 || fail "curl not installed"
61
+ kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
62
+
63
+ log "pre-flight checks passed"
64
+
65
+ # -------------------------------------------------------------------------
66
+ # Resolve repo root so helm -f paths work regardless of invocation cwd
67
+ # -------------------------------------------------------------------------
68
+ REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
69
+
70
+ # -------------------------------------------------------------------------
71
+ # Ensure grafana Helm repo is present (idempotent — safe to re-run)
72
+ # -------------------------------------------------------------------------
73
+ helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
74
+ helm repo update grafana
75
+
76
+ # -------------------------------------------------------------------------
77
+ # Step 1: Install / upgrade Loki (single-binary mode)
78
+ # -------------------------------------------------------------------------
79
+ log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"
80
+ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
81
+ --version 6.7.4 \
82
+ --namespace "$NAMESPACE" \
83
+ --create-namespace \
84
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/loki-values.yaml" \
85
+ --wait \
86
+ --timeout 300s
87
+
88
+ log "loki helm install complete"
89
+
90
+ # -------------------------------------------------------------------------
91
+ # Step 2: Install / upgrade Promtail
92
+ # -------------------------------------------------------------------------
93
+ log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"
94
+ helm upgrade --install "$PROMTAIL_RELEASE" grafana/promtail \
95
+ --version 6.16.6 \
96
+ --namespace "$NAMESPACE" \
97
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/promtail-values.yaml" \
98
+ --wait \
99
+ --timeout 120s
100
+
101
+ log "promtail helm install complete"
102
+
103
+ # -------------------------------------------------------------------------
104
+ # Step 3: Wait for Loki pod Ready
105
+ # -------------------------------------------------------------------------
106
+ log "waiting for Loki pod Ready (120s)"
107
+ kubectl wait \
108
+ --for=condition=ready pod \
109
+ -l app.kubernetes.io/name=loki \
110
+ -n "$NAMESPACE" \
111
+ --timeout=120s
112
+
113
+ log "loki pod Ready"
114
+
115
+ # -------------------------------------------------------------------------
116
+ # Step 4: Generate synthetic log line with raw OAuth tokens in URL and headers.
117
+ #
118
+ # The pod prints a single log line containing all 4 scrub patterns:
119
+ # ?code=SECRETTOKEN123 → code=REDACTED
120
+ # &access_token=SECRETTOKEN456 → access_token=REDACTED
121
+ # &state=SESSION789 → state=REDACTED
122
+ # Authorization: Bearer SECRETBEARER000 → Authorization: Bearer REDACTED
123
+ #
124
+ # Promtail tails it, runs the scrubbing pipeline, and pushes to Loki with all
125
+ # 4 raw tokens absent and all 4 REDACTED markers present.
126
+ # -------------------------------------------------------------------------
127
+ log "launching synthetic pod (prints all 4 raw token patterns)"
128
+ kubectl run "$SYNTHETIC_POD" \
129
+ --image=busybox \
130
+ --restart=Never \
131
+ -n default \
132
+ -- sh -c 'echo "GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000"'
133
+
134
+ # -------------------------------------------------------------------------
135
+ # Step 5: Wait for Promtail tail + ingest lag
136
+ # -------------------------------------------------------------------------
137
+ log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
138
+ sleep "$INGEST_LAG_SECONDS"
139
+
140
+ # -------------------------------------------------------------------------
141
+ # Step 6: Port-forward Loki and query
142
+ # -------------------------------------------------------------------------
143
+ log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
144
+ kubectl port-forward \
145
+ "svc/${LOKI_RELEASE}" \
146
+ "${LOCAL_PORT}:${LOKI_PORT}" \
147
+ -n "$NAMESPACE" &
148
+ PF_PID=$!
149
+
150
+ # Give port-forward a moment to establish
151
+ sleep 2
152
+
153
+ # Query Loki for log lines from the default namespace within the last 5 minutes.
154
+ # The query keeps only lines containing "REDACTED" (a wholly unscrubbed line is
155
+ # not returned); Step 7 then checks all 4 markers and that no raw token remains.
156
+ log "querying Loki for scrubbed entries"
157
+ QUERY_RESPONSE=$(
158
+ curl -s -G \
159
+ "http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
160
+ --data-urlencode 'query={namespace="default"} |= "REDACTED"' \
161
+ --data-urlencode "start=$(date -u -v-5M +%s 2>/dev/null || date -u -d '5 minutes ago' +%s)000000000" \
162
+ --data-urlencode "end=$(date -u +%s)000000000" \
163
+ --data-urlencode 'limit=50'
164
+ )
165
+
166
+ # -------------------------------------------------------------------------
167
+ # Step 7: Assertions — verify all 4 scrub patterns
168
+ #
169
+ # Contract (matches Phase B spec + promtail-values.yaml):
170
+ # ?code=SECRETTOKEN123 → code=REDACTED (absent: SECRETTOKEN123)
171
+ # &access_token=SECRETTOKEN456 → access_token=REDACTED (absent: SECRETTOKEN456)
172
+ # &state=SESSION789 → state=REDACTED (absent: SESSION789)
173
+ # Authorization: Bearer SECRETBEARER000 → Bearer REDACTED (absent: SECRETBEARER000)
174
+ # -------------------------------------------------------------------------
175
+ log "asserting scrubbing correctness (all 4 patterns)"
176
+
177
+ # diag — dump failure diagnostics to stderr: the raw Loki query response held
+ # in $QUERY_RESPONSE, followed by the tail of the Promtail logs
+ # (kubectl logs --tail=50 for pods labelled app.kubernetes.io/name=promtail in
+ # $NAMESPACE). Best-effort: a failing `kubectl logs` never aborts the script.
+ #
+ # kubectl's stdout is redirected with a plain `>&2`; its stderr already goes
+ # to stderr. The earlier `2>&1 >&2` ordering pointed both streams at stdout,
+ # because redirections are applied left to right.
+ diag() {
178
+ log "DIAGNOSTIC: Loki query response:"
179
+ echo "$QUERY_RESPONSE" >&2
180
+ log "DIAGNOSTIC: last 50 lines of Promtail logs:"
181
+ kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
182
+ }
183
+
184
+ # Assertion 1: response contains a "result" field, i.e. Loki answered the query (an empty result list also passes here)
185
+ if ! echo "$QUERY_RESPONSE" | grep -q '"result"'; then
186
+ diag
187
+ fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
188
+ fi
189
+
190
+ # --- Scrubbed markers present ---
191
+
192
+ # Assertion 2a: code= is scrubbed
193
+ if ! echo "$QUERY_RESPONSE" | grep -q 'code=REDACTED'; then
194
+ diag
195
+ fail "'code=REDACTED' not found in Loki response — code= scrub stage not working"
196
+ fi
197
+
198
+ # Assertion 2b: access_token= is scrubbed
199
+ if ! echo "$QUERY_RESPONSE" | grep -q 'access_token=REDACTED'; then
200
+ diag
201
+ fail "'access_token=REDACTED' not found in Loki response — access_token= scrub stage not working"
202
+ fi
203
+
204
+ # Assertion 2c: state= is scrubbed
205
+ if ! echo "$QUERY_RESPONSE" | grep -q 'state=REDACTED'; then
206
+ diag
207
+ fail "'state=REDACTED' not found in Loki response — state= scrub stage not working"
208
+ fi
209
+
210
+ # Assertion 2d: Authorization Bearer is scrubbed
211
+ if ! echo "$QUERY_RESPONSE" | grep -q 'Bearer REDACTED'; then
212
+ diag
213
+ fail "'Bearer REDACTED' not found in Loki response — Authorization Bearer scrub stage not working"
214
+ fi
215
+
216
+ # --- Raw tokens absent ---
217
+
218
+ # Assertion 3a: raw code= token is absent
219
+ if echo "$QUERY_RESPONSE" | grep -q 'SECRETTOKEN123'; then
220
+ diag
221
+ fail "raw token 'SECRETTOKEN123' (code=) found in Loki response — scrubbing pipeline is NOT working"
222
+ fi
223
+
224
+ # Assertion 3b: raw access_token= token is absent
225
+ if echo "$QUERY_RESPONSE" | grep -q 'SECRETTOKEN456'; then
226
+ diag
227
+ fail "raw token 'SECRETTOKEN456' (access_token=) found in Loki response — scrubbing pipeline is NOT working"
228
+ fi
229
+
230
+ # Assertion 3c: raw state= token is absent
231
+ if echo "$QUERY_RESPONSE" | grep -q 'SESSION789'; then
232
+ diag
233
+ fail "raw token 'SESSION789' (state=) found in Loki response — scrubbing pipeline is NOT working"
234
+ fi
235
+
236
+ # Assertion 3d: raw Bearer token is absent
237
+ if echo "$QUERY_RESPONSE" | grep -q 'SECRETBEARER000'; then
238
+ diag
239
+ fail "raw token 'SECRETBEARER000' (Authorization Bearer) found in Loki response — scrubbing pipeline is NOT working"
240
+ fi
241
+
242
+ log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
243
+ exit 0
@@ -0,0 +1,301 @@
1
+ #!/usr/bin/env bash
2
+ # prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test.
3
+ #
4
+ # Verifies:
5
+ # 1. kube-prometheus-stack installs (Prometheus pod becomes Ready).
6
+ # 2. ServiceMonitor CRD is Established before Phase B charts are upgraded.
7
+ # 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up
8
+ # serviceMonitor.enabled: true now that the CRD exists.
9
+ # 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana).
10
+ # 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource
11
+ # provisioned (from grafana-values.yaml datasources block added in C1).
12
+ # 6. Prometheus is scraping at least one active target.
13
+ #
14
+ # Pre-conditions:
15
+ # - kubectl context is set to a live k8s cluster.
16
+ # - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh)
17
+ # has already run: olam-loki, olam-promtail, and olam-grafana releases are installed.
18
+ # - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh).
19
+ # - helm, kubectl, curl, jq binaries available.
20
+ #
21
+ # Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21).
22
+ #
23
+ # Idempotency: helm upgrade --install is idempotent; re-runs on an existing
24
+ # cluster succeed. Port-forwards are killed on exit via trap.
25
+ #
26
+ # Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1
27
+
28
+ set -euo pipefail
29
+
30
+ NAMESPACE="monitoring"
31
+ PROM_RELEASE="olam-prom"
32
+ PROM_CHART_VERSION="85.2.0"
33
+ GRAFANA_RELEASE="olam-grafana"
34
+ GRAFANA_LOCAL_PORT="3001" # avoid collision if phase-b-e2e left a port-forward on 3000
35
+ GRAFANA_SVC_PORT="80"
36
+ PROM_LOCAL_PORT="9090"
37
+ PF_BIND_SECONDS=5
38
+
39
+ # log — print all arguments as a single line on stderr, prefixed with
+ # "[prom-no-double-grafana] ". stdout is left untouched.
+ log() { printf '[prom-no-double-grafana] %s\n' "$*" >&2; }
40
+ # fail — print all arguments on stderr as a "[prom-no-double-grafana] FAIL: ..."
+ # line, then exit the current shell with status 1. When called inside a
+ # command substitution, only that subshell exits; the failed assignment then
+ # stops the script because `set -e` is active.
+ fail() { printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2; exit 1; }
41
+
42
+ # -------------------------------------------------------------------------
43
+ # Resolve repo root so helm -f paths work regardless of invocation cwd
44
+ # -------------------------------------------------------------------------
45
+ REPO_ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || pwd)"
46
+
47
+ # -------------------------------------------------------------------------
48
+ # Cleanup trap — kill port-forwards on exit; leave Helm releases in place
49
+ # -------------------------------------------------------------------------
50
+ GRAFANA_PF_PID=""
51
+ PROM_PF_PID=""
52
+ # cleanup — handler registered on EXIT. Terminates each kubectl port-forward
+ # whose PID variable is non-empty; Helm releases are left installed. Errors
+ # from kill are ignored, so the handler always returns success.
+ cleanup() {
53
+ if [[ -n "$GRAFANA_PF_PID" ]]; then kill "$GRAFANA_PF_PID" 2>/dev/null || true; fi
54
+ if [[ -n "$PROM_PF_PID" ]]; then kill "$PROM_PF_PID" 2>/dev/null || true; fi
55
+ }
56
+ trap cleanup EXIT
57
+
58
+ # -------------------------------------------------------------------------
59
+ # Pre-flight
60
+ # -------------------------------------------------------------------------
61
+ command -v helm >/dev/null 2>&1 || fail "helm not installed"
62
+ command -v kubectl >/dev/null 2>&1 || fail "kubectl not installed"
63
+ command -v curl >/dev/null 2>&1 || fail "curl not installed"
64
+ command -v jq >/dev/null 2>&1 || fail "jq not installed"
65
+ kubectl cluster-info >/dev/null 2>&1 || fail "kubectl: no reachable cluster; set KUBECONFIG"
66
+
67
+ log "pre-flight checks passed"
68
+
69
+ # Verify Phase B pre-conditions
70
+ for release in olam-loki olam-promtail "$GRAFANA_RELEASE"; do
71
+ helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1 \
72
+ || fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first"
73
+ done
74
+ log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)"
75
+
76
+ # -------------------------------------------------------------------------
77
+ # Step 1: Add prometheus-community repo and install kube-prometheus-stack
78
+ # -------------------------------------------------------------------------
79
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
80
+ helm repo update prometheus-community
81
+
82
+ log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION"
83
+ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack \
84
+ --version "$PROM_CHART_VERSION" \
85
+ --namespace "$NAMESPACE" \
86
+ --create-namespace \
87
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/kube-prom-stack-values.yaml" \
88
+ --wait \
89
+ --timeout 600s
90
+
91
+ log "kube-prometheus-stack helm install complete"
92
+
93
+ # -------------------------------------------------------------------------
94
+ # Step 2: Wait for ServiceMonitor CRD to be Established
95
+ # This is the gate before upgrading Phase B charts — the CRD must exist
96
+ # for serviceMonitor.enabled: true to produce a valid ServiceMonitor object.
97
+ # -------------------------------------------------------------------------
98
+ log "waiting for ServiceMonitor CRD to be Established (60s)"
99
+ kubectl wait \
100
+ --for=condition=established \
101
+ crd/servicemonitors.monitoring.coreos.com \
102
+ --timeout=60s
103
+
104
+ log "ServiceMonitor CRD Established"
105
+
106
+ # -------------------------------------------------------------------------
107
+ # Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME
108
+ #
109
+ # The source-of-truth values files keep serviceMonitor.enabled: false so a
110
+ # standalone Phase B install (without kube-prometheus-stack) does not
111
+ # hard-fail with "no matches for kind ServiceMonitor". We flip the toggle
112
+ # at runtime here, AFTER the CRD is Established, via --set overrides. This
113
+ # preserves Phase B's standalone-installability invariant while wiring
114
+ # Prometheus discovery when kube-prom-stack is present.
115
+ #
116
+ # NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor)
117
+ # — chart-version-specific path.
118
+ # -------------------------------------------------------------------------
119
+ # Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh +
120
+ # grafana-port-forward.sh. Without --version, helm pulls latest from the repo;
121
+ # the latest charts may reference new template values not present in our
122
+ # values files (e.g., Loki 6.8.x references .Values.loki.ui.enabled which is
123
+ # nil in our 6.7.4-shaped values, producing a nil-pointer template error
124
+ # during upgrade).
125
+ LOKI_CHART_VERSION="6.7.4"
126
+ PROMTAIL_CHART_VERSION="6.16.6"
127
+ GRAFANA_CHART_VERSION="8.5.2"
128
+
129
+ log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)"
130
+
131
+ helm upgrade olam-loki grafana/loki \
132
+ --version "$LOKI_CHART_VERSION" \
133
+ --namespace "$NAMESPACE" \
134
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/loki-values.yaml" \
135
+ --wait \
136
+ --timeout 300s \
137
+ --reuse-values \
138
+ --set monitoring.serviceMonitor.enabled=true
139
+
140
+ log "olam-loki upgraded (ServiceMonitor enabled)"
141
+
142
+ helm upgrade olam-promtail grafana/promtail \
143
+ --version "$PROMTAIL_CHART_VERSION" \
144
+ --namespace "$NAMESPACE" \
145
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/promtail-values.yaml" \
146
+ --wait \
147
+ --timeout 300s \
148
+ --reuse-values \
149
+ --set serviceMonitor.enabled=true
150
+
151
+ log "olam-promtail upgraded (ServiceMonitor enabled)"
152
+
153
+ helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
154
+ --version "$GRAFANA_CHART_VERSION" \
155
+ --namespace "$NAMESPACE" \
156
+ -f "$REPO_ROOT/packages/peripheral-services/helm-values/grafana-values.yaml" \
157
+ --wait \
158
+ --timeout 300s \
159
+ --reuse-values \
160
+ --set serviceMonitor.enabled=true
161
+
162
+ log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)"
163
+
164
+ # -------------------------------------------------------------------------
165
+ # Step 4: Wait for Prometheus pod Ready
166
+ # -------------------------------------------------------------------------
167
+ log "waiting for Prometheus pod Ready (300s)"
168
+ kubectl wait \
169
+ --for=condition=ready pod \
170
+ -l "app.kubernetes.io/name=prometheus" \
171
+ -n "$NAMESPACE" \
172
+ --timeout=300s
173
+
174
+ log "Prometheus pod Ready"
175
+
176
+ # -------------------------------------------------------------------------
177
+ # Step 5: Assertion — exactly one Grafana Deployment in the cluster
178
+ # This catches any regression where kube-prometheus-stack's bundled Grafana
179
+ # sub-chart accidentally gets enabled.
180
+ # -------------------------------------------------------------------------
181
+ log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE"
182
+ GRAFANA_DEPS=$(kubectl get deployment \
183
+ -n "$NAMESPACE" \
184
+ -l "app.kubernetes.io/name=grafana" \
185
+ -o name \
186
+ | wc -l \
187
+ | tr -d ' ')
188
+
189
+ if [ "$GRAFANA_DEPS" != "1" ]; then
190
+ log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS"
191
+ kubectl get deployment -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" >&2
192
+ fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false"
193
+ fi
194
+
195
+ log "PASS: exactly 1 Grafana Deployment found"
196
+
197
+ # -------------------------------------------------------------------------
198
+ # Step 6: Assertion — Grafana has exactly one Prometheus datasource
199
+ # Re-read the admin password from the Secret (grafana-port-forward.sh created it).
200
+ # Use port 3001 to avoid colliding with any live phase-b-e2e port-forward on 3000.
201
+ # -------------------------------------------------------------------------
202
+ log "reading admin password from Secret olam-grafana-admin"
203
+ GRAFANA_ADMIN_PW=$(kubectl get secret olam-grafana-admin \
204
+ -n "$NAMESPACE" \
205
+ -o jsonpath='{.data.admin-password}' \
206
+ | base64 -d)
207
+
208
+ log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT"
209
+ kubectl port-forward \
210
+ -n "$NAMESPACE" \
211
+ "svc/$GRAFANA_RELEASE" \
212
+ "${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
213
+ GRAFANA_PF_PID=$!
214
+
215
+ log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind"
216
+ sleep "$PF_BIND_SECONDS"
217
+ kill -0 "$GRAFANA_PF_PID" 2>/dev/null \
218
+ || fail "Grafana port-forward process exited prematurely"
219
+
220
+ log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)"
221
+ DATASOURCES=$(curl -sf \
222
+ -u "admin:${GRAFANA_ADMIN_PW}" \
223
+ "http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources" \
224
+ || { kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true
225
+ fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"; })
226
+
227
+ if ! echo "$DATASOURCES" | jq -e 'map(select(.type == "prometheus")) | length == 1' >/dev/null 2>&1; then
228
+ log "FAIL: Grafana does not have exactly 1 Prometheus datasource"
229
+ echo "$DATASOURCES" | jq . >&2
230
+ fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml"
231
+ fi
232
+
233
+ PROM_URL=$(echo "$DATASOURCES" | jq -r 'map(select(.type == "prometheus")) | .[0].url')
234
+ log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)"
235
+
236
+ # -------------------------------------------------------------------------
237
+ # Step 7: Assertion — Prometheus is scraping at least one active target
238
+ # -------------------------------------------------------------------------
239
+ log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
240
+ kubectl port-forward \
241
+ -n "$NAMESPACE" \
242
+ "svc/prometheus-operated" \
243
+ "${PROM_LOCAL_PORT}:9090" &
244
+ PROM_PF_PID=$!
245
+
246
+ log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind"
247
+ sleep "$PF_BIND_SECONDS"
248
+ kill -0 "$PROM_PF_PID" 2>/dev/null \
249
+ || fail "Prometheus port-forward process exited prematurely"
250
+
251
+ log "querying Prometheus /api/v1/targets for active targets"
252
+ TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets" \
253
+ || fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT")
254
+
255
+ ACTIVE=$(echo "$TARGETS" | jq '.data.activeTargets | length')
256
+ if [ "$ACTIVE" -lt 1 ]; then
257
+ log "FAIL: Prometheus has 0 active scrape targets"
258
+ echo "$TARGETS" | jq '.data.activeTargets' >&2
259
+ fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig"
260
+ fi
261
+
262
+ log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
263
+
264
+ # -------------------------------------------------------------------------
265
+ # Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded
266
+ #
267
+ # The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (requires
268
+ # kube-prom-stack CRDs to exist). We kubectl apply it here, then poll
269
+ # /api/v1/rules until the olam-http-aggregations group appears.
270
+ # The port-forward on PROM_LOCAL_PORT is already open from Step 7 above.
271
+ # -------------------------------------------------------------------------
272
+ PROM_URL="http://localhost:${PROM_LOCAL_PORT}"
273
+
274
+ log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
275
+ kubectl apply -f "$REPO_ROOT/packages/peripheral-services/manifests/95-prom-recording-rules.yaml"
276
+
277
+ # Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson).
278
+ # Poll /api/v1/rules until our group appears (up to 180s).
279
+ RECORDING_RULES_TIMEOUT=180
280
+ log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)"
281
+ elapsed=0
282
+ while [ "$elapsed" -lt "$RECORDING_RULES_TIMEOUT" ]; do
283
+ if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \
284
+ | jq -e '.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")' >/dev/null 2>&1; then
285
+ log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s"
286
+ break
287
+ fi
288
+ sleep 10
289
+ elapsed=$((elapsed + 10))
290
+ done
291
+ if [ "$elapsed" -ge "$RECORDING_RULES_TIMEOUT" ]; then
292
+ log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s"
293
+ curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true
294
+ fail "PrometheusRule not loaded by operator"
295
+ fi
296
+
297
+ # -------------------------------------------------------------------------
298
+ # Final
299
+ # -------------------------------------------------------------------------
300
+ log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified"
301
+ exit 0