@pleri/olam-cli 0.1.159 → 0.1.161
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/agent-stream/agent-sdk-to-chunks.js +3 -0
- package/dist/agent-stream/driver-runner.js +9 -4
- package/dist/agent-stream/host-driver-launch.js +48 -0
- package/dist/commands/bootstrap.d.ts +15 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +30 -1
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/check-persona-skeleton.d.ts +30 -2
- package/dist/commands/flywheel/check-persona-skeleton.d.ts.map +1 -1
- package/dist/commands/flywheel/check-persona-skeleton.js +143 -6
- package/dist/commands/flywheel/check-persona-skeleton.js.map +1 -1
- package/dist/commands/flywheel/diversity-check.d.ts +12 -2
- package/dist/commands/flywheel/diversity-check.d.ts.map +1 -1
- package/dist/commands/flywheel/diversity-check.js +56 -6
- package/dist/commands/flywheel/diversity-check.js.map +1 -1
- package/dist/commands/flywheel/index.d.ts.map +1 -1
- package/dist/commands/flywheel/index.js +2 -0
- package/dist/commands/flywheel/index.js.map +1 -1
- package/dist/commands/flywheel/install-shims.d.ts +36 -3
- package/dist/commands/flywheel/install-shims.d.ts.map +1 -1
- package/dist/commands/flywheel/install-shims.js +118 -7
- package/dist/commands/flywheel/install-shims.js.map +1 -1
- package/dist/commands/flywheel/k10-measure.d.ts +12 -2
- package/dist/commands/flywheel/k10-measure.d.ts.map +1 -1
- package/dist/commands/flywheel/k10-measure.js +55 -6
- package/dist/commands/flywheel/k10-measure.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +115 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -0
- package/dist/commands/flywheel/migrate-overlays.js +766 -0
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -0
- package/dist/commands/flywheel/sanitize-persona-output.d.ts +33 -2
- package/dist/commands/flywheel/sanitize-persona-output.d.ts.map +1 -1
- package/dist/commands/flywheel/sanitize-persona-output.js +94 -6
- package/dist/commands/flywheel/sanitize-persona-output.js.map +1 -1
- package/dist/commands/memory/index.d.ts.map +1 -1
- package/dist/commands/memory/index.js +2 -0
- package/dist/commands/memory/index.js.map +1 -1
- package/dist/commands/memory/install-hooks.d.ts +22 -0
- package/dist/commands/memory/install-hooks.d.ts.map +1 -0
- package/dist/commands/memory/install-hooks.js +156 -0
- package/dist/commands/memory/install-hooks.js.map +1 -0
- package/dist/commands/skills-doctor.js +2 -2
- package/dist/commands/skills-doctor.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +10 -0
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +169 -1
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +4361 -1768
- package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
- package/dist/lib/bootstrap-kubernetes.js +287 -0
- package/dist/lib/bootstrap-kubernetes.js.map +1 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +6 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/flywheel-probes.d.ts +58 -0
- package/dist/lib/flywheel-probes.d.ts.map +1 -0
- package/dist/lib/flywheel-probes.js +163 -0
- package/dist/lib/flywheel-probes.js.map +1 -0
- package/dist/lib/shim-generator.d.ts +51 -0
- package/dist/lib/shim-generator.d.ts.map +1 -0
- package/dist/lib/shim-generator.js +88 -0
- package/dist/lib/shim-generator.js.map +1 -0
- package/dist/lib/skills-apply-overlays.d.ts +35 -0
- package/dist/lib/skills-apply-overlays.d.ts.map +1 -0
- package/dist/lib/skills-apply-overlays.js +243 -0
- package/dist/lib/skills-apply-overlays.js.map +1 -0
- package/dist/mcp-server.js +1106 -453
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/30-configmap.yaml +11 -0
- package/host-cp/k8s/manifests/memory-service/35-configmap-iii-config.yaml +76 -0
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +11 -1
- package/host-cp/observability/grafana-port-forward.sh +273 -0
- package/host-cp/observability/kyverno-cardinality-mutate.sh +452 -0
- package/host-cp/observability/loki-ingest.sh +243 -0
- package/host-cp/observability/prom-no-double-grafana.sh +301 -0
- package/host-cp/src/crystallize-planning.mjs +261 -0
- package/host-cp/src/plan-chat-service.mjs +84 -2
- package/host-cp/src/planning-sessions.mjs +270 -0
- package/package.json +1 -1
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# loki-ingest.sh — e2e smoke test: Loki single-binary installs, Promtail tails,
|
|
3
|
+
# OAuth query-param scrubbing verified (code=REDACTED, no raw token).
|
|
4
|
+
#
|
|
5
|
+
# Usage: scripts/e2e/loki-ingest.sh
|
|
6
|
+
#
|
|
7
|
+
# Pre-conditions:
|
|
8
|
+
# - kubectl context is set to a live k8s cluster (does NOT spin up k3d)
|
|
9
|
+
# - helm binary available
|
|
10
|
+
# - grafana Helm repo added (helm repo add grafana https://grafana.github.io/helm-charts)
|
|
11
|
+
#
|
|
12
|
+
# This script is invoked by the A12 harness (scripts/test-ingress-integration/)
|
|
13
|
+
# after cluster-up.sh. It can also be run manually against any live cluster.
|
|
14
|
+
#
|
|
15
|
+
# Idempotency: `helm upgrade --install` is idempotent; re-runs succeed on an
|
|
16
|
+
# existing cluster. The synthetic pod is cleaned up regardless of
|
|
17
|
+
# pass/fail via a trap.
|
|
18
|
+
#
|
|
19
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-b-tasks.md — Task B1
|
|
20
|
+
# Chart: grafana/loki 6.7.4 (pinned; latest stable 2026-05-20)
|
|
21
|
+
# Chart: grafana/promtail 6.16.6 (latest stable 2026-05-20)
|
|
22
|
+
|
|
23
|
+
set -euo pipefail

# Release and namespace coordinates used by every helm/kubectl call below.
NAMESPACE='monitoring'
LOKI_RELEASE='olam-loki'
PROMTAIL_RELEASE='olam-promtail'
SYNTHETIC_POD='loki-e2e-synthetic'

# Remote (service) port and local end of the Loki port-forward.
LOKI_PORT='3100'
LOCAL_PORT='13100'  # avoid conflict with any host-level Loki

# Budget for the tail → ingest cycle before Loki is queried. The cycle is:
#   - inotify event (near-instant)
#   - Promtail pipeline processing (~1s)
#   - Loki write path (ingester chunk idle period: default 30m, but flush on
#     query pressure; typically <5s in practice)
# 10s is conservative for a single log line in a lightly loaded cluster.
INGEST_LAG_SECONDS=10
|
|
39
|
+
|
|
40
|
+
# log MESSAGE...
# Writes the arguments, joined into one line and prefixed with the script
# tag, to stderr.
log() {
  printf '[loki-ingest] %s\n' "$*" >&2
}
|
|
41
|
+
# fail MESSAGE...
# Writes a tagged FAIL line to stderr and terminates the script with
# exit status 1.
fail() {
  printf '[loki-ingest] FAIL: %s\n' "$*" >&2
  exit 1
}
|
|
42
|
+
|
|
43
|
+
# -------------------------------------------------------------------------
# Cleanup trap — stop the port-forward and remove the synthetic pod on exit
# -------------------------------------------------------------------------
# PID of the background `kubectl port-forward`; empty until it is started.
PF_PID=""

# cleanup: EXIT handler, best-effort by design (every step tolerates failure).
#   Globals: PF_PID (read), SYNTHETIC_POD (read)
#   1. Terminates the port-forward if one was started, then waits for it so
#      the process is actually gone (and its local port released) before the
#      script returns.
#   2. Deletes the synthetic pod from the default namespace; a missing pod is
#      not an error.
cleanup() {
  if [[ -n "$PF_PID" ]]; then
    kill "$PF_PID" 2>/dev/null || true
    # Reap the child; stderr is silenced to hide the shell's "Terminated"
    # job notice, and a non-zero status (143, or 127 if already gone) is
    # expected here.
    wait "$PF_PID" 2>/dev/null || true
  fi
  kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true 2>/dev/null || true
}
trap cleanup EXIT
|
|
54
|
+
|
|
55
|
+
# -------------------------------------------------------------------------
# Pre-flight: required binaries on PATH and a reachable cluster
# -------------------------------------------------------------------------
for required_tool in helm kubectl curl; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    fail "$required_tool not installed"
  fi
done

if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi

log "pre-flight checks passed"
|
|
64
|
+
|
|
65
|
+
# -------------------------------------------------------------------------
# Locate the repository root so the helm -f paths below do not depend on the
# caller's working directory. Falls back to the current directory when git
# cannot report a top-level for the script's directory.
# -------------------------------------------------------------------------
script_dir="$(dirname "$0")"
if ! REPO_ROOT="$(git -C "$script_dir" rev-parse --show-toplevel 2>/dev/null)"; then
  REPO_ROOT="$(pwd)"
fi
|
|
69
|
+
|
|
70
|
+
# -------------------------------------------------------------------------
# Make sure the grafana Helm repo is registered. `repo add` errors are
# deliberately ignored so a re-run with the repo already present is harmless.
# -------------------------------------------------------------------------
helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
helm repo update grafana

# -------------------------------------------------------------------------
# Step 1: Install / upgrade Loki (single-binary mode), chart pinned to 6.7.4
# -------------------------------------------------------------------------
log "installing grafana/loki ($LOKI_RELEASE) in namespace $NAMESPACE"

loki_helm_args=(
  upgrade --install "$LOKI_RELEASE" grafana/loki
  --version 6.7.4
  --namespace "$NAMESPACE"
  --create-namespace
  -f "$REPO_ROOT/packages/peripheral-services/helm-values/loki-values.yaml"
  --wait
  --timeout 300s
)
helm "${loki_helm_args[@]}"

log "loki helm install complete"
|
|
89
|
+
|
|
90
|
+
# -------------------------------------------------------------------------
# Step 2: Install / upgrade Promtail, chart pinned to 6.16.6
# -------------------------------------------------------------------------
log "installing grafana/promtail ($PROMTAIL_RELEASE) in namespace $NAMESPACE"

promtail_helm_args=(
  upgrade --install "$PROMTAIL_RELEASE" grafana/promtail
  --version 6.16.6
  --namespace "$NAMESPACE"
  -f "$REPO_ROOT/packages/peripheral-services/helm-values/promtail-values.yaml"
  --wait
  --timeout 120s
)
helm "${promtail_helm_args[@]}"

log "promtail helm install complete"

# -------------------------------------------------------------------------
# Step 3: Block until the Loki pod reports Ready (fails the script on timeout)
# -------------------------------------------------------------------------
log "waiting for Loki pod Ready (120s)"

loki_wait_args=(
  --for=condition=ready pod
  -l app.kubernetes.io/name=loki
  -n "$NAMESPACE"
  --timeout=120s
)
kubectl wait "${loki_wait_args[@]}"

log "loki pod Ready"
|
|
114
|
+
|
|
115
|
+
# -------------------------------------------------------------------------
# Step 4: Generate synthetic log line with raw OAuth tokens in URL and headers.
#
# The pod prints a single log line containing all 4 scrub patterns:
#   ?code=SECRETTOKEN123                    → code=REDACTED
#   &access_token=SECRETTOKEN456            → access_token=REDACTED
#   &state=SESSION789                       → state=REDACTED
#   Authorization: Bearer SECRETBEARER000   → Authorization: Bearer REDACTED
#
# Promtail is expected to tail it, run the scrubbing pipeline, and push it to
# Loki with all 4 raw tokens absent and all 4 REDACTED markers present.
# -------------------------------------------------------------------------
log "launching synthetic pod (prints all 4 raw token patterns)"

# A pod left behind by an interrupted earlier run (one where the EXIT trap
# never fired) would make `kubectl run` fail with AlreadyExists, so remove
# any leftover first. Best-effort: a missing pod is the normal case.
kubectl delete pod "$SYNTHETIC_POD" -n default --ignore-not-found=true >/dev/null 2>&1 || true

kubectl run "$SYNTHETIC_POD" \
  --image=busybox \
  --restart=Never \
  -n default \
  -- sh -c 'echo "GET http://example.com/callback?code=SECRETTOKEN123&access_token=SECRETTOKEN456&state=SESSION789 HTTP/1.1 Authorization: Bearer SECRETBEARER000"'

# The ingest-lag wait that follows only makes sense once the line has been
# printed; image pull and scheduling time are not part of that budget. Wait
# for the pod to run to completion before starting the clock.
kubectl wait \
  --for=jsonpath='{.status.phase}'=Succeeded \
  "pod/$SYNTHETIC_POD" \
  -n default \
  --timeout=120s \
  || fail "synthetic pod did not reach phase Succeeded — its log line was never emitted"
|
|
133
|
+
|
|
134
|
+
# -------------------------------------------------------------------------
# Step 5: Wait for Promtail tail + ingest lag
# -------------------------------------------------------------------------
log "waiting ${INGEST_LAG_SECONDS}s for Promtail to tail and ingest synthetic log"
sleep "$INGEST_LAG_SECONDS"

# -------------------------------------------------------------------------
# Step 6: Port-forward Loki and query
# -------------------------------------------------------------------------
log "port-forwarding Loki svc to localhost:${LOCAL_PORT}"
kubectl port-forward \
  "svc/${LOKI_RELEASE}" \
  "${LOCAL_PORT}:${LOKI_PORT}" \
  -n "$NAMESPACE" &
PF_PID=$!

# Poll until the tunnel answers instead of sleeping a fixed interval. Any
# HTTP response (curl without -f) proves the forward is up; a dead
# port-forward process is reported immediately rather than surfacing later
# as an opaque curl error.
pf_ready=0
for _ in 1 2 3 4 5 6 7 8 9 10; do
  kill -0 "$PF_PID" 2>/dev/null \
    || fail "Loki port-forward process exited prematurely"
  if curl -s -o /dev/null --max-time 2 "http://localhost:${LOCAL_PORT}/ready"; then
    pf_ready=1
    break
  fi
  sleep 1
done
[[ "$pf_ready" -eq 1 ]] \
  || fail "Loki port-forward on localhost:${LOCAL_PORT} did not become reachable"

# Query Loki for default-namespace log lines from the last 5 minutes that
# contain "REDACTED". The assertions that follow check this response both
# for the 4 REDACTED markers and for any raw token that survived on those
# same lines.
# The window is computed with shell arithmetic from a single clock reading,
# which works with both BSD and GNU `date` and keeps start/end consistent.
log "querying Loki for scrubbed entries"
query_end_s=$(date -u +%s)
query_start_s=$((query_end_s - 300))
QUERY_RESPONSE=$(
  curl -s -G \
    "http://localhost:${LOCAL_PORT}/loki/api/v1/query_range" \
    --data-urlencode 'query={namespace="default"} |= "REDACTED"' \
    --data-urlencode "start=${query_start_s}000000000" \
    --data-urlencode "end=${query_end_s}000000000" \
    --data-urlencode 'limit=50'
) || fail "Loki query_range request failed — no answer from localhost:${LOCAL_PORT}"
|
|
165
|
+
|
|
166
|
+
# -------------------------------------------------------------------------
# Step 7: Assertions — verify all 4 scrub patterns
#
# Contract (matches Phase B spec + promtail-values.yaml); for each input the
# scrubbed marker must be present and the raw value must be absent:
#   ?code=SECRETTOKEN123                   → code=REDACTED          (absent: SECRETTOKEN123)
#   &access_token=SECRETTOKEN456           → access_token=REDACTED  (absent: SECRETTOKEN456)
#   &state=SESSION789                      → state=REDACTED         (absent: SESSION789)
#   Authorization: Bearer SECRETBEARER000  → Bearer REDACTED        (absent: SECRETBEARER000)
# -------------------------------------------------------------------------
log "asserting scrubbing correctness (all 4 patterns)"
|
|
176
|
+
|
|
177
|
+
# diag: dump failure context to stderr — the raw Loki query response followed
# by the last 50 lines of Promtail logs. Nothing is written to stdout.
#   Globals: QUERY_RESPONSE (read), NAMESPACE (read)
#   Returns: always 0; a failing `kubectl logs` is tolerated because this
#            runs right before the script fails anyway.
diag() {
  log "DIAGNOSTIC: Loki query response:"
  printf '%s\n' "$QUERY_RESPONSE" >&2
  log "DIAGNOSTIC: last 50 lines of Promtail logs:"
  # Only stdout needs redirecting: stderr already goes to stderr. The former
  # `2>&1 >&2` was applied left-to-right and ended up sending BOTH streams
  # to stdout.
  kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=promtail --tail=50 >&2 || true
}
|
|
183
|
+
|
|
184
|
+
# All checks below use in-shell substring / regex matching on QUERY_RESPONSE
# rather than `echo … | grep -q`. Under `set -o pipefail`, grep -q exiting on
# its first match can leave echo with a SIGPIPE on a large response, making
# the pipeline report failure (141). For the "raw token absent" checks that
# would turn a detected leak into a silent pass.

# Assertion 1: Loki answered with a result block, and it is not empty.
# A successful query with zero matches still contains `"result":[]`, so the
# presence of the key alone does not prove that anything was ingested.
empty_result_re='"result"[[:space:]]*:[[:space:]]*\[[[:space:]]*\]'
if [[ "$QUERY_RESPONSE" != *'"result"'* ]]; then
  diag
  fail "Loki returned no result block — Promtail may not have ingested the synthetic log yet"
fi
if [[ "$QUERY_RESPONSE" =~ $empty_result_re ]]; then
  diag
  fail "Loki returned an empty result set — Promtail may not have ingested the synthetic log yet"
fi

# --- Scrubbed markers present ---
# Assertions 2a-2d. Each entry is "<marker expected in the response>|<stage
# label used in the failure message>".
scrub_marker_checks=(
  'code=REDACTED|code='
  'access_token=REDACTED|access_token='
  'state=REDACTED|state='
  'Bearer REDACTED|Authorization Bearer'
)
for marker_check in "${scrub_marker_checks[@]}"; do
  marker=${marker_check%%|*}
  stage=${marker_check#*|}
  if [[ "$QUERY_RESPONSE" != *"$marker"* ]]; then
    diag
    fail "'${marker}' not found in Loki response — ${stage} scrub stage not working"
  fi
done

# --- Raw tokens absent ---
# Assertions 3a-3d. Each entry is "<raw token that must NOT appear>|<where
# the token was placed in the synthetic line>".
raw_token_checks=(
  'SECRETTOKEN123|code='
  'SECRETTOKEN456|access_token='
  'SESSION789|state='
  'SECRETBEARER000|Authorization Bearer'
)
for token_check in "${raw_token_checks[@]}"; do
  raw_token=${token_check%%|*}
  origin=${token_check#*|}
  if [[ "$QUERY_RESPONSE" == *"$raw_token"* ]]; then
    diag
    fail "raw token '${raw_token}' (${origin}) found in Loki response — scrubbing pipeline is NOT working"
  fi
done

log "PASS: all 4 scrub patterns verified — code=REDACTED, access_token=REDACTED, state=REDACTED, Bearer REDACTED present; all raw tokens absent"
exit 0
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# prom-no-double-grafana.sh — Phase C Task C1 e2e smoke test.
|
|
3
|
+
#
|
|
4
|
+
# Verifies:
|
|
5
|
+
# 1. kube-prometheus-stack installs (Prometheus pod becomes Ready).
|
|
6
|
+
# 2. ServiceMonitor CRD is Established before Phase B charts are upgraded.
|
|
7
|
+
# 3. Phase B charts (Loki + Promtail + Grafana) are helm-upgraded to pick up
|
|
8
|
+
# serviceMonitor.enabled: true now that the CRD exists.
|
|
9
|
+
# 4. Exactly one Grafana Deployment is running in the cluster (no double-Grafana).
|
|
10
|
+
# 5. Phase B's Grafana (olam-grafana) has exactly one Prometheus datasource
|
|
11
|
+
# provisioned (from grafana-values.yaml datasources block added in C1).
|
|
12
|
+
# 6. Prometheus is scraping at least one active target.
|
|
13
|
+
#
|
|
14
|
+
# Pre-conditions:
|
|
15
|
+
# - kubectl context is set to a live k8s cluster.
|
|
16
|
+
# - Phase B e2e (loki-ingest.sh + grafana-port-forward.sh + grafana-dashboard-persistence.sh)
|
|
17
|
+
# has already run: olam-loki, olam-promtail, and olam-grafana releases are installed.
|
|
18
|
+
# - The olam-grafana-admin Secret exists (created by grafana-port-forward.sh).
|
|
19
|
+
# - helm, kubectl, curl, jq binaries available.
|
|
20
|
+
#
|
|
21
|
+
# Chart: prometheus-community/kube-prometheus-stack 85.2.0 (pinned; latest stable 2026-05-21).
|
|
22
|
+
#
|
|
23
|
+
# Idempotency: helm upgrade --install is idempotent; re-runs on an existing
|
|
24
|
+
# cluster succeed. Port-forwards are killed on exit via trap.
|
|
25
|
+
#
|
|
26
|
+
# Refs: docs/plans/k3s-ingress-observability/phase-c-tasks.md — Task C1
|
|
27
|
+
|
|
28
|
+
set -euo pipefail

# Namespace and release/chart coordinates for kube-prometheus-stack and the
# Phase B Grafana release.
NAMESPACE='monitoring'
PROM_RELEASE='olam-prom'
PROM_CHART_VERSION='85.2.0'
GRAFANA_RELEASE='olam-grafana'

# Port-forward endpoints.
GRAFANA_LOCAL_PORT='3001'  # avoid collision if phase-b-e2e left a port-forward on 3000
GRAFANA_SVC_PORT='80'
PROM_LOCAL_PORT='9090'

# Seconds to sleep after starting a port-forward before using it.
PF_BIND_SECONDS=5
|
|
38
|
+
|
|
39
|
+
# log MESSAGE...
# Writes the arguments, joined into one line and prefixed with the script
# tag, to stderr.
log() {
  printf '[prom-no-double-grafana] %s\n' "$*" >&2
}
|
|
40
|
+
# fail MESSAGE...
# Writes a tagged FAIL line to stderr and terminates the script with
# exit status 1.
fail() {
  printf '[prom-no-double-grafana] FAIL: %s\n' "$*" >&2
  exit 1
}
|
|
41
|
+
|
|
42
|
+
# -------------------------------------------------------------------------
# Locate the repository root so the helm -f paths below do not depend on the
# caller's working directory. Falls back to the current directory when git
# cannot report a top-level for the script's directory.
# -------------------------------------------------------------------------
script_dir="$(dirname "$0")"
if ! REPO_ROOT="$(git -C "$script_dir" rev-parse --show-toplevel 2>/dev/null)"; then
  REPO_ROOT="$(pwd)"
fi
|
|
46
|
+
|
|
47
|
+
# -------------------------------------------------------------------------
# Cleanup trap — stop port-forwards on exit; leave Helm releases in place
# -------------------------------------------------------------------------
# PIDs of the background `kubectl port-forward` processes; empty until started.
GRAFANA_PF_PID=""
PROM_PF_PID=""

# cleanup: EXIT handler, best-effort by design.
#   Globals: GRAFANA_PF_PID (read), PROM_PF_PID (read)
#   Terminates every port-forward that was started and waits for it, so the
#   process is actually gone (and its local port released) before the script
#   returns. Uses explicit guards instead of `test && kill || true` chains.
#   Returns: always 0.
cleanup() {
  local pf_pid
  for pf_pid in "$GRAFANA_PF_PID" "$PROM_PF_PID"; do
    [[ -n "$pf_pid" ]] || continue
    kill "$pf_pid" 2>/dev/null || true
    # Reap the child; stderr is silenced to hide the shell's "Terminated"
    # job notice, and a non-zero status is expected here.
    wait "$pf_pid" 2>/dev/null || true
  done
}
trap cleanup EXIT
|
|
57
|
+
|
|
58
|
+
# -------------------------------------------------------------------------
# Pre-flight: required binaries on PATH and a reachable cluster
# -------------------------------------------------------------------------
for required_tool in helm kubectl curl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    fail "$required_tool not installed"
  fi
done

if ! kubectl cluster-info >/dev/null 2>&1; then
  fail "kubectl: no reachable cluster; set KUBECONFIG"
fi

log "pre-flight checks passed"

# Phase B pre-conditions: all three releases must already exist in $NAMESPACE.
phase_b_releases=(olam-loki olam-promtail "$GRAFANA_RELEASE")
for release in "${phase_b_releases[@]}"; do
  if ! helm status "$release" -n "$NAMESPACE" >/dev/null 2>&1; then
    fail "Phase B release '$release' not found in namespace $NAMESPACE — run phase-b-e2e first"
  fi
done
log "Phase B pre-conditions satisfied (olam-loki, olam-promtail, olam-grafana releases found)"
|
|
75
|
+
|
|
76
|
+
# -------------------------------------------------------------------------
# Step 1: Register the prometheus-community repo and install
# kube-prometheus-stack. `repo add` errors are deliberately ignored so a
# re-run with the repo already present is harmless.
# -------------------------------------------------------------------------
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
helm repo update prometheus-community

log "installing prometheus-community/kube-prometheus-stack ($PROM_RELEASE) version $PROM_CHART_VERSION"

prom_helm_args=(
  upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stack
  --version "$PROM_CHART_VERSION"
  --namespace "$NAMESPACE"
  --create-namespace
  -f "$REPO_ROOT/packages/peripheral-services/helm-values/kube-prom-stack-values.yaml"
  --wait
  --timeout 600s
)
helm "${prom_helm_args[@]}"

log "kube-prometheus-stack helm install complete"

# -------------------------------------------------------------------------
# Step 2: Wait for the ServiceMonitor CRD to be Established.
# This gates the Phase B chart upgrades: the CRD must exist for
# serviceMonitor.enabled: true to produce a valid ServiceMonitor object.
# -------------------------------------------------------------------------
log "waiting for ServiceMonitor CRD to be Established (60s)"

crd_wait_args=(
  --for=condition=established
  crd/servicemonitors.monitoring.coreos.com
  --timeout=60s
)
kubectl wait "${crd_wait_args[@]}"

log "ServiceMonitor CRD Established"
|
|
105
|
+
|
|
106
|
+
# -------------------------------------------------------------------------
# Step 3: Helm-upgrade Phase B charts to enable ServiceMonitor at RUNTIME
#
# The source-of-truth values files keep serviceMonitor.enabled: false so a
# standalone Phase B install (without kube-prometheus-stack) does not
# hard-fail with "no matches for kind ServiceMonitor". The toggle is flipped
# here via --set, AFTER the CRD is Established, which keeps Phase B
# installable on its own while wiring Prometheus discovery when
# kube-prom-stack is present.
#
# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level
#       serviceMonitor) — the path is chart-version-specific.
# -------------------------------------------------------------------------
# Chart version pins MUST match the ones in phase-b-e2e's loki-ingest.sh +
# grafana-port-forward.sh. Without --version, helm pulls latest from the repo;
# the latest charts may reference template values missing from our values
# files (e.g., Loki 6.8.x references .Values.loki.ui.enabled, which is nil in
# our 6.7.4-shaped values and produces a nil-pointer template error during
# upgrade).
LOKI_CHART_VERSION="6.7.4"
PROMTAIL_CHART_VERSION="6.16.6"
GRAFANA_CHART_VERSION="8.5.2"

# enable_service_monitor RELEASE CHART VERSION VALUES_FILE TOGGLE
# Runs one pinned `helm upgrade` that reuses the release's values, re-applies
# VALUES_FILE (a file name under helm-values/) and sets TOGGLE (key=value).
# Globals: NAMESPACE (read), REPO_ROOT (read)
enable_service_monitor() {
  local release=$1 chart=$2 version=$3 values_file=$4 toggle=$5
  helm upgrade "$release" "$chart" \
    --version "$version" \
    --namespace "$NAMESPACE" \
    -f "$REPO_ROOT/packages/peripheral-services/helm-values/$values_file" \
    --wait \
    --timeout 300s \
    --reuse-values \
    --set "$toggle"
}

log "upgrading Phase B charts with runtime --set serviceMonitor.enabled=true (pinned versions)"

enable_service_monitor olam-loki grafana/loki "$LOKI_CHART_VERSION" \
  loki-values.yaml monitoring.serviceMonitor.enabled=true
log "olam-loki upgraded (ServiceMonitor enabled)"

enable_service_monitor olam-promtail grafana/promtail "$PROMTAIL_CHART_VERSION" \
  promtail-values.yaml serviceMonitor.enabled=true
log "olam-promtail upgraded (ServiceMonitor enabled)"

enable_service_monitor "$GRAFANA_RELEASE" grafana/grafana "$GRAFANA_CHART_VERSION" \
  grafana-values.yaml serviceMonitor.enabled=true
log "$GRAFANA_RELEASE upgraded (ServiceMonitor enabled; Prometheus datasource provisioned)"
|
|
163
|
+
|
|
164
|
+
# -------------------------------------------------------------------------
# Step 4: Block until the Prometheus pod reports Ready (fails on timeout)
# -------------------------------------------------------------------------
log "waiting for Prometheus pod Ready (300s)"

prom_wait_args=(
  --for=condition=ready pod
  -l "app.kubernetes.io/name=prometheus"
  -n "$NAMESPACE"
  --timeout=300s
)
kubectl wait "${prom_wait_args[@]}"

log "Prometheus pod Ready"
|
|
175
|
+
|
|
176
|
+
# -------------------------------------------------------------------------
# Step 5: Assertion — exactly one Grafana Deployment in namespace $NAMESPACE.
# Guards against a regression where kube-prometheus-stack's bundled Grafana
# sub-chart accidentally gets enabled.
# -------------------------------------------------------------------------
log "asserting exactly 1 Grafana Deployment in namespace $NAMESPACE"

grafana_selector="app.kubernetes.io/name=grafana"
# One output line per matching Deployment; tr strips the padding that some
# wc implementations add around the count.
GRAFANA_DEPS=$(
  kubectl get deployment -n "$NAMESPACE" -l "$grafana_selector" -o name \
    | wc -l \
    | tr -d ' '
)

if [[ "$GRAFANA_DEPS" == "1" ]]; then
  log "PASS: exactly 1 Grafana Deployment found"
else
  log "FAIL: expected exactly 1 Grafana Deployment, found $GRAFANA_DEPS"
  kubectl get deployment -n "$NAMESPACE" -l "$grafana_selector" >&2
  fail "double-Grafana detected — kube-prometheus-stack's grafana.enabled must be false"
fi
|
|
196
|
+
|
|
197
|
+
# -------------------------------------------------------------------------
# Step 6: Assertion — Grafana has exactly one Prometheus datasource.
# The admin password is read back from the olam-grafana-admin Secret
# (created by grafana-port-forward.sh). Local port 3001 is used to avoid
# colliding with any live phase-b-e2e port-forward on 3000.
# -------------------------------------------------------------------------
log "reading admin password from Secret olam-grafana-admin"
GRAFANA_ADMIN_PW=$(
  kubectl get secret olam-grafana-admin -n "$NAMESPACE" \
    -o jsonpath='{.data.admin-password}' \
    | base64 -d
)

log "port-forwarding svc/$GRAFANA_RELEASE $GRAFANA_LOCAL_PORT:$GRAFANA_SVC_PORT"
kubectl port-forward -n "$NAMESPACE" "svc/$GRAFANA_RELEASE" \
  "${GRAFANA_LOCAL_PORT}:${GRAFANA_SVC_PORT}" &
GRAFANA_PF_PID=$!

log "waiting ${PF_BIND_SECONDS}s for Grafana port-forward to bind"
sleep "$PF_BIND_SECONDS"
if ! kill -0 "$GRAFANA_PF_PID" 2>/dev/null; then
  fail "Grafana port-forward process exited prematurely"
fi

log "asserting exactly 1 Prometheus datasource in Grafana (GET /api/datasources)"
grafana_api="http://localhost:${GRAFANA_LOCAL_PORT}/api/datasources"
if ! DATASOURCES=$(curl -sf -u "admin:${GRAFANA_ADMIN_PW}" "$grafana_api"); then
  # Surface recent Grafana logs before failing; the log fetch is best-effort.
  kubectl logs -n "$NAMESPACE" -l "app.kubernetes.io/name=grafana" --tail=30 >&2 || true
  fail "GET /api/datasources failed — Grafana not reachable on port $GRAFANA_LOCAL_PORT"
fi

# jq -e turns the boolean `length == 1` into the exit status.
prom_ds_filter='map(select(.type == "prometheus"))'
if ! jq -e "${prom_ds_filter} | length == 1" <<<"$DATASOURCES" >/dev/null 2>&1; then
  log "FAIL: Grafana does not have exactly 1 Prometheus datasource"
  jq . <<<"$DATASOURCES" >&2
  fail "Prometheus datasource not provisioned — check datasources block in grafana-values.yaml"
fi

PROM_URL=$(jq -r "${prom_ds_filter} | .[0].url" <<<"$DATASOURCES")
log "PASS: Grafana has exactly 1 Prometheus datasource (url=$PROM_URL)"
|
|
235
|
+
|
|
236
|
+
# -------------------------------------------------------------------------
# Step 7: Assertion — Prometheus reports at least one active scrape target
# -------------------------------------------------------------------------
log "port-forwarding svc/prometheus-operated $PROM_LOCAL_PORT:9090"
kubectl port-forward -n "$NAMESPACE" "svc/prometheus-operated" \
  "${PROM_LOCAL_PORT}:9090" &
PROM_PF_PID=$!

log "waiting ${PF_BIND_SECONDS}s for Prometheus port-forward to bind"
sleep "$PF_BIND_SECONDS"
if ! kill -0 "$PROM_PF_PID" 2>/dev/null; then
  fail "Prometheus port-forward process exited prematurely"
fi

log "querying Prometheus /api/v1/targets for active targets"
if ! TARGETS=$(curl -sf "http://localhost:${PROM_LOCAL_PORT}/api/v1/targets"); then
  fail "GET /api/v1/targets failed — Prometheus not reachable on port $PROM_LOCAL_PORT"
fi

# ACTIVE is reused by the final summary line of the script.
ACTIVE=$(jq '.data.activeTargets | length' <<<"$TARGETS")
if [ "$ACTIVE" -lt 1 ]; then
  log "FAIL: Prometheus has 0 active scrape targets"
  jq '.data.activeTargets' <<<"$TARGETS" >&2
  fail "Prometheus has no active targets — check ServiceMonitor CRD and scrapeConfig"
fi

log "PASS: $ACTIVE active scrape target(s) found in Prometheus"
|
|
263
|
+
|
|
264
|
+
# -------------------------------------------------------------------------
# Assertion C4: Recording rules from 95-prom-recording-rules.yaml are loaded
#
# The 9[0-9]-prom-* glob in apply-manifests.sh skips this file (it requires
# the kube-prom-stack CRDs to exist), so it is applied here and
# /api/v1/rules is polled until the olam-http-aggregations group appears.
# The port-forward on PROM_LOCAL_PORT opened in Step 7 is reused.
# -------------------------------------------------------------------------
PROM_URL="http://localhost:${PROM_LOCAL_PORT}"

log "applying 95-prom-recording-rules.yaml (skipped by apply-manifests due to 9[0-9]-prom-* filter)"
kubectl apply -f "$REPO_ROOT/packages/peripheral-services/manifests/95-prom-recording-rules.yaml"

# Prometheus operator reconcile + config reload can take ~60-90s (C2 lesson),
# so poll every 10s for up to 180s.
RECORDING_RULES_TIMEOUT=180
log "polling ${PROM_URL}/api/v1/rules for olam-http-aggregations group (up to ${RECORDING_RULES_TIMEOUT}s)"

# jq -e exits non-zero when the filter yields nothing, i.e. when the expected
# rule is not (yet) present in the expected group.
rule_filter='.data.groups[] | select(.name == "olam-http-aggregations") | .rules[] | select(.name == "olam:http_requests:rate5m_by_service")'
rules_loaded=0
elapsed=0
until [ "$elapsed" -ge "$RECORDING_RULES_TIMEOUT" ]; do
  if curl -sf "${PROM_URL}/api/v1/rules" 2>/dev/null \
    | jq -e "$rule_filter" >/dev/null 2>&1; then
    rules_loaded=1
    log "PASS: olam-http-aggregations rule group loaded after ${elapsed}s"
    break
  fi
  sleep 10
  elapsed=$((elapsed + 10))
done

if [ "$rules_loaded" -ne 1 ]; then
  log "FAIL: olam-http-aggregations rule group not found in /api/v1/rules within ${RECORDING_RULES_TIMEOUT}s"
  # Best-effort listing of the groups Prometheus does know about.
  curl -sf "${PROM_URL}/api/v1/rules" | jq '.data.groups[] | .name' >&2 || true
  fail "PrometheusRule not loaded by operator"
fi

# -------------------------------------------------------------------------
# Final
# -------------------------------------------------------------------------
log "PASS: kube-prometheus-stack installed; single Grafana confirmed; Prometheus datasource provisioned; $ACTIVE active target(s); recording rules loaded — Tasks C1+C4 verified"
exit 0
|