ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
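The helm/ directory above packages the whole stack as a chart. A minimal install sketch, assuming the tarball is unpacked locally (the release name and target namespace are illustrative, not taken from the chart):

# Hypothetical deployment of the packaged chart — adjust names to your cluster.
helm upgrade --install ecip-observability ./package/helm \
  --namespace monitoring --create-namespace \
  -f package/helm/values.yaml \
  -f package/helm/values.prod.yaml  # prod overrides layered over the defaults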
package/chaos/kill-lsp-daemon.sh
@@ -0,0 +1,148 @@
#!/usr/bin/env bash
# =============================================================================
# Chaos Test: Kill LSP Daemon
# =============================================================================
# Simulates LSP daemon crashes in the M02 Analysis Engine.
# Validates that:
#   - LSPDaemonRestartRate alert fires within expected time
#   - Analysis backlog grows but recovers after restart
#   - OTel traces capture the failure correctly
#
# Prerequisites:
#   - kubectl configured for the target cluster
#   - M02 Analysis Engine deployed
#   - M08 Observability Stack deployed
#
# Usage:
#   ./kill-lsp-daemon.sh [namespace] [iterations]
#   ./kill-lsp-daemon.sh ecip 3
# =============================================================================
set -euo pipefail

NAMESPACE="${1:-ecip}"
ITERATIONS="${2:-3}"
DELAY_BETWEEN_KILLS=30  # seconds

echo "=========================================="
echo "Chaos Test: Kill LSP Daemon"
echo "Namespace: ${NAMESPACE}"
echo "Iterations: ${ITERATIONS}"
echo "Delay between kills: ${DELAY_BETWEEN_KILLS}s"
echo "=========================================="

# Verify prerequisites
echo ""
echo "[1/5] Verifying prerequisites..."

if ! command -v kubectl &>/dev/null; then
  echo "ERROR: kubectl not found"
  exit 1
fi

if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
  echo "ERROR: Namespace ${NAMESPACE} not found"
  exit 1
fi

ANALYSIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine --no-headers 2>/dev/null | wc -l)
if [ "${ANALYSIS_PODS}" -eq 0 ]; then
  echo "ERROR: No ecip-analysis-engine pods found in ${NAMESPACE}"
  exit 1
fi
echo "Found ${ANALYSIS_PODS} analysis engine pod(s)"

# Record baseline metrics
echo ""
echo "[2/5] Recording baseline metrics..."

BASELINE_RESTARTS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
  -o jsonpath='{.items[0].status.containerStatuses[?(@.name=="lsp-daemon")].restartCount}' 2>/dev/null || echo "0")
echo "Baseline LSP daemon restart count: ${BASELINE_RESTARTS}"

BASELINE_BACKLOG=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_analysis_backlog_size" 2>/dev/null \
  | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
echo "Baseline analysis backlog: ${BASELINE_BACKLOG}"

# Execute chaos
echo ""
echo "[3/5] Executing chaos — killing LSP daemon processes..."

for i in $(seq 1 "${ITERATIONS}"); do
  echo ""
  echo "--- Kill iteration ${i}/${ITERATIONS} ---"

  # Get a random analysis engine pod
  POD=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
    --no-headers -o custom-columns=":metadata.name" | shuf -n 1)

  echo "Target pod: ${POD}"

  # Kill the LSP daemon process inside the container
  # This simulates an unexpected daemon crash
  kubectl exec -n "${NAMESPACE}" "${POD}" -c lsp-daemon -- \
    kill -9 1 2>/dev/null || echo " (kill command sent — daemon may have already restarted)"

  echo " Kill signal sent at $(date -u +%H:%M:%S)"

  if [ "${i}" -lt "${ITERATIONS}" ]; then
    echo " Waiting ${DELAY_BETWEEN_KILLS}s before next kill..."
    sleep "${DELAY_BETWEEN_KILLS}"
  fi
done

# Wait for alerts
echo ""
echo "[4/5] Waiting for alert propagation (60s)..."
sleep 60

# Validate results
echo ""
echo "[5/5] Validating results..."

# Check restart count increased
FINAL_RESTARTS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
  -o jsonpath='{.items[0].status.containerStatuses[?(@.name=="lsp-daemon")].restartCount}' 2>/dev/null || echo "0")
echo "Final LSP daemon restart count: ${FINAL_RESTARTS}"

RESTART_DELTA=$(( ${FINAL_RESTARTS:-0} - ${BASELINE_RESTARTS:-0} ))
echo "Restart delta: ${RESTART_DELTA}"

if [ "${RESTART_DELTA}" -ge "${ITERATIONS}" ]; then
  echo "✅ PASS: Restart count increased by at least ${ITERATIONS}"
else
  echo "⚠️ WARN: Expected at least ${ITERATIONS} restarts, got ${RESTART_DELTA}"
fi

# Check if alert fired
echo ""
echo "Checking Alertmanager for LSPDaemonRestartRate alert..."
ALERT_STATUS=$(curl -s "http://alertmanager.monitoring:9093/api/v2/alerts?filter=alertname=LSPDaemonRestartRate" 2>/dev/null || echo "unavailable")
if echo "${ALERT_STATUS}" | grep -q "LSPDaemonRestartRate"; then
  echo "✅ PASS: LSPDaemonRestartRate alert is firing"
else
  echo "⚠️ WARN: LSPDaemonRestartRate alert not detected (may need more time)"
fi

# Check pod recovery
echo ""
echo "Checking pod recovery..."
READY_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
  --no-headers | grep -c "Running" || true)
echo "Running analysis pods: ${READY_PODS}/${ANALYSIS_PODS}"

if [ "${READY_PODS}" -eq "${ANALYSIS_PODS}" ]; then
  echo "✅ PASS: All pods recovered"
else
  echo "❌ FAIL: Not all pods recovered"
fi

echo ""
echo "=========================================="
echo "Chaos Test Complete"
echo "=========================================="
echo ""
echo "Manual checks:"
echo " 1. Grafana → ECIP → LSP Daemon Health — verify restart spike visible"
echo " 2. Grafana → ECIP → Analysis Throughput — verify backlog grew then drained"
echo " 3. Tempo — search for error spans in ecip-analysis-engine during test window"
echo " 4. Slack #ecip-alerts — verify notification received"
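Note: the validation steps above query prometheus.monitoring:9090 and alertmanager.monitoring:9093, which are in-cluster DNS names, so the script is meant to run from a pod or a host on the cluster network. A rough sketch of the same checks from a workstation via port-forwards (the Service names are assumptions — verify with kubectl -n monitoring get svc):

# Port-forward Prometheus and Alertmanager, then repeat the script's checks locally.
kubectl -n monitoring port-forward svc/prometheus 9090:9090 & PF_PROM=$!
kubectl -n monitoring port-forward svc/alertmanager 9093:9093 & PF_AM=$!
sleep 3  # give the tunnels a moment to establish
curl -s 'http://localhost:9090/api/v1/query?query=ecip_analysis_backlog_size'
curl -s 'http://localhost:9093/api/v2/alerts?filter=alertname=LSPDaemonRestartRate'
kill "$PF_PROM" "$PF_AM"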
package/chaos/redis-node-failure.sh
@@ -0,0 +1,318 @@
#!/usr/bin/env bash
# =============================================================================
# Chaos Test: Redis Node Failure
# =============================================================================
# Simulates Redis node failure for M03 Knowledge Store cache.
# Validates that:
#   1. CacheHitRateDegraded alert fires as cache hit rate drops
#   2. KnowledgeStoreWriteLatencyHigh alert may fire (write retries)
#   3. Grafana Cache Performance dashboard shows the degradation
#   4. Downstream QueryLatencySLABreach may fire (cascading effect)
#   5. Redis auto-recovers
#
# Prerequisites:
#   - kubectl configured with access to the ECIP namespace
#   - Redis StatefulSet running (redis-cluster or redis-sentinel)
#   - Prometheus and Alertmanager running
#
# Usage:
#   ./redis-node-failure.sh [namespace] [duration_seconds]
# =============================================================================
set -euo pipefail

NAMESPACE="${1:-ecip}"
DURATION="${2:-60}"

echo "=========================================="
echo " Chaos Test: Redis Node Failure"
echo "=========================================="
echo " Namespace: $NAMESPACE"
echo " Failure duration: ${DURATION}s"
echo "=========================================="

# Find Redis pods
REDIS_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=redis \
  -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)

if [ -z "$REDIS_PODS" ]; then
  # Try alternate label
  REDIS_PODS=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=redis \
    -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
fi

if [ -z "$REDIS_PODS" ]; then
  echo "ERROR: No Redis pods found in namespace $NAMESPACE"
  exit 1
fi

# Pick the first Redis pod (keeps the test simple; no failover handling)
TARGET_POD=$(echo "$REDIS_PODS" | tr ' ' '\n' | head -1)
echo "Target Redis pod: $TARGET_POD"

# Capture pre-test metrics
echo ""
echo "[PRE-TEST] Capturing baseline metrics..."

# Query Prometheus for current cache hit rate
PROM_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus \
  -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
if [ -n "$PROM_POD" ]; then
  CACHE_HIT_RATE=$(kubectl exec -n monitoring "$PROM_POD" -- \
    wget -qO- 'http://localhost:9090/api/v1/query?query=ecip_cache_hit_rate{module="M03"}' 2>/dev/null \
    | grep -oP '"value":\[[\d.]+,"([\d.]+)"\]' | head -1 || echo "unknown")
  echo " Cache hit rate before: $CACHE_HIT_RATE"
fi

# Simulate failure — block the Redis server with DEBUG SLEEP
echo ""
echo "[CHAOS] Blocking Redis in pod $TARGET_POD..."
kubectl exec -n "$NAMESPACE" "$TARGET_POD" -- \
  redis-cli DEBUG SLEEP "$DURATION" &
SLEEP_PID=$!
echo " Redis DEBUG SLEEP issued for ${DURATION}s"
echo " Redis is now unresponsive to client requests"

# Monitor during failure
echo ""
echo "[MONITOR] Monitoring for ${DURATION}s..."
ELAPSED=0
INTERVAL=15
while [ "$ELAPSED" -lt "$DURATION" ]; do
  REMAINING=$((DURATION - ELAPSED))
  echo " [${ELAPSED}s / ${DURATION}s] Redis still paused. Remaining: ${REMAINING}s"
  # Check for degradation alerts
  if [ -n "$PROM_POD" ]; then
    ALERTS=$(kubectl exec -n monitoring "$PROM_POD" -- \
      wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \
      | grep -oP '"alertname":"[^"]*"' | sort -u || echo "none")
    echo " Active alerts: $ALERTS"
  fi
  sleep "$INTERVAL"
  ELAPSED=$((ELAPSED + INTERVAL))
done

# Wait for the DEBUG SLEEP to complete
wait "$SLEEP_PID" 2>/dev/null || true

echo ""
echo "[RECOVERY] Redis should now be responsive. Waiting 30s for stabilization..."
sleep 30

# Check Redis health
echo ""
echo "[POST-TEST] Verifying Redis recovery..."
REDIS_PING=$(kubectl exec -n "$NAMESPACE" "$TARGET_POD" -- redis-cli PING 2>/dev/null || echo "FAIL")
echo " Redis PING: $REDIS_PING"

# Check alerts
echo ""
echo "[ALERTS] Checking for cache-related alerts..."
if [ -n "$PROM_POD" ]; then
  CACHE_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \
    wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \
    | grep -c "CacheHitRateDegraded" || true)
  LATENCY_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \
    wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \
    | grep -c "KnowledgeStoreWriteLatencyHigh" || true)
  QUERY_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \
    wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \
    | grep -c "QueryLatencySLABreach" || true)
  echo " CacheHitRateDegraded: $( [ "$CACHE_ALERT" -gt 0 ] && echo 'FIRING ✅' || echo 'Not firing' )"
  echo " KnowledgeStoreWriteLatencyHigh: $( [ "$LATENCY_ALERT" -gt 0 ] && echo 'FIRING ✅' || echo 'Not firing' )"
  echo " QueryLatencySLABreach (cascade): $( [ "$QUERY_ALERT" -gt 0 ] && echo 'FIRING ⚠️' || echo 'Not firing' )"
fi

# Summary
echo ""
echo "=========================================="
echo " Chaos Test Complete"
echo "=========================================="
echo " Redis recovery: $REDIS_PING"
echo ""
echo " Next steps:"
echo " 1. Check Grafana → ECIP → Cache Performance dashboard"
echo " 2. Verify cache hit rate recovers to > 85%"
echo " 3. Check Query Latency dashboard for cascading impact"
echo " 4. Verify all alerts resolve within 10 minutes"
echo "=========================================="
# =============================================================================
# Chaos Test: Redis Node Failure
# =============================================================================
# Simulates Redis node failure in the M03 Knowledge Store cache layer.
# Validates that:
#   - CacheHitRateDegraded alert fires
#   - Query latency increases but service remains available
#   - Cache recovers after Redis node returns
#
# Prerequisites:
#   - kubectl configured for the target cluster
#   - M03 Knowledge Store deployed with Redis
#   - M08 Observability Stack deployed
#
# Usage:
#   ./redis-node-failure.sh [namespace] [downtime_seconds]
#   ./redis-node-failure.sh ecip 120
# =============================================================================
set -euo pipefail

NAMESPACE="${1:-ecip}"
DOWNTIME="${2:-120}"  # How long to keep Redis down (seconds)

echo "=========================================="
echo "Chaos Test: Redis Node Failure"
echo "Namespace: ${NAMESPACE}"
echo "Downtime: ${DOWNTIME}s"
echo "=========================================="

# Verify prerequisites
echo ""
echo "[1/6] Verifying prerequisites..."

if ! command -v kubectl &>/dev/null; then
  echo "ERROR: kubectl not found"
  exit 1
fi

# Find Redis pods
REDIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=redis --no-headers 2>/dev/null | wc -l)
if [ "${REDIS_PODS}" -eq 0 ]; then
  # Try alternative label selectors
  REDIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis --no-headers 2>/dev/null | wc -l)
fi

if [ "${REDIS_PODS}" -eq 0 ]; then
  echo "ERROR: No Redis pods found in ${NAMESPACE}"
  echo "Tried labels: app=redis, app.kubernetes.io/name=redis"
  exit 1
fi
echo "Found ${REDIS_PODS} Redis pod(s)"

# Record baseline
echo ""
echo "[2/6] Recording baseline metrics..."

BASELINE_CACHE_HIT=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_cache_hit_rate{module=\"M03\"}" 2>/dev/null \
  | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
echo "Baseline cache hit rate: ${BASELINE_CACHE_HIT}"

BASELINE_QUERY_P95=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=histogram_quantile(0.95,sum(rate(ecip_query_duration_ms_bucket{module=\"M04\"}[5m]))by(le))" 2>/dev/null \
  | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
echo "Baseline query p95 latency: ${BASELINE_QUERY_P95}"

# Kill Redis
echo ""
echo "[3/6] Killing Redis node..."

REDIS_POD=$(kubectl get pods -n "${NAMESPACE}" -l app=redis \
  --no-headers -o custom-columns=":metadata.name" 2>/dev/null | head -1)

if [ -z "${REDIS_POD}" ]; then
  REDIS_POD=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis \
    --no-headers -o custom-columns=":metadata.name" 2>/dev/null | head -1)
fi

echo "Target Redis pod: ${REDIS_POD}"

# Scale down Redis to simulate node failure
REDIS_DEPLOYMENT=$(kubectl get pods -n "${NAMESPACE}" "${REDIS_POD}" \
  -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || echo "")

if [ -n "${REDIS_DEPLOYMENT}" ]; then
  echo "Scaling down Redis (${REDIS_DEPLOYMENT}) to 0 replicas..."
  kubectl scale -n "${NAMESPACE}" statefulset/"${REDIS_DEPLOYMENT}" --replicas=0 2>/dev/null || \
    kubectl scale -n "${NAMESPACE}" deployment/"${REDIS_DEPLOYMENT}" --replicas=0 2>/dev/null || {
      echo " Fallback: deleting pod directly..."
      kubectl delete pod -n "${NAMESPACE}" "${REDIS_POD}" --grace-period=0 --force 2>/dev/null || true
    }
else
  echo "Deleting Redis pod..."
  kubectl delete pod -n "${NAMESPACE}" "${REDIS_POD}" --grace-period=0 --force 2>/dev/null || true
fi

echo "Redis killed at $(date -u +%H:%M:%S)"

# Wait during downtime
echo ""
echo "[4/6] Waiting for downtime period (${DOWNTIME}s)..."
echo " During this time, verify:"
echo " - Grafana → Cache Performance → hit rate dropping"
echo " - Grafana → Query Latency → p95 increasing"
echo " - Application logs show Redis connection errors"

ELAPSED=0
while [ "${ELAPSED}" -lt "${DOWNTIME}" ]; do
  REMAINING=$((DOWNTIME - ELAPSED))
  echo " ${REMAINING}s remaining..."
  sleep 15
  ELAPSED=$((ELAPSED + 15))
done

# Restore Redis
echo ""
echo "[5/6] Restoring Redis..."

if [ -n "${REDIS_DEPLOYMENT}" ]; then
  echo "Scaling Redis back to ${REDIS_PODS} replicas..."
  kubectl scale -n "${NAMESPACE}" statefulset/"${REDIS_DEPLOYMENT}" --replicas="${REDIS_PODS}" 2>/dev/null || \
    kubectl scale -n "${NAMESPACE}" deployment/"${REDIS_DEPLOYMENT}" --replicas="${REDIS_PODS}" 2>/dev/null || true
fi

echo "Waiting 60s for Redis recovery and cache warm-up..."
sleep 60

# Validate
echo ""
echo "[6/6] Validating results..."

# Check Redis is back
REDIS_READY=$(kubectl get pods -n "${NAMESPACE}" -l app=redis \
  --no-headers 2>/dev/null | grep -c "Running" || true)
if [ "${REDIS_READY}" -eq 0 ]; then
  REDIS_READY=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis \
    --no-headers 2>/dev/null | grep -c "Running" || true)
fi

if [ "${REDIS_READY}" -gt 0 ]; then
  echo "✅ PASS: Redis is running again (${REDIS_READY} pod(s))"
else
  echo "❌ FAIL: Redis has not recovered"
fi

# Check if cache alert fired
echo ""
echo "Checking Alertmanager for CacheHitRateDegraded alert..."
ALERT_STATUS=$(curl -s "http://alertmanager.monitoring:9093/api/v2/alerts?filter=alertname=CacheHitRateDegraded" 2>/dev/null || echo "unavailable")
if echo "${ALERT_STATUS}" | grep -q "CacheHitRateDegraded"; then
  echo "✅ PASS: CacheHitRateDegraded alert fired"
else
  echo "⚠️ WARN: CacheHitRateDegraded alert not detected"
fi

# Final metrics
echo ""
FINAL_CACHE_HIT=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_cache_hit_rate{module=\"M03\"}" 2>/dev/null \
  | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
echo "Final cache hit rate: ${FINAL_CACHE_HIT} (expect recovering toward baseline)"

echo ""
echo "=========================================="
echo "Chaos Test Complete"
echo "=========================================="
echo ""
echo "Manual checks:"
echo " 1. Grafana → Cache Performance — verify dip during downtime, recovery after"
echo " 2. Grafana → Query Latency — verify latency spike during cache miss period"
echo " 3. Verify M03 Knowledge Store handled Redis failure gracefully (no crash)"
echo " 4. Verify cache warm-up is progressing (hit rate increasing)"