ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # Chaos Test: Kill LSP Daemon
4
+ # =============================================================================
5
+ # Simulates LSP daemon crashes in the M02 Analysis Engine.
6
+ # Validates that:
7
+ # - LSPDaemonRestartRate alert fires within expected time
8
+ # - Analysis backlog grows but recovers after restart
9
+ # - OTel traces capture the failure correctly
10
+ #
11
+ # Prerequisites:
12
+ # - kubectl configured for the target cluster
13
+ # - M02 Analysis Engine deployed
14
+ # - M08 Observability Stack deployed
15
+ #
16
+ # Usage:
17
+ # ./kill-lsp-daemon.sh [namespace] [iterations]
18
+ # ./kill-lsp-daemon.sh ecip 3
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ NAMESPACE="${1:-ecip}"
23
+ ITERATIONS="${2:-3}"
24
+ DELAY_BETWEEN_KILLS=30 # seconds
25
+
26
+ echo "=========================================="
27
+ echo "Chaos Test: Kill LSP Daemon"
28
+ echo "Namespace: ${NAMESPACE}"
29
+ echo "Iterations: ${ITERATIONS}"
30
+ echo "Delay between kills: ${DELAY_BETWEEN_KILLS}s"
31
+ echo "=========================================="
32
+
33
+ # Verify prerequisites
34
+ echo ""
35
+ echo "[1/5] Verifying prerequisites..."
36
+
37
+ if ! command -v kubectl &>/dev/null; then
38
+ echo "ERROR: kubectl not found"
39
+ exit 1
40
+ fi
41
+
42
+ if ! kubectl get namespace "${NAMESPACE}" &>/dev/null; then
43
+ echo "ERROR: Namespace ${NAMESPACE} not found"
44
+ exit 1
45
+ fi
46
+
47
+ ANALYSIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine --no-headers 2>/dev/null | wc -l)
48
+ if [ "${ANALYSIS_PODS}" -eq 0 ]; then
49
+ echo "ERROR: No ecip-analysis-engine pods found in ${NAMESPACE}"
50
+ exit 1
51
+ fi
52
+ echo "Found ${ANALYSIS_PODS} analysis engine pod(s)"
53
+
54
+ # Record baseline metrics
55
+ echo ""
56
+ echo "[2/5] Recording baseline metrics..."
57
+
58
+ BASELINE_RESTARTS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
59
+ -o jsonpath='{.items[0].status.containerStatuses[?(@.name=="lsp-daemon")].restartCount}' 2>/dev/null || echo "0")
60
+ echo "Baseline LSP daemon restart count: ${BASELINE_RESTARTS}"
61
+
62
+ BASELINE_BACKLOG=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_analysis_backlog_size" 2>/dev/null \
63
+ | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
64
+ echo "Baseline analysis backlog: ${BASELINE_BACKLOG}"
65
+
66
+ # Execute chaos
67
+ echo ""
68
+ echo "[3/5] Executing chaos — killing LSP daemon processes..."
69
+
70
+ for i in $(seq 1 "${ITERATIONS}"); do
71
+ echo ""
72
+ echo "--- Kill iteration ${i}/${ITERATIONS} ---"
73
+
74
+ # Get a random analysis engine pod
75
+ POD=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
76
+ --no-headers -o custom-columns=":metadata.name" | shuf -n 1)
77
+
78
+ echo "Target pod: ${POD}"
79
+
80
+ # Kill the LSP daemon process inside the container
81
+ # This simulates an unexpected daemon crash
82
+ kubectl exec -n "${NAMESPACE}" "${POD}" -c lsp-daemon -- \
83
+ kill -9 1 2>/dev/null || echo " (kill command sent — daemon may have already restarted)"
84
+
85
+ echo " Kill signal sent at $(date -u +%H:%M:%S)"
86
+
87
+ if [ "${i}" -lt "${ITERATIONS}" ]; then
88
+ echo " Waiting ${DELAY_BETWEEN_KILLS}s before next kill..."
89
+ sleep "${DELAY_BETWEEN_KILLS}"
90
+ fi
91
+ done
92
+
93
+ # Wait for alerts
94
+ echo ""
95
+ echo "[4/5] Waiting for alert propagation (60s)..."
96
+ sleep 60
97
+
98
+ # Validate results
99
+ echo ""
100
+ echo "[5/5] Validating results..."
101
+
102
+ # Check restart count increased
103
+ FINAL_RESTARTS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
104
+ -o jsonpath='{.items[0].status.containerStatuses[?(@.name=="lsp-daemon")].restartCount}' 2>/dev/null || echo "0")
105
+ echo "Final LSP daemon restart count: ${FINAL_RESTARTS}"
106
+
107
+ RESTART_DELTA=$((FINAL_RESTARTS - BASELINE_RESTARTS))
108
+ echo "Restart delta: ${RESTART_DELTA}"
109
+
110
+ if [ "${RESTART_DELTA}" -ge "${ITERATIONS}" ]; then
111
+ echo "✅ PASS: Restart count increased by at least ${ITERATIONS}"
112
+ else
113
+ echo "⚠️ WARN: Expected at least ${ITERATIONS} restarts, got ${RESTART_DELTA}"
114
+ fi
115
+
116
+ # Check if alert fired
117
+ echo ""
118
+ echo "Checking Alertmanager for LSPDaemonRestartRate alert..."
119
+ ALERT_STATUS=$(curl -s "http://alertmanager.monitoring:9093/api/v2/alerts?filter=alertname=LSPDaemonRestartRate" 2>/dev/null || echo "unavailable")
120
+ if echo "${ALERT_STATUS}" | grep -q "LSPDaemonRestartRate"; then
121
+ echo "✅ PASS: LSPDaemonRestartRate alert is firing"
122
+ else
123
+ echo "⚠️ WARN: LSPDaemonRestartRate alert not detected (may need more time)"
124
+ fi
125
+
126
+ # Check pod recovery
127
+ echo ""
128
+ echo "Checking pod recovery..."
129
+ READY_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=ecip-analysis-engine \
130
+ --no-headers | grep -c "Running" || echo "0")
131
+ echo "Running analysis pods: ${READY_PODS}/${ANALYSIS_PODS}"
132
+
133
+ if [ "${READY_PODS}" -eq "${ANALYSIS_PODS}" ]; then
134
+ echo "✅ PASS: All pods recovered"
135
+ else
136
+ echo "❌ FAIL: Not all pods recovered"
137
+ fi
138
+
139
+ echo ""
140
+ echo "=========================================="
141
+ echo "Chaos Test Complete"
142
+ echo "=========================================="
143
+ echo ""
144
+ echo "Manual checks:"
145
+ echo " 1. Grafana → ECIP → LSP Daemon Health — verify restart spike visible"
146
+ echo " 2. Grafana → ECIP → Analysis Throughput — verify backlog grew then drained"
147
+ echo " 3. Tempo — search for error spans in ecip-analysis-engine during test window"
148
+ echo " 4. Slack #ecip-alerts — verify notification received"
@@ -0,0 +1,318 @@
1
+ #!/bin/bash#!/usr/bin/env bash
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+ echo "=========================================="echo " 4. Verify all alerts resolve within 10 minutes"echo " 3. Check Query Latency dashboard for cascading impact"echo " 2. Verify cache hit rate recovers to > 85%"echo " 1. Check Grafana → ECIP → Cache Performance dashboard"echo " Next steps:"echo ""echo " Redis recovery: $REDIS_PING"echo "=========================================="echo " Chaos Test Complete"echo "=========================================="echo ""# Summaryfi echo " QueryLatencySLABreach (cascade): $( [ "$QUERY_ALERT" -gt 0 ] && echo 'FIRING ⚠️' || echo 'Not firing' )" echo " KnowledgeStoreWriteLatencyHigh: $( [ "$LATENCY_ALERT" -gt 0 ] && echo 'FIRING ✅' || echo 'Not firing' )" echo " CacheHitRateDegraded: $( [ "$CACHE_ALERT" -gt 0 ] && echo 'FIRING ✅' || echo 'Not firing' )" | grep -c "QueryLatencySLABreach" || echo "0") wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \ QUERY_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \ | grep -c "KnowledgeStoreWriteLatencyHigh" || echo "0") wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \ LATENCY_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \ | grep -c "CacheHitRateDegraded" || echo "0") wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \ CACHE_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \if [ -n "$PROM_POD" ]; thenecho "[ALERTS] Checking for cache-related alerts..."echo ""# Check alertsecho " Redis PING: $REDIS_PING"REDIS_PING=$(kubectl exec -n "$NAMESPACE" "$TARGET_POD" -- redis-cli PING 2>/dev/null || echo "FAIL")echo "[POST-TEST] Verifying Redis recovery..."echo ""# Check Redis healthsleep 30echo "[RECOVERY] Redis should now be responsive. 
Waiting 30s for stabilization..."echo ""wait "$SLEEP_PID" 2>/dev/null || true# Wait for the DEBUG SLEEP to completedone ELAPSED=$((ELAPSED + INTERVAL)) sleep "$INTERVAL" fi echo " Active alerts: $ALERTS" | grep -oP '"alertname":"[^"]*"' | sort -u || echo "none") wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \ ALERTS=$(kubectl exec -n monitoring "$PROM_POD" -- \ if [ -n "$PROM_POD" ]; then # Check for degradation alerts echo " [${ELAPSED}s / ${DURATION}s] Redis still paused. Remaining: ${REMAINING}s" REMAINING=$((DURATION - ELAPSED))while [ "$ELAPSED" -lt "$DURATION" ]; doINTERVAL=15ELAPSED=0echo "[MONITOR] Monitoring for ${DURATION}s..."echo ""# Monitor during failureecho " Redis is now unresponsive to client requests"echo " Redis DEBUG SLEEP issued for ${DURATION}s"SLEEP_PID=$! redis-cli DEBUG SLEEP "$DURATION" &kubectl exec -n "$NAMESPACE" "$TARGET_POD" -- \echo "[CHAOS] Pausing Redis container in pod $TARGET_POD..."echo ""# Simulate failure — pause the Redis containerfi echo " Cache hit rate before: $CACHE_HIT_RATE" | grep -oP '"value":\[[\d.]+,"([\d.]+)"\]' | head -1 || echo "unknown") wget -qO- 'http://localhost:9090/api/v1/query?query=ecip_cache_hit_rate{module="M03"}' 2>/dev/null \ CACHE_HIT_RATE=$(kubectl exec -n monitoring "$PROM_POD" -- \if [ -n "$PROM_POD" ]; then -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")PROM_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus \# Query Prometheus for current cache hit rateecho "[PRE-TEST] Capturing baseline metrics..."echo ""# Capture pre-test metricsecho "Target Redis pod: $TARGET_POD"TARGET_POD=$(echo "$REDIS_PODS" | tr ' ' '\n' | head -1)# Pick the first non-master Redis pod (to avoid failover complexity)fi exit 1 echo "ERROR: No Redis pods found in namespace $NAMESPACE"if [ -z "$REDIS_PODS" ]; thenfi -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) REDIS_PODS=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=redis \ # Try alternate labelif [ -z 
"$REDIS_PODS" ]; then -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)REDIS_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=redis \# Find Redis podsecho "=========================================="echo " Failure duration: ${DURATION}s"echo " Namespace: $NAMESPACE"echo "=========================================="echo " Chaos Test: Redis Node Failure"echo "=========================================="DURATION="${2:-60}"NAMESPACE="${1:-ecip}"set -euo pipefail# =============================================================================# ./redis-node-failure.sh [--namespace ecip] [--duration 60]# Usage:## - Prometheus and Alertmanager running# - Redis StatefulSet running (redis-cluster or redis-sentinel)# - kubectl configured with access to the ECIP namespace# Prerequisites:## 5. Redis auto-recovers# 4. Downstream QueryLatencySLABreach may fire (cascading effect)# 3. Grafana Cache Performance dashboard shows the degradation# 2. KnowledgeStoreWriteLatencyHigh alert may fire (write retries)# 1. CacheHitRateDegraded alert fires as cache hit rate drops# Validates that:# Simulates Redis node failure for M03 Knowledge Store cache.# =============================================================================# Chaos Test: Redis Node Failure# =============================================================================# =============================================================================
152
+ # Chaos Test: Redis Node Failure
153
+ # =============================================================================
154
+ # Simulates Redis node failure in the M03 Knowledge Store cache layer.
155
+ # Validates that:
156
+ # - CacheHitRateDegraded alert fires
157
+ # - Query latency increases but service remains available
158
+ # - Cache recovers after Redis node returns
159
+ #
160
+ # Prerequisites:
161
+ # - kubectl configured for the target cluster
162
+ # - M03 Knowledge Store deployed with Redis
163
+ # - M08 Observability Stack deployed
164
+ #
165
+ # Usage:
166
+ # ./redis-node-failure.sh [namespace] [downtime_seconds]
167
+ # ./redis-node-failure.sh ecip 120
168
+ # =============================================================================
169
+ set -euo pipefail
170
+
171
+ NAMESPACE="${1:-ecip}"
172
+ DOWNTIME="${2:-120}" # How long to keep Redis down (seconds)
173
+
174
+ echo "=========================================="
175
+ echo "Chaos Test: Redis Node Failure"
176
+ echo "Namespace: ${NAMESPACE}"
177
+ echo "Downtime: ${DOWNTIME}s"
178
+ echo "=========================================="
179
+
180
+ # Verify prerequisites
181
+ echo ""
182
+ echo "[1/6] Verifying prerequisites..."
183
+
184
+ if ! command -v kubectl &>/dev/null; then
185
+ echo "ERROR: kubectl not found"
186
+ exit 1
187
+ fi
188
+
189
+ # Find Redis pods
190
+ REDIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app=redis --no-headers 2>/dev/null | wc -l)
191
+ if [ "${REDIS_PODS}" -eq 0 ]; then
192
+ # Try alternative label selectors
193
+ REDIS_PODS=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis --no-headers 2>/dev/null | wc -l)
194
+ fi
195
+
196
+ if [ "${REDIS_PODS}" -eq 0 ]; then
197
+ echo "ERROR: No Redis pods found in ${NAMESPACE}"
198
+ echo "Tried labels: app=redis, app.kubernetes.io/name=redis"
199
+ exit 1
200
+ fi
201
+ echo "Found ${REDIS_PODS} Redis pod(s)"
202
+
203
+ # Record baseline
204
+ echo ""
205
+ echo "[2/6] Recording baseline metrics..."
206
+
207
+ BASELINE_CACHE_HIT=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_cache_hit_rate{module=\"M03\"}" 2>/dev/null \
208
+ | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
209
+ echo "Baseline cache hit rate: ${BASELINE_CACHE_HIT}"
210
+
211
+ BASELINE_QUERY_P95=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=histogram_quantile(0.95,sum(rate(ecip_query_duration_ms_bucket{module=\"M04\"}[5m]))by(le))" 2>/dev/null \
212
+ | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
213
+ echo "Baseline query p95 latency: ${BASELINE_QUERY_P95}"
214
+
215
+ # Kill Redis
216
+ echo ""
217
+ echo "[3/6] Killing Redis node..."
218
+
219
+ REDIS_POD=$(kubectl get pods -n "${NAMESPACE}" -l app=redis \
220
+ --no-headers -o custom-columns=":metadata.name" 2>/dev/null | head -1)
221
+
222
+ if [ -z "${REDIS_POD}" ]; then
223
+ REDIS_POD=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis \
224
+ --no-headers -o custom-columns=":metadata.name" 2>/dev/null | head -1)
225
+ fi
226
+
227
+ echo "Target Redis pod: ${REDIS_POD}"
228
+
229
+ # Scale down Redis to simulate node failure
230
+ REDIS_DEPLOYMENT=$(kubectl get pods -n "${NAMESPACE}" "${REDIS_POD}" \
231
+ -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || echo "")
232
+
233
+ if [ -n "${REDIS_DEPLOYMENT}" ]; then
234
+ echo "Scaling down Redis (${REDIS_DEPLOYMENT}) to 0 replicas..."
235
+ kubectl scale -n "${NAMESPACE}" statefulset/"${REDIS_DEPLOYMENT}" --replicas=0 2>/dev/null || \
236
+ kubectl scale -n "${NAMESPACE}" deployment/"${REDIS_DEPLOYMENT}" --replicas=0 2>/dev/null || \
237
+ echo " Fallback: deleting pod directly..."
238
+ kubectl delete pod -n "${NAMESPACE}" "${REDIS_POD}" --grace-period=0 --force 2>/dev/null || true
239
+ else
240
+ echo "Deleting Redis pod..."
241
+ kubectl delete pod -n "${NAMESPACE}" "${REDIS_POD}" --grace-period=0 --force 2>/dev/null || true
242
+ fi
243
+
244
+ echo "Redis killed at $(date -u +%H:%M:%S)"
245
+
246
+ # Wait during downtime
247
+ echo ""
248
+ echo "[4/6] Waiting for downtime period (${DOWNTIME}s)..."
249
+ echo " During this time, verify:"
250
+ echo " - Grafana → Cache Performance → hit rate dropping"
251
+ echo " - Grafana → Query Latency → p95 increasing"
252
+ echo " - Application logs show Redis connection errors"
253
+
254
+ ELAPSED=0
255
+ while [ "${ELAPSED}" -lt "${DOWNTIME}" ]; do
256
+ REMAINING=$((DOWNTIME - ELAPSED))
257
+ echo " ${REMAINING}s remaining..."
258
+ sleep 15
259
+ ELAPSED=$((ELAPSED + 15))
260
+ done
261
+
262
+ # Restore Redis
263
+ echo ""
264
+ echo "[5/6] Restoring Redis..."
265
+
266
+ if [ -n "${REDIS_DEPLOYMENT}" ]; then
267
+ echo "Scaling Redis back to ${REDIS_PODS} replicas..."
268
+ kubectl scale -n "${NAMESPACE}" statefulset/"${REDIS_DEPLOYMENT}" --replicas="${REDIS_PODS}" 2>/dev/null || \
269
+ kubectl scale -n "${NAMESPACE}" deployment/"${REDIS_DEPLOYMENT}" --replicas="${REDIS_PODS}" 2>/dev/null || true
270
+ fi
271
+
272
+ echo "Waiting 60s for Redis recovery and cache warm-up..."
273
+ sleep 60
274
+
275
+ # Validate
276
+ echo ""
277
+ echo "[6/6] Validating results..."
278
+
279
+ # Check Redis is back
280
+ REDIS_READY=$(kubectl get pods -n "${NAMESPACE}" -l app=redis \
281
+ --no-headers 2>/dev/null | grep -c "Running" || echo "0")
282
+ if [ "${REDIS_READY}" -eq 0 ]; then
283
+ REDIS_READY=$(kubectl get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=redis \
284
+ --no-headers 2>/dev/null | grep -c "Running" || echo "0")
285
+ fi
286
+
287
+ if [ "${REDIS_READY}" -gt 0 ]; then
288
+ echo "✅ PASS: Redis is running again (${REDIS_READY} pod(s))"
289
+ else
290
+ echo "❌ FAIL: Redis has not recovered"
291
+ fi
292
+
293
+ # Check if cache alert fired
294
+ echo ""
295
+ echo "Checking Alertmanager for CacheHitRateDegraded alert..."
296
+ ALERT_STATUS=$(curl -s "http://alertmanager.monitoring:9093/api/v2/alerts?filter=alertname=CacheHitRateDegraded" 2>/dev/null || echo "unavailable")
297
+ if echo "${ALERT_STATUS}" | grep -q "CacheHitRateDegraded"; then
298
+ echo "✅ PASS: CacheHitRateDegraded alert fired"
299
+ else
300
+ echo "⚠️ WARN: CacheHitRateDegraded alert not detected"
301
+ fi
302
+
303
+ # Final metrics
304
+ echo ""
305
+ FINAL_CACHE_HIT=$(curl -s "http://prometheus.monitoring:9090/api/v1/query?query=ecip_cache_hit_rate{module=\"M03\"}" 2>/dev/null \
306
+ | grep -o '"value":\[.*\]' | head -1 || echo "unavailable")
307
+ echo "Final cache hit rate: ${FINAL_CACHE_HIT} (expect recovering toward baseline)"
308
+
309
+ echo ""
310
+ echo "=========================================="
311
+ echo "Chaos Test Complete"
312
+ echo "=========================================="
313
+ echo ""
314
+ echo "Manual checks:"
315
+ echo " 1. Grafana → Cache Performance — verify dip during downtime, recovery after"
316
+ echo " 2. Grafana → Query Latency — verify latency spike during cache miss period"
317
+ echo " 3. Verify M03 Knowledge Store handled Redis failure gracefully (no crash)"
318
+ echo " 4. Verify cache warm-up is progressing (hit rate increasing)"