ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
package/CLAUDE.md ADDED
@@ -0,0 +1,48 @@
1
+ # CLAUDE.md — ecip-observability-stack (M08)
2
+
3
+ ## Module Purpose
4
+ The **Observability Stack** is the eyes and ears of the platform. This module deploys and operates the monitoring infrastructure — it does NOT contain application code.
5
+
6
+ ## Key Principle: Auto-Instrumentation
7
+ Other modules should NOT need to write custom metrics code. The OpenTelemetry auto-instrumentation should cover standard HTTP, gRPC, Kafka, and database spans automatically.
8
+
9
+ When a module needs a custom metric (e.g., `ecip_analysis_embedding_duration`), it uses the OTel SDK's metric API — not Prometheus client directly.
10
+
11
+ ## Dashboard Ownership
12
+ Each dashboard in `dashboards/` corresponds to a module:
13
+ ```
14
+ dashboards/
15
+     analysis-throughput.json   ← M02 metrics
16
+     query-latency.json         ← M04 metrics
17
+     mcp-call-graph.json        ← M05 metrics
18
+     cache-performance.json     ← M03 metrics
19
+     event-bus-dlq.json         ← M07 metrics
20
+     security-events.json       ← Security events (M01/M06)
21
+ ```
22
+
23
+ Edit dashboards in Grafana UI → export JSON → commit here. This is the source of truth for dashboard definitions.
24
+
25
+ ## Alert Rules
26
+ Alert definitions are in `alerts/`. Each alert must have:
27
+ - `severity`: `critical`, `warning`, or `info`
28
+ - `runbook_url`: link to a runbook in `runbooks/`
29
+ - `for`: how long the condition must hold before firing
30
+
31
+ Critical alerts page on-call. Warning alerts send Slack notification only.
32
+
33
+ ## Required Span Attributes (from all modules)
34
+ Enforce these via OTel Collector attribute processor if modules don't emit them:
35
+ - `ecip.module` (e.g., `M02`, `M04`)
36
+ - `ecip.org_id`
37
+ - `ecip.repo_id` (where applicable)
38
+
39
+ ## Chaos Testing
40
+ Chaos test scripts are in `chaos/`. Run these before each production release:
41
+ - `chaos/kill-lsp-daemon.sh` — verify M02 circuit breaker fires correctly
42
+ - `chaos/redis-node-failure.sh` — verify M03 cache fallback
43
+ - `chaos/kafka-broker-restart.sh` — verify M07 producer retry
44
+
45
+ ## Do Not
46
+ - Do not add application logic here
47
+ - Do not write to M03 or any other module's data store
48
+ - Do not require other modules to install Prometheus client libraries (OTel SDK only)
package/README.md ADDED
@@ -0,0 +1,75 @@
1
+ # ecip-observability-stack (M08 — Observability Stack)
2
+
3
+ > **Team:** Platform/Infra · **Phase:** 5 (Weeks 23–28) · **Priority:** P5
4
+
5
+ Platform-wide observability infrastructure. Provides distributed tracing, metrics collection, log aggregation, Grafana dashboards, and alerting for all ECIP modules via OpenTelemetry auto-instrumentation.
6
+
7
+ ---
8
+
9
+ ## Responsibilities
10
+
11
+ - Deploy and maintain OpenTelemetry Collector
12
+ - Operate Grafana + Prometheus (metrics)
13
+ - Operate Jaeger or Tempo (distributed tracing)
14
+ - Provide dashboards for: analysis latency, query p95, cache hit rates, MCP call graphs
15
+ - Configure alerting rules and on-call routing
16
+ - Run chaos tests and load tests for production validation
17
+
18
+ ---
19
+
20
+ ## Technology Stack
21
+
22
+ | Component | Technology |
23
+ |-----------|-----------|
24
+ | Instrumentation | OpenTelemetry SDK (auto) |
25
+ | Metrics | Prometheus + Grafana |
26
+ | Tracing | Jaeger or Grafana Tempo |
27
+ | Logs | Loki or ELK |
28
+ | Alerting | Grafana Alerting + PagerDuty |
29
+
30
+ ---
31
+
32
+ ## Getting Started
33
+
34
+ ```bash
35
+ git clone git@github.com:ecip/ecip-observability-stack.git
36
+ cd ecip-observability-stack
37
+
38
+ # Deploy the full observability stack to Kubernetes
39
+ helm upgrade --install ecip-obs ./helm --namespace monitoring
40
+
41
+ # Access Grafana locally
42
+ kubectl port-forward svc/grafana 3000:3000 -n monitoring
43
+ # Open http://localhost:3000 (admin/admin)
44
+ ```
45
+
46
+ ---
47
+
48
+ ## Required Dashboards
49
+
50
+ | Dashboard | Key Metrics |
51
+ |-----------|------------|
52
+ | Analysis Pipeline | Events consumed/s, analysis duration p50/p95, embedding API latency, error rate |
53
+ | Query Service | Query duration p50/p95/p99, LLM API latency, cache hit rate, MCP fan-out depth |
54
+ | MCP Servers | Tool call latency per tool, per-repo RPS, auth failure rate |
55
+ | Knowledge Store | Redis hit rate, pgvector query duration, write throughput |
56
+ | Event Bus | Kafka consumer lag, DLQ depth, webhook processing latency |
57
+ | Platform SLAs | End-to-end query p95 < 1.5s, analysis p95 < 30s, uptime |
58
+
59
+ ---
60
+
61
+ ## SLA Targets
62
+
63
+ | SLA | Target |
64
+ |-----|--------|
65
+ | Query p95 latency | < 1.5s |
66
+ | Analysis p95 latency (trunk) | < 30s per file |
67
+ | Platform uptime | > 99.5% |
68
+ | Cache hit rate (M03) | > 80% |
69
+
70
+ ---
71
+
72
+ ## Module Dependencies
73
+
74
+ **Depends on:** OpenTelemetry SDKs in all other modules (auto-instrumented — no per-module code changes required)
75
+ **Called by:** Nothing — pull-based metrics collection
@@ -0,0 +1,39 @@
1
+ # =============================================================================
2
+ # ECIP Alert: Analysis Backlog Growing
3
+ # =============================================================================
4
+ # Fires when Kafka consumer lag for analysis topics exceeds 1000 events
5
+ # =============================================================================
6
+ groups:
7
+ - name: ecip.analysis.backlog
8
+ rules:
9
+ - alert: AnalysisBacklogCritical
10
+ expr: >
11
+ sum(kafka_consumergroup_lag{group=~"ecip-analysis.*"}) > 1000
12
+ for: 10m
13
+ labels:
14
+ severity: critical
15
+ module: M02
16
+ team: analysis-engine
17
+ annotations:
18
+ summary: "Analysis event backlog exceeds 1000 events"
19
+ description: >
20
+ The analysis engine's Kafka consumer lag is {{ $value }} events.
21
+ This means analysis is falling behind ingestion rate.
22
+ If sustained, newly pushed code will not be indexed in time.
23
+ runbook_url: "https://ecip.internal/runbooks/alert-response/ANALYSIS_BACKLOG.md"
24
+ dashboard_url: "https://grafana.ecip.internal/d/ecip-analysis-throughput"
25
+
26
+ - alert: AnalysisBacklogWarning
27
+ expr: >
28
+ sum(kafka_consumergroup_lag{group=~"ecip-analysis.*"}) > 500
29
+ for: 15m
30
+ labels:
31
+ severity: warning
32
+ module: M02
33
+ team: analysis-engine
34
+ annotations:
35
+ summary: "Analysis event backlog exceeds 500 events"
36
+ description: >
37
+ The analysis engine's Kafka consumer lag is {{ $value }} events.
38
+ Trending toward the critical threshold (1000).
39
+ runbook_url: "https://ecip.internal/runbooks/alert-response/ANALYSIS_BACKLOG.md"
@@ -0,0 +1,44 @@
# =============================================================================
# ECIP Alert: Cache Hit Rate Degradation
# =============================================================================
# SLA Target: cache_hit_rate > 80% (M03 Knowledge Store)
# Warning at < 60%
#
# NOTE: cache_hit_rate is a 0-1 ratio (the threshold below compares against
# 0.60), so the description renders it with `humanizePercentage`. The previous
# `printf "%.1f"` + literal "%" printed the raw ratio (e.g. "0.5%" for a 50%
# hit rate), which was misleading.
# =============================================================================
groups:
  - name: ecip.cache.degradation
    rules:
      - alert: CacheHitRateDegraded
        expr: >
          cache_hit_rate{job=~"ecip-knowledge-store|ecip-query-service"} < 0.60
        for: 15m
        labels:
          severity: warning
          module: M03
          team: knowledge-store
        annotations:
          summary: "Cache hit rate below 60% for {{ $labels.cache_type }} ({{ $labels.repo }})"
          description: >
            Cache hit rate for {{ $labels.cache_type }} on repo
            {{ $labels.repo }} is {{ $value | humanizePercentage }}.
            Target is > 80%. Possible causes: cold cache after
            deployment, Redis eviction, or traffic pattern change.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-cache-performance"

      - alert: KnowledgeStoreWriteLatencyHigh
        # Write latency is in milliseconds, so plain printf formatting is correct here.
        expr: >
          histogram_quantile(0.95,
            sum(rate(knowledge_store_write_duration_ms_bucket[5m])) by (le, store_type)
          ) > 200
        for: 10m
        labels:
          severity: warning
          module: M03
          team: knowledge-store
        annotations:
          summary: "Knowledge Store write p95 > 200ms for {{ $labels.store_type }}"
          description: >
            Write latency p95 for {{ $labels.store_type }} is
            {{ $value | printf "%.0f" }}ms. This slows analysis
            indexing and may cause backlog growth.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
@@ -0,0 +1,56 @@
# =============================================================================
# ECIP Alert: DLQ Depth Exceeded
# =============================================================================
# A growing dead-letter queue means messages keep failing after exhausting
# every retry. Three rules:
#   - critical: depth > 100 for 5m
#   - warning:  depth > 50  for 10m (trending toward critical)
#   - warning:  oldest message older than 24h (abandoned failures)
# =============================================================================
groups:
  - name: ecip.event-bus.dlq
    rules:
      - alert: DLQDepthExceeded
        expr: >
          event_bus_dlq_depth > 100
        for: 5m
        labels:
          severity: critical
          module: M07
          team: event-bus
        annotations:
          summary: "DLQ depth > 100 for topic {{ $labels.topic }}"
          description: >
            The dead letter queue for topic {{ $labels.topic }} has
            {{ $value }} messages. This means messages are consistently
            failing processing after all retry attempts.
            Investigate the DLQ messages and root cause.
          runbook_url: "https://ecip.internal/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-event-bus-dlq"

      - alert: DLQDepthWarning
        expr: >
          event_bus_dlq_depth > 50
        for: 10m
        labels:
          severity: warning
          module: M07
          team: event-bus
        annotations:
          summary: "DLQ depth > 50 for topic {{ $labels.topic }}"
          description: >
            DLQ for topic {{ $labels.topic }} has {{ $value }} messages.
            Trending toward the critical threshold (100).
          runbook_url: "https://ecip.internal/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md"

      - alert: DLQMessageAgeHigh
        # 86400s == 24h; humanizeDuration renders the age readably in the message.
        expr: >
          event_bus_dlq_oldest_message_age_seconds > 86400
        for: 30m
        labels:
          severity: warning
          module: M07
          team: event-bus
        annotations:
          summary: "DLQ has messages older than 24 hours for topic {{ $labels.topic }}"
          description: >
            The oldest DLQ message for {{ $labels.topic }} is
            {{ $value | humanizeDuration }} old. Stale DLQ messages
            indicate abandoned failures that need manual investigation.
          runbook_url: "https://ecip.internal/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md"
@@ -0,0 +1,43 @@
# =============================================================================
# ECIP Alert: LSP Daemon Restart Rate
# =============================================================================
# for: 0min is INTENTIONAL — daemon crash is always significant.
# The M04 circuit breaker handles graceful degradation.
# This alert gets a human involved in parallel.
#
# NOTE: the expression uses increase(), not rate(). rate() is per-SECOND, so
# `rate(...[1h]) > 2` would only fire at ~7200 restarts/hour. increase() over
# a 1h window yields restarts-per-hour, matching the documented "> 2/hour"
# threshold and the "times/hour" wording in the description.
# =============================================================================
groups:
  - name: ecip.lsp.daemon
    rules:
      - alert: LSPDaemonRestartRate
        expr: >
          increase(lsp_daemon_restarts_total[1h]) > 2
        for: 0m
        labels:
          severity: critical
          module: M02
          team: analysis-engine
        annotations:
          summary: "LSP daemon restart rate > 2/hour for {{ $labels.repo }} ({{ $labels.language }})"
          description: >
            LSP daemon for {{ $labels.repo }} ({{ $labels.language }})
            is restarting at {{ $value | printf "%.1f" }} times/hour.
            This indicates OOM, crash loop, or resource exhaustion.
            The M04 circuit breaker should be active — verify in Grafana.
          runbook_url: "https://ecip.internal/runbooks/alert-response/LSP_DAEMON_RESTART.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-lsp-daemon-health"

      - alert: LSPDaemonOOMKill
        # kube-state-metrics exposes the last termination reason per container;
        # any OOMKilled occurrence in the last hour fires immediately.
        expr: >
          increase(kube_pod_container_status_last_terminated_reason{container=~"lsp-daemon.*", reason="OOMKilled"}[1h]) > 0
        for: 0m
        labels:
          severity: critical
          module: M02
          team: analysis-engine
        annotations:
          summary: "LSP daemon OOM killed: {{ $labels.pod }}"
          description: >
            LSP daemon pod {{ $labels.pod }} was OOM-killed.
            Consider increasing memory limits or reviewing the repo
            that triggered the analysis.
          runbook_url: "https://ecip.internal/runbooks/alert-response/LSP_DAEMON_RESTART.md"
@@ -0,0 +1,46 @@
# =============================================================================
# ECIP Alert: MCP Call Latency
# =============================================================================
# MCP fan-out calls must stay below 800ms p95 to meet the overall
# query latency SLA of 1500ms
#
# NOTE: the error-rate expression produces a 0-1 ratio (threshold 0.05), so
# the description uses `humanizePercentage`. The previous `printf "%.1f"` +
# literal "%" printed the raw ratio (e.g. "0.1%" for a 10% error rate).
# =============================================================================
groups:
  - name: ecip.mcp.latency
    rules:
      - alert: MCPCallLatencyWarn
        # Latency is in milliseconds, so plain printf formatting is correct here.
        expr: >
          histogram_quantile(0.95,
            sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, target_repo, tool_name)
          ) > 800
        for: 10m
        labels:
          severity: warning
          module: M05
          team: mcp-server
        annotations:
          summary: "MCP call p95 > 800ms for {{ $labels.tool_name }} → {{ $labels.target_repo }}"
          description: >
            MCP tool call {{ $labels.tool_name }} targeting
            {{ $labels.target_repo }} has p95 latency of
            {{ $value | printf "%.0f" }}ms. This contributes to
            overall query latency SLA risk.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-mcp-call-graph"

      - alert: MCPCallErrorRateHigh
        expr: >
          sum(rate(mcp_call_duration_ms_count{status_code=~"5.."}[5m])) by (tool_name)
          /
          sum(rate(mcp_call_duration_ms_count[5m])) by (tool_name)
          > 0.05
        for: 5m
        labels:
          severity: warning
          module: M05
          team: mcp-server
        annotations:
          summary: "MCP tool {{ $labels.tool_name }} error rate > 5%"
          description: >
            MCP tool {{ $labels.tool_name }} is failing at
            {{ $value | humanizePercentage }} error rate.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
@@ -0,0 +1,59 @@
# =============================================================================
# ECIP Alert: Security Anomaly Detection
# =============================================================================
# All rules here use for: 0m — security events must page IMMEDIATELY,
# with no hold-down window. Covers:
#   - auth failure bursts (possible brute force / mass token expiry)
#   - RBAC denial bursts (misconfiguration or probing)
#   - any mTLS rejection between services (zero tolerance)
# =============================================================================
groups:
  - name: ecip.security.anomaly
    rules:
      - alert: SecurityAuthBurst
        expr: >
          increase(auth_failure_total[5m]) > 10
        for: 0m
        labels:
          severity: critical
          module: M01
          team: security
        annotations:
          summary: "Auth failure burst detected: {{ $value | printf \"%.0f\" }} failures in 5 minutes"
          description: >
            More than 10 authentication failures in the last 5 minutes.
            Reason breakdown: {{ $labels.reason }}.
            This may indicate a brute-force attempt or a mass token expiration event.
            Check the security events dashboard and Elasticsearch for details.
          runbook_url: "https://ecip.internal/runbooks/alert-response/SECURITY_ANOMALY.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-security-events"

      - alert: SecurityRBACDenialBurst
        expr: >
          increase(rbac_denial_total[5m]) > 10
        for: 0m
        labels:
          severity: warning
          module: M06
          team: security
        annotations:
          summary: "RBAC denial burst: {{ $value | printf \"%.0f\" }} denials in 5 minutes"
          description: >
            More than 10 RBAC denials in the last 5 minutes for
            resource={{ $labels.resource }}, action={{ $labels.action }}.
            This may indicate misconfigured permissions or
            an unauthorized access attempt.
          runbook_url: "https://ecip.internal/runbooks/alert-response/SECURITY_ANOMALY.md"

      - alert: ServiceAuthFailure
        # Any mTLS rejection at all (> 0) is alert-worthy — no burst threshold.
        expr: >
          increase(auth_failure_total{reason="mtls_rejected"}[5m]) > 0
        for: 0m
        labels:
          severity: critical
          module: M01
          team: security
        annotations:
          summary: "Service-to-service authentication failure (mTLS rejection)"
          description: >
            An mTLS authentication failure was detected. This indicates
            either a certificate misconfiguration or a potential
            man-in-the-middle attempt. Investigate immediately.
          runbook_url: "https://ecip.internal/runbooks/alert-response/SECURITY_ANOMALY.md"
@@ -0,0 +1,61 @@
# =============================================================================
# ECIP Alert: Query Latency SLA Breach
# =============================================================================
# SLA Target: query_duration_ms p95 < 1500ms
# All rules compute p95 via histogram_quantile over _bucket rates — never a
# plain average, since averaging hides tail-latency problems.
# =============================================================================
groups:
  - name: ecip.sla.latency
    rules:
      - alert: QueryLatencySLABreach
        expr: >
          histogram_quantile(0.95,
            sum(rate(query_duration_ms_bucket{job="ecip-query-service"}[5m])) by (le)
          ) > 1500
        for: 5m
        labels:
          severity: critical
          module: M04
          team: query-service
        annotations:
          summary: "Query latency p95 exceeds SLA threshold (1500ms)"
          description: >
            Query service p95 latency is {{ $value | printf "%.0f" }}ms,
            which exceeds the 1500ms SLA threshold.
            This has been firing for more than 5 minutes.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
          dashboard_url: "https://grafana.ecip.internal/d/ecip-query-latency"

      - alert: FilterAuthorizedReposLatency
        # 20ms budget comes from NFR-SEC-011; this RPC sits on the hot query path.
        expr: >
          histogram_quantile(0.95,
            sum(rate(filter_authorized_repos_duration_ms_bucket{job="ecip-registry-service"}[5m])) by (le)
          ) > 20
        for: 5m
        labels:
          severity: warning
          module: M06
          team: registry-service
        annotations:
          summary: "FilterAuthorizedRepos p95 exceeds 20ms (NFR-SEC-011)"
          description: >
            Registry service FilterAuthorizedRepos RPC p95 latency is
            {{ $value | printf "%.1f" }}ms, exceeding the 20ms SLA
            from NFR-SEC-011. This directly impacts the hot query path.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"

      - alert: GRPCRequestLatencyHigh
        # Broad catch-all over every gRPC service/method pair.
        # NOTE(review): unlike the other rules this one carries no `team`
        # label — confirm routing before relying on it for paging.
        expr: >
          histogram_quantile(0.95,
            sum(rate(grpc_request_duration_ms_bucket[5m])) by (le, service, method)
          ) > 500
        for: 5m
        labels:
          severity: warning
          module: M06
        annotations:
          summary: "gRPC request p95 latency > 500ms for {{ $labels.service }}/{{ $labels.method }}"
          description: >
            gRPC p95 latency is {{ $value | printf "%.0f" }}ms for
            {{ $labels.service }}/{{ $labels.method }}.
          runbook_url: "https://ecip.internal/runbooks/alert-response/HIGH_QUERY_LATENCY.md"
@@ -0,0 +1,168 @@
#!/bin/bash
# =============================================================================
# Chaos Test: Kafka Broker Restart
# =============================================================================
# Simulates a Kafka broker restart for M07 Event Bus.
# Validates that:
#   1. DLQDepthWarning/DLQDepthExceeded alerts fire if messages fail
#   2. Event Bus DLQ dashboard reflects message accumulation
#   3. Consumer groups rebalance correctly after broker recovery
#   4. No data loss — messages are replayed from Kafka after recovery
#
# Prerequisites:
#   - kubectl configured with access to the kafka namespace
#   - Kafka StatefulSet running
#   - Prometheus and Alertmanager running
#   - ecip-event-bus consumers running
#
# Usage:
#   ./kafka-broker-restart.sh [namespace] [broker-id]
#   Arguments are positional (defaults: namespace=kafka, broker-id=0).
#   (The previously documented --namespace/--broker-id flags were never
#   parsed; the script has always read $1 and $2.)
# =============================================================================

set -euo pipefail

KAFKA_NAMESPACE="${1:-kafka}"
BROKER_ID="${2:-0}"
BROKER_POD="kafka-$BROKER_ID"

#######################################
# Total depth of the ecip.dlq topic (sum of end offsets across partitions).
# Outputs: depth on stdout; "0" if the broker/topic is unreachable or empty.
#######################################
dlq_depth() {
  local depth
  # `print sum+0` forces a numeric "0" even when awk sees no input, so the
  # callers' $(( ... )) arithmetic never receives an empty string.
  depth=$(kubectl exec -n "$KAFKA_NAMESPACE" "$BROKER_POD" -- \
    kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list localhost:9092 \
    --topic ecip.dlq --time -1 2>/dev/null \
    | awk -F: '{sum += $3} END {print sum+0}') || depth=0
  printf '%s\n' "${depth:-0}"
}

#######################################
# Describe the ECIP consumer group (best-effort; never aborts the test).
#######################################
describe_consumer_group() {
  kubectl exec -n "$KAFKA_NAMESPACE" "$BROKER_POD" -- \
    kafka-consumer-groups.sh --bootstrap-server localhost:9092 \
    --group ecip-event-processors --describe 2>/dev/null \
    || echo "  (consumer group info unavailable)"
}

echo "=========================================="
echo " Chaos Test: Kafka Broker Restart"
echo "=========================================="
echo "  Kafka namespace: $KAFKA_NAMESPACE"
echo "  Target broker:   kafka-$BROKER_ID"
echo "=========================================="

# Verify the target broker pod exists before doing anything destructive.
if ! kubectl get pod -n "$KAFKA_NAMESPACE" "$BROKER_POD" &>/dev/null; then
  echo "ERROR: Pod $BROKER_POD not found in namespace $KAFKA_NAMESPACE"
  exit 1
fi

echo "Target pod: $BROKER_POD"

# Pre-test: capture consumer group state and DLQ depth baseline.
echo ""
echo "[PRE-TEST] Capturing baseline state..."
describe_consumer_group

echo "  Broker $BROKER_ID status: Running"

DLQ_DEPTH_BEFORE=$(dlq_depth)
echo "  DLQ depth before: $DLQ_DEPTH_BEFORE"

# Execute: delete the broker pod (the StatefulSet recreates it).
echo ""
echo "[CHAOS] Deleting Kafka broker pod $BROKER_POD..."
echo "  StatefulSet will recreate it automatically."

kubectl delete pod -n "$KAFKA_NAMESPACE" "$BROKER_POD" --grace-period=0 --force

echo "  Pod deleted. Watching for recreation..."

# Wait (up to TIMEOUT seconds) for the pod to be recreated and Ready.
echo ""
echo "[RECOVERY] Waiting for broker to restart..."

TIMEOUT=120
ELAPSED=0
READY="false"  # pre-initialized so the summary is safe under `set -u`
while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
  STATUS=$(kubectl get pod -n "$KAFKA_NAMESPACE" "$BROKER_POD" \
    -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending")
  READY=$(kubectl get pod -n "$KAFKA_NAMESPACE" "$BROKER_POD" \
    -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")

  echo "  [${ELAPSED}s] Phase: $STATUS, Ready: $READY"

  if [ "$READY" = "true" ]; then
    echo "  Broker is back online!"
    break
  fi

  sleep 10
  ELAPSED=$((ELAPSED + 10))
done

if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
  echo "  ⚠️  WARNING: Broker did not recover within ${TIMEOUT}s"
fi

# Give the consumer group time to rebalance onto the recovered broker.
echo ""
echo "[REBALANCE] Waiting 30s for consumer group rebalance..."
sleep 30

# Post-test: consumer group state and DLQ growth.
echo ""
echo "[POST-TEST] Checking consumer group state..."
describe_consumer_group

DLQ_DEPTH_AFTER=$(dlq_depth)
echo "  DLQ depth after: $DLQ_DEPTH_AFTER"
echo "  New DLQ messages: $((DLQ_DEPTH_AFTER - DLQ_DEPTH_BEFORE))"

# Check whether any DLQ alerts are firing in Prometheus.
echo ""
echo "[ALERTS] Checking for Event Bus alerts..."

PROM_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus \
  -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || PROM_POD=""

if [ -n "$PROM_POD" ]; then
  # NB: `grep -c` prints "0" itself when nothing matches (and exits 1), so
  # the fallback must only ASSIGN a default — the old `|| echo "0"` appended
  # a second "0", corrupting the captured value.
  DLQ_ALERT=$(kubectl exec -n monitoring "$PROM_POD" -- \
    wget -qO- 'http://localhost:9090/api/v1/alerts' 2>/dev/null \
    | grep -c "DLQDepth") || DLQ_ALERT=0

  echo "  DLQ alerts firing: $DLQ_ALERT"
fi

# Verify no data loss — the consumer group should catch back up (lag → 0).
echo ""
echo "[VERIFICATION] Checking for data completeness..."

LAG_TOTAL=$(kubectl exec -n "$KAFKA_NAMESPACE" "$BROKER_POD" -- \
  kafka-consumer-groups.sh --bootstrap-server localhost:9092 \
  --group ecip-event-processors --describe 2>/dev/null \
  | awk 'NR>1 {sum += $5} END {print sum+0}') || LAG_TOTAL="unknown"

echo "  Total consumer lag: $LAG_TOTAL"

# Summary
echo ""
echo "=========================================="
echo " Chaos Test Complete"
echo "=========================================="
echo "  Broker recovery: $( [ "$READY" = "true" ] && echo 'Success ✅' || echo 'FAILED ❌' )"
echo "  DLQ growth:      $((DLQ_DEPTH_AFTER - DLQ_DEPTH_BEFORE)) messages"
echo "  Consumer lag:    $LAG_TOTAL"
echo ""
echo "  Next steps:"
echo "    1. Check Grafana → ECIP → Event Bus DLQ dashboard"
echo "    2. Verify consumer lag returns to 0"
echo "    3. If DLQ messages accumulated, replay with:"
echo "       kubectl exec -n ecip deployment/ecip-event-bus -- \\"
echo "         node scripts/replay-dlq.js --topic ecip.dlq --batch-size 50"
echo "    4. Verify all alerts resolve within 15 minutes"
echo "=========================================="
+ echo "=========================================="