ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,169 @@
1
+ # Dashboard Guide — ECIP Grafana Dashboards
2
+
3
+ > **Audience:** ECIP engineers, SREs, on-call
4
+ > **Location:** Grafana → ECIP folder
5
+
6
+ ---
7
+
8
+ ## Dashboard Inventory
9
+
10
+ | Dashboard | File | Primary Module | Key SLA |
11
+ |---|---|---|---|
12
+ | Query Latency | `query-latency.json` | M04 | p95 < 2000ms |
13
+ | Analysis Throughput | `analysis-throughput.json` | M02 | — |
14
+ | Cache Performance | `cache-performance.json` | M03 | Hit rate > 85% |
15
+ | LSP Daemon Health | `lsp-daemon-health.json` | M02 | — |
16
+ | MCP Call Graph | `mcp-call-graph.json` | M04/M05 | p95 < 500ms |
17
+ | Event Bus DLQ | `event-bus-dlq.json` | M07 | DLQ depth = 0 |
18
+ | Cross-Repo Fan-out | `cross-repo-fanout.json` | M04 | — |
19
+ | Security Events | `security-events.json` | All | — |
20
+
21
+ ---
22
+
23
+ ## Delivery Sequence
24
+
25
+ Dashboards are delivered in module-dependency order, not all at once:
26
+
27
+ 1. **Week 2** — `lsp-daemon-health.json`, `event-bus-dlq.json` (M02, M07 start early)
28
+ 2. **Week 3** — `cache-performance.json`, `analysis-throughput.json` (M03 foundation; M02 analysis engine)
29
+ 3. **Week 4** — `security-events.json` (tied to M08-T08)
30
+ 4. **Week 11** — `query-latency.json`, `mcp-call-graph.json` (M04 comes online)
31
+ 5. **Week 17** — `cross-repo-fanout.json` (M05 comes online)
32
+
33
+ ---
34
+
35
+ ## How Dashboards Are Provisioned
36
+
37
+ Dashboards are stored as JSON files in `dashboards/`. The Grafana sidecar automatically discovers ConfigMaps with label `grafana_dashboard: "1"` and loads them.
38
+
39
+ **To add/modify a dashboard:**
40
+ 1. Edit the JSON file in `dashboards/`
41
+ 2. Commit and push — the Helm chart creates a ConfigMap per dashboard
42
+ 3. Grafana sidecar reloads within 60 seconds
43
+
44
+ **Do not edit dashboards in the Grafana UI** — changes will be overwritten on the next Helm deploy.
45
+
46
+ ---
47
+
48
+ ## Dashboard Details
49
+
50
+ ### Query Latency (`query-latency.json`)
51
+
52
+ **What it shows:** p50, p95, p99 latency for the M04 Query Service.
53
+
54
+ **Key panels:**
55
+ - Latency histogram with SLA threshold lines at 2000ms
56
+ - Error rate percentage
57
+ - Requests per second
58
+
59
+ **Key PromQL:**
60
+ ```promql
61
+ histogram_quantile(0.95, sum(rate(ecip_query_duration_ms_bucket{module="M04"}[5m])) by (le))
62
+ ```
63
+
64
+ **When to look:** During incidents, SLA breach alerts, capacity planning.
65
+
66
+ ---
67
+
68
+ ### Analysis Throughput (`analysis-throughput.json`)
69
+
70
+ **What it shows:** M02 Analysis Engine processing rate and duration.
71
+
72
+ **Key panels:**
73
+ - Analyses per minute
74
+ - p95 analysis duration
75
+ - Error rate
76
+ - Active analyses gauge
77
+
78
+ ---
79
+
80
+ ### Cache Performance (`cache-performance.json`)
81
+
82
+ **What it shows:** M03 Knowledge Store Redis cache hit rates.
83
+
84
+ **Key panels:**
85
+ - Cache hit rate percentage (target: > 85%)
86
+ - Miss rate with type breakdown
87
+ - Eviction rate
88
+ - Memory usage
89
+
90
+ **When to look:** Cache degradation alerts, latency spikes in M04.
91
+
92
+ ---
93
+
94
+ ### LSP Daemon Health (`lsp-daemon-health.json`)
95
+
96
+ **What it shows:** M02 LSP daemon pool status.
97
+
98
+ **Key panels:**
99
+ - Active daemon count
100
+ - Restart rate (alert fires at > 3/5min)
101
+ - OOM kill events
102
+ - Memory per daemon
103
+
104
+ **When to look:** `LSPDaemonRestartRate` or `LSPDaemonOOMKill` alerts.
105
+
106
+ ---
107
+
108
+ ### MCP Call Graph (`mcp-call-graph.json`)
109
+
110
+ **What it shows:** M04/M05 MCP tool call performance.
111
+
112
+ **Key panels:**
113
+ - Call duration by tool
114
+ - Fan-out depth per query
115
+ - Error rate by tool
116
+ - Concurrent call count
117
+
118
+ ---
119
+
120
+ ### Event Bus DLQ (`event-bus-dlq.json`)
121
+
122
+ **What it shows:** M07 Kafka dead-letter queue depth and age.
123
+
124
+ **Key panels:**
125
+ - DLQ message depth (target: 0)
126
+ - Oldest message age
127
+ - DLQ ingestion rate
128
+ - Processing lag
129
+
130
+ **When to look:** `DLQDepthExceeded` alerts, event processing failures.
131
+
132
+ ---
133
+
134
+ ### Cross-Repo Fan-out (`cross-repo-fanout.json`)
135
+
136
+ **What it shows:** M04 cross-repository dependency resolution depth.
137
+
138
+ **Key panels:**
139
+ - Fan-out depth histogram
140
+ - Repos queried per request
141
+ - Timeout rate
142
+
143
+ ---
144
+
145
+ ### Security Events (`security-events.json`)
146
+
147
+ **What it shows:** Auth failures and RBAC denials from Elasticsearch.
148
+
149
+ **Key panels:**
150
+ - Auth failures over time
151
+ - RBAC denials by resource
152
+ - Top denied users (hashed)
153
+ - Geographic distribution of failures
154
+
155
+ **Data source:** Elasticsearch (`ecip-security-events` index)
156
+
157
+ ---
158
+
159
+ ## Ownership Model (Post-Week 28)
160
+
161
+ After the build phase:
162
+ - **Module teams** own panels that relate to their metrics
163
+ - **Platform team** owns infrastructure-level panels (Collector health, Prometheus performance, Tempo storage)
164
+
165
+ When adding new metrics to your module, update the relevant dashboard JSON and submit a PR.
166
+
167
+ ---
168
+
169
+ *Last updated: March 2026 · Platform Team*
@@ -0,0 +1,184 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * ECIP M08 — Dashboard JSON Linter
4
+ *
5
+ * Validates all Grafana dashboard JSON files in dashboards/.
6
+ *
7
+ * Checks:
8
+ * 1. Valid JSON (parseable without error)
9
+ * 2. Required top-level fields: title, uid, panels
10
+ * 3. schemaVersion is present and ≥ 30
11
+ * 4. uid is non-empty and unique across all dashboards
12
+ * 5. Every panel has: title, type, targets (except row type)
13
+ * 6. No panel has an empty targets array (except row/text)
14
+ * 7. Every target on a data panel defines a query field ('expr', 'query', or 'rawSql')
15
+ * 8. Template variables (if present) have a name
16
+ *
17
+ * Exit codes:
18
+ * 0 — all dashboards pass
19
+ * 1 — one or more dashboards have lint errors
20
+ *
21
+ * Usage:
22
+ * node scripts/lint-dashboards.js
23
+ * node scripts/lint-dashboards.js dashboards/query-latency.json
24
+ */
25
+
26
+ 'use strict';
27
+
28
+ const fs = require('fs');
29
+ const path = require('path');
30
+
31
+ const DASHBOARDS_DIR = path.resolve(__dirname, '..', 'dashboards');
32
+ const REQUIRED_FIELDS = ['title', 'uid', 'panels'];
33
+ const PANEL_TYPES_NO_TARGETS = new Set(['row', 'text', 'news', 'dashlist']);
34
+ const MIN_SCHEMA_VERSION = 30;
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Lint a single dashboard
38
+ // ---------------------------------------------------------------------------
39
+
40
/**
 * Lint a single Grafana dashboard JSON file.
 *
 * Malformed input is reported as lint errors rather than thrown: an
 * unreadable file, invalid JSON, a non-object JSON root, non-object panels
 * or targets, and a non-array `templating.list` each add an entry to
 * `errors` so the remaining dashboards still get linted.
 *
 * @param {string} filePath - Path to the dashboard JSON file.
 * @returns {{filename: string, errors: string[], uid: *}} The file's base
 *   name, every lint error found, and the dashboard's `uid` exactly as read
 *   from the file (`null` when the file could not be parsed into an object).
 */
function lintDashboard(filePath) {
  const errors = [];
  const filename = path.basename(filePath);
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  // 1. Valid JSON
  let dashboard;
  try {
    const raw = fs.readFileSync(filePath, 'utf8');
    dashboard = JSON.parse(raw);
  } catch (e) {
    errors.push(`${filename}: Invalid JSON — ${e.message}`);
    return { filename, errors, uid: null };
  }

  // JSON.parse also accepts `null`, scalars and arrays. The `in` operator
  // below throws a TypeError on null/primitives, so reject those roots here.
  if (!isPlainObject(dashboard)) {
    errors.push(`${filename}: Top-level JSON value is not an object`);
    return { filename, errors, uid: null };
  }

  // 2. Required top-level fields
  for (const field of REQUIRED_FIELDS) {
    if (!(field in dashboard)) {
      errors.push(`${filename}: Missing required field '${field}'`);
    }
  }

  // 3. schemaVersion
  if (dashboard.schemaVersion == null) {
    errors.push(`${filename}: Missing 'schemaVersion'`);
  } else if (dashboard.schemaVersion < MIN_SCHEMA_VERSION) {
    errors.push(`${filename}: schemaVersion ${dashboard.schemaVersion} is below minimum (${MIN_SCHEMA_VERSION})`);
  }

  // 4. uid (cross-file uniqueness is checked by the caller)
  const uid = dashboard.uid;
  if (!uid || typeof uid !== 'string' || uid.trim() === '') {
    errors.push(`${filename}: 'uid' is empty or missing`);
  }

  // 5. Panel validation
  const panels = dashboard.panels || [];
  if (!Array.isArray(panels)) {
    errors.push(`${filename}: 'panels' is not an array`);
  } else {
    panels.forEach((panel, i) => {
      const label = `${filename} → panel[${i}]`;

      // A null or scalar entry would throw on property access below.
      if (!isPlainObject(panel)) {
        errors.push(`${label}: Panel is not an object`);
        return;
      }

      if (!panel.title) {
        errors.push(`${label}: Missing 'title'`);
      }
      if (!panel.type) {
        errors.push(`${label}: Missing 'type'`);
      }

      // 6. Targets check (skipped for panel types that carry no queries)
      if (panel.type && !PANEL_TYPES_NO_TARGETS.has(panel.type)) {
        if (!panel.targets || !Array.isArray(panel.targets)) {
          errors.push(`${label} (${panel.title || 'untitled'}): Missing 'targets' array`);
        } else if (panel.targets.length === 0) {
          errors.push(`${label} (${panel.title || 'untitled'}): 'targets' array is empty`);
        } else {
          // 7. Every target must carry a query field
          panel.targets.forEach((target, ti) => {
            if (!isPlainObject(target)) {
              errors.push(`${label} → target[${ti}]: Target is not an object`);
              return;
            }
            if (target.expr === undefined && target.query === undefined && target.rawSql === undefined) {
              errors.push(`${label} → target[${ti}]: No 'expr', 'query', or 'rawSql' field`);
            }
          });
        }
      }
    });
  }

  // 8. Template variables
  const templating = dashboard.templating;
  if (templating && templating.list) {
    if (!Array.isArray(templating.list)) {
      errors.push(`${filename}: 'templating.list' is not an array`);
    } else {
      templating.list.forEach((tVar, i) => {
        // A null entry counts as a variable without a name.
        if (!tVar || !tVar.name) {
          errors.push(`${filename} → templating[${i}]: Missing variable 'name'`);
        }
      });
    }
  }

  return { filename, errors, uid };
}
118
+
119
+ // ---------------------------------------------------------------------------
120
+ // Main
121
+ // ---------------------------------------------------------------------------
122
+
123
/**
 * CLI entry point.
 *
 * Lints the files named on the command line, or every `*.json` file in
 * DASHBOARDS_DIR when none are given, then checks UID uniqueness across the
 * results, prints a per-file report and exits with 0 (all pass / nothing to
 * lint) or 1 (lint errors, or the dashboards directory is missing).
 */
function main() {
  // Resolve the list of files to lint.
  const cliArgs = process.argv.slice(2);
  let files;
  if (cliArgs.length > 0) {
    files = cliArgs.map((arg) => path.resolve(arg));
  } else {
    if (!fs.existsSync(DASHBOARDS_DIR)) {
      console.error(`Dashboard directory not found: ${DASHBOARDS_DIR}`);
      process.exit(1);
    }
    files = [];
    for (const entry of fs.readdirSync(DASHBOARDS_DIR)) {
      if (entry.endsWith('.json')) {
        files.push(path.join(DASHBOARDS_DIR, entry));
      }
    }
  }

  if (files.length === 0) {
    console.log('No dashboard JSON files found.');
    process.exit(0);
  }

  console.log(`\n📊 ECIP M08 — Dashboard Lint`);
  console.log(` Checking ${files.length} dashboard(s)...\n`);

  const allResults = files.map((file) => lintDashboard(file));

  // UID uniqueness: the first file seen with a UID owns it; later files
  // reusing that UID get an error attached.
  const firstOwnerByUid = new Map();
  for (const result of allResults) {
    if (!result.uid) {
      continue;
    }
    if (!firstOwnerByUid.has(result.uid)) {
      firstOwnerByUid.set(result.uid, result.filename);
      continue;
    }
    result.errors.push(
      `${result.filename}: Duplicate UID '${result.uid}' — also used by ${firstOwnerByUid.get(result.uid)}`,
    );
  }

  // Per-file report, tallying totals for the summary line.
  let totalErrors = 0;
  let failingFiles = 0;
  for (const { filename, errors } of allResults) {
    if (errors.length === 0) {
      console.log(` ✅ ${filename}`);
      continue;
    }
    console.log(` ❌ ${filename} — ${errors.length} error(s)`);
    for (const message of errors) {
      console.log(` ${message}`);
    }
    totalErrors += errors.length;
    failingFiles += 1;
  }

  console.log('');
  const failed = totalErrors > 0;
  console.log(
    failed
      ? `❌ ${totalErrors} lint error(s) across ${failingFiles} file(s).\n`
      : `✅ All ${files.length} dashboard(s) pass lint checks.\n`,
  );
  process.exit(failed ? 1 : 0);
}
183
+
184
+ main();
@@ -0,0 +1,46 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Grafana Tempo Datasource Provisioning
3
+ # =============================================================================
4
+ # Auto-provisions Tempo as a Grafana datasource via sidecar.
5
+ # =============================================================================
6
+ apiVersion: 1
7
+
8
+ datasources:
9
+ - name: Tempo
10
+ type: tempo
11
+ access: proxy
12
+ url: http://tempo.monitoring:3200
13
+ uid: tempo
14
+ isDefault: false
15
+ editable: true
16
+ jsonData:
17
+ httpMethod: GET
18
+ tracesToMetrics:
19
+ datasourceUid: prometheus
20
+ tags:
21
+ - key: service.name
22
+ value: service
23
+ - key: ecip.module
24
+ value: module
25
+ tracesToLogs:
26
+ datasourceUid: loki
27
+ tags:
28
+ - key: trace_id
29
+ mappedTags:
30
+ - key: service.name
31
+ value: service_name
32
+ - key: ecip.module
33
+ value: module
34
+ mapTagNamesEnabled: true
35
+ spanStartTimeShift: "-1h"
36
+ spanEndTimeShift: "1h"
37
+ filterByTraceID: true
38
+ filterBySpanID: false
39
+ nodeGraph:
40
+ enabled: true
41
+ serviceMap:
42
+ datasourceUid: prometheus
43
+ search:
44
+ hide: false
45
+ lokiSearch:
46
+ datasourceUid: loki
@@ -0,0 +1,94 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Grafana Tempo Helm Values
3
+ # =============================================================================
4
+ # S3-backed trace storage with 14-day retention.
5
+ # Microservices mode for production scalability.
6
+ # =============================================================================
7
+
8
+ tempo:
9
+ # Microservices mode — ingester, compactor and distributor are scaled separately (see below); multi-tenancy stays disabled
10
+ multitenancyEnabled: false
11
+
12
+ storage:
13
+ trace:
14
+ backend: s3
15
+ s3:
16
+ bucket: ecip-tempo-traces
17
+ endpoint: s3.amazonaws.com
18
+ region: us-east-1
19
+ # Credentials from K8s secret (injected via Helm)
20
+ access_key: ${TEMPO_S3_ACCESS_KEY}
21
+ secret_key: ${TEMPO_S3_SECRET_KEY}
22
+
23
+ retention:
24
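+ # 14-day trace retention — per design doc. NOTE(review): Tempo's trace retention is set by the compactor's 'block_retention'; 'compacted_block_retention' only controls how long already-compacted blocks are kept before deletion — confirm the key below is the intended one.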
25
+ max_block_duration: 1h
26
+ max_compaction_objects: 6000000
27
+ compaction:
28
+ compacted_block_retention: 336h # 14 days
29
+
30
+ server:
31
+ http_listen_port: 3200
32
+ grpc_listen_port: 9095
33
+
34
+ # OTLP receiver for traces from OTel Collector
35
+ receivers:
36
+ otlp:
37
+ protocols:
38
+ grpc:
39
+ endpoint: 0.0.0.0:4317
40
+ http:
41
+ endpoint: 0.0.0.0:4318
42
+
43
+ # Query frontend for Grafana
44
+ query_frontend:
45
+ search:
46
+ max_duration: 168h # Allow searching up to 7 days back
47
+ default_result_limit: 20
48
+
49
+ # Resource limits
50
+ resources:
51
+ requests:
52
+ cpu: 500m
53
+ memory: 1Gi
54
+ limits:
55
+ cpu: 2000m
56
+ memory: 4Gi
57
+
58
+ persistence:
59
+ enabled: true
60
+ size: 50Gi
61
+ storageClassName: standard
62
+
63
+ # Ingester configuration
64
+ ingester:
65
+ replicas: 2
66
+ resources:
67
+ requests:
68
+ cpu: 500m
69
+ memory: 1Gi
70
+ limits:
71
+ cpu: 1000m
72
+ memory: 2Gi
73
+
74
+ # Compactor — merges and deduplicates blocks
75
+ compactor:
76
+ replicas: 1
77
+ resources:
78
+ requests:
79
+ cpu: 250m
80
+ memory: 512Mi
81
+ limits:
82
+ cpu: 500m
83
+ memory: 1Gi
84
+
85
+ # Distributor — receives spans and distributes to ingesters
86
+ distributor:
87
+ replicas: 2
88
+ resources:
89
+ requests:
90
+ cpu: 250m
91
+ memory: 512Mi
92
+ limits:
93
+ cpu: 1000m
94
+ memory: 1Gi