ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
{
|
|
2
|
+
"annotations": {
|
|
3
|
+
"list": [
|
|
4
|
+
{
|
|
5
|
+
"builtIn": 1,
|
|
6
|
+
"datasource": "-- Grafana --",
|
|
7
|
+
"enable": true,
|
|
8
|
+
"hide": true,
|
|
9
|
+
"iconColor": "rgba(0, 211, 255, 1)",
|
|
10
|
+
"name": "Annotations & Alerts",
|
|
11
|
+
"type": "dashboard"
|
|
12
|
+
}
|
|
13
|
+
]
|
|
14
|
+
},
|
|
15
|
+
"description": "ECIP Query Service (M04) — p50/p95/p99 query latency, with p95 breakdowns by mode (lsp/vector/hybrid), repo, and cache status, plus request and error rates",
|
|
16
|
+
"editable": true,
|
|
17
|
+
"gnetId": null,
|
|
18
|
+
"graphTooltip": 1,
|
|
19
|
+
"id": null,
|
|
20
|
+
"links": [],
|
|
21
|
+
"panels": [
|
|
22
|
+
{
|
|
23
|
+
"title": "Query Latency Overview",
|
|
24
|
+
"type": "row",
|
|
25
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
26
|
+
"collapsed": false
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"title": "Query Duration p50 / p95 / p99",
|
|
30
|
+
"type": "timeseries",
|
|
31
|
+
"datasource": "Prometheus",
|
|
32
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
33
|
+
"fieldConfig": {
|
|
34
|
+
"defaults": {
|
|
35
|
+
"unit": "ms",
|
|
36
|
+
"thresholds": {
|
|
37
|
+
"mode": "absolute",
|
|
38
|
+
"steps": [
|
|
39
|
+
{ "color": "green", "value": null },
|
|
40
|
+
{ "color": "yellow", "value": 1000 },
|
|
41
|
+
{ "color": "red", "value": 1500 }
|
|
42
|
+
]
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"targets": [
|
|
47
|
+
{
|
|
48
|
+
"expr": "histogram_quantile(0.50, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (le))",
|
|
49
|
+
"legendFormat": "p50"
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (le))",
|
|
53
|
+
"legendFormat": "p95"
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"expr": "histogram_quantile(0.99, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (le))",
|
|
57
|
+
"legendFormat": "p99"
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"title": "Query Duration by Mode",
|
|
63
|
+
"type": "timeseries",
|
|
64
|
+
"datasource": "Prometheus",
|
|
65
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
66
|
+
"fieldConfig": {
|
|
67
|
+
"defaults": { "unit": "ms" }
|
|
68
|
+
},
|
|
69
|
+
"targets": [
|
|
70
|
+
{
|
|
71
|
+
"expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (le, mode))",
|
|
72
|
+
"legendFormat": "p95 — {{mode}}"
|
|
73
|
+
}
|
|
74
|
+
]
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
"title": "Query Rate (requests/sec)",
|
|
78
|
+
"type": "timeseries",
|
|
79
|
+
"datasource": "Prometheus",
|
|
80
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
81
|
+
"fieldConfig": {
|
|
82
|
+
"defaults": { "unit": "reqps" }
|
|
83
|
+
},
|
|
84
|
+
"targets": [
|
|
85
|
+
{
|
|
86
|
+
"expr": "sum(rate(query_duration_ms_count{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (mode)",
|
|
87
|
+
"legendFormat": "{{mode}}"
|
|
88
|
+
}
|
|
89
|
+
]
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"title": "Query Error Rate",
|
|
93
|
+
"type": "timeseries",
|
|
94
|
+
"datasource": "Prometheus",
|
|
95
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
96
|
+
"fieldConfig": {
|
|
97
|
+
"defaults": { "unit": "percentunit" }
|
|
98
|
+
},
|
|
99
|
+
"targets": [
|
|
100
|
+
{
|
|
101
|
+
"expr": "sum(rate(query_duration_ms_count{job=\"ecip-query-service\",repo=~\"$repo\",status_code=~\"5..\"}[5m])) / sum(rate(query_duration_ms_count{job=\"ecip-query-service\",repo=~\"$repo\"}[5m]))",
|
|
102
|
+
"legendFormat": "Error Rate"
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"title": "Query Latency by Repo (p95)",
|
|
108
|
+
"type": "table",
|
|
109
|
+
"datasource": "Prometheus",
|
|
110
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
|
|
111
|
+
"targets": [
|
|
112
|
+
{
|
|
113
|
+
"expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",repo=~\"$repo\"}[5m])) by (le, repo))",
|
|
114
|
+
"legendFormat": "{{repo}}",
|
|
115
|
+
"format": "table",
|
|
116
|
+
"instant": true
|
|
117
|
+
}
|
|
118
|
+
]
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"title": "Cached vs Uncached Query Latency (p95)",
|
|
122
|
+
"type": "timeseries",
|
|
123
|
+
"datasource": "Prometheus",
|
|
124
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
|
|
125
|
+
"fieldConfig": {
|
|
126
|
+
"defaults": { "unit": "ms" }
|
|
127
|
+
},
|
|
128
|
+
"targets": [
|
|
129
|
+
{
|
|
130
|
+
"expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",cached=\"true\",repo=~\"$repo\"}[5m])) by (le))",
|
|
131
|
+
"legendFormat": "p95 — cached"
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
"expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",cached=\"false\",repo=~\"$repo\"}[5m])) by (le))",
|
|
135
|
+
"legendFormat": "p95 — uncached"
|
|
136
|
+
}
|
|
137
|
+
]
|
|
138
|
+
}
|
|
139
|
+
],
|
|
140
|
+
"refresh": "30s",
|
|
141
|
+
"schemaVersion": 39,
|
|
142
|
+
"tags": ["ecip", "m04", "query-service", "latency"],
|
|
143
|
+
"templating": {
|
|
144
|
+
"list": [
|
|
145
|
+
{
|
|
146
|
+
"name": "repo",
|
|
147
|
+
"type": "query",
|
|
148
|
+
"datasource": "Prometheus",
|
|
149
|
+
"query": "label_values(query_duration_ms_bucket, repo)",
|
|
150
|
+
"refresh": 2,
|
|
151
|
+
"includeAll": true,
|
|
152
|
+
"multi": true
|
|
153
|
+
}
|
|
154
|
+
]
|
|
155
|
+
},
|
|
156
|
+
"time": { "from": "now-1h", "to": "now" },
|
|
157
|
+
"title": "ECIP — Query Latency",
|
|
158
|
+
"uid": "ecip-query-latency",
|
|
159
|
+
"version": 1
|
|
160
|
+
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Security Events — Auth failures and RBAC denials (Prometheus metrics, with the event log from Elasticsearch)",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "Security Events",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "Auth Failures (last 24h)",
|
|
13
|
+
"type": "stat",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "short",
|
|
19
|
+
"thresholds": {
|
|
20
|
+
"mode": "absolute",
|
|
21
|
+
"steps": [
|
|
22
|
+
{ "color": "green", "value": null },
|
|
23
|
+
{ "color": "yellow", "value": 50 },
|
|
24
|
+
{ "color": "red", "value": 100 }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"targets": [
|
|
30
|
+
{
|
|
31
|
+
"expr": "sum(increase(auth_failure_total[24h]))",
|
|
32
|
+
"legendFormat": "Total Auth Failures"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"title": "RBAC Denials (last 24h)",
|
|
38
|
+
"type": "stat",
|
|
39
|
+
"datasource": "Prometheus",
|
|
40
|
+
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 },
|
|
41
|
+
"fieldConfig": {
|
|
42
|
+
"defaults": {
|
|
43
|
+
"unit": "short",
|
|
44
|
+
"thresholds": {
|
|
45
|
+
"mode": "absolute",
|
|
46
|
+
"steps": [
|
|
47
|
+
{ "color": "green", "value": null },
|
|
48
|
+
{ "color": "yellow", "value": 50 },
|
|
49
|
+
{ "color": "red", "value": 100 }
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
"targets": [
|
|
55
|
+
{
|
|
56
|
+
"expr": "sum(increase(rbac_denial_total[24h]))",
|
|
57
|
+
"legendFormat": "Total RBAC Denials"
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"title": "Security Alert Status",
|
|
63
|
+
"type": "stat",
|
|
64
|
+
"datasource": "Prometheus",
|
|
65
|
+
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 },
|
|
66
|
+
"fieldConfig": {
|
|
67
|
+
"defaults": {
|
|
68
|
+
"mappings": [
|
|
69
|
+
{ "type": "value", "options": { "0": { "text": "OK", "color": "green" }, "1": { "text": "FIRING", "color": "red" } } }
|
|
70
|
+
]
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
"targets": [
|
|
74
|
+
{
|
|
75
|
+
"expr": "ALERTS{alertname=~\"SecurityAuth.*|SecurityRBAC.*\",alertstate=\"firing\"} or on() vector(0)",
|
|
76
|
+
"legendFormat": "{{alertname}}"
|
|
77
|
+
}
|
|
78
|
+
]
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"title": "Auth Failures Over Time",
|
|
82
|
+
"type": "timeseries",
|
|
83
|
+
"datasource": "Prometheus",
|
|
84
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
85
|
+
"fieldConfig": {
|
|
86
|
+
"defaults": { "unit": "short" }
|
|
87
|
+
},
|
|
88
|
+
"targets": [
|
|
89
|
+
{
|
|
90
|
+
"expr": "sum(rate(auth_failure_total[5m])) by (reason)",
|
|
91
|
+
"legendFormat": "{{reason}}"
|
|
92
|
+
}
|
|
93
|
+
]
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"title": "RBAC Denials Over Time",
|
|
97
|
+
"type": "timeseries",
|
|
98
|
+
"datasource": "Prometheus",
|
|
99
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
100
|
+
"fieldConfig": {
|
|
101
|
+
"defaults": { "unit": "short" }
|
|
102
|
+
},
|
|
103
|
+
"targets": [
|
|
104
|
+
{
|
|
105
|
+
"expr": "sum(rate(rbac_denial_total[5m])) by (resource, action)",
|
|
106
|
+
"legendFormat": "{{resource}}/{{action}}"
|
|
107
|
+
}
|
|
108
|
+
]
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"title": "Security Events Log (Elasticsearch)",
|
|
112
|
+
"type": "logs",
|
|
113
|
+
"datasource": "Elasticsearch",
|
|
114
|
+
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 17 },
|
|
115
|
+
"targets": [
|
|
116
|
+
{
|
|
117
|
+
"query": "event.category:authentication OR event.category:authorization",
|
|
118
|
+
"metrics": [{ "type": "logs", "id": "1" }],
|
|
119
|
+
"timeField": "@timestamp"
|
|
120
|
+
}
|
|
121
|
+
]
|
|
122
|
+
}
|
|
123
|
+
],
|
|
124
|
+
"refresh": "1m",
|
|
125
|
+
"schemaVersion": 39,
|
|
126
|
+
"tags": ["ecip", "security", "auth", "rbac"],
|
|
127
|
+
"time": { "from": "now-24h", "to": "now" },
|
|
128
|
+
"title": "ECIP — Security Events",
|
|
129
|
+
"uid": "ecip-security-events",
|
|
130
|
+
"version": 1
|
|
131
|
+
}
|