ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,160 @@
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "description": "ECIP Query Service (M04) — p50/p95/p99 latency per mode (lsp/vector/hybrid)",
16
+ "editable": true,
17
+ "gnetId": null,
18
+ "graphTooltip": 1,
19
+ "id": null,
20
+ "links": [],
21
+ "panels": [
22
+ {
23
+ "title": "Query Latency Overview",
24
+ "type": "row",
25
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
26
+ "collapsed": false
27
+ },
28
+ {
29
+ "title": "Query Duration p50 / p95 / p99",
30
+ "type": "timeseries",
31
+ "datasource": "Prometheus",
32
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
33
+ "fieldConfig": {
34
+ "defaults": {
35
+ "unit": "ms",
36
+ "thresholds": {
37
+ "mode": "absolute",
38
+ "steps": [
39
+ { "color": "green", "value": null },
40
+ { "color": "yellow", "value": 1000 },
41
+ { "color": "red", "value": 1500 }
42
+ ]
43
+ }
44
+ }
45
+ },
46
+ "targets": [
47
+ {
48
+ "expr": "histogram_quantile(0.50, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\"}[5m])) by (le))",
49
+ "legendFormat": "p50"
50
+ },
51
+ {
52
+ "expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\"}[5m])) by (le))",
53
+ "legendFormat": "p95"
54
+ },
55
+ {
56
+ "expr": "histogram_quantile(0.99, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\"}[5m])) by (le))",
57
+ "legendFormat": "p99"
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "title": "Query Duration by Mode",
63
+ "type": "timeseries",
64
+ "datasource": "Prometheus",
65
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
66
+ "fieldConfig": {
67
+ "defaults": { "unit": "ms" }
68
+ },
69
+ "targets": [
70
+ {
71
+ "expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\"}[5m])) by (le, mode))",
72
+ "legendFormat": "p95 — {{mode}}"
73
+ }
74
+ ]
75
+ },
76
+ {
77
+ "title": "Query Rate (requests/sec)",
78
+ "type": "timeseries",
79
+ "datasource": "Prometheus",
80
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
81
+ "fieldConfig": {
82
+ "defaults": { "unit": "reqps" }
83
+ },
84
+ "targets": [
85
+ {
86
+ "expr": "sum(rate(query_duration_ms_count{job=\"ecip-query-service\"}[5m])) by (mode)",
87
+ "legendFormat": "{{mode}}"
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "title": "Query Error Rate",
93
+ "type": "timeseries",
94
+ "datasource": "Prometheus",
95
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
96
+ "fieldConfig": {
97
+ "defaults": { "unit": "percentunit" }
98
+ },
99
+ "targets": [
100
+ {
101
+ "expr": "sum(rate(query_duration_ms_count{job=\"ecip-query-service\",status_code=~\"5..\"}[5m])) / sum(rate(query_duration_ms_count{job=\"ecip-query-service\"}[5m]))",
102
+ "legendFormat": "Error Rate"
103
+ }
104
+ ]
105
+ },
106
+ {
107
+ "title": "Query Latency by Repo (p95)",
108
+ "type": "table",
109
+ "datasource": "Prometheus",
110
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
111
+ "targets": [
112
+ {
113
+ "expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\"}[5m])) by (le, repo))",
114
+ "legendFormat": "{{repo}}",
115
+ "format": "table",
116
+ "instant": true
117
+ }
118
+ ]
119
+ },
120
+ {
121
+ "title": "Cached vs Uncached Query Latency (p95)",
122
+ "type": "timeseries",
123
+ "datasource": "Prometheus",
124
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 25 },
125
+ "fieldConfig": {
126
+ "defaults": { "unit": "ms" }
127
+ },
128
+ "targets": [
129
+ {
130
+ "expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",cached=\"true\"}[5m])) by (le))",
131
+ "legendFormat": "p95 — cached"
132
+ },
133
+ {
134
+ "expr": "histogram_quantile(0.95, sum(rate(query_duration_ms_bucket{job=\"ecip-query-service\",cached=\"false\"}[5m])) by (le))",
135
+ "legendFormat": "p95 — uncached"
136
+ }
137
+ ]
138
+ }
139
+ ],
140
+ "refresh": "30s",
141
+ "schemaVersion": 39,
142
+ "tags": ["ecip", "m04", "query-service", "latency"],
143
+ "templating": {
144
+ "list": [
145
+ {
146
+ "name": "repo",
147
+ "type": "query",
148
+ "datasource": "Prometheus",
149
+ "query": "label_values(query_duration_ms_bucket, repo)",
150
+ "refresh": 2,
151
+ "includeAll": true,
152
+ "multi": true
153
+ }
154
+ ]
155
+ },
156
+ "time": { "from": "now-1h", "to": "now" },
157
+ "title": "ECIP — Query Latency",
158
+ "uid": "ecip-query-latency",
159
+ "version": 1
160
+ }
@@ -0,0 +1,131 @@
1
+ {
2
+ "description": "ECIP Security Events — Auth failures, RBAC denials (metrics from Prometheus, event log from Elasticsearch)",
3
+ "editable": true,
4
+ "panels": [
5
+ {
6
+ "title": "Security Events",
7
+ "type": "row",
8
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
9
+ "collapsed": false
10
+ },
11
+ {
12
+ "title": "Auth Failures (last 24h)",
13
+ "type": "stat",
14
+ "datasource": "Prometheus",
15
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 },
16
+ "fieldConfig": {
17
+ "defaults": {
18
+ "unit": "short",
19
+ "thresholds": {
20
+ "mode": "absolute",
21
+ "steps": [
22
+ { "color": "green", "value": null },
23
+ { "color": "yellow", "value": 50 },
24
+ { "color": "red", "value": 100 }
25
+ ]
26
+ }
27
+ }
28
+ },
29
+ "targets": [
30
+ {
31
+ "expr": "sum(increase(auth_failure_total[24h]))",
32
+ "legendFormat": "Total Auth Failures"
33
+ }
34
+ ]
35
+ },
36
+ {
37
+ "title": "RBAC Denials (last 24h)",
38
+ "type": "stat",
39
+ "datasource": "Prometheus",
40
+ "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 },
41
+ "fieldConfig": {
42
+ "defaults": {
43
+ "unit": "short",
44
+ "thresholds": {
45
+ "mode": "absolute",
46
+ "steps": [
47
+ { "color": "green", "value": null },
48
+ { "color": "yellow", "value": 50 },
49
+ { "color": "red", "value": 100 }
50
+ ]
51
+ }
52
+ }
53
+ },
54
+ "targets": [
55
+ {
56
+ "expr": "sum(increase(rbac_denial_total[24h]))",
57
+ "legendFormat": "Total RBAC Denials"
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "title": "Security Alert Status",
63
+ "type": "stat",
64
+ "datasource": "Prometheus",
65
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 },
66
+ "fieldConfig": {
67
+ "defaults": {
68
+ "mappings": [
69
+ { "type": "value", "options": { "0": { "text": "OK", "color": "green" }, "1": { "text": "FIRING", "color": "red" } } }
70
+ ]
71
+ }
72
+ },
73
+ "targets": [
74
+ {
75
+ "expr": "ALERTS{alertname=~\"SecurityAuth.*|SecurityRBAC.*\"}",
76
+ "legendFormat": "{{alertname}}"
77
+ }
78
+ ]
79
+ },
80
+ {
81
+ "title": "Auth Failures Over Time",
82
+ "type": "timeseries",
83
+ "datasource": "Prometheus",
84
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
85
+ "fieldConfig": {
86
+ "defaults": { "unit": "short" }
87
+ },
88
+ "targets": [
89
+ {
90
+ "expr": "sum(rate(auth_failure_total[5m])) by (reason)",
91
+ "legendFormat": "{{reason}}"
92
+ }
93
+ ]
94
+ },
95
+ {
96
+ "title": "RBAC Denials Over Time",
97
+ "type": "timeseries",
98
+ "datasource": "Prometheus",
99
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
100
+ "fieldConfig": {
101
+ "defaults": { "unit": "short" }
102
+ },
103
+ "targets": [
104
+ {
105
+ "expr": "sum(rate(rbac_denial_total[5m])) by (resource, action)",
106
+ "legendFormat": "{{resource}}/{{action}}"
107
+ }
108
+ ]
109
+ },
110
+ {
111
+ "title": "Security Events Log (Elasticsearch)",
112
+ "type": "logs",
113
+ "datasource": "Elasticsearch",
114
+ "gridPos": { "h": 12, "w": 24, "x": 0, "y": 17 },
115
+ "targets": [
116
+ {
117
+ "query": "event.category:authentication OR event.category:authorization",
118
+ "metrics": [{ "type": "logs", "id": "1" }],
119
+ "timeField": "@timestamp"
120
+ }
121
+ ]
122
+ }
123
+ ],
124
+ "refresh": "1m",
125
+ "schemaVersion": 39,
126
+ "tags": ["ecip", "security", "auth", "rbac"],
127
+ "time": { "from": "now-24h", "to": "now" },
128
+ "title": "ECIP — Security Events",
129
+ "uid": "ecip-security-events",
130
+ "version": 1
131
+ }