ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Knowledge Store (M03) — Cache hit rate by type and repo, Redis/pgvector performance",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "Cache Performance",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "Cache Hit Rate",
|
|
13
|
+
"type": "gauge",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "percentunit",
|
|
19
|
+
"min": 0,
|
|
20
|
+
"max": 1,
|
|
21
|
+
"thresholds": {
|
|
22
|
+
"mode": "absolute",
|
|
23
|
+
"steps": [
|
|
24
|
+
{ "color": "red", "value": null },
|
|
25
|
+
{ "color": "yellow", "value": 0.6 },
|
|
26
|
+
{ "color": "green", "value": 0.8 }
|
|
27
|
+
]
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"targets": [
|
|
32
|
+
{
|
|
33
|
+
"expr": "cache_hit_rate{job=\"ecip-knowledge-store\"}",
|
|
34
|
+
"legendFormat": "{{cache_type}} — {{repo}}"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"title": "Cache Hit Rate Over Time",
|
|
40
|
+
"type": "timeseries",
|
|
41
|
+
"datasource": "Prometheus",
|
|
42
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
43
|
+
"fieldConfig": {
|
|
44
|
+
"defaults": { "unit": "percentunit" }
|
|
45
|
+
},
|
|
46
|
+
"targets": [
|
|
47
|
+
{
|
|
48
|
+
"expr": "cache_hit_rate{job=~\"ecip-knowledge-store|ecip-query-service\"}",
|
|
49
|
+
"legendFormat": "{{cache_type}} — {{repo}}"
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"title": "Knowledge Store Write Duration (p95)",
|
|
55
|
+
"type": "timeseries",
|
|
56
|
+
"datasource": "Prometheus",
|
|
57
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
58
|
+
"fieldConfig": {
|
|
59
|
+
"defaults": {
|
|
60
|
+
"unit": "ms",
|
|
61
|
+
"thresholds": {
|
|
62
|
+
"mode": "absolute",
|
|
63
|
+
"steps": [
|
|
64
|
+
{ "color": "green", "value": null },
|
|
65
|
+
{ "color": "yellow", "value": 150 },
|
|
66
|
+
{ "color": "red", "value": 200 }
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
"targets": [
|
|
72
|
+
{
|
|
73
|
+
"expr": "histogram_quantile(0.95, sum(rate(knowledge_store_write_duration_ms_bucket[5m])) by (le, store_type))",
|
|
74
|
+
"legendFormat": "p95 — {{store_type}}"
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"title": "pgvector Write Duration (p95)",
|
|
80
|
+
"type": "timeseries",
|
|
81
|
+
"datasource": "Prometheus",
|
|
82
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
83
|
+
"fieldConfig": {
|
|
84
|
+
"defaults": { "unit": "ms" }
|
|
85
|
+
},
|
|
86
|
+
"targets": [
|
|
87
|
+
{
|
|
88
|
+
"expr": "histogram_quantile(0.95, sum(rate(knowledge_store_write_duration_ms_bucket{store_type=\"pgvector\"}[5m])) by (le, namespace))",
|
|
89
|
+
"legendFormat": "p95 — {{namespace}}"
|
|
90
|
+
}
|
|
91
|
+
]
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"title": "HNSW Rebuild Duration (p95)",
|
|
95
|
+
"type": "timeseries",
|
|
96
|
+
"datasource": "Prometheus",
|
|
97
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
|
|
98
|
+
"fieldConfig": {
|
|
99
|
+
"defaults": { "unit": "ms" }
|
|
100
|
+
},
|
|
101
|
+
"targets": [
|
|
102
|
+
{
|
|
103
|
+
"expr": "histogram_quantile(0.95, sum(rate(hnsw_rebuild_duration_ms_bucket[5m])) by (le, repo))",
|
|
104
|
+
"legendFormat": "{{repo}}"
|
|
105
|
+
}
|
|
106
|
+
]
|
|
107
|
+
}
|
|
108
|
+
],
|
|
109
|
+
"refresh": "30s",
|
|
110
|
+
"schemaVersion": 39,
|
|
111
|
+
"tags": ["ecip", "m03", "knowledge-store", "cache"],
|
|
112
|
+
"templating": {
|
|
113
|
+
"list": [
|
|
114
|
+
{
|
|
115
|
+
"name": "repo",
|
|
116
|
+
"type": "query",
|
|
117
|
+
"datasource": "Prometheus",
|
|
118
|
+
"query": "label_values(cache_hit_rate, repo)",
|
|
119
|
+
"refresh": 2,
|
|
120
|
+
"includeAll": true,
|
|
121
|
+
"multi": true
|
|
122
|
+
}
|
|
123
|
+
]
|
|
124
|
+
},
|
|
125
|
+
"time": { "from": "now-1h", "to": "now" },
|
|
126
|
+
"title": "ECIP — Cache Performance",
|
|
127
|
+
"uid": "ecip-cache-performance",
|
|
128
|
+
"version": 1
|
|
129
|
+
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Query Service (M04) — Cross-repo fan-out depth distribution, cycle warnings",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "Cross-Repo Fan-out",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "Fan-out Depth Distribution",
|
|
13
|
+
"type": "histogram",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": { "unit": "short" }
|
|
18
|
+
},
|
|
19
|
+
"targets": [
|
|
20
|
+
{
|
|
21
|
+
"expr": "sum(rate(cross_repo_fanout_count_bucket[5m])) by (le, depth)",
|
|
22
|
+
"legendFormat": "depth={{depth}}"
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"title": "Fan-out Depth Over Time",
|
|
28
|
+
"type": "timeseries",
|
|
29
|
+
"datasource": "Prometheus",
|
|
30
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
31
|
+
"fieldConfig": {
|
|
32
|
+
"defaults": { "unit": "short" }
|
|
33
|
+
},
|
|
34
|
+
"targets": [
|
|
35
|
+
{
|
|
36
|
+
"expr": "histogram_quantile(0.95, sum(rate(cross_repo_fanout_count_bucket[5m])) by (le))",
|
|
37
|
+
"legendFormat": "p95 depth"
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"expr": "histogram_quantile(0.50, sum(rate(cross_repo_fanout_count_bucket[5m])) by (le))",
|
|
41
|
+
"legendFormat": "p50 depth"
|
|
42
|
+
}
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"title": "Fan-out by Repo (top 10)",
|
|
47
|
+
"type": "bargauge",
|
|
48
|
+
"datasource": "Prometheus",
|
|
49
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
50
|
+
"fieldConfig": {
|
|
51
|
+
"defaults": { "unit": "short" }
|
|
52
|
+
},
|
|
53
|
+
"targets": [
|
|
54
|
+
{
|
|
55
|
+
"expr": "topk(10, sum(rate(cross_repo_fanout_count_count[5m])) by (repo))",
|
|
56
|
+
"legendFormat": "{{repo}}",
|
|
57
|
+
"instant": true
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"title": "Depth > 2 Warnings (Cycle Risk)",
|
|
63
|
+
"type": "timeseries",
|
|
64
|
+
"datasource": "Prometheus",
|
|
65
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
66
|
+
"fieldConfig": {
|
|
67
|
+
"defaults": {
|
|
68
|
+
"unit": "short",
|
|
69
|
+
"thresholds": {
|
|
70
|
+
"mode": "absolute",
|
|
71
|
+
"steps": [
|
|
72
|
+
{ "color": "green", "value": null },
|
|
73
|
+
{ "color": "red", "value": 1 }
|
|
74
|
+
]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
"targets": [
|
|
79
|
+
{
|
|
80
|
+
"expr": "sum(rate(cross_repo_fanout_count_bucket{le=\"+Inf\"}[5m])) - sum(rate(cross_repo_fanout_count_bucket{le=\"2\"}[5m]))",
|
|
81
|
+
"legendFormat": "Depth > 2 queries"
|
|
82
|
+
}
|
|
83
|
+
]
|
|
84
|
+
}
|
|
85
|
+
],
|
|
86
|
+
"refresh": "30s",
|
|
87
|
+
"schemaVersion": 39,
|
|
88
|
+
"tags": ["ecip", "m04", "cross-repo", "fanout"],
|
|
89
|
+
"time": { "from": "now-6h", "to": "now" },
|
|
90
|
+
"title": "ECIP — Cross-Repo Fan-out",
|
|
91
|
+
"uid": "ecip-cross-repo-fanout",
|
|
92
|
+
"version": 1
|
|
93
|
+
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Event Bus (M07) — DLQ depth, DLQ age, retry counts",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "Event Bus DLQ",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "DLQ Depth by Topic",
|
|
13
|
+
"type": "stat",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "short",
|
|
19
|
+
"thresholds": {
|
|
20
|
+
"mode": "absolute",
|
|
21
|
+
"steps": [
|
|
22
|
+
{ "color": "green", "value": null },
|
|
23
|
+
{ "color": "yellow", "value": 50 },
|
|
24
|
+
{ "color": "red", "value": 100 }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"targets": [
|
|
30
|
+
{
|
|
31
|
+
"expr": "event_bus_dlq_depth",
|
|
32
|
+
"legendFormat": "{{topic}}"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"title": "DLQ Depth Over Time",
|
|
38
|
+
"type": "timeseries",
|
|
39
|
+
"datasource": "Prometheus",
|
|
40
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
41
|
+
"fieldConfig": {
|
|
42
|
+
"defaults": { "unit": "short" }
|
|
43
|
+
},
|
|
44
|
+
"targets": [
|
|
45
|
+
{
|
|
46
|
+
"expr": "event_bus_dlq_depth",
|
|
47
|
+
"legendFormat": "{{topic}}"
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"title": "Webhook Processing Latency (p95)",
|
|
53
|
+
"type": "timeseries",
|
|
54
|
+
"datasource": "Prometheus",
|
|
55
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
56
|
+
"fieldConfig": {
|
|
57
|
+
"defaults": { "unit": "ms" }
|
|
58
|
+
},
|
|
59
|
+
"targets": [
|
|
60
|
+
{
|
|
61
|
+
"expr": "histogram_quantile(0.95, sum(rate(webhook_processing_duration_ms_bucket[5m])) by (le, topic))",
|
|
62
|
+
"legendFormat": "p95 — {{topic}}"
|
|
63
|
+
}
|
|
64
|
+
]
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"title": "Kafka Consumer Lag",
|
|
68
|
+
"type": "timeseries",
|
|
69
|
+
"datasource": "Prometheus",
|
|
70
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
71
|
+
"fieldConfig": {
|
|
72
|
+
"defaults": { "unit": "short" }
|
|
73
|
+
},
|
|
74
|
+
"targets": [
|
|
75
|
+
{
|
|
76
|
+
"expr": "sum(kafka_consumergroup_lag{group=~\"ecip-event-bus.*\"}) by (topic)",
|
|
77
|
+
"legendFormat": "{{topic}}"
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"title": "DLQ Retry Counts",
|
|
83
|
+
"type": "timeseries",
|
|
84
|
+
"datasource": "Prometheus",
|
|
85
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
|
|
86
|
+
"fieldConfig": {
|
|
87
|
+
"defaults": { "unit": "short" }
|
|
88
|
+
},
|
|
89
|
+
"targets": [
|
|
90
|
+
{
|
|
91
|
+
"expr": "sum(rate(event_bus_dlq_retries_total[5m])) by (topic)",
|
|
92
|
+
"legendFormat": "{{topic}}"
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"title": "DLQ Message Age (oldest)",
|
|
98
|
+
"type": "stat",
|
|
99
|
+
"datasource": "Prometheus",
|
|
100
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
|
|
101
|
+
"fieldConfig": {
|
|
102
|
+
"defaults": {
|
|
103
|
+
"unit": "s",
|
|
104
|
+
"thresholds": {
|
|
105
|
+
"mode": "absolute",
|
|
106
|
+
"steps": [
|
|
107
|
+
{ "color": "green", "value": null },
|
|
108
|
+
{ "color": "yellow", "value": 3600 },
|
|
109
|
+
{ "color": "red", "value": 86400 }
|
|
110
|
+
]
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
"targets": [
|
|
115
|
+
{
|
|
116
|
+
"expr": "max(event_bus_dlq_oldest_message_age_seconds) by (topic)",
|
|
117
|
+
"legendFormat": "{{topic}}"
|
|
118
|
+
}
|
|
119
|
+
]
|
|
120
|
+
}
|
|
121
|
+
],
|
|
122
|
+
"refresh": "30s",
|
|
123
|
+
"schemaVersion": 39,
|
|
124
|
+
"tags": ["ecip", "m07", "event-bus", "dlq"],
|
|
125
|
+
"time": { "from": "now-3h", "to": "now" },
|
|
126
|
+
"title": "ECIP — Event Bus DLQ",
|
|
127
|
+
"uid": "ecip-event-bus-dlq",
|
|
128
|
+
"version": 1
|
|
129
|
+
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Analysis Engine (M02) — LSP daemon status, restart rate, OOM events",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "LSP Daemon Health",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "LSP Daemon Restart Rate (/hour)",
|
|
13
|
+
"type": "stat",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "short",
|
|
19
|
+
"thresholds": {
|
|
20
|
+
"mode": "absolute",
|
|
21
|
+
"steps": [
|
|
22
|
+
{ "color": "green", "value": null },
|
|
23
|
+
{ "color": "yellow", "value": 1 },
|
|
24
|
+
{ "color": "red", "value": 2 }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"targets": [
|
|
30
|
+
{
|
|
31
|
+
"expr": "sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)",
|
|
32
|
+
"legendFormat": "{{repo}} — {{language}}"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"title": "LSP Daemon Restarts Over Time",
|
|
38
|
+
"type": "timeseries",
|
|
39
|
+
"datasource": "Prometheus",
|
|
40
|
+
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 1 },
|
|
41
|
+
"fieldConfig": {
|
|
42
|
+
"defaults": { "unit": "short" }
|
|
43
|
+
},
|
|
44
|
+
"targets": [
|
|
45
|
+
{
|
|
46
|
+
"expr": "sum(increase(lsp_daemon_restarts_total[1h])) by (repo, language)",
|
|
47
|
+
"legendFormat": "{{repo}} — {{language}}"
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"title": "LSP Daemon Memory Usage",
|
|
53
|
+
"type": "timeseries",
|
|
54
|
+
"datasource": "Prometheus",
|
|
55
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
56
|
+
"fieldConfig": {
|
|
57
|
+
"defaults": { "unit": "bytes" }
|
|
58
|
+
},
|
|
59
|
+
"targets": [
|
|
60
|
+
{
|
|
61
|
+
"expr": "container_memory_working_set_bytes{pod=~\"lsp-daemon.*\"}",
|
|
62
|
+
"legendFormat": "{{pod}}"
|
|
63
|
+
}
|
|
64
|
+
]
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"title": "LSP Daemon CPU Usage",
|
|
68
|
+
"type": "timeseries",
|
|
69
|
+
"datasource": "Prometheus",
|
|
70
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
71
|
+
"fieldConfig": {
|
|
72
|
+
"defaults": { "unit": "short" }
|
|
73
|
+
},
|
|
74
|
+
"targets": [
|
|
75
|
+
{
|
|
76
|
+
"expr": "rate(container_cpu_usage_seconds_total{pod=~\"lsp-daemon.*\"}[5m])",
|
|
77
|
+
"legendFormat": "{{pod}}"
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"title": "OOM Kill Events",
|
|
83
|
+
"type": "timeseries",
|
|
84
|
+
"datasource": "Prometheus",
|
|
85
|
+
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 17 },
|
|
86
|
+
"fieldConfig": {
|
|
87
|
+
"defaults": { "unit": "short" }
|
|
88
|
+
},
|
|
89
|
+
"targets": [
|
|
90
|
+
{
|
|
91
|
+
"expr": "sum(increase(kube_pod_container_status_last_terminated_reason{container=~\"lsp-daemon.*\", reason=\"OOMKilled\"}[1h])) by (pod)",
|
|
92
|
+
"legendFormat": "OOM — {{pod}}"
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
}
|
|
96
|
+
],
|
|
97
|
+
"refresh": "30s",
|
|
98
|
+
"schemaVersion": 39,
|
|
99
|
+
"tags": ["ecip", "m02", "lsp-daemon", "health"],
|
|
100
|
+
"time": { "from": "now-3h", "to": "now" },
|
|
101
|
+
"title": "ECIP — LSP Daemon Health",
|
|
102
|
+
"uid": "ecip-lsp-daemon-health",
|
|
103
|
+
"version": 1
|
|
104
|
+
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "ECIP Query Service (M04) + MCP Server (M05) — Fan-out topology, latency per target_repo",
|
|
3
|
+
"editable": true,
|
|
4
|
+
"panels": [
|
|
5
|
+
{
|
|
6
|
+
"title": "MCP Call Graph",
|
|
7
|
+
"type": "row",
|
|
8
|
+
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
9
|
+
"collapsed": false
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"title": "MCP Call Duration p95 by Tool",
|
|
13
|
+
"type": "timeseries",
|
|
14
|
+
"datasource": "Prometheus",
|
|
15
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
|
|
16
|
+
"fieldConfig": {
|
|
17
|
+
"defaults": {
|
|
18
|
+
"unit": "ms",
|
|
19
|
+
"thresholds": {
|
|
20
|
+
"mode": "absolute",
|
|
21
|
+
"steps": [
|
|
22
|
+
{ "color": "green", "value": null },
|
|
23
|
+
{ "color": "yellow", "value": 500 },
|
|
24
|
+
{ "color": "red", "value": 800 }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"targets": [
|
|
30
|
+
{
|
|
31
|
+
"expr": "histogram_quantile(0.95, sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, tool_name))",
|
|
32
|
+
"legendFormat": "{{tool_name}}"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"title": "MCP Call Duration p95 by Target Repo",
|
|
38
|
+
"type": "timeseries",
|
|
39
|
+
"datasource": "Prometheus",
|
|
40
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
|
|
41
|
+
"fieldConfig": {
|
|
42
|
+
"defaults": { "unit": "ms" }
|
|
43
|
+
},
|
|
44
|
+
"targets": [
|
|
45
|
+
{
|
|
46
|
+
"expr": "histogram_quantile(0.95, sum(rate(mcp_call_duration_ms_bucket[5m])) by (le, target_repo))",
|
|
47
|
+
"legendFormat": "{{target_repo}}"
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"title": "MCP Call Rate",
|
|
53
|
+
"type": "timeseries",
|
|
54
|
+
"datasource": "Prometheus",
|
|
55
|
+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
|
|
56
|
+
"fieldConfig": {
|
|
57
|
+
"defaults": { "unit": "reqps" }
|
|
58
|
+
},
|
|
59
|
+
"targets": [
|
|
60
|
+
{
|
|
61
|
+
"expr": "sum(rate(mcp_call_duration_ms_count[5m])) by (tool_name)",
|
|
62
|
+
"legendFormat": "{{tool_name}}"
|
|
63
|
+
}
|
|
64
|
+
]
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"title": "MCP Auth Failure Rate",
|
|
68
|
+
"type": "timeseries",
|
|
69
|
+
"datasource": "Prometheus",
|
|
70
|
+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
|
|
71
|
+
"fieldConfig": {
|
|
72
|
+
"defaults": { "unit": "short" }
|
|
73
|
+
},
|
|
74
|
+
"targets": [
|
|
75
|
+
{
|
|
76
|
+
"expr": "sum(rate(auth_failure_total{module=\"M05\"}[5m]))",
|
|
77
|
+
"legendFormat": "Auth Failures/s"
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"title": "MCP Fan-out Trace View",
|
|
83
|
+
"type": "nodeGraph",
|
|
84
|
+
"datasource": "Tempo",
|
|
85
|
+
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 17 },
|
|
86
|
+
"targets": [
|
|
87
|
+
{
|
|
88
|
+
"queryType": "traceql",
|
|
89
|
+
"query": "{ span.ecip.module = \"M05\" }"
|
|
90
|
+
}
|
|
91
|
+
]
|
|
92
|
+
}
|
|
93
|
+
],
|
|
94
|
+
"refresh": "30s",
|
|
95
|
+
"schemaVersion": 39,
|
|
96
|
+
"tags": ["ecip", "m04", "m05", "mcp", "call-graph"],
|
|
97
|
+
"templating": {
|
|
98
|
+
"list": [
|
|
99
|
+
{
|
|
100
|
+
"name": "target_repo",
|
|
101
|
+
"type": "query",
|
|
102
|
+
"datasource": "Prometheus",
|
|
103
|
+
"query": "label_values(mcp_call_duration_ms_bucket, target_repo)",
|
|
104
|
+
"refresh": 2,
|
|
105
|
+
"includeAll": true,
|
|
106
|
+
"multi": true
|
|
107
|
+
}
|
|
108
|
+
]
|
|
109
|
+
},
|
|
110
|
+
"time": { "from": "now-1h", "to": "now" },
|
|
111
|
+
"title": "ECIP — MCP Call Graph",
|
|
112
|
+
"uid": "ecip-mcp-call-graph",
|
|
113
|
+
"version": 1
|
|
114
|
+
}
|